Module:Punycode: Difference between revisions
Appearance
// via Wikitext Extension for VSCode Tag: Reverted |
// via Wikitext Extension for VSCode Tag: Manual revert |
||
| Line 1: | Line 1: | ||
-- Module:Punycode | |||
-- Module:Punycode | -- Implements RFC3492 (Punycode) encoding and decoding; requires mw.ustring for proper Unicode support | ||
-- | |||
local punycode = {} | local punycode = {} | ||
-- | -- Cache for frequently processed strings (persists during a single page render) | ||
local encodeCache = {} | |||
local decodeCache = {} | |||
local | |||
local | |||
------------------------------------------------------ | -------------------------- | ||
-- Configuration Constants | |||
-------------------------------------------------------------- | -------------------------- | ||
local base = 36 | |||
local tmin = 1 | |||
local tmax = 26 | |||
local skew = 38 | |||
local damp = 700 | |||
local initial_bias = 72 | |||
local initial_n = 128 -- 0x80 | |||
local delimiter = '-' -- ASCII hyphen | |||
-------------------------- | |||
-- Helper functions for Unicode handling. | |||
-------------------------- | |||
-- Converts a UTF-8 string to an array of Unicode code points. | |||
local function toCodePoints(s) | local function toCodePoints(s) | ||
if not s or s == "" then | |||
return {} | |||
end | |||
-- Estimate the number of characters (may not be exact for multi-byte chars) | |||
local len = mw.ustring.len(s) | |||
local cps = {} | |||
-- Pre-allocate the table with an index counter for direct assignment | |||
local i = 1 | |||
for char in mw.ustring.gmatch(s, ".") do | |||
cps[i] = mw.ustring.codepoint(char) | |||
i = i + 1 | |||
end | end | ||
return cps | return cps | ||
end | end | ||
local function | -- Converts an array of Unicode code points to a UTF-8 string. | ||
if | local function fromCodePoints(cps) | ||
if not cps or #cps == 0 then | |||
return | return "" | ||
end | end | ||
-- Pre-allocate the table with the exact size needed | |||
local chars = {} | |||
for i = 1, #cps do | |||
chars[i] = mw.ustring.char(cps[i]) | |||
end | end | ||
return | |||
return table.concat(chars) | |||
end | end | ||
local function | -------------------------- | ||
-- Digit conversion functions | |||
-------------------------- | |||
return | local function digitToBasic(digit) | ||
if digit < 26 then | |||
return string.char(digit + string.byte('a')) | |||
else | |||
return string.char(digit - 26 + string.byte('0')) | |||
end | |||
end | end | ||
local function basicToDigit(cp) | |||
if cp >= string.byte('0') and cp <= string.byte('9') then | |||
return cp - string.byte('0') + 26 | |||
local function | elseif cp >= string.byte('A') and cp <= string.byte('Z') then | ||
return cp - string.byte('A') | |||
elseif cp >= string.byte('a') and cp <= string.byte('z') then | |||
return cp - string.byte('a') | |||
else | |||
return base | |||
end | |||
return base | |||
end | end | ||
-------------------------- | |||
-- Bias adaptation (RFC3492, Section 3.4) | |||
-------------------------- | |||
local function adapt(delta, numpoints, first) | local function adapt(delta, numpoints, first) | ||
delta = | if first then | ||
delta = math.floor(delta / damp) | |||
else | |||
delta = math.floor(delta / 2) | |||
end | |||
delta = delta + math.floor(delta / numpoints) | delta = delta + math.floor(delta / numpoints) | ||
local k = 0 | local k = 0 | ||
| Line 105: | Line 99: | ||
end | end | ||
---------------------------------------------------------- | -------------------------- | ||
-- Punycode Encoding Function | |||
-------------------------- | |||
local | function punycode.encode(input) | ||
-- Input validation and cache check | |||
if not input or input == "" then | |||
return "" | |||
end | |||
input = mw.ustring.lower(input) | |||
-- Check cache first for previously encoded strings | |||
if encodeCache[input] then | |||
return encodeCache[input] | |||
end | |||
local output = {} | |||
local cp_array = toCodePoints(input) | |||
local n = initial_n | |||
local delta = 0 | |||
local bias = initial_bias | |||
local basic_count = 0 | |||
-- Copy basic code points (ASCII < 128) | |||
for _, cp in ipairs(cp_array) do | |||
if cp < 128 then | |||
table.insert(output, mw.ustring.char(cp)) | |||
basic_count = basic_count + 1 | |||
end | |||
end | |||
local h = basic_count | |||
if basic_count > 0 then | |||
table.insert(output, delimiter) | |||
end | end | ||
while h < #cp_array do | |||
while h < # | |||
local m = 0x7FFFFFFF | local m = 0x7FFFFFFF | ||
for _, cp in ipairs( | for _, cp in ipairs(cp_array) do | ||
if cp >= n and cp < m then m = cp end | if cp >= n and cp < m then | ||
m = cp | |||
end | |||
end | end | ||
delta = delta + (m - n) * (h + 1) | delta = delta + (m - n) * (h + 1) | ||
n = m | n = m | ||
for _, cp in ipairs(cp_array) do | |||
for _, cp in ipairs( | |||
if cp < n then | if cp < n then | ||
delta = delta + 1 | delta = delta + 1 | ||
elseif cp == n then | elseif cp == n then | ||
local q | local q = delta | ||
local k = base | |||
while true do | while true do | ||
local t | local t | ||
if | if k <= bias then | ||
elseif k >= bias + tmax | t = tmin | ||
else | elseif k >= bias + tmax then | ||
t = tmax | |||
else | |||
t = k - bias | |||
end | |||
if q < t then break end | if q < t then break end | ||
local code = t + ((q - t) % (base - t)) | |||
q | table.insert(output, digitToBasic(code)) | ||
k | q = math.floor((q - t) / (base - t)) | ||
k = k + base | |||
end | end | ||
table.insert(output, digitToBasic(q)) | |||
bias | bias = adapt(delta, h + 1, h == basic_count) | ||
delta = 0 | delta = 0 | ||
h | h = h + 1 | ||
end | end | ||
end | end | ||
delta = delta + 1 | delta = delta + 1 | ||
n | n = n + 1 | ||
end | end | ||
local result = table.concat( | local result = table.concat(output) | ||
encodeCache[ | |||
-- Cache the result before returning | |||
encodeCache[input] = result | |||
return result | return result | ||
end | end | ||
function punycode.decode( | -------------------------- | ||
if | -- Punycode Decoding Function | ||
if decodeCache[ | -------------------------- | ||
function punycode.decode(input) | |||
-- Input validation and cache check | |||
if not input or input == "" then | |||
return "" | |||
end | |||
-- Check cache first for previously decoded strings | |||
if decodeCache[input] then | |||
return decodeCache[input] | |||
end | |||
local cp_array = {} | |||
local d = input:find(delimiter, 1, true) | |||
local b = 0 | |||
if d then | |||
for i = 1, d - 1 do | |||
local cp = input:byte(i) | |||
table.insert(cp_array, cp) | |||
b = b + 1 | |||
end | |||
else | |||
d = 0 | |||
end | |||
local | local n = initial_n | ||
local bias = initial_bias | |||
local i = 0 | |||
local index = d + 1 | |||
local input_len = #input | |||
local | while index <= input_len do | ||
local oldi = i | |||
local w = 1 | |||
local k = base | |||
local | |||
while true do | while true do | ||
if | if index > input_len then | ||
local digit = basicToDigit( | error("Invalid input: punycode decode incomplete") | ||
end | |||
local digit = basicToDigit(input:byte(index)) | |||
index = index + 1 | |||
i = i + digit * w | |||
local t | local t | ||
if | if k <= bias then | ||
elseif k >= bias + tmax | t = tmin | ||
else | elseif k >= bias + tmax then | ||
t = tmax | |||
else | |||
t = k - bias | |||
end | |||
if digit < t then break end | if digit < t then break end | ||
w = w * (base - t) | w = w * (base - t) | ||
k = k + base | k = k + base | ||
end | end | ||
bias = adapt( | bias = adapt(i - oldi, #cp_array + 1, oldi == 0) | ||
n | n = n + math.floor(i / (#cp_array + 1)) | ||
i = i % (#cp_array + 1) | |||
table.insert( | table.insert(cp_array, i + 1, n) | ||
i = i + 1 | |||
end | end | ||
local result = fromCodePoints( | local result = fromCodePoints(cp_array) | ||
decodeCache[ | |||
-- Cache the result before returning | |||
decodeCache[input] = result | |||
return result | return result | ||
end | end | ||
return punycode | return punycode | ||
Revision as of 00:05, 16 May 2025
Documentation for this module may be created at Module:Punycode/doc
-- Module:Punycode
-- Punycode (RFC 3492) encoder and decoder; Unicode handling relies on mw.ustring.
local punycode = {}

-- Memoization tables mapping an input string to its previously computed result.
local encodeCache, decodeCache = {}, {}

--------------------------
-- Bootstring parameters (values RFC 3492, section 5 assigns to Punycode)
--------------------------
local base, tmin, tmax = 36, 1, 26        -- digit radix and the threshold clamp range
local skew, damp = 38, 700                -- tuning values used by the bias adaptation
local initial_bias, initial_n = 72, 0x80  -- starting bias; first non-basic code point (128)
local delimiter = "-"                     -- ASCII hyphen between basic code points and deltas
--------------------------
-- Helper functions for Unicode handling.
--------------------------
--- Converts a UTF-8 string to an array of Unicode code points.
-- @param s UTF-8 string; nil or "" yields an empty table
-- @return sequence of code point numbers in string order
local function toCodePoints(s)
    if not s or s == "" then
        return {}
    end
    local cps = {}
    -- Running index so each code point is stored by direct assignment.
    local i = 1
    -- mw.ustring patterns work on whole characters, so "." yields one
    -- (possibly multi-byte) character per iteration.
    for char in mw.ustring.gmatch(s, ".") do
        cps[i] = mw.ustring.codepoint(char)
        i = i + 1
    end
    return cps
end
--- Builds a UTF-8 string from an array of Unicode code points.
-- @param cps sequence of code point numbers; nil or an empty table yields ""
-- @return the concatenation of each code point rendered by mw.ustring.char
local function fromCodePoints(cps)
    local pieces = {}
    if cps then
        for idx = 1, #cps do
            pieces[#pieces + 1] = mw.ustring.char(cps[idx])
        end
    end
    -- An empty pieces table concatenates to "", covering the nil/empty cases.
    return table.concat(pieces)
end
--------------------------
-- Digit conversion functions
--------------------------
--- Maps a digit value to its lowercase basic code point.
-- Values below 26 map onto 'a'..'z'; larger values are rebased by 26 and
-- mapped onto the characters starting at '0'.
-- @param digit digit value (0..35 for Punycode)
-- @return one-character string
local function digitToBasic(digit)
    local is_letter = digit < 26
    local first = is_letter and 'a' or '0'
    local rebased = is_letter and digit or digit - 26
    return string.char(rebased + string.byte(first))
end
--- Maps a basic code point (byte value) to its digit value.
-- '0'..'9' give 26..35; 'A'..'Z' and 'a'..'z' both give 0..25.
-- @param cp numeric byte value of the character
-- @return the digit value, or `base` when cp is not a valid digit character
local function basicToDigit(cp)
    local zero, nine = string.byte('0'), string.byte('9')
    if cp >= zero and cp <= nine then
        return cp - zero + 26
    end

    local upper_a, upper_z = string.byte('A'), string.byte('Z')
    if cp >= upper_a and cp <= upper_z then
        return cp - upper_a
    end

    local lower_a, lower_z = string.byte('a'), string.byte('z')
    if cp >= lower_a and cp <= lower_z then
        return cp - lower_a
    end

    -- Anything else is not a digit; `base` is the out-of-range marker.
    return base
end
--------------------------
-- Bias adaptation (RFC3492, Section 3.4)
--------------------------
--- Computes the next bias from the delta that was just coded.
-- @param delta the delta value just encoded or decoded
-- @param numpoints number of code points handled so far, including this one
-- @param first truthy for the first adaptation, which divides by `damp` instead of 2
-- @return the new bias
local function adapt(delta, numpoints, first)
    local divisor = first and damp or 2
    delta = math.floor(delta / divisor)
    delta = delta + math.floor(delta / numpoints)

    local span = base - tmin
    local limit = (span * tmax) / 2
    local k = 0
    -- Scale delta down until it fits under the limit, counting `base` per step.
    while delta > limit do
        delta = math.floor(delta / span)
        k = k + base
    end
    return k + math.floor(((span + 1) * delta) / (delta + skew))
end
--------------------------
-- Punycode Encoding Function
--------------------------
--- Encodes a Unicode string as Punycode (RFC 3492, section 6.3).
-- The input is lowercased with mw.ustring.lower before encoding, and results
-- are memoized in encodeCache under the lowercased string.
-- @param input UTF-8 string; nil or "" yields ""
-- @return the Punycode text (basic code points, delimiter, then encoded deltas)
function punycode.encode(input)
    if not input or input == "" then
        return ""
    end

    input = mw.ustring.lower(input)
    local cached = encodeCache[input]
    if cached then
        return cached
    end

    local points = toCodePoints(input)
    local total = #points
    local pieces = {}

    -- Basic (ASCII) code points are copied through unchanged, in order.
    for idx = 1, total do
        local cp = points[idx]
        if cp < 128 then
            pieces[#pieces + 1] = mw.ustring.char(cp)
        end
    end
    local basic_count = #pieces
    if basic_count > 0 then
        pieces[#pieces + 1] = delimiter
    end

    local n, bias, delta = initial_n, initial_bias, 0
    local handled = basic_count
    while handled < total do
        -- Smallest code point not yet handled (>= n).
        local smallest = 0x7FFFFFFF
        for idx = 1, total do
            local cp = points[idx]
            if cp >= n and cp < smallest then
                smallest = cp
            end
        end

        delta = delta + (smallest - n) * (handled + 1)
        n = smallest

        for idx = 1, total do
            local cp = points[idx]
            if cp == n then
                -- Emit delta as a generalized variable-length integer.
                local q = delta
                local k = base
                while true do
                    -- Threshold is k - bias clamped to [tmin, tmax].
                    local t = k - bias
                    if t < tmin then
                        t = tmin
                    elseif t > tmax then
                        t = tmax
                    end
                    if q < t then break end
                    local span = base - t
                    pieces[#pieces + 1] = digitToBasic(t + ((q - t) % span))
                    q = math.floor((q - t) / span)
                    k = k + base
                end
                pieces[#pieces + 1] = digitToBasic(q)
                bias = adapt(delta, handled + 1, handled == basic_count)
                delta = 0
                handled = handled + 1
            elseif cp < n then
                delta = delta + 1
            end
        end

        delta = delta + 1
        n = n + 1
    end

    local result = table.concat(pieces)
    -- Remember the result for later calls with the same (lowercased) input.
    encodeCache[input] = result
    return result
end
--------------------------
-- Punycode Decoding Function
--------------------------
--- Decodes a Punycode string (RFC 3492, section 6.2) into a UTF-8 string.
-- Results are memoized in decodeCache, keyed by the exact input string.
-- No prefix (such as "xn--") is stripped here; the whole input is decoded.
-- @param input Punycode text; nil or "" yields ""
-- @return the decoded UTF-8 string
-- Raises an error when the input ends in the middle of a number, contains a
-- character that is not a Punycode digit after the delimiter, or contains a
-- non-ASCII byte before the delimiter.
function punycode.decode(input)
    if not input or input == "" then
        return ""
    end
    -- Check cache first for previously decoded strings
    local cached = decodeCache[input]
    if cached then
        return cached
    end

    local input_len = #input

    -- The basic code points are everything before the LAST delimiter; the
    -- basic part may itself contain hyphens, so the first one is not enough.
    local d = 0
    for pos = input_len, 1, -1 do
        if input:sub(pos, pos) == delimiter then
            d = pos
            break
        end
    end

    local cp_array = {}
    for pos = 1, d - 1 do
        local cp = input:byte(pos)
        -- Only basic (ASCII) code points may be copied literally.
        if cp >= initial_n then
            error("Invalid input: non-basic code point before delimiter")
        end
        cp_array[pos] = cp
    end

    local n = initial_n
    local bias = initial_bias
    local i = 0
    local index = d + 1
    while index <= input_len do
        -- Decode one generalized variable-length integer into i.
        local oldi = i
        local w = 1
        local k = base
        while true do
            if index > input_len then
                error("Invalid input: punycode decode incomplete")
            end
            local digit = basicToDigit(input:byte(index))
            -- basicToDigit returns `base` for characters that are not digits.
            if digit >= base then
                error("Invalid input: invalid punycode digit")
            end
            index = index + 1
            i = i + digit * w
            local t
            if k <= bias then
                t = tmin
            elseif k >= bias + tmax then
                t = tmax
            else
                t = k - bias
            end
            if digit < t then break end
            w = w * (base - t)
            k = k + base
        end
        local out_len = #cp_array + 1
        bias = adapt(i - oldi, out_len, oldi == 0)
        -- i encodes both how far n advances and where the new code point goes.
        n = n + math.floor(i / out_len)
        i = i % out_len
        table.insert(cp_array, i + 1, n)
        i = i + 1
    end

    local result = fromCodePoints(cp_array)
    -- Cache the result before returning
    decodeCache[input] = result
    return result
end
return punycode