Module:Punycode: Difference between revisions
// via Wikitext Extension for VSCode Tag: Reverted |
// via Wikitext Extension for VSCode |
||
| (5 intermediate revisions by the same user not shown) | |||
| Line 1: | Line 1: | ||
-- | --[[ | ||
* Name: Punycode | |||
* Author: Mark W. Datysgeld | |||
* Description: RFC3492 Punycode implementation for IDN support with caching | |||
* Notes: encode/decode for single labels (no dots); toASCII/toUnicode for full domains (handles dots, xn-- prefixes); UTF-8 compatible with mw.ustring fallback; includes caching for performance | |||
]] | |||
-- Module table; the public functions are attached to it further down.
local punycode = {}

-- Result caches for encode and decode, keyed by the input label.
-- They persist only for a single page render.
local encodeCache = {}
local decodeCache = {}
------------------------------------------------------------ | ---------------------------------------------------------------- | ||
-- | -- Constants from RFC 3492 | ||
------------------------------------------------------------ | ---------------------------------------------------------------- | ||
-- Punycode parameter values (constants from RFC 3492), one per line.
local base         = 36
local tmin         = 1
local tmax         = 26
local skew         = 38
local damp         = 700
local initial_bias = 72
local initial_n    = 128   -- 0x80
local delimiter    = '-'   -- ASCII hyphen
------------------------------------------------------------ | ---------------------------------------------------------------- | ||
-- | -- UTF-8 helpers (mw.ustring exists in Scribunto; falls back otherwise) | ||
------------------------------------------------------------ | ---------------------------------------------------------------- | ||
local us = mw and mw.ustring | |||
local function toCodePoints(s) | local function toCodePoints(s) | ||
if s == "" then return {} end | |||
if us then | |||
local cps, i = {}, 1 | |||
for ch in us.gmatch(s, ".") do | |||
cps[i] = us.codepoint(ch) | |||
i = i + 1 | |||
end | |||
return cps | |||
end | |||
-- plain Lua 5.1 fallback (minimal; good enough for Punycode paths) | |||
local cps, i, len = {}, 1, #s | local cps, i, len = {}, 1, #s | ||
while i <= len do | while i <= len do | ||
local b1 = s:byte(i) | local b1 = s:byte(i) | ||
if b1 < 0x80 then | if b1 < 0x80 then | ||
cps[#cps + 1] = b1 | cps[#cps + 1], i = b1, i + 1 | ||
elseif b1 < 0xE0 then | |||
elseif b1 < 0xE0 then | |||
local b2 = s:byte(i + 1) | local b2 = s:byte(i + 1) | ||
cps[#cps + 1] = (b1 - 0xC0) * 0x40 + (b2 - 0x80) | cps[#cps + 1] = (b1 - 0xC0) * 0x40 + (b2 - 0x80) | ||
i = i + 2 | i = i + 2 | ||
elseif b1 < 0xF0 then | elseif b1 < 0xF0 then | ||
local b2, b3 = s:byte(i + 1, i + 2) | local b2, b3 = s:byte(i + 1, i + 2) | ||
cps[#cps + 1] = (b1 - 0xE0) * 0x1000 + (b2 - 0x80) * 0x40 + (b3 - 0x80) | cps[#cps + 1] = | ||
(b1 - 0xE0) * 0x1000 | |||
+ (b2 - 0x80) * 0x40 | |||
+ (b3 - 0x80) | |||
i = i + 3 | i = i + 3 | ||
else | else | ||
local b2, b3, b4 = s:byte(i + 1, i + 3) | local b2, b3, b4 = s:byte(i + 1, i + 3) | ||
cps[#cps + 1] = | cps[#cps + 1] = | ||
(b1 - 0xF0) * 0x40000 | |||
(b2 - 0x80) * 0x1000 | + (b2 - 0x80) * 0x1000 | ||
(b3 - 0x80) * 0x40 | + (b3 - 0x80) * 0x40 | ||
(b4 - 0x80) | + (b4 - 0x80) | ||
i = i + 4 | i = i + 4 | ||
end | end | ||
| Line 49: | Line 65: | ||
end | end | ||
local function | local function fromCodePoints(cps) | ||
if us then | |||
local out = {} | |||
for i = 1, #cps do out[i] = us.char(cps[i]) end | |||
return table.concat(out) | |||
end | |||
local function cp2utf8(cp) | |||
if cp < 0x80 then return string.char(cp) end | |||
if cp < 0x800 then | |||
return string.char( | |||
0xC0 + math.floor(cp / 0x40), | |||
0x80 + (cp % 0x40) | |||
) | |||
end | |||
if cp < 0x10000 then | |||
return string.char( | |||
0xE0 + math.floor(cp / 0x1000), | |||
0x80 + (math.floor(cp / 0x40) % 0x40), | |||
0x80 + (cp % 0x40) | |||
) | |||
end | |||
return string.char( | return string.char( | ||
0xF0 + math.floor(cp / 0x40000), | 0xF0 + math.floor(cp / 0x40000), | ||
| Line 71: | Line 93: | ||
) | ) | ||
end | end | ||
local out = {} | local out = {} | ||
for i = 1, #cps do | for i = 1, #cps do out[i] = cp2utf8(cps[i]) end | ||
return table.concat(out) | return table.concat(out) | ||
end | end | ||
------------------------------------------------------------ | ---------------------------------------------------------------- | ||
-- | -- RFC 3492 helpers | ||
------------------------------------------------------------ | ---------------------------------------------------------------- | ||
-- Map a Punycode digit value to the character that represents it.
-- Values below 26 become 'a'..'z'; all other values are offset into the
-- '0'..'9' range (26 -> '0').
local function digitToBasic(d)
    local code
    if d < 26 then
        code = d + 97   -- 97 is 'a'
    else
        code = d + 22   -- same as d - 26 + 48, where 48 is '0'
    end
    return string.char(code)
end
-- Convert the byte value of a basic code point into its Punycode digit value.
-- Letters of either case yield 0..25 and the digits '0'-'9' yield 26..35.
-- Any other byte yields `base`, which marks the input as invalid.
local function basicToDigit(byte)
    if byte >= 97 and byte <= 122 then      -- 'a'-'z'
        return byte - 97
    elseif byte >= 65 and byte <= 90 then   -- 'A'-'Z'
        return byte - 65
    elseif byte >= 48 and byte <= 57 then   -- '0'-'9' -> 26-35
        return byte - 22
    end
    return base                             -- invalid
end
| Line 106: | Line 123: | ||
end | end | ||
------------------------------------------------------------ | ---------------------------------------------------------------- | ||
-- | -- Single-label Punycode encode / decode | ||
------------------------------------------------------------ | ---------------------------------------------------------------- | ||
-- Report whether `str` consists solely of bytes in the range 0..127.
-- Returns true for the empty string.
local function isASCII(str)
    -- One byte in 128..255 anywhere in the string makes it non-ASCII.
    return str:find("[\128-\255]") == nil
end
function punycode.encode(label) | function punycode.encode(label) | ||
if label == "" | if not label or label == "" then return "" end | ||
label = label:gsub("%.$", "") -- strip *trailing* dot | |||
if label:find("%.") then | |||
error("punycode.encode: one label at a time (no dots)") | |||
end | |||
label = (us and us.lower or string.lower)(label) | |||
if encodeCache[label] then return encodeCache[label] end | if encodeCache[label] then return encodeCache[label] end | ||
local cp_arr = toCodePoints(label) | |||
local | local out, n, delta, bias = {}, initial_n, 0, initial_bias | ||
local out | local basic = 0 | ||
local | |||
-- copy ASCII | -- copy ASCII code points | ||
for _, cp in ipairs( | for _, cp in ipairs(cp_arr) do | ||
if cp < | if cp < 128 then | ||
out[#out + 1] = string.char(cp) | out[#out + 1] = string.char(cp) | ||
basic = basic + 1 | basic = basic + 1 | ||
end | end | ||
end | end | ||
if basic > 0 then out[#out + 1] = | if basic > 0 and basic < #cp_arr then out[#out + 1] = delimiter end | ||
local h = basic | local h = basic | ||
while h < # | while h < #cp_arr do | ||
local m = 0x7FFFFFFF | |||
local m = | for _, cp in ipairs(cp_arr) do | ||
for _, cp in ipairs( | |||
if cp >= n and cp < m then m = cp end | if cp >= n and cp < m then m = cp end | ||
end | end | ||
delta = delta + (m - n) * (h + 1) | delta = delta + (m - n) * (h + 1) | ||
n = m | n = m | ||
for _, cp in ipairs(cp_arr) do | |||
for _, cp in ipairs( | |||
if cp < n then | if cp < n then | ||
delta = delta + 1 | delta = delta + 1 | ||
| Line 149: | Line 168: | ||
while true do | while true do | ||
local t | local t | ||
if k <= bias | if k <= bias then t = tmin | ||
elseif k >= bias + tmax | elseif k >= bias + tmax then t = tmax | ||
else | else t = k - bias end | ||
if q < t then break end | if q < t then break end | ||
out[#out + 1] = digitToBasic(t + (q - t) % (base - t)) | out[#out + 1] = digitToBasic(t + (q - t) % (base - t)) | ||
| Line 167: | Line 186: | ||
end | end | ||
local | local res = table.concat(out) | ||
encodeCache[label] = | encodeCache[label] = res | ||
return | return res | ||
end | end | ||
function punycode.decode(label) | function punycode.decode(label) | ||
if label == "" | if not label or label == "" then return "" end | ||
if decodeCache[label] then return decodeCache[label] end | if decodeCache[label] then return decodeCache[label] end | ||
local cps, d = {}, (label:find( | local cps, d = {}, (label:find(delimiter, 1, true) or 0) | ||
for i = 1, d - 1 do cps[#cps + 1] = label:byte(i) end | for i = 1, d - 1 do cps[#cps + 1] = label:byte(i) end | ||
| Line 185: | Line 204: | ||
local oldi, w, k = i_val, 1, base | local oldi, w, k = i_val, 1, base | ||
while true do | while true do | ||
local digit = basicToDigit(label:byte(pos)) | local digit = basicToDigit(label:byte(pos)) | ||
pos | pos = pos + 1 | ||
i_val = i_val + digit * w | i_val = i_val + digit * w | ||
local t | local t | ||
if k <= bias | if k <= bias then t = tmin | ||
elseif k >= bias + tmax | elseif k >= bias + tmax then t = tmax | ||
else | else t = k - bias end | ||
if digit < t then break end | if digit < t then break end | ||
w = w * (base - t) | w = w * (base - t) | ||
| Line 204: | Line 222: | ||
end | end | ||
local | local res = fromCodePoints(cps) | ||
decodeCache[label] = | decodeCache[label] = res | ||
return | return res | ||
end | |||
---------------------------------------------------------------- | |||
-- Domain-level helpers (split a domain into labels, detect a trailing dot) | |||
---------------------------------------------------------------- | |||
-- Break a domain name into its dot-separated labels.
-- Only non-empty runs of non-dot characters are returned, so leading,
-- trailing or doubled dots do not produce empty entries.
local function splitLabels(domain)
    local parts = {}
    for piece in domain:gmatch("([^%.]+)") do
        parts[#parts + 1] = piece
    end
    return parts
end
-- Remove a single trailing dot from `s`, if there is one.
-- Returns two values: the string without the trailing dot, and a boolean
-- that is true when a dot was removed.
local function stripTrailingDot(s)
    local endsWithDot = s:sub(-1) == '.'
    if endsWithDot then
        return s:sub(1, -2), true
    end
    return s, false
end
-- Unicode -> ASCII/IDNA conversion of a full domain name.
-- A trailing dot is stripped before the labels are processed and appended
-- again to the result. Each label is handled separately: labels containing
-- only ASCII bytes are copied unchanged, every other label is passed to
-- punycode.encode and prefixed with "xn--".
-- Returns "" when `domain` is nil or empty.
function punycode.toASCII(domain)
    if not domain or domain == "" then return "" end

    local bare, hadDot = stripTrailingDot(domain)
    local labels = splitLabels(bare)

    local converted = {}
    for idx = 1, #labels do
        local lbl = labels[idx]
        if isASCII(lbl) then
            converted[#converted + 1] = lbl
        else
            converted[#converted + 1] = "xn--" .. punycode.encode(lbl)
        end
    end

    local joined = table.concat(converted, ".")
    if hadDot then
        return joined .. "."
    end
    return joined
end
-- ASCII/IDNA -> Unicode conversion of a full domain name.
-- A trailing dot is stripped before the labels are processed and appended
-- again to the result. Each label is handled separately: a label whose first
-- four characters are "xn--" (compared case-insensitively) has that prefix
-- removed and the remainder passed to punycode.decode; every other label is
-- copied unchanged.
-- Returns "" when `domain` is nil or empty.
function punycode.toUnicode(domain)
    if not domain or domain == "" then return "" end

    local bare, hadDot = stripTrailingDot(domain)
    local labels = splitLabels(bare)

    local converted = {}
    for idx = 1, #labels do
        local lbl = labels[idx]
        local prefix = lbl:sub(1, 4):lower()
        if prefix == "xn--" then
            converted[#converted + 1] = punycode.decode(lbl:sub(5))
        else
            converted[#converted + 1] = lbl
        end
    end

    local joined = table.concat(converted, ".")
    if hadDot then
        return joined .. "."
    end
    return joined
end
return punycode | return punycode | ||