Module:Punycode: Difference between revisions
Appearance
// via Wikitext Extension for VSCode Tag: Reverted |
// via Wikitext Extension for VSCode Tag: Reverted |
||
| Line 1: | Line 1: | ||
-- Module:Punycode | ---------------------------------------------------------------- | ||
-- | -- Module:Punycode – pure Lua 5.1 RFC-3492 implementation | ||
-- punycode.encode(label) | -- Public API | ||
-- punycode.decode(label) | -- punycode.encode(label) -> "4dbrk0ce" | ||
-- punycode.toASCII(domain) | -- punycode.decode(label) -> "ישראל" | ||
-- punycode.toUnicode(domain) | -- punycode.toASCII(domain) -> "xn--4dbrk0ce" | ||
-- | -- punycode.toUnicode(domain) -> "ישראל" | ||
-- | ---------------------------------------------------------------- | ||
local punycode = {} | local punycode = {} | ||
| Line 16: | Line 15: | ||
local skew, damp = 38, 700 | local skew, damp = 38, 700 | ||
local initial_bias = 72 | local initial_bias = 72 | ||
local initial_n = 128 | local initial_n = 128 -- 0x80 | ||
local delim = | local delim = "-" -- ASCII hyphen | ||
---------------------------------------------------------------- | ---------------------------------------------------------------- | ||
-- UTF-8 helpers (pure Lua 5.1 | -- UTF-8 helpers (pure Lua 5.1) | ||
---------------------------------------------------------------- | ---------------------------------------------------------------- | ||
local function toCodePoints(s) | local function toCodePoints(s) | ||
| Line 26: | Line 25: | ||
while i <= len do | while i <= len do | ||
local b1 = s:byte(i) | local b1 = s:byte(i) | ||
if b1 < 0x80 then | if b1 < 0x80 then -- 1-byte sequence | ||
cps[#cps + 1] = b1 | cps[#cps + 1], i = b1, i + 1 | ||
elseif b1 < 0xE0 then -- 2-byte sequence | |||
elseif b1 < 0xE0 then | |||
local b2 = s:byte(i + 1) | local b2 = s:byte(i + 1) | ||
cps[#cps + 1] = (b1 - 0xC0) * 0x40 + (b2 - 0x80) | cps[#cps + 1] = (b1 - 0xC0) * 0x40 + (b2 - 0x80) | ||
i = i + 2 | i = i + 2 | ||
elseif b1 < 0xF0 then | elseif b1 < 0xF0 then -- 3-byte sequence | ||
local b2, b3 = s:byte(i + 1, i + 2) | local b2, b3 = s:byte(i + 1, i + 2) | ||
cps[#cps + 1] = | cps[#cps + 1] = | ||
(b1 - 0xE0) * 0x1000 | |||
(b2 - 0x80) * 0x40 | + (b2 - 0x80) * 0x40 | ||
(b3 - 0x80) | + (b3 - 0x80) | ||
i = i + 3 | i = i + 3 | ||
else | else -- 4-byte sequence | ||
local b2, b3, b4 = s:byte(i + 1, i + 3) | local b2, b3, b4 = s:byte(i + 1, i + 3) | ||
cps[#cps + 1] = | cps[#cps + 1] = | ||
(b1 - 0xF0) * 0x40000 | |||
(b2 - 0x80) * 0x1000 | + (b2 - 0x80) * 0x1000 | ||
(b3 - 0x80) * 0x40 | + (b3 - 0x80) * 0x40 | ||
(b4 - 0x80) | + (b4 - 0x80) | ||
i = i + 4 | i = i + 4 | ||
end | end | ||
| Line 54: | Line 52: | ||
local function cpToUtf8(cp) | local function cpToUtf8(cp) | ||
if cp < 0x80 then | if cp < 0x80 then return string.char(cp) end | ||
if cp < 0x800 then | |||
return string.char( | return string.char( | ||
0xC0 + math.floor(cp / 0x40), | 0xC0 + math.floor(cp / 0x40), | ||
0x80 + (cp % 0x40) | 0x80 + (cp % 0x40) | ||
) | ) | ||
end | |||
if cp < 0x10000 then | |||
return string.char( | return string.char( | ||
0xE0 + math.floor(cp / 0x1000), | 0xE0 + math.floor(cp / 0x1000), | ||
0x80 + (math.floor(cp / 0x40) % 0x40), | 0x80 + (math.floor(cp / 0x40) % 0x40), | ||
0x80 + (cp % 0x40) | 0x80 + (cp % 0x40) | ||
) | ) | ||
end | end | ||
return string.char( | |||
0xF0 + math.floor(cp / 0x40000), | |||
0x80 + (math.floor(cp / 0x1000) % 0x40), | |||
0x80 + (math.floor(cp / 0x40) % 0x40), | |||
0x80 + (cp % 0x40) | |||
) | |||
end | end | ||
local function fromCodePoints(cps) | local function fromCodePoints(cps) | ||
local | local out = {} | ||
for i = 1, #cps do | for i = 1, #cps do out[i] = cpToUtf8(cps[i]) end | ||
return table.concat(out) | |||
return table.concat( | |||
end | end | ||
| Line 89: | Line 84: | ||
---------------------------------------------------------------- | ---------------------------------------------------------------- | ||
local function digitToBasic(d) | local function digitToBasic(d) | ||
return string.char(d < 26 and (d + 97) or (d - 26 + 48)) | return string.char(d < 26 and (d + 97) or (d - 26 + 48)) -- a-z / 0-9 | ||
end | end | ||
local function basicToDigit(byte) | local function basicToDigit(byte) | ||
if byte >= 48 and byte <= 57 then return byte - 22 end | if byte >= 48 and byte <= 57 then return byte - 22 end -- '0'-'9' → 26-35 | ||
if byte >= 65 and byte <= 90 then return byte - 65 end | if byte >= 65 and byte <= 90 then return byte - 65 end -- 'A'-'Z' | ||
if byte >= 97 and byte <= 122 then return byte - 97 end | if byte >= 97 and byte <= 122 then return byte - 97 end -- 'a'-'z' | ||
return base | return base -- invalid | ||
end | end | ||
| Line 116: | Line 111: | ||
function punycode.encode(label) | function punycode.encode(label) | ||
if label == "" | if label == "" then return "" end | ||
if encodeCache[label] then return encodeCache[label] end | if encodeCache[label] then return encodeCache[label] end | ||
if label:find("%.") | if label:find("%.") then error("punycode.encode: single label expected") end | ||
label = string.lower(label) | label = string.lower(label) -- IDNA is case-insensitive | ||
local cps | local cps = toCodePoints(label) | ||
local out | local out = {} | ||
local n | local n = initial_n | ||
local delta | local bias = initial_bias | ||
local delta = 0 | |||
local basic = 0 | |||
-- copy ASCII code points | -- copy basic ASCII code points | ||
for _, cp in ipairs(cps) do | for _, cp in ipairs(cps) do | ||
if cp < 0x80 then | if cp < 0x80 then | ||
| Line 133: | Line 130: | ||
end | end | ||
end | end | ||
if basic > 0 then out[#out + 1] = delim end | -- delimiter only if mixture of basic + non-basic | ||
if basic > 0 and basic < #cps then out[#out + 1] = delim end | |||
local h = basic | local h = basic | ||
while h < #cps do | while h < #cps do | ||
local m = 0x7FFFFFFF | |||
local m = | |||
for _, cp in ipairs(cps) do | for _, cp in ipairs(cps) do | ||
if cp >= n and cp < m then m = cp end | if cp >= n and cp < m then m = cp end | ||
| Line 152: | Line 149: | ||
while true do | while true do | ||
local t | local t | ||
if k <= bias | if k <= bias then t = tmin | ||
elseif k >= bias + tmax | elseif k >= bias + tmax then t = tmax | ||
else | else t = k - bias end | ||
if q < t then break end | if q < t then break end | ||
out[#out + 1] = digitToBasic(t + (q - t) % (base - t)) | out[#out + 1] = digitToBasic(t + (q - t) % (base - t)) | ||
q | q = math.floor((q - t) / (base - t)) | ||
k | k = k + base | ||
end | end | ||
out[#out + 1] = digitToBasic(q) | out[#out + 1] = digitToBasic(q) | ||
| Line 176: | Line 173: | ||
function punycode.decode(label) | function punycode.decode(label) | ||
if label == "" | if label == "" then return "" end | ||
if decodeCache[label] then return decodeCache[label] end | if decodeCache[label] then return decodeCache[label] end | ||
local cps, d = {}, (label:find(delim, 1, true) or 0) | local cps, d = {}, (label:find(delim, 1, true) or 0) | ||
for i = 1, d - 1 do | for i = 1, d - 1 do cps[#cps + 1] = label:byte(i) end | ||
local n, i_val, bias = initial_n, 0, initial_bias | local n, i_val, bias = initial_n, 0, initial_bias | ||
| Line 195: | Line 190: | ||
i_val = i_val + digit * w | i_val = i_val + digit * w | ||
local t | local t | ||
if k <= bias | if k <= bias then t = tmin | ||
elseif k >= bias + tmax | elseif k >= bias + tmax then t = tmax | ||
else | else t = k - bias end | ||
if digit < t then break end | if digit < t then break end | ||
w = w * (base - t) | w = w * (base - t) | ||
| Line 215: | Line 210: | ||
---------------------------------------------------------------- | ---------------------------------------------------------------- | ||
-- Domain-level helpers | -- Domain-level helpers | ||
---------------------------------------------------------------- | ---------------------------------------------------------------- | ||
local function stripTrailingDot(s) | local function stripTrailingDot(s) | ||
return (s:sub(-1) == | return (s:sub(-1) == "." and s:sub(1, -2) or s), | ||
(s:sub(-1) == ".") | |||
end | |||
-- quick ASCII-only test: letters, digits, hyphen | |||
local function isPureASCII(label) | |||
return label:match("^[%a%d%-]+$") ~= nil | |||
end | end | ||
function punycode.toASCII(domain) | function punycode.toASCII(domain) | ||
if domain == "" then return "" end | if domain == "" then return "" end | ||
| Line 229: | Line 229: | ||
local out = {} | local out = {} | ||
for label in domain:gmatch("([^%.]+)") do | for label in domain:gmatch("([^%.]+)") do | ||
if isPureASCII(label) then | |||
out[#out + 1] = label | |||
else | |||
out[#out + 1] = "xn--" .. punycode.encode(label) | out[#out + 1] = "xn--" .. punycode.encode(label) | ||
end | end | ||
end | end | ||
| Line 240: | Line 239: | ||
end | end | ||
function punycode.toUnicode(domain) | function punycode.toUnicode(domain) | ||
if domain == "" then return "" end | if domain == "" then return "" end | ||
Revision as of 00:04, 16 May 2025
Documentation for this module may be created at Module:Punycode/doc
----------------------------------------------------------------
-- Module:Punycode – pure Lua 5.1 RFC-3492 implementation
-- Public API
-- punycode.encode(label) -> "4dbrk0ce"
-- punycode.decode(label) -> "ישראל"
-- punycode.toASCII(domain) -> "xn--4dbrk0ce"
-- punycode.toUnicode(domain) -> "ישראל"
----------------------------------------------------------------
local punycode = {}
----------------------------------------------------------------
-- RFC 3492 constants (bootstring parameters for Punycode)
----------------------------------------------------------------
local base         = 36    -- number of digit values: a-z (0-25), 0-9 (26-35)
local tmin         = 1     -- lower clamp for the per-digit threshold t
local tmax         = 26    -- upper clamp for the per-digit threshold t
local skew         = 38    -- used by adapt()
local damp         = 700   -- divisor applied to the very first delta in adapt()
local initial_bias = 72    -- starting bias for encode and decode
local initial_n    = 0x80  -- 128: first non-basic (non-ASCII) code point
local delim        = "-"   -- ASCII hyphen separating basic chars from digits
----------------------------------------------------------------
-- UTF-8 helpers (pure Lua 5.1)
----------------------------------------------------------------
--- Decode a UTF-8 byte string into a sequence of Unicode code points.
--
-- Malformed input is rejected with an error instead of producing nil
-- arithmetic failures or negative/garbage code points:
--   * a lead byte in 0x80-0xBF (stray continuation) or above 0xF4,
--   * a missing continuation byte (truncated sequence),
--   * a continuation byte outside 0x80-0xBF.
-- Overlong forms and surrogate code points are NOT checked.
--
-- @param s string  UTF-8 encoded text (may be empty)
-- @return table    array of integer code points, in input order
local function toCodePoints(s)
    local cps, i, len = {}, 1, #s

    -- Return the 6 payload bits of the continuation byte at `pos`,
    -- raising if the byte is absent or is not of the form 10xxxxxx.
    local function cont(pos)
        local b = s:byte(pos)
        if not b or b < 0x80 or b > 0xBF then
            error("punycode: invalid UTF-8 sequence at byte " .. i)
        end
        return b - 0x80
    end

    while i <= len do
        local b1 = s:byte(i)
        if b1 < 0x80 then                      -- 1-byte sequence (ASCII)
            cps[#cps + 1] = b1
            i = i + 1
        elseif b1 < 0xC0 or b1 > 0xF4 then     -- not a valid lead byte
            error("punycode: invalid UTF-8 sequence at byte " .. i)
        elseif b1 < 0xE0 then                  -- 2-byte sequence
            cps[#cps + 1] = (b1 - 0xC0) * 0x40 + cont(i + 1)
            i = i + 2
        elseif b1 < 0xF0 then                  -- 3-byte sequence
            cps[#cps + 1] =
                (b1 - 0xE0) * 0x1000
                + cont(i + 1) * 0x40
                + cont(i + 2)
            i = i + 3
        else                                   -- 4-byte sequence
            cps[#cps + 1] =
                (b1 - 0xF0) * 0x40000
                + cont(i + 1) * 0x1000
                + cont(i + 2) * 0x40
                + cont(i + 3)
            i = i + 4
        end
    end
    return cps
end
--- Encode a single code point as a UTF-8 byte string.
-- Chooses the 1-, 2-, 3- or 4-byte form from the magnitude of `cp`.
-- @param cp number  code point
-- @return string    the UTF-8 bytes for `cp`
local function cpToUtf8(cp)
    if cp < 0x80 then
        return string.char(cp)
    end

    local floor, char = math.floor, string.char

    if cp < 0x800 then
        local lead = 0xC0 + floor(cp / 0x40)
        local last = 0x80 + (cp % 0x40)
        return char(lead, last)
    elseif cp < 0x10000 then
        local lead = 0xE0 + floor(cp / 0x1000)
        local mid  = 0x80 + (floor(cp / 0x40) % 0x40)
        local last = 0x80 + (cp % 0x40)
        return char(lead, mid, last)
    end

    local lead = 0xF0 + floor(cp / 0x40000)
    local hi   = 0x80 + (floor(cp / 0x1000) % 0x40)
    local mid  = 0x80 + (floor(cp / 0x40) % 0x40)
    local last = 0x80 + (cp % 0x40)
    return char(lead, hi, mid, last)
end
--- Build a UTF-8 string from a sequence of code points.
-- @param cps table  array of code points
-- @return string    concatenation of cpToUtf8() applied to each element
local function fromCodePoints(cps)
    local pieces = {}
    for idx = 1, #cps do
        pieces[#pieces + 1] = cpToUtf8(cps[idx])
    end
    return table.concat(pieces)
end
----------------------------------------------------------------
-- Punycode helpers
----------------------------------------------------------------
--- Map a Punycode digit value to its lowercase ASCII character.
-- @param d number  digit value; 0-25 map to 'a'-'z', 26-35 to '0'-'9'
-- @return string   one-character string
local function digitToBasic(d)
    if d < 26 then
        return string.char(d + 97)       -- 97 == ("a"):byte()
    end
    return string.char(d - 26 + 48)      -- 48 == ("0"):byte()
end
--- Map an ASCII byte to its Punycode digit value.
-- Letters are accepted in either case.
-- @param byte number  byte value of the character
-- @return number      0-25 for letters, 26-35 for '0'-'9', or `base`
--                     when the byte is not a valid digit
local function basicToDigit(byte)
    if byte >= 48 and byte <= 57 then          -- '0'-'9' -> 26-35
        return byte - 22
    elseif byte >= 65 and byte <= 90 then      -- 'A'-'Z' -> 0-25
        return byte - 65
    elseif byte >= 97 and byte <= 122 then     -- 'a'-'z' -> 0-25
        return byte - 97
    end
    return base                                -- out-of-range sentinel
end
--- Bias adaptation step of the Punycode algorithm.
-- @param delta number      delta just encoded/decoded
-- @param numpoints number  code points handled so far, including this one
-- @param first boolean     truthy for the first adaptation (divides by
--                          `damp` instead of 2)
-- @return number           the new bias
local function adapt(delta, numpoints, first)
    local floor = math.floor

    if first then
        delta = floor(delta / damp)
    else
        delta = floor(delta / 2)
    end
    delta = delta + floor(delta / numpoints)

    local span = base - tmin
    local k = 0
    while delta > (span * tmax) / 2 do
        delta = floor(delta / span)
        k = k + base
    end
    return k + floor(((span + 1) * delta) / (delta + skew))
end
----------------------------------------------------------------
-- Encode / decode a single label
----------------------------------------------------------------
-- Module-level memo tables used by punycode.encode and punycode.decode.
-- Nothing in this file evicts entries, so each table grows with the number
-- of distinct labels processed while this module instance stays loaded.
local encodeCache, decodeCache = {}, {}
--- Punycode-encode a single label (RFC 3492, section 6.3).
--
-- ASCII letters are folded to lowercase before encoding. The result does
-- NOT include the "xn--" ACE prefix. Results are memoized in encodeCache
-- under the case-folded label, so "ABC" and "abc" share one entry.
--
-- Per RFC 3492 the delimiter is emitted whenever the label contains at
-- least one basic (ASCII) code point, including when it contains ONLY
-- basic code points ("abc" -> "abc-"); otherwise decode() could not tell
-- literal characters from encoded digits.
--
-- @param label string  UTF-8 text of one label (must not contain ".")
-- @return string       encoded label; "" for an empty label
-- Raises an error if `label` contains a "." separator.
function punycode.encode(label)
    if label == "" then return "" end
    if label:find("%.") then error("punycode.encode: single label expected") end

    -- Fold only ASCII A-Z. string.lower is locale-dependent and, under a
    -- non-C locale, could rewrite bytes that belong to UTF-8 sequences.
    label = label:gsub("[A-Z]", function(c)
        return string.char(c:byte() + 32)
    end)

    -- Look up with the same (folded) key that is used when storing.
    local cached = encodeCache[label]
    if cached then return cached end

    local cps   = toCodePoints(label)
    local total = #cps
    local out   = {}
    local n     = initial_n
    local bias  = initial_bias
    local delta = 0
    local basic = 0

    -- Copy the basic (ASCII) code points verbatim, in order.
    for _, cp in ipairs(cps) do
        if cp < 0x80 then
            out[#out + 1] = string.char(cp)
            basic = basic + 1
        end
    end
    if basic > 0 then out[#out + 1] = delim end

    -- h counts code points already represented in the output.
    local h = basic
    while h < total do
        -- Smallest code point not yet handled.
        local m = 0x7FFFFFFF
        for _, cp in ipairs(cps) do
            if cp >= n and cp < m then m = cp end
        end
        delta = delta + (m - n) * (h + 1)
        n = m
        for _, cp in ipairs(cps) do
            if cp < n then
                delta = delta + 1
            elseif cp == n then
                -- Emit delta as a generalized variable-length integer.
                local q, k = delta, base
                while true do
                    local t
                    if k <= bias then t = tmin
                    elseif k >= bias + tmax then t = tmax
                    else t = k - bias end
                    if q < t then break end
                    out[#out + 1] = digitToBasic(t + (q - t) % (base - t))
                    q = math.floor((q - t) / (base - t))
                    k = k + base
                end
                out[#out + 1] = digitToBasic(q)
                bias = adapt(delta, h + 1, h == basic)
                delta = 0
                h = h + 1
            end
        end
        delta = delta + 1
        n = n + 1
    end

    local result = table.concat(out)
    encodeCache[label] = result
    return result
end
--- Decode a single Punycode label (RFC 3492, section 6.2).
--
-- `label` is the encoded form WITHOUT the "xn--" ACE prefix. Everything
-- before the LAST delimiter is copied as literal basic code points;
-- everything after it is parsed as variable-length integers. Splitting on
-- the last (not first) delimiter is required so that labels whose literal
-- part itself contains "-" decode correctly.
--
-- @param label string  encoded label
-- @return string       decoded UTF-8 text; "" for an empty label
-- Raises "punycode.decode: bad input" when the literal part holds a
-- non-ASCII byte, a character is not a valid base-36 digit, the input ends
-- in the middle of an integer, or a decoded code point exceeds U+10FFFF.
function punycode.decode(label)
    if label == "" then return "" end
    local cached = decodeCache[label]
    if cached then return cached end

    local len = #label

    -- Position of the last delimiter, or 0 when there is none.
    local d = 0
    for p = len, 1, -1 do
        if label:sub(p, p) == delim then
            d = p
            break
        end
    end

    -- Literal (basic) code points preceding the delimiter.
    local cps = {}
    for i = 1, d - 1 do
        local b = label:byte(i)
        if b >= 0x80 then error("punycode.decode: bad input") end
        cps[#cps + 1] = b
    end

    local n, i_val, bias = initial_n, 0, initial_bias
    local pos = (d > 0) and (d + 1) or 1
    while pos <= len do
        -- Decode one generalized variable-length integer into i_val.
        local oldi, w, k = i_val, 1, base
        while true do
            if pos > len then error("punycode.decode: bad input") end
            local digit = basicToDigit(label:byte(pos))
            -- basicToDigit returns `base` for characters that are not digits.
            if digit >= base then error("punycode.decode: bad input") end
            pos = pos + 1
            i_val = i_val + digit * w
            local t
            if k <= bias then t = tmin
            elseif k >= bias + tmax then t = tmax
            else t = k - bias end
            if digit < t then break end
            w = w * (base - t)
            k = k + base
        end

        local count = #cps + 1
        bias = adapt(i_val - oldi, count, oldi == 0)
        n = n + math.floor(i_val / count)
        if n > 0x10FFFF then error("punycode.decode: bad input") end
        i_val = i_val % count
        -- i_val is a 0-based insertion index; Lua tables are 1-based.
        table.insert(cps, i_val + 1, n)
        i_val = i_val + 1
    end

    local result = fromCodePoints(cps)
    decodeCache[label] = result
    return result
end
----------------------------------------------------------------
-- Domain-level helpers
----------------------------------------------------------------
--- Remove a single trailing "." from a domain string, if present.
-- @param s string   domain text
-- @return string    `s` without its final "." (or `s` unchanged)
-- @return boolean   true when a trailing "." was present
local function stripTrailingDot(s)
    local hasDot = s:sub(-1) == "."
    if hasDot then
        return s:sub(1, -2), true
    end
    return s, false
end
--- Report whether a label consists solely of ASCII bytes (0x00-0x7F).
--
-- Only labels containing at least one non-ASCII byte need Punycode. The
-- test deliberately does not restrict ASCII labels to letters/digits/hyphen:
-- names such as "_dmarc" or "_sip" are plain ASCII and must be passed
-- through unchanged rather than being wrapped in "xn--".
--
-- @param label string  one domain label
-- @return boolean      true when no byte of `label` is >= 0x80
local function isPureASCII(label)
    return label:find("[\128-\255]") == nil
end
--- Convert a domain name to its ASCII-compatible form, label by label.
-- Labels accepted by isPureASCII() are copied unchanged; every other label
-- is replaced by "xn--" followed by its Punycode encoding. A single
-- trailing "." on the input is preserved on the output.
-- @param domain string  domain name (UTF-8)
-- @return string        converted domain; "" for empty input
function punycode.toASCII(domain)
    if domain == "" then return "" end

    local stripped, hadDot = stripTrailingDot(domain)
    local labels = {}
    for label in stripped:gmatch("([^%.]+)") do
        local converted = label
        if not isPureASCII(label) then
            converted = "xn--" .. punycode.encode(label)
        end
        labels[#labels + 1] = converted
    end

    local joined = table.concat(labels, ".")
    if hadDot then
        return joined .. "."
    end
    return joined
end
--- Convert a domain name from ASCII-compatible form back to Unicode.
--
-- Each label starting with "xn--" (case-insensitive) is Punycode-decoded;
-- other labels are copied unchanged. Following RFC 3490 section 4.2
-- ("ToUnicode never fails"), a label whose payload cannot be decoded, or
-- decodes to an empty string, is kept exactly as it was given instead of
-- aborting the whole conversion. A single trailing "." is preserved.
--
-- @param domain string  domain name
-- @return string        converted domain; "" for empty input
function punycode.toUnicode(domain)
    if domain == "" then return "" end
    local trailing
    domain, trailing = stripTrailingDot(domain)
    local out = {}
    for label in domain:gmatch("([^%.]+)") do
        local decoded = label
        if label:sub(1, 4):lower() == "xn--" then
            -- pcall: malformed Punycode must not take down the caller.
            local ok, result = pcall(punycode.decode, label:sub(5))
            if ok and result ~= "" then
                decoded = result
            end
        end
        out[#out + 1] = decoded
    end
    local res = table.concat(out, ".")
    return trailing and (res .. ".") or res
end
-- Export the public API table (encode, decode, toASCII, toUnicode).
return punycode