Jump to content

Module:Punycode: Difference between revisions

// via Wikitext Extension for VSCode
Tag: Reverted
// via Wikitext Extension for VSCode
Tag: Reverted
Line 1: Line 1:
-- Module:Punycode pure Lua 5.1 implementation of RFC 3492
----------------------------------------------------------------
-- Exposed API:
-- Module:Punycode – pure Lua 5.1 RFC-3492 implementation
--  punycode.encode(label)     – encode one DNS label (no dots) to Punycode
-- Public API
--  punycode.decode(label)     – decode one Punycode label
--  punycode.encode(label)       -> "4dbrk0ce"
--  punycode.toASCII(domain)   – Unicode → ASCII/IDNA (adds xn-- when needed)
--  punycode.decode(label)       -> "ישראל"
--  punycode.toUnicode(domain) – ASCII/IDNA → Unicode
--  punycode.toASCII(domain)     -> "xn--4dbrk0ce"
--
--  punycode.toUnicode(domain)   -> "ישראל"
-- No external dependencies; works in Scribunto and stock Lua 5.1.
----------------------------------------------------------------
 
local punycode = {}
local punycode = {}


Line 16: Line 15:
local skew, damp      = 38, 700
local skew, damp      = 38, 700
local initial_bias    = 72
local initial_bias    = 72
local initial_n        = 128       -- 0x80
local initial_n        = 128             -- 0x80
local delim            = '-'      -- ASCII hyphen
local delim            = "-"              -- ASCII hyphen


----------------------------------------------------------------
----------------------------------------------------------------
-- UTF-8 helpers (pure Lua 5.1 – no bit32 / utf8 libraries)
-- UTF-8 helpers (pure Lua 5.1)
----------------------------------------------------------------
----------------------------------------------------------------
local function toCodePoints(s)
local function toCodePoints(s)
Line 26: Line 25:
     while i <= len do
     while i <= len do
         local b1 = s:byte(i)
         local b1 = s:byte(i)
         if b1 < 0x80 then                       -- 1-byte
         if b1 < 0x80 then                       -- 1-byte sequence
             cps[#cps + 1] = b1
             cps[#cps + 1], i = b1, i + 1
            i = i + 1
         elseif b1 < 0xE0 then                   -- 2-byte sequence
         elseif b1 < 0xE0 then                   -- 2-byte
             local b2 = s:byte(i + 1)
             local b2 = s:byte(i + 1)
             cps[#cps + 1] = (b1 - 0xC0) * 0x40 + (b2 - 0x80)
             cps[#cps + 1] = (b1 - 0xC0) * 0x40 + (b2 - 0x80)
             i = i + 2
             i = i + 2
         elseif b1 < 0xF0 then                   -- 3-byte
         elseif b1 < 0xF0 then                   -- 3-byte sequence
             local b2, b3 = s:byte(i + 1, i + 2)
             local b2, b3 = s:byte(i + 1, i + 2)
             cps[#cps + 1] =
             cps[#cps + 1] =
                (b1 - 0xE0) * 0x1000 +
                  (b1 - 0xE0) * 0x1000
                 (b2 - 0x80) * 0x40 +
                 + (b2 - 0x80) * 0x40
                 (b3 - 0x80)
                 + (b3 - 0x80)
             i = i + 3
             i = i + 3
         else                                     -- 4-byte
         else                                   -- 4-byte sequence
             local b2, b3, b4 = s:byte(i + 1, i + 3)
             local b2, b3, b4 = s:byte(i + 1, i + 3)
             cps[#cps + 1] =
             cps[#cps + 1] =
                (b1 - 0xF0) * 0x40000 +
                  (b1 - 0xF0) * 0x40000
                 (b2 - 0x80) * 0x1000 +
                 + (b2 - 0x80) * 0x1000
                 (b3 - 0x80) * 0x40 +
                 + (b3 - 0x80) * 0x40
                 (b4 - 0x80)
                 + (b4 - 0x80)
             i = i + 4
             i = i + 4
         end
         end
Line 54: Line 52:


local function cpToUtf8(cp)
local function cpToUtf8(cp)
     if cp < 0x80 then
     if cp < 0x80   then return string.char(cp) end
        return string.char(cp)
     if cp < 0x800 then
     elseif cp < 0x800 then
         return string.char(
         return string.char(
             0xC0 + math.floor(cp / 0x40),
             0xC0 + math.floor(cp / 0x40),
             0x80 + (cp % 0x40)
             0x80 + (cp % 0x40)
         )
         )
     elseif cp < 0x10000 then
     end
    if cp < 0x10000 then
         return string.char(
         return string.char(
             0xE0 + math.floor(cp / 0x1000),
             0xE0 + math.floor(cp / 0x1000),
             0x80 + (math.floor(cp / 0x40) % 0x40),
             0x80 + (math.floor(cp / 0x40) % 0x40),
            0x80 + (cp % 0x40)
        )
    else
        return string.char(
            0xF0 + math.floor(cp / 0x40000),
            0x80 + (math.floor(cp / 0x1000) % 0x40),
            0x80 + (math.floor(cp / 0x40)  % 0x40),
             0x80 + (cp % 0x40)
             0x80 + (cp % 0x40)
         )
         )
     end
     end
    return string.char(
        0xF0 + math.floor(cp / 0x40000),
        0x80 + (math.floor(cp / 0x1000) % 0x40),
        0x80 + (math.floor(cp / 0x40)  % 0x40),
        0x80 + (cp % 0x40)
    )
end
end


local function fromCodePoints(cps)
local function fromCodePoints(cps)
     local buf = {}
     local out = {}
     for i = 1, #cps do
     for i = 1, #cps do out[i] = cpToUtf8(cps[i]) end
        buf[i] = cpToUtf8(cps[i])
     return table.concat(out)
    end
     return table.concat(buf)
end
end


Line 89: Line 84:
----------------------------------------------------------------
----------------------------------------------------------------
local function digitToBasic(d)
local function digitToBasic(d)
     return string.char(d < 26 and (d + 97) or (d - 26 + 48)) -- a-z / 0-9
     return string.char(d < 26 and (d + 97) or (d - 26 + 48)) -- a-z / 0-9
end
end


local function basicToDigit(byte)
local function basicToDigit(byte)
     if byte >= 48 and byte <= 57  then return byte - 22 end -- '0'-'9' → 26-35
     if byte >= 48 and byte <= 57  then return byte - 22 end   -- '0'-'9' → 26-35
     if byte >= 65 and byte <= 90  then return byte - 65 end -- 'A'-'Z'
     if byte >= 65 and byte <= 90  then return byte - 65 end   -- 'A'-'Z'
     if byte >= 97 and byte <= 122 then return byte - 97 end -- 'a'-'z'
     if byte >= 97 and byte <= 122 then return byte - 97 end   -- 'a'-'z'
     return base                                             -- invalid
     return base                                               -- invalid
end
end


Line 116: Line 111:


function punycode.encode(label)
function punycode.encode(label)
     if label == ""       then return "" end
     if label == ""         then return "" end
     if encodeCache[label] then return encodeCache[label] end
     if encodeCache[label]   then return encodeCache[label] end
     if label:find("%.")   then error("punycode.encode: single label expected") end
     if label:find("%.")     then error("punycode.encode: single label expected") end


     label = string.lower(label)               -- IDNA is case-insensitive
     label = string.lower(label)                 -- IDNA is case-insensitive
     local cps = toCodePoints(label)
     local cps   = toCodePoints(label)
     local out = {}
     local out   = {}
     local n, bias = initial_n, initial_bias
     local n     = initial_n
     local delta, basic = 0, 0
    local bias   = initial_bias
     local delta = 0
    local basic = 0


     -- copy ASCII code points
     -- copy basic ASCII code points
     for _, cp in ipairs(cps) do
     for _, cp in ipairs(cps) do
         if cp < 0x80 then
         if cp < 0x80 then
Line 133: Line 130:
         end
         end
     end
     end
     if basic > 0 then out[#out + 1] = delim end
    -- delimiter only if mixture of basic + non-basic
     if basic > 0 and basic < #cps then out[#out + 1] = delim end


     local h = basic
     local h = basic
     while h < #cps do
     while h < #cps do
        -- smallest cp ≥ n
         local m = 0x7FFFFFFF
         local m = 0x7fffffff
         for _, cp in ipairs(cps) do
         for _, cp in ipairs(cps) do
             if cp >= n and cp < m then m = cp end
             if cp >= n and cp < m then m = cp end
Line 152: Line 149:
                 while true do
                 while true do
                     local t
                     local t
                     if    k <= bias         then t = tmin
                     if    k <= bias         then t = tmin
                     elseif k >= bias + tmax then t = tmax
                     elseif k >= bias + tmax   then t = tmax
                     else                         t = k - bias end
                     else                         t = k - bias end
                     if q < t then break end
                     if q < t then break end
                     out[#out + 1] = digitToBasic(t + (q - t) % (base - t))
                     out[#out + 1] = digitToBasic(t + (q - t) % (base - t))
                     q   = math.floor((q - t) / (base - t))
                     q = math.floor((q - t) / (base - t))
                     k   = k + base
                     k = k + base
                 end
                 end
                 out[#out + 1] = digitToBasic(q)
                 out[#out + 1] = digitToBasic(q)
Line 176: Line 173:


function punycode.decode(label)
function punycode.decode(label)
     if label == ""       then return "" end
     if label == ""         then return "" end
     if decodeCache[label] then return decodeCache[label] end
     if decodeCache[label]   then return decodeCache[label] end


     local cps, d = {}, (label:find(delim, 1, true) or 0)
     local cps, d = {}, (label:find(delim, 1, true) or 0)
     for i = 1, d - 1 do
     for i = 1, d - 1 do cps[#cps + 1] = label:byte(i) end
        cps[#cps + 1] = label:byte(i)
    end


     local n, i_val, bias = initial_n, 0, initial_bias
     local n, i_val, bias = initial_n, 0, initial_bias
Line 195: Line 190:
             i_val = i_val + digit * w
             i_val = i_val + digit * w
             local t
             local t
             if    k <= bias         then t = tmin
             if    k <= bias         then t = tmin
             elseif k >= bias + tmax then t = tmax
             elseif k >= bias + tmax   then t = tmax
             else                         t = k - bias end
             else                         t = k - bias end
             if digit < t then break end
             if digit < t then break end
             w = w * (base - t)
             w = w * (base - t)
Line 215: Line 210:


----------------------------------------------------------------
----------------------------------------------------------------
-- Domain-level helpers (handle dots; call encode/decode per label)
-- Domain-level helpers
----------------------------------------------------------------
----------------------------------------------------------------
local function stripTrailingDot(s)
local function stripTrailingDot(s)
     return (s:sub(-1) == '.' and s:sub(1, -2) or s), (s:sub(-1) == '.')
     return (s:sub(-1) == "." and s:sub(1, -2) or s),
          (s:sub(-1) == ".")
end
 
-- quick ASCII-only test: letters, digits, hyphen
local function isPureASCII(label)
    return label:match("^[%a%d%-]+$") ~= nil
end
end


-- Unicode → ASCII/IDNA
function punycode.toASCII(domain)
function punycode.toASCII(domain)
     if domain == "" then return "" end
     if domain == "" then return "" end
Line 229: Line 229:
     local out = {}
     local out = {}
     for label in domain:gmatch("([^%.]+)") do
     for label in domain:gmatch("([^%.]+)") do
         -- any non-ASCII byte (128–255)?
         if isPureASCII(label) then
        if label:find("[\128-\255]") then
            out[#out + 1] = label
        else
             out[#out + 1] = "xn--" .. punycode.encode(label)
             out[#out + 1] = "xn--" .. punycode.encode(label)
        else
            out[#out + 1] = label
         end
         end
     end
     end
Line 240: Line 239:
end
end


-- ASCII/IDNA → Unicode
function punycode.toUnicode(domain)
function punycode.toUnicode(domain)
     if domain == "" then return "" end
     if domain == "" then return "" end

Revision as of 00:04, 16 May 2025

Documentation for this module may be created at Module:Punycode/doc

----------------------------------------------------------------
-- Module:Punycode – pure Lua 5.1 RFC-3492 implementation
-- Public API
--   punycode.encode(label)       -> "4dbrk0ce"
--   punycode.decode(label)       -> "ישראל"
--   punycode.toASCII(domain)     -> "xn--4dbrk0ce"
--   punycode.toUnicode(domain)   -> "ישראל"
----------------------------------------------------------------
local punycode = {}

----------------------------------------------------------------
-- RFC 3492 constants
----------------------------------------------------------------
-- Bootstring parameters; the names mirror those used in RFC 3492.
-- base/tmin/tmax bound the variable-length digit thresholds, skew/damp
-- are consumed by adapt(), and initial_bias/initial_n seed the
-- encoder and decoder state.
local base, tmin, tmax = 36, 1, 26
local skew, damp       = 38, 700
local initial_bias     = 72
local initial_n        = 128              -- 0x80
local delim            = "-"              -- ASCII hyphen

----------------------------------------------------------------
-- UTF-8 helpers (pure Lua 5.1)
----------------------------------------------------------------
-- Decode a UTF-8 byte string into an array of numeric code points.
--
-- @param s  string of UTF-8 bytes
-- @return   sequence table of code points, in input order
--
-- Raises "punycode: malformed UTF-8 input" when a lead byte is not a
-- valid sequence starter (0x80-0xBF or above 0xF7) or when a
-- continuation byte is missing or outside 0x80-0xBF.  Overlong forms
-- and surrogate code points are not rejected.
local function toCodePoints(s)
    local cps, i, len = {}, 1, #s

    -- Return the 6 payload bits of the continuation byte at `pos`,
    -- failing loudly instead of doing arithmetic on nil / stray bytes.
    local function tail(pos)
        local b = s:byte(pos)
        if not b or b < 0x80 or b > 0xBF then
            error("punycode: malformed UTF-8 input")
        end
        return b - 0x80
    end

    while i <= len do
        local b1 = s:byte(i)
        if b1 < 0x80 then                       -- 1-byte sequence
            cps[#cps + 1] = b1
            i = i + 1
        elseif b1 < 0xC0 or b1 > 0xF7 then      -- stray continuation / bad lead
            error("punycode: malformed UTF-8 input")
        elseif b1 < 0xE0 then                   -- 2-byte sequence
            cps[#cps + 1] = (b1 - 0xC0) * 0x40 + tail(i + 1)
            i = i + 2
        elseif b1 < 0xF0 then                   -- 3-byte sequence
            cps[#cps + 1] =
                  (b1 - 0xE0) * 0x1000
                + tail(i + 1) * 0x40
                + tail(i + 2)
            i = i + 3
        else                                    -- 4-byte sequence
            cps[#cps + 1] =
                  (b1 - 0xF0) * 0x40000
                + tail(i + 1) * 0x1000
                + tail(i + 2) * 0x40
                + tail(i + 3)
            i = i + 4
        end
    end
    return cps
end

-- Serialise one numeric code point as a UTF-8 byte string.
-- Chooses a 1-, 2-, 3- or 4-byte form from the magnitude of `cp`.
--
-- @param cp  numeric code point
-- @return    string holding the encoded bytes
local function cpToUtf8(cp)
    local char, floor = string.char, math.floor

    if cp < 0x80 then
        return char(cp)
    end

    -- Every multi-byte form ends with the low six bits of the code point.
    local last = 0x80 + (cp % 0x40)

    if cp < 0x800 then
        return char(0xC0 + floor(cp / 0x40), last)
    elseif cp < 0x10000 then
        local mid = 0x80 + (floor(cp / 0x40) % 0x40)
        return char(0xE0 + floor(cp / 0x1000), mid, last)
    end

    local second = 0x80 + (floor(cp / 0x1000) % 0x40)
    local third  = 0x80 + (floor(cp / 0x40) % 0x40)
    return char(0xF0 + floor(cp / 0x40000), second, third, last)
end

-- Build a UTF-8 string from a sequence of code points by encoding
-- each entry with cpToUtf8 and joining the pieces.
--
-- @param cps  sequence table of numeric code points
-- @return     concatenated UTF-8 string
local function fromCodePoints(cps)
    local pieces = {}
    local count = #cps
    for idx = 1, count do
        pieces[idx] = cpToUtf8(cps[idx])
    end
    return table.concat(pieces)
end

----------------------------------------------------------------
-- Punycode helpers
----------------------------------------------------------------
-- Map a Punycode digit value to its one-character representation:
-- values below 26 become 'a'..'z', the rest continue at '0'.
--
-- @param d  numeric digit value
-- @return   one-character string
local function digitToBasic(d)
    local code
    if d < 26 then
        code = d + 97            -- offset from 'a'
    else
        code = d - 26 + 48       -- offset from '0'
    end
    return string.char(code)
end

-- Map the byte value of a basic code point to its Punycode digit.
-- Letters are accepted in either case; any other byte yields `base`,
-- which callers can treat as "not a digit".
--
-- @param byte  numeric byte value
-- @return      digit in 0..35, or `base` for a non-digit byte
local function basicToDigit(byte)
    if 48 <= byte and byte <= 57 then           -- '0'-'9' → 26-35
        return byte - 22
    elseif 65 <= byte and byte <= 90 then       -- 'A'-'Z' → 0-25
        return byte - 65
    elseif 97 <= byte and byte <= 122 then      -- 'a'-'z' → 0-25
        return byte - 97
    end
    return base                                 -- invalid
end

-- Bias adaptation step: compute the next bias from the delta just
-- processed.
--
-- @param delta      delta that was just encoded or decoded
-- @param numpoints  number of code points handled so far, including this one
-- @param first      truthy when this is the first adaptation for the label
-- @return           the new bias
local function adapt(delta, numpoints, first)
    -- The first delta is damped harder than later ones.
    if first then
        delta = math.floor(delta / damp)
    else
        delta = math.floor(delta / 2)
    end
    delta = delta + math.floor(delta / numpoints)

    local span      = base - tmin
    local threshold = (span * tmax) / 2
    local offset    = 0
    while delta > threshold do
        delta  = math.floor(delta / span)
        offset = offset + base
    end
    return offset + math.floor(((span + 1) * delta) / (delta + skew))
end

----------------------------------------------------------------
-- Encode / decode a single label
----------------------------------------------------------------
-- Memo tables consulted and filled by punycode.encode and
-- punycode.decode; keys are label strings, values the computed results.
local encodeCache, decodeCache = {}, {}

-- Encode a single label (no dots) to Punycode, without the "xn--" prefix.
--
-- The label is lower-cased with string.lower before encoding.  Results
-- are memoised in encodeCache.
--
-- @param label  UTF-8 string holding one label
-- @return       Punycode string; "" for an empty label
--
-- Raises "punycode.encode: single label expected" if the label
-- contains a dot.
function punycode.encode(label)
    if label == ""          then return "" end
    if encodeCache[label]   then return encodeCache[label] end
    if label:find("%.")     then error("punycode.encode: single label expected") end

    local key = label                           -- caller's spelling, for the cache
    label = string.lower(label)                 -- IDNA is case-insensitive
    local cps    = toCodePoints(label)
    local out    = {}
    local n      = initial_n
    local bias   = initial_bias
    local delta  = 0
    local basic  = 0

    -- copy basic ASCII code points
    for _, cp in ipairs(cps) do
        if cp < 0x80 then
            out[#out + 1] = string.char(cp)
            basic = basic + 1
        end
    end
    -- RFC 3492 §6.3: the delimiter follows the basic code points whenever
    -- there is at least one, even when no non-basic code points follow.
    -- Without it an all-basic label cannot be decoded back.
    if basic > 0 then out[#out + 1] = delim end

    local h = basic                             -- code points handled so far
    while h < #cps do
        -- smallest code point >= n that is still unhandled
        local m = 0x7FFFFFFF
        for _, cp in ipairs(cps) do
            if cp >= n and cp < m then m = cp end
        end
        delta = delta + (m - n) * (h + 1)
        n = m

        for _, cp in ipairs(cps) do
            if cp < n then
                delta = delta + 1
            elseif cp == n then
                -- emit delta as a generalized variable-length integer
                local q, k = delta, base
                while true do
                    local t
                    if     k <= bias          then t = tmin
                    elseif k >= bias + tmax   then t = tmax
                    else                          t = k - bias end
                    if q < t then break end
                    out[#out + 1] = digitToBasic(t + (q - t) % (base - t))
                    q  = math.floor((q - t) / (base - t))
                    k  = k + base
                end
                out[#out + 1] = digitToBasic(q)
                bias  = adapt(delta, h + 1, h == basic)
                delta = 0
                h     = h + 1
            end
        end
        delta = delta + 1
        n     = n + 1
    end

    local result = table.concat(out)
    -- Store under both spellings so a repeated mixed-case lookup hits the cache.
    encodeCache[label] = result
    encodeCache[key]   = result
    return result
end

-- Decode a single Punycode label (without the "xn--" prefix) to UTF-8.
--
-- Results are memoised in decodeCache.
--
-- @param label  Punycode string for one label
-- @return       decoded UTF-8 string; "" for an empty label
--
-- Raises "punycode.decode: bad input" when the input ends in the
-- middle of a number, contains a character that is not a Punycode
-- digit, or has a non-basic byte before the delimiter.
function punycode.decode(label)
    if label == ""          then return "" end
    if decodeCache[label]   then return decodeCache[label] end

    -- RFC 3492 §6.2: the basic code points are everything before the LAST
    -- delimiter; the basic part may itself contain hyphens.
    local d = 0
    for idx = #label, 1, -1 do
        if label:sub(idx, idx) == delim then
            d = idx
            break
        end
    end

    local cps = {}
    for i = 1, d - 1 do
        local b = label:byte(i)
        -- only basic (ASCII) code points may precede the delimiter
        if b >= 0x80 then error("punycode.decode: bad input") end
        cps[#cps + 1] = b
    end

    local n, i_val, bias = initial_n, 0, initial_bias
    local pos, len       = (d > 0) and (d + 1) or 1, #label

    while pos <= len do
        -- decode one generalized variable-length integer into i_val
        local oldi, w, k = i_val, 1, base
        while true do
            if pos > len then error("punycode.decode: bad input") end
            local digit = basicToDigit(label:byte(pos))
            -- basicToDigit returns `base` for characters that are not digits
            if digit >= base then error("punycode.decode: bad input") end
            pos   = pos + 1
            i_val = i_val + digit * w
            local t
            if     k <= bias          then t = tmin
            elseif k >= bias + tmax   then t = tmax
            else                          t = k - bias end
            if digit < t then break end
            w = w * (base - t)
            k = k + base
        end
        bias = adapt(i_val - oldi, #cps + 1, oldi == 0)
        -- i_val wraps around the output length; each wrap bumps n
        n    = n + math.floor(i_val / (#cps + 1))
        i_val = i_val % (#cps + 1)
        table.insert(cps, i_val + 1, n)
        i_val = i_val + 1
    end

    local result = fromCodePoints(cps)
    decodeCache[label] = result
    return result
end

----------------------------------------------------------------
-- Domain-level helpers
----------------------------------------------------------------
-- Split off a single trailing "." from a domain string.
--
-- @param s  domain string
-- @return   the string without its final dot (unchanged if it had none)
-- @return   boolean, true when a trailing dot was present
local function stripTrailingDot(s)
    local hadDot = s:sub(-1) == "."
    if hadDot then
        return s:sub(1, -2), true
    end
    return s, false
end

-- quick ASCII-only test: letters, digits, hyphen
-- Report whether a label consists solely of ASCII bytes.
--
-- The test is "no byte >= 0x80" rather than "letters, digits, hyphen":
-- ASCII labels that contain other characters (for example the
-- underscore in "_dmarc") must also count as ASCII, otherwise callers
-- would Punycode-encode labels that need no encoding.
--
-- @param label  label string
-- @return       true when no byte of `label` is in the range 128-255
local function isPureASCII(label)
    return label:find("[\128-\255]") == nil
end

-- Convert a domain name to its ASCII form, label by label.
-- Labels accepted by isPureASCII pass through untouched; every other
-- label is replaced by "xn--" followed by its Punycode encoding.
-- A single trailing dot on the input is preserved on the output.
--
-- @param domain  domain string with labels separated by "."
-- @return        converted domain string; "" for empty input
function punycode.toASCII(domain)
    if domain == "" then return "" end

    local name, hadDot = stripTrailingDot(domain)
    local labels = {}
    for label in name:gmatch("([^%.]+)") do
        local converted = label
        if not isPureASCII(label) then
            converted = "xn--" .. punycode.encode(label)
        end
        labels[#labels + 1] = converted
    end

    local joined = table.concat(labels, ".")
    if hadDot then
        joined = joined .. "."
    end
    return joined
end

-- Convert a domain name to its Unicode form, label by label.
-- Labels whose first four characters are "xn--" (in any letter case)
-- are decoded with punycode.decode; all other labels pass through.
-- A single trailing dot on the input is preserved on the output.
--
-- @param domain  domain string with labels separated by "."
-- @return        converted domain string; "" for empty input
function punycode.toUnicode(domain)
    if domain == "" then return "" end

    local name, hadDot = stripTrailingDot(domain)
    local labels = {}
    for label in name:gmatch("([^%.]+)") do
        local converted = label
        local prefix = label:sub(1, 4):lower()
        if prefix == "xn--" then
            converted = punycode.decode(label:sub(5))
        end
        labels[#labels + 1] = converted
    end

    local joined = table.concat(labels, ".")
    if hadDot then
        joined = joined .. "."
    end
    return joined
end

return punycode