Jump to content

Module:Punycode: Difference between revisions

// via Wikitext Extension for VSCode
Tag: Reverted
// via Wikitext Extension for VSCode
Tag: Manual revert
Line 1: Line 1:
----------------------------------------------------------------
-- Module:Punycode
-- Module:Punycode – pure Lua 5.1 RFC-3492 implementation
-- Implements RFC3492 (Punycode) encoding and decoding; requires mw.ustring for proper Unicode support
-- Public API
 
--   punycode.encode(label)       -> "4dbrk0ce"
--  punycode.decode(label)      -> "ישראל"
--  punycode.toASCII(domain)    -> "xn--4dbrk0ce"
--  punycode.toUnicode(domain)  -> "ישראל"
----------------------------------------------------------------
local punycode = {}
local punycode = {}


----------------------------------------------------------------
-- Cache for frequently processed strings (persists during a single page render)
-- RFC 3492 constants
local encodeCache = {}
----------------------------------------------------------------
local decodeCache = {}
local base, tmin, tmax = 36, 1, 26
local skew, damp      = 38, 700
local initial_bias    = 72
local initial_n        = 128              -- 0x80
local delim            = "-"              -- ASCII hyphen


----------------------------------------------------------------
--------------------------
-- UTF-8 helpers (pure Lua 5.1)
-- Configuration Constants
----------------------------------------------------------------
--------------------------
local base        = 36
local tmin        = 1
local tmax        = 26
local skew        = 38
local damp        = 700
local initial_bias = 72
local initial_n    = 128  -- 0x80
local delimiter    = '-'  -- ASCII hyphen
 
--------------------------
-- Helper functions for Unicode handling.
--------------------------
-- Converts a UTF-8 string to an array of Unicode code points.
local function toCodePoints(s)
local function toCodePoints(s)
     local cps, i, len = {}, 1, #s
     if not s or s == "" then
     while i <= len do
        return {}
        local b1 = s:byte(i)
     end
        if b1 < 0x80 then                      -- 1-byte sequence
   
            cps[#cps + 1], i = b1, i + 1
    -- Estimate the number of characters (may not be exact for multi-byte chars)
        elseif b1 < 0xE0 then                  -- 2-byte sequence
    local len = mw.ustring.len(s)
            local b2 = s:byte(i + 1)
    local cps = {}
            cps[#cps + 1] = (b1 - 0xC0) * 0x40 + (b2 - 0x80)
   
            i = i + 2
    -- Pre-allocate the table with an index counter for direct assignment
        elseif b1 < 0xF0 then                  -- 3-byte sequence
    local i = 1
            local b2, b3 = s:byte(i + 1, i + 2)
    for char in mw.ustring.gmatch(s, ".") do
            cps[#cps + 1] =
        cps[i] = mw.ustring.codepoint(char)
                  (b1 - 0xE0) * 0x1000
        i = i + 1
                + (b2 - 0x80) * 0x40
                + (b3 - 0x80)
            i = i + 3
        else                                    -- 4-byte sequence
            local b2, b3, b4 = s:byte(i + 1, i + 3)
            cps[#cps + 1] =
                  (b1 - 0xF0) * 0x40000
                + (b2 - 0x80) * 0x1000
                + (b3 - 0x80) * 0x40
                + (b4 - 0x80)
            i = i + 4
        end
     end
     end
   
     return cps
     return cps
end
end


local function cpToUtf8(cp)
-- Converts an array of Unicode code points to a UTF-8 string.
     if cp < 0x80  then return string.char(cp) end
local function fromCodePoints(cps)
    if cp < 0x800  then
     if not cps or #cps == 0 then
         return string.char(
         return ""
            0xC0 + math.floor(cp / 0x40),
            0x80 + (cp % 0x40)
        )
     end
     end
     if cp < 0x10000 then
      
        return string.char(
    -- Pre-allocate the table with the exact size needed
            0xE0 + math.floor(cp / 0x1000),
    local chars = {}
            0x80 + (math.floor(cp / 0x40) % 0x40),
    for i = 1, #cps do
            0x80 + (cp % 0x40)
        chars[i] = mw.ustring.char(cps[i])
        )
     end
     end
     return string.char(
   
        0xF0 + math.floor(cp / 0x40000),
     return table.concat(chars)
        0x80 + (math.floor(cp / 0x1000) % 0x40),
        0x80 + (math.floor(cp / 0x40)  % 0x40),
        0x80 + (cp % 0x40)
    )
end
end


local function fromCodePoints(cps)
--------------------------
     local out = {}
-- Digit conversion functions
    for i = 1, #cps do out[i] = cpToUtf8(cps[i]) end
--------------------------
     return table.concat(out)
local function digitToBasic(digit)
     if digit < 26 then
        return string.char(digit + string.byte('a'))
     else
        return string.char(digit - 26 + string.byte('0'))
    end
end
end


----------------------------------------------------------------
local function basicToDigit(cp)
-- Punycode helpers
     if cp >= string.byte('0') and cp <= string.byte('9') then
----------------------------------------------------------------
        return cp - string.byte('0') + 26
local function digitToBasic(d)
     elseif cp >= string.byte('A') and cp <= string.byte('Z') then
     return string.char(d < 26 and (d + 97) or (d - 26 + 48)) -- a-z / 0-9
        return cp - string.byte('A')
end
     elseif cp >= string.byte('a') and cp <= string.byte('z') then
 
        return cp - string.byte('a')
local function basicToDigit(byte)
     else
    if byte >= 48 and byte <= 57  then return byte - 22 end  -- '0'-'9' → 26-35
        return base
     if byte >= 65 and byte <= 90  then return byte - 65 end  -- 'A'-'Z'
    end
     if byte >= 97 and byte <= 122 then return byte - 97 end  -- 'a'-'z'
     return base                                               -- invalid
end
end


--------------------------
-- Bias adaptation (RFC3492, Section 3.4)
--------------------------
local function adapt(delta, numpoints, first)
local function adapt(delta, numpoints, first)
     delta = first and math.floor(delta / damp) or math.floor(delta / 2)
     if first then
        delta = math.floor(delta / damp)
    else
        delta = math.floor(delta / 2)
    end
     delta = delta + math.floor(delta / numpoints)
     delta = delta + math.floor(delta / numpoints)
     local k = 0
     local k = 0
Line 105: Line 99:
end
end


----------------------------------------------------------------
--------------------------
-- Encode / decode a single label
-- Punycode Encoding Function
----------------------------------------------------------------
--------------------------
local encodeCache, decodeCache = {}, {}
function punycode.encode(input)
    -- Input validation and cache check
    if not input or input == "" then
        return ""
    end
   
    input = mw.ustring.lower(input)
    -- Check cache first for previously encoded strings
    if encodeCache[input] then
        return encodeCache[input]
    end
   
    local output = {}
    local cp_array = toCodePoints(input)
    local n = initial_n
    local delta = 0
    local bias = initial_bias
    local basic_count = 0


function punycode.encode(label)
    -- Copy basic code points (ASCII < 128)
     if label == ""          then return "" end
     for _, cp in ipairs(cp_array) do
    if encodeCache[label]  then return encodeCache[label] end
        if cp < 128 then
    if label:find("%.")    then error("punycode.encode: single label expected") end
            table.insert(output, mw.ustring.char(cp))
 
            basic_count = basic_count + 1
    label = string.lower(label)                 -- IDNA is case-insensitive
        end
    local cps    = toCodePoints(label)
     end
    local out    = {}
    local n      = initial_n
     local bias  = initial_bias
    local delta  = 0
    local basic  = 0


     -- copy basic ASCII code points
     local h = basic_count
     for _, cp in ipairs(cps) do
     if basic_count > 0 then
        if cp < 0x80 then
        table.insert(output, delimiter)
            out[#out + 1] = string.char(cp)
            basic = basic + 1
        end
     end
     end
    -- delimiter only if mixture of basic + non-basic
    if basic > 0 and basic < #cps then out[#out + 1] = delim end


    local h = basic
     while h < #cp_array do
     while h < #cps do
         local m = 0x7FFFFFFF
         local m = 0x7FFFFFFF
         for _, cp in ipairs(cps) do
         for _, cp in ipairs(cp_array) do
             if cp >= n and cp < m then m = cp end
             if cp >= n and cp < m then
                m = cp
            end
         end
         end
         delta = delta + (m - n) * (h + 1)
         delta = delta + (m - n) * (h + 1)
         n = m
         n = m
 
         for _, cp in ipairs(cp_array) do
         for _, cp in ipairs(cps) do
             if cp < n then
             if cp < n then
                 delta = delta + 1
                 delta = delta + 1
             elseif cp == n then
             elseif cp == n then
                 local q, k = delta, base
                 local q = delta
                local k = base
                 while true do
                 while true do
                     local t
                     local t
                     if     k <= bias         then t = tmin
                     if k <= bias then
                     elseif k >= bias + tmax   then t = tmax
                        t = tmin
                     else                         t = k - bias end
                     elseif k >= bias + tmax then
                        t = tmax
                     else
                        t = k - bias
                    end
                     if q < t then break end
                     if q < t then break end
                     out[#out + 1] = digitToBasic(t + (q - t) % (base - t))
                     local code = t + ((q - t) % (base - t))
                     q = math.floor((q - t) / (base - t))
                    table.insert(output, digitToBasic(code))
                     k = k + base
                     q = math.floor((q - t) / (base - t))
                     k = k + base
                 end
                 end
                 out[#out + 1] = digitToBasic(q)
                 table.insert(output, digitToBasic(q))
                 bias = adapt(delta, h + 1, h == basic)
                 bias = adapt(delta, h + 1, h == basic_count)
                 delta = 0
                 delta = 0
                 h     = h + 1
                 h = h + 1
             end
             end
         end
         end
         delta = delta + 1
         delta = delta + 1
         n     = n + 1
         n = n + 1
     end
     end


     local result = table.concat(out)
     local result = table.concat(output)
     encodeCache[label] = result
   
    -- Cache the result before returning
     encodeCache[input] = result
     return result
     return result
end
end


function punycode.decode(label)
--------------------------
     if label == ""         then return "" end
-- Punycode Decoding Function
     if decodeCache[label]   then return decodeCache[label] end
--------------------------
function punycode.decode(input)
    -- Input validation and cache check
     if not input or input == "" then
        return ""
    end
   
    -- Check cache first for previously decoded strings
     if decodeCache[input] then
        return decodeCache[input]
    end
   
    local cp_array = {}
    local d = input:find(delimiter, 1, true)
    local b = 0
    if d then
        for i = 1, d - 1 do
            local cp = input:byte(i)
            table.insert(cp_array, cp)
            b = b + 1
        end
    else
        d = 0
    end


     local cps, d = {}, (label:find(delim, 1, true) or 0)
     local n = initial_n
     for i = 1, d - 1 do cps[#cps + 1] = label:byte(i) end
    local bias = initial_bias
    local i = 0
     local index = d + 1
    local input_len = #input


     local n, i_val, bias = initial_n, 0, initial_bias
     while index <= input_len do
    local pos, len      = (d > 0) and (d + 1) or 1, #label
        local oldi = i
 
        local w = 1
    while pos <= len do
         local k = base
         local oldi, w, k = i_val, 1, base
         while true do
         while true do
             if pos > len then error("punycode.decode: bad input") end
             if index > input_len then
             local digit = basicToDigit(label:byte(pos))
                error("Invalid input: punycode decode incomplete")
             pos  = pos + 1
            end
             i_val = i_val + digit * w
             local digit = basicToDigit(input:byte(index))
             index = index + 1
             i = i + digit * w
             local t
             local t
             if     k <= bias         then t = tmin
             if k <= bias then
             elseif k >= bias + tmax   then t = tmax
                t = tmin
             else                         t = k - bias end
             elseif k >= bias + tmax then
                t = tmax
             else
                t = k - bias
            end
             if digit < t then break end
             if digit < t then break end
             w = w * (base - t)
             w = w * (base - t)
             k = k + base
             k = k + base
         end
         end
         bias = adapt(i_val - oldi, #cps + 1, oldi == 0)
         bias = adapt(i - oldi, #cp_array + 1, oldi == 0)
         n   = n + math.floor(i_val / (#cps + 1))
         n = n + math.floor(i / (#cp_array + 1))
         i_val = i_val % (#cps + 1)
         i = i % (#cp_array + 1)
         table.insert(cps, i_val + 1, n)
         table.insert(cp_array, i + 1, n)
         i_val = i_val + 1
         i = i + 1
     end
     end


     local result = fromCodePoints(cps)
     local result = fromCodePoints(cp_array)
     decodeCache[label] = result
   
    -- Cache the result before returning
     decodeCache[input] = result
     return result
     return result
end
----------------------------------------------------------------
-- Domain-level helpers
----------------------------------------------------------------
local function stripTrailingDot(s)
    return (s:sub(-1) == "." and s:sub(1, -2) or s),
          (s:sub(-1) == ".")
end
-- quick ASCII-only test: letters, digits, hyphen
local function isPureASCII(label)
    return label:match("^[%a%d%-]+$") ~= nil
end
function punycode.toASCII(domain)
    if domain == "" then return "" end
    local trailing
    domain, trailing = stripTrailingDot(domain)
    local out = {}
    for label in domain:gmatch("([^%.]+)") do
        if isPureASCII(label) then
            out[#out + 1] = label
        else
            out[#out + 1] = "xn--" .. punycode.encode(label)
        end
    end
    local res = table.concat(out, ".")
    return trailing and (res .. ".") or res
end
function punycode.toUnicode(domain)
    if domain == "" then return "" end
    local trailing
    domain, trailing = stripTrailingDot(domain)
    local out = {}
    for label in domain:gmatch("([^%.]+)") do
        if label:sub(1, 4):lower() == "xn--" then
            out[#out + 1] = punycode.decode(label:sub(5))
        else
            out[#out + 1] = label
        end
    end
    local res = table.concat(out, ".")
    return trailing and (res .. ".") or res
end
end


return punycode
return punycode

Revision as of 00:05, 16 May 2025

Documentation for this module may be created at Module:Punycode/doc

-- Module:Punycode
-- Implements RFC3492 (Punycode) encoding and decoding; requires mw.ustring for proper Unicode support

-- Module table; the public functions are attached to it further down.
local punycode = {}

-- Memoisation tables for already processed strings
-- (persist during a single page render).
local encodeCache, decodeCache = {}, {}

--------------------------
-- Configuration Constants (RFC 3492 parameter values)
--------------------------
local base, tmin, tmax = 36, 1, 26
local skew, damp = 38, 700
local initial_bias, initial_n = 72, 128   -- initial_n is 0x80
local delimiter = '-'                     -- ASCII hyphen

--------------------------
-- Helper functions for Unicode handling.
--------------------------
-- Converts a UTF-8 string to an array of Unicode code points.
-- @param s  UTF-8 string; nil and "" are both accepted
-- @return   sequence of code points in string order ({} for nil or "")
local function toCodePoints(s)
    if not s or s == "" then
        return {}
    end

    -- mw.ustring.gcodepoint walks the code points directly, so there is no
    -- need to pattern-match every character and convert it separately (the
    -- earlier version also computed the string length without using it).
    local cps = {}
    for cp in mw.ustring.gcodepoint(s) do
        cps[#cps + 1] = cp
    end

    return cps
end

-- Builds a UTF-8 string from an array of Unicode code points.
-- A nil or empty array yields "".
local function fromCodePoints(cps)
    local count = cps and #cps or 0
    if count == 0 then
        return ""
    end

    -- Encode each code point on its own, then join the pieces in one go.
    local pieces = {}
    for idx = 1, count do
        pieces[idx] = mw.ustring.char(cps[idx])
    end
    return table.concat(pieces)
end

--------------------------
-- Digit conversion functions
--------------------------
-- Maps a digit value to its one-character basic code point:
-- values below 26 count up from 'a', all others count up from '0'
-- after subtracting 26.
local function digitToBasic(digit)
    local origin, offset
    if digit < 26 then
        origin, offset = 'a', digit
    else
        origin, offset = '0', digit - 26
    end
    return string.char(offset + string.byte(origin))
end

-- Maps the byte value of a basic code point to its digit value:
-- '0'..'9' -> 26..35, 'A'..'Z' and 'a'..'z' -> 0..25.
-- Any other value yields `base`, i.e. a number that is not a valid digit.
local function basicToDigit(cp)
    if cp >= 48 and cp <= 57 then       -- '0'..'9'
        return cp - 22                  -- same as cp - 48 + 26
    end
    if cp >= 65 and cp <= 90 then       -- 'A'..'Z'
        return cp - 65
    end
    if cp >= 97 and cp <= 122 then      -- 'a'..'z'
        return cp - 97
    end
    return base
end

--------------------------
-- Bias adaptation (RFC3492, Section 3.4)
--------------------------
-- Computes the next bias from the delta that was just encoded/decoded.
-- @param delta      the delta value to adapt from
-- @param numpoints  number of code points handled so far (used as a divisor)
-- @param first      truthy for the very first adaptation (divides by `damp`
--                   instead of 2)
-- @return           the new bias
local function adapt(delta, numpoints, first)
    local divisor = first and damp or 2
    local scaled = math.floor(delta / divisor)
    scaled = scaled + math.floor(scaled / numpoints)

    -- Repeatedly shrink the value until it fits under the limit, adding one
    -- `base` to the result for every division performed.
    local span = base - tmin
    local limit = (span * tmax) / 2
    local k = 0
    while scaled > limit do
        scaled = math.floor(scaled / span)
        k = k + base
    end

    return k + math.floor(((span + 1) * scaled) / (scaled + skew))
end

--------------------------
-- Punycode Encoding Function
--------------------------
-- Encodes a string as Punycode (the result carries no "xn--" prefix).
-- @param input  UTF-8 string; nil and "" both encode to ""
-- @return       the Punycode form of the lower-cased input
-- Results are memoised in encodeCache under the lower-cased input.
function punycode.encode(input)
    if not input or input == "" then
        return ""
    end

    -- The lower-cased form is both what gets encoded and the cache key.
    input = mw.ustring.lower(input)
    local cached = encodeCache[input]
    if cached then
        return cached
    end

    local cps = toCodePoints(input)
    local total = #cps
    local out = {}

    -- Basic code points (< 128) are copied first, in their original order.
    for idx = 1, total do
        local cp = cps[idx]
        if cp < 128 then
            out[#out + 1] = mw.ustring.char(cp)
        end
    end
    local basic_count = #out

    -- The delimiter is emitted whenever at least one basic code point exists.
    if basic_count > 0 then
        out[#out + 1] = delimiter
    end

    -- Threshold for digit position k under the given bias,
    -- clamped to the range tmin..tmax.
    local function threshold(k, current_bias)
        if k <= current_bias then
            return tmin
        end
        if k >= current_bias + tmax then
            return tmax
        end
        return k - current_bias
    end

    local n, delta, bias = initial_n, 0, initial_bias
    local handled = basic_count

    while handled < total do
        -- Smallest code point that is >= n, i.e. the next one to insert.
        local m = 0x7FFFFFFF
        for idx = 1, total do
            local cp = cps[idx]
            if cp >= n and cp < m then
                m = cp
            end
        end

        delta = delta + (m - n) * (handled + 1)
        n = m

        for idx = 1, total do
            local cp = cps[idx]
            if cp < n then
                delta = delta + 1
            elseif cp == n then
                -- Emit delta as a variable-length integer, least
                -- significant digit first.
                local q, k = delta, base
                while true do
                    local t = threshold(k, bias)
                    if q < t then break end
                    local radix = base - t
                    out[#out + 1] = digitToBasic(t + ((q - t) % radix))
                    q = math.floor((q - t) / radix)
                    k = k + base
                end
                out[#out + 1] = digitToBasic(q)
                bias = adapt(delta, handled + 1, handled == basic_count)
                delta = 0
                handled = handled + 1
            end
        end

        delta = delta + 1
        n = n + 1
    end

    local result = table.concat(out)
    encodeCache[input] = result
    return result
end

--------------------------
-- Punycode Decoding Function
--------------------------
-- Decodes a Punycode string (without any "xn--" prefix) into UTF-8.
-- @param input  Punycode string; nil and "" both decode to ""
-- @return       the decoded string; results are memoised in decodeCache
-- Raises an error when the input ends in the middle of a number, contains a
-- non-basic byte in its literal prefix, or contains a character that is not
-- a Punycode digit.
function punycode.decode(input)
    if not input or input == "" then
        return ""
    end

    -- Check cache first for previously decoded strings
    local cached = decodeCache[input]
    if cached then
        return cached
    end

    -- RFC 3492 section 6.2: the literal (basic) part is everything before the
    -- LAST delimiter. Splitting on the first one breaks every label that
    -- itself contains a hyphen.
    local d = 0
    local hit = input:find(delimiter, 1, true)
    while hit do
        d = hit
        hit = input:find(delimiter, hit + 1, true)
    end

    local cp_array = {}
    for pos = 1, d - 1 do
        local cp = input:byte(pos)
        if cp >= initial_n then
            error("Invalid input: non-basic code point before punycode delimiter")
        end
        cp_array[pos] = cp
    end

    local n = initial_n
    local bias = initial_bias
    local i = 0
    -- The delimiter is skipped only when it follows at least one basic code
    -- point; otherwise decoding starts at the first character (a leading
    -- delimiter is then rejected as an invalid digit below).
    local index = (d > 1) and (d + 1) or 1
    local input_len = #input

    while index <= input_len do
        local oldi = i
        local w = 1
        local k = base
        -- Read one variable-length integer, least significant digit first.
        while true do
            if index > input_len then
                error("Invalid input: punycode decode incomplete")
            end
            local digit = basicToDigit(input:byte(index))
            if digit >= base then
                -- basicToDigit signals "not a digit" by returning base.
                error("Invalid input: not a punycode digit")
            end
            index = index + 1
            i = i + digit * w
            local t
            if k <= bias then
                t = tmin
            elseif k >= bias + tmax then
                t = tmax
            else
                t = k - bias
            end
            if digit < t then break end
            w = w * (base - t)
            k = k + base
        end

        -- i now encodes both the code point increment and the insert position.
        local out_len = #cp_array + 1
        bias = adapt(i - oldi, out_len, oldi == 0)
        n = n + math.floor(i / out_len)
        i = i % out_len
        table.insert(cp_array, i + 1, n)
        i = i + 1
    end

    local result = fromCodePoints(cp_array)

    -- Cache the result before returning
    decodeCache[input] = result
    return result
end

-- Export the module table that carries punycode.encode and punycode.decode.
return punycode