Jump to content

Module:Punycode: Difference between revisions

Created page with "-- Module:Punycode -- Implements RFC3492 (Punycode) encoding and decoding. -- Requires mw.ustring for proper Unicode support. local punycode = {} -------------------------- -- Configuration Constants -------------------------- local base = 36 local tmin = 1 local tmax = 26 local skew = 38 local damp = 700 local initial_bias = 72 local initial_n = 128 -- 0x80 local delimiter = '-' -- ASCII hyphen -------------------------- --..."
 
No edit summary
Line 2: Line 2:
-- Implements RFC3492 (Punycode) encoding and decoding.
-- Implements RFC3492 (Punycode) encoding and decoding.
-- Requires mw.ustring for proper Unicode support.
-- Requires mw.ustring for proper Unicode support.
local punycode = {}
local punycode = {}


Line 7: Line 8:
-- Configuration Constants
-- Configuration Constants
--------------------------
--------------------------
local base       = 36
local base         = 36
local tmin       = 1
local tmin         = 1
local tmax       = 26
local tmax         = 26
local skew       = 38
local skew         = 38
local damp       = 700
local damp         = 700
local initial_bias = 72
local initial_bias = 72
local initial_n    = 128  -- 0x80
local initial_n    = 128  -- 0x80
Line 22: Line 23:
local function toCodePoints(s)
local function toCodePoints(s)
     local cps = {}
     local cps = {}
    -- mw.ustring.gmatch with "." iterates over Unicode characters.
     for char in mw.ustring.gmatch(s, ".") do
     for char in mw.ustring.gmatch(s, ".") do
         table.insert(cps, mw.ustring.byte(char))
         table.insert(cps, mw.ustring.codepoint(char))
     end
     end
     return cps
     return cps
Line 41: Line 41:
-- Digit conversion functions
-- Digit conversion functions
--------------------------
--------------------------
-- Converts a basic digit value (0 .. base-1) to its corresponding basic code point (as a one-char string).
local function digitToBasic(digit)
local function digitToBasic(digit)
     if digit < 26 then
     if digit < 26 then
Line 50: Line 49:
end
end


-- Converts a basic code point (given as a number) to its digit value.
-- If the code point does not represent a valid digit, returns base.
local function basicToDigit(cp)
local function basicToDigit(cp)
     local code = cp
     if cp >= string.byte('0') and cp <= string.byte('9') then
    if code >= string.byte('0') and code <= string.byte('9') then
         return cp - string.byte('0') + 26
         return code - string.byte('0') + 26
     elseif cp >= string.byte('A') and cp <= string.byte('Z') then
     elseif code >= string.byte('A') and code <= string.byte('Z') then
         return cp - string.byte('A')
         return code - string.byte('A')
     elseif cp >= string.byte('a') and cp <= string.byte('z') then
     elseif code >= string.byte('a') and code <= string.byte('z') then
         return cp - string.byte('a')
         return code - string.byte('a')
     else
     else
         return base
         return base
Line 66: Line 62:


--------------------------
--------------------------
-- Bias adaptation function (RFC3492, Section 3.4)
-- Bias adaptation (RFC3492, Section 3.4)
--------------------------
--------------------------
local function adapt(delta, numpoints, first)
local function adapt(delta, numpoints, first)
Line 103: Line 99:


     local h = basic_count
     local h = basic_count
    -- Append delimiter if there were any basic code points
     if basic_count > 0 then
     if basic_count > 0 then
         table.insert(output, delimiter)
         table.insert(output, delimiter)
Line 109: Line 104:


     while h < #cp_array do
     while h < #cp_array do
        -- m is the minimum code point >= n
         local m = 0x7FFFFFFF
         local m = 0x7FFFFFFF
         for _, cp in ipairs(cp_array) do
         for _, cp in ipairs(cp_array) do
Line 158: Line 152:
function punycode.decode(input)
function punycode.decode(input)
     local cp_array = {}
     local cp_array = {}
     local d = input:find(delimiter, 1, true)  -- find delimiter (plain text search)
     local d = input:find(delimiter, 1, true)
     local b = 0
     local b = 0
     if d then
     if d then
        -- Copy basic code points before the delimiter
         for i = 1, d - 1 do
         for i = 1, d - 1 do
             local cp = input:byte(i)
             local cp = input:byte(i)
Line 210: Line 203:
end
end


--------------------------
-- Expose the module
--------------------------
return punycode
return punycode

Revision as of 23:54, 12 February 2025

Documentation for this module may be created at Module:Punycode/doc

-- Module:Punycode
-- Implements RFC3492 (Punycode) encoding and decoding.
-- Requires mw.ustring for proper Unicode support.

-- Module table; the public functions (encode / decode) are attached to it
-- below and it is returned at the end of the file.
local punycode = {}

--------------------------
-- Configuration Constants
--------------------------
-- Radix of the variable-length integers: digits are a-z (0..25) and 0-9 (26..35).
local base         = 36
-- Lower and upper clamp for the per-digit threshold t used by encode/decode.
local tmin         = 1
local tmax         = 26
-- Used in the final scaling step of adapt().
local skew         = 38
-- Divisor applied to the very first delta in adapt(); later deltas are halved.
local damp         = 700
-- Starting value of the bias before any adaptation has happened.
local initial_bias = 72
-- Starting value of n; code points below this are treated as "basic" (ASCII).
local initial_n    = 128   -- 0x80
-- Separates the literal basic code points from the encoded digits.
local delimiter    = '-'   -- ASCII hyphen

--------------------------
-- Helper functions for Unicode handling.
--------------------------
-- Decodes a UTF-8 string into a sequence (1-based array) of the numeric
-- Unicode code points it contains, in order.
-- @param s  UTF-8 encoded string
-- @return   array of code point numbers (empty for an empty string)
local function toCodePoints(s)
    local points = {}
    -- gcodepoint walks the string one code point at a time, so there is no
    -- need to materialise each character as a string first.
    for codepoint in mw.ustring.gcodepoint(s) do
        points[#points + 1] = codepoint
    end
    return points
end

-- Builds a UTF-8 string from a sequence of numeric Unicode code points.
-- @param cps  array of code point numbers
-- @return     the concatenation of the corresponding UTF-8 characters
local function fromCodePoints(cps)
    local pieces = {}
    for position, codepoint in ipairs(cps) do
        pieces[position] = mw.ustring.char(codepoint)
    end
    return table.concat(pieces)
end

--------------------------
-- Digit conversion functions
--------------------------
-- Maps a digit value to its one-character representation:
-- 0..25 become 'a'..'z', 26 and above continue from '0' (26 -> '0', 35 -> '9').
-- @param digit  numeric digit value
-- @return       single-character string
local function digitToBasic(digit)
    -- For the numeric range the offset folds the "- 26" into the base byte.
    local offset = digit < 26 and string.byte('a') or (string.byte('0') - 26)
    return string.char(digit + offset)
end

-- Maps the byte value of a character to its digit value:
-- '0'..'9' give 26..35, and letters give 0..25 regardless of case.
-- Any other value yields `base`, which is one past the largest valid digit.
-- @param cp  numeric byte / code point value
-- @return    digit value, or `base` when cp is not a digit character
local function basicToDigit(cp)
    local zero = string.byte('0')
    local upper_a = string.byte('A')
    local lower_a = string.byte('a')

    if cp >= zero and cp <= string.byte('9') then
        return cp - zero + 26
    end
    if cp >= upper_a and cp <= string.byte('Z') then
        return cp - upper_a
    end
    if cp >= lower_a and cp <= string.byte('z') then
        return cp - lower_a
    end
    return base
end

--------------------------
-- Bias adaptation (RFC3492, Section 3.4)
--------------------------
-- Computes the next bias value from the delta that was just encoded/decoded.
-- @param delta      the delta that was just processed
-- @param numpoints  number of code points handled so far, including this one
-- @param first      true for the very first delta, which is damped more
--                   strongly (divided by `damp` instead of by 2)
-- @return           the new bias
local function adapt(delta, numpoints, first)
    local divisor = first and damp or 2
    delta = math.floor(delta / divisor)
    delta = delta + math.floor(delta / numpoints)

    -- Both quantities are loop-invariant, so compute them once.
    local span = base - tmin
    local limit = (span * tmax) / 2

    local k = 0
    while delta > limit do
        delta = math.floor(delta / span)
        k = k + base
    end
    return k + math.floor(((span + 1) * delta) / (delta + skew))
end

--------------------------
-- Punycode Encoding Function
--------------------------
-- Encodes a UTF-8 string as Punycode.
-- Code points below 128 are copied to the output unchanged (followed by the
-- delimiter if there was at least one); every remaining code point is then
-- emitted, in increasing code point order, as a variable-length base-36
-- integer describing where it has to be inserted.
-- @param input  UTF-8 string
-- @return       the Punycode string (ASCII only)
function punycode.encode(input)
    local points = toCodePoints(input)
    local total = #points
    local pieces = {}

    -- Pass 1: literal copy of the basic code points, preserving order.
    for idx = 1, total do
        local cp = points[idx]
        if cp < 128 then
            pieces[#pieces + 1] = mw.ustring.char(cp)
        end
    end

    local basic_count = #pieces
    if basic_count > 0 then
        pieces[#pieces + 1] = delimiter
    end

    local n = initial_n
    local delta = 0
    local bias = initial_bias
    local handled = basic_count   -- number of code points already emitted

    while handled < total do
        -- Smallest code point not yet handled (i.e. >= n).
        local smallest = 0x7FFFFFFF
        for idx = 1, total do
            local cp = points[idx]
            if cp >= n and cp < smallest then
                smallest = cp
            end
        end

        -- Skip the decoder state forward to the first occurrence of `smallest`.
        delta = delta + (smallest - n) * (handled + 1)
        n = smallest

        for idx = 1, total do
            local cp = points[idx]
            if cp < n then
                delta = delta + 1
            elseif cp == n then
                -- Emit delta as a generalized variable-length integer.
                local q = delta
                local k = base
                while true do
                    -- Threshold is (k - bias) clamped into [tmin, tmax].
                    local t = math.max(tmin, math.min(tmax, k - bias))
                    if q < t then
                        break
                    end
                    local span = base - t
                    local rest = q - t
                    pieces[#pieces + 1] = digitToBasic(t + rest % span)
                    q = math.floor(rest / span)
                    k = k + base
                end
                pieces[#pieces + 1] = digitToBasic(q)
                bias = adapt(delta, handled + 1, handled == basic_count)
                delta = 0
                handled = handled + 1
            end
        end

        delta = delta + 1
        n = n + 1
    end

    return table.concat(pieces)
end

--------------------------
-- Punycode Decoding Function
--------------------------
-- Decodes a Punycode string back into a UTF-8 string.
--
-- Everything before the LAST delimiter is taken as literal basic (ASCII)
-- code points; everything after it is a sequence of variable-length base-36
-- integers, each of which describes one code point to insert and where.
-- The last delimiter matters because the literal part may itself contain
-- hyphens (e.g. the encoding of "a-bü" starts with "a-b-").
--
-- @param input  Punycode string (ASCII)
-- @return       decoded UTF-8 string
-- Raises an error when the input is truncated, contains a character that is
-- not a Punycode digit, has a non-ASCII byte in the literal part, or decodes
-- to a value outside the Unicode range.
function punycode.decode(input)
    local cp_array = {}
    local input_len = #input

    -- Locate the last delimiter (0 when there is none). The delimiter is a
    -- single ASCII character, so a byte-wise scan from the end is sufficient.
    local d = 0
    for pos = input_len, 1, -1 do
        if input:sub(pos, pos) == delimiter then
            d = pos
            break
        end
    end

    -- Copy the literal basic code points that precede the delimiter.
    for pos = 1, d - 1 do
        local cp = input:byte(pos)
        if cp >= initial_n then
            error("Invalid input: non-basic code point before delimiter")
        end
        cp_array[#cp_array + 1] = cp
    end

    local n = initial_n
    local bias = initial_bias
    local i = 0
    -- When a delimiter was found it is skipped; otherwise decoding starts at 1.
    local index = d + 1

    while index <= input_len do
        -- Decode one generalized variable-length integer into i.
        local oldi = i
        local w = 1
        local k = base
        while true do
            if index > input_len then
                error("Invalid input: punycode decode incomplete")
            end
            local digit = basicToDigit(input:byte(index))
            -- basicToDigit signals "not a digit" by returning base.
            if digit >= base then
                error("Invalid input: not a punycode digit at position " .. index)
            end
            index = index + 1
            i = i + digit * w
            local t
            if k <= bias then
                t = tmin
            elseif k >= bias + tmax then
                t = tmax
            else
                t = k - bias
            end
            if digit < t then break end
            w = w * (base - t)
            k = k + base
        end

        -- i now encodes both the code point increment (quotient) and the
        -- insertion position (remainder) relative to the output length + 1.
        local out_len = #cp_array + 1
        bias = adapt(i - oldi, out_len, oldi == 0)
        n = n + math.floor(i / out_len)
        if n > 0x10FFFF then
            error("Invalid input: decoded code point is outside the Unicode range")
        end
        i = i % out_len
        table.insert(cp_array, i + 1, n)
        i = i + 1
    end

    return fromCodePoints(cp_array)
end

return punycode