Module:CanonicalForms: Difference between revisions

// via Wikitext Extension for VSCode
// via Wikitext Extension for VSCode
 
(4 intermediate revisions by the same user not shown)
Line 1: Line 1:
-- Module:CanonicalForms
--[[
-- Normalizes strings by removing wiki markup and mapping them to canonical values.
* Name: CanonicalForms
-- Mapping tables include:
* Author: Mark W. Datysgeld
--  * canonical: Standard display value.
* Description: Text normalization utility that removes wiki markup and maps user input to canonical values using configurable lookup tables
--  * synonyms: Synonyms (case-insensitive) mapped to canonical.
* Notes: Example usage: local mapping = { { canonical = "gTLD", synonyms = {"generic", "g"} }, { canonical = "ccTLD", synonyms = {"country", "cc"} } }; local canonical, css, category = require('Module:CanonicalForms').normalize(inputString, mapping)
--  * [optional] css: Associated CSS class.
]]
--  * [optional] category: Auto-assignment category.
--
-- Example:
--  local mapping = {
--    { canonical = "gTLD", synonyms = {"gtld", "generic", "tld"} },
--    { canonical = "ccTLD", synonyms = {"cctld", "country", "cc"} }
--  }
--  local canonical, css, category = require('Module:CanonicalForms').normalize(inputString, mapping)


local CanonicalForms = {}
local CanonicalForms = {}


--- Normalize an input string.
-- Normalize an input string
-- Removes wiki markup and converts input to lowercase.
-- Removes wiki markup, converts to lowercase, and maps to canonical form
-- Checks mappingTable for a matching synonym and returns the corresponding canonical value.
-- @param input String to normalize.
-- @param mappingTable Array of mapping groups with 'canonical', 'synonyms', and optional 'css' and 'category'.
-- @return canonical Matched value, or cleaned input if no match.
-- @return css Optional CSS class.
-- @return category Optional category string.
function CanonicalForms.normalize(input, mappingTable)
function CanonicalForms.normalize(input, mappingTable)
     if not input or input == "" then
     if not input or input == "" then
Line 29: Line 15:
     end
     end


     -- Remove wiki-link markup; e.g. "[[Brand TLD]]" to "Brand TLD"
     -- Remove wiki internal link markup (e.g., "[[Brand TLD]]" -> "Brand TLD")
     local cleanInput = input:gsub("%[%[([^|%]]+)|?[^%]]*%]%]", "%1"):lower()
     local cleanInput = input:gsub("%[%[([^|%]]+)|?[^%]]*%]%]", "%1"):lower()


     for _, group in ipairs(mappingTable) do
     -- Create lookup table for faster matching (first call only)
        for _, syn in ipairs(group.synonyms or {}) do
    if not mappingTable._lookupCache then
            if cleanInput == syn:lower() then
        local lookupCache = {}
                return group.canonical, group.css, group.category
        for _, group in ipairs(mappingTable) do
            -- Add the canonical form itself to the lookup (in lowercase)
            lookupCache[group.canonical:lower()] = group
           
            -- Add all synonyms to the lookup
            for _, syn in ipairs(group.synonyms or {}) do
                lookupCache[syn:lower()] = group
             end
             end
         end
         end
        mappingTable._lookupCache = lookupCache
    end
    -- Direct lookup via cache
    local match = mappingTable._lookupCache[cleanInput]
    if match then
        return match.canonical, match.css, match.category
     end
     end