Module:NormalizationDiacritic: Difference between revisions

// via Wikitext Extension for VSCode
// via Wikitext Extension for VSCode
 
(6 intermediate revisions by the same user not shown)
Line 1: Line 1:
-- Module:NormalizationDiacritic
--[[
-- Removes diacritics from text for word matching.
* Name: NormalizationDiacritic
-- Features:
* Author: Mark W. Datysgeld
--  * Supports Latin and Cyrillic scripts
* Description: Diacritic removal for text normalization in word matching, supporting Latin and Cyrillic scripts
--  * Normalizes apostrophe variants (’ and ‘)
* Notes: Normalizes apostrophe variants; UTF-8 compatible; case-preserving; walks the text one UTF-8 character at a time with gmatch and joins the pieces with table.concat, because a byte-wise string.gsub(".") never matches the multi-byte keys of the diacritic map; caches results for performance
--  * UTF-8 compatible
]]
--  * Case-preserving
--
-- Usage:
--  local DiacriticNormalization = require('Module:NormalizationDiacritic')
--  local normalized = DiacriticNormalization.removeDiacritics("Français")  -- Returns "Francais"


local p = {}
local p = {}
Line 19: Line 14:
local diacriticMap = {
local diacriticMap = {
     -- Apostrophe variants
     -- Apostrophe variants
     ["’"] = "'", -- RIGHT SINGLE QUOTATION MARK (U+2019)
     ["’"] = "'", -- RIGHT SINGLE QUOTATION MARK (U+2019)
     ["‘"] = "'", -- LEFT SINGLE QUOTATION MARK (U+2018)
     ["‘"] = "'", -- LEFT SINGLE QUOTATION MARK (U+2018)
     ["‛"] = "'", -- SINGLE HIGH-REVERSED-9 QUOTATION MARK (U+201B)
     ["‛"] = "'", -- SINGLE HIGH-REVERSED-9 QUOTATION MARK (U+201B)
     ["′"] = "'", -- PRIME (U+2032)
     ["′"] = "'", -- PRIME (U+2032)
Line 115: Line 110:
     end
     end
      
      
     -- Replace diacritics with base characters (UTF-8 aware)
     -- Build a table of characters, replacing diacritics
     local result = text:gsub(".", function(c) return diacriticMap[c] or c end)
     local parts = {}
    for char in text:gmatch("([%z\1-\127\194-\244][\128-\191]*)") do
        table.insert(parts, diacriticMap[char] or char)
    end
   
    -- Concatenate the parts into a single string
    local result = table.concat(parts)
      
      
    -- Cache result
     resultCache[text] = result
     resultCache[text] = result
     return result
     return result