Module:NormalizationDiacritic: Difference between revisions

// via Wikitext Extension for VSCode
// via Wikitext Extension for VSCode
 
(4 intermediate revisions by the same user not shown)
Line 1: Line 1:
-- Module:NormalizationDiacritic
--[[
-- Removes diacritics from text for word matching.
* Name: NormalizationDiacritic
-- Features:
* Author: Mark W. Datysgeld
--  * Supports Latin and Cyrillic scripts
* Description: Diacritic removal for text normalization in word matching, supporting Latin and Cyrillic scripts
--  * Normalizes apostrophe variants (' and ’)
* Notes: Normalizes apostrophe variants; UTF-8 compatible; case-preserving; iterates over UTF-8 characters with gmatch and joins the pieces with table.concat for performance, avoiding string.gsub for better UTF-8 handling; caches results for performance
--  * UTF-8 compatible
]]
--  * Case-preserving
--
-- Usage:
--  local DiacriticNormalization = require('Module:NormalizationDiacritic')
--  local normalized = DiacriticNormalization.removeDiacritics("Français")  -- Returns "Francais"


local p = {}
local p = {}
Line 115: Line 110:
     end
     end
      
      
     -- Replace diacritics with base characters (UTF-8 aware)
     -- Build a table of characters, replacing diacritics
     local result = {}
     local parts = {}
     for char in text:gmatch("([%z\1-\127\194-\244][\128-\191]*)") do
     for char in text:gmatch("([%z\1-\127\194-\244][\128-\191]*)") do
         result[#result + 1] = diacriticMap[char] or char
         table.insert(parts, diacriticMap[char] or char)
     end
     end
    result = table.concat(result)
      
      
     -- Cache result
     -- Concatenate the parts into a single string
    local result = table.concat(parts)
   
     resultCache[text] = result
     resultCache[text] = result
     return result
     return result