Module:NormalizationDiacritic: Difference between revisions

(6 intermediate revisions by the same user not shown)

Line 1:

~~-- Module:NormalizationDiacritic~~

--[[

~~-- Removes diacritics from text for word matching.~~

* Name: NormalizationDiacritic

~~-- Features:~~

* Author: Mark W. Datysgeld

~~-- * Supports Latin and Cyrillic scripts~~

* Description: Diacritic removal for text normalization in word matching, supporting Latin and Cyrillic scripts

~~-- * Normalizes apostrophe variants (' and ')~~

* Notes: Normalizes apostrophe variants; UTF-8 compatible; case-preserving; uses performant gmatch and table.concat pattern for UTF-8 processing; includes caching for performance; avoids string.gsub for better UTF-8 handling

~~-- * UTF-8 compatible~~

]]

~~-- * Case-preserving~~

--

~~-- Usage:~~

~~-- local DiacriticNormalization = require('Module:NormalizationDiacritic')~~

~~-- local normalized = DiacriticNormalization.removeDiacritics("Français") -- Returns "Francais"~~

local p = {}

Line 19:

Line 14:

local diacriticMap = {

-- Apostrophe variants

~~["'"] = "'", -- RIGHT SINGLE QUOTATION MARK (U+2019)~~

["’"] = "'", -- RIGHT SINGLE QUOTATION MARK (U+2019)

~~["'"] = "'", -- LEFT SINGLE QUOTATION MARK (U+2018)~~

["‘"] = "'", -- LEFT SINGLE QUOTATION MARK (U+2018)

["‛"] = "'", -- SINGLE HIGH-REVERSED-9 QUOTATION MARK (U+201B)

["′"] = "'", -- PRIME (U+2032)

Line 115:

Line 110:

end

~~-- Replace diacritics with base characters (UTF-8 aware)~~

-- Build a table of characters, replacing diacritics

~~local result = text:gsub(".", function(c) return diacriticMap[c] or c end)~~

local parts = {}

for char in text:gmatch("([%z\1-\127\194-\244][\128-\191]*)") do

table.insert(parts, diacriticMap[char] or char)

end

-- Concatenate the parts into a single string

local result = table.concat(parts)

~~-- Cache result~~

resultCache[text] = result

return result

@@ Line 1: / Line 1: @@
--- Module:NormalizationDiacritic
+--[[
--- Removes diacritics from text for word matching.
+* Name: NormalizationDiacritic
--- Features:
+* Author: Mark W. Datysgeld
---   * Supports Latin and Cyrillic scripts
+* Description: Diacritic removal for text normalization in word matching, supporting Latin and Cyrillic scripts
---   * Normalizes apostrophe variants (' and ')
+* Notes: Normalizes apostrophe variants; UTF-8 compatible; case-preserving; uses performant gmatch and table.concat pattern for UTF-8 processing; includes caching for performance; avoids string.gsub for better UTF-8 handling
---   * UTF-8 compatible
+]]
---   * Case-preserving
---
--- Usage:
---   local DiacriticNormalization = require('Module:NormalizationDiacritic')
---   local normalized = DiacriticNormalization.removeDiacritics("Français")  -- Returns "Francais"
 local p = {}
@@ Line 19: / Line 14: @@
 local diacriticMap = {
      -- Apostrophe variants
-     ["'"] = "'", -- RIGHT SINGLE QUOTATION MARK (U+2019)
+     ["’"] = "'", -- RIGHT SINGLE QUOTATION MARK (U+2019)
-     ["'"] = "'", -- LEFT SINGLE QUOTATION MARK (U+2018)
+     ["‘"] = "'", -- LEFT SINGLE QUOTATION MARK (U+2018)
      ["‛"] = "'", -- SINGLE HIGH-REVERSED-9 QUOTATION MARK (U+201B)
      ["′"] = "'", -- PRIME (U+2032)
@@ Line 115: / Line 110: @@
      end
-     -- Replace diacritics with base characters (UTF-8 aware)
+     -- Build a table of characters, replacing diacritics
-     local result = text:gsub(".", function(c) return diacriticMap[c] or c end)
+     local parts = {}
+    for char in text:gmatch("([%z\1-\127\194-\244][\128-\191]*)") do
+        table.insert(parts, diacriticMap[char] or char)
+    end
+    -- Concatenate the parts into a single string
+    local result = table.concat(parts)
-    -- Cache result
      resultCache[text] = result
      return result