Module:NormalizationDiacritic: Difference between revisions
// via Wikitext Extension for VSCode |
// via Wikitext Extension for VSCode |
||
| (6 intermediate revisions by the same user not shown) | |||
| Line 1: | Line 1: | ||
-- | --[[ | ||
* Name: NormalizationDiacritic | |||
* Author: Mark W. Datysgeld | |||
* Description: Diacritic removal for text normalization in word matching, supporting Latin and Cyrillic scripts | |||
* Notes: Normalizes apostrophe variants to the ASCII apostrophe; UTF-8 compatible; case-preserving; iterates over UTF-8 characters with string.gmatch and joins the results with table.concat; caches results for performance; avoids string.gsub for better UTF-8 handling | |||
]] | |||
-- | |||
local p = {} | local p = {} | ||
| Line 19: | Line 14: | ||
local diacriticMap = { | local diacriticMap = { | ||
-- Apostrophe variants | -- Apostrophe variants | ||
[" | ["’"] = "'", -- RIGHT SINGLE QUOTATION MARK (U+2019) | ||
[" | ["‘"] = "'", -- LEFT SINGLE QUOTATION MARK (U+2018) | ||
["‛"] = "'", -- SINGLE HIGH-REVERSED-9 QUOTATION MARK (U+201B) | ["‛"] = "'", -- SINGLE HIGH-REVERSED-9 QUOTATION MARK (U+201B) | ||
["′"] = "'", -- PRIME (U+2032) | ["′"] = "'", -- PRIME (U+2032) | ||
| Line 115: | Line 110: | ||
end | end | ||
-- | -- Build a table of characters, replacing diacritics | ||
local | local parts = {} | ||
for char in text:gmatch("([%z\1-\127\194-\244][\128-\191]*)") do | |||
table.insert(parts, diacriticMap[char] or char) | |||
end | |||
-- Concatenate the parts into a single string | |||
local result = table.concat(parts) | |||
resultCache[text] = result | resultCache[text] = result | ||
return result | return result | ||