Module:NormalizationDiacritic: Difference between revisions
// via Wikitext Extension for VSCode |
// via Wikitext Extension for VSCode |
||
| (4 intermediate revisions by the same user not shown) | |||
| Line 1: | Line 1: | ||
-- | --[[ | ||
* Name: NormalizationDiacritic | |||
* Author: Mark W. Datysgeld | |||
* Description: Diacritic removal for text normalization in word matching, supporting Latin and Cyrillic scripts | |||
* Notes: Normalizes apostrophe variants; UTF-8 compatible; case-preserving; splits the text into UTF-8 characters with gmatch and joins the mapped characters with table.concat, avoiding string.gsub for better UTF-8 handling; caches the result for each input string for performance | |||
]] | |||
-- | |||
local p = {} | local p = {} | ||
| Line 115: | Line 110: | ||
end | end | ||
-- | -- Build a table of characters, replacing diacritics | ||
local | local parts = {} | ||
for char in text:gmatch("([%z\1-\127\194-\244][\128-\191]*)") do | for char in text:gmatch("([%z\1-\127\194-\244][\128-\191]*)") do | ||
table.insert(parts, diacriticMap[char] or char) | |||
end | end | ||
-- | -- Concatenate the parts into a single string | ||
local result = table.concat(parts) | |||
resultCache[text] = result | resultCache[text] = result | ||
return result | return result | ||