Jump to content

Module:CountryData

Revision as of 19:16, 3 July 2025 by MarkWD (talk | contribs) (// via Wikitext Extension for VSCode)

Documentation for this module may be created at Module:CountryData/doc

-- Module:CountryData
-- Unified module for country data management.
--
-- Features:
--   * Loads country data from JSON stored in Data:CountryDataset.json
--   * Normalizes country names to canonical forms
--   * Maps countries to ICANN regions
--   * Provides extensible property access
--   * Integrates with Semantic MediaWiki
--   * Formats country lists with region-specific emoji styling
--   * Processes countries for category assignment

-- Dependencies
local DiacriticNormalization = require('Module:NormalizationDiacritic')
local NormalizationText = require('Module:NormalizationText')
local loader = require('Module:DatasetLoader')

-- Module-level cache tables for improved performance.
-- All of them are filled lazily by the functions below and are restored to
-- the values shown here by resetCaches().
local dataCache = nil          -- dataset table built by loadData()
local nameLookupCache = nil    -- normalized country name -> country code (getNameLookup)
local regionLookupCache = nil  -- normalized region name -> region code (getRegionLookup)
local propertyCache = {}       -- results of getCountryProperty / getCountryPropertyByName
local functionCache = {}       -- results of the other CountryData.* lookups, keyed by function name (plus arguments)

-- Default data structure to use if JSON loading fails.
-- Mirrors the top-level shape read from the dataset (countries, icann_regions,
-- schema_version, last_updated) but with no countries and no regions.
local DEFAULT_DATA = {
    schema_version = 1,
    -- UTC timestamp (the leading '!' selects UTC), evaluated once when the module loads
    last_updated = os.date('!%Y-%m-%dT%H:%M:%SZ'),
    countries = {},
    icann_regions = {}
}

--------------------------------------------------------------------------------
-- Helper Functions
--------------------------------------------------------------------------------

-- Create a cache key from a function name and arguments.
--
-- The key is the function name followed by tostring() of every argument,
-- joined with ":" (e.g. createCacheKey("f", "a", 1) -> "f:a:1").
--
-- @param funcName  string identifying the calling function
-- @param ...       arguments that distinguish one call from another; nil
--                  arguments are kept and rendered as "nil"
-- @return string cache key
local function createCacheKey(funcName, ...)
    -- select("#", ...) counts every argument, nil ones included. The length
    -- operator on {...} is unspecified once the list contains nil, which
    -- silently dropped trailing nil arguments from the key.
    local argCount = select("#", ...)
    local keyParts = {funcName}
    for i = 1, argCount do
        -- Extra parentheses truncate select() to the single i-th argument.
        keyParts[i + 1] = tostring((select(i, ...)))
    end
    return table.concat(keyParts, ":")
end

-- Safely test whether `tbl` is a table holding a non-nil value under `property`.
--
-- @param tbl       value to inspect (may be anything, including nil)
-- @param property  key to look for
-- @return true/false for tables; false for other truthy values; `tbl` itself
--         (nil or false) when `tbl` is falsy
local function hasProperty(tbl, property)
    if not tbl then
        -- Hand back the falsy input unchanged, as a short-circuited `and` would.
        return tbl
    end
    if type(tbl) ~= "table" then
        return false
    end
    return tbl[property] ~= nil
end

--------------------------------------------------------------------------------
-- Data Loading Layer
--------------------------------------------------------------------------------

-- Get name lookup cache: builds if not already cached.
--
-- Maps every known spelling of a country to its code. For each record in
-- data.countries these names are registered, each under its
-- NormalizationText.normalizeText() form and, when that differs, also under
-- the DiacriticNormalization.removeDiacritics() form of the normalized text:
--   * country.name (or canonical_name when name is absent)
--   * country.canonical_name, when it differs from name
--   * every value in country.variations
--
-- @param data  dataset table with a `countries` map (code -> country record);
--              when nil or lacking `countries`, an empty lookup is cached
-- @return table mapping normalized name -> country code
--
-- NOTE: the lookup is cached on first call, so `data` is ignored by later
-- calls until the cache is reset. If two countries share a normalized name,
-- the one visited last by pairs() wins, and that order is unspecified.
local function getNameLookup(data)
    if nameLookupCache then
        return nameLookupCache
    end

    local lookup = {}

    -- Register one spelling for `code`. Non-string and empty names are skipped
    -- rather than handed to the normalizers, and a nil/empty normalized form
    -- is never used as a key.
    local function register(name, code)
        if type(name) ~= "string" or name == "" then
            return
        end

        local normalized = NormalizationText.normalizeText(name)
        if normalized == nil or normalized == "" then
            return
        end
        lookup[normalized] = code

        local stripped = DiacriticNormalization.removeDiacritics(normalized)
        if stripped ~= nil and stripped ~= normalized then
            lookup[stripped] = code
        end
    end

    if data and data.countries then
        for code, country in pairs(data.countries) do
            if type(country) == "table" then
                -- Primary display name first
                register(country.name or country.canonical_name, code)

                -- canonical_name only when it adds a new spelling
                if country.canonical_name ~= country.name then
                    register(country.canonical_name, code)
                end

                -- pairs (not ipairs) so array- and object-shaped variation
                -- lists are both covered
                if type(country.variations) == "table" then
                    for _, variation in pairs(country.variations) do
                        register(variation, code)
                    end
                end
            end
        end
    end

    nameLookupCache = lookup
    return lookup
end

-- Get region lookup cache: builds if not already cached.
--
-- Maps the NormalizationText.normalizeText() form of every region name and
-- variation in data.icann_regions to that region's key.
--
-- @param data  dataset table with an `icann_regions` map (code -> region
--              record); when nil or lacking it, an empty lookup is cached
-- @return table mapping normalized region name -> region code
--
-- NOTE: cached on first call, so `data` is ignored by later calls until the
-- cache is reset.
local function getRegionLookup(data)
    if regionLookupCache then
        return regionLookupCache
    end

    local lookup = {}

    -- Register one spelling for `code`; non-string names and nil normalized
    -- forms are skipped instead of being used as table keys.
    local function register(name, code)
        if type(name) ~= "string" then
            return
        end
        local normalized = NormalizationText.normalizeText(name)
        if normalized ~= nil then
            lookup[normalized] = code
        end
    end

    if data and data.icann_regions then
        for code, region in pairs(data.icann_regions) do
            if type(region) == "table" then
                -- Canonical name
                register(region.name, code)

                -- Variations: pairs instead of ipairs to handle both array
                -- and object structures
                if type(region.variations) == "table" then
                    for _, variation in pairs(region.variations) do
                        register(variation, code)
                    end
                end
            end
        end
    end

    regionLookupCache = lookup
    return lookup
end

-- Drop everything the module has memoized (useful for testing).
-- The three lazily built caches go back to nil so they are rebuilt on next
-- use; the two result caches are replaced with fresh empty tables.
local function resetCaches()
    dataCache, nameLookupCache, regionLookupCache = nil, nil, nil
    propertyCache, functionCache = {}, {}
end

-- Data loading function using DatasetLoader.
--
-- Loads 'CountryDataset' once and caches a table with the fields
-- countries, icann_regions, schema_version and last_updated.
-- When the loader raises an error or returns something that is not a table,
-- DEFAULT_DATA (an empty dataset) is used instead, so lookups simply find
-- nothing rather than failing on a nil dataset.
--
-- @param frame  unused; accepted for compatibility with existing callers
-- @return table the cached dataset
local function loadData(frame)
    if dataCache then
        return dataCache
    end

    local ok, raw = pcall(loader.get, 'CountryDataset')
    if not ok or type(raw) ~= "table" then
        -- Leave a trace of the failure instead of swallowing it silently.
        if mw and mw.log then
            mw.log('CountryData: could not load CountryDataset, using empty defaults: '
                .. tostring(raw))
        end
        raw = DEFAULT_DATA
    end

    dataCache = {
        countries      = raw.countries      or {},
        icann_regions  = raw.icann_regions  or {},
        schema_version = raw.schema_version,
        last_updated   = raw.last_updated
    }
    return dataCache
end

--------------------------------------------------------------------------------
-- Core API Functions
--------------------------------------------------------------------------------

-- Public module table; every exported function is attached to it below.
local CountryData = {}

-- Load data and initialize caches.
-- Public entry point for the module-local loadData(): forwards `frame`
-- unchanged and returns the dataset table that loadData() returns.
function CountryData.loadData(frame)
    return loadData(frame)
end

-- Reset all caches (primarily for testing).
-- Delegates to the module-local resetCaches() and always returns true.
function CountryData.resetCaches()
    resetCaches()
    return true
end

-- Get country data by ISO code.
--
-- The code is upper-cased before the lookup, so "us" and "US" are equivalent.
--
-- @param code  string country code
-- @return the country record from the dataset, or nil when `code` is not a
--         non-empty string or no such country exists
--
-- No result cache is used here: the dataset itself is already cached by
-- loadData(), and building a cache key would cost more than the single table
-- index it would save.
function CountryData.getCountryByCode(code)
    if type(code) ~= "string" or code == "" then
        return nil
    end

    local data = loadData()
    if not data or not data.countries then
        return nil
    end

    -- Standardize code to uppercase for consistency
    return data.countries[code:upper()] or nil
end

-- Get country data by name (including variations).
--
-- Name resolution (normalization, diacritic fallback and caching) is
-- delegated to CountryData.getCountryCodeByName so both functions always
-- agree on which country a name refers to.
--
-- @param name  country name or any spelling known to the name lookup
-- @return the country record from the dataset, or nil when `name` is
--         nil/empty or not recognized
function CountryData.getCountryByName(name)
    if not name or name == "" then
        return nil
    end

    local code = CountryData.getCountryCodeByName(name)
    if not code then
        return nil
    end

    local data = loadData()
    return data.countries[code] or nil
end

-- Get country code by name.
--
-- The name is normalized with NormalizationText.normalizeText and looked up
-- in the name lookup; when that misses, the diacritic-stripped form is tried.
--
-- @param name  country name or known variation
-- @return the country code, or nil when `name` is nil/empty or not recognized
--
-- Both hits and misses are cached. Assigning nil to a table field stores
-- nothing, so a miss is recorded as `false` and translated back to nil when
-- it is read.
function CountryData.getCountryCodeByName(name)
    if not name or name == "" then
        return nil
    end

    -- Check function cache first
    local cacheKey = createCacheKey("getCountryCodeByName", name)
    local cached = functionCache[cacheKey]
    if cached ~= nil then
        return cached or nil -- false marks a remembered miss
    end

    local data = loadData()
    local nameLookup = getNameLookup(data)

    -- Normalize the input
    local normalized = NormalizationText.normalizeText(name)

    -- Look up the code
    local code = nameLookup[normalized]

    -- If the first lookup fails, ALWAYS try the stripped version.
    if not code then
        local stripped = DiacriticNormalization.removeDiacritics(normalized)
        code = nameLookup[stripped]
    end

    functionCache[cacheKey] = code or false
    return code
end

-- Normalize a country name to its category-friendly display form.
--
-- The country is resolved with CountryData.getCountryByName; its `name`
-- (or `canonical_name`) is then made category-friendly by removing commas
-- and turning " and the " into " and ".
--
-- @param name  country name or known variation
-- @return the display name; "(Unrecognized)" when the country is unknown or
--         has no usable name; `name` itself when it is nil or ""
function CountryData.normalizeCountryName(name)
    if not name or name == "" then
        return name
    end

    -- Check function cache first
    local cacheKey = createCacheKey("normalizeCountryName", name)
    local cached = functionCache[cacheKey]
    if cached ~= nil then
        return cached
    end

    local result = "(Unrecognized)"

    local country = CountryData.getCountryByName(name)
    local displayName = country and (country.name or country.canonical_name)
    -- A record without a string name is treated as unrecognized instead of
    -- failing on the string operations below.
    if type(displayName) == "string" then
        -- Drop commas only. The whitespace after a comma is kept so that
        -- "A, B" becomes "A B" rather than gluing the words into "AB".
        local withoutCommas = displayName:gsub(",", "")
        -- Turn " and the " into " and "; parentheses discard gsub's count.
        result = (withoutCommas:gsub("%sand the%s+", " and "))
    end

    -- Cache the result
    functionCache[cacheKey] = result
    return result
end

-- Look up the ICANN region recorded for a country.
--
-- @param name  country name or known variation
-- @return the country's `icann_region` value; "(Unrecognized)" when the
--         country is unknown or has no region (same marker that
--         normalizeCountryName uses); nil when `name` is nil or ""
function CountryData.getRegionByCountry(name)
    if not name or name == "" then
        return nil
    end

    -- Serve repeated queries from the function cache
    local key = createCacheKey("getRegionByCountry", name)
    local remembered = functionCache[key]
    if remembered ~= nil then
        return remembered
    end

    local record = CountryData.getCountryByName(name)
    local region = record and record.icann_region or "(Unrecognized)"

    functionCache[key] = region
    return region
end

-- Get all countries in a specific region.
--
-- @param region  region name or variation, resolved through the region
--                lookup after NormalizationText.normalizeText
-- @return array of { code = <country code>, name = <name or canonical_name> }
--         for every country whose `icann_region` equals the resolved region
--         code, sorted by code so the output is stable between runs;
--         an empty table when the region is nil/empty or unknown
--
-- NOTE: the returned table is the cached instance; callers should treat it
-- as read-only.
function CountryData.getCountriesByRegion(region)
    if not region or region == "" then
        return {}
    end

    -- Check function cache first
    local cacheKey = createCacheKey("getCountriesByRegion", region)
    local cached = functionCache[cacheKey]
    if cached ~= nil then
        return cached
    end

    local data = loadData()
    local regionLookup = getRegionLookup(data)

    -- Normalize the input and resolve it to a region code
    local regionCode = regionLookup[NormalizationText.normalizeText(region)]

    local result = {}
    if regionCode and data.countries then
        for code, country in pairs(data.countries) do
            if type(country) == "table" and country.icann_region == regionCode then
                result[#result + 1] = {
                    code = code,
                    name = country.name or country.canonical_name
                }
            end
        end

        -- pairs() order is unspecified; sort for deterministic output.
        table.sort(result, function(a, b)
            return tostring(a.code) < tostring(b.code)
        end)
    end

    -- Cache the result
    functionCache[cacheKey] = result
    return result
end

-- Get list of all country codes.
--
-- @return array with every key of the dataset's `countries` map, in the
--         order pairs() visits them (unspecified); empty when no data is
--         available. The returned table is the cached instance and should be
--         treated as read-only.
function CountryData.getAllCountryCodes()
    -- Check function cache first
    local cacheKey = "getAllCountryCodes"
    local cached = functionCache[cacheKey]
    if cached ~= nil then
        return cached
    end

    local data = loadData()
    local result = {}

    if data and data.countries then
        for code in pairs(data.countries) do
            result[#result + 1] = code
        end
    end

    -- Cache the result
    functionCache[cacheKey] = result
    return result
end

-- Get list of all canonical country names.
--
-- @return array with `name` (or `canonical_name`) of every country in the
--         dataset, in the order pairs() visits them (unspecified). Countries
--         that have neither field are left out so the result is always a
--         proper sequence with no nil holes. The returned table is the
--         cached instance and should be treated as read-only.
function CountryData.getAllCountryNames()
    -- Check function cache first
    local cacheKey = "getAllCountryNames"
    local cached = functionCache[cacheKey]
    if cached ~= nil then
        return cached
    end

    local data = loadData()
    local result = {}

    if data and data.countries then
        for _, country in pairs(data.countries) do
            local name = type(country) == "table"
                and (country.name or country.canonical_name)
            -- Only append real names: storing nil while still advancing the
            -- index would leave a hole that breaks # and ipairs.
            if name then
                result[#result + 1] = name
            end
        end
    end

    -- Cache the result
    functionCache[cacheKey] = result
    return result
end

-- Read an arbitrary field of a country record, addressed by country code.
--
-- @param code      country code, resolved by CountryData.getCountryByCode
-- @param property  field name to read from the country record
-- @return the field's value; nil when either argument is nil/empty, the
--         country is unknown, or the field is absent
function CountryData.getCountryProperty(code, property)
    if not code or code == "" then
        return nil
    end
    if not property or property == "" then
        return nil
    end

    -- Property cache first
    local key = createCacheKey("getCountryProperty", code, property)
    local remembered = propertyCache[key]
    if remembered ~= nil then
        return remembered
    end

    local value
    local record = CountryData.getCountryByCode(code)
    if record then
        value = record[property]
    end

    -- Assigning nil stores nothing, so only non-nil values are remembered.
    propertyCache[key] = value
    return value
end

-- Read an arbitrary field of a country record, addressed by country name.
--
-- The name is first resolved to a code with CountryData.getCountryCodeByName;
-- the field is then read through CountryData.getCountryProperty.
--
-- @param name      country name or known variation
-- @param property  field name to read from the country record
-- @return the field's value; nil when either argument is nil/empty, the
--         name is not recognized, or the field is absent
function CountryData.getCountryPropertyByName(name, property)
    if not name or name == "" then
        return nil
    end
    if not property or property == "" then
        return nil
    end

    -- Property cache first
    local key = createCacheKey("getCountryPropertyByName", name, property)
    local remembered = propertyCache[key]
    if remembered ~= nil then
        return remembered
    end

    local value
    local resolvedCode = CountryData.getCountryCodeByName(name)
    if resolvedCode then
        value = CountryData.getCountryProperty(resolvedCode, property)
    end

    -- Assigning nil stores nothing, so only non-nil values are remembered.
    propertyCache[key] = value
    return value
end

-- List all available properties for a country.
--
-- The country is resolved through CountryData.getCountryByCode, so the code
-- is matched the same way as everywhere else in the module.
--
-- @param code  string country code
-- @return array of the field names present on the country record, sorted so
--         the output is stable between runs; an empty table when `code` is
--         not a non-empty string or the country is unknown
function CountryData.getAvailableProperties(code)
    if type(code) ~= "string" or code == "" then
        return {}
    end

    -- Check function cache first
    local cacheKey = createCacheKey("getAvailableProperties", code)
    local cached = functionCache[cacheKey]
    if cached ~= nil then
        return cached
    end

    local country = CountryData.getCountryByCode(code)
    if type(country) ~= "table" then
        return {}
    end

    local properties = {}
    for property in pairs(country) do
        properties[#properties + 1] = property
    end

    -- pairs() order is unspecified; sort for deterministic output.
    table.sort(properties, function(a, b)
        return tostring(a) < tostring(b)
    end)

    -- Cache the result
    functionCache[cacheKey] = properties
    return properties
end

-- Get all unique property names across all countries.
--
-- @return array holding each field name that appears on at least one country
--         record, without duplicates and sorted so the output is stable
--         between runs; an empty table when no data is available
function CountryData.getAllPropertyNames()
    -- Check function cache first
    local cacheKey = "getAllPropertyNames"
    local cached = functionCache[cacheKey]
    if cached ~= nil then
        return cached
    end

    local data = loadData()
    if not data or not data.countries then
        return {}
    end

    local properties = {}
    local seen = {}

    -- Single pass: `seen` filters duplicates while the list is being built.
    for _, country in pairs(data.countries) do
        if type(country) == "table" then
            for property in pairs(country) do
                if not seen[property] then
                    seen[property] = true
                    properties[#properties + 1] = property
                end
            end
        end
    end

    -- pairs() order is unspecified; sort for deterministic output.
    table.sort(properties, function(a, b)
        return tostring(a) < tostring(b)
    end)

    -- Cache the result
    functionCache[cacheKey] = properties
    return properties
end


-- Build the semantic property values for a country field.
--
-- `countryValue` is a semicolon-separated list of country names. Every entry
-- that CountryData.normalizeCountryName recognizes contributes its normalized
-- name to "Has country" and, when CountryData.getRegionByCountry knows a
-- region for it, that region to "Has ICANN region". Regions are appended once
-- per recognized country, so the same region can appear more than once.
--
-- Each step is recorded as a debug entry on `errorContext`, or on a freshly
-- created "CountryData" context when none is supplied.
--
-- @param countryValue  semicolon-separated country names (nil/"" allowed)
-- @param errorContext  optional ErrorHandling context that receives the
--                      debug entries
-- @return table mapping property name -> array of values; empty when nothing
--         was recognized or the input was nil/empty
function CountryData.getSemanticCountryRegionProperties(countryValue, errorContext)
    local ErrorHandling = require('Module:ErrorHandling')
    local debugContext = errorContext or ErrorHandling.createContext("CountryData")

    -- Append one debug entry to the active context.
    local function trace(kind, message)
        ErrorHandling.addError(debugContext, kind, message, "", false)
    end

    trace("FunctionEntryDebug",
        string.format("getSemanticCountryRegionProperties called with: '%s'", countryValue or "nil"))

    local properties = {}

    -- Append `item` to the value list stored under `propertyName`.
    local function addValue(propertyName, item)
        local values = properties[propertyName]
        if not values then
            values = {}
            properties[propertyName] = values
        end
        values[#values + 1] = item
    end

    if not countryValue or countryValue == "" then
        trace("EarlyReturnDebug", "Returning early: countryValue is nil or empty")
        return properties
    end

    -- Break the multi-value string into trimmed, non-empty entries
    local entries = {}
    for piece in string.gmatch(countryValue, "[^;]+") do
        local trimmed = piece:match("^%s*(.-)%s*$")
        if trimmed and trimmed ~= "" then
            entries[#entries + 1] = trimmed
        end
    end

    trace("CountryDebug",
        string.format("Input='%s' Countries=%d FirstCountry='%s'",
            countryValue or "nil",
            #entries,
            entries[1] or "none"))

    for _, entry in ipairs(entries) do
        trace("ProcessingCountryDebug",
            string.format("Processing country: '%s'", entry))

        local displayName = CountryData.normalizeCountryName(entry)

        trace("NormalizationDebug",
            string.format("'%s' normalized to '%s'", entry, displayName or "nil"))

        if displayName == "(Unrecognized)" then
            trace("UnrecognizedCountryDebug",
                string.format("Country '%s' not recognized", entry))
        else
            trace("RecognizedCountryDebug",
                string.format("Country '%s' recognized as '%s'", entry, displayName))

            addValue("Has country", displayName)

            -- The region is resolved from the entry as written, not from the
            -- normalized display name.
            local region = CountryData.getRegionByCountry(entry)
            if region and region ~= "(Unrecognized)" then
                addValue("Has ICANN region", region)

                trace("RegionDebug",
                    string.format("Country '%s' assigned to region '%s'", displayName, region))
            end
        end
    end

    -- Report how many distinct properties are being returned
    local total = 0
    for _ in pairs(properties) do
        total = total + 1
    end
    trace("FinalPropertiesDebug",
        string.format("Returning %d properties", total))

    return properties
end

-- Serialize the loaded dataset to a JSON string (for JavaScript usage).
--
-- Uses mw.text.jsonEncode when it is available.
--
-- @return the JSON text; '{}' when there is no country data, the encoder is
--         missing, or encoding fails
function CountryData.exportAsJson()
    local dataset = loadData()

    -- Nothing worth exporting
    if not (dataset and dataset.countries) then
        return '{}'
    end

    -- MediaWiki's JSON encoder, when present
    local encode = mw.text and mw.text.jsonEncode
    if encode then
        -- pcall keeps an encoder error from escaping to the caller
        local ok, json = pcall(encode, dataset)
        if ok and json then
            return json
        end
    end

    -- Encoder unavailable or encoding failed
    return '{}'
end

--------------------------------------------------------------------------------
-- Country Display Functions with contextual emoji
--------------------------------------------------------------------------------
-- CSS class for each region value that has a dedicated mapping; any other
-- truthy value falls through to "region-europe-africa".
local REGION_CSS_CLASS = {
    ["(Unrecognized)"] = "region-default",
    NA  = "region-americas",
    LAC = "region-americas",
    AP  = "region-asia-pacific",
}

-- Get region-specific CSS class for country display.
--
-- @param region  region value as returned by CountryData.getRegionByCountry
-- @return "region-default" for nil/false or "(Unrecognized)",
--         "region-americas" for "NA" and "LAC", "region-asia-pacific" for
--         "AP", and "region-europe-africa" for anything else
local function getRegionClass(region)
    if not region then
        return "region-default"
    end
    return REGION_CSS_CLASS[region] or "region-europe-africa"
end

-- Format a list of countries from a semicolon-separated string.
--
-- Every recognized country becomes one <li> carrying a region-specific CSS
-- class, and the items are wrapped in a single
-- <ul class="template-list template-list-country">. One country and many
-- countries use the same markup.
--
-- @param value  semicolon-separated country names
-- @return the HTML list, or "" when `value` is nil/empty or contains no
--         recognized country
function CountryData.formatCountryList(value)
    if not value or value == "" then return "" end

    local listItems = {}

    for rawEntry in string.gmatch(value, "[^;]+") do
        local entry = rawEntry:match("^%s*(.-)%s*$")
        if entry and entry ~= "" then
            local displayName = CountryData.normalizeCountryName(entry)

            -- Only include recognized countries
            if displayName ~= "(Unrecognized)" then
                -- Resolve the region from the entry as written. The display
                -- name has been rewritten for categories (commas and "the"
                -- removed) and may no longer match any known spelling, which
                -- would wrongly yield the default class.
                local regionClass = getRegionClass(CountryData.getRegionByCountry(entry))

                listItems[#listItems + 1] =
                    string.format("<li class=\"%s\">%s</li>", regionClass, displayName)
            end
        end
    end

    if #listItems == 0 then
        return ""
    end

    return string.format("<ul class=\"template-list template-list-country\">%s</ul>",
                         table.concat(listItems, ""))
end

-- Alias for backward compatibility.
-- Forwards `value` to CountryData.formatCountryList and returns its result
-- unchanged.
function CountryData.formatCountries(value)
    return CountryData.formatCountryList(value)
end

-- Collect the normalized names of all recognized countries in a
-- semicolon-separated string, for category assignment.
--
-- @param value  semicolon-separated country names
-- @return array of normalized names in input order (entries that
--         CountryData.normalizeCountryName does not recognize are left out);
--         an empty table when `value` is nil or ""
function CountryData.getCountriesForCategories(value)
    if not value or value == "" then
        return {}
    end

    local recognized = {}

    for piece in string.gmatch(value, "[^;]+") do
        local entry = piece:match("^%s*(.-)%s*$")
        if entry and entry ~= "" then
            local displayName = CountryData.normalizeCountryName(entry)
            -- Skip anything the dataset does not know
            if displayName ~= "(Unrecognized)" then
                recognized[#recognized + 1] = displayName
            end
        end
    end

    return recognized
end

-- Flag filename lookup.
--
-- Accepts either a country name (underscores are treated as spaces) or a
-- two-character country code and builds the flag file name from the code.
--
-- @param countryNameOrCode  country name, known variation, or 2-letter code
-- @return "Flag-xx.svg" with the lower-cased code (e.g. "Flag-ad.svg"), or
--         nil when the input is nil/empty, cannot be resolved, or resolves
--         to a code that is not exactly two characters long
function CountryData.getFlagFileName(countryNameOrCode)
    if not countryNameOrCode or countryNameOrCode == '' then
        return nil
    end

    -- Underscores -> spaces; parentheses drop gsub's replacement count
    local cleaned = (countryNameOrCode:gsub('_', ' '))

    -- Try the input as a country name first; getCountryCodeByName does its
    -- own normalization.
    local isoCode = CountryData.getCountryCodeByName(cleaned)

    -- Otherwise a 2-character input may already be a code: accept it only if
    -- the dataset has a country for it.
    if not isoCode and #cleaned == 2 and CountryData.getCountryByCode(cleaned) then
        isoCode = cleaned
    end

    -- No code, an empty code, or a code of the wrong length: give up
    if not isoCode or isoCode == '' or #isoCode ~= 2 then
        return nil
    end

    return 'Flag-' .. string.lower(isoCode) .. '.svg'
end

return CountryData