Module:etymon/categories
Appearance
- This module lacks a documentation subpage. Please create it.
- Useful links: root page • root page’s subpages • links • transclusions • testcases • sandbox
local export = {} local M = require("Module:module loader").init({ require = { etymology = "Module:etymology", affix = "Module:affix", etymology_specialized = "Module:etymology/specialized", }, loadData = { data = "Module:etymon/data", }, }) -- Evaluate whether a keyword is transitive for a given term local function is_transitive(transitive_mode, page_lang, term_lang) if transitive_mode == M.data.TRANSITIVE.ALWAYS then return true elseif transitive_mode == M.data.TRANSITIVE.NEVER then return false elseif transitive_mode == M.data.TRANSITIVE.CROSS_LANG then return page_lang:getCode() ~= term_lang:getCode() elseif transitive_mode == M.data.TRANSITIVE.CROSS_LANG_NO_INTERNAL_SOURCE then return page_lang:getCode() ~= term_lang:getCode() end error("Unknown transitive mode: " .. tostring(transitive_mode)) end -- Get keyword config with language-specific overrides local function get_keyword_config(keyword, lang_exc) local base_config = M.data.keywords[keyword] if not base_config then return nil -- Invalid keyword end local overrides = lang_exc and lang_exc.keyword_overrides and lang_exc.keyword_overrides[keyword] if not overrides then return base_config end -- Merge overrides into base config local merged = {} for k, v in pairs(base_config) do merged[k] = v end for k, v in pairs(overrides) do merged[k] = v end return merged end function export.get_cat_name(source) local _, cat_name = M.etymology.get_display_and_cat_name(source, true) return cat_name end -- Normalize affix type aliases local aftype_aliases = { ["pre"] = "prefix", ["suf"] = "suffix", ["in"] = "infix", ["inter"] = "interfix", ["circum"] = "circumfix", ["naf"] = "non-affix", ["root"] = "non-affix", } -- Collect affix categories from top-level group containers local function collect_affix_categories(node, page_lang, available_etymon_ids, senseid_parent_etymon, lang_exc) local parts = {} local part_index = 1 for _, container in ipairs(node.children or {}) do local config = container.keyword_info if config and config.affix_categories then for _, term in ipairs(container.terms or {}) do if not term.unknown_term then local part_data = { term = term.title, tr = term.tr, ts = term.ts, alt = term.alt, itemno = part_index, orig_index = part_index } -- Determine affix type: explicit aftype > pos=root > auto-detect local aftype = term.aftype if aftype then aftype = aftype_aliases[aftype] or aftype part_data.type = aftype elseif term.args and term.args.pos and term.args.pos == "root" then part_data.type = "non-affix" end if term.lang:getCode() ~= page_lang:getCode() then part_data.lang = term.lang end local target_ids = available_etymon_ids[term.target_key] local has_multiple_ids = target_ids and #target_ids > 1 local id_exists_in_disambiguation = false local matched_id = nil -- Count available senseids for the target page local senseid_count = 0 local target_prefix = term.target_key .. ":" if senseid_parent_etymon then for key, _ in pairs(senseid_parent_etymon) do if key:sub(1, #target_prefix) == target_prefix then senseid_count = senseid_count + 1 end end end local has_multiple_senseids = senseid_count > 1 if term.id then -- Check if user provided a valid senseid local senseid_key = term.target_key .. ":" .. term.id if senseid_parent_etymon and senseid_parent_etymon[senseid_key] then if has_multiple_senseids then -- Ambiguous senseid: use senseid matched_id = term.id id_exists_in_disambiguation = true elseif has_multiple_ids then -- Unique senseid but ambiguous etymon: use etymon ID matched_id = term.etymon_id or term.id id_exists_in_disambiguation = true end else -- Check if user provided a valid etymon ID if has_multiple_ids and target_ids then for _, id_data in ipairs(target_ids) do local stored_id = type(id_data) == "table" and id_data.id or id_data if stored_id == term.id then -- Ambiguous etymon: use etymon ID id_exists_in_disambiguation = true matched_id = term.id break end end end -- Fallback: check resolved etymon_id (e.g. from previous steps) if not id_exists_in_disambiguation and has_multiple_ids and term.etymon_id and target_ids then for _, id_data in ipairs(target_ids) do local stored_id = type(id_data) == "table" and id_data.id or id_data if stored_id == term.etymon_id then id_exists_in_disambiguation = true matched_id = term.etymon_id break end end end end end -- Use the matched ID if found if term.override or id_exists_in_disambiguation then part_data.id = matched_id or term.id end table.insert(parts, part_data) part_index = part_index + 1 end end end end if #parts == 0 then return {} end local affix_data = { lang = page_lang, parts = parts, pos = "term", sort_key = nil } local affix_categories = M.affix.get_affix_categories_only(affix_data) local result = {} for _, cat in ipairs(affix_categories) do if type(cat) == "table" then table.insert(result, cat.cat) else table.insert(result, cat) end end return result end -- Add borrowing-related categories (top-level only) local function collect_borrowing_categories(categories, page_lang, term, config) if config.borrowing_type == "borrowed" then local temp_categories = {} M.etymology.insert_borrowed_cat(temp_categories, page_lang, term.lang) for _, cat in ipairs(temp_categories) do categories[cat] = true end end if config.specialized_borrowing then local result = M.etymology_specialized.specialized_borrowing { bortype = config.specialized_borrowing, lang = page_lang, sources = { term.lang }, terms = { { lang = term.lang, term = "-" } }, notext = true, nocat = false, } for cat_name in result:gmatch("%[%[Category:([^%]]+)%]%]") do categories[cat_name] = true end end end -- Add source-based derivation categories (top-level only) local function collect_source_derivation_categories(categories, page_lang, term, config) if not config.source_category_type then return end local temp_categories = {} M.etymology.insert_source_cat_get_display { lang = page_lang, source = term.lang, categories = temp_categories, borrowing_type = config.source_category_type, nocat = false, } for _, cat in ipairs(temp_categories) do categories[cat] = true end end -- Add source language categories local function collect_source_categories(categories, page_lang, term, chain, get_norm_lang_func) if page_lang:getCode() == get_norm_lang_func(term.lang):getCode() then return end local temp_categories = {} M.etymology.insert_source_cat_get_display { lang = page_lang, source = term.lang, categories = temp_categories, nocat = false, } for _, cat in ipairs(temp_categories) do categories[cat] = true end if chain.inherited then temp_categories = {} M.etymology.insert_source_cat_get_display { lang = page_lang, source = term.lang, categories = temp_categories, borrowing_type = "terms inherited", nocat = false, } for _, cat in ipairs(temp_categories) do categories[cat] = true end end end -- Add root/word categories local function collect_pos_categories(categories, page_lang, root_title, term, available_etymon_ids, chain, get_norm_lang_func, lang_exc, keyword) local pos_types = { root = "root", word = "word" } -- Determine pos: from term's postype, keyword's pos_override, or args.pos local pos local config = get_keyword_config(keyword, lang_exc) if term.postype then -- Term-level postype modifier takes highest priority pos = term.postype elseif config and config.pos_override then pos = config.pos_override elseif type(term.args) == "table" and term.args.pos then pos = term.args.pos end local pos_type = pos_types[pos] if not pos_type or term.unknown_term then return end -- Skip root/word categories for descendants of affix groups -- if pos_type then -- return -- end local same_language = get_norm_lang_func(page_lang):getFullCode() == get_norm_lang_func(term.lang):getFullCode() -- Skip self-references if same_language and root_title == term.title then return end -- Use makeEntryName to strip diacritics for category names local entry_name = term.lang:makeEntryName(term.title) local lang_name = page_lang:getCanonicalName() local cat_name if chain.passed_through then local etymon_lang_name = export.get_cat_name(term.lang) cat_name = lang_name .. " terms derived from the " .. etymon_lang_name .. " " .. pos_type .. " " .. entry_name else cat_name = lang_name .. " terms belonging to the " .. pos_type .. " " .. entry_name end -- Add ID disambiguation if needed (for roots/words: use etymon_id if resolved via senseid, otherwise use id) local target_ids = available_etymon_ids[term.target_key] local effective_id = term.etymon_id or term.id -- etymon_id if senseid, otherwise id is already an etymon id if target_ids and effective_id then local same_pos_count = 0 for _, id_data in ipairs(target_ids) do if type(id_data) == "table" and id_data.pos == pos then same_pos_count = same_pos_count + 1 end end if same_pos_count > 1 then cat_name = cat_name .. " (" .. effective_id .. ")" end end categories[cat_name] = true end -- Compute chain state for a term based on parent chain and keyword config -- Hyphen patterns for affix detection (regular hyphen + script-specific) local AFFIX_HYPHEN_PATTERN = "[%-%־ـ᠊]" -- regular hyphen, Hebrew maqqef, Arabic tatweel, Mongolian hyphen -- Check if a term is an actual affix (not a non-affix member of an affix group) local function is_actual_affix(term) -- Check explicit aftype modifier if term.aftype then local normalized = aftype_aliases[term.aftype] or term.aftype return normalized ~= "non-affix" end -- Check if pos=root (treated as non-affix) if term.args and term.args.pos and term.args.pos == "root" then return false end -- Auto-detect by hyphen: prefix ends with -, suffix starts with -, etc. if term.title then local title = term.title -- Strip leading * for reconstructed terms before checking hyphens title = title:gsub("^%*", "") -- Check for hyphens at start or end (handles script-specific hyphens too) if title:match("^" .. AFFIX_HYPHEN_PATTERN) or title:match(AFFIX_HYPHEN_PATTERN .. "$") then return true end end -- Default: not an affix return false end local function compute_category_chain(parent_chain, config, page_lang, term_lang, get_norm_lang_func, parent_term_lang, term) -- Track if we're inside an actual affix (for suppressing root categories on descendants) -- Only set if the term is an actual affix (prefix, suffix, etc.), not a non-affix member local inside_affix = parent_chain.inside_affix if config.affix_categories and term and is_actual_affix(term) then inside_affix = true end -- If no_child_categories is set, disable everything if config.no_child_categories then return { passed_through = parent_chain.passed_through or page_lang:getCode() ~= get_norm_lang_func(term_lang):getCode(), inherited = false, source = false, pos = false, recurse = false, inside_affix = inside_affix, } end local term_is_transitive = is_transitive(config.transitive, page_lang, term_lang) local new_source = parent_chain.source and term_is_transitive -- For CROSS_LANG_NO_INTERNAL_SOURCE: track internal derivation language context -- Check if this term is internal relative to parent term's language (if parent_term_lang provided) -- or relative to page language (if no parent_term_lang) local internal_lang = parent_chain.internal_lang local is_internal_in_context = false if config.transitive == M.data.TRANSITIVE.CROSS_LANG_NO_INTERNAL_SOURCE then local check_lang = parent_term_lang or page_lang local term_lang_code = get_norm_lang_func(term_lang):getCode() local check_lang_code = get_norm_lang_func(check_lang):getCode() if internal_lang then -- Already in an internal derivation context: check if this term is also internal is_internal_in_context = term_lang_code == internal_lang else -- Check if this term is internal relative to parent term (or page if no parent) is_internal_in_context = term_lang_code == check_lang_code end end -- Source chain behavior for CROSS_LANG_NO_INTERNAL_SOURCE if config.transitive == M.data.TRANSITIVE.CROSS_LANG_NO_INTERNAL_SOURCE then if is_internal_in_context then -- Internal derivation new_source = false internal_lang = get_norm_lang_func(term_lang):getCode() else -- Cross-language new_source = parent_chain.source and term_is_transitive internal_lang = nil end end local new_pos = parent_chain.pos return { passed_through = parent_chain.passed_through or page_lang:getCode() ~= get_norm_lang_func(term_lang):getCode(), inherited = parent_chain.inherited and config.inherited_chain, source = new_source, pos = new_pos, internal_lang = internal_lang, recurse = new_source or new_pos, inside_affix = inside_affix, } end function export.render(opts) opts = opts or {} local data_tree = opts.data_tree local page_lang = opts.page_lang local available_etymon_ids = opts.available_etymon_ids local senseid_parent_etymon = opts.senseid_parent_etymon local get_norm_lang_func = opts.get_norm_lang_func local lang_exc = opts.lang_exc local categories = {} local seen = {} local lang_name = page_lang:getCanonicalName() local root_title = data_tree.title -- Collect the tree recursively local function collect(node, parent_chain, is_toplevel) -- Avoid processing same node twice if not node.unknown_term and node.title then local key = node.lang:getFullCode() .. ":" .. (node.title or "") .. ":" .. (node.id or "") if seen[key] then return end seen[key] = true end -- Collect affix categories at top level only if is_toplevel then local affix_cats = collect_affix_categories(node, page_lang, available_etymon_ids, senseid_parent_etymon, lang_exc) for _, cat in ipairs(affix_cats) do categories[lang_name .. " " .. cat] = true end end -- Process each container for _, container in ipairs(node.children or {}) do local keyword = container.keyword local config = get_keyword_config(keyword, lang_exc) -- Skip invalid keywords if config then -- Process each term in the container for _, term in ipairs(container.terms or {}) do local term_chain = compute_category_chain(parent_chain, config, page_lang, term.lang, get_norm_lang_func, node.lang, term) local no_child_categories = config.no_child_categories == true local term_is_transitive = is_transitive(config.transitive, page_lang, term.lang) -- Top-level only processing if is_toplevel then -- Missing/ambiguous etymon tracking if not term.unknown_term and (term.status == M.data.STATUS.MISSING or term.status == M.data.STATUS.REDLINK) then categories[lang_name .. " entries referencing missing etymons"] = true end if not term.unknown_term and term.status == M.data.STATUS.AMBIGUOUS then categories[lang_name .. " entries referencing ambiguous etymons"] = true end if term.missing_descendants_header then categories[lang_name .. " entries referencing etymons without Descendants sections"] = true end if term.missing_descendants_entry then categories[lang_name .. " entries referencing etymons without this term in Descendants sections"] = true end -- Top-level category (e.g., "undefined derivations") if config.toplevel_category then categories[lang_name .. " " .. config.toplevel_category] = true end -- Borrowing categories (bor, lbor, slbor, ubor, obor) if config.borrowing_type or config.specialized_borrowing then collect_borrowing_categories(categories, page_lang, term, config) end -- Borrowing categories from <bor>, <lbor>, or <slbor> modifiers on :af/:surf terms if keyword == "affix" or keyword == "surf" then if term.bor then local bor_config = { borrowing_type = "borrowed" } collect_borrowing_categories(categories, page_lang, term, bor_config) elseif term.lbor then local bor_config = { specialized_borrowing = "learned" } collect_borrowing_categories(categories, page_lang, term, bor_config) elseif term.slbor then local bor_config = { specialized_borrowing = "semi-learned" } collect_borrowing_categories(categories, page_lang, term, bor_config) end end -- Source-based derivation categories (sl, calque, pcal) if config.source_category_type then collect_source_derivation_categories(categories, page_lang, term, config) end -- Skip all child categorisation if no_child_categories is set if not no_child_categories then -- Source categories only if transitive if term_is_transitive then collect_source_categories(categories, page_lang, term, term_chain, get_norm_lang_func) end -- Pos categories always (unless no_child_categories) collect_pos_categories(categories, page_lang, root_title, term, available_etymon_ids, term_chain, get_norm_lang_func, lang_exc, keyword) end else -- Below top level, respect the parent chain if parent_chain.source then collect_source_categories(categories, page_lang, term, term_chain, get_norm_lang_func) end if parent_chain.pos then collect_pos_categories(categories, page_lang, root_title, term, available_etymon_ids, term_chain, get_norm_lang_func, lang_exc, keyword) end end -- Recurse into term's children if needed and status allows if term_chain.recurse and (term.status == M.data.STATUS.OK or term.status == M.data.STATUS.INLINE) then collect(term, term_chain, false) end end end end end -- Initial chain state local initial_chain = { passed_through = false, inherited = true, source = true, pos = true, internal_lang = nil, recurse = true, inside_affix = false, } collect(data_tree, initial_chain, true) local cat_list = {} for cat in pairs(categories) do table.insert(cat_list, cat) end return cat_list end return export