-- Bước tới nội dung
--
-- Mô đun:zh-translit
--
-- Từ điển mở Wiktionary
local m_str_utils = require("Module:string utilities")

local find_templates = require("Module:template parser").find_templates
local get_section = require("Module:pages").get_section
local gsub = string.gsub
local insert = table.insert
local safe_require = require("Module:utilities").safe_require
local split = m_str_utils.split
local toNFD = mw.ustring.toNFD
local trim = m_str_utils.trim
local ugsub = m_str_utils.gsub
local ulen = m_str_utils.len
local ulower = m_str_utils.lower
local usub = m_str_utils.sub
local uupper = m_str_utils.upper

-- Lazily loaded cache of the `MT` table from Module:zh/data/cmn-tag; it stays
-- nil until export.tr first needs it for Mandarin text.
local tag

-- Maps a language code to the abbreviation that is compared against the
-- argument names of {{zh-pron}}.
local lect_code = mw.loadData("Module:zh/data/lect codes").langcode_to_abbr

-- Table of functions exported by this module.
local export = {}

--- Record that a manual transliteration is needed and signal failure.
-- Passes the key "zh-translit/needs manual translit/<lang>" to the function
-- returned by Module:debug/track.
-- @param lang language code appended to the tracking key
-- @param request not used by this function; kept so callers stay valid
-- @return nil, always
local function fail(lang, request)
	local track = require("Module:debug/track")
	track("zh-translit/needs manual translit/" .. lang)
	return nil
end

--- Fetch the level-2 "Chinese" section of a page.
-- @param title page title to look up
-- @return whatever get_section returns for the page's wikitext, or false when
--   the title cannot be turned into a title object or the page has no content
local function get_content(title)
	local page = mw.title.new(title)
	if not page then
		return false
	end
	-- getContent() yields nil for a page that does not exist; stop here rather
	-- than handing nil to get_section.
	local wikitext = page:getContent()
	if not wikitext then
		return false
	end
	return get_section(wikitext, "Chinese", 2)
end

-- Match function emulating the regex ",(?! )": locates the first comma at or
-- after `start` that is not immediately followed by a space. Returns the
-- comma's position twice (start and end of the match), or nothing when no
-- such comma exists.
local function split_on_comma_without_space(str, start)
	local pos = str:find(",", start)
	while pos do
		local after = pos + 1
		if str:sub(after, after) ~= " " then
			return pos, pos
		end
		-- Comma was followed by a space; keep searching past it.
		pos = str:find(",", after)
	end
end

-- Lects whose readings are separated by a comma that is not followed by a
-- space; every other lect (apart from ltc/och) uses "/".
local comma_separated_lects = {
	["cmn"] = true,
	["csp"] = true,
	["wuu"] = true,
	["yue"] = true,
	["zhx-tai"] = true,
}

--- Reconcile the readings of one {{zh-pron}} argument with the reading found so far.
-- @param readings raw argument value
-- @param lang language code
-- @param tr reading accepted so far, or nil if none yet
-- @return the accepted reading; nil if the value contained no usable reading;
--   false if a reading conflicts with `tr` (or with another reading in the
--   same value)
local function handle_readings(readings, lang, tr)
	if lang == "ltc" or lang == "och" then
		-- These values are passed through whole and must simply agree.
		if tr and readings ~= tr then
			return false
		end
		return readings
	end

	if comma_separated_lects[lang] then
		readings = split(readings, split_on_comma_without_space, true)
	else
		readings = split(readings, "/", true, true)
	end

	local tr_orig = tr
	for _, reading in ipairs(readings) do
		reading = trim(reading)
		if not reading:find("=") then
			if (
				not tr or
				tr == reading or
				gsub(ulower(tr), "%^", "") == reading
			) then
				tr = reading
			elseif ulower(reading) ~= tr then
				return false
			end
		elseif lang == "cmn" and reading == "cap=y" and tr then
			-- `tr` is nil when "cap=y" precedes every reading; without the
			-- `and tr` guard the concatenation below would raise an error.
			local tr_cap = "^" .. tr
			if not tr_orig or tr_orig == tr_cap then
				tr = tr_cap
			end
		end
	end
	return tr
end

--- Scan the templates in `content` for readings and redirect targets.
-- For each {{zh-pron}}, the argument whose name equals lect_code[lang] is
-- passed to handle_readings. For each {{zh-see}}, the first argument is
-- queued in `see` unless it is already recorded in `seen`.
-- @param content text handed to find_templates
-- @param lang language code
-- @param see array of page titles still to visit; appended to in place
-- @param seen set of titles already visited or queued; updated in place
-- @param tr reading accepted so far, or nil
-- @return the (possibly updated) reading, nil if none was found, or false as
--   soon as handle_readings reports a conflict
local function iterate_content(content, lang, see, seen, tr)
	-- The argument name is the same for every template, so look it up once.
	local arg_key = lect_code[lang]
	for template in find_templates(content) do
		local name = template:get_name()
		if name == "zh-pron" then
			-- Direct lookup instead of scanning every argument.
			-- NOTE(review): assumes get_arguments() returns a plain table — confirm.
			local value = type(arg_key) == "string" and template:get_arguments()[arg_key]
			if value and #value > 0 then
				tr = handle_readings(value, lang, tr)
			end
			if tr == false then
				return tr
			end
		elseif name == "zh-see" then
			local target = template:get_arguments()[1]
			-- A {{zh-see}} without a first argument is skipped so trim() is
			-- never called with nil.
			if target then
				target = trim(target)
				if target ~= "" and not seen[target] then
					-- Mark the title when it is queued so it is only queued once.
					seen[target] = true
					insert(see, target)
				end
			end
		end
	end
	return tr
end

--- Transliterate a Chinese term by reading the {{zh-pron}} data on its entry page.
-- The page named `text` is loaded, its "Chinese" section is scanned for a
-- reading in the requested lect, and {{zh-see}} targets are followed when the
-- page itself has none. The reading is then formatted per lect.
-- @param text term to transliterate (used as a page title)
-- @param lang language code; "zh" and "lzh" are treated as "cmn"
-- @param sc not used by this function
-- @return `text` unchanged when it is nil or empty; otherwise the formatted
--   transliteration followed by one space, or nil (after tracking via fail)
--   when no unambiguous reading is available
function export.tr(text, lang, sc)
	if (not text) or text == "" then
		return text
	end
	
	if lang == "zh" or lang == "lzh" then
		lang = "cmn"
	end
	
	if not lect_code[lang] then
		local lang_obj = require("Module:languages").getByCode(lang, nil, true)
		if not lang_obj then
			-- Unknown code: report failure instead of indexing nil.
			return fail(tostring(lang))
		end
		lang = lang_obj:getFullCode()
	end
	
	local content = get_content(text)
	if not content then
		return fail(lang)
	end
	
	local see = {}
	local seen = {
		[text] = true
	}
	local tr = iterate_content(content, lang, see, seen)
	
	if tr == nil then
		-- `see` may grow while it is being walked, so use an index loop.
		local i = 1
		while i <= #see do
			local title = see[i]
			-- Mark the title before fetching so a missing page or a
			-- self-reference is never queued and fetched a second time.
			seen[title] = true
			content = get_content(title)
			if content then
				tr = iterate_content(content, lang, see, seen, tr)
				if tr == false then
					return fail(lang)
				end
			end
			i = i + 1
		end
	end
	
	if not tr then
		return fail(lang)
	end
	
	if lang == "cmn" then
		tr = tr:gsub("#", "")
		-- Only run the per-character lookup when the reading has non-ASCII bytes.
		if tr:match("[\194-\244]") then
			tag = tag or mw.loadData("Module:zh/data/cmn-tag").MT
			tr = tr:gsub(".[\128-\191]*", function(m)
				if m == "一" then
					return "yī"
				elseif m == "不" then
					return "bù"
				else
					m = tag[m] and tag[m][1]
					if m then
						return toNFD(m):gsub("^[aeiou]", "\1%0") -- temporarily use \1 for apostrophes, as it's not in %p
					end
				end
			end)
			tr = ugsub(tr, "%f[^%z%s%p](^?)\1", "%1") -- remove any initial apostrophes inserted by the previous function
				:gsub("\1", "'")
		end
		-- A caret marks the following character (after an optional apostrophe) for uppercasing.
		tr = ugsub(tr, "%^('?.)", uupper)
	elseif lang == "csp" or lang == "yue" or lang == "zhx-tai" then
		tr = tr:gsub("%d[%d%*%-]*%f[^%d%*]", "<sup>%0</sup>")
	elseif lang == "hak" then
		-- TODO
	elseif lang == "ltc" or lang == "och" then
		if tr == "n" then
			return fail(lang)
		end
		local index = split(tr, lang == "ltc" and "," or ";", true, true)
		
		-- Loop-invariant pieces: the data-module prefix is built once, and the
		-- ltc helpers are loaded at most once, on first use.
		local module_prefix = "Module:zh/data/" .. lang .. "-pron" .. (lang == "och" and "-ZS" or "") .. "/"
		local ltc_data, infer_categories
		
		for i = 1, ulen(text) do
			local data_module = safe_require(module_prefix .. usub(text, i, i))
			local idx = index[i]
			
			-- Fail when there is no data for the character, or when it has
			-- several entries and no explicit choice was given.
			if not data_module or ((not idx or idx == "y") and #data_module > 1) then
				return fail(lang)
			end
			
			if idx == "y" then
				idx = 1
			elseif idx then
				idx = tonumber(idx)
			end
			
			local entry = idx and data_module[idx] or data_module[1]
			
			if lang == "ltc" then
				if not ltc_data then
					ltc_data = mw.loadData("Module:ltc-pron/data")
					infer_categories = require("Module:ltc-pron").infer_categories
				end
				local initial, final, tone = infer_categories(entry)
				tone = tone ~= "" and ("<sup>" .. tone .. "</sup>") or tone
				index[i] = ltc_data.initialConv["Zhengzhang"][initial] .. ltc_data.finalConv["Zhengzhang"][final] .. tone
			else
				index[i] = entry[6]
			end
		end
		tr = table.concat(index, " ")
		if lang == "och" then
			tr = "*" .. tr
		end
	elseif lang == "nan" then
		-- TODO
	elseif lang == "nan-tws" then
		tr = require("Module:nan-pron").pengim_display(tr)
	elseif lang == "wuu" then
		local w_pron = require("Module:wuu-pron")
		if tr:match(';') then
			--TODO
			return fail(lang)
		elseif tr:match(':') then
			-- Drops the first three bytes before formatting.
			-- NOTE(review): presumably a "xx:" prefix — confirm against the data.
			tr = w_pron.wugniu_format(tr:sub(4))
		else
			tr = w_pron.wugniu_format(w_pron.wikt_to_wugniu(tr))
		end
	elseif lang == "zhx-sic" then
		tr = ugsub(tr, "([%d-])(%a)", "%1 %2")
			:gsub("%d[%d%*%-]*%f[^%d%*]", "<sup>%0</sup>")
	else
		tr = require("Module:" .. lang .. "-pron").rom(tr)
	end
	
	-- End with a space so that concurrent parts of running text that need to be transliterated separately (e.g. due to links) are still properly separated.
	return tr .. " "
end

-- Hand the table of exported functions back to the caller of require().
return export