Bước tới nội dung

Mô đun:Hrkt-translit

Từ điển mở Wiktionary
-- Local aliases for the library functions used throughout this module.
local concat = table.concat
local find = mw.ustring.find
local gsub = mw.ustring.gsub
local insert = table.insert
local load_data = mw.loadData
local toNFC = mw.ustring.toNFC

-- Kana helpers taken from Module:ja.
local m_ja = require("Module:ja")
local kata_to_hira = m_ja.kata_to_hira
local normalize_kana = m_ja.normalize_kana

-- Shared (language-independent) data, loaded through mw.loadData.
local data_common = load_data("Module:Hrkt-translit/data")
-- Romanization stored for "っ" in the common data. export.tr also uses this
-- string as an internal marker that is turned into "'" in its final output.
local c_apos = data_common.rom["っ"]

-- Table returned by this module; public functions are attached to it.
local export = {}

--- Builds a lookup function over the transliteration data tables.
-- The returned function takes a path of keys (e.g. `d("rom", "あ")`) and
-- returns the value found at that path. When `lang` is given and
-- "Module:Hrkt-translit/data/<lang>" can be found by package.loaders[2], the
-- language-specific table is consulted alongside the common one and its
-- value wins whenever it is not nil; otherwise only the common table is used.
local function get_data(lang)
	-- Follows the key path through nested tables starting at `node`.
	-- Yields nil as soon as a non-table is reached while keys remain.
	local function descend(node, ...)
		local path = {...}
		for depth = 1, select("#", ...) do
			if type(node) ~= "table" then
				return nil
			end
			node = node[path[depth]]
		end
		return node
	end

	-- Lookup that only knows about the common data.
	local function lookup_common(...)
		return descend(data_common[...], select(2, ...))
	end

	if not lang then
		return lookup_common
	end

	local name_data = "Module:Hrkt-translit/data/" .. lang
	if not package.loaders[2](name_data) then
		return lookup_common
	end

	local data_lang = load_data(name_data)

	-- Lookup that walks both tables in parallel, one key at a time.
	return function(...)
		local n_keys = select("#", ...)
		local from_lang, from_common = data_lang[...], data_common[...]
		local depth = 2
		while depth <= n_keys do
			local key = select(depth, ...)
			-- Language branch ran out: finish the walk in the common table.
			if type(from_lang) ~= "table" then
				return descend(from_common, select(depth, ...))
			end
			from_lang = from_lang[key]
			-- Common branch ran out: finish the walk in the language table
			-- (which has already consumed the current key).
			if type(from_common) ~= "table" then
				return descend(from_lang, select(depth + 1, ...))
			end
			from_common = from_common[key]
			depth = depth + 1
		end
		if from_lang == nil then
			return from_common
		end
		return from_lang
	end
end

--- Transliterates kana text into Latin script.
-- @param text     string to transliterate; it is run through normalize_kana
--                 and toNFC before processing.
-- @param lang     optional language code; passed to get_data to select
--                 language-specific data overrides.
-- @param sc       not used by this function.
-- @param options  optional table. Fields read here: `hist`, `phonetic`,
--                 `keep_period`, `no_diacritics`.
-- @return the romanized string, with every occurrence of the internal
--         marker `c_apos` replaced by "'".
function export.tr(text, lang, sc, options)
	options = options or {}
	
	-- result[i] holds the romanized piece for the i-th processed character;
	-- result[0] is an empty sentinel so that index 0 can always be read.
	-- result_sp[i] holds the special flag attached to that piece, if any.
	local result = {[0] = ""}
	local result_sp = {}
	
	local d = get_data(lang)
	
	-- Default predicates for getlast, created once rather than on every
	-- iteration of its loop.
	local function default_bad(index)
		return result_sp[index] == "stop"
	end
	local function default_good(index)
		return result[index]:len() > 0 and result_sp[index] ~= "'"
	end
	
	-- Scans `result` backwards from i_start (default: the last element) down
	-- to index 1 and returns the first index accepted by predicate_good.
	-- Returns 0 if predicate_bad fires first or nothing is accepted.
	-- Elements lying between a ">" element and the preceding "<" element are
	-- skipped without being tested.
	local function getlast(i_start, predicate_good, predicate_bad)
		predicate_good = predicate_good or default_good
		predicate_bad = predicate_bad or default_bad
		local in_xml = false
		for i = i_start or #result, 1, -1 do
			if in_xml then
				if result[i] == "<" then in_xml = false end
			elseif result[i] == ">" then
				in_xml = true
			else
				if predicate_bad(i) then break end
				if predicate_good(i) then return i end
			end
		end
		return 0
	end
	
	-- normalize long vowels and iteration marks
	text = normalize_kana(text)
	
	-- convert to NFC (FIXME: convert this module to use NFD, which will simplify things)
	text = toNFC(text)
	
	-- Map katakana through kata_to_hira, then walk the text one UTF-8
	-- character at a time (a lead byte followed by its continuation bytes).
	for c in gsub(text, "[ァ-ヶ𛄠𛄢𛅤-𛅦]", kata_to_hira):gsub("\227\130[\144-\146]゙", {
		-- convert ゐ゙, ゑ゙, を゙ to ヸ, ヹ, ヺ, to ensure voicing works correctly
		["ゐ゙"] = "ヸ", ["ゑ゙"] = "ヹ", ["を゙"] = "ヺ",
	}):gmatch(".[\128-\191]*") do
		-- rc: romanization of c (historical table first when options.hist is
		-- set, then the regular table, then the character itself).
		-- rc_sp: special flag for c from the "rom_sp" table.
		local rc = options.hist and d("rom_hist", c) or d("rom", c) or c
		local rc_sp = d("rom_sp", c)
		local i_last = getlast()
		
		-- ASCII letters in the input are flagged "stop", which makes the
		-- default getlast scan halt when it reaches them.
		if options.keep_period and c == "." then rc = "."
		elseif c:match("%a") then rc_sp = "stop" end
		
		-- A digraph entry replaces the previous piece and swallows this one.
		local repl_digraph = d("digraph", c, result[i_last])
		if repl_digraph then
			result[i_last], rc = repl_digraph, ""
			result_sp[i_last], rc_sp = nil, nil
		end
		
		-- For flags listed in "flag_hahe", the flag value itself is used as
		-- the replacement romanization once context decides in its favour.
		-- NOTE(review): presumably this chooses between the two readings of
		-- は/へ — confirm against the data module.
		if not options.hist then --はへ
			-- The previous piece is switched when the current character is
			-- "-", ".", a combining (han)dakuten, romanizes to something
			-- containing a letter, or is the c_apos marker.
			if d("flag_hahe", result_sp[i_last]) and (find(c, "[-%.゙゚]") or rc:match("%a") or rc == c_apos) then
				result[i_last] = result_sp[i_last]
				result_sp[i_last] = nil
			end
			-- The current piece is switched immediately when options.phonetic
			-- is set, when the nearest significant piece is a "stop", or when
			-- the previous piece contains a letter/hyphen or is c_apos.
			if d("flag_hahe", rc_sp) and (options.phonetic or result_sp[getlast(nil, function(i)
				return result[i]:len() > 0 and result_sp[i] ~= "'" or result_sp[i] == "stop"
			end, function() return false end)] == "stop" or result[i_last]:match"[-%a]" or result[i_last] == c_apos) then
				rc = rc_sp
				rc_sp = nil
			end
		end
		
		-- Insert a space after closing punctuation that is followed by a
		-- letter, and before opening punctuation that follows a letter.
		-- The closing set uses the closing curly quote (U+201D), mirroring
		-- the opening curly quote (U+201C) in the set below.
		if rc:match"%a" and find(result[i_last], "^[,%.?!:)”]$") then --space and punctuations
			result[i_last] = result[i_last] .. " "
		elseif find(rc, "^[(“]$") and result[i_last]:match("%a") then
			rc = " " .. rc
		end
		
		-- Voicing marks rewrite the leading consonants of the previous piece
		-- through the corresponding data table.
		if rc_sp == "voiced" then --voicing
			result[i_last] = result[i_last]:gsub("^[b-df-hj-np-tv-z]+", d("tr_voicing"))
		elseif rc_sp == "semivoiced" then
			result[i_last] = result[i_last]:gsub("^[b-df-hj-np-tv-z]+", d("tr_semivoicing"))
		end
		
		-- A bare "n" followed by a vowel or y (also w in historical mode)
		-- gets the c_apos marker so the two pieces stay distinguishable.
		if result[i_last] == "n" and rc:match(options.hist and "^[aiueoyw]" or "^[aiueoy]") then --na vs n'a
			rc = c_apos .. rc
		end
		
		-- r_lastlast: the last ASCII letter of the previous piece together
		-- with any non-letters trailing it.
		local r_lastlast = result[i_last]:match"^.*(%a%A*)$" --vowel clusters or stop consonants
		if r_lastlast then
			if r_lastlast:match("[aiueo]") then
				if rc:match("^%-[yw]") or options.hist and (r_lastlast == "i" and rc:sub(1, 1) == "y" or r_lastlast == "u" and rc:sub(1, 1) == "w") then
					-- "-y…"/"-w…" pieces: drop the leading hyphen and the final
					-- byte of the previous piece; drop the "y" as well after
					-- consonants listed in "flag_postalveolarconsonant".
					if rc:sub(1, 1) == "-" then rc = rc:sub(2) end
					result[i_last] = result[i_last]:sub(1, -2)
					if rc:sub(1, 1) == "y" and d("flag_postalveolarconsonant", result[i_last]) then rc = rc:sub(2) end
				elseif rc:match"^%-[aiueo]$" then
					-- "-<vowel>" pieces: merge with the previous vowel.
					rc = rc:sub(2)
					if r_lastlast == rc then
						-- same vowel: double it in the previous piece
						result[i_last] = result[i_last] .. r_lastlast
						rc = ""
					elseif d("flag_specialconsonant", result[i_last]) then
						result[i_last] = result[i_last]:sub(1, -2)
					elseif r_lastlast == "i" then
						result[i_last] = result[i_last]:sub(1, -2) .. "y"
					elseif r_lastlast:match("[ou]") and rc ~= "u" then
						result[i_last] = result[i_last]:sub(1, -2) .. "w"
					else
						result[i_last] = result[i_last]:sub(1, -2)
					end
				elseif rc:match("^[aiueo]$") then
					-- Plain vowel after a vowel: attach it to the previous
					-- piece when the pair has a "tr_long" entry and the
					-- previous piece does not already end in two vowels.
					if not options.hist and not options.phonetic and d("tr_long", r_lastlast .. rc) and not result[i_last]:match("[aiueo][aiueo]$") then
						result[i_last] = result[i_last] .. rc
						rc = ""
					end
				end
			end
		end
		
		insert(result, rc)
		result_sp[#result] = rc_sp
	end
	
	-- A flag_hahe piece with no significant piece before it is switched to
	-- its flag value.
	if not options.hist then --isolated はへ
		local i_last = getlast()
		if d("flag_hahe", result_sp[i_last]) and getlast(i_last - 1) == 0 then
			result[i_last] = result_sp[i_last]
		end
	end
	
	-- Second pass over the finished pieces.
	local num_cap = 0 -- number of letters still waiting to be capitalized
	local has_gem = false -- true while a "gem" piece awaits its consonant
	for i, v in ipairs(result) do
		--gemination
		if has_gem then
			local apos, consonant, remainder = v:match("^(" .. c_apos .. "*)([b-df-hj-np-tv-z]+)(.*)")
			if consonant then
				-- Doubling letter: the "tr_gem" entry if there is one,
				-- otherwise the first consonant letter.
				local c_gem = d("tr_gem", apos .. consonant) or consonant:sub(1, 1)
				v = consonant .. remainder
				local i_gem = getlast(i)
				-- Walk backwards replacing each consecutive "gem" piece with
				-- c_gem; on reaching a non-gem piece, move the stripped
				-- c_apos markers to the piece found by getlast(i_gem + 1).
				while true do
					i_gem = getlast(i_gem - 1)
					if result_sp[i_gem] == "gem" then
						result[i_gem] = c_gem
					else
						i_gem = getlast(i_gem + 1)
						result[i_gem] = apos .. result[i_gem]
						break
					end
				end
				has_gem = false
			end
		elseif result_sp[i] == "gem" then
			has_gem = true
		end
		
		-- anga vs a'nga
		-- A piece starting with "ng" puts the c_apos marker after the previous
		-- significant non-gem piece, provided that piece contains a letter.
		if v:match("^ng") then
			local i_no_gem = getlast(i - 1, function(index)
				return result[index]:len() > 0 and result_sp[index] ~= "'" and result_sp[index] ~= "gem"
			end)
			if find(result[i_no_gem], "%a") then
				result[i_no_gem] = result[i_no_gem] .. c_apos
			end
		end
		
		--diacritics (long vowels and others)
		if not options.no_diacritics then
			v = v:gsub("[aiueo][aiueo%A]*", d("tr_long"))
		end
		
		--uppercase
		-- Each "cap" flag requests one capital letter; a request is consumed
		-- only by a character that actually changes when uppercased.
		if result_sp[i] == "cap" then num_cap = num_cap + 1 end
		if num_cap > 0 then
			v = v:gsub(".[\128-\191]*", function(c)
				if num_cap <= 0 then return c end
				local uc = c:uupper()
				if c ~= uc then num_cap = num_cap - 1 end
				return uc
			end)
		end
		result[i] = v
	end
	-- concat covers indices 1..#result (the sentinel at 0 is not included);
	-- the outer parentheses drop gsub's second return value (the match count).
	return (concat(result):gsub(c_apos, "'"))
end

-- Hand the public table (with `tr`) back to whoever requires this module.
return export