Bước tới nội dung

Mô đun:ViePron/sandbox

Từ điển mở Wiktionary
---Transcribes a Vietnamese word or compound word into IPA. Supports
-- pronunciations in some of the main dialects of Vietnam.
-- 
-- This module is intended as a replacement for the lumbering monstrosity
-- [[Bản mẫu:vie-pron]], which itself is a replacement for the
-- editor-unfriendly [[Bản mẫu:VieIPA]].
require "mw.ustring"
local p = {}
p.dialects = require("Module:ViePron/dialects/sandbox")

---Names of the dialects rendered by p.tableColumnHeaders and p._tableRow, in
-- column order. p._tableRow merges adjacent columns whose transcriptions are
-- identical, so the order of this list decides which dialects can share a cell.
local dialects = {
    "Hà Nội", "Hải Phòng",
	"Vinh", "Thanh Chương", "Hà Tĩnh",
	"Huế",
	"Quy Nhơn", "Sài Gòn"
}

---Lowercase Vietnamese letters that carry a diacritic: first the seven
-- toneless modified letters, then five groups of twelve toned vowels (acute,
-- grave, hook above, tilde, dot below), each group ordered
-- a â ă e ê i o ô ơ u ư y.
-- The order is significant: p._components relies on a toned "y" sitting exactly
-- six positions after the "i" that carries the same tone mark.
local accentedChars = "đâăêôơưáấắéếíóốớúứýàầằèềìòồờùừỳảẩẳẻểỉỏổởủửỷãẫẵẽễĩõỗỡũữỹạậặẹệịọộợụựỵ"

---Table mapping vowel characters to their toneless base letters.
-- Only the tone mark is removed; the vowel-quality diacritics of â, ă, ê, ô, ơ
-- and ư are kept. Consonants are absent from the table, so a lookup doubles as
-- an "is this a vowel letter?" test.
local vowelsToBases = {
	["a"] = "a",  ["á"] = "a",  ["à"] = "a",  ["ã"] = "a",  ["ả"] = "a",  ["ạ"] = "a",
	["â"] = "â",  ["ấ"] = "â",  ["ầ"] = "â",  ["ẫ"] = "â",  ["ẩ"] = "â",  ["ậ"] = "â",
	["ă"] = "ă",  ["ắ"] = "ă",  ["ằ"] = "ă",  ["ẵ"] = "ă",  ["ẳ"] = "ă",  ["ặ"] = "ă",
	["e"] = "e",  ["é"] = "e",  ["è"] = "e",  ["ẽ"] = "e",  ["ẻ"] = "e",  ["ẹ"] = "e",
	["ê"] = "ê",  ["ế"] = "ê",  ["ề"] = "ê",  ["ễ"] = "ê",  ["ể"] = "ê",  ["ệ"] = "ê",
	["i"] = "i",  ["í"] = "i",  ["ì"] = "i",  ["ĩ"] = "i",  ["ỉ"] = "i",  ["ị"] = "i",
	["o"] = "o",  ["ó"] = "o",  ["ò"] = "o",  ["õ"] = "o",  ["ỏ"] = "o",  ["ọ"] = "o",
	["ô"] = "ô",  ["ố"] = "ô",  ["ồ"] = "ô",  ["ỗ"] = "ô",  ["ổ"] = "ô",  ["ộ"] = "ô",
	["ơ"] = "ơ",  ["ớ"] = "ơ",  ["ờ"] = "ơ",  ["ỡ"] = "ơ",  ["ở"] = "ơ",  ["ợ"] = "ơ",
	["u"] = "u",  ["ú"] = "u",  ["ù"] = "u",  ["ũ"] = "u",  ["ủ"] = "u",  ["ụ"] = "u",
	["ư"] = "ư",  ["ứ"] = "ư",  ["ừ"] = "ư",  ["ữ"] = "ư",  ["ử"] = "ư",  ["ự"] = "ư",
	["y"] = "y",  ["ý"] = "y",  ["ỳ"] = "y",  ["ỹ"] = "y",  ["ỷ"] = "y",  ["ỵ"] = "y"
}

---Table mapping vowel characters to the VIQR representation of their tones.
-- Unmarked vowels map to the empty string; the acute, grave, tilde, hook-above
-- and dot-below marks map to "'", "`", "~", "?" and "." respectively.
local vowelsToVIQRTones = {
	["a"] = "",  ["á"] = "'",  ["à"] = "`",  ["ã"] = "~",  ["ả"] = "?",  ["ạ"] = ".",
	["â"] = "",  ["ấ"] = "'",  ["ầ"] = "`",  ["ẫ"] = "~",  ["ẩ"] = "?",  ["ậ"] = ".",
	["ă"] = "",  ["ắ"] = "'",  ["ằ"] = "`",  ["ẵ"] = "~",  ["ẳ"] = "?",  ["ặ"] = ".",
	["e"] = "",  ["é"] = "'",  ["è"] = "`",  ["ẽ"] = "~",  ["ẻ"] = "?",  ["ẹ"] = ".",
	["ê"] = "",  ["ế"] = "'",  ["ề"] = "`",  ["ễ"] = "~",  ["ể"] = "?",  ["ệ"] = ".",
	["i"] = "",  ["í"] = "'",  ["ì"] = "`",  ["ĩ"] = "~",  ["ỉ"] = "?",  ["ị"] = ".",
	["o"] = "",  ["ó"] = "'",  ["ò"] = "`",  ["õ"] = "~",  ["ỏ"] = "?",  ["ọ"] = ".",
	["ô"] = "",  ["ố"] = "'",  ["ồ"] = "`",  ["ỗ"] = "~",  ["ổ"] = "?",  ["ộ"] = ".",
	["ơ"] = "",  ["ớ"] = "'",  ["ờ"] = "`",  ["ỡ"] = "~",  ["ở"] = "?",  ["ợ"] = ".",
	["u"] = "",  ["ú"] = "'",  ["ù"] = "`",  ["ũ"] = "~",  ["ủ"] = "?",  ["ụ"] = ".",
	["ư"] = "",  ["ứ"] = "'",  ["ừ"] = "`",  ["ữ"] = "~",  ["ử"] = "?",  ["ự"] = ".",
	["y"] = "",  ["ý"] = "'",  ["ỳ"] = "`",  ["ỹ"] = "~",  ["ỷ"] = "?",  ["ỵ"] = "."
}

---Receives a word and returns a copy of the word without tone marks.
-- Every letter that has an entry in vowelsToBases is replaced by its toneless
-- base letter; all other characters are left untouched.
-- @param word The text to strip tone marks from.
-- @return The detoned text, as a single value.
function p.detone(word)
	-- gsub also returns the number of substitutions. The extra parentheses
	-- discard it so that it cannot leak into a caller's argument list or
	-- return values.
	return (mw.ustring.gsub(word, "%a", vowelsToBases))
end

---Returns the VIQR representation of the given glide-vowel-glide sequence’s tone.
-- The letters are scanned from left to right and the first tone mark found
-- wins.
-- @param gvg The glide-vowel-glide sequence (any text is accepted).
-- @return One of "'", "`", "~", "?" or ".", or "" when no letter is marked
-- (the ngang tone).
function p.viqrTone(gvg)
	for letter in mw.ustring.gmatch(gvg, "%a") do
		local mark = vowelsToVIQRTones[letter]
		-- Unmarked vowels map to "", and consonants have no entry at all;
		-- keep scanning in both cases.
		if mark and #mark > 0 then
			return mark
		end
	end
	return ""
end

---Returns a breakdown of the given word as "initial-interior-final".
-- The word is read from the "word" argument or, failing that, from the first
-- positional argument.
-- @usage {{#gọi:ViePron|components|tiếng}}
function p.components(frame)
	local word = frame.args.word or frame.args[1]
	local parts = p._components(word)
	return mw.ustring.format("%s-%s-%s", parts.ci, parts.gvg, parts.cf)
end
---Splits a single syllable into its orthographic components.
-- @param word One syllable. The consonant patterns below only contain
-- lowercase letters, so the caller is expected to lowercase the text first.
-- @return A table with the fields:
--   ci  – initial consonant cluster (possibly normalised, e.g. "q" → "qu"),
--   gvg – interior glide-vowel-glide sequence,
--   cf  – final consonant cluster,
--   t   – VIQR tone mark of gvg ("" for the ngang tone).
-- Raises an error when the three parts do not add up to the whole word.
function p._components(word)
	-- Initial and final consonant clusters
	-- [[Bản mẫu:vie-pron/VieC]]
	local ci, cf = mw.ustring.match(word, "^([bcdđfghjklmnpqrstvwxz]*).-([cghmnpt]*)$")
	-- Toned "i" that serves as both part of "gi" and the vowel (e.g. “gì”).
	local giv
	if ci == "g" then
		local c2 = mw.ustring.sub(word, 2, 2)
		if c2 == "i" then
			ci = "gi"
		elseif vowelsToBases[c2] == "i" then
			ci = "gi"
			giv = c2
		elseif vowelsToBases[c2] == "y" then
			-- e.g. [[giặt gỵa]]: respell “gy…” as “di…”, keeping the tone.
			ci = "d"
			local iForm
			if c2 == "y" then
				-- A toneless "y" is not listed in accentedChars, so the
				-- index arithmetic below would fail on it.
				iForm = "i"
			else
				-- Within each tone group of accentedChars the "i" form
				-- sits six positions before the "y" form.
				local idx = mw.ustring.find(accentedChars, c2, 1, true) - 6
				iForm = mw.ustring.sub(accentedChars, idx, idx)
			end
			word = ci .. iForm .. mw.ustring.sub(word, 3)
		end
	elseif ci == "q" then
		ci = "qu"
	end

	-- Interior glide-vowel-glide sequence
	-- [[Bản mẫu:vie-pron/VieV]]
	local interior = mw.ustring.sub(word, mw.ustring.len(ci) + 1,
		mw.ustring.len(word) - mw.ustring.len(cf))
	local gvg = {}
	for letter in mw.ustring.gmatch(interior, "%a") do
		-- Stop at the first letter that is not a vowel.
		if vowelsToBases[letter] then table.insert(gvg, letter) else break end
	end
	gvg = table.concat(gvg)

	-- TODO: Support polysyllabic words.
	assert(mw.ustring.len(ci .. gvg .. cf) == mw.ustring.len(word),
		"Từ này không tuân theo quy tắc chính tả tiếng Việt, hoặc là từ đa âm tiết được viết như một từ. " ..
		"Nếu là từ ngoại ngữ, xin hãy định rõ cách phiên âm vào tham số của bản mẫu vie-pron, " ..
		"và phân tách các âm tiết bằng dấu gạch ngang (-) hoặc khoảng cách. (“" ..
		ci .. "”+“" .. gvg .. "”+“" .. cf .. "”≠“" .. word .. "”)")

	-- Words in which “gi-” is short for “*gii-”.
	if ci == "gi" and giv and #gvg < 1 then gvg = giv end

	-- “y-”: a syllable-initial "y" is treated as "i".
	if #ci < 1 and mw.ustring.sub(gvg, 1, 1) == "y" then
		gvg = "i" .. mw.ustring.sub(gvg, 2)
	end

	-- Semisyllables, like in “H'Mông”
	if #ci > 0 and #gvg < 1 and #cf < 1 then gvg = "ờ" end

	-- Tone
	local t = p.viqrTone(gvg)

	return {ci = ci, gvg = gvg, cf = cf, t = t}
end

---Returns the IPA transcription of the given initial consonant cluster.
-- Arguments may be passed by name (ci, gvg, cf, dialect) or by position, in
-- that order.
-- @usage {{#gọi:ViePron|ciToIPA|t|iế|ng|Hà Tĩnh}}
function p.ciToIPA(frame)
	local args = frame.args
	return p._ciToIPA(
		args.ci or args[1],
		args.gvg or args[2],
		args.cf or args[3],
		args.dialect or args[4])
end
---Looks up the IPA transcription of an initial consonant cluster.
-- @param ci Initial consonant cluster.
-- @param gvg Interior glide-vowel-glide sequence; passed, detoned, to
-- dialect entries that are functions.
-- @param cf Final consonant cluster; also passed to such functions.
-- @param dialect Key into p.dialects.
-- @return The transcription, or "" when the dialect has no entry.
function p._ciToIPA(ci, gvg, cf, dialect)
	local initials = p.dialects[dialect].initialConsonantsToIPA
	local result = initials[ci]
	-- Loanwords from some minority languages retain double consonants.
	if not result then
		-- NOTE(review): sub(ci, 1) is the whole cluster again, so this repeats
		-- the lookup above; kept as written.
		result = initials[mw.ustring.sub(ci, 1)]
	end
	if not result then
		-- Retry with the cluster minus its first letter.
		result = initials[mw.ustring.sub(ci, 2)]
	end
	if type(result) == "function" then
		result = result(p.detone(gvg), cf)
	end
	return result or ""
end

---Returns the IPA transcription of the given glide-vowel-glide sequence.
-- Arguments may be passed by name (ci, gvg, cf, t, dialect) or by position,
-- in that order; t is the VIQR tone mark.
-- @usage {{#gọi:ViePron|gvgToIPA|t|iế|ng|'|Hà Tĩnh}}
function p.gvgToIPA(frame)
	local args = frame.args
	return p._gvgToIPA(
		args.ci or args[1],
		args.gvg or args[2],
		args.cf or args[3],
		args.t or args[4],
		args.dialect or args[5])
end
---Transcribes a glide-vowel-glide sequence into IPA for one dialect.
-- The dialect data may contain "_" placeholders; this function uses them as
-- anchors for the glottal stop and the phonation diacritics, then strips them.
-- @param ci Initial consonant cluster, passed to dialect entries that are
-- functions.
-- @param gvg Glide-vowel-glide sequence; looked up as is, then detoned.
-- @param cf Final consonant cluster, also passed to such functions.
-- @param t VIQR tone mark selecting the dialect's tone attributes.
-- @param dialect Key into p.dialects.
-- @return The transcription as a single string ("" when nothing matches).
function p._gvgToIPA(ci, gvg, cf, t, dialect)
	local gvgData = p.dialects[dialect].interiorToIPA
	local toneAttributes = p.dialects[dialect].toneAttributes[t] or {}
	local ipa = gvgData[gvg] or gvgData[p.detone(gvg)] or ""
	if type(ipa) == "function" then ipa = ipa(ci, cf) or "" end

	-- Insert glottal stop.
	if toneAttributes.glottal then
		if toneAttributes.repeated then
			-- The vowel is repeated after the stop; a length mark, if any,
			-- is dropped.
			ipa = mw.ustring.gsub(ipa, "(%a)_ː?", "%1_ʔ%1_")
		else
			-- The stop follows the vowel and its optional length mark.
			ipa = mw.ustring.gsub(ipa, "(%a)_(ː?)", "%1_%2ʔ")
		end
	end

	-- Insert breathy-voice diacritic.
	if toneAttributes.breathy then ipa = mw.ustring.gsub(ipa, "_", "\204\164_") -- U+0324
	-- Or insert creaky-voice diacritic.
	elseif toneAttributes.creaky then ipa = mw.ustring.gsub(ipa, "_", "\204\176_") -- U+0330
	end

	-- Strip the placeholders. gsub also returns the substitution count; the
	-- parentheses discard it, because Scribunto concatenates every value an
	-- invoked function returns and would otherwise append the count to the
	-- transcription when this is reached through p.gvgToIPA.
	return (mw.ustring.gsub(ipa, "_", ""))
end

---Returns the IPA transcription of the given final consonant cluster.
-- Arguments may be passed by name (ci, gvg, cf, dialect) or by position, in
-- that order.
-- @usage {{#gọi:ViePron|cfToIPA|t|iế|ng|Quy Nhơn}}
function p.cfToIPA(frame)
	local args = frame.args
	return p._cfToIPA(
		args.ci or args[1],
		args.gvg or args[2],
		args.cf or args[3],
		args.dialect or args[4])
end
---Looks up the IPA transcription of a final consonant cluster.
-- @param ci Initial consonant cluster, passed to dialect entries that are
-- functions.
-- @param gvg Glide-vowel-glide sequence; passed, detoned, to such functions.
-- @param cf Final consonant cluster to transcribe.
-- @param dialect Key into p.dialects.
-- @return The transcription, or "" when the dialect has no entry.
function p._cfToIPA(ci, gvg, cf, dialect)
	local finals = p.dialects[dialect].finalConsonantsToIPA
	local result = finals[cf]
	if type(result) == "function" then
		result = result(ci, p.detone(gvg))
	end
	return result or ""
end

---Returns the IPA tone letters for the given word.
-- The word and dialect may be passed by name (word, dialect) or by position.
-- [[Bản mẫu:vie-pron/VieTn]] and [[Bản mẫu:vie-pron/VieT]]
-- @usage {{#invoke:ViePron|viqrToneToIPA|tiếng|Sài Gòn}}
function p.viqrToneToIPA(frame)
	local args = frame.args
	return p._viqrToneToIPA(args.word or args[1], args.dialect or args[2])
end
---Maps the tone of a word to the IPA tone letters of one dialect.
-- @param word The word whose tone mark is looked up.
-- @param dialect Key into p.dialects.
-- @return The dialect's viqrTonesToIPA entry for the word's VIQR tone mark.
function p._viqrToneToIPA(word, dialect)
	local mark = p.viqrTone(word)
	if mark then
		return p.dialects[dialect].viqrTonesToIPA[mark]
	end
end

---Returns the IPA transcription of the given Vietnamese text.
-- The text comes from "text" or the first positional argument (default ""),
-- the dialect from "dialect" or the second positional argument (default
-- "Hà Nội"). A "css" argument wraps the tone letters in a styled span.
-- @usage {{#invoke:ViePron|ipa|tiếng Việt}}
function p.ipa(frame)
	local args = frame.args
	local text = args.text or args[1] or ""
	local dialect = args.dialect or args[2] or "Hà Nội"
	return p._ipa(text, dialect, args.css)
end
---Transcribes every word of a text into IPA for one dialect.
-- The text is lowercased and split into runs of Vietnamese letters; anything
-- else acts as a separator.
-- @param text The text to transcribe.
-- @param dialect Key into p.dialects.
-- @param css When truthy, tone letters are wrapped in
-- <span class='IPA-tone'>…</span>.
-- @return The transcriptions of all words, joined by single spaces.
function p._ipa(text, dialect, css)
	local wordPattern = "([a-z" .. accentedChars .. "]+)"
	local transcriptions = {}
	for syllable in mw.ustring.gmatch(mw.ustring.lower(text), wordPattern) do
		local parts = p._components(syllable)
		local toneLetters = p._viqrToneToIPA(syllable, dialect)
		if css then
			toneLetters = "<span class='IPA-tone'>" .. toneLetters .. "</span>"
		end
		local onset = p._ciToIPA(parts.ci, parts.gvg, parts.cf, dialect)
		local nucleus = p._gvgToIPA(parts.ci, parts.gvg, parts.cf, parts.t, dialect)
		local coda = p._cfToIPA(parts.ci, parts.gvg, parts.cf, dialect)
		transcriptions[#transcriptions + 1] = onset .. nucleus .. coda .. toneLetters
	end
	return table.concat(transcriptions, " ")
end

---Returns [[Bản mẫu:vie-pron/Bảng]] prefilled with IPA transcriptions of the
-- given word in several dialects.
-- The words are taken from the first source that provides any:
--   1. a non-empty "word" argument of #invoke:,
--   2. the positional arguments of #invoke:,
--   3. the positional arguments of the calling template,
--   4. the title of the current page.
-- When source 2 or 3 is consulted, its arguments named HN, H, SG, V, TC or HT
-- replace the generated transcription of the corresponding dialect.
-- @usage {{#invoke:ViePron|standaloneTable}}
function p.standaloneTable(frame)
	local words = {}
	local overrides = {}

	-- Appends the positional arguments to `words` and records every argument
	-- as a potential override.
	local function collect(args)
		for _, arg in ipairs(args) do
			table.insert(words, arg)
		end
		for k, v in pairs(args) do
			overrides[k] = v
		end
	end

	-- Get any words passed in as arguments to #invoke:.
	local explicit = frame.args.word
	if explicit and #explicit > 0 then
		words[1] = explicit
	else
		-- An empty "word" is treated as absent and is not kept in the list,
		-- so that the fallbacks below still apply.
		collect(frame.args)
	end

	-- Get any words passed in as arguments to [[Bản mẫu:vie-pron]].
	if #words < 1 then
		-- getParent() returns nil when there is no calling template.
		local template = frame:getParent()
		if template then collect(template.args) end
	end

	-- Fall back on the page name.
	if #words < 1 then words = {mw.title.getCurrentTitle().text} end

	-- Fill in the display template.
	words = table.concat(words, " ")
	return frame:expandTemplate{
		title = "vie-pron/Bảng",
		args = {
			words,
			HN = overrides.HN or p._ipa(words, "Hà Nội", true),
			H = overrides.H or p._ipa(words, "Huế", true),
			SG = overrides.SG or p._ipa(words, "Sài Gòn", true),
			V = overrides.V or p._ipa(words, "Vinh", true),
			TC = overrides.TC or p._ipa(words, "Thanh Chương", true),
			HT = overrides.HT or p._ipa(words, "Hà Tĩnh", true),
		},
	}
end

---Returns an HTML table row with one header cell for each supported dialect,
-- preceded by a header cell for the word column.
-- @usage {{#invoke:ViePron|tableColumnHeaders}}
function p.tableColumnHeaders()
	local cells = {"<th>Từ</th>"}
	for _, name in ipairs(dialects) do
		cells[#cells + 1] = "<th>" .. name .. "</th>"
	end
	return "<tr>" .. table.concat(cells) .. "</tr>"
end

---Returns an HTML table row of IPA transcriptions of the given word in all the
-- supported dialects. Adjacent, identical table cells are combined.
-- The word is read from the "word" argument or the first positional argument.
-- @usage {{#invoke:ViePron|tableRow|tiếng Việt}}
function p.tableRow(frame)
	local word = frame.args.word or frame.args[1]
	return p._tableRow(word)
end
---Builds an HTML table row with the transcriptions of a word in every dialect.
-- @param word The text to transcribe; also used verbatim as the row header.
-- @return A <tr> element holding one <th scope='row'> and one <td> per run of
-- adjacent dialects that share a transcription (with a colspan when the run
-- is longer than one).
function p._tableRow(word)
	local cells, spans = {}, {}
	for index, dialect in ipairs(dialects) do
		cells[index] = p._ipa(word, dialect, true)
		spans[index] = 1
	end

	-- Fold each cell into its left neighbour when both are equal. Walking
	-- from the right keeps the remaining indices valid after a removal; the
	-- first cell has no left neighbour, so the walk stops at 2.
	for i = #cells, 2, -1 do
		local left = i - 1
		if cells[i] == cells[left] then
			spans[left] = spans[left] + spans[i]
			table.remove(cells, i)
			table.remove(spans, i)
		end
	end

	for i, ipa in ipairs(cells) do
		local attribute = spans[i] > 1 and (" colspan='" .. spans[i] .. "'") or ""
		cells[i] = "<td" .. attribute .. ">" .. ipa .. "</td>"
	end
	return "<tr><th scope='row'>" .. word .. "</th>" .. table.concat(cells) .. "</tr>"
end

return p