-- Mô đun:sa-utilities

local export = {}

-- Shared Lua pattern fragments. The *_list strings hold the bare characters;
-- the other fields are ready-to-use character classes built from them.
export.consonant_list = "kKgGNcCjJYwWqQRtTdDnpPbBmyrlLvSzsh"
export.vowel_list = "aAiIuUfFxXeEoO"
export.accent = "[/\\]"											-- one accent mark: "/" or "\"
export.consonant = string.format("[%s]", export.consonant_list)	-- any single consonant
export.vowel = string.format("[%s]", export.vowel_list)			-- any single vowel
export.vowel_with_accent = export.vowel .. export.accent .. "?"	-- a vowel plus an optional accent mark

-- Short local aliases for the Unicode-aware string functions in mw.ustring:
local ustr = mw.ustring
local U, match, gsub = ustr.char, ustr.match, ustr.gsub
local sub, lower, upper = ustr.sub, ustr.lower, ustr.upper

--[=[Detects whether a specified text ends in a given pattern.
	Parameters:
		text (String): the text to be tested
		pattern (String): the query pattern to be tested on the end of the text.
			It is used as a pattern (not a literal) and is anchored with "$".
	Return:
		Boolean: true if the pattern matches at the end of the text, otherwise false
]=]
local function ends_with(text, pattern)
	-- match() yields the matched text (or captures) or nil; reduce that to the
	-- documented Boolean so an empty-string match cannot be mistaken for a value.
	return match(text, pattern .. "$") ~= nil
end

--[=[Detects whether a specified text begins in a given pattern.
	Parameters:
		text (String): the text to be tested
		pattern (String): the query pattern to be tested on the beginning of the text.
			It is used as a pattern (not a literal) and is anchored with "^".
	Return:
		Boolean: true if the pattern matches at the start of the text, otherwise false
]=]
local function starts_with(text, pattern)
	-- match() yields the matched text (or captures) or nil; reduce that to the
	-- documented Boolean so an empty-string match cannot be mistaken for a value.
	return match(text, "^" .. pattern) ~= nil
end

-- Common transformation types:
--[=[ Raise a vowel by one grade (guṇation). Keys are a vowel optionally
		followed by an accent mark ("/" or "\"); the accent mark is kept in the
		result. Syllabic ṛ/ṝ become a + r with the accent mark placed on the a.
		It is possible that this should include provisions for guṇation of
		sonorants into CV configurations (e.g. i/ī -> ya -> yā, etc.).
		Perhaps will need to be updated.
]=]
do
	local raised = {
		a = 'A', A = 'A',
		i = 'e', I = 'e',
		u = 'o', U = 'o',
		e = 'E', E = 'E',
		o = 'O', O = 'O',
	}
	local grades = {}
	for _, accent in ipairs({ '', '/', '\\' }) do
		for vowel, grade in pairs(raised) do
			grades[vowel .. accent] = grade .. accent
		end
		-- f and F (ṛ, ṝ) both give "ar"; the accent mark goes between a and r
		grades['f' .. accent] = 'a' .. accent .. 'r'
		grades['F' .. accent] = 'a' .. accent .. 'r'
	end
	export.up_one_grade = grades
end

-- Shorten a vowel: long A/I/U/F map to short a/i/u/f, short vowels map to
-- themselves. A trailing accent mark ("/" or "\") is carried over unchanged.
do
	local long_of = { a = 'A', i = 'I', u = 'U', f = 'F' }
	local shortened = {}
	for short, long in pairs(long_of) do
		for _, accent in ipairs({ '', '/', '\\' }) do
			shortened[short .. accent] = short .. accent
			shortened[long .. accent] = short .. accent
		end
	end
	export.shorten = shortened
end

-- Lengthen a vowel: short a/i/u/f map to long A/I/U/F, long vowels map to
-- themselves. A trailing accent mark ("/" or "\") is carried over unchanged.
do
	local long_of = { a = 'A', i = 'I', u = 'U', f = 'F' }
	local lengthened = {}
	for short, long in pairs(long_of) do
		for _, accent in ipairs({ '', '/', '\\' }) do
			lengthened[short .. accent] = long .. accent
			lengthened[long .. accent] = long .. accent
		end
	end
	export.lengthen = lengthened
end

-- Convert a monosegmental (or at least monoliteral) diphthong into a/ā + glide.
-- An accent mark on the diphthong ends up on the a/ā, i.e. before the glide.
do
	local parts = {
		e = { 'a', 'y' }, E = { 'A', 'y' },
		o = { 'a', 'v' }, O = { 'A', 'v' },
	}
	local split = {}
	for diphthong, pair in pairs(parts) do
		for _, accent in ipairs({ '', '/', '\\' }) do
			split[diphthong .. accent] = pair[1] .. accent .. pair[2]
		end
	end
	export.split_diphthong = split
end

--[=[Convert a syllabic sonorant (short or long) to its associated consonantal
		form. This map is presently misnamed since the input is syllabic (not
		semivocalic) and since liquids are not semivowels at all.
]=]
export.semivowel_to_cons = {
	-- high vowels > glides
	i = 'y', I = 'y',
	u = 'v', U = 'v',
	-- syllabic liquids > liquids
	f = 'r', F = 'r',
	x = 'l', X = 'l',
}

-- Add a homorganic glide to a high vowel: i/I > iy, u/U > uv (a long vowel
-- comes out short). An accent mark stays on the vowel, before the glide.
local insert_glide = {}
for vowel, glide in pairs({ i = 'y', u = 'v' }) do
	for _, accent in ipairs({ '', '/', '\\' }) do
		local with_glide = vowel .. accent .. glide
		insert_glide[vowel .. accent] = with_glide
		insert_glide[string.upper(vowel) .. accent] = with_glide
	end
end

--[=[Convert all unambiguous stops to their absolute final value: every stop
	of a series collapses to the plain voiceless stop of that series, and
	palatal Y (ñ) becomes velar N (ṅ). The identity entries (e.g. k = k) may be
	redundant given the implementation of absolute_final and internal_sandhi
	below.
]=]
local to_final = {}
for final, sources in pairs({ k = 'kKgG', w = 'wWqQ', t = 'tTdD', p = 'pPbB', N = 'Y' }) do
	-- every character listed in `sources` maps to `final`
	for source in string.gmatch(sources, '.') do
		to_final[source] = final
	end
end

-- Convert a dental to the corresponding retroflex: t > ṭ, th > ṭh, d > ḍ,
-- dh > ḍh, n > ṇ. Any character without an entry is left unchanged when this
-- table is used as a gsub replacement.
local dental_to_retroflex = {
	['t'] = 'w', ['T'] = 'W', ['d'] = 'q', ['D'] = 'Q', ['n'] = 'R',
}

-- Remove aspiration: each aspirated stop (written as a capital letter) maps to
-- its plain counterpart (the same letter in lower case); h maps to g.
local deaspirate = { h = 'g' }
for aspirate in string.gmatch('KGCJWQTDPB', '.') do
	deaspirate[aspirate] = string.lower(aspirate)
end

--[=[Detects whether a word is monosyllabic, i.e. consists of any number of
		consonants, exactly one vowel with an optional accent mark, and any
		number of consonants. This function does not apply sandhi to
		determine whether a potential phonemic form like /dā́rv/ would be
		syllabified to [dā́ru]. This might need to be changed.
	Parameters:
		text (String): the text to be checked for monosyllabicity
	Return:
		the matched text (truthy) if the word is monosyllabic, otherwise nil
]=]
function export.is_monosyllabic(text)
	local margin = export.consonant .. "*"				-- onset or coda
	local nucleus = export.vowel .. export.accent .. "?"	-- the single vowel
	return match(text, "^" .. margin .. nucleus .. margin .. "$")
end

--[=[Transforms a word to its absolute final sandhi form.
	Parameters:
		text (String): the text to be converted to final sandhi position
		ambig_hint (String): the replacement for a final palatal, as final
			palatals will unpredictably produce either a retroflex or a velar
			stop in final position (e.g. spáś- > spáṭ 'spy' vs. náś- > nák 'night').
			Required for palatal-final strings.
	Return:
		String
]=]
local function absolute_final(text, ambig_hint)
	local cons = export.consonant

	-- A final cluster of two or more consonants keeps only its first member.
	if ends_with(text, cons .. cons) then
		text = gsub(text, "(" .. cons .. "+)$", function(cluster)
			return sub(cluster, 1, 1)
		end)
	end

	-- Already a permitted final (optionally followed by an accent mark): keep.
	-- ḷ, v, and y are not handled as they should not appear finally. Perhaps wrong.
	if ends_with(text, "[kwtpNRnmlaAiIuUeEoOfFxXH][/\\]?") then
		return text
	end

	-- Work out what the last character has to be replaced with.
	local replacement
	if ends_with(text, "M") then				-- just in case, ṃ > m
		replacement = "m"
	elseif ends_with(text, "[sr]") then			-- s, r > final visarga
		replacement = "H"
	elseif ends_with(text, "[KgGWqQTdDPbBY]") then	-- final stops (table lookup)
		replacement = to_final
	elseif ends_with(text, "[cCjJhSz]") then		-- final palatals: caller decides
		replacement = ambig_hint
	else
		return text
	end
	-- Parentheses drop gsub's second result (the substitution count).
	return (gsub(text, ".$", replacement))
end

--[=[Applies retroflexion to a stem and an ending without joining them.
		This includes RUKI, ṣ-cluster harmony, and nasal retroflexion.
	Parameters:
		stem (String): the stem to receive an ending
		ending (String): the ending to be affixed
	Return:
		String (stem), String (ending)
]=]
function export.retroflexion(stem, ending)
	local ruki = "[iIeEfFxuUoOrk][/\\]?[HM]?"		-- RUKI environment at the end of the stem
	local transparent = "[^cCjJYwWqQRtTdDnSsl]*"	-- run of segments that do not block nasal harmony
	local trigger = "[zrfF]" .. transparent			-- nasal harmony trigger plus an unblocked run
	local licenser = "[aAiIeEfFxuUoOynmv]"			-- segment that must follow a retroflexable n

	-- Map every character of a run through dental_to_retroflex.
	local function retroflex_run(run)
		return gsub(run, ".", dental_to_retroflex)
	end
	-- Rebuild "<pre>n<post>" with the n made retroflex.
	local function retroflex_n(pre, post)
		return pre .. "R" .. post
	end

	-- RUKI: the stem ends in a RUKI environment.
	if ends_with(stem, ruki) then
		-- Ending-initial s > ṣ unless followed by [rṛṝ] ...
		ending = gsub(ending, "^s([^rfF])", "z%1")
		-- ... and dentals directly after that ṣ follow suit.
		ending = gsub(ending, "^z[tTdDn]*", retroflex_run)
	end

	-- RUKI environment + stem-final s, and the ending does not start with [rṛṝ]:
	-- stem-final s > ṣ.
	if ends_with(stem, ruki .. "s") and starts_with(ending, "[^rfF]") then
		stem = gsub(stem, "s$", "z")
	end

	-- Stem-final ṣ makes an ending-initial dental (cluster) retroflex.
	if ends_with(stem, "z") then
		ending = gsub(ending, "^[tTdDn]*", retroflex_run)
	end

	-- Nasal harmony triggered from within the stem, with no blocker in between:
	-- the first retroflexable n of the ending > ṇ.
	if ends_with(stem, trigger) then
		ending = gsub(ending, "^(" .. transparent .. ")n(" .. licenser .. ")", retroflex_n)
	end

	-- Same trigger, but the n is stem-final and licensed by the start of the ending.
	if ends_with(stem, trigger .. "n") and starts_with(ending, licenser) then
		stem = gsub(stem, "n$", "R")
	end

	-- For safety: triggers inside the ending itself that precede an unblocked,
	-- not yet retroflexed n.
	ending = gsub(ending, "(" .. trigger .. ")n(" .. licenser .. ")", retroflex_n)

	return stem, ending
end

--[=[Combine a stem and ending while modifying the accentuation. This does not
		currently account for mobility of accent between the stem and ending.
	Parameters:
		stem (String): the stem to receive an ending
		ending (String): the ending to be affixed
		has_accent (Boolean): whether the word has an accent to be modified at all;
			if false, stem and ending are simply concatenated
		accent_override (Boolean): whether to strip the stem of accent
		mono (Boolean): whether the stem is monosyllabic AND susceptible to
			accentual mobility (cf. pā́dam ~ padā́ vs. gā́m ~ gávā); the stem
			accent is stripped only if the ending carries an accent. This should
			perhaps be renamed or removed as redundant to accent_override in this
			function. This should not be applied to *all* monosyllabic nouns!
		recessive (Boolean): whether the accent must be moved to the leftmost
			vowel (e.g. in the vocative); takes precedence over the other options
	Return:
		String
]=]
local function combine_accent(stem, ending, has_accent, accent_override, mono, recessive)
	-- Nothing to adjust for unaccented words.
	if not has_accent then
		return stem .. ending
	end

	local accent = export.accent

	if recessive then
		-- Drop every accent mark, then put "/" on the first vowel of the word.
		local word = gsub(stem .. ending, accent, "")
		word = gsub(word, "^([^" .. export.vowel_list .. "]-)(" .. export.vowel .. ")", "%1%2/")
		return word
	end

	if accent_override or (mono and match(ending, accent)) then
		-- The accent belongs to the ending: remove all accents from the stem.
		stem = gsub(stem, accent, "")
	elseif match(stem, accent) and match(ending, accent) then
		-- Both parts are accented: the stem wins. This may be too simple.
		ending = gsub(ending, accent, "")
	end

	return stem .. ending
end

--[=[Return a word-stem combined with a given ending while handling internal
		sandhi and accentuation. The steps are: (1) adjust the stem-final and
		ending-initial segments to each other, (2) apply retroflexion,
		(3) join the two parts and settle the accent, (4) unless non_final is
		set, put the result in absolute final form. An empty ending skips
		straight to the absolute final form of the stem.
	Parameters:
		input_table (Table): This table is expected to contain the items:
			stem (String): the stem to receive an ending
			ending (String): the ending to be affixed
			has_accent (Boolean): whether the word has an accent to be modified
			mono (Boolean): whether the stem is monosyllabic OR behaves like a monosyllable (e.g. root noun compounds)
			accent_override (Boolean): whether to strip the stem of accent since
				some non-monosyllabic forms (e.g. present participles and gen.pl.)
				may show stem-to-ending accentual mobility.
			recessive (Boolean): whether the accent must be moved to the leftmost vowel (e.g. in the vocative)
			j_to_z (Boolean): whether to convert j > ṣ before {t, th} instead of j > k
			h_to_g (Boolean): whether to convert h > g before {t, th, d, dh} (duh + -tá = dugdhá) instead of h + t > ḍh (lih + -tá = līḍhá)
			non_final (Boolean): if true, will not apply word-final sandhi (i.e. convert s to visarga)
			ignore_s_allophones (Boolean): whether to ignore r and s allophony in the stem before the ending
			ambig_hint (String): the replacement for a final palatal; used when
				the word-final form has to be produced (see `final` and the
				last step) and required whenever that form ends in a palatal
			final (Boolean): if true, a consonant-final stem is first reduced to
				its word-final form before a consonant-initial ending
	Return:
		String
]=]
function export.internal_sandhi(input_table)
	local stem, ending = input_table.stem, input_table.ending
	local last		-- last segment (a vowel) of the stem
	local acc		-- the accent mark on that vowel ("" if none)
	local first		-- the first segment of the ending
	local combined	-- the combined form
	-- explicitly ignored are CV, C + semivowel, or C + nasal
	if ending == "" then
		return absolute_final(stem, input_table.ambig_hint)
	-- all cases of ending starts with vowel
	elseif starts_with(ending, export.vowel) then
		if ends_with(stem, export.vowel_with_accent) then -- stem ends with vowel
			-- strip last vowel and accent off stem
			stem, last, acc = match(stem, "^(.*)(" .. export.vowel .. ")(" .. export.accent .. "?)$")
			-- strip first vowel off ending
			first, ending = match(ending, "^(.)(.*)$")
			if match(last, '[iIuU]') and input_table.mono then	-- monosyllabic semivowel-final stems (this applies to root noun compounds as well)
				stem = stem .. insert_glide[last .. acc]
				ending = first .. ending
			elseif lower(last) == lower(first) then				-- homorganic vowels (FIXME: what happens if e- + -e, o- + -o, E- + -E, O- + -O?)
				ending = upper(first) .. acc .. ending
			elseif lower(last) == "a" then						-- gunation and vrddhization after stem-final a.
				ending = export.up_one_grade[first .. acc] .. ending
			elseif export.semivowel_to_cons[last] then			-- Can the stem-final vowel be made consonantal?
				stem = stem .. export.semivowel_to_cons[last]
				ending = first .. (acc == "/" and "\\" or "") .. ending		-- Convert stem-final udātta to ending-initial svarita
			elseif export.split_diphthong[last] then			-- Can a stem-final guna and vrddhi be split into V + sonorant?
				stem = stem .. export.split_diphthong[last .. acc]
				ending = first .. ending
			end
		end
		-- all consonant-final stems left unchanged
	-- {i, u} > {ī, ū} /__[rv]C if stem is monosyllabic (is monosyllabic constraint necessary)
	elseif ends_with(stem, "[iu][/\\]?[rv]") and input_table.mono then
		-- lengthen high vowel before [rv].
		stem = gsub(stem, "([iu][/\\]?)([rv])$", function(vow, sonorant) return export.lengthen[vow] .. sonorant end)
	-- n > ñ /{c, j}__ (ending-initial n after a stem-final palatal)
	elseif ends_with(stem, "[cj]") and starts_with(ending, "n") then
		ending = gsub(ending, "^.", "Y")
	-- ś > k /__s (RUKI applied later)
	elseif ends_with(stem, "S") and starts_with(ending, "s") then
		stem = gsub(stem, ".$", "k")
	-- s > t /__s if stem is monosyllabic
	elseif ends_with(stem, "s") and starts_with(ending, "s") and input_table.mono then
		stem = gsub(stem, ".$", "t")
	-- s > d /__bh if stem is monosyllabic (FIXME: probably only be in (some) nominals)
	elseif ends_with(stem, "s") and starts_with(ending, "B") and input_table.mono then
		stem = gsub(stem, ".$", "d")
	-- j > ṣ /__{t, th} in certain words (FIXME: does this get handled correctly if j_to_z is false?)
	elseif ends_with(stem, "j") and starts_with(ending, "[tT]") and input_table.j_to_z then
		stem = gsub(stem, ".$", "z")
	-- {a, i, u}h + {t, th, d, dh} > {ā, ī, ū}ḍh (Bartholomae's Law with retroflection and compensatory lengthening)
	elseif ends_with(stem, "h") and starts_with(ending, "[tTdD]") then
		if input_table.h_to_g then
			stem = gsub(stem, ".$", "g")
			ending = gsub(ending, "^.", "D")
		else
			-- Drop the h; a short a/i/u before it is lengthened, the accent mark is kept.
			stem = gsub(stem, "([aiu]?)([/\\]?)h$", function(vow, accent) return (export.lengthen[vow] or "") .. accent end)
			-- Only the ending-initial dental becomes ḍh; later dentals in the
			-- ending must stay untouched, hence the "^" anchor.
			ending = gsub(ending, "^[tTdD]", "Q")
		end
	elseif ends_with(stem, "h") and starts_with(ending, "s") then
		stem = gsub(stem, ".$", "k")
		ending = gsub(ending, "^.", "z")
	-- Bartholomae's Law
	elseif ends_with(stem, "[GJQDBh]") and starts_with(ending, "[tT]") then
		stem = gsub(stem, ".$", deaspirate)
		ending = gsub(ending, "^.", "D")
	-- All other C- + -C interactions
	elseif ends_with(stem, export.consonant) and starts_with(ending, export.consonant) then
		if input_table.final then -- (FIXME: I don't recall what this parameter is for. When would this be called?)
			if ends_with(stem, export.consonant .. export.consonant) then -- at least 2 consonants
				-- take the first of the cluster
				stem = gsub(stem, "(" .. export.consonant .. "+)$",
					function(cluster) return sub(cluster, 1, 1) end)
			end
			if ends_with(stem, "[KgGWqQTdDPbB]") then
				stem = gsub(stem, ".$", to_final)
			elseif ends_with(stem, "[cCjJhSz]") then
				stem = gsub(stem, ".$", input_table.ambig_hint)
			end
		end

		-- palatal stop > k (ñ + palatal > ṅk) before {t, th, d, dh, s}
		if ends_with(stem, "[cCjJ]") and starts_with(ending, "[tTdDs]") then
			if ends_with(stem, "Y[cCjJ]") then
				stem = gsub(stem, "..$", 'Nk')
			else
				stem = gsub(stem, ".$", 'k')
			end
		end

		if ends_with(stem, "[kwp]") then	-- if stem ends in {k, ṭ, p}
			if starts_with(ending, "[gGjJqQdDbB]") then -- {k, ṭ, p} > {g, ḍ, b} before following voiced stop
				stem = gsub(stem, ".$", {['k'] = 'g', ['w'] = 'q', ['p'] = 'b'})
			elseif starts_with(ending, "h") then		-- {k, ṭ, p}h > {ggh, ḍḍh, bbh}
				-- Both replacements depend on the ORIGINAL stem-final stop, so
				-- read it before the stem is modified.
				local stop = match(stem, ".$")
				local voiced = {['k'] = 'g', ['w'] = 'q', ['p'] = 'b'}
				local voiced_aspirate = {['k'] = 'G', ['w'] = 'Q', ['p'] = 'B'}
				stem = gsub(stem, ".$", voiced[stop])
				ending = gsub(ending, "^.", voiced_aspirate[stop])
			end
		elseif ends_with(stem, "[gqbd]") then	-- if stem ends in {g, ḍ, d, b}
			if starts_with(ending, "[kKcCwWtTpP]") then -- {g, ḍ, d, b} > {k, ṭ, t, p} before following unvoiced stop
				stem = gsub(stem, ".$", {['g'] = 'k', ['q'] = 'w', ['d'] = 't', ['b'] = 'p'})
			end
		elseif ends_with(stem, "t") then	-- if stem ends in t
			if starts_with(ending, "[cCjJwWqQ]") then	-- homorganic with following stop
				-- Keyed by the first segment of the ENDING; the value is the
				-- plain stop that replaces the stem-final t.
				local assimilated = {
					['c'] = 'c', ['C'] = 'c', ['j'] = 'j', ['J'] = 'j',
					['w'] = 'w', ['W'] = 'w', ['q'] = 'q', ['Q'] = 'q',
				}
				stem = gsub(stem, ".$", assimilated[sub(ending, 1, 1)])
			elseif starts_with(ending, "S") then		-- tś > cch
				stem = gsub(stem, ".$", "c")
				ending = gsub(ending, "^.", "C")
			elseif starts_with(ending, "[gGdDbB]") then	-- t > d /__{g(h), d(h), b(h)}
				stem = gsub(stem, ".$", "d")
			elseif starts_with(ending, "[nm]") then		-- t > n /__{n, m}
				stem = gsub(stem, ".$", "n")
			elseif starts_with(ending, "h") then		-- t + h > ddh
				stem = gsub(stem, ".$", "d")
				ending = gsub(ending, "^.", "D")
			end
		elseif ends_with(stem, "m") then	-- if stem ends in m
			if starts_with(ending, "[hSzs]") then		-- m > ṃ /__{h, ś, ṣ, s}
				stem = gsub(stem, ".$", "M")
			elseif starts_with(ending, "[^yrln]") then	-- m > n before any other consonant beside {y, r, l, n} (FIXME: Is this true? Should this feed into the next conditional?)
				stem = gsub(stem, ".$", "n")
			end
		elseif ends_with(stem, "n") then	-- if stem ends in n
			if starts_with(ending, "[hSzs]") then		-- n > ṃ /__{h, ś, ṣ, s}
				stem = gsub(stem, ".$", "M")
			elseif starts_with(ending, "[kKgGcCjJwWqQpPbBl]") then	-- make homorganic
				-- Keyed by the first segment of the ENDING; the value is the
				-- nasal that replaces the stem-final n.
				local homorganic_nasal = {
					['k'] = 'N', ['K'] = 'N', ['g'] = 'N', ['G'] = 'N',
					['c'] = 'Y', ['C'] = 'Y', ['j'] = 'Y', ['J'] = 'Y',
					['w'] = 'R', ['W'] = 'R', ['q'] = 'R', ['Q'] = 'R',
					['p'] = 'm', ['P'] = 'm', ['b'] = 'm', ['B'] = 'm',
					['l'] = 'M', -- or 'l~'
				}
				stem = gsub(stem, ".$", homorganic_nasal[sub(ending, 1, 1)])
			end
		-- Loss of {z, ẓ} with compensatory lengthening before voiced consonant. (FIXME: but what about the médha- < *mázdha- type?)
		elseif ends_with(stem, "[aA][/\\]?[sHr]") and starts_with(ending, "[rgGdDbByvjJqQlLhnm]") and (not input_table.ignore_s_allophones) then
			stem = gsub(stem, "([aA])([/\\]?)[sHr]$",
				function(vow, accent) return (vow == "a" and "o" or "A") .. accent end)
		-- final s-allophones
		elseif ends_with(stem, "[sHr]") and (not input_table.ignore_s_allophones) then
			if starts_with(ending, "[kKpPzsS]") then		-- visarga
				stem = gsub(stem, ".$", "H")
			elseif starts_with(ending, "[cCwWtT]") then		-- homorganic fricative
				local homorg_s = {
						['c'] = 'S', ['C'] = 'S',
						['w'] = 'z', ['W'] = 'z',
						['t'] = 's', ['T'] = 's'
					}
				stem = gsub(stem, ".$", homorg_s[sub(ending, 1, 1)])
			elseif starts_with(ending, "r") then					-- Loss of z before r with compensatory lengthening before voiced consonant. (FIXME: Redundant/correct?)
				-- The accent mark is optional so that unaccented vowels are
				-- handled too; vowels without a long counterpart stay as they are.
				stem = gsub(stem, "(" .. export.vowel .. "[/\\]?)[sHr]$", function(vow) return export.lengthen[vow] or vow end)
			elseif starts_with(ending, "[gGjJqQdDbByvlLhnm]") then	-- s-allophones > r (FIXME: Redundant/correct?)
				stem = gsub(stem, ".$", "r")
			end
		elseif ends_with(stem, export.consonant .. "r") then -- Final Cr > Cṛ if the ending begins with a consonant
			stem = gsub(stem, "r$", "f")
		end
	end
	stem, ending = export.retroflexion(stem, ending)
	combined = combine_accent(stem, ending, input_table.has_accent, input_table.accent_override, input_table.mono, input_table.recessive)
	if input_table.non_final then return combined end
	return absolute_final(combined, input_table.ambig_hint)
end

return export