Mô đun:uz-afg-translit
Giao diện
- Tài liệu bên dưới được tạo bởi Module:documentation2/functions/translit. [sửa]
- Liên kết hữu ích: danh sách trang con • liên kết • nhúng • trường hợp kiểm thử • chỗ thử
Mô đun này có chức năng chuyển tự văn bản Tiếng Nam Uzbek.
Lưu ý, không nên gọi mô đun này trực tiếp trong các bản mẫu hoặc mô đun khác.
Để sử dụng trong một bản mẫu, hãy dùng {{xlit}}.
Còn trong một mô đun, hãy dùng Mô đun:languages#Language:transliterate.
Đối với trường hợp kiểm thử, xem Module:uz-afg-translit/testcases.
Chức năng
tr(text, lang, sc)
- Chuyển tự một text (văn bản) được đưa ra, viết bằng chữ viết được xác định bởi mã sc và thuộc ngôn ngữ được xác định bởi mã lang.
- Nếu chuyển tự thất bại, hàm sẽ trả về giá trị nil.
local export = {}
local m_str_utils = require("Module:string utilities")
local gcodepoint = m_str_utils.gcodepoint
local rfind = m_str_utils.find
local rsubn = m_str_utils.gsub
local rmatch = m_str_utils.match
local rsplit = m_str_utils.split
local U = m_str_utils.char
local unpack = unpack or table.unpack -- Lua 5.2 compatibility
-- assigned below
local has_diacritics
-- Single-result wrapper around rsubn(): performs the same substitution but
-- hands back only the resulting string, dropping the replacement count.
local function rsub(term, foo, bar)
	-- The extra parentheses truncate the multiple results to the first one.
	return (rsubn(term, foo, bar))
end
-- Named characters used to build the patterns and lookup tables below.
-- U is m_str_utils.char; the remaining values are written as string literals.
local zwnj = U(0x200C) -- zero-width non-joiner
local alif_madda = U(0x622)
local alif_hamza_below = U(0x625)
local alif = U(0x627)
local taa_marbuuTa = U(0x629)
local laam = U(0x644)
local waaw = U(0x648)
-- NOTE(review): this literal looks like more than one code point (a yaa
-- followed by a combining mark). If so, inside a [...] class each code point
-- matches on its own, and a single-character lookup can never hit it as a
-- table key -- confirm this is intended.
local alif_maqSuura = "یٰ"
local yaa = "ی"
-- tanwin (nunation) marks
local fatHataan = U(0x64B)
local Dammataan = U(0x64C)
local kasrataan = U(0x64D)
-- short-vowel marks
local fatHa = U(0x64E)
local Damma = U(0x64F)
local kasra = U(0x650)
local shadda = U(0x651)
local sukuun = U(0x652)
-- letters transliterated as "oʻ" and "e" in the tt table
local waaw_maj = "ۉ"
local yaa_maj = "ې"
local dagger_alif = U(0x670)
local alif_waSl = U(0x671)
--local zwj = U(0x200D) -- zero-width joiner
local lrm = U(0x200E) -- left-to-right mark
local rlm = U(0x200F) -- right-to-left mark
-- Occurs after al- in allaḏī and variants so that we can implement elision of
-- a- after a preceding vowel, after which we remove the marker.
local alladi_marker = U(0xFFF0)
-- Character-by-character transliteration table. export.tr applies it as the
-- replacement table for every single character once the context-sensitive
-- rules have run; characters without an entry are left unchanged.
local tt = {
-- consonants
["ب"]="b",
["پ"]="p",
["ت"]="t",
["ث"]="s",
["ج"]="j",
["چ"]="ch",
["ح"]="h",
["خ"]="x",
["د"]="d", ["ذ"]="z", ["ر"]="r",
["ز"]="z",
["ژ"]="zh",
["س"]="s", ["ش"]="sh",
["ص"]="s",
["ض"]="z",
["ط"]="t",
["ظ"]="z",
["ع"]="",
["غ"]="gʻ",
["ف"]="f",
["ق"]="q",
["ک"]="k",
["گ"]="g",
["ل"]="l",
["م"]="m",
["ن"]="n",
["ه"]="h",
-- toʾ marbūta (special) - always after a fátḥa (a), silent at the end of
-- an utterance, "t" in ʾizofa or with pronounced tanwīn. We catch
-- most instances of toʾ marbūta before we get to this stage.
[taa_marbuuTa]="t", -- toʾ marbūta = ة
-- control characters
[zwnj]="-", -- ZWNJ (zero-width non-joiner)
-- [zwj]="", -- ZWJ (zero-width joiner)
-- rare letters
-- semivowels or long vowels, alif, hamza, special letters
["ا"]="o", -- ʾalif
-- hamzated letters
["أ"]="ʼ", -- hamza over alif
[alif_hamza_below]="ʼ", -- hamza under alif
["ؤ"]="ʼ", -- hamza over wow
["ئ"]="ʼ", -- hamza over yo
["ء"]="ʼ", -- hamza on the line
-- long vowels
[waaw]="v", --"ū" after zamma (u) and not before diacritic
[yaa]="y", --"ī" after kasra (i) and not before diacritic
[alif_maqSuura]="o", -- ʾalif maqsūra
[alif_madda]="o", -- ʾalif madda
[alif_waSl]= "", -- hamzatu l-wasl
[dagger_alif] = "o", -- ʾalif xanjariyya = dagger ʾalif (Koranic diacritic)
-- short vowels, šádda and sukūn
[fatHataan]="an", -- fatḥatan
[Dammataan]="un", -- zammatan
[kasrataan]="in", -- kasratan
[fatHa]="a", -- fatḥa
[Damma]="u", -- zamma
[kasra]="i", -- kasra
[waaw_maj]="oʻ",
[yaa_maj]="e",
-- šadda - doubled consonant (no entry here; export.tr doubles the preceding
-- character before this table is applied)
[sukuun]="", --sukūn - no vowel
-- ligatures
["ﻻ"]="lo",
["ﷲ"]="lloh",
-- tatwīl
["ـ"]="", -- tatwīl, no sound
-- numerals
["۱"]="1", ["۲"]="2", ["۳"]="3", ["۴"]="4", ["۵"]="5",
["۶"]="6", ["۷"]="7", ["۸"]="8", ["۹"]="9", ["۰"]="0",
-- punctuation (leave on separate lines)
["؟"]="?", -- question mark
["«"]='“', -- quotation mark
["»"]='”', -- quotation mark
["٫"]=".", -- decimal point
["٬"]=",", -- thousands separator
["٪"]="%", -- percent sign
["،"]=",", -- comma
["؛"]=";" -- semicolon
}
-- The letter he, referenced by the "final he" rules.
local he = "ه"
-- Letters that assimilate the l of the definite article.
local sun_letters = "تثدذرزسشصضطظلن"
-- For use in implementing sun-letter assimilation of ال (al-)
--   ttsun1: sun letter -> its transliteration from tt
--   ttsun2: "l-" .. sun letter -> transliteration .. "-" .. sun letter
--           (the trailing letter is left untransliterated so that later
--           rules still see it)
--   ttsun3: list of the transliterations, in sun_letters order
local ttsun1 = {}
local ttsun2 = {}
local ttsun3 = {}
for cp in gcodepoint(sun_letters) do
local ch = U(cp)
ttsun1[ch] = tt[ch]
ttsun2["l-" .. ch] = tt[ch] .. "-" .. ch
table.insert(ttsun3, tt[ch])
end
-- For use in implementing elision of al-
-- (concatenated transliterations, used inside a [...] character class)
local sun_letters_tr = table.concat(ttsun3, "")
-- Character-class contents; each string is spliced between "[" and "]".
local consonants_needing_vowels = "بتثجحخدذرزسشصضطظعغفقکلمنهپچژڤگڨڧڢںأإؤئءةﷲ"
-- consonants on the right side; includes alif madda
local rconsonants = consonants_needing_vowels .. "ویآ"
-- consonants on the left side; does not include alif madda
local lconsonants = consonants_needing_vowels .. "وی"
-- Arabic semicolon, comma, question mark; tatwīl; period, exclamation point,
-- single quote for bold/italic, double quotes for quoted material
local punctuation = "؟،؛" .. "ـ" .. ".!'" .. '"'
-- whitespace, quotes, ZWNJ, newline and any punctuation (%p)
local space_like = "%s'" .. '"' .. zwnj .. "\n" .. "%p"
local space_like_class = "[" .. space_like .. "]"
local numbers = "۱۲۳۴۵۶۷۸۹۰"
-- Ordered list of {pattern, replacement} rules that export.tr applies, in
-- sequence, before it checks whether the text is vocalized. Each pattern is
-- a Lua pattern; each replacement is a string (with %1, %2 ... back-references)
-- or a lookup table keyed on the whole match. The order of the rules matters.
local before_diacritic_checking_subs = {
	------------ transformations prior to checking for diacritics --------------
	-- random Koranic marks and presentation forms
	{U(0x06E1), sukuun}, -- "Small High Dotless Head of Khah" (variant of sukūn)
	{U(0x06DA), ""}, -- "Small High Jeem"
	{U(0x06DF), ""}, -- "Small High Rounded Zero" (FIXME: correct?)
	{U(0x08F0), U(0x64B)}, -- "Open Fathatan"
	{U(0x08F1), U(0x64C)}, -- "Open Dammatan"
	{U(0x08F2), U(0x64D)}, -- "Open Kasratan"
	{U(0x06E4), ""}, -- "Small High Madda" (FIXME: correct?)
	{U(0x06D6), ""}, -- "Small High Ligature Sad with Lam with Alef Maksura" (FIXME: there are others we need to do)
	{U(0x06E5), "و"},
	{U(0x06E6), "ی"},
	-- convert llh for alloh into ll+shadda+dagger-alif+h
	{"لله", "للّٰه"},
	-- shadda+short-vowel (including tanwīn vowels, i.e. -an -in -un) gets
	-- replaced with short-vowel+shadda during NFC normalisation, which
	-- MediaWiki does for all Unicode strings; however, it makes the
	-- transliteration process inconvenient, so undo it.
	{"([" .. fatHataan .. Dammataan .. kasrataan .. fatHa .. Damma .. kasra .. dagger_alif .. "])" .. shadda, shadda .. "%1"},
	-- ignore Koranic gemination at beginning of word due to assimilation of preceding consonant
	{" ([" .. lconsonants .. "])" .. shadda, " %1"},
	-- ignore alif jamīla (otiose alif in 3pl verb forms)
	-- alif_madda, ayn
	{"(" .. sukuun .. ")" .. alif_madda , "%1" .. "ء" .. fatHa .. alif},
	{"([" .. lconsonants .. sukuun .. "])" .. "ع"
		, "%1" .. "ء" },
	{ "ع" .. sukuun , "ء" .. sukuun },
	-- final he
	{"([" .. fatHa .. Damma .. kasra .. "])" .. "ه" .. "(" .. space_like_class .. ")" , "%1%2" },
	{"([" .. fatHa .. Damma .. kasra .. "])" .. "ه" .. "$" , "%1" },
	-- This pattern has a single capture, so the replacement may only refer to
	-- %1 (it previously referred to a non-existent %2, which is an invalid
	-- capture index whenever the rule matches).
	{"([" .. fatHa .. Damma .. kasra .. "])" .. he .. "$" , "%1" },
	-- #1: handle zamma + wow + alif (final -ū)
	{Damma .. waaw .. alif, Damma .. waaw},
	-- #2: handle wow + sukūn + alif (final -w in -aw in defective verbs)
	-- this must go before the generation of w, which removes the waw here.
	{waaw .. sukuun .. alif, waaw .. sukuun},
	-- ignore final alif or alif maqsūra following fatḥatan (e.g. in accusative
	-- singular or words like عَصًا "stick" or هُدًى "guidance"; this is called
	-- tanwin nasb)
	{"(" .. fatHa .. "?)" .. yaa .. dagger_alif, "%1" .. alif},
	{fatHataan .. "[" .. alif .. alif_maqSuura .. "]", fatHataan},
	-- same but with the fatḥatan placed over the alif or alif maqsūra
	-- instead of over the previous letter (considered a misspelling but
	-- common)
	{"[" .. alif .. alif_maqSuura .. "]" .. fatHataan, fatHataan},
	-- toʾ marbūta should always be preceded by fatḥa, alif, alif madda or
	-- dagger alif; infer fatḥa if not
	{"([^" .. fatHa .. alif .. alif_madda .. dagger_alif .. "])" .. taa_marbuuTa, "%1" .. fatHa .. taa_marbuuTa},
	-- similarly for alif between consonants, possibly marked with shadda
	-- (does not apply to initial alif, which is silent when not marked with
	-- hamza, or final alif, which might be pronounced as -an)
	{"([" .. lconsonants .. "]" .. shadda .. "?)" .. alif .. "([" .. rconsonants .. "])",
		"%1" .. fatHa .. alif .. "%2"},
	-- infer fatḥa in case of non-fatḥa + alif/alif-maqsūra + dagger alif
	{"([^" .. fatHa .. "])([" .. alif .. alif_maqSuura .. "]" .. dagger_alif .. ")", "%1" .. fatHa .. "%2"},
	-- infer kasra in case of hamza-under-alif not + kasra
	{alif_hamza_below .. "([^" .. kasra .. kasrataan .. "])", alif_hamza_below .. kasra .. "%1"},
	-- ignore dagger alif placed over regular alif or alif maqsūra
	{"([" .. alif .. alif_maqSuura .. "])" .. dagger_alif, "%1"},
	----------- rest of these concern definite article alif-lom ----------
	-- in kasra/zamma + alif + lam, make alif into hamzatu l-wasl, so we
	-- handle cases like بِالتَّوْفِیق (bi-t-tawfīq) correctly
	{"([" .. Damma .. kasra .. "])" .. alif .. laam, "%1" .. alif_waSl .. laam},
	-- al + consonant + shadda (only recognize word-initially if regular alif): remove shadda
	{"^(" .. alif .. fatHa .. "?" .. laam .. "[" .. lconsonants .. "])" .. shadda, "%1"},
	{"(" .. space_like_class .. alif .. fatHa .. "?" .. laam .. "[" .. lconsonants .. "])" .. shadda, "%1"},
	{"(" .. alif_waSl .. fatHa .. "?" .. laam .. "[" .. lconsonants .. "])" .. shadda, "%1"},
	-- handle l- hamzatu l-wasl or word-initial al-
	{"^" .. alif .. fatHa .. "?" .. laam, "al-"},
	{"(" .. space_like_class .. ")" .. alif .. fatHa .. "?" .. laam, "%1al-"},
	-- next one for bi-t-tawfīq
	{"([" .. Damma .. kasra .. "])" .. alif_waSl .. fatHa .. "?" .. laam, "%1-l-"},
	-- next one for remaining hamzatu l-wasl (at beginning of word)
	{alif_waSl .. fatHa .. "?" .. laam, "l-"},
	-- special casing if the l in al- has a shadda on it (as in الَّذِی "that"),
	-- so we don't mistakenly double the dash; insert a special marker here so
	-- that we know later to elide the a- after a vowel
	{"l%-" .. shadda, "l" .. alladi_marker .. "l"},
	-- implement assimilation of sun letters
	{"l%-[" .. sun_letters .. "]", ttsun2},
}
-- Transliterate the word(s) in TEXT. LANG (the language) and SC (the script)
-- are ignored. OMIT_I3RAAB means leave out final short vowels (ʾiʿrob).
-- GRAY_I3RAAB means render transliterated short vowels (ʾiʿrob) in gray.
-- FORCE_TRANSLIT causes even non-vocalized text to be transliterated
-- (normally the function checks for non-vocalized text and returns nil,
-- since such text is ambiguous in transliteration).
--
-- TEXT may also be a frame-like table with an `args` field, in which case
-- args[1]..args[5] supply TEXT, LANG, SC, OMIT_I3RAAB and FORCE_TRANSLIT
-- (empty strings count as nil); GRAY_I3RAAB is not read from args.
--
-- Returns the transliterated string, or nil when the text lacks diacritics
-- and FORCE_TRANSLIT is not set.
function export.tr(text, lang, sc, omit_i3raab, gray_i3raab, force_translit)
	-- make it possible to call this function from a template
	if type(text) == "table" then
		local function f(x) return (x ~= "") and x or nil end
		text, lang, sc, omit_i3raab, force_translit =
			f(text.args[1]), f(text.args[2]), f(text.args[3]), f(text.args[4]), f(text.args[5])
	end

	for _, sub in ipairs(before_diacritic_checking_subs) do
		text = rsub(text, sub[1], sub[2])
	end

	if not force_translit and not has_diacritics(text) then
		require("Module:debug").track("ar-translit/lacking diacritics")
		return nil
	end

	-- Character-class contents shared by the rules below.
	local short_vowels = fatHa .. Damma .. kasra
	local tanwin = fatHataan .. Dammataan .. kasrataan
	local any_diacritic = tanwin .. short_vowels .. shadda .. sukuun .. dagger_alif .. waaw_maj .. yaa_maj

	------------ transformations after checking for diacritics --------------
	-- Replace plain alif with hamzatu l-wasl when followed by fatḥa/zamma/kasra.
	-- Must go after handling of initial al-, which distinguishes alif-fatḥa
	-- from alif w/hamzatu l-wasl. Must go before generation of ū and ī, which
	-- eliminate the zamma/kasra.
	text = rsub(text, alif .. "([" .. short_vowels .. waaw_maj .. yaa_maj .. "])", alif_waSl .. "%1")

	-- zamma + waw not followed by a diacritic is ū, otherwise w
	text = rsub(text, Damma .. waaw .. "([^" .. any_diacritic .. "])", "ū%1")
	text = rsub(text, Damma .. waaw .. "$", "ū")
	-- kasra + yaa not followed by a diacritic (or ū from prev step) is ī, otherwise y
	text = rsub(text, kasra .. yaa .. "([^" .. any_diacritic .. "ū])", "ī%1")
	text = rsub(text, kasra .. yaa .. "$", "ī")

	-- final he
	text = rsub(text, "([" .. short_vowels .. "])" .. he .. "$", "%1")
	text = rsub(text, "([" .. short_vowels .. "])" .. he .. "(" .. space_like_class .. zwnj .. ")", "%1%2")
	text = rsub(text, zwnj, "")

	-- convert shadda to double letter.
	text = rsub(text, "(.)" .. shadda, "%1%1")

	if not omit_i3raab and gray_i3raab then -- show ʾiʿrob grayed in transliteration
		local gray_open = '<span style="color: var(--wikt-palette-grey-8,#888)">'
		local gray_close = '</span>'
		local function gray(s)
			return gray_open .. s .. gray_close
		end

		-- decide whether to gray out the t in toʾ marbūta. If word begins with
		-- al- or l-, yes. Otherwise, no if word ends in a/i/u, yes if ends in
		-- an/in/un.
		text = rsub(text, "^(a?l%-[^%s]+)" .. taa_marbuuTa .. "([" .. tanwin .. short_vowels .. "])",
			"%1" .. gray("t") .. "%2")
		text = rsub(text, "(" .. space_like_class .. "a?l%-[^%s]+)" .. taa_marbuuTa .. "([" .. tanwin .. short_vowels .. "])",
			"%1" .. gray("t") .. "%2")
		text = rsub(text, taa_marbuuTa .. "([" .. short_vowels .. "])", "t%1")
		text = rsub(text, taa_marbuuTa .. "([" .. tanwin .. "])", gray("t") .. "%1")
		text = rsub(text, ".", {
			[fatHataan] = gray("an"),
			[kasrataan] = gray("in"),
			[Dammataan] = gray("un"),
		})

		-- Short vowel before a space-like character. The table is a local
		-- (it used to be assigned to an undeclared, i.e. global, variable).
		-- NOTE(review): each value carries its own trailing space in
		-- addition to the captured space-like character; kept as in the
		-- original -- confirm whether the extra space is wanted.
		local vowel_repl = {
			[fatHa] = gray("a") .. " ",
			[kasra] = gray("i") .. " ",
			[Damma] = gray("u") .. " ",
		}
		text = rsub(text, "([" .. short_vowels .. "])(" .. space_like_class .. ")",
			function(vowel, space)
				return vowel_repl[vowel] .. space
			end
		)
		text = rsub(text, "[" .. short_vowels .. "]$", {
			[fatHa] = gray("a"),
			[kasra] = gray("i"),
			[Damma] = gray("u"),
		})

		-- Merge directly adjacent gray spans. The markup contains the pattern
		-- magic characters "(", ")" and "-", so it must be escaped before
		-- being used as a pattern; unescaped, it never matched the literal
		-- markup produced above.
		local adjacent_spans = ((gray_close .. gray_open):gsub("[%^%$%(%)%%%.%[%]%*%+%-%?]", "%%%0"))
		text = rsub(text, adjacent_spans, "")
	elseif omit_i3raab then -- omit ʾiʿrob in transliteration
		text = rsub(text, "[" .. tanwin .. "]", "")
		text = rsub(text, "[" .. short_vowels .. "](" .. space_like_class .. ")", "%1")
		text = rsub(text, "[" .. short_vowels .. "]$", "")
	end

	-- toʾ marbūta should not be rendered by -t if word-final even when
	-- ʾiʿrob (desinential inflection) is shown; instead, use (t) before
	-- whitespace, nothing when final; but render final alif/alif madda +
	-- toʾ marbūta as -oh, consistent with Wehr's dictionary

	-- Left-to-right or right-to-left mark at end of text will prevent toʾ marbūta
	-- from being transliterated correctly.
	text = string.gsub(text, lrm, "")
	text = string.gsub(text, rlm, "")
	text = rsub(text, "([" .. alif .. alif_madda .. "])" .. taa_marbuuTa .. "$", "%1h")
	-- Ignore final toʾ marbūta (it appears as "a" due to the preceding
	-- short vowel). Need to do this after graying or omitting word-final
	-- ʾiʿrob.
	text = rsub(text, taa_marbuuTa .. "$", "")
	text = rsub(text, taa_marbuuTa .. "(%p)", "%1")
	if not omit_i3raab then -- show ʾiʿrob in transliteration
		text = rsub(text, taa_marbuuTa .. "(" .. space_like_class .. ")", "(t)%1")
	else
		-- When omitting ʾiʿrob, show all non-absolutely-final instances of
		-- toʾ marbūta as (t), with trailing ʾiʿrob omitted.
		text = rsub(text, taa_marbuuTa, "(t)")
	end

	-- tatwīl should be rendered as - at beginning or end of word. It will
	-- be rendered as nothing in the middle of a word (FIXME, do we want
	-- this?)
	text = rsub(text, "^ـ", "-")
	text = rsub(text, "(" .. space_like_class .. ")ـ", "%1-")
	text = rsub(text, "ـ$", "-")
	text = rsub(text, "ـ(" .. space_like_class .. ")", "-%1")

	-- Now convert remaining Arabic chars according to table.
	text = rsub(text, ".", tt)
	text = rsub(text, "ao", "o")

	-- Implement elision of al- after a final vowel. We do this
	-- conservatively, only handling elision of the definite article and related
	-- terms (specifically, the relative pronoun allaḏī and variants) rather
	-- than elision in other cases of hamzat al-wasl (e.g. form-I imperatives
	-- or form-VII and above verbal nouns) partly because elision in
	-- these cases isn't so common in MSA and partly to avoid excessive
	-- elision in case of words written with initial bare alif instead of
	-- properly with hamzated alif. Possibly we should reconsider.
	text = rsub(text, "([aiuoīū]'* +'*)a([" .. sun_letters_tr .. "][%-" .. alladi_marker .. "])",
		"%1%2")
	if gray_i3raab then
		text = rsub(text, "([aiuoīū]'*</span>'* +'*)a([" .. sun_letters_tr .. "][%-" .. alladi_marker .. "])",
			"%1%2")
	end

	-- remove indicator of allaḏī, which has served its purpose
	text = rsub(text, alladi_marker, "")

	-- Special-case the transliteration of alloh, without the hyphen.
	text = rsub(text, "^(a?)l%-loh", "%1lloh")
	text = rsub(text, "(" .. space_like_class .. "a?)l%-loh", "%1lloh")

	-- Compress multiple spaces, which may occur e.g. when removing Koranic diacritics.
	text = rsub(text, "(%s)%s+", "%1")

	-- Remove length (not native in uzbek)
	text = rsub(text, "ī", "i")
	text = rsub(text, "ū", "u")

	return text
end
-- Ordered {pattern, replacement} rules used by has_diacritics(). They strip
-- everything that counts as properly vocalized (or irrelevant); whatever
-- survives all the rules is treated as unvocalized text. Order matters, as
-- noted on the individual rules.
local has_diacritics_subs = {
-- FIXME! What about lam-alif ligature?
-- remove punctuation and shadda
-- must go before removing final consonants
{"[" .. punctuation .. shadda .. "]", ""},
-- treat ZWNJ as a word boundary
{zwnj, " "},
-- Remove consonants at end of word or utterance, so that we're OK with
-- words lacking iʿrob (must go before removing other consonants).
-- If you want to catch places without iʿrob, comment out the next two lines.
{"[" .. lconsonants .. "]$", ""},
{"[" .. lconsonants .. "]([%)%]}]?" .. space_like_class .. ")", "%1"},
-- remove consonants (or alif) when followed by diacritics
-- must go after removing shadda
-- do not remove the diacritics yet because we need them to handle
-- long-vowel sequences of diacritic + pseudo-consonant
{"[" .. lconsonants .. alif .. "]([" .. fatHataan .. Dammataan .. kasrataan .. fatHa .. Damma .. kasra .. sukuun .. dagger_alif .. waaw_maj .. yaa_maj .. "])", "%1"},
-- the following two must go after removing consonants w/diacritics because
-- we only want to treat vocalic wow/yo' in them (we want to have removed
-- wow/yo' followed by a diacritic)
-- remove zamma + wow
{Damma .. waaw, ""},
-- remove kasra + yo'
{kasra .. yaa, ""},
-- remove majhuls
{waaw_maj, ""},
{yaa_maj, ""},
-- remove fatḥa/fatḥatan + alif/alif-maqsūra
{"[" .. fatHataan .. fatHa .. "][" .. alif .. alif_maqSuura .. "]", ""},
-- remove diacritics
{"[" .. fatHataan .. Dammataan .. kasrataan .. fatHa .. Damma .. kasra .. sukuun .. dagger_alif .. "]", ""},
-- remove numbers, hamzatu l-wasl, alif madda
{"[" .. numbers .. "ٱ" .. "آ" .. "]", ""},
-- remove non-Arabic characters
-- (keeps only U+0600-06FF, U+0750-077F, U+08A0-08FF, U+FB50-FDFF and
-- U+FE70-FEFF)
{"[^" .. U(0x0600) .. "-" .. U(0x06FF) .. U(0x0750) .. "-" .. U(0x077F) ..
U(0x08A0) .. "-" .. U(0x08FF) .. U(0xFB50) .. "-" .. U(0xFDFF) ..
U(0xFE70) .. "-" .. U(0xFEFF) .. "]", ""}
}
-- declared as local above
-- Report whether TEXT is vocalized: strip LRM/RLM marks (tracking when any
-- were present), run every rule in has_diacritics_subs over the text, and
-- return true only if nothing is left. When something remains, the original
-- and the leftover text are written to the log.
function has_diacritics(text)
	local original = text
	local remainder, mark_count = rsubn(text, "[" .. lrm .. rlm .. "]", "")
	if mark_count > 0 then
		require("Module:debug").track("ar-translit/lrm or rlm")
	end

	for i = 1, #has_diacritics_subs do
		local rule = has_diacritics_subs[i]
		remainder = rsub(remainder, rule[1], rule[2])
	end

	local vocalized = #remainder == 0
	if not vocalized then
		mw.log(("Check for missing diacritics failed; original text '%s', text without diacritics '%s'"):format(
			original, remainder))
	end
	return vocalized
end
-- Decide whether the manual transliteration TR is irregular for ARABIC.
-- Returns false when either argument is missing or empty, when ARABIC cannot
-- be transliterated automatically, or when TR matches the automatic result.
-- The comparison is word by word: hyphens are ignored, and a word-final
-- toʾ marbūta, which the automatic result writes as "(t)", may appear in TR
-- as "h" (after "o"), "t" or nothing.
function export.irregular_translit(arabic, tr)
	local no_arabic = not arabic or arabic == ""
	local no_tr = not tr or tr == ""
	if no_arabic or no_tr then
		return false
	end

	local auto_tr = export.tr(arabic)
	if not auto_tr then
		return false
	end
	if auto_tr == tr then
		return false
	end

	local arabic_words = rsplit(arabic, " ")
	local auto_words = rsplit(auto_tr, " ")
	local manual_words = rsplit(tr, " ")
	local word_count = #auto_words
	if word_count ~= #manual_words or word_count ~= #arabic_words then
		return true
	end

	-- Clitic + optional fatḥa + alif/hamzatu l-wasl + lom at the start of a
	-- word; the same for every word, so built once.
	local clitic_chars = "^[وفیییل]" -- separate line to avoid L2R display weirdness
	local clitic_article = clitic_chars .. fatHa .. "?[" .. alif .. alif_waSl .. "]" .. laam

	for idx = 1, word_count do
		local auto_word = auto_words[idx]
		local manual_word = manual_words[idx]
		local arabic_word = arabic_words[idx]

		-- Resolve final (t) in auto-translit to t, h or nothing
		if rfind(auto_word, "%(t%)$") then
			local ending
			if rfind(manual_word, "oh$") then
				ending = "h"
			elseif rfind(manual_word, "t$") then
				ending = "t"
			else
				ending = ""
			end
			auto_word = rsub(auto_word, "%(t%)$", ending)
		end

		-- Resolve clitics + short a + alif-lom, which may get auto-transliterated
		-- to contain long o, to short a if the manual translit has it; note
		-- that currently in cases with assimilated l, the auto-translit will
		-- fail, so we won't ever get here and don't have to worry about
		-- auto-translit l against manual-translit assimilated char.
		if rfind(arabic_word, clitic_article) and rfind(manual_word, "^[wfkl]a%-") then
			auto_word = rsub(auto_word, "^([wfkl])o", "%1a")
		end

		-- Ignore hyphens when comparing
		local auto_plain = rsub(auto_word, "%-", "")
		local manual_plain = rsub(manual_word, "%-", "")
		if auto_plain ~= manual_plain then
			return true
		end
	end

	return false
end
return export