Модул:ur-translit
Изглед
Документацију овог модула можете да направите на страници Модул:ur-translit/док
-- Local aliases for the Scribunto Unicode-aware string helpers used below.
local U = mw.ustring.char
local gsub = mw.ustring.gsub
-- Table of public functions; returned at the end of the module.
local export = {}
-- Combining marks and special characters, mostly built from their code points.
local fatHataan = U(0x64B) -- U+064B ARABIC FATHATAN
local zabar = U(0x64E) -- U+064E ARABIC FATHA
local zer = U(0x650) -- U+0650 ARABIC KASRA
local pesh = U(0x64F) -- U+064F ARABIC DAMMA
local zwnj = U(0x200C) -- U+200C ZERO WIDTH NON-JOINER. Is this even used in Urdu? Why was it included in the previous version?
local highhmz = U(0x654) -- U+0654 ARABIC HAMZA ABOVE
local tashdid = U(0x651) -- U+0651 ARABIC SHADDA (called tashdid in Urdu)
local jazm = "ْ"
local he = "ہ"
local ghunna = U(0x658) -- U+0658 ARABIC MARK NOON GHUNNA
local dagger_alif = U(0x670) -- U+0670 ARABIC LETTER SUPERSCRIPT ALEF
-- Character lists that get spliced into bracket classes of the gsub patterns.
-- Compared with `consonants`, `consonantS` omits ؤ and ئ, while `consonantS2`
-- additionally contains ی and و.
local consonants = "ببپتثجچحخدذرزژسشصضطظعغفقکگلࣇمنݨؤہئھٹڈڑ"
local consonantS = "ببپتثجچحخدذرزژسشصضطظعغفقکگلࣇمنݨہھٹڈڑ"
local consonantS2 = "یببپتثجچحخدذرزژسشصضطظعغفقکگلࣇمنݨوؤہھئٹڈڑ"
local semivowel = "یو"
local vowels = "āایئےۓوؤ"
local indvowels = "آایےوؤ"
local hes = "ہح"
-- `diacritics` and `ZZP` are lists of combining marks, likewise used inside
-- bracket classes; ZZP is the shorter list.
local diacritics = "َُِّْٰ"
local ZZP = "َُِ"
local lrm = U(0x200e) -- left-to-right mark
local rlm = U(0x200f) -- right-to-left mark
local consonants_needing_vowels = "ببپتثجچحخدذرزژسشصضطظعغفقکگلࣇمنںݨہئٹڈڑءﷲ"
-- consonants on the right side; includes alif madda
local rconsonants = consonants_needing_vowels .. "ویآ"
-- consonants on the left side; does not include alif madda
local lconsonants = consonants_needing_vowels
-- Pattern fragment for whitespace, apostrophe and double quote, plus the
-- bracket class built from it.
local space_like = "%s'" .. '"'
local space_like_class = "[" .. space_like .. "]"
-- Lookup table handed to the final per-character gsub in export.tr: every
-- character that appears as a key is replaced by its value.
-- Not all letters here are used by Urdu.
local mapping = {
	-- letters
	["آ"] = "ā", ["ب"] = "b", ["پ"] = "p", ["ت"] = "t", ["ٹ"] = "ṭ", ["ث"] = "s",
	["ج"] = "j", ["چ"] = "c", ["ح"] = "h", ["خ"] = "x",
	["د"] = "d", ["ڈ"] = "ḍ", ["ذ"] = "z", ["ر"] = "r", ["ڑ"] = "ṛ", ["ز"] = "z", ["ژ"] = "ź",
	["س"] = "s", ["ش"] = "ś", ["ص"] = "s", ["ض"] = "z",
	["ط"] = "t", ["ظ"] = "z", ["غ"] = "ġ", ["ف"] = "f", ["ق"] = "q",
	["ک"] = "k", ["گ"] = "g", ["ݨ"] = "ṇ", ["ࣇ"] = "ḷ",
	["ل"] = "l", ["م"] = "m", ["ن"] = "n", ["و"] = "o", ["ہ"] = "h", ["ی"] = "e", ["ے"] = "e",
	["۔"] = ".",
	["ں"] = "̃",
	["ھ"] = "h",
	["ع"] = "'",
	["ء"] = "'",
	["أ"] = "",

	-- diacritics
	[zabar] = "a",
	[zer] = "i",
	[pesh] = "u",
	[jazm] = "", -- also sukun - no vowel
	[zwnj] = "-", -- ZWNJ (zero-width non-joiner)

	-- ligatures
	["ﻻ"] = "lā",
	["ﷲ"] = "allāh",

	-- kashida
	["ـ"] = "-", -- kashida, no sound

	-- numerals
	["۱"] = "1", ["۲"] = "2", ["۳"] = "3", ["۴"] = "4", ["۵"] = "5",
	["۶"] = "6", ["۷"] = "7", ["۸"] = "8", ["۹"] = "9", ["۰"] = "0",

	-- punctuation (leave on separate lines)
	["؟"] = "?", -- question mark
	["۔"] = ".", -- period
	["،"] = ",", -- comma
	["؛"] = ";", -- semicolon
	["«"] = "“", -- quotation mark
	["»"] = "”", -- quotation mark
	["٪"] = "%", -- percent
	["؉"] = "‰", -- per mille
	["٫"] = ".", -- decimals
	["٬"] = ",", -- thousand

	-- ye with hamza above, and the free-standing high hamza
	["ۓ"] = "-ye",
	[highhmz] = "-yi",
}
-- Punctuation characters, written so the string can be spliced into a bracket
-- class: export.tr fences each of them with word-boundary markers and
-- has_diacritics_subs strips them.
local punctuation = ":%(%)%[%]*&٫؛؟،ـ«\".\'!»٪؉۔"
-- Digits that `mapping` converts to 0-9; has_diacritics_subs strips them.
local numbers = "۱۲۳۴۵۶۷۸۹۰"
-- Individual letters referenced by name in the substitution rules.
local ain = 'ع'
local alif = 'ا'
local ye = 'ی'
local ye2 = 'ئ'
local ye3 = "ے"
local vao = "و"
local aspirate = 'ھ'
-- `highhmz` is intentionally not redeclared here: the declaration near the top
-- of the file already binds the same U(0x654) value, and a second `local`
-- would only shadow it.
local aiu = "āīūآ"
local n_exceptions = "[^" .. aiu .. "]" -- for nasalization exceptions
-- {pattern, replacement} pairs that export.tr applies, in order, to the input
-- before it checks whether the text carries enough diacritics.
local before_diacritic_checking_subs = {
------------ transformations prior to checking for diacritics --------------
-- replace U+06E5 / U+06E6 with the full-size letters و / ی
{U(0x06E5), "و"},
{U(0x06E6), "ی"},
-- alif or ye followed by a dagger alif is rewritten to a plain alif
{"([" .. alif .. ye .. "])" .. dagger_alif, alif},
-- NOTE(review): the character captured before the fathatan is not reinserted
-- (the replacement has no %1), so it is dropped from the text — confirm that
-- this is intended.
{"([^" .. alif .. ye .. "])" .. fatHataan, alif .. fatHataan},
}
-- Rules applied in order by has_diacritics(): every {pattern, replacement}
-- pair removes (or rewrites) a span that is considered acceptably vocalised.
-- If any text survives all rules, export.tr treats the input as lacking
-- diacritics. The order of the rules matters.
local has_diacritics_subs = {
-- strip lam + lam + he (plain, with tashdid, or with tashdid and dagger alif)
-- and any ۃ
{"لل" .. he , ""},
{"لل" .. tashdid .. he , ""},
{"لل" .. tashdid .. dagger_alif .. he , ""},
{"ۃ" , ""},
-- aspirated consonants should count as 1 consonant, not two
-- NOTE(review): in the next rule the "?" sits inside the bracket class, so it
-- matches a literal question mark rather than making the class optional.
{"([" .. consonants .. "][".. ZZP .. diacritics .. "?])" .. aspirate , "%1"},
{"([" .. consonants .. "])" .. aspirate , "%1"},
{ aspirate , ""},
-- remove punctuation, tashdid, high hamza, ZWNJ and numbers
{"[" .. punctuation .. tashdid .. highhmz .. zwnj .. numbers .. "]", ""},
-- noon ghunna and silent consonants can be removed
-- NOTE(review): the leading ".. [" and the " .. " after "?]" are inside the
-- string literals, so the pattern contains literal spaces and any-character
-- dots; this looks like mis-quoted concatenation — confirm before relying on it.
{ ".. [".. ZZP .. indvowels .. diacritics .. "?] .. ([" .. consonantS2 .. "])" .. "([".. ghunna .. jazm .."])" .. "([" .. consonantS2 .. "])" , ""},
{ "([" .. consonants .. "])" .. ghunna , ""},
{ "([" .. consonantS2 .. "])" .. jazm , ""},
{ "([" .. consonantS2 .. "])" .. "یٰ" , ""},
-- must go before removing final consonants
{"[".. ZZP .. diacritics .. "]" .. alif , alif },
{fatHataan , "" },
{ "([" .. consonantS2 .. "])" .. "[" .. ZZP .. diacritics .. indvowels .. "?]" .. "([ںۓۂۂ])", "" },
{ "([ںۓۂۂ])", "" },
-- normalise combinations of alif / ye with dagger alif or other marks to a
-- bare alif (or to nothing)
{ "([" .. ye .. alif .. "])" .. dagger_alif, alif},
{ dagger_alif .. ye , alif},
{ alif .. "[".. ZZP .. diacritics .. "]" , ""},
{ "[".. ZZP .. diacritics .. "]" .. alif , alif},
{ dagger_alif .. "([" .. ye .. alif .. "])", alif},
-- Remove consonants at end of word or utterance, so that we're OK with
-- words lacking iʿrāb (must go before removing other consonants).
-- If you want to catch places without iʿrāb, comment out the next rule.
{"[" .. lconsonants .. "]$", ""},
-- closed consonants
{"([" .. consonantS2 .. "])[" .. indvowels .. ZZP .. "]", ""},
-- remove consonants (or alif) when followed by diacritics
-- must go after removing tashdid
-- do not remove the diacritics yet because we need them to handle
-- long-vowel sequences of diacritic + pseudo-consonant
{"[" .. lconsonants .. alif .. "]([" .. fatHataan .. zabar .. pesh .. zer .. jazm .. dagger_alif .. "])", "%1"},
-- the following rules must go after removing consonants w/diacritics
-- (the original rationale comment was truncated)
{"([" .. rconsonants .. "])([".. ZZP .. diacritics .. "?][" .. indvowels .. "?])([" .. consonantS2 .. "])", ""},
{"[" .. indvowels .. "]([" .. rconsonants .. "])", ""},
{"[".. ZZP .. diacritics .. "]([" .. lconsonants .. "])", ""},
{"([" .. consonants .. "])[" .. indvowels .. ZZP .. diacritics .. "]", ""},
{"([" .. rconsonants .. "])(" .. space_like_class .. ")", ""},
{"[" .. lconsonants .. "]" .. zabar .. "[".. ye .. ye3 .. vao .. "]", ""},
-- remove vao (and ؤ, with or without pesh)
{ "[" .. lconsonants .. "]" .. vao, ""},
{"ؤ" .. pesh , ""},
{"ؤ", ""},
-- remove ye
{ "[" .. lconsonants .. "]" .. ye, ""},
{ye3, ""},
-- remove consonant + short vowel + he
{"([" .. consonants .. "][" .. ZZP .. "])" .. he,""},
-- remove fatḥa/fatḥatan + alif/ye
{"[" .. fatHataan .. zabar .. "][" .. alif .. ye .. "]", ""},
-- remove diacritics and independent vowels
{"[" .. fatHataan .. zabar .. pesh .. zer .. jazm .. dagger_alif .. "]", ""},
{ "[" .. indvowels .. "]" , ""},
{ "[".. semivowel .."]" .. "[" .. indvowels .. "]" , ""},
-- remove numbers, hamzatu l-waṣl, alif madda
{"[" .. numbers .. "ٱ" .. "آ" .. "]", ""},
-- finally drop whitespace so that only unaccounted-for characters remain
{"%s", ""},
}
-- Reports whether `text` is vocalised well enough to transliterate.
-- Directional marks are stripped first (their presence is tracked), then every
-- rule in has_diacritics_subs is applied in order; the text passes when
-- nothing is left over.
local function has_diacritics(text)
	local remaining, mark_count = gsub(text, "[" .. lrm .. rlm .. "]", "")
	if mark_count > 0 then
		require("Module:debug").track("ur-translit/lrm or rlm")
	end
	for i = 1, #has_diacritics_subs do
		local rule = has_diacritics_subs[i]
		remaining = gsub(remaining, rule[1], rule[2])
	end
	return remaining == ""
end
--- Transliterates vocalised Urdu text into Latin script.
-- @param text  Either the string to transliterate, or a frame-like table whose
--              `args[1]`..`args[5]` supply text, lang, sc, omit_i3raab and
--              force_translit (empty strings are treated as absent).
-- @param lang  Accepted for interface compatibility; not read by this function.
-- @param sc    Accepted for interface compatibility; not read by this function.
-- @return The NFC-normalised transliteration, or nil when the text fails the
--         has_diacritics() check and force_translit was not supplied.
function export.tr(text, lang, sc)
	-- Declared local so the frame arguments never leak into the global
	-- environment, and so a stale global from an earlier call cannot switch
	-- force_translit on. omit_i3raab is read from the frame but not used.
	local omit_i3raab, force_translit
	if type(text) == "table" then
		local function f(x) return (x ~= "") and x or nil end
		text, lang, sc, omit_i3raab, force_translit =
			f(text.args[1]), f(text.args[2]), f(text.args[3]), f(text.args[4]), f(text.args[5])
	end
	for _, sub in ipairs(before_diacritic_checking_subs) do
		text = gsub(text, sub[1], sub[2])
	end
	if not force_translit and not has_diacritics(text) then
		require("Module:debug").track("ur-translit/lacking diacritics")
		return nil
	end
	-- define the "end" of a word; literal "#" is protected as HASHTAG first
	text = gsub(text, "#", "HASHTAG")
	text = gsub(text, " | ", "# | #")
	text = gsub(text, "\n" , "#".."\n" .. "#")
	text = gsub(text, "(["..punctuation.."])" , "#".."%1" .. "#")
	text = "##" .. gsub(text, " ", "# #") .. "##"
	text = gsub(text, zwnj, "#"..zwnj.."#")
	-- hashtags now mark the beginning and end of a word
	-- exceptions: whole words with a fixed reading
	text = gsub(text, "#" .. vao .. he .. "#", "#vo#")
	text = gsub(text, "#" .. vao .. pesh .. he .. "#", "#vo#")
	text = gsub(text, "#" .. "پ" .. he .. "#", "#pe#")
	text = gsub(text, "#" .. "پ" .. zer .. he .. "#", "#pe#")
	text = gsub(text, "#" .. ye .. he .. "#", "#ye#")
	text = gsub(text, "#" .. ye .. zer .. he .. "#", "#ye#")
	-- character reformatting
	-- to make an exception for a word, put hashtags on both sides
	text = gsub(text, "ۂ", he .. highhmz)
	text = gsub(text, highhmz, "#"..highhmz.."#")
	--text = gsub(text, 'ىٰ', "ā") -- the first letter is U+0649 (Arabic alif maqṣūra), it doesn't belong here
	text = gsub(text, 'یٰ', "ā") -- the first letter is U+06CC
	text = gsub(text, 'ٰ', "ā")
	text = gsub(text, 'ا' .. fatHataan, "an")
	text = gsub(text, 'لا', "ﻻ")
	text = gsub(text, "ة" , "ۃ")
	text = gsub(text, "ۃ" .. "([" .. ZZP .. "])", "ت%1")
	text = gsub(text, "ۃ" , he)
	-- Tashdeed: double the consonant
	text = gsub(text, '([' .. consonantS2 .. '])' .. tashdid, "%1%1")
	-- NOTE(review): the line above already rewrites every consonantS2 + tashdid
	-- pair, so this pattern appears to have nothing left to match.
	text = gsub(text, '([' .. consonantS2 .. '])' .. tashdid .. '([' .. ZZP .. '])', "%1%1%2")
	-- For some reason the tashdeed gets pushed after the other diacritics, so this line is necessary for tashdeed to work with other diacritics
	text = gsub(text, '([' .. consonants .. '])' .. '([' .. ZZP .. '])' .. tashdid, "%1%1%2")
	-- move a short vowel from before the aspirate marker to after it
	text = gsub(text, '([' .. ZZP .. '])' .. aspirate, aspirate.."%1")
	-- NOTE(review): this pattern has no capture group, so %1 does not refer to
	-- a separate capture here — confirm the intended replacement.
	text = gsub(text, dagger_alif .. aspirate, aspirate.."%1")
	text = gsub(text, ye .. '([' .. ZZP .. '])' .. tashdid, "yy%1")
	text = gsub(text, vao .. '([' .. ZZP .. '])' .. tashdid, "vv%1")
	-- NOTE(review): ye and vao are part of consonantS2, so the generic tashdeed
	-- rule above appears to pre-empt the next two lines.
	text = gsub(text, ye .. tashdid .. '([' .. ZZP .. '])', "yy%1")
	text = gsub(text, vao .. tashdid .. '([' .. ZZP .. '])', "vv%1")
	-- alif directly after a consonant is read as ā
	text = gsub(text, "(["..consonantS2.."])" .. alif, "%1ā")
	-- alifs paired to a consonant are a vowel
	text = gsub(text, jazm .. alif, "-") -- invisible ZWNJ
	text = gsub(text, jazm .. "آ", "-ā") -- invisible ZWNJ
	text = gsub(text, "(["..consonantS2.."])" .. "آ", "%1'ā")
	text = gsub(text, pesh .. vao .. zabar .. alif , "ūā" )
	text = gsub(text, zabar .. alif, "ā")
	text = gsub(text, "(" .. ghunna .. ")" .. alif, "%1ā")
	text = gsub(text, "(["..diacritics.."])" .. alif, "%1")
	text = gsub(text, "(["..ZZP.."])" .. alif, "%1")
	-- alifs not paired to a consonant are a glottal stop (not shown currently)
	text = gsub(text, alif.."(["..diacritics.."])".. "(["..consonantS2.."])", "%1%2")
	text = gsub(text, alif..ye.."#", "ī")
	text = gsub(text, alif..ye, "e")
	text = gsub(text, alif..ye3, "e")
	text = gsub(text, alif..zabar..ye3, "ai")
	text = gsub(text, alif..vao, "o")
	text = gsub(text, alif..zer..ye, "ī")
	text = gsub(text, alif..pesh..vao, "ū")
	text = gsub(text, alif.."(["..diacritics.."])", "%1")
	-- convert semi vowels
	text = gsub(text, vao.. "(["..diacritics..ZZP.."])", "v%1")
	text = gsub(text, ye.. "(["..diacritics..ZZP.."])", "y%1")
	text = gsub(text, ye .. "ā", "yā")
	text = gsub(text, vao.. "ā", "vā")
	text = gsub(text, ye .. "(["..zabar.."]?)" .. ye3, "y%1"..ye3.."")
	text = gsub(text, vao .. "(["..zabar.."]?)" .. ye3, "v%1"..ye3.."")
	text = gsub(text, ye .. "(["..semivowel.."])(["..semivowel.."])", "e%1%2")
	text = gsub(text, vao .. "(["..semivowel.."])(["..semivowel.."])", "o%1%2")
	text = gsub(text, ye .. "(["..semivowel.."])", "y%1")
	text = gsub(text, vao .. "(["..semivowel.."])", "v%1")
	-- conversions for vaav/vaw/vao
	text = gsub(text, pesh.. vao, "ū")
	text = gsub(text, zabar .. vao, "au")
	text = gsub(text, vao.. "(["..diacritics..ZZP.."])", "v%1")
	text = gsub(text, "(["..diacritics..ZZP.."])" .. vao, "%1v")
	-- conversions for ye
	text = gsub(text, zer.. ye, "ī")
	text = gsub(text, ye .. "#", "ī#")
	text = gsub(text, zabar.. ye, "ai")
	text = gsub(text, zabar.. ye3, "ai")
	text = gsub(text, ye .. "(["..diacritics..ZZP.."])", "y%1")
	text = gsub(text, "(["..diacritics..ZZP.."])" .. ye , "%1y")
	-- final he and izafa/ezafe
	text = gsub(text, "e" .. zer .. "#", "e-yi#")
	text = gsub(text, "ī" .. zer .. "#", "ī-yi#")
	text = gsub(text, "y" .. zer .. "#", "-yi#")
	text = gsub(text, zer .. "#", "-i#")
	text = gsub(text, "(["..ZZP.."])" .. he .. "#" .. zwnj, "%1-")
	text = gsub(text, "(["..ZZP.."])" .. he .. "#", "%1#")
	text = gsub(text, zabar .. he .. "#", "a#")
	-- noon ghunna assimilation/nasalization
	-- remove impossible nasal vowels
	text = mw.ustring.gsub(text, "ن" .. ghunna .. "([ب])", "m%1") -- nasal vowels are impossible before b
	text = mw.ustring.gsub(text, "ن" .. ghunna .. "ت" .. aspirate, "nth")
	text = mw.ustring.gsub(text, "ن" .. ghunna .. "([قگ])", "ṅ%1") -- impossible before q and g
	text = mw.ustring.gsub(text, "(" .. n_exceptions .. ")" .. "ن" .. ghunna .. "ٹ" .. aspirate , "%1ṇṭh")
	text = mw.ustring.gsub(text, "(" .. n_exceptions .. ")" .. "ن" .. ghunna .. "پ" .. aspirate, "%1mph")
	text = mw.ustring.gsub(text, "(" .. n_exceptions .. ")" .. "ن" .. ghunna .. "ک" .. aspirate, "%1ṅkh")
	text = mw.ustring.gsub(text, "ن" .. ghunna .. "([ج])", "ñ%1") -- impossible before j
	text = mw.ustring.gsub(text, "ن".. ghunna .. "ڈ" .. aspirate, "ṇḍh") -- aspirated d/D cant be nasalized
	text = mw.ustring.gsub(text, "ن".. ghunna .. "د" .. aspirate, "ndh") -- aspirated d/D cant be nasalized
	-- other nasals
	text = mw.ustring.gsub(text, "ن" .. jazm .. "([کگق])" .. "#", "ṅ%1#")
	text = mw.ustring.gsub(text, "ن" .. jazm .. "([دتر])", "n%1") -- dental
	text = mw.ustring.gsub(text, "ن" .. jazm .. "([چج])", "ñ%1") -- postalveolar
	-- if noon ghunna cannot assimilate, it becomes a nasal vowel.
	text = mw.ustring.gsub(text, "ن" .. ghunna, "ں")
	-- get rid of hashtags (not needed) and restore the protected literal "#"
	text = gsub(text, "#", "")
	text = gsub(text, "HASHTAG", "#")
	text = string.gsub(text, lrm, "")
	text = string.gsub(text, rlm, "")
	-- convert all characters: anything with an entry in `mapping` is replaced
	text = mw.ustring.gsub(text, '.', mapping)
	-- vowel fixes
	-- nasalized diphthongs
	text = gsub(text, 'a([iu])̃', 'a͠%1')
	-- Final corrections: collapse doubled-up vowel and "l" sequences
	text = mw.ustring.gsub(text, "lll", "ll")
	text = mw.ustring.gsub(text, "āa", "ā")
	text = mw.ustring.gsub(text, "aaa", "ā")
	text = mw.ustring.gsub(text, "āā", "ā")
	text = mw.ustring.gsub(text, "aa", "ā")
	-- now get rid of the zero consonants
	text = mw.ustring.gsub(text, "ئ", "")
	text = mw.ustring.gsub(text, "u" .. "ؤ" , "u")
	text = mw.ustring.gsub(text, "ؤ" .. "u" , "ū")
	text = mw.ustring.gsub(text, "ؤ", "o")
	text = mw.ustring.toNFC(text)
	return text
end
-- Hand the table of public functions (export.tr) to whoever loads this module.
return export