Module:sa-translit

From Wiktionary, the free dictionary
Jump to navigation Jump to search

This module will transliterate Sanskrit language text per WT:SA TR. It is also used to transliterate Apabhramsa, Old Awadhi, Old Gujarati, Old Hindi, Old Marathi, and Pali. The module should preferably not be called directly from templates or other modules. To use it from a template, use {{xlit}}. Within a module, use Module:languages#Language:transliterate.

For testcases, see Module:sa-translit/testcases.

Functions

tr(text, lang, sc)
Transliterates a given piece of text written in the script specified by the code sc, and language specified by the code lang.
When the transliteration fails, returns nil.

-- Table of public functions; it is returned at the end of the module.
local export = {}

-- Project-local string helpers; gsub and char are taken from here rather than
-- from the plain Lua string library (presumably Unicode-aware — verify against
-- Module:string utilities).
local m_str_utils = require("Module:string utilities")

local gsub = m_str_utils.gsub
local toNFC = mw.ustring.toNFC -- applied to the final result in export.tr
local U = m_str_utils.char -- builds a string from code point(s)

-- Combining marks used in the Latin output.
local grave = U(0x300) -- U+0300, written on vowels carrying an (independent) svarita
local acute = U(0x301) -- U+0301, written on vowels carrying an udatta
local diaeresis = U(0x308) -- U+0308, put on i/u that directly follows a, to keep the pair apart from ai/au
-- Accent signs looked for in the Devanagari input.
local svar = U(0x951) -- U+0951, svarita sign
local anud = U(0x952) -- U+0952, anudatta sign
local d_svar = U(0x1CDA) -- double svarita, sometimes used for long vowel with svarita

-- Devanagari consonant letters and their Latin equivalents. The values carry
-- no inherent vowel; export.tr adds the 'a' where the script implies one.
local consonants = {
	-- velars
	["क"] = "k", ["ख"] = "kh", ["ग"] = "g", ["घ"] = "gh", ["ङ"] = "ṅ",
	-- palatals
	["च"] = "c", ["छ"] = "ch", ["ज"] = "j", ["झ"] = "jh", ["ञ"] = "ñ",
	-- retroflexes
	["ट"] = "ṭ", ["ठ"] = "ṭh", ["ड"] = "ḍ", ["ढ"] = "ḍh", ["ण"] = "ṇ",
	-- dentals
	["त"] = "t", ["थ"] = "th", ["द"] = "d", ["ध"] = "dh", ["न"] = "n",
	-- labials
	["प"] = "p", ["फ"] = "ph", ["ब"] = "b", ["भ"] = "bh", ["म"] = "m",
	-- semivowels, plus ḷa
	["य"] = "y", ["र"] = "r", ["ल"] = "l", ["व"] = "v", ["ळ"] = "ḷ",
	-- sibilants and h
	["श"] = "ś", ["ष"] = "ṣ", ["स"] = "s", ["ह"] = "h",
}

-- Dependent vowel signs (matras) and their Latin equivalents. The virama maps
-- to the empty string, so a consonant followed by it gets no vowel at all.
local diacritics = {
	["ा"] = "ā",
	["ि"] = "i", ["ी"] = "ī",
	["ु"] = "u", ["ू"] = "ū",
	["ृ"] = "ṛ", ["ॄ"] = "ṝ",
	["ॢ"] = "ḷ", ["ॣ"] = "ḹ",
	["े"] = "e", ["ै"] = "ai",
	["ो"] = "o", ["ौ"] = "au",
	["्"] = "", -- virama
}

-- Everything that is transliterated one character at a time, independently of
-- its neighbours: independent vowels, nasalisation and aspiration signs,
-- digits and a few special symbols.
local tt = {
	-- independent vowels
	["अ"] = "a", ["आ"] = "ā",
	["इ"] = "i", ["ई"] = "ī",
	["उ"] = "u", ["ऊ"] = "ū",
	["ऋ"] = "ṛ", ["ॠ"] = "ṝ",
	["ऌ"] = "ḷ", ["ॡ"] = "ḹ",
	["ए"] = "e", ["ऐ"] = "ai",
	["ओ"] = "o", ["औ"] = "au",

	-- candrabindu (until a better method is found)
	["ँ"] = "m̐",
	-- anusvara (until a better method is found)
	["ं"] = "ṃ",
	-- candrabindu virama, rendered like the anusvara
	["ꣳ"] = "ṃ",
	-- visarga
	["ः"] = "ḥ",
	-- avagraha
	["ऽ"] = "ʼ",

	-- digits
	["०"] = "0", ["१"] = "1", ["२"] = "2", ["३"] = "3", ["४"] = "4",
	["५"] = "5", ["६"] = "6", ["७"] = "7", ["८"] = "8", ["९"] = "9",

	-- punctuation: danda and double danda are deliberately left out of this
	-- table (entries kept here, disabled)
--	["॥"] = ".", -- double danda
--	["।"] = ".", -- danda

	-- Vedic extensions
	["ᳵ"] = "x", ["ᳶ"] = "f",

	-- Om
	["ॐ"] = "oṃ",

	-- asterisk marking a reconstructed form: dropped from the output
	["*"] = "",
}

--- Transliterates Devanagari text into Latin script.
-- If the text contains an anudatta, svarita or double-svarita sign, the accent
-- signs are first converted into grave (independent svarita) and acute
-- (udatta) marks on the vowels; otherwise consonant + vowel-sign sequences are
-- converted directly. The result is NFC-normalized.
-- Characters that have no entry in the lookup tables are left in the output
-- unchanged.
-- @param text  the string to transliterate
-- @param lang  language code; not used by this function
-- @param sc    script code; anything other than "Deva" makes the function return nil
-- @return the transliterated string, or nil if sc is not "Deva"
function export.tr(text, lang, sc)
	if sc ~= "Deva" then
		return nil
	end

	-- Vedic accent handling 
	if text:match(anud) or text:match(svar) or text:match(d_svar) then
		-- insert 'a' after consonants without vowel diacritic or virama
		text = gsub(text, '([क-ह])([ा-्ॢॣ]?)', 
			function(c,d)
				if d == "" then return c .. 'a' else return c .. d end
			end)
		local vow_list = "aअ-औा-ौॠ-ॣ"
		local vow = "[" .. vow_list .. "]"
		local extra_list = "ःंँ" -- visarga, anusvara, candrabindu
		local extra = "[" .. extra_list .. "]"		
		local acc_list = grave .. acute .. svar .. anud .. d_svar
--		local cons_list = "क-हᳵᳶऽ् \'" -- consonants + avagraha + virama + space + apostrophe (from e.g. bold formatting)
		-- Workaround: the consonants (plus a few other signs, see outcommented 'local cons_list') 
		-- are defined by negating the non-consonants, so as to include 
		-- the munged versions of formatting characters (e.g. bold formatting)
		local cons = "[^" .. vow_list .. acc_list .. extra_list .. "।॥ॐ]" 

		-- Replacement callback shared by the patterns below that end in
		-- "(vowel)(extra?)(next char)": `a` is the text up to and including
		-- the vowel to be accented, `b` the optional visarga/anusvara/
		-- candrabindu, `c` the single character after it.
		local function mark_accent(a, b, c)
			if c == svar or c == d_svar then
				return a .. grave .. b -- svarita sign follows (and is consumed): independent svarita
			else
				return a .. acute .. b .. c -- udatta
			end
		end

		-- independent svarita before udatta or other independent svarita (indicated by १/३ with both svarita and anudatta sign)
		text = gsub(text, "(" .. extra .. "?)" .. anud .. "?[१३][" .. anud .. svar .. d_svar .. "]+(" .. 
			cons .. "*" .. vow .. ")(" .. extra .. "?)([" .. svar .. d_svar .. "]?)",
			function(a,b,c,d)
				if d ~= "" then
					return grave .. a .. b .. grave .. c	-- 2 × independent svarita
				else
					return grave .. a .. b .. acute .. c	-- independent svarita + udatta
				end
			end)
		-- optional: a few non-Rigvedic ways to mark the independent svarita (but compatible with Rigvedic system)
		-- 1) ᳡ (U+1CE1) used by Atharvavedic Śaunakīya Saṃhitā 
		-- 2) ᳖ (U+1CD6) used by Śuklayajurveda Mādhyandina-Saṃhitā for 'standard' independent svarita
		-- 3) ᳕ (U+1CD5) used by Śuklayajurveda Mādhyandina-Saṃhitā for 'aggravated' independent svarita (before udatta)
		-- note that the Rigvedic system doesn't distinguish between dependent vs. independent 
		-- svarita after udatta (the latter would need manual addition of grave)
		text = gsub(text, "(" .. extra .. "?)[᳡᳖]", grave .. "%1")
		text = gsub(text, "(" .. extra .. "?)᳕(" .. cons .. "*" .. vow ..")", grave .. "%1%2" .. acute)
		-- initial udatta/svarita
		text = gsub(text, "^(" .. cons .. "*" .. vow .. ")(" .. extra .. "?)([^" .. anud .. grave .. extra_list .. "])",
			mark_accent)
		-- the same, after (double) danda or 'om'
		text = gsub(text, "([।॥ॐ]" .. cons .. "*" .. vow .. ")(" .. extra .. "?)([^" .. anud .. grave .. extra_list .. "])",
			mark_accent)
		-- in case of anudatta sign not before other anudatta sign (nor before grave accent from १/३)
		text = gsub(text, "(" .. vow .. extra .. "?" .. anud .. cons .. "*" .. 
			vow .. ")(" .. extra .. "?)([^" .. anud .. grave .. extra_list .. "])",
			mark_accent)
		-- and again (excluding acute on next vowel), in case of overlapping patterns (if 'c' above happens to be another vowel with anudatta)
		text = gsub(text, "(" .. vow .. extra .. "?" .. anud .. cons .. "*" .. 
			vow .. ")(" .. extra .. "?)([^" .. anud .. grave .. acute .. extra_list .. "])",
			mark_accent)
		-- the same, string final
		text = gsub(text, "(" .. vow .. extra .. "?" .. anud .. cons .. "*" .. 
			vow .. ")(" .. extra .. "?)([" .. svar .. d_svar .. "]?)$",
			function(a,b,c)
				if c ~= "" then  
					return a .. grave .. b -- independent svarita
				else
					return a .. acute .. b -- udatta
				end
			end)
		-- unmarked vowel after udatta is also udatta.
		-- One pass only extends a run of udattas by one vowel per non-overlapping
		-- match, so the substitution is applied three times: that covers three
		-- udattas in a row, and the 4 udatta's in a row that occur in RV.1.164.39
		local spread_udatta = "(" .. vow .. acute .. extra .. "?" .. cons .. "*" .. 
			vow .. ")(" .. extra .. "?[^" .. acc_list .. extra_list .. "])"
		for _ = 1, 3 do
			text = gsub(text, spread_udatta, "%1" .. acute .. "%2")
		end
		-- the same, string final
		text = gsub(text, "(" .. vow .. acute .. extra .. "?" .. cons .. "*" .. 
			vow .. ")(" .. extra .. "?)$", "%1" .. acute .. "%2")
		-- remove remaining anudatta and svarita signs
		text = gsub(text, "[" .. anud .. svar .. d_svar .. "]", "")
		
		text = gsub(text, '.', consonants)
		text = gsub(text, '.', diacritics)
	else -- no Vedic accents
		text = gsub(
		text,
		'([क-ह])'..
		'([ािीुूृॄॢॣेैोौ्]?)'..
		'([अ-औ]?)',
		function(c, d, e)
			-- The ranges क-ह and अ-औ also cover code points that have no entry
			-- in `consonants` / `tt` (ऩ ऱ ऴ and ऍ ऎ ऑ ऒ). Fall back to the raw
			-- character for those, as the Vedic branch does, instead of
			-- concatenating nil and raising an error.
			-- (d always has an entry: the class above lists exactly the keys
			-- of `diacritics`.)
			local base = consonants[c] or c
			local vowel = tt[e] or e
			if d == "" and e ~= "" then
				-- consonant with inherent 'a' directly followed by an
				-- independent vowel; i/u get a diaeresis so that the result
				-- cannot be read as the diphthongs ai/au
				if vowel == "i" or vowel == "u" then
					return base .. 'a' .. vowel .. diaeresis
				end
				return base .. 'a' .. vowel
			elseif e ~= "" then
				return base .. diacritics[d] .. vowel
			elseif d == "" then
				return base .. 'a'
			else
				return base .. diacritics[d]
			end
		end)
	end
	
	-- a (optionally accented) directly followed by independent i/u: mark the
	-- hiatus with a diaeresis on the second vowel
	text = gsub(text, '([aअ][' .. acute .. grave .. ']?[इउ])', '%1' .. diaeresis)
	-- everything that is still Devanagari and has a one-to-one mapping
	text = gsub(text, '.', tt)
	-- danda / double danda (with an optional preceding space) become a full stop
	text = gsub(text, " ?[।॥]", ".")
	text = toNFC(text)
	
	return text
end
 
-- Return the table of public functions (export.tr) as the module's value.
return export