SinhalaVITS-TTS-F1 / romanizer.py
KasunUoM's picture
Script for Romanizing
bef9309 verified
# Converts Sinhala text to standard romanized (Latin-script) text.
import re
# -- Specials (vowels, diacritics, standalone signs) --
# Each entry is a [sinhala, roman] pair. A few Sinhala keys are listed twice
# with alternate roman spellings; the listing order is preserved exactly.
ro_specials = [
    # vowel letters
    ['ඓ', 'ai'],
    ['ඖ', 'au'],
    ['ඍ', 'ṛ'],
    ['ඎ', 'ṝ'],
    ['ඐ', 'ḹ'],
    ['අ', 'a'],
    ['ආ', 'ā'],
    ['ඇ', 'æ'],
    ['ඇ', 'Æ'],
    ['ඈ', 'ǣ'],
    ['ඉ', 'i'],
    ['ඊ', 'ī'],
    ['උ', 'u'],
    ['ඌ', 'ū'],
    ['එ', 'e'],
    ['ඒ', 'ē'],
    ['ඔ', 'o'],
    ['ඕ', 'ō'],
    # two-code-point key: the letter followed by the vowel-suppressing sign
    ['ඞ්', 'ṅ'],
    # standalone signs
    ['ං', 'ṁ'],
    ['ං', 'ṃ'],
    ['ඃ', 'ḥ'],
    ['ඃ', 'Ḥ'],
]
# -- Consonants --
# Each entry is a [sinhala, roman] pair for a bare consonant letter; the
# roman side carries no inherent vowel. Listing order is preserved exactly.
ro_consonants = [
    # consonants romanized with two letters
    ['ඛ', 'kh'],
    ['ඨ', 'ṭh'],
    ['ඝ', 'gh'],
    ['ඡ', 'ch'],
    ['ඣ', 'jh'],
    ['ඦ', 'ñj'],
    ['ඪ', 'ḍh'],
    ['ඬ', 'ṇḍ'],
    ['ථ', 'th'],
    ['ධ', 'dh'],
    ['ඵ', 'ph'],
    ['භ', 'bh'],
    ['ඹ', 'mb'],
    ['ඳ', 'ṉd'],
    ['ඟ', 'ṉg'],
    ['ඥ', 'gn'],
    # consonants romanized with a single letter
    ['ක', 'k'],
    ['ග', 'g'],
    ['ච', 'c'],
    ['ජ', 'j'],
    ['ඤ', 'ñ'],
    ['ට', 'ṭ'],
    ['ඩ', 'ḍ'],
    ['ණ', 'ṇ'],
    ['ත', 't'],
    ['ද', 'd'],
    ['න', 'n'],
    ['ප', 'p'],
    ['බ', 'b'],
    ['ම', 'm'],
    ['ය', 'y'],
    ['ර', 'r'],
    ['ල', 'l'],
    ['ව', 'v'],
    ['ශ', 'ś'],
    # same Sinhala key listed twice with alternate roman spellings
    ['ෂ', 'ş'],
    ['ෂ', 'ṣ'],
    ['ස', 's'],
    ['හ', 'h'],
    ['ළ', 'ḷ'],
    ['ෆ', 'f'],
]
# -- Combinations (consonant + vowel signs) --
# Each resulting entry is [roman_prefix, roman_vowel, sinhala_sign]. The
# prefix slot is empty for every row, so only the (roman_vowel, sinhala_sign)
# pairs are spelled out and the prefix is filled in while building the list.
ro_combinations = [
    ['', roman_vowel, sinhala_sign]
    for roman_vowel, sinhala_sign in (
        ('', '්'),    # sign only: consonant with no vowel
        ('a', ''),    # no sign: bare consonant gets 'a'
        ('ā', 'ා'),
        ('æ', 'ැ'),
        ('ǣ', 'ෑ'),
        ('i', 'ි'),
        ('ī', 'ී'),
        ('u', 'ු'),
        ('ū', 'ූ'),
        ('e', 'ෙ'),
        ('ē', 'ේ'),
        ('ai', 'ෛ'),
        ('o', 'ො'),
        ('ō', 'ෝ'),
        ('ṛ', 'ෘ'),
        ('ṝ', 'ෲ'),
        ('au', 'ෞ'),
        ('ḹ', 'ෳ'),
    )
]
# -- Generate consonant+vowel combos --
def create_conso_combi(combinations, consonants):
    """Build (sinhala, roman) pairs for every consonant/vowel-sign pairing.

    Args:
        combinations: Sequence of ``[roman_prefix, roman_vowel, sinhala_sign]``
            entries.
        consonants: Sequence of ``[sinhala_consonant, roman_consonant]``
            entries.

    Returns:
        A list of ``(sinhala, roman)`` tuples, grouped by combination in the
        order given, with the consonants in their given order inside each
        group. ``sinhala`` is the consonant followed by the sign; ``roman`` is
        prefix + roman consonant + roman vowel.
    """
    return [
        (letter[0] + vowel[2], vowel[0] + letter[1] + vowel[1])
        for vowel in combinations
        for letter in consonants
    ]
# Precompute the (sinhala, roman) pair for every consonant/vowel-sign
# combination once, when the module is loaded.
ro_conso_combi = create_conso_combi(ro_combinations, ro_consonants)
# -- Core replace function --
def replace_all(text, mapping):
    """Apply every (source, replacement) pair in *mapping* to *text*.

    Pairs are applied one after another, longest source first, so a
    multi-character source (for example consonant + vowel sign) is consumed
    before any shorter source contained in it. Pairs whose sources have the
    same length keep their relative order from *mapping*.

    Both sides of each pair are treated as literal text. Sources are not
    regular expressions and replacements are not regex templates, so
    characters such as ``.``, ``+`` or a backslash are matched and emitted
    verbatim.

    Args:
        text: The string to transform.
        mapping: Iterable of two-item sequences ``(source, replacement)``.

    Returns:
        The transformed string.
    """
    # sorted() is stable, so equal-length sources stay in the caller's order.
    ordered = sorted(mapping, key=lambda pair: len(pair[0]), reverse=True)
    for source, replacement in ordered:
        # Literal replacement: the tables are plain strings, not patterns.
        text = text.replace(source, replacement)
    return text
# -- Main Sinhala → Roman Function --
def sinhala_to_roman(text):
    """Romanize Sinhala *text* using the module's mapping tables.

    Zero-width joiners (U+200D) are stripped first. The consonant +
    vowel-sign table is applied before the specials table.

    Args:
        text: The Sinhala string to convert.

    Returns:
        The romanized string.
    """
    romanized = text.replace("\u200D", "")
    # Order matters: consonant + vowel-sign pairs first, then the specials.
    for table in (ro_conso_combi, ro_specials):
        romanized = replace_all(romanized, table)
    return romanized