# This helps converting Sinhala text to standard Romanized text import re # -- Specials (vowels, diacritics, standalone signs) -- ro_specials = [ ['ඓ', 'ai'], ['ඖ', 'au'], ['ඍ', 'ṛ'], ['ඎ', 'ṝ'], ['ඐ', 'ḹ'], ['අ', 'a'], ['ආ', 'ā'], ['ඇ', 'æ'], ['ඇ', 'Æ'], ['ඈ', 'ǣ'], ['ඉ', 'i'], ['ඊ', 'ī'], ['උ', 'u'], ['ඌ', 'ū'], ['එ', 'e'], ['ඒ', 'ē'], ['ඔ', 'o'], ['ඕ', 'ō'], ['ඞ්', 'ṅ'], ['ං', 'ṁ'], ['ං', 'ṃ'], ['ඃ', 'ḥ'], ['ඃ', 'Ḥ'], ] # -- Consonants -- ro_consonants = [ ['ඛ', 'kh'], ['ඨ', 'ṭh'], ['ඝ', 'gh'], ['ඡ', 'ch'], ['ඣ', 'jh'], ['ඦ', 'ñj'], ['ඪ', 'ḍh'], ['ඬ', 'ṇḍ'], ['ථ', 'th'], ['ධ', 'dh'], ['ඵ', 'ph'], ['භ', 'bh'], ['ඹ', 'mb'], ['ඳ', 'ṉd'], ['ඟ', 'ṉg'], ['ඥ', 'gn'], ['ක', 'k'], ['ග', 'g'], ['ච', 'c'], ['ජ', 'j'], ['ඤ', 'ñ'], ['ට', 'ṭ'], ['ඩ', 'ḍ'], ['ණ', 'ṇ'], ['ත', 't'], ['ද', 'd'], ['න', 'n'], ['ප', 'p'], ['බ', 'b'], ['ම', 'm'], ['ය', 'y'], ['ර', 'r'], ['ල', 'l'], ['ව', 'v'], ['ශ', 'ś'], ['ෂ', 'ş'], ['ෂ', 'ṣ'], ['ස', 's'], ['හ', 'h'], ['ළ', 'ḷ'], ['ෆ', 'f'] ] # -- Combinations (consonant + vowel signs) -- ro_combinations = [ ['', '', '්'], ['', 'a', ''], ['', 'ā', 'ා'], ['', 'æ', 'ැ'], ['', 'ǣ', 'ෑ'], ['', 'i', 'ි'], ['', 'ī', 'ී'], ['', 'u', 'ු'], ['', 'ū', 'ූ'], ['', 'e', 'ෙ'], ['', 'ē', 'ේ'], ['', 'ai', 'ෛ'], ['', 'o', 'ො'], ['', 'ō', 'ෝ'], ['', 'ṛ', 'ෘ'], ['', 'ṝ', 'ෲ'], ['', 'au', 'ෞ'], ['', 'ḹ', 'ෳ'] ] # -- Generate consonant+vowel combos -- def create_conso_combi(combinations, consonants): conso_combi = [] for combi in combinations: for conso in consonants: base_sinh = conso[0] + combi[2] base_rom = combi[0] + conso[1] + combi[1] conso_combi.append((base_sinh, base_rom)) return conso_combi ro_conso_combi = create_conso_combi(ro_combinations, ro_consonants) # -- Core replace function -- def replace_all(text, mapping): # sort by length (to handle longest matches first) mapping = sorted(mapping, key=lambda x: len(x[0]), reverse=True) for sinh, rom in mapping: text = re.sub(sinh, rom, text) return text # -- Main Sinhala → Roman Function -- def sinhala_to_roman(text): # remove ZWJ (zero-width joiner) text = text.replace("\u200D", "") # do consonant+vowel combos first text = replace_all(text, ro_conso_combi) # then specials text = replace_all(text, ro_specials) return text