# final # ================================================================ # = STEP 1: SETUP AND DOWNLOAD (YOUR PROVEN METHOD) = # ================================================================ import os print("--- 1. Installing All Libraries ---") print("✅ Libraries installed.") print("\n--- 2. Cloning IndicLID Repository ---") # Using your proven method of changing directories print("✅ Repository cloned.") # Navigate into the correct directory structure print("\n--- 3. Downloading and Unzipping IndicLID Models ---") print("✅ Download commands executed. Unzipping now...") print("✅ Unzip commands executed.") print("\n🎉🎉🎉 SETUP COMPLETE. You can now proceed to Step 2. 🎉🎉🎉") import shutil import os # Source folder path source = "/usr/local/lib/python3.12/dist-packages/transformers" # Destination folder path destination = "/content/IndicLID/Inference/ai4bharat/" # Ensure destination directory exists os.makedirs(destination, exist_ok=True) # Move folder moved_path = shutil.move(source, destination) print(f"Folder moved to: {moved_path}") # ========================= # = STEP 2: INITIALIZE MODELS (EXACTLY AS YOUR OLD CODE) = # ========================= import os import sys import torch print("--- Applying your original add_safe_globals fix... ---") if "/content/IndicLID/Inference" not in sys.path: sys.path.append("/content/IndicLID/Inference") from transformers.models.bert.modeling_bert import ( BertModel, BertPreTrainedModel, BertForSequenceClassification, BertEmbeddings, BertEncoder, BertPooler, BertLayer, BertAttention, BertSelfAttention, BertSelfOutput, BertIntermediate, BertOutput ) from transformers.models.bert.configuration_bert import BertConfig import torch.nn as nn from torch.nn.modules.sparse import Embedding from torch.nn.modules.container import ModuleList from torch.nn.modules.linear import Linear from torch.nn.modules.normalization import LayerNorm from torch.nn.modules.dropout import Dropout torch.serialization.add_safe_globals([ BertModel, BertPreTrainedModel, BertForSequenceClassification, BertEmbeddings, BertEncoder, BertPooler, BertLayer, BertAttention, BertSelfAttention, BertSelfOutput, BertIntermediate, BertOutput, BertConfig, Embedding, ModuleList, Linear, LayerNorm, Dropout, ]) print("✅ Comprehensive safe globals added successfully.") from transformers import AutoTokenizer, AutoModelForSeq2SeqLM from IndicTransToolkit.processor import IndicProcessor from ai4bharat.IndicLID import IndicLID print("--- Loading all models into memory... ---") device = torch.device("cuda" if torch.cuda.is_available() else "cpu") print(f"Using device: {device}") lid = IndicLID(input_threshold=0.5, roman_lid_threshold=0.6) print("✅ IndicLID model loaded successfully.") MODEL_ID = "ai4bharat/indictrans2-indic-en-1B" tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True) model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID, trust_remote_code=True).to(device) ip = IndicProcessor(inference=True) print("✅ IndicTrans2 1B model loaded.") print("🎉 ALL MODELS ARE LOADED. Proceed to direct batch prediction tests.") import sys print(sys.path) pip show transformers # ================================================================ # = STEP 2.5: LOAD ROMANSETU (COMPATIBLE WITH 4.40.2) = # ================================================================ from transformers import AutoTokenizer, AutoModelForCausalLM import torch print("--- Loading RomanSetu model compatible with transformers 4.40.2... ---") # Try smaller, more compatible models first model_options = [ "ai4bharat/romansetu-cpt-roman-100m", "ai4bharat/romansetu-cpt-roman-200m" ] rs_model = None rs_tokenizer = None for model_id in model_options: try: print(f"Trying model: {model_id}") rs_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) rs_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16).to(device) print(f"✅ {model_id} loaded successfully.") break except Exception as e: print(f"❌ {model_id} failed: {e}") continue if rs_model is None: print("❌ All RomanSetu models failed. Continuing with transliteration-based approach.") def translate_with_romansetu(text, max_new_tokens=50): if rs_model is None: # Fallback: use enhanced transliteration + IndicTrans2 from indic_transliteration import sanscript from indic_transliteration.sanscript import transliterate try: # Try to transliterate and then translate with IndicTrans2 native_text = transliterate(text, sanscript.ITRANS, sanscript.DEVANAGARI) pre = ip.preprocess_batch([native_text], src_lang="hin_Deva", tgt_lang="eng_Latn") inputs = tokenizer(pre, return_tensors="pt", padding=True).to(device) with torch.no_grad(): out = model.generate(**inputs, num_beams=3, max_length=100) dec = tokenizer.batch_decode(out, skip_special_tokens=True) post = ip.postprocess_batch(dec, lang="hin_Deva") return post[0] except: return text try: prompt = f"Translate this romanized Indian text to English: {text}" inputs = rs_tokenizer(prompt, return_tensors="pt").to(device) with torch.no_grad(): outputs = rs_model.generate( inputs.input_ids, max_new_tokens=max_new_tokens, num_beams=2, temperature=0.7, do_sample=True, pad_token_id=rs_tokenizer.eos_token_id ) full_response = rs_tokenizer.decode(outputs, skip_special_tokens=True) translation = full_response.replace(prompt, "").strip() return translation if translation and len(translation) > 2 else text except Exception as e: return text print("✅ RomanSetu/fallback translation function defined.") print("🎉 SETUP COMPLETE with fallback mechanism.") # ================================================================ # = STEP 2.6: LOAD INDICXLIT FOR BETTER TRANSLITERATION (CORRECTED) = # ================================================================ print("--- Installing and loading IndicXlit for better romanized text handling ---") # Install IndicXlit (compatible with your transformers==4.40.2) from ai4bharat.transliteration import XlitEngine import torch try: # Load IndicXlit engines for different languages (based on official docs) xlit_engines = { "hindi": XlitEngine("hi", beam_width=4, rescore=True), "bengali": XlitEngine("bn", beam_width=4, rescore=True), "tamil": XlitEngine("ta", beam_width=4, rescore=True), "telugu": XlitEngine("te", beam_width=4, rescore=True), "gujarati": XlitEngine("gu", beam_width=4, rescore=True), "kannada": XlitEngine("kn", beam_width=4, rescore=True), "malayalam": XlitEngine("ml", beam_width=4, rescore=True), "punjabi": XlitEngine("pa", beam_width=4, rescore=True), "marathi": XlitEngine("mr", beam_width=4, rescore=True), "urdu": XlitEngine("ur", beam_width=4, rescore=True), } print("✅ Multiple IndicXlit engines loaded successfully.") except Exception as e: print(f"❌ Error loading IndicXlit: {e}") print("💡 Falling back to basic transliteration.") xlit_engines = {} def enhanced_transliterate_with_xlit(text, target_lang): """ Enhanced transliteration using IndicXlit (based on official API) """ lang_key = target_lang.lower() if not xlit_engines or lang_key not in xlit_engines: # Fallback to your existing transliteration from indic_transliteration import sanscript from indic_transliteration.sanscript import transliterate script_map = { "hindi": sanscript.DEVANAGARI, "bengali": sanscript.BENGALI, "tamil": sanscript.TAMIL, "telugu": sanscript.TELUGU, "kannada": sanscript.KANNADA, "malayalam": sanscript.MALAYALAM, "gujarati": sanscript.GUJARATI, "punjabi": sanscript.GURMUKHI, "marathi": sanscript.DEVANAGARI, "urdu": 'urdu' } return transliterate(text, sanscript.ITRANS, script_map.get(lang_key, sanscript.DEVANAGARI)) try: # Use IndicXlit for better transliteration (official API) engine = xlit_engines[lang_key] # For sentences, use translit_sentence (returns dict with lang code as key) if ' ' in text: result = engine.translit_sentence(text) # Get the language code for this engine lang_codes = {"hindi": "hi", "bengali": "bn", "tamil": "ta", "telugu": "te", "gujarati": "gu", "kannada": "kn", "malayalam": "ml", "punjabi": "pa", "marathi": "mr", "urdu": "ur"} lang_code = lang_codes.get(lang_key, "hi") return result.get(lang_code, text) else: # For single words, use translit_word (returns dict with topk results) result = engine.translit_word(text, topk=1) lang_codes = {"hindi": "hi", "bengali": "bn", "tamil": "ta", "telugu": "te", "gujarati": "gu", "kannada": "kn", "malayalam": "ml", "punjabi": "pa", "marathi": "mr", "urdu": "ur"} lang_code = lang_codes.get(lang_key, "hi") return result.get(lang_code, [text])[0] except Exception as e: print(f"IndicXlit error for '{text}': {e}") # Fallback if IndicXlit fails return text print("✅ Enhanced transliteration function defined.") print("🎉 INDICXLIT SETUP COMPLETE.") import pandas as pd from indic_transliteration import sanscript from indic_transliteration.sanscript import transliterate # EXPANDED language mapping to handle misdetections LID_TO_TRANSLATE = { # Hindi variants "hin_Deva": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"}, "hin_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"}, # Maithili (often confused with Hindi) - map to Hindi "mai_Deva": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"}, "mai_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"}, # Bengali variants "ben_Beng": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"}, "ben_Latn": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"}, # Assamese (often confused with Bengali) - map to Bengali "asm_Beng": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"}, "asm_Latn": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"}, # Tamil variants "tam_Tamil": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"}, "tam_Taml": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"}, "tam_Latn": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"}, # Telugu variants "tel_Telu": {"name": "Telugu", "script": sanscript.TELUGU, "it_code": "tel_Telu"}, "tel_Latn": {"name": "Telugu", "script": sanscript.TELUGU, "it_code": "tel_Telu"}, # Kannada variants "kan_Knda": {"name": "Kannada", "script": sanscript.KANNADA, "it_code": "kan_Knda"}, "kan_Latn": {"name": "Kannada", "script": sanscript.KANNADA, "it_code": "kan_Knda"}, # Malayalam variants "mal_Mlym": {"name": "Malayalam", "script": sanscript.MALAYALAM, "it_code": "mal_Mlym"}, "mal_Latn": {"name": "Malayalam", "script": sanscript.MALAYALAM, "it_code": "mal_Mlym"}, # Gujarati variants "guj_Gujr": {"name": "Gujarati", "script": sanscript.GUJARATI, "it_code": "guj_Gujr"}, "guj_Latn": {"name": "Gujarati", "script": sanscript.GUJARATI, "it_code": "guj_Gujr"}, # Punjabi variants "pan_Guru": {"name": "Punjabi", "script": sanscript.GURMUKHI, "it_code": "pan_Guru"}, "pan_Latn": {"name": "Punjabi", "script": sanscript.GURMUKHI, "it_code": "pan_Guru"}, # Marathi variants "mar_Deva": {"name": "Marathi", "script": sanscript.DEVANAGARI, "it_code": "mar_Deva"}, "mar_Latn": {"name": "Marathi", "script": sanscript.DEVANAGARI, "it_code": "mar_Deva"}, # Urdu variants "urd_Arab": {"name": "Urdu", "script": 'urdu', "it_code": "urd_Arab"}, "urd_Latn": {"name": "Urdu", "script": 'urdu', "it_code": "urd_Arab"}, # Additional commonly misdetected languages "snd_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"}, # Sindhi → Hindi "nep_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"}, # Nepali → Hindi "kok_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"}, # Konkani → Hindi "gom_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"}, # Goan Konkani → Hindi "brx_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"}, # Bodo → Hindi } def enhanced_transliterate_robust(text, target_script): """ Enhanced transliteration with better romanization handling """ try: # Preprocess text for better transliteration cleaned_text = text.lower().strip() # Handle common romanization patterns replacements = { 'kh': 'kh', 'ch': 'ch', 'th': 'th', 'ph': 'ph', 'bh': 'bh', 'dh': 'dh', 'gh': 'gh', 'jh': 'jh', 'aa': 'A', 'ee': 'I', 'oo': 'U', 'ou': 'au' } for old, new in replacements.items(): cleaned_text = cleaned_text.replace(old, new) # Transliterate using your existing library result = transliterate(cleaned_text, sanscript.ITRANS, target_script) return result if result else text except Exception as e: print(f"Transliteration error: {e}") return text def detect_and_translate_robust(texts, batch_size=64): """ Robust detection and translation with expanded language mapping """ results = [] preds = lid.batch_predict(texts, batch_size) for item in preds: if isinstance(item, dict): text = item.get("text", "") lang_code = item.get("lang", item.get("pred_lang", "")) score = float(item.get("score", 0.0)) model_name = item.get("model", "") else: text, lang_code, score, model_name = item is_romanized = lang_code.endswith("_Latn") if lang_code not in LID_TO_TRANSLATE: translation = f"Language '{lang_code}' not supported for translation" method = "Unsupported" else: try: lang_info = LID_TO_TRANSLATE[lang_code] src_code = lang_info["it_code"] if is_romanized: # Use enhanced transliteration native_text = enhanced_transliterate_robust(text, lang_info["script"]) method = f"Enhanced Transliteration + IndicTrans2 (detected as {lang_code})" print(f"Enhanced: '{text}' → '{native_text}' (detected: {lang_code})") else: native_text = text method = f"IndicTrans2 (detected as {lang_code})" # Translate with IndicTrans2 pre = ip.preprocess_batch([native_text], src_lang=src_code, tgt_lang="eng_Latn") inputs = tokenizer(pre, return_tensors="pt", padding=True).to(device) with torch.no_grad(): out = model.generate(**inputs, num_beams=5, max_length=256, early_stopping=True) dec = tokenizer.batch_decode(out, skip_special_tokens=True) post = ip.postprocess_batch(dec, lang=src_code) translation = post[0] except Exception as e: translation = f"Translation error: {str(e)}" method = "Error" results.append({ "original_text": text, "detected_lang": lang_code, "script_type": "Romanized" if is_romanized else "Native", "confidence": f"{score:.3f}", "translation_method": method, "english_translation": translation }) return pd.DataFrame(results) print("✅ Robust translation function with expanded language mapping defined") # Test with the same samples sample_texts = [ "यहाँ कितने लोग हैं?", "tum kaha ho", "aaj mausam suhana hai", "aap kaise hain", "আমি ভালো আছি।", "ami bhalo achi", "mera naam rahul hai", "main office jaa raha hun" ] print(f"🔍 Testing robust approach with expanded language mapping...") df_results = detect_and_translate_robust(sample_texts, batch_size=16) display(df_results) # ================================================================ # = COMPLETE TEST CODE FOR ALL 22 INDIAN LANGUAGES = # ================================================================ import pandas as pd from indic_transliteration import sanscript from indic_transliteration.sanscript import transliterate # Official 22 Indian languages sample sentences (native + romanized) sample_sentences = { "Assamese": ("আপুনি কেনেকৈ আছেন?", "apuni kenekoi asen?"), "Bengali": ("তুমি কেমন আছো?", "tumi kemon acho?"), "Bodo": ("नांगनि फाथै खौ?", "nangni phathai kho?"), "Dogri": ("तुसीं केहे हो?", "tusi kehe ho?"), "Gujarati": ("તમે કેમ છો?", "tame kem cho?"), "Hindi": ("तुम कैसे हो?", "tum kaise ho?"), "Kannada": ("ನೀವು ಹೇಗಿದ್ದೀರಾ?", "neevu hegiddira?"), "Kashmiri": ("तुस की छै?", "tus ki chhai?"), "Konkani": ("तुम कशें आसा?", "tum kashen asa?"), "Maithili": ("अहाँ कथी छी?", "ahaan kathi chhi?"), "Malayalam": ("സുഖമായിരോ?", "sukhamaayiro?"), "Manipuri": ("नमस्कार, नखोंगबा तौ?", "namaskaar, nakhongba tau?"), "Marathi": ("तू कसा आहेस?", "tu kasa ahes?"), "Nepali": ("तिमी कस्तो छौ?", "timi kasto chau?"), "Odia": ("ତୁମେ କେମିତି ଅଛ?", "tume kemiti achha?"), "Punjabi": ("ਤੁਸੀਂ ਕਿਵੇਂ ਹੋ?", "tusi kiven ho?"), "Sanskrit": ("भवतः कथम् अस्ति?", "bhavatah katham asti?"), "Santali": ("ᱥᱟᱱᱛᱟᱲᱤ ᱠᱚᱱᱛᱮᱞᱤ ᱟᱹᱲᱤ?", "santalii konteli adii?"), "Sindhi": ("توهان ڪيئن آهيو؟", "tohan kayn aahiyo?"), "Tamil": ("நீங்கள் எப்படி இருக்கிறீர்கள்?", "neenga epdi irukeenga?"), "Telugu": ("మీరు ఎలా ఉన్నారు?", "meeru ela unnaru?"), "Urdu": ("آپ کیسے ہیں؟", "aap kaise hain?") } # Expanded language mapping (covers common misdetections) LID_TO_TRANSLATE = { # Hindi variants "hin_Deva": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"}, "hin_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"}, "mai_Deva": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"}, # Maithili→Hindi "mai_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"}, "nep_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"}, # Nepali→Hindi "snd_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"}, # Sindhi→Hindi "kok_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"}, # Konkani→Hindi "brx_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"}, # Bodo→Hindi # Bengali variants "ben_Beng": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"}, "ben_Latn": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"}, "asm_Beng": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"}, # Assamese→Bengali "asm_Latn": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"}, # Tamil variants "tam_Tamil": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"}, "tam_Taml": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"}, "tam_Latn": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"}, # Telugu variants "tel_Telu": {"name": "Telugu", "script": sanscript.TELUGU, "it_code": "tel_Telu"}, "tel_Latn": {"name": "Telugu", "script": sanscript.TELUGU, "it_code": "tel_Telu"}, # Kannada variants "kan_Knda": {"name": "Kannada", "script": sanscript.KANNADA, "it_code": "kan_Knda"}, "kan_Latn": {"name": "Kannada", "script": sanscript.KANNADA, "it_code": "kan_Knda"}, # Malayalam variants "mal_Mlym": {"name": "Malayalam", "script": sanscript.MALAYALAM, "it_code": "mal_Mlym"}, "mal_Latn": {"name": "Malayalam", "script": sanscript.MALAYALAM, "it_code": "mal_Mlym"}, # Gujarati variants "guj_Gujr": {"name": "Gujarati", "script": sanscript.GUJARATI, "it_code": "guj_Gujr"}, "guj_Latn": {"name": "Gujarati", "script": sanscript.GUJARATI, "it_code": "guj_Gujr"}, # Punjabi variants "pan_Guru": {"name": "Punjabi", "script": sanscript.GURMUKHI, "it_code": "pan_Guru"}, "pan_Latn": {"name": "Punjabi", "script": sanscript.GURMUKHI, "it_code": "pan_Guru"}, # Marathi variants "mar_Deva": {"name": "Marathi", "script": sanscript.DEVANAGARI, "it_code": "mar_Deva"}, "mar_Latn": {"name": "Marathi", "script": sanscript.DEVANAGARI, "it_code": "mar_Deva"}, # Urdu variants "urd_Arab": {"name": "Urdu", "script": 'urdu', "it_code": "urd_Arab"}, "urd_Latn": {"name": "Urdu", "script": 'urdu', "it_code": "urd_Arab"}, } def enhanced_transliterate_robust(text, target_script): """Enhanced transliteration with better romanization handling""" try: cleaned_text = text.lower().strip() replacements = { 'kh': 'kh', 'ch': 'ch', 'th': 'th', 'ph': 'ph', 'bh': 'bh', 'dh': 'dh', 'gh': 'gh', 'jh': 'jh', 'aa': 'A', 'ee': 'I', 'oo': 'U', 'ou': 'au' } for old, new in replacements.items(): cleaned_text = cleaned_text.replace(old, new) result = transliterate(cleaned_text, sanscript.ITRANS, target_script) return result if result else text except Exception as e: print(f"Transliteration error: {e}") return text def test_all_22_languages(texts, batch_size=32): """Complete testing function for all 22 languages""" results = [] preds = lid.batch_predict(texts, batch_size) for item in preds: if isinstance(item, dict): text = item.get("text", "") lang_code = item.get("lang", item.get("pred_lang", "")) score = float(item.get("score", 0.0)) model_name = item.get("model", "") else: text, lang_code, score, model_name = item is_romanized = lang_code.endswith("_Latn") if lang_code not in LID_TO_TRANSLATE: translation = f"Language '{lang_code}' not supported" method = "Unsupported" else: try: lang_info = LID_TO_TRANSLATE[lang_code] src_code = lang_info["it_code"] if is_romanized: native_text = enhanced_transliterate_robust(text, lang_info["script"]) method = f"Transliteration+IndicTrans2 (detected: {lang_code})" print(f"Romanized: '{text}' → '{native_text}'") else: native_text = text method = f"IndicTrans2 (detected: {lang_code})" # Translate with IndicTrans2 pre = ip.preprocess_batch([native_text], src_lang=src_code, tgt_lang="eng_Latn") inputs = tokenizer(pre, return_tensors="pt", padding=True).to(device) with torch.no_grad(): out = model.generate(**inputs, num_beams=5, max_length=256, early_stopping=True) dec = tokenizer.batch_decode(out, skip_special_tokens=True) post = ip.postprocess_batch(dec, lang=src_code) translation = post[0] except Exception as e: translation = f"Translation error: {str(e)}" method = "Error" results.append({ "language": text[:20] + "..." if len(text) > 20 else text, "original_text": text, "detected_lang": lang_code, "script_type": "Romanized" if is_romanized else "Native", "confidence": f"{score:.3f}", "method": method, "english_translation": translation }) return pd.DataFrame(results) # Create test dataset with all 44 samples (22 native + 22 romanized) print("🔍 Creating test dataset for all 22 official Indian languages...") all_test_texts = [] for lang, (native, roman) in sample_sentences.items(): all_test_texts.append(native) all_test_texts.append(roman) print(f"📊 Testing {len(all_test_texts)} samples ({len(sample_sentences)} languages × 2 scripts)...") # Run the complete test df_results = test_all_22_languages(all_test_texts, batch_size=32) # Display results print("\n🎯 COMPLETE TEST RESULTS:") display(df_results) # Summary statistics print(f"\n📈 SUMMARY STATISTICS:") print(f"Total samples tested: {len(df_results)}") print(f"Languages detected: {df_results['detected_lang'].nunique()}") print(f"Native script samples: {len(df_results[df_results['script_type'] == 'Native'])}") print(f"Romanized samples: {len(df_results[df_results['script_type'] == 'Romanized'])}") print(f"Successfully translated: {len(df_results[~df_results['english_translation'].str.contains('error|not supported', case=False)])}") import pandas as pd def detailed_translation_summary(df_results): """ Generate comprehensive detailed summary of translation results """ # Flag successful translations df_results['successful_translation'] = ~df_results['english_translation'].str.contains('error|not supported', case=False, na=False) print("\n=========== OVERALL SUMMARY ===========") print(f"Total samples tested: {len(df_results)}") print(f"Languages detected: {df_results['detected_lang'].nunique()}") print(f"Native script samples: {df_results[df_results['script_type'] == 'Native'].shape[0]}") print(f"Romanized samples: {df_results[df_results['script_type'] == 'Romanized'].shape}") print(f"Successfully translated: {df_results['successful_translation'].sum()}") overall_success_rate = (df_results['successful_translation'].sum() / len(df_results) * 100) print(f"Overall success rate: {overall_success_rate:.1f}%") print("\n=========== DETAILED LANGUAGE BREAKDOWN ===========") # Per-language analysis lang_summary = df_results.groupby('detected_lang').agg( total_samples=('original_text', 'count'), native_count=('script_type', lambda x: (x == 'Native').sum()), romanized_count=('script_type', lambda x: (x == 'Romanized').sum()), mean_confidence=('confidence', lambda x: pd.to_numeric(x, errors='coerce').mean()), success=('successful_translation', 'sum'), error_count=('successful_translation', lambda x: (~x).sum()) ).reset_index().sort_values('total_samples', ascending=False) lang_summary['success_rate'] = (lang_summary['success'] / lang_summary['total_samples'] * 100).round(1) print(lang_summary) print("\n=========== TOP PERFORMING LANGUAGES ===========") top_performers = lang_summary[lang_summary['success_rate'] >= 90].sort_values('success_rate', ascending=False) if len(top_performers) > 0: print(top_performers[['detected_lang', 'total_samples', 'success_rate']]) else: print("No languages with 90%+ success rate") print("\n=========== CHALLENGING LANGUAGES ===========") challenging = lang_summary[lang_summary['success_rate'] < 50].sort_values('success_rate') if len(challenging) > 0: print(challenging[['detected_lang', 'total_samples', 'success_rate']]) else: print("No languages with <50% success rate") print("\n=========== ERROR ANALYSIS ===========") error_df = df_results[~df_results['successful_translation']] print(f"Total errors: {len(error_df)}") if len(error_df) > 0: print("\nError samples:") print(error_df[['original_text', 'detected_lang', 'script_type', 'confidence', 'english_translation']]) else: print("No errors found!") print("\n=========== SUCCESS BREAKDOWN BY SCRIPT ===========") script_summary = df_results.groupby('script_type').agg( total_samples=('original_text', 'count'), successful=('successful_translation', 'sum'), success_rate=('successful_translation', lambda x: x.mean() * 100) ).round(1) print(script_summary) print("\n=========== DETECTION CONFIDENCE ANALYSIS ===========") confidence_summary = lang_summary[['detected_lang', 'mean_confidence']].sort_values('mean_confidence', ascending=False) print("Top 10 most confident detections:") print(confidence_summary.head(10)) return lang_summary, script_summary, error_df # ===== HOW TO USE ===== print("✅ Detailed summary function defined") print("\n📋 To run on your test results:") print(" lang_summary, script_summary, error_df = detailed_translation_summary(df_results)") print(" display(lang_summary)") print(" display(error_df)") lang_summary, script_summary, error_df = detailed_translation_summary(df_results) display(lang_summary) display(error_df)