Spaces:

SagarVelamuri
/

TranslationSpace

Sleeping

App Files Files Community

SagarVelamuri committed 27 days ago

Commit

cd9cc66

verified ·

1 Parent(s): 448b0e7

Update app.py

Browse files

Files changed (1) hide show

app.py +61 -26

app.py CHANGED Viewed

@@ -1,4 +1,4 @@
-import os, re, types, traceback, torch, gradio as gr
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 from IndicTransToolkit import IndicProcessor
 import spacy
@@ -14,8 +14,7 @@ TE_CODE  = "tel_Telu"
 ip = IndicProcessor(inference=True)
-# --------------------- Sentence Splitting (spaCy) ---------------------
-import spacy
 try:
     nlp = spacy.load("en_core_web_sm")
 except OSError:
@@ -23,13 +22,44 @@ except OSError:
     download("en_core_web_sm")
     nlp = spacy.load("en_core_web_sm")
 def split_into_sentences(text):
     """Return the non-empty, whitespace-trimmed sentences spaCy finds in ``text``.

     The input is stripped before being run through the module-level ``nlp``
     pipeline; sentences that are empty after trimming are dropped.
     """
     sentences = []
     for span in nlp(text.strip()).sents:
         trimmed = span.text.strip()
         if trimmed:
             sentences.append(trimmed)
     return sentences
-# --------------------- Cleanup Helper ---------------------
 def clean_translation(text):
     """Delete leftover ``<IDn>`` placeholder tags, then trim the ends of ``text``."""
     placeholder = re.compile(r"<ID\d+>")
     without_tags = placeholder.sub("", text)
     return without_tags.strip()
@@ -57,7 +87,6 @@ def load_model(model_name: str):
         low_cpu_mem_usage=True, dtype=dtype, token=token
     ).to(device).eval()
-    # Fix vocab mismatch if any
     try:
         mdl.config.vocab_size = mdl.get_output_embeddings().weight.shape[0]
     except Exception:
@@ -66,23 +95,25 @@ def load_model(model_name: str):
     _model_cache[model_name] = (tok, mdl)
     return tok, mdl
-# --------------------- Streaming Translation ---------------------
 @torch.inference_mode()
 def translate_dual_stream(text, model_choice, num_beams, max_new):
-    """Generator that yields progressive Hindi & Telugu translations one sentence at a time."""
     if not text or not text.strip():
         yield "", ""
         return
     tok, mdl = load_model(MODELS[model_choice])
     sentences = split_into_sentences(text)
     hi_acc, te_acc = [], []
-    # Yield empty for immediate UI update
-    yield "", ""
     for i, sentence in enumerate(sentences, 1):
-        # --- Hindi Translation ---
         try:
             batch_hi = ip.preprocess_batch([sentence], src_lang=SRC_CODE, tgt_lang=HI_CODE)
             enc_hi = tok(batch_hi, max_length=256, truncation=True, padding=True, return_tensors="pt").to(device)
@@ -97,11 +128,17 @@ def translate_dual_stream(text, model_choice, num_beams, max_new):
             )
             dec_hi = tok.batch_decode(out_hi, skip_special_tokens=True, clean_up_tokenization_spaces=True)
             post_hi = ip.postprocess_batch(dec_hi, lang=HI_CODE)
-            hi_acc.append(clean_translation(post_hi[0]))
         except Exception as e:
             hi_acc.append(f"⚠️ Hindi failed (sentence {i}): {e}")
-        # --- Telugu Translation ---
         try:
             batch_te = ip.preprocess_batch([sentence], src_lang=SRC_CODE, tgt_lang=TE_CODE)
             enc_te = tok(batch_te, max_length=256, truncation=True, padding=True, return_tensors="pt").to(device)
@@ -120,7 +157,6 @@ def translate_dual_stream(text, model_choice, num_beams, max_new):
         except Exception as e:
             te_acc.append(f"⚠️ Telugu failed (sentence {i}): {e}")
-        # Stream progressive output
         yield (" ".join(hi_acc), " ".join(te_acc))
 # --------------------- Dark Theme ---------------------
@@ -148,7 +184,7 @@ CUSTOM_CSS = """
 textarea { background:#0b0f19 !important; color:#f9fafb !important; border-radius:8px !important; border:1px solid #374151 !important; font-size:15px !important; line-height:1.55; }
 button { border-radius:8px !important; font-weight:600 !important; }
-/* Make all component labels readable on dark bg */
 .gradio-container label,
 .gradio-container .label,
 .gradio-container .block-title,
@@ -157,7 +193,7 @@ button { border-radius:8px !important; font-weight:600 !important; }
   color:#093999 !important;
 }
-/* --- Dropdown: dark text on white field/menu --- */
 #model_dd .wrap,
 #model_dd .container {
   background:#111827 !important;
@@ -169,19 +205,19 @@ button { border-radius:8px !important; font-weight:600 !important; }
 #model_dd ::placeholder,
 #model_dd select,
 #model_dd option {
-  color: #ffffff!important; /* dark text */
   background:#111827 !important;
 }
 #model_dd .options,
 #model_dd .options .item {
   background:#111827 !important;
-  color: #ffffff !important;
 }
-#model_dd label { /* the component's own label */
   color:#efe4b0 !important;
 }
-/* Sliders: keep labels visible */
 .gradio-container .range-block label,
 .gradio-container .gr-slider label {
   color:#efe4b0 !important;
@@ -192,7 +228,7 @@ button { border-radius:8px !important; font-weight:600 !important; }
 with gr.Blocks(theme=THEME, css=CUSTOM_CSS, title="EN → HI/TE Translator") as demo:
     with gr.Group(elem_id="hdr"):
         gr.Markdown("<h1>English → Hindi & Telugu Translator</h1>")
-        gr.Markdown("<p>IndicTrans2 with simplified preprocessing and sentence-wise translation</p>")
     model_choice = gr.Dropdown(
         label="Choose Model",
@@ -205,10 +241,10 @@ with gr.Blocks(theme=THEME, css=CUSTOM_CSS, title="EN → HI/TE Translator") as
         with gr.Column(scale=2):
             with gr.Group(elem_classes="panel"):
                 gr.Markdown("<h2>English Input</h2>")
-                src = gr.Textbox(lines=12, placeholder="Enter English...", show_label=False)
             with gr.Row():
                 translate_btn = gr.Button("Translate", variant="primary")
-                clear_btn     = gr.Button("Clear", variant="secondary")
         with gr.Column(scale=2):
             with gr.Group(elem_classes="panel"):
@@ -222,15 +258,14 @@ with gr.Blocks(theme=THEME, css=CUSTOM_CSS, title="EN → HI/TE Translator") as
             with gr.Group(elem_classes="panel"):
                 gr.Markdown("<h2>Settings</h2>")
                 num_beams = gr.Slider(1, 8, value=4, step=1, label="Beam Search", elem_id="model_dd")
-                max_new   = gr.Slider(32, 512, value=128, step=16, label="Max New Tokens", elem_id="model_dd")
-    # Stream generator connection
     translate_btn.click(
         translate_dual_stream,
         inputs=[src, model_choice, num_beams, max_new],
         outputs=[hi_out, te_out]
     )
     clear_btn.click(lambda: ("", "", ""), outputs=[src, hi_out, te_out])
-# Enable queue for streaming
 demo.queue(max_size=48).launch()

+import os, re, torch, gradio as gr
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 from IndicTransToolkit import IndicProcessor
 import spacy
 ip = IndicProcessor(inference=True)
+# --------------------- spaCy Sentence Splitter ---------------------
 try:
     nlp = spacy.load("en_core_web_sm")
 except OSError:
     download("en_core_web_sm")
     nlp = spacy.load("en_core_web_sm")
 def split_into_sentences(text):
     """Return the non-empty, whitespace-trimmed sentences spaCy finds in ``text``.

     The input is stripped before being run through the module-level ``nlp``
     pipeline; sentences that are empty after trimming are dropped.
     """
     sentences = []
     for span in nlp(text.strip()).sents:
         trimmed = span.text.strip()
         if trimmed:
             sentences.append(trimmed)
     return sentences
# --------------------- Abbreviation Expansion ---------------------
# Lower-case abbreviations (trailing dot included) mapped to their full forms.
ABBREVIATION_MAP = {
    "subs.": "subsection",
    "cl.": "clause",
    "art.": "article",
    "sec.": "section",
    "s.": "section",
    "no.": "number",
    "sch.": "schedule",
    "para.": "paragraph",
    "r.": "rule",
    "reg.": "regulation",
    "dept.": "department",
}

# An abbreviation matches only when it is not preceded by a letter (so the
# "s." inside "subs." or "class." is left alone) and is followed, after
# optional whitespace, by "(", a digit, or a letter.
# NOTE(review): the look-ahead also fires when an ordinary word ends a sentence
# and another word follows (e.g. "no. Then"); tighten it if that shows up in
# real input.
_ABBR_PATTERN = re.compile(
    r'(?<![A-Za-z])(' + '|'.join(re.escape(k) for k in ABBREVIATION_MAP.keys()) + r')(?=\s*(?:\(|\d|[A-Z]|[a-z]))',
    flags=re.IGNORECASE
)


def expand_abbreviations(text: str) -> str:
    """Replace known abbreviations in ``text`` with their full forms.

    Matching is case-insensitive and the casing of the matched abbreviation
    is carried over to the expansion:

    * all-caps with two or more letters ("SEC.") -> upper case ("SECTION");
    * leading capital ("Sec.", "S.") -> capitalised ("Section");
    * anything else -> the lower-case expansion from ``ABBREVIATION_MAP``.

    Args:
        text: Input text, possibly containing abbreviations.

    Returns:
        ``text`` with every match of ``_ABBR_PATTERN`` expanded.
    """
    def replacer(match):
        key = match.group(0)
        repl = ABBREVIATION_MAP.get(key.lower(), key)
        # A single capital letter ("S.") also satisfies str.isupper(); it
        # cannot signal all-caps text, so it takes the capitalised branch.
        letter_count = sum(1 for ch in key if ch.isalpha())
        if letter_count > 1 and key.isupper():
            return repl.upper()
        if key[0].isupper():
            return repl.capitalize()
        return repl

    return _ABBR_PATTERN.sub(replacer, text)
+# --------------------- Clean Up Placeholder Tags ---------------------
 def clean_translation(text):
     """Delete leftover ``<IDn>`` placeholder tags, then trim the ends of ``text``."""
     placeholder = re.compile(r"<ID\d+>")
     without_tags = placeholder.sub("", text)
     return without_tags.strip()
         low_cpu_mem_usage=True, dtype=dtype, token=token
     ).to(device).eval()
     try:
         mdl.config.vocab_size = mdl.get_output_embeddings().weight.shape[0]
     except Exception:
     _model_cache[model_name] = (tok, mdl)
     return tok, mdl
+# --------------------- Translation ---------------------
 @torch.inference_mode()
 def translate_dual_stream(text, model_choice, num_beams, max_new):
+    """Stream Hindi and Telugu translations, one sentence at a time."""
     if not text or not text.strip():
         yield "", ""
         return
     tok, mdl = load_model(MODELS[model_choice])
+    # Expand known abbreviations
+    text = expand_abbreviations(text)
     sentences = split_into_sentences(text)
     hi_acc, te_acc = [], []
+    yield "", ""  # Clear UI early
     for i, sentence in enumerate(sentences, 1):
+        # --- Hindi ---
         try:
             batch_hi = ip.preprocess_batch([sentence], src_lang=SRC_CODE, tgt_lang=HI_CODE)
             enc_hi = tok(batch_hi, max_length=256, truncation=True, padding=True, return_tensors="pt").to(device)
             )
             dec_hi = tok.batch_decode(out_hi, skip_special_tokens=True, clean_up_tokenization_spaces=True)
             post_hi = ip.postprocess_batch(dec_hi, lang=HI_CODE)
+            hi_text = clean_translation(post_hi[0])
+            # Optionally ensure danda for Hindi if missing
+            if not re.search(r"[।?!…]$", hi_text):
+                hi_text += "।"
+            hi_acc.append(hi_text)
         except Exception as e:
             hi_acc.append(f"⚠️ Hindi failed (sentence {i}): {e}")
+        # --- Telugu ---
         try:
             batch_te = ip.preprocess_batch([sentence], src_lang=SRC_CODE, tgt_lang=TE_CODE)
             enc_te = tok(batch_te, max_length=256, truncation=True, padding=True, return_tensors="pt").to(device)
         except Exception as e:
             te_acc.append(f"⚠️ Telugu failed (sentence {i}): {e}")
         yield (" ".join(hi_acc), " ".join(te_acc))
 # --------------------- Dark Theme ---------------------
 textarea { background:#0b0f19 !important; color:#f9fafb !important; border-radius:8px !important; border:1px solid #374151 !important; font-size:15px !important; line-height:1.55; }
 button { border-radius:8px !important; font-weight:600 !important; }
+/* Labels */
 .gradio-container label,
 .gradio-container .label,
 .gradio-container .block-title,
   color:#093999 !important;
 }
+/* Dropdown Styling */
 #model_dd .wrap,
 #model_dd .container {
   background:#111827 !important;
 #model_dd ::placeholder,
 #model_dd select,
 #model_dd option {
+  color:#ffffff!important;
   background:#111827 !important;
 }
 #model_dd .options,
 #model_dd .options .item {
   background:#111827 !important;
+  color:#ffffff !important;
 }
+#model_dd label {
   color:#efe4b0 !important;
 }
+/* Slider labels */
 .gradio-container .range-block label,
 .gradio-container .gr-slider label {
   color:#efe4b0 !important;
 with gr.Blocks(theme=THEME, css=CUSTOM_CSS, title="EN → HI/TE Translator") as demo:
     with gr.Group(elem_id="hdr"):
         gr.Markdown("<h1>English → Hindi & Telugu Translator</h1>")
+        gr.Markdown("<p>IndicTrans2 with abbreviation expansion and sentence-wise translation</p>")
     model_choice = gr.Dropdown(
         label="Choose Model",
         with gr.Column(scale=2):
             with gr.Group(elem_classes="panel"):
                 gr.Markdown("<h2>English Input</h2>")
+                src = gr.Textbox(lines=12, placeholder="Enter English text...", show_label=False)
             with gr.Row():
                 translate_btn = gr.Button("Translate", variant="primary")
+                clear_btn = gr.Button("Clear", variant="secondary")
         with gr.Column(scale=2):
             with gr.Group(elem_classes="panel"):
             with gr.Group(elem_classes="panel"):
                 gr.Markdown("<h2>Settings</h2>")
                 num_beams = gr.Slider(1, 8, value=4, step=1, label="Beam Search", elem_id="model_dd")
+                max_new = gr.Slider(32, 512, value=128, step=16, label="Max New Tokens", elem_id="model_dd")
     translate_btn.click(
         translate_dual_stream,
         inputs=[src, model_choice, num_beams, max_new],
         outputs=[hi_out, te_out]
     )
     clear_btn.click(lambda: ("", "", ""), outputs=[src, hi_out, te_out])
 demo.queue(max_size=48).launch()