Spaces:

SagarVelamuri
/

TranslationSpace

Sleeping

App Files Files Community

SagarVelamuri committed on Sep 5

Commit

4151903

verified ·

1 Parent(s): 683b0a0

Update app.py

Browse files

Files changed (1) hide show

app.py +174 -123

app.py CHANGED Viewed

@@ -1,177 +1,228 @@
-import os, traceback, types, torch, gradio as gr
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
-# Robust import for IndicProcessor
-try:
-    from IndicTransToolkit import IndicProcessor
-except Exception:
-    from IndicTransToolkit.IndicTransToolkit import IndicProcessor
-# -------- Config --------
-TOKENIZER_ID = os.getenv("TOKENIZER_ID", "ai4bharat/indictrans2-en-indic-1B")
-MODEL_ID     = os.getenv("MODEL_ID",     "law-ai/InLegalTrans-En2Indic-1B")
-TOKENIZER_REV, MODEL_REV = os.getenv("TOKENIZER_REV"), os.getenv("MODEL_REV")
 SRC_CODE = "eng_Latn"
 HI_CODE  = "hin_Deva"
 TE_CODE  = "tel_Telu"
-# -------- Model Load --------
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-dtype = torch.float16 if torch.cuda.is_available() else torch.float32
-tok_kwargs = dict(trust_remote_code=True, use_fast=True)
-if TOKENIZER_REV: tok_kwargs["revision"] = TOKENIZER_REV
-tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_ID, **tok_kwargs)
-mdl_kwargs = dict(trust_remote_code=True, attn_implementation="eager",
-                  low_cpu_mem_usage=True, dtype=dtype)
-if MODEL_REV: mdl_kwargs["revision"] = MODEL_REV
-model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID, **mdl_kwargs).to(device).eval()
-# Ensure generation config is correct
-if getattr(model.generation_config, "pad_token_id", None) is None:
-    model.generation_config.pad_token_id = tokenizer.pad_token_id or tokenizer.eos_token_id
-if getattr(model.generation_config, "eos_token_id", None) is None and tokenizer.eos_token_id is not None:
-    model.generation_config.eos_token_id = tokenizer.eos_token_id
-def _ensure_vocab_consistency(md, tok):
-    try:
-        actual_vocab = md.get_output_embeddings().weight.shape[0]
-    except Exception: actual_vocab = None
-    if actual_vocab:
-        md.config.vocab_size = actual_vocab
-        md.generation_config.vocab_size = actual_vocab
-    else:
-        vs = getattr(tok, "vocab_size", len(tok) if hasattr(tok, "__len__") else 64000)
-        md.config.vocab_size = vs
-        md.generation_config.vocab_size = vs
-    if not hasattr(md.config, "get_text_config"):
-        md.config.get_text_config = types.MethodType(lambda self: self, md.config)
-_ensure_vocab_consistency(model, tokenizer)
-for obj in (model.config, model.generation_config):
-    try: setattr(obj, "use_cache", False)
-    except: pass
-# Processor
-ip = IndicProcessor(inference=True)
-# -------- Inference --------
 @torch.inference_mode()
-def _translate_to_lang(text, tgt_code, num_beams, max_new_tokens, temperature, top_p, top_k):
-    batch = ip.preprocess_batch([text], src_lang=SRC_CODE, tgt_lang=tgt_code)
-    enc = tokenizer(batch, max_length=256, truncation=True, padding="longest",
-                    return_tensors="pt").to(device)
-    do_sample = (temperature and float(temperature) > 0)
-    out = model.generate(
-        **enc,
-        max_new_tokens=int(max_new_tokens),
-        num_beams=int(num_beams),
-        do_sample=do_sample,
-        temperature=float(temperature) if do_sample else None,
-        top_p=float(top_p) if do_sample else None,
-        top_k=int(top_k) if do_sample else None,
-        use_cache=False,
-    )
-    decoded = tokenizer.batch_decode(out, skip_special_tokens=True)
-    final = ip.postprocess_batch(decoded, lang=tgt_code)
-    return final[0].strip()
-def translate_dual(text, num_beams, max_new_tokens, temperature, top_p, top_k):
-    text = text.strip()
-    if not text: return "", ""
     try:
-        hi = _translate_to_lang(text, HI_CODE, num_beams, max_new_tokens, temperature, top_p, top_k)
     except Exception as e:
-        hi = f"⚠️ Hindi error: {type(e).__name__}: {str(e).splitlines()[-1]}"
     try:
-        te = _translate_to_lang(text, TE_CODE, num_beams, max_new_tokens, temperature, top_p, top_k)
     except Exception as e:
-        te = f"⚠️ Telugu error: {type(e).__name__}: {str(e).splitlines()[-1]}"
     return hi, te
-# -------- Theme & Styling --------
-THEME = gr.themes.Base(
     primary_hue="blue", neutral_hue="slate"
 ).set(
-    body_background_fill="#f9fafb",
-    body_text_color="#111827",
-    block_background_fill="#ffffff",
-    block_border_color="#e5e7eb",
-    block_title_text_color="#111827",
     button_primary_background_fill="#2563eb",
-    button_primary_text_color="#ffffff"
 )
 CUSTOM_CSS = """
-#hdr {
-  text-align:center; padding:16px; margin-bottom:16px;
-}
-#hdr h1 { font-size:24px; font-weight:700; margin:0; color:#111827; }
-#hdr p { font-size:14px; color:#6b7280; margin:4px 0 0; }
-.panel {
-  border:1px solid #e5e7eb; border-radius:12px;
-  background:white; box-shadow:0 1px 3px rgba(0,0,0,0.08);
-  padding:12px; display:flex; flex-direction:column;
-}
-.panel h2 {
-  font-size:16px; font-weight:600; margin-bottom:8px; color:#374151;
-}
-textarea {
-  font-size:15px !important; line-height:1.55 !important;
-  padding:10px 12px !important;
-  border:1px solid #d1d5db !important; border-radius:8px !important;
-}
-button { font-weight:600 !important; border-radius:8px !important; }
-button:hover { opacity:0.9; transition:opacity 0.2s; }
 """
-# -------- UI --------
-with gr.Blocks(theme=THEME, css=CUSTOM_CSS, title="EN → Hindi / Telugu Translator") as demo:
     with gr.Group(elem_id="hdr"):
         gr.Markdown("<h1>English → Hindi & Telugu Translator</h1>")
-        gr.Markdown("<p>Powered by IndicTrans2 · law-ai/InLegalTrans-En2Indic-1B</p>")
     with gr.Row():
-        # Input Column
         with gr.Column(scale=2):
             with gr.Group(elem_classes="panel"):
                 gr.Markdown("<h2>English Input</h2>")
-                src = gr.Textbox(placeholder="Type English text...", lines=12, show_label=False)
             with gr.Row():
                 translate_btn = gr.Button("👉 Translate", variant="primary")
                 clear_btn     = gr.Button("Clear", variant="secondary")
-        # Output Column
         with gr.Column(scale=2):
             with gr.Group(elem_classes="panel"):
                 gr.Markdown("<h2>Hindi Translation</h2>")
                 hi_out = gr.Textbox(lines=6, show_copy_button=True, show_label=False)
             with gr.Group(elem_classes="panel"):
                 gr.Markdown("<h2>Telugu Translation</h2>")
                 te_out = gr.Textbox(lines=6, show_copy_button=True, show_label=False)
-        # Settings Column
         with gr.Column(scale=1):
             with gr.Group(elem_classes="panel"):
-                gr.Markdown("<h2>Advanced Settings</h2>")
-                num_beams   = gr.Slider(1, 8, value=4, step=1, label="Num Beams")
-                max_new     = gr.Slider(16, 512, value=128, step=8, label="Max Tokens")
-                temperature = gr.Slider(0.0, 1.5, value=0.0, step=0.05, label="Temperature")
-                top_p       = gr.Slider(0.0, 1.0, value=1.0, step=0.01, label="Top-p")
-                top_k       = gr.Slider(0, 100, value=50, step=1, label="Top-k")
-    # Wiring
-    translate_btn.click(translate_dual, inputs=[src, num_beams, max_new, temperature, top_p, top_k],
-                        outputs=[hi_out, te_out])
     clear_btn.click(lambda: ("", "", ""), outputs=[src, hi_out, te_out])
 demo.queue(max_size=48).launch()

+import os, re, types, traceback, torch, gradio as gr
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+from IndicTransToolkit import IndicProcessor
+# --------------------- Device ---------------------
+# Use the GPU when torch can see one, otherwise fall back to the CPU.
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# Half precision is only selected together with CUDA; CPU runs use float32.
+dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+# --------------------- Languages ------------------
+# Language tags handed to IndicProcessor as src_lang / tgt_lang / lang.
 SRC_CODE = "eng_Latn"
 HI_CODE  = "hin_Deva"
 TE_CODE  = "tel_Telu"
+# Single shared pre/post-processor, created once at import time.
+ip = IndicProcessor(inference=True)
+# --------------------- Regex / Helpers ---------------------
+# Language-tag residue that may appear in decoded text: "_src…" / "tgt…" tokens,
+# ">>lang<<" markers, bare language codes, and "<ID…>" placeholders.
+TAG_REGEX = re.compile(
+    r"(?:_src\S+)|(?:tgt\S+)|"
+    r"(?:>>\s*\S+\s*<<)|"
+    r"\b(?:eng_Latn|hin_Deva|hin_deva|tel_Telu|tel_telu)\b|"
+    r"<ID\d*>"
+)
+def strip_lang_tags(text: str) -> str:
+    """Remove language-tag residue from *text*.
+
+    Every TAG_REGEX match is replaced by a single space, runs of two or more
+    whitespace characters are collapsed to one space, and the result is
+    trimmed at both ends.
+    """
+    without_tags = TAG_REGEX.sub(" ", text)
+    collapsed = re.sub(r"\s{2,}", " ", without_tags)
+    return collapsed.strip()
+def ensure_hindi_danda(s: str) -> str:
+    """End a Hindi sentence with a danda ("।").
+
+    A trailing "." (together with any whitespace after it) is replaced by "।".
+    Otherwise, when the last visible character is Devanagari and the text does
+    not already end in a terminator ("।", "॥", "?", "!", "…"), a "।" is
+    appended directly after that character, so trailing whitespace never ends
+    up between the text and the danda. In every other case the text is
+    returned unchanged.
+    """
+    s = re.sub(r"\.\s*$", "।", s)
+    visible = s.rstrip()
+    # Already terminated (the double danda counts too): nothing to add.
+    if visible.endswith(("।", "॥", "?", "!", "…")):
+        return s
+    # Only Devanagari text gets a danda; Latin/digit endings are left alone.
+    if re.search(r"[\u0900-\u097F]$", visible):
+        return visible + "।"
+    return s
+# Sentence splitting (pysbd or fallback)
+# pysbd is optional: if importing it or constructing the segmenter raises,
+# _SEGMENTER stays None and split_into_sentences uses its regex-based path.
+try:
+    import pysbd
+    _SEGMENTER = pysbd.Segmenter(language="en", clean=True)
+except Exception:
+    _SEGMENTER = None
+# A fragment that ends in a 1-6 letter token followed by a period ("Sec.", "No.").
+_LEGAL_JOIN_RE = re.compile(r'\b([A-Za-z]{1,6})\.\s*$')
+# A fragment that starts with an opening bracket, a digit, or a lowercase letter.
+_NEXT_CONT_RE  = re.compile(r'^\s*(?:[\(\[\{]|\d|[a-z])')
+def _merge_legal_abbrev_breaks(sents):
+    """Re-join fragments that were split right after a short abbreviation.
+
+    A fragment is glued (with one space) onto the previous one when the
+    previous text matches _LEGAL_JOIN_RE and the fragment matches
+    _NEXT_CONT_RE. Fragments are stripped, and empty results are dropped.
+    """
+    merged = []
+    pending = None
+    for fragment in sents:
+        if pending is None:
+            pending = fragment.strip()
+            continue
+        continuation = fragment.lstrip()
+        if _LEGAL_JOIN_RE.search(pending) and _NEXT_CONT_RE.match(continuation):
+            pending = f"{pending} {continuation}"
+        else:
+            merged.append(pending)
+            pending = fragment.strip()
+    if pending is not None:
+        merged.append(pending)
+    return [sentence for sentence in merged if sentence]
+def split_into_sentences(text: str):
+    """Split *text* into sentences and re-join abbreviation breaks.
+
+    Uses the pysbd segmenter when it is available. Otherwise falls back to a
+    regex split on whitespace that follows ".", "?" or "!", after temporarily
+    masking periods that look like abbreviations: a 1-6 letter token followed
+    by a bracket, digit or lowercase letter, or a 1-5 letter token followed
+    by whitespace and a capital letter. Either way the pieces go through
+    _merge_legal_abbrev_breaks before being returned.
+    """
+    if _SEGMENTER is None:
+        # Private-use code point standing in for a masked period.
+        placeholder = "\uE000"
+        guarded = re.sub(
+            r'\b([A-Za-z]{1,6})\.(?=\s*(?:[\(\[\{]|\d|[a-z]))',
+            r'\1' + placeholder, text.strip()
+        )
+        guarded = re.sub(
+            r'\b([A-Za-z]{1,5})\.(?=\s+[A-Z])',
+            r'\1' + placeholder, guarded
+        )
+        pieces = [
+            chunk.replace(placeholder, '.')
+            for chunk in re.split(r'(?<=[.?!])\s+', guarded)
+            if chunk.strip()
+        ]
+    else:
+        pieces = _SEGMENTER.segment(text)
+    return _merge_legal_abbrev_breaks(pieces)
+# --------------------- Model Loader ---------------------
+# UI label -> repo id passed to load_model().
+MODELS = {
+    "Default (Public)": "law-ai/InLegalTrans-En2Indic-1B",
+    "Fine-tuned (Private)": "SagarVelamuri/InLegalTrans-En2Indic-FineTuned-Tel-Hin"
+}
+# (tokenizer, model) pairs keyed by repo id, filled lazily by load_model().
+_model_cache = {}
+def load_model(model_name: str):
+    """Return the (tokenizer, model) pair for *model_name*, loading it once.
+
+    The first call for a repo id loads the tokenizer and the seq2seq model,
+    moves the model to ``device`` in eval mode, and stores the pair in
+    ``_model_cache``; later calls return the stored pair.
+    """
+    cached = _model_cache.get(model_name)
+    if cached is not None:
+        return cached
+    tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=True)
+    mdl = AutoModelForSeq2SeqLM.from_pretrained(
+        model_name,
+        trust_remote_code=True,
+        low_cpu_mem_usage=True,
+        dtype=dtype,
+    )
+    mdl = mdl.to(device).eval()
+    # Best effort: align config.vocab_size with the output embedding's row
+    # count; any failure here is ignored and the config is left as loaded.
+    try:
+        output_rows = mdl.get_output_embeddings().weight.shape[0]
+        mdl.config.vocab_size = output_rows
+    except Exception:
+        pass
+    pair = (tok, mdl)
+    _model_cache[model_name] = pair
+    return pair
+def build_bad_words_ids_from_vocab(tok):
+    """Collect single-token ``bad_words_ids`` entries for language-tag tokens.
+
+    Args:
+        tok: tokenizer exposing ``get_vocab()`` (token string -> id mapping).
+
+    Returns:
+        A list of one-element id lists, one per candidate found in the
+        vocabulary, in candidate order. Empty when none are present.
+    """
+    vocab = tok.get_vocab()
+    candidates = [
+        "eng_Latn", "hin_Deva", "hin_deva", "tel_Telu", "tel_telu",
+        # "tgttel_Telu" mirrors the "tgthin_Deva" spelling; the older
+        # "tgt_tel_Telu" spelling is kept so nothing previously blocked is lost.
+        "_srceng_Latn", "tgthin_Deva", "tgttel_Telu", "tgt_tel_Telu",
+        ">>hin_Deva<<", ">>tel_Telu<<",
+    ] + [f"<ID{i}>" for i in range(10)]
+    out = []
+    for cand in candidates:
+        # Bare token first, then the "▁"-prefixed form (presumably the
+        # SentencePiece word-start marker); only the first hit is used.
+        for key in (cand, "▁" + cand):
+            if key in vocab:
+                out.append([vocab[key]])
+                break
+    return out
+# --------------------- Translation ---------------------
 @torch.inference_mode()
+def _translate(text: str, tgt_lang: str, model_choice: str,
+               num_beams=4, max_new=128, batch_size=3) -> str:
+    """Translate English *text* into *tgt_lang*, sentence batch by sentence batch.
+
+    Args:
+        text: English input; it is split with split_into_sentences().
+        tgt_lang: target language tag (e.g. HI_CODE or TE_CODE).
+        model_choice: key into MODELS selecting the checkpoint.
+        num_beams: beam width passed to generate().
+        max_new: cap on newly generated tokens per sentence.
+        batch_size: number of sentences sent through the model per step.
+
+    Returns:
+        The translated sentences joined by single spaces ("" when the input
+        yields no sentences).
+    """
+    tok, mdl = load_model(MODELS[model_choice])
+    bad_words_ids = build_bad_words_ids_from_vocab(tok)
+    # UI values may arrive as floats; coerce as the previous revision did.
+    num_beams, max_new = int(num_beams), int(max_new)
+    sentences = split_into_sentences(text)
+    full_trans = []
+    for start in range(0, len(sentences), batch_size):
+        batch = sentences[start:start + batch_size]
+        proc = ip.preprocess_batch(batch, src_lang=SRC_CODE, tgt_lang=tgt_lang)
+        enc = tok(proc, padding=True, truncation=True, max_length=256, return_tensors="pt").to(device)
+        out = mdl.generate(
+            # max_new_tokens (not max_length) so the setting matches its
+            # "Max New Tokens" label and bounds only the generated tokens.
+            **enc, max_new_tokens=max_new, num_beams=num_beams,
+            early_stopping=True, no_repeat_ngram_size=3, use_cache=False,
+            bad_words_ids=bad_words_ids or None,
+        )
+        decoded = tok.batch_decode(out, skip_special_tokens=True)
+        decoded = [strip_lang_tags(t) for t in decoded]
+        post = ip.postprocess_batch(decoded, lang=tgt_lang)
+        if tgt_lang == HI_CODE:
+            post = [ensure_hindi_danda(x) for x in post]
+        full_trans.extend(p.strip() for p in post)
+    return " ".join(full_trans)
+def translate_dual(text, model_choice, num_beams, max_new):
+    """Translate *text* into Hindi and Telugu for the UI.
+
+    Returns a ``(hindi, telugu)`` pair. Missing or whitespace-only input
+    yields ``("", "")`` without touching the model. A failure in one language
+    is reported as a warning string in that slot and does not prevent the
+    other translation from running.
+    """
+    # Guard against None as well as blank text so the handler never raises.
+    if not text or not text.strip():
+        return "", ""
     try:
+        hi = _translate(text, HI_CODE, model_choice, num_beams=num_beams, max_new=max_new)
     except Exception as e:
+        hi = f"⚠️ Hindi failed: {e}"
     try:
+        te = _translate(text, TE_CODE, model_choice, num_beams=num_beams, max_new=max_new)
     except Exception as e:
+        te = f"⚠️ Telugu failed: {e}"
     return hi, te
+# --------------------- Dark Theme ---------------------
+# Soft base theme with page, block and primary-button colours overridden.
+THEME = gr.themes.Soft(
     primary_hue="blue", neutral_hue="slate"
 ).set(
+    body_background_fill="#0b0f19",
+    body_text_color="#f3f4f6",
+    block_background_fill="#111827",
+    block_border_color="#1f2937",
+    block_title_text_color="#e5e7eb",
     button_primary_background_fill="#2563eb",
+    button_primary_text_color="#ffffff",
 )
+# Extra CSS for the "#hdr" header group, the ".panel" groups, textareas and
+# buttons; passed to gr.Blocks together with THEME.
 CUSTOM_CSS = """
+#hdr { text-align:center; padding:16px; }
+#hdr h1 { font-size:24px; font-weight:700; color:#f9fafb; margin:0; }
+#hdr p { font-size:14px; color:#9ca3af; margin-top:4px; }
+.panel { border:1px solid #1f2937; border-radius:10px; padding:12px; background:#111827; box-shadow:0 1px 2px rgba(0,0,0,0.4);}
+.panel h2 { font-size:16px; font-weight:600; margin-bottom:6px; color:#f3f4f6; }
+textarea { background:#0b0f19 !important; color:#f9fafb !important; border-radius:8px !important; border:1px solid #374151 !important; font-size:15px !important; line-height:1.55; }
+button { border-radius:8px !important; font-weight:600 !important; }
 """
+# --------------------- UI ---------------------
+# Layout: header, model selector, then one row with three columns
+# (English input + buttons, the two translations, generation settings).
+with gr.Blocks(theme=THEME, css=CUSTOM_CSS, title="EN → HI/TE Translator") as demo:
     with gr.Group(elem_id="hdr"):
         gr.Markdown("<h1>English → Hindi & Telugu Translator</h1>")
+        gr.Markdown("<p>IndicTrans2 with batch sentence decomposition</p>")
+    # Choices are the MODELS labels; translate_dual receives the selected label.
+    model_choice = gr.Dropdown(
+        label="Choose Model", choices=list(MODELS.keys()),
+        value="Default (Public)"
+    )
     with gr.Row():
         with gr.Column(scale=2):
             with gr.Group(elem_classes="panel"):
                 gr.Markdown("<h2>English Input</h2>")
+                src = gr.Textbox(lines=12, placeholder="Enter English...", show_label=False)
             with gr.Row():
                 translate_btn = gr.Button("👉 Translate", variant="primary")
                 clear_btn     = gr.Button("Clear", variant="secondary")
         with gr.Column(scale=2):
             with gr.Group(elem_classes="panel"):
                 gr.Markdown("<h2>Hindi Translation</h2>")
                 hi_out = gr.Textbox(lines=6, show_copy_button=True, show_label=False)
             with gr.Group(elem_classes="panel"):
                 gr.Markdown("<h2>Telugu Translation</h2>")
                 te_out = gr.Textbox(lines=6, show_copy_button=True, show_label=False)
         with gr.Column(scale=1):
             with gr.Group(elem_classes="panel"):
+                gr.Markdown("<h2>Settings</h2>")
+                num_beams   = gr.Slider(1, 8, value=4, step=1, label="Beam Search")
+                max_new     = gr.Slider(32, 512, value=128, step=16, label="Max New Tokens")
+    # Input order must match translate_dual(text, model_choice, num_beams, max_new).
+    translate_btn.click(
+        translate_dual,
+        inputs=[src, model_choice, num_beams, max_new],
+        outputs=[hi_out, te_out]
+    )
+    # Clear resets the input box and both translation boxes to empty strings.
     clear_btn.click(lambda: ("", "", ""), outputs=[src, hi_out, te_out])
+# Queue requests (at most 48 waiting) and start the app at import time.
 demo.queue(max_size=48).launch()