# app.py
# Fast English -> Bengali translator with optional speech input and fast image generation

import os
import random
import re
import traceback

import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from diffusers import StableDiffusionPipeline

# -------- Configuration --------
TRANSLATION_MODEL = os.environ.get("TRANSLATION_MODEL", "facebook/nllb-200-distilled-600M")
SRC_LANG = os.environ.get("SRC_LANG", "eng_Latn")
TGT_LANG = os.environ.get("TGT_LANG", "ben_Beng")
MAX_LENGTH = int(os.environ.get("MAX_LENGTH", "512"))
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Image model (can be changed to any HF stable-diffusion model you prefer)
IMAGE_MODEL_ID = os.environ.get("IMAGE_MODEL_ID", "runwayml/stable-diffusion-v1-5")

# -------- Globals / Caches --------
_translation_tokenizer = None
_translation_model = None
_image_pipe = None


# -------- Helpers: Translation --------
def split_into_sentences(text: str):
    """Basic sentence splitting that keeps the trailing punctuation."""
    if not text:
        return []
    sentences = re.split(r'(?<=[.!?])\s+', text.strip())
    return [s.strip() for s in sentences if s.strip()]
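
# Illustrative behaviour of the splitter above (shown as a comment, not
# executed at import time):
#
#   split_into_sentences("Hi there! How are you? I'm fine.")
#   # -> ["Hi there!", "How are you?", "I'm fine."]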

def load_translation_model():
    global _translation_tokenizer, _translation_model
    if _translation_tokenizer is None or _translation_model is None:
        try:
            print(f"Loading translation model {TRANSLATION_MODEL} on {DEVICE}...")
            # use_fast=False: the slow tokenizer reliably exposes NLLB language codes
            _translation_tokenizer = AutoTokenizer.from_pretrained(TRANSLATION_MODEL, use_fast=False)
            _translation_model = AutoModelForSeq2SeqLM.from_pretrained(TRANSLATION_MODEL).to(DEVICE)
            print("Translation model loaded.")
        except Exception:
            _translation_tokenizer, _translation_model = None, None
            raise
    return _translation_tokenizer, _translation_model


def _get_forced_bos_token_id(tokenizer):
    """Resolve the target-language token id.

    Different tokenizers expose language ids differently; try several
    approaches and fall back to None.
    """
    # 1) lang_code_to_id mapping (used by NLLB-style multilingual tokenizers)
    try:
        if hasattr(tokenizer, "lang_code_to_id") and isinstance(tokenizer.lang_code_to_id, dict):
            if TGT_LANG in tokenizer.lang_code_to_id:
                return tokenizer.lang_code_to_id[TGT_LANG]
    except Exception:
        pass
    # 2) Convert token string -> id (some checkpoints use language tags as tokens)
    try:
        token_id = tokenizer.convert_tokens_to_ids(TGT_LANG)
        if token_id is not None and token_id != tokenizer.unk_token_id:
            return token_id
    except Exception:
        pass
    # 3) Try the common special form (e.g. "<2ben_Beng>")
    try:
        candidate = f"<2{TGT_LANG}>"
        token_id = tokenizer.convert_tokens_to_ids(candidate)
        if token_id is not None and token_id != tokenizer.unk_token_id:
            return token_id
    except Exception:
        pass
    return None


def translate_text(text: str, max_length: int = MAX_LENGTH):
    """Translate English text to Bengali.

    Returns the translated string, or an error message on failure."""
    if not text or not text.strip():
        return ""
    try:
        tokenizer, model = load_translation_model()
    except Exception as e:
        tb = traceback.format_exc()
        return f"Model load error: {e}\n{tb}"

    sentences = split_into_sentences(text)
    translations = []
    forced_bos = _get_forced_bos_token_id(tokenizer)

    for s in sentences:
        if not s:
            continue
        try:
            # NLLB-style tokenizers take the source language via `src_lang`,
            # which makes the tokenizer emit the proper language token itself;
            # prepending the code as plain text would just get translated along.
            if hasattr(tokenizer, "src_lang"):
                tokenizer.src_lang = SRC_LANG
                src_text = s
            else:
                src_text = f"{SRC_LANG} {s}"
            inputs = tokenizer(
                src_text,
                return_tensors="pt",
                truncation=True,
                max_length=max_length,
            ).to(DEVICE)
            gen_kwargs = dict(
                max_length=max_length + 64,
                num_beams=5,
                early_stopping=True,
            )
            if forced_bos is not None:
                gen_kwargs["forced_bos_token_id"] = forced_bos
            elif getattr(model.config, "forced_bos_token_id", None) is not None:
                gen_kwargs["forced_bos_token_id"] = model.config.forced_bos_token_id

            generated_tokens = model.generate(**inputs, **gen_kwargs)
            decoded = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
            # The decoded text sometimes starts with the target language token; strip it.
            if decoded.startswith(TGT_LANG):
                decoded = decoded[len(TGT_LANG):].strip()
            translations.append(decoded)
        except RuntimeError as re_err:
            return f"Runtime error during generation: {re_err}"
        except Exception as e:
            translations.append(f"[Error translating sentence: {e}]")

    return " ".join(translations)


# -------- Image generation (fast-ish) --------
def load_image_model(model_id: str = IMAGE_MODEL_ID):
    global _image_pipe
    if _image_pipe is None:
        try:
            # fp16 halves memory and speeds up CUDA inference; CPU needs fp32
            dtype = torch.float16 if DEVICE.type == "cuda" else torch.float32
            print(f"Loading image model {model_id} (dtype={dtype}) on {DEVICE} ...")
            _image_pipe = StableDiffusionPipeline.from_pretrained(
                model_id,
                torch_dtype=dtype,
            )
            _image_pipe = _image_pipe.to(DEVICE)
            print("Image model loaded.")
        except Exception:
            _image_pipe = None
            raise
    return _image_pipe


def generate_image(prompt: str, num_inference_steps: int = 4):
    """Generate one image; returns (PIL Image or None, status message)."""
    if not prompt or not prompt.strip():
        return None, "Please enter a prompt to generate an image."
    try:
        pipe = load_image_model()
        seed = random.randint(0, 2**31 - 1)
        gen = (
            torch.Generator(device=DEVICE).manual_seed(seed)
            if DEVICE.type == "cuda"
            else torch.Generator().manual_seed(seed)
        )
        # Guidance scale and steps tuned for speed; the user can change steps via the UI
        out = pipe(
            prompt=prompt,
            num_inference_steps=int(num_inference_steps),
            guidance_scale=7.5,
            generator=gen,
        )
        image = out.images[0]
        return image, f"Generated (seed={seed}) in {num_inference_steps} steps."
    except Exception as e:
        tb = traceback.format_exc()
        return None, f"Error generating image: {e}\n{tb}"
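
# Optional speed-up (a sketch, not wired into the UI): SD 1.5 with its default
# scheduler usually needs ~20+ steps, so the very low step counts exposed by
# the slider can come out noisy. One option in diffusers is to swap in the
# DPM-Solver++ multistep scheduler, which holds up better around 8-15 steps.
# Illustrative only; call once after the pipeline is loaded.
def use_fast_scheduler():
    from diffusers import DPMSolverMultistepScheduler

    pipe = load_image_model()
    # from_config reuses the pipeline's existing scheduler settings
    pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
    return pipe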

# -------- Optional: Speech transcription (if dependencies installed) --------
try:
    import speech_recognition as sr
    from pydub import AudioSegment
    _SR_AVAILABLE = True
except Exception:
    _SR_AVAILABLE = False


def transcribe_audio_file(audio_path: str):
    if not audio_path:
        return ""
    if not _SR_AVAILABLE:
        return "(speech_recognition/pydub not installed) Please type your text or install optional deps."
    try:
        recognizer = sr.Recognizer()
        # Convert to WAV if needed (pydub requires ffmpeg for most formats)
        wav_path = audio_path
        if not audio_path.lower().endswith('.wav'):
            audio = AudioSegment.from_file(audio_path)
            wav_path = audio_path.rsplit('.', 1)[0] + '.wav'
            audio.export(wav_path, format='wav')
        with sr.AudioFile(wav_path) as source:
            audio_data = recognizer.record(source)
        text = recognizer.recognize_google(audio_data)
        return text
    except Exception as e:
        return f"Error transcribing audio: {e}"


# -------- Gradio UI --------
css = """
.gradio-container { max-width: 1100px !important; }
.header {
    text-align: center;
    padding: 16px;
    border-radius: 8px;
    color: white;
    background: linear-gradient(90deg, #2563eb, #7c3aed);
}
.quick-btn { margin: 6px; }
"""

with gr.Blocks(title="Fast Bengali Translator & Image Generator", css=css) as demo:
    gr.Markdown("""
    <div class="header">
        <h1>⚡ Fast English → Bengali Translator + Fast Image Generator</h1>
        <p>Speech input (optional), sentence-split translation, and 2–8 step image generation for fast feedback.</p>
    </div>
    """)
""") with gr.Tabs(): with gr.TabItem("Translation"): gr.Markdown("### English → Bengali") with gr.Row(): with gr.Column(scale=6): audio_input = gr.Audio(source="upload", type="filepath", label="Record or upload audio (optional)") transcribe_btn = gr.Button("Transcribe Speech") input_text = gr.Textbox(lines=6, placeholder="Type or paste English text here...", label="English Text") with gr.Row(): quick_hello = gr.Button("Hello, how are you?") quick_weather = gr.Button("The weather is nice today.") quick_thanks = gr.Button("Thank you very much.") translate_btn = gr.Button("Translate") with gr.Column(scale=6): output_text = gr.Textbox(lines=6, label="Bengali Translation", interactive=False) copy_btn = gr.Button("Copy") use_for_image_btn = gr.Button("Use translation as image prompt") with gr.TabItem("Image Generation"): gr.Markdown("### Fast Image Generation") with gr.Row(): with gr.Column(scale=6): image_prompt = gr.Textbox(lines=4, label="Image Prompt", placeholder="Describe the image you want to generate...") with gr.Row(): generate_btn = gr.Button("Generate Image") clear_btn = gr.Button("Clear") steps_slider = gr.Slider(minimum=2, maximum=12, step=1, value=4, label="Inference Steps (fewer = faster)") with gr.Column(scale=6): output_image = gr.Image(label="Generated Image", interactive=False) status_message = gr.Textbox(label="Status", interactive=False) gr.Markdown("---") gr.Markdown("*Notes: For best performance use a GPU in Spaces or locally. Optional speech transcription requires `speechrecognition` and `pydub`.*") # Event bindings def _transcribe_then_fill(path): return transcribe_audio_file(path) def _copy_text(t): return t def _use_translation_for_image(t): return t transcribe_btn.click(fn=_transcribe_then_fill, inputs=audio_input, outputs=input_text) translate_btn.click(fn=translate_text, inputs=input_text, outputs=output_text) copy_btn.click(fn=_copy_text, inputs=output_text, outputs=output_text) use_for_image_btn.click(fn=_use_translation_for_image, inputs=output_text, outputs=image_prompt) generate_btn.click(fn=generate_image, inputs=[image_prompt, steps_slider], outputs=[output_image, status_message]) clear_btn.click(fn=lambda: ["", None, ""], inputs=None, outputs=[image_prompt, output_image, status_message]) if __name__ == '__main__': demo.launch(server_name='0.0.0.0', server_port=int(os.environ.get('PORT', 7860)))