# app.py
# Fast English -> Bengali translator with optional speech input and fast image generation

import os
import re
import time
import random
import traceback

import torch
import gradio as gr

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from diffusers import StableDiffusionPipeline

# -------- Configuration --------
TRANSLATION_MODEL = os.environ.get("TRANSLATION_MODEL", "facebook/nllb-200-distilled-600M")
SRC_LANG = os.environ.get("SRC_LANG", "eng_Latn")
TGT_LANG = os.environ.get("TGT_LANG", "ben_Beng")
MAX_LENGTH = int(os.environ.get("MAX_LENGTH", "512"))
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Image model (can be changed to any HF stable-diffusion model you prefer)
IMAGE_MODEL_ID = os.environ.get("IMAGE_MODEL_ID", "runwayml/stable-diffusion-v1-5")

# -------- Globals / Caches --------
_translation_tokenizer = None
_translation_model = None
_image_pipe = None

# -------- Helpers: Translation --------

def split_into_sentences(text: str):
    if not text:
        return []
    # Basic sentence splitting that keeps punctuation
    sentences = re.split(r'(?<=[.!?])\s+', text.strip())
    return [s.strip() for s in sentences if s.strip()]


def load_translation_model():
    global _translation_tokenizer, _translation_model
    if _translation_tokenizer is None or _translation_model is None:
        try:
            print(f"Loading translation model {TRANSLATION_MODEL} on {DEVICE}...")
            _translation_tokenizer = AutoTokenizer.from_pretrained(TRANSLATION_MODEL, use_fast=False)
            _translation_model = AutoModelForSeq2SeqLM.from_pretrained(TRANSLATION_MODEL).to(DEVICE)
            print("Translation model loaded.")
        except Exception as e:
            _translation_tokenizer, _translation_model = None, None
            raise
    return _translation_tokenizer, _translation_model


def _get_forced_bos_token_id(tokenizer):
    # Different tokenizers expose language IDs differently. Try several approaches and fallback to None.
    # 1) Hugging Face lang_code_to_id mapping (used by some multilingual tokenizers)
    try:
        if hasattr(tokenizer, "lang_code_to_id") and isinstance(tokenizer.lang_code_to_id, dict):
            if TGT_LANG in tokenizer.lang_code_to_id:
                return tokenizer.lang_code_to_id[TGT_LANG]
    except Exception:
        pass

    # 2) Convert token string -> id (some checkpoints use language tags as tokens)
    try:
        token_id = tokenizer.convert_tokens_to_ids(TGT_LANG)
        if token_id != tokenizer.unk_token_id:
            return token_id
    except Exception:
        pass

    # 3) Try common special form (e.g. "<2ben_Beng>")
    try:
        candidate = f"<2{TGT_LANG}>"
        token_id = tokenizer.convert_tokens_to_ids(candidate)
        if token_id != tokenizer.unk_token_id:
            return token_id
    except Exception:
        pass

    return None


def translate_text(text: str, max_length: int = MAX_LENGTH):
    """Translate English text to Bengali. Returns translated string or error message."""
    if not text or not text.strip():
        return ""

    try:
        tokenizer, model = load_translation_model()
    except Exception as e:
        tb = traceback.format_exc()
        return f"Model load error: {e}\n{tb}"

    sentences = split_into_sentences(text)
    translations = []

    forced_bos = _get_forced_bos_token_id(tokenizer)

    for s in sentences:
        if not s:
            continue
        try:
            # Prepend source language hint if the model expects it (common for NLLB)
            src_prefixed = f"{SRC_LANG} {s}"

            inputs = tokenizer(
                src_prefixed,
                return_tensors="pt",
                truncation=True,
                max_length=max_length,
            ).to(DEVICE)

            gen_kwargs = dict(
                max_length=max_length + 64,
                num_beams=5,
                early_stopping=True,
            )

            if forced_bos is not None:
                gen_kwargs["forced_bos_token_id"] = forced_bos
            elif getattr(model.config, "forced_bos_token_id", None) is not None:
                gen_kwargs["forced_bos_token_id"] = model.config.forced_bos_token_id

            generated_tokens = model.generate(**inputs, **gen_kwargs)

            decoded = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)

            # The decoded text sometimes includes the target language token. Remove if present at start.
            if decoded.startswith(TGT_LANG):
                decoded = decoded[len(TGT_LANG):].strip()

            translations.append(decoded)

        except RuntimeError as re_err:
            return f"Runtime error during generation: {re_err}"
        except Exception as e:
            translations.append(f"[Error translating sentence: {e}]")

    return " ".join(translations)

# -------- Image generation (fast-ish) --------

def load_image_model(model_id: str = IMAGE_MODEL_ID):
    global _image_pipe
    if _image_pipe is None:
        try:
            dtype = torch.float16 if DEVICE.type == "cuda" else torch.float32
            print(f"Loading image model {model_id} (dtype={dtype}) on {DEVICE} ...")

            _image_pipe = StableDiffusionPipeline.from_pretrained(
                model_id,
                torch_dtype=dtype,
            )

            # Move to device
            _image_pipe = _image_pipe.to(DEVICE)
            print("Image model loaded.")

        except Exception as e:
            _image_pipe = None
            raise
    return _image_pipe


def generate_image(prompt: str, num_inference_steps: int = 4):
    """Generate one image; returns PIL Image or None and status message."""
    if not prompt or not prompt.strip():
        return None, "Please enter a prompt to generate an image."

    try:
        pipe = load_image_model()

        seed = random.randint(0, 2**31 - 1)
        gen = torch.Generator(device=DEVICE).manual_seed(seed) if DEVICE.type == "cuda" else torch.Generator().manual_seed(seed)

        # Guidance scale and steps tuned for speed; user can change steps via UI
        out = pipe(
            prompt=prompt,
            num_inference_steps=int(num_inference_steps),
            guidance_scale=7.5,
            generator=gen,
        )

        image = out.images[0]
        return image, f"Generated (seed={seed}) in {num_inference_steps} steps."

    except Exception as e:
        tb = traceback.format_exc()
        return None, f"Error generating image: {e}\n{tb}"

# -------- Optional: Speech transcription (if dependencies installed) --------

try:
    import speech_recognition as sr
    from pydub import AudioSegment
    _SR_AVAILABLE = True
except Exception:
    _SR_AVAILABLE = False


def transcribe_audio_file(audio_path: str):
    if not audio_path:
        return ""
    if not _SR_AVAILABLE:
        return "(speech_recognition/pydub not installed) Please type your text or install optional deps."

    try:
        recognizer = sr.Recognizer()
        # Convert file to WAV if needed
        wav_path = audio_path
        if not audio_path.lower().endswith('.wav'):
            audio = AudioSegment.from_file(audio_path)
            wav_path = audio_path.rsplit('.', 1)[0] + '.wav'
            audio.export(wav_path, format='wav')

        with sr.AudioFile(wav_path) as source:
            audio_data = recognizer.record(source)
            text = recognizer.recognize_google(audio_data)
            return text

    except Exception as e:
        return f"Error transcribing audio: {e}"

# -------- Gradio UI --------

css = """
.gradio-container { max-width: 1100px !important; }
.header { text-align: center; padding: 16px; border-radius: 8px; color: white; background: linear-gradient(90deg,#2563eb,#7c3aed); }
.quick-btn { margin: 6px; }
"""

with gr.Blocks(title="Fast Bengali Translator & Image Generator", css=css) as demo:
    gr.Markdown("""
    <div class="header"><h2>⚡ Fast English → Bengali Translator + Fast Image Generator</h2>
    <p>Speech input (optional), sentence-split translation, and 2–8 step image generation for fast feedback.</p></div>
    """)

    with gr.Tabs():
        with gr.TabItem("Translation"):
            gr.Markdown("### English → Bengali")
            with gr.Row():
                with gr.Column(scale=6):
                    audio_input = gr.Audio(source="upload", type="filepath", label="Record or upload audio (optional)")
                    transcribe_btn = gr.Button("Transcribe Speech")

                    input_text = gr.Textbox(lines=6, placeholder="Type or paste English text here...", label="English Text")
                    with gr.Row():
                        quick_hello = gr.Button("Hello, how are you?")
                        quick_weather = gr.Button("The weather is nice today.")
                        quick_thanks = gr.Button("Thank you very much.")

                    translate_btn = gr.Button("Translate")

                with gr.Column(scale=6):
                    output_text = gr.Textbox(lines=6, label="Bengali Translation", interactive=False)
                    copy_btn = gr.Button("Copy")

                    use_for_image_btn = gr.Button("Use translation as image prompt")

        with gr.TabItem("Image Generation"):
            gr.Markdown("### Fast Image Generation")
            with gr.Row():
                with gr.Column(scale=6):
                    image_prompt = gr.Textbox(lines=4, label="Image Prompt", placeholder="Describe the image you want to generate...")
                    with gr.Row():
                        generate_btn = gr.Button("Generate Image")
                        clear_btn = gr.Button("Clear")
                    steps_slider = gr.Slider(minimum=2, maximum=12, step=1, value=4, label="Inference Steps (fewer = faster)")

                with gr.Column(scale=6):
                    output_image = gr.Image(label="Generated Image", interactive=False)
                    status_message = gr.Textbox(label="Status", interactive=False)

    gr.Markdown("---")
    gr.Markdown("*Notes: For best performance use a GPU in Spaces or locally. Optional speech transcription requires `speechrecognition` and `pydub`.*")

    # Event bindings
    def _transcribe_then_fill(path):
        return transcribe_audio_file(path)

    def _copy_text(t):
        return t

    def _use_translation_for_image(t):
        return t

    transcribe_btn.click(fn=_transcribe_then_fill, inputs=audio_input, outputs=input_text)
    translate_btn.click(fn=translate_text, inputs=input_text, outputs=output_text)
    copy_btn.click(fn=_copy_text, inputs=output_text, outputs=output_text)

    use_for_image_btn.click(fn=_use_translation_for_image, inputs=output_text, outputs=image_prompt)

    generate_btn.click(fn=generate_image, inputs=[image_prompt, steps_slider], outputs=[output_image, status_message])
    clear_btn.click(fn=lambda: ["", None, ""], inputs=None, outputs=[image_prompt, output_image, status_message])

if __name__ == '__main__':
    demo.launch(server_name='0.0.0.0', server_port=int(os.environ.get('PORT', 7860)))