import gradio as gr
import torch
import soundfile as sf
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import numpy as np
import json
import os

# Root directory that is searched recursively for example `config.json` files
# (it is passed to `load_configs` when the examples are loaded).
CONFIG_DIR = "assets/Viet-SpeechT5-TTS-finetuning"

# Load all config.json files
def load_configs(directory):
    """Collect example rows from every ``config.json`` found under *directory*.

    The tree is walked recursively. A config file contributes one example when
    it holds a JSON object with the keys ``input_text``, ``voice`` and
    ``output_audio_path``; the audio path is resolved relative to the folder
    that contains the config file. Unreadable, malformed or incomplete configs
    are reported on stdout and skipped, so one bad file never aborts the scan.

    Sub-directories are visited in sorted order so the returned list is the
    same on every run and platform (``os.walk`` otherwise yields entries in
    arbitrary file-system order).

    Args:
        directory: Root folder to search.

    Returns:
        A list of ``[input_text, voice, audio_path]`` lists. The list is empty
        when *directory* does not exist or contains no usable config file.
    """
    print(f"Searching for config files in: {directory}")
    if not os.path.exists(directory):
        print(f"Directory {directory} does not exist.")
        return []

    required_fields = ("input_text", "voice", "output_audio_path")
    examples = []
    for root, dirs, files in os.walk(directory):
        # Sorting in place steers the (top-down) traversal order of os.walk.
        dirs.sort()
        # A folder can hold at most one file with this exact name.
        if "config.json" not in files:
            continue

        file_path = os.path.join(root, "config.json")
        print(f"Found config file: {file_path}")
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                config = json.load(f)
            if isinstance(config, dict) and all(field in config for field in required_fields):
                examples.append([
                    config["input_text"],
                    config["voice"],
                    os.path.join(root, config["output_audio_path"]),
                ])
            else:
                print(f"Skipping {file_path}: Missing required fields")
        except Exception as e:
            # Deliberately broad: this is a best-effort scan at start-up and a
            # single broken config must not prevent the app from loading.
            print(f"Error reading {file_path}: {e}")
    print(f"Total examples loaded: {len(examples)}")
    return examples

# Checkpoint identifiers handed to the `from_pretrained` loaders below.
_TTS_CHECKPOINT = "danhtran2mind/Viet-SpeechT5-TTS-finetuning"
_VOCODER_CHECKPOINT = "microsoft/speecht5_hifigan"

# Load processor, model, and vocoder. Any failure is logged and re-raised so
# the module never finishes importing with these globals missing.
print("Loading processor, model, and vocoder...")
try:
    processor = SpeechT5Processor.from_pretrained(_TTS_CHECKPOINT)
    model = SpeechT5ForTextToSpeech.from_pretrained(_TTS_CHECKPOINT)
    vocoder = SpeechT5HifiGan.from_pretrained(_VOCODER_CHECKPOINT)
except Exception as load_error:
    print(f"Error loading models: {load_error}")
    raise
else:
    print("Models loaded successfully.")

# Load speaker embeddings (indexed by row in `generate_speech`); failures are
# handled the same way as for the models above.
print("Loading speaker embeddings...")
try:
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
except Exception as load_error:
    print(f"Error loading embeddings: {load_error}")
    raise
else:
    print("Speaker embeddings loaded successfully.")

def generate_speech(text, voice, output_path="output_speech.wav"):
    """Synthesize *text* with the selected voice and write it to a WAV file.

    Input is validated before any model work is done: empty or whitespace-only
    text, a missing voice, and an unknown voice name are all rejected with a
    readable message instead of being passed on to the model.

    Args:
        text: Text to speak.
        voice: Voice name; matched case-insensitively against ``"male"`` and
            ``"female"``.
        output_path: Destination path of the generated WAV file.

    Returns:
        ``(output_path, None)`` on success, or ``(None, message)`` when the
        input is invalid or synthesis fails. Failures are reported through the
        message rather than raised, so the caller can display them.
    """
    print(f"Generating speech for text: {text}, voice: {voice}, output: {output_path}")
    if not text or not str(text).strip() or not voice:
        return None, "Please provide both text and voice selection."

    # Row indices into `embeddings_dataset` used as the speaker for each voice.
    speaker_dict = {"male": 2000, "female": 7000}
    speaker_id = speaker_dict.get(str(voice).strip().lower())
    if speaker_id is None:
        available = ", ".join(sorted(speaker_dict))
        return None, f"Unknown voice '{voice}'. Available voices: {available}."

    try:
        # unsqueeze(0) adds a leading dimension to the stored x-vector.
        speaker_embedding = torch.tensor(embeddings_dataset[speaker_id]["xvector"]).unsqueeze(0)
        inputs = processor(text=text, return_tensors="pt")

        # Inference only: no gradients are needed.
        with torch.no_grad():
            speech = model.generate_speech(
                inputs["input_ids"],
                speaker_embeddings=speaker_embedding,
                vocoder=vocoder,
                attention_mask=inputs.get("attention_mask")
            )

        # NOTE(review): 16 kHz presumably matches the vocoder's output rate -
        # confirm against the vocoder configuration.
        sf.write(output_path, speech.numpy(), samplerate=16000)
        print(f"Audio saved to {output_path}")
        return output_path, None
    except Exception as e:
        # Broad on purpose: this is the UI boundary, and any failure should be
        # shown to the user as a message instead of crashing the handler.
        print(f"Error generating speech: {e}")
        return None, f"Error generating speech: {e}"

def load_existing_audio(text, voice, output_audio_path):
    """Echo an example's text and voice together with its audio and a status.

    Args:
        text: Example text, returned unchanged.
        voice: Example voice, returned unchanged.
        output_audio_path: Audio value of the example; may be empty.

    Returns:
        ``(text, voice, audio, status)`` where ``audio`` is
        *output_audio_path* when it is truthy and ``None`` otherwise, and
        ``status`` is the matching human-readable message.
    """
    print(f"Load: {output_audio_path}")
    if output_audio_path:
        audio, status = output_audio_path, "Successfully Loaded Example Audio"
    else:
        audio, status = None, "Please select an existing audio file."
    return text, voice, audio, status

# Load examples: each row is [input_text, voice, audio_path] as built by
# `load_configs`. An empty list only triggers a warning; the UI is still built.
print("Loading examples...")
examples = load_configs(CONFIG_DIR)
if not examples:
    print("Warning: No examples loaded. Check CONFIG_DIR and config.json files.")

# Create Gradio interface
with gr.Blocks() as iface:
    gr.Markdown("# Vietnamese Text-to-Speech")
    gr.Markdown("Generate speech from Vietnamese text using SpeechT5 or load existing audio from examples.")
    
    # Arrange components in a row: inputs in the first column, results in the second
    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(label="Vietnamese Text", placeholder="Enter text here...", lines=5)
            # The chosen label is lower-cased by `generate_speech` before lookup.
            voice_input = gr.Radio(choices=["Male", "Female"], label="Voice", value="Male")
            
        with gr.Column():
            output = gr.Audio(label="Generated Speech") 
            # Receives the second return value of both callbacks, which is an
            # error text or, for examples, a success/status message.
            error_output = gr.Textbox(label="Error Message")
    
    # Button to generate speech: only text and voice are passed, so
    # `generate_speech` writes to its default `output_path`.
    gr.Button("Generate").click(
        fn=generate_speech,
        inputs=[text_input, voice_input],
        outputs=[output, error_output]
    )
    
    # Examples component: the three columns of each example row map onto the
    # text box, the voice selector and the audio player (listed as an input
    # here so the example's audio path can populate it).
    # NOTE(review): Gradio presumably only calls `fn` / fills `outputs` when
    # examples are cached or `run_on_click=True` is set - confirm that the
    # status message is actually shown when an example is clicked.
    gr.Examples(
        examples=examples,
        fn=load_existing_audio,
        inputs=[text_input, voice_input, output],
        outputs=[text_input, voice_input, output, error_output],
        label="Examples (Loads existing audio)"
    )

# Launch app
if __name__ == "__main__":
    print("Launching Gradio interface...")
    try:
        # No "launched at http://localhost:7860" message is printed here:
        # when run as a script, launch() normally returns only after the
        # server has stopped, and the port actually bound may differ from
        # 7860. Gradio reports the real URL itself once the server is up.
        iface.launch()
    except Exception as e:
        # Log and re-raise, like the model/embedding loading above, so a failed
        # start is visible to the caller (traceback, non-zero exit status).
        print(f"Error launching Gradio interface: {e}")
        raise