import io
import os
import tempfile

import soundfile as sf
import streamlit as st
import torch
from TTS.api import TTS
from streamlit_audiorecorder import audiorecorder

# Usage steps rendered under the title. The two trailing spaces on the first
# three items are markdown hard line breaks and must be kept.
_USAGE_STEPS = """
    1. **Record** your voice or **upload** an existing audio file (WAV/MP3).  
    2. Enter the **text** you want spoken in _your_ voice.  
    3. (Optional) Paste an **API key** if required by your model/service.  
    4. Click **Generate** to hear the cloned speech.
    """

# Page configuration is issued before any other Streamlit call in this script.
st.set_page_config(
    page_title="Voice Clone TTS",
    layout="centered",
)
st.title("🎙️ Voice-Cloning Text-to-Speech")
st.markdown(_USAGE_STEPS)

# 1) AUDIO INPUT: record or upload
st.header("1. Provide your voice sample")
col1, col2 = st.columns(2)

# Start from "no sample". A recording fills it in, and an upload (if present)
# takes precedence because it is handled last.
audio_bytes = None

with col1:
    st.write("**Record in-page**")
    recorded = audiorecorder("Click to record", "Recording…")
    # Accept only a non-empty raw byte payload; anything else the recorder
    # component hands back is treated as "nothing recorded".
    if isinstance(recorded, (bytes, bytearray)) and recorded:
        audio_bytes = recorded
        st.audio(audio_bytes, format="audio/wav")

with col2:
    st.write("**Or upload file**")
    upload = st.file_uploader("Upload WAV/MP3", type=["wav", "mp3"])
    if upload is not None:
        # getvalue() returns the whole buffer regardless of the current stream
        # position, unlike read(), which yields b"" once the stream is consumed.
        audio_bytes = upload.getvalue()
        st.audio(audio_bytes, format=upload.type)

# Halt the script run until a non-empty sample is available; an empty payload
# would only fail later, during decoding.
if not audio_bytes:
    st.warning("Please record or upload a valid audio sample before proceeding.")
    st.stop()

# 2) USER TEXT & (optional) KEY
st.header("2. Text & API key")

# Text that will be synthesized; pre-filled with a sample sentence.
text_input = st.text_area(
    "Enter text to speak in your voice",
    value="Hello, this is my cloned voice!",
    height=120,
)

# Masked input for an optional credential.
# NOTE(review): api_key is not referenced anywhere else in this script as shown.
api_key = st.text_input(
    "API Key (if your model needs one)",
    type="password",
)

# 3) LOAD & CACHE THE TTS PIPELINE
@st.cache_resource(show_spinner=False)
def load_tts_model() -> TTS:
    """Construct the TTS pipeline used for voice cloning.

    The GPU is enabled when ``torch.cuda.is_available()`` reports CUDA. The
    function is wrapped in ``st.cache_resource`` with Streamlit's spinner
    turned off.

    Returns:
        The ``TTS`` instance built for the configured model name.
    """
    # Coqui TTS uses its own GPU flag
    use_gpu = torch.cuda.is_available()
    # replace with your chosen multispeaker/cloning model
    # NOTE(review): confirm this identifier can be resolved by TTS.api.TTS.
    return TTS(
        model_name="IndexTeam/IndexTTS-1.5",
        progress_bar=False,
        gpu=use_gpu,
    )


tts = load_tts_model()

# 4) GENERATE
if st.button("▶️ Generate Speech"):
    if not text_input.strip():
        st.error("Please enter some text to synthesize.")
        st.stop()

    with st.spinner("Cloning your voice…"):
        # Decode the reference sample once, keeping both samples and rate.
        # soundfile's decode errors derive from RuntimeError.
        try:
            ref_samples, ref_rate = sf.read(io.BytesIO(audio_bytes))
        except RuntimeError as exc:
            st.error(f"Could not decode the audio sample: {exc}")
            st.stop()

        # Reserve a temp path and close the handle before soundfile reopens
        # the file by name; writing to a still-open NamedTemporaryFile by name
        # fails on Windows.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
            ref_path = tmp.name

        try:
            # re-encode the reference audio as WAV for the model
            sf.write(ref_path, ref_samples, samplerate=ref_rate)

            # do the TTS with your voice as reference
            wav = tts.tts(text=text_input, speaker_wav=ref_path)
        finally:
            # delete=False means nothing else removes the file; clean up even
            # when synthesis raises or the script run is stopped.
            os.remove(ref_path)

        # Encode the result in memory so no output file is left on disk.
        # soundfile needs an explicit format when writing to a file-like object.
        out_buffer = io.BytesIO()
        sf.write(
            out_buffer,
            wav,
            samplerate=tts.synthesizer.output_sample_rate,
            format="WAV",
        )
        st.success("✅ Done!")
        st.audio(out_buffer.getvalue(), format="audio/wav")