"""Streamlit app: clone a voice from a short sample and speak arbitrary text.

The script collects a reference recording (in-page recorder or file upload),
a text prompt, and then runs a TTS model with the recording as the speaker
reference.
"""

import io
import os
import tempfile

import soundfile as sf
import streamlit as st
import torch
from TTS.api import TTS
from streamlit_audiorecorder import audiorecorder

st.set_page_config(page_title="Voice Clone TTS", layout="centered")

st.title("🎙️ Voice-Cloning Text-to-Speech")
st.markdown(
    """
1. **Record** your voice or **upload** an existing audio file (WAV/MP3).
2. Enter the **text** you want spoken in _your_ voice.
3. (Optional) Paste an **API key** if required by your model/service.
4. Click **Generate** to hear the cloned speech.
"""
)

# 1) AUDIO INPUT: record or upload
st.header("1. Provide your voice sample")
col1, col2 = st.columns(2)

with col1:
    st.write("**Record in-page**")
    audio_bytes = audiorecorder("Click to record", "Recording…")
    # Only preview the recording when the recorder handed back raw bytes.
    if isinstance(audio_bytes, bytes):
        st.audio(audio_bytes, format="audio/wav")

with col2:
    st.write("**Or upload file**")
    upload = st.file_uploader("Upload WAV/MP3", type=["wav", "mp3"])
    if upload is not None:
        # An uploaded file takes precedence over an in-page recording.
        audio_bytes = upload.read()
        st.audio(audio_bytes, format=upload.type)

# `audio_bytes` is always bound by the recorder call above, so the type check
# alone tells us whether a usable sample is available.
if not isinstance(audio_bytes, (bytes, bytearray)):
    st.warning("Please record or upload a valid audio sample before proceeding.")
    st.stop()

# 2) USER TEXT & (optional) KEY
st.header("2. Text & API key")
text_input = st.text_area(
    "Enter text to speak in your voice",
    value="Hello, this is my cloned voice!",
    height=120,
)
# NOTE(review): the key is collected but not passed to anything in this
# script; wire it into the model/service call if one actually requires it.
api_key = st.text_input("API Key (if your model needs one)", type="password")


# 3) LOAD & CACHE THE TTS PIPELINE
@st.cache_resource(show_spinner=False)
def load_tts_model():
    """Build the TTS model once and let Streamlit reuse it across reruns.

    Returns:
        A ``TTS`` instance, with GPU enabled when CUDA is available.
    """
    # replace with your chosen multispeaker/cloning model
    model_name = "IndexTeam/IndexTTS-1.5"
    # Coqui TTS uses its own GPU flag
    return TTS(
        model_name=model_name,
        progress_bar=False,
        gpu=torch.cuda.is_available(),
    )


tts = load_tts_model()

# 4) GENERATE
if st.button("▶️ Generate Speech"):
    if not text_input.strip():
        st.error("Please enter some text to synthesize.")
        st.stop()

    with st.spinner("Cloning your voice…"):
        # Decode the reference sample exactly once; soundfile signals
        # undecodable input with a RuntimeError subclass.
        try:
            samples, sample_rate = sf.read(io.BytesIO(audio_bytes))
        except RuntimeError as err:
            samples = None
            decode_error = err

        if samples is None:
            st.error(f"Could not decode the audio sample: {decode_error}")
            st.stop()

        # Reserve a temp path, then close the handle before writing so the
        # file can be reopened by name on every platform.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
            ref_path = tmp.name

        try:
            # save the reference audio to a temp WAV
            sf.write(ref_path, samples, samplerate=sample_rate)
            # do the TTS with your voice as reference
            wav = tts.tts(text=text_input, speaker_wav=ref_path)
        finally:
            # The reference file is only needed during synthesis.
            os.remove(ref_path)

        # Render the result in memory instead of leaving a file on disk;
        # `format` is required because a buffer has no file extension.
        out_buffer = io.BytesIO()
        sf.write(
            out_buffer,
            wav,
            samplerate=tts.synthesizer.output_sample_rate,
            format="WAV",
        )

    st.success("✅ Done!")
    st.audio(out_buffer.getvalue(), format="audio/wav")