# VoiceCloner / app.py
# tahirturk's picture
# Update app.py
# f12ab45 verified
import spaces
import gradio as gr
import torch
from TTS.api import TTS
import os
from pydub import AudioSegment
import re
# Accept the Coqui TTS license terms non-interactively.
os.environ["COQUI_TOS_AGREED"] = "1"

# Make sure the folder holding the sample reference clips exists.
os.makedirs("audio", exist_ok=True)

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the XTTS v2 model once at import time and move it to the chosen device.
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
tts = tts.to(device)
# Function for long text voice cloning
@spaces.GPU(enable_queue=True)
def clone(text, audio):
    """Synthesize ``text`` in the voice of the reference clip ``audio``.

    The text is split into sentence-sized chunks, each chunk is rendered with
    the module-level XTTS model (``tts``), and the rendered chunks are
    concatenated into a single WAV file.

    Args:
        text: Text to speak. It is split after ``.``, ``!`` or ``?`` when
            followed by whitespace (spaces or newlines).
        audio: Filesystem path of the reference voice recording; passed to
            the model as ``speaker_wav``.

    Returns:
        Path of the generated WAV file.

    Raises:
        gr.Error: If ``text`` is empty/blank or no reference audio was given.
    """
    import tempfile  # local import so this block does not rely on file-level edits

    # Fail early with a readable message instead of an opaque model error
    # (missing clip) or a silently empty output file (blank text).
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")
    if not audio:
        raise gr.Error("Please upload a reference audio file.")

    # Split input into sentences/phrases. ``\s+`` (rather than a literal
    # space) also splits sentences that are separated by line breaks.
    pieces = re.split(r"(?<=[.!?])\s+", text)
    sentences = [piece.strip() for piece in pieces if piece.strip()]

    final_audio = AudioSegment.silent(duration=0)

    # Render every chunk inside a per-call scratch directory: concurrent
    # requests cannot clobber each other's chunk files, and the files are
    # removed automatically once they have been loaded into memory.
    with tempfile.TemporaryDirectory() as work_dir:
        for i, chunk in enumerate(sentences):
            temp_path = os.path.join(work_dir, f"chunk_{i}.wav")
            tts.tts_to_file(
                text=chunk,
                speaker_wav=audio,
                language="en",
                file_path=temp_path,
            )
            final_audio += AudioSegment.from_wav(temp_path)

    # Merge chunks into one file. A unique name per call keeps simultaneous
    # users from overwriting each other's result.
    # NOTE(review): this file is not deleted here because the caller still
    # needs to read it after the function returns.
    fd, output_path = tempfile.mkstemp(prefix="clone_", suffix=".wav")
    os.close(fd)
    final_audio.export(output_path, format="wav")
    return output_path
# UI
#
# The long string literals are kept as module-level constants so that their
# content does not depend on how deeply the layout code is indented.

# Custom CSS, injected into the page through a gr.HTML <style> element.
CUSTOM_CSS = """
<style>
body {
background: linear-gradient(135deg, #0f172a, #1e293b);
font-family: 'Inter', sans-serif;
color: #f8fafc;
}
.gradio-container {
max-width: 1200px;
margin: auto;
}
.gr-block {
background: #1e293b;
border-radius: 16px;
box-shadow: 0 8px 20px rgba(0,0,0,0.4);
padding: 20px;
transition: all 0.3s ease-in-out;
}
.gr-block:hover {
box-shadow: 0 12px 28px rgba(0,0,0,0.6);
}
h1, h2, h3 {
color: #06b6d4;
font-weight: 700;
}
.gr-button.primary {
background: linear-gradient(90deg, #06b6d4, #14b8a6);
border: none;
border-radius: 12px;
font-weight: bold;
transition: 0.3s;
}
.gr-button.primary:hover {
background: linear-gradient(90deg, #14b8a6, #06b6d4);
transform: scale(1.05);
}
.gr-textbox textarea {
background: #0f172a !important;
color: #f8fafc !important;
border-radius: 12px;
border: 1px solid #334155;
}
.gr-textbox textarea:focus {
border-color: #06b6d4;
outline: none !important;
box-shadow: 0 0 10px rgba(6,182,212,0.5);
}
.gr-audio input, .gr-audio {
border-radius: 12px !important;
border: 1px solid #334155 !important;
background: #0f172a !important;
}
</style>
"""

# Header text shown above the inputs (mis-encoded emoji and dash repaired).
INTRO_MARKDOWN = """
# 🎙️ Voice Clone Studio By Tahir Turk
Clone any voice by uploading a short reference audio file
and typing what you want it to say.
**Powered by XTTS v2 — multilingual voice cloning.**
"""

# Usage tips shown under the output player (mis-encoded en dash repaired).
TIPS_MARKDOWN = """
---
⚑ **Tips for Best Results**
- Use a **clean, clear** reference audio (5–15 seconds works best).
- Long text will be split automatically for natural speech.
- You can generate **minutes of audio** now without cutoff.
---
"""

# Each example is [text to speak, path of the reference clip].
SAMPLE_VOICES = [
    ["Hey! It's me Dorthy, from the Wizard of Oz. Type in whatever you'd like me to say.", "./audio/Wizard-of-Oz-Dorthy.wav"],
    ["It's me Vito Corleone, from the Godfather. Type in whatever you'd like me to say.", "./audio/Godfather.wav"],
    ["Hey, it's me Paris Hilton. Type in whatever you'd like me to say.", "./audio/Paris-Hilton.mp3"],
    ["Hey, it's me Megan Fox from Transformers. Type in whatever you'd like me to say.", "./audio/Megan-Fox.mp3"],
    ["Hey there, it's me Jeff Goldblum. Type in whatever you'd like me to say.", "./audio/Jeff-Goldblum.mp3"],
    ["Hey there, it's me Heath Ledger as the Joker. Type in whatever you'd like me to say.", "./audio/Heath-Ledger.mp3"],
]

with gr.Blocks(theme=gr.themes.Soft(primary_hue="teal", secondary_hue="cyan", neutral_hue="slate")) as demo:
    # Custom CSS
    gr.HTML(CUSTOM_CSS)

    with gr.Row():
        # Left column: intro text plus the two inputs and the trigger button.
        with gr.Column(scale=1):
            gr.Markdown(INTRO_MARKDOWN)
            text_input = gr.Textbox(
                label="Enter your text",
                placeholder="Type anything you'd like the cloned voice to say...",
                lines=6,
            )
            audio_input = gr.Audio(
                type="filepath",
                label="Upload voice reference (WAV or MP3)",
            )
            submit_btn = gr.Button("✨ Generate Voice", variant="primary")

        # Right column: generated audio player and usage tips.
        with gr.Column(scale=1):
            output_audio = gr.Audio(type="filepath", label="🔊 Generated Voice Output")
            gr.Markdown(TIPS_MARKDOWN)

    with gr.Row():
        # No ``fn`` is given, so selecting an example only fills the inputs.
        gr.Examples(
            examples=SAMPLE_VOICES,
            inputs=[text_input, audio_input],
            outputs=[output_audio],
            label="🎭 Try with these sample voices",
        )

    # Run the cloning function when the button is pressed.
    submit_btn.click(fn=clone, inputs=[text_input, audio_input], outputs=output_audio)

demo.launch()