danhtran2mind's picture
Update app.py
e7bceb1 verified
import gradio as gr
import torch
import soundfile as sf
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import numpy as np
import json
import os
# Root folder that is searched recursively for the config.json files
# describing the example rows (it is the argument given to load_configs).
CONFIG_DIR = "assets/Viet-SpeechT5-TTS-finetuning"
# Load all config.json files
def load_configs(directory):
    """Collect example rows from every ``config.json`` found under *directory*.

    The tree is walked recursively. A config file contributes one example
    when it holds a JSON object with the keys ``input_text``, ``voice`` and
    ``output_audio_path``; the audio path is joined onto the folder that
    contains the config file. Unreadable, malformed or incomplete files are
    reported on stdout and skipped, so one bad file never hides the others.

    Args:
        directory: Root folder to search.

    Returns:
        A list of ``[input_text, voice, audio_path]`` lists. Sub-directories
        are visited in sorted order, so the result is stable across runs and
        filesystems. Empty when *directory* does not exist.
    """
    print(f"Searching for config files in: {directory}")
    if not os.path.exists(directory):
        print(f"Directory {directory} does not exist.")
        return []

    required_fields = ("input_text", "voice", "output_audio_path")
    examples = []
    for root, dirs, files in os.walk(directory):
        # os.walk yields entries in arbitrary filesystem order; sorting the
        # sub-directory list in place makes the traversal deterministic.
        dirs.sort()
        if "config.json" not in files:
            continue
        file_path = os.path.join(root, "config.json")
        print(f"Found config file: {file_path}")

        # Only the read/parse step is guarded, so programming errors in the
        # validation below are not silently reported as "Error reading".
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                config = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"Error reading {file_path}: {e}")
            continue

        # Valid JSON is not necessarily an object (it may be a list, a
        # string, ...), so check the type before looking up keys.
        if not isinstance(config, dict) or any(
            field not in config for field in required_fields
        ):
            print(f"Skipping {file_path}: Missing required fields")
            continue

        audio_path = config["output_audio_path"]
        if not isinstance(audio_path, str):
            # os.path.join would raise TypeError on null / numeric values.
            print(f"Skipping {file_path}: output_audio_path must be a string")
            continue

        examples.append(
            [config["input_text"], config["voice"], os.path.join(root, audio_path)]
        )

    print(f"Total examples loaded: {len(examples)}")
    return examples
# Fetch the text processor, the TTS model and the HiFi-GAN vocoder once at
# import time; any failure is logged and then re-raised so start-up aborts.
print("Loading processor, model, and vocoder...")
try:
    processor = SpeechT5Processor.from_pretrained(
        "danhtran2mind/Viet-SpeechT5-TTS-finetuning"
    )
    model = SpeechT5ForTextToSpeech.from_pretrained(
        "danhtran2mind/Viet-SpeechT5-TTS-finetuning"
    )
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    print("Models loaded successfully.")
except Exception as e:
    print(f"Error loading models: {e}")
    raise

# Fetch the x-vector dataset that generate_speech indexes for its speaker
# embeddings; same log-and-re-raise policy as above.
print("Loading speaker embeddings...")
try:
    embeddings_dataset = load_dataset(
        "Matthijs/cmu-arctic-xvectors",
        split="validation",
    )
    print("Speaker embeddings loaded successfully.")
except Exception as e:
    print(f"Error loading embeddings: {e}")
    raise
def generate_speech(text, voice, output_path="output_speech.wav"):
    """Synthesize *text* with the selected voice and write it to a WAV file.

    Args:
        text: Text to speak. Empty or whitespace-only text is rejected.
        voice: ``"male"`` or ``"female"``, matched case-insensitively.
        output_path: Destination file; audio is written at 16000 Hz.

    Returns:
        ``(output_path, None)`` on success, or ``(None, message)`` when the
        input is rejected or synthesis fails. Errors are returned instead of
        raised so the caller can display them.
    """
    print(f"Generating speech for text: {text}, voice: {voice}, output: {output_path}")
    # Whitespace-only text carries nothing to synthesize; treat it as empty
    # instead of handing it to the model.
    if not text or not voice or not str(text).strip():
        return None, "Please provide both text and voice selection."

    # Row index into the speaker-embedding dataset for each voice.
    speaker_dict = {"male": 2000, "female": 7000}
    voice_key = str(voice).strip().lower()
    if voice_key not in speaker_dict:
        # Reject unknown voices up front with a readable message rather than
        # surfacing the bare KeyError repr from the lookup.
        choices = ", ".join(speaker_dict)
        message = (
            f"Error generating speech: unsupported voice {voice!r} "
            f"(expected one of: {choices})"
        )
        print(message)
        return None, message

    try:
        speaker_id = speaker_dict[voice_key]
        # unsqueeze(0) adds a leading dimension to the stored x-vector.
        speaker_embedding = torch.tensor(
            embeddings_dataset[speaker_id]["xvector"]
        ).unsqueeze(0)
        inputs = processor(text=text, return_tensors="pt")
        # Inference only: no gradients are needed.
        with torch.no_grad():
            speech = model.generate_speech(
                inputs["input_ids"],
                speaker_embeddings=speaker_embedding,
                vocoder=vocoder,
                attention_mask=inputs.get("attention_mask"),
            )
        sf.write(output_path, speech.numpy(), samplerate=16000)
        print(f"Audio saved to {output_path}")
        return output_path, None
    except Exception as e:
        # UI boundary: report any failure as a message instead of crashing.
        print(f"Error generating speech: {str(e)}")
        return None, f"Error generating speech: {str(e)}"
def load_existing_audio(text, voice, output_audio_path):
    """Echo an example's text and voice together with its stored audio path.

    Returns:
        ``(text, voice, audio, status)``. ``audio`` is *output_audio_path*
        when it is truthy, otherwise ``None``; ``status`` is the matching
        message.
    """
    print(f"Load: {output_audio_path}")
    if output_audio_path:
        status = "Successfully Loaded Example Audio"
        return text, voice, output_audio_path, status
    # No path supplied: clear the audio slot and ask for a selection.
    status = "Please select an existing audio file."
    return text, voice, None, status
# Gather the example rows that the Examples component is built from
print("Loading examples...")
examples = load_configs(directory=CONFIG_DIR)
if not examples:
    # Not fatal: the app still starts, just without any example rows.
    print("Warning: No examples loaded. Check CONFIG_DIR and config.json files.")
# Build the Gradio UI
with gr.Blocks() as iface:
    gr.Markdown("# Vietnamese Text-to-Speech")
    gr.Markdown("Generate speech from Vietnamese text using SpeechT5 or load existing audio from examples.")

    # One row, two columns: inputs in the first, results in the second
    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                lines=5,
                label="Vietnamese Text",
                placeholder="Enter text here...",
            )
            voice_input = gr.Radio(
                label="Voice",
                choices=["Male", "Female"],
                value="Male",
            )
        with gr.Column():
            output = gr.Audio(label="Generated Speech")
            error_output = gr.Textbox(label="Error Message")

    # Clicking "Generate" calls generate_speech with the current text and
    # voice; its two return values fill the audio player and message box.
    gr.Button("Generate").click(
        fn=generate_speech,
        inputs=[text_input, voice_input],
        outputs=[output, error_output],
    )

    # Example rows; each one carries text, voice and an audio path
    gr.Examples(
        label="Examples (Loads existing audio)",
        examples=examples,
        fn=load_existing_audio,
        inputs=[text_input, voice_input, output],
        outputs=[text_input, voice_input, output, error_output],
    )
# Launch app
if __name__ == "__main__":
    print("Launching Gradio interface...")
    # launch() blocks the main thread while the server is running, so the
    # browser hint has to be printed before the call; afterwards it would
    # only appear once the server had already shut down.
    print("Once the server is up, open your browser to http://localhost:7860")
    try:
        iface.launch()
    except Exception as e:
        print(f"Error launching Gradio interface: {e}")
        # Re-raise, as the model/embedding loading steps do, so a failed
        # launch ends with a traceback and a non-zero exit status instead
        # of looking like a clean exit.
        raise