Spaces:

danhtran2mind
/

Viet-SpeechT5-TTS-finetuning

Sleeping

App Files Files Community

Viet-SpeechT5-TTS-finetuning / app.py

danhtran2mind

Update app.py

e7bceb1 verified about 1 month ago

raw

history blame contribute delete

5.16 kB

	import gradio as gr
	import torch
	import soundfile as sf
	from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
	from datasets import load_dataset
	import numpy as np
	import json
	import os

	# Root directory passed to load_configs(), which walks it recursively looking
	# for example config.json files.
	CONFIG_DIR = "assets/Viet-SpeechT5-TTS-finetuning"

	# Load all config.json files
	def load_configs(directory):
	    """Collect example rows from every ``config.json`` found under *directory*.

	    A config contributes a row only when it is a JSON object containing
	    ``input_text``, ``voice`` and ``output_audio_path``, and the audio file it
	    points to (resolved relative to the config's own folder) exists.
	    Anything else is reported on stdout and skipped.

	    Args:
	        directory: Root folder that is searched recursively.

	    Returns:
	        A list of ``[input_text, voice, audio_path]`` rows, ordered by the
	        sorted directory traversal. Empty when *directory* does not exist or
	        holds no usable config.
	    """
	    print(f"Searching for config files in: {directory}")
	    if not os.path.exists(directory):
	        print(f"Directory {directory} does not exist.")
	        return []

	    required_fields = ("input_text", "voice", "output_audio_path")
	    examples = []
	    for root, dirs, files in os.walk(directory):
	        # os.walk yields entries in arbitrary filesystem order; sorting the
	        # sub-directories in place makes the traversal, and therefore the
	        # order of the returned examples, reproducible.
	        dirs.sort()
	        if "config.json" not in files:
	            continue

	        file_path = os.path.join(root, "config.json")
	        print(f"Found config file: {file_path}")

	        # Only reading and parsing can fail here; json.JSONDecodeError and
	        # UnicodeDecodeError are both ValueError subclasses.
	        try:
	            with open(file_path, "r", encoding="utf-8") as f:
	                config = json.load(f)
	        except (OSError, ValueError) as e:
	            print(f"Error reading {file_path}: {e}")
	            continue

	        # A JSON list or string would pass an ``in`` test by accident, so the
	        # top-level type is checked explicitly.
	        if not isinstance(config, dict) or any(
	            field not in config for field in required_fields
	        ):
	            print(f"Skipping {file_path}: Missing required fields")
	            continue

	        audio_name = config["output_audio_path"]
	        if not isinstance(audio_name, str):
	            print(f"Skipping {file_path}: output_audio_path must be a string")
	            continue

	        audio_path = os.path.join(root, audio_name)
	        if not os.path.isfile(audio_path):
	            # An example whose audio is missing cannot be loaded later.
	            print(f"Skipping {file_path}: audio file not found: {audio_path}")
	            continue

	        examples.append([config["input_text"], config["voice"], audio_path])

	    print(f"Total examples loaded: {len(examples)}")
	    return examples

	# Load processor, model, and vocoder once at import time; generate_speech()
	# reads them as module-level globals. Any failure is logged and re-raised so
	# the app does not start without them.
	print("Loading processor, model, and vocoder...")
	try:
	    processor = SpeechT5Processor.from_pretrained("danhtran2mind/Viet-SpeechT5-TTS-finetuning")
	    model = SpeechT5ForTextToSpeech.from_pretrained("danhtran2mind/Viet-SpeechT5-TTS-finetuning")
	    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
	except Exception as e:
	    print(f"Error loading models: {e}")
	    raise
	else:
	    print("Models loaded successfully.")

	# Load speaker embeddings; generate_speech() indexes this dataset by row to
	# obtain the "xvector" entry for the chosen voice. Failures are logged and
	# re-raised.
	print("Loading speaker embeddings...")
	try:
	    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
	except Exception as e:
	    print(f"Error loading embeddings: {e}")
	    raise
	else:
	    print("Speaker embeddings loaded successfully.")

	def generate_speech(text, voice, output_path="output_speech.wav"):
	    """Synthesize *text* with the selected voice and write it to *output_path*.

	    Args:
	        text: Text to synthesize. Empty or whitespace-only text is rejected.
	        voice: Voice name, matched case-insensitively against "male" and
	            "female".
	        output_path: Destination audio file, written with a 16000 Hz sample
	            rate.

	    Returns:
	        ``(output_path, None)`` on success, or ``(None, message)`` when the
	        input is rejected or synthesis fails. Exceptions raised during
	        synthesis are reported through the message instead of propagating.
	    """
	    print(f"Generating speech for text: {text}, voice: {voice}, output: {output_path}")

	    # Whitespace-only text carries nothing to synthesize, so it is treated
	    # the same as empty text.
	    text_is_blank = not text or (isinstance(text, str) and not text.strip())
	    if text_is_blank or not voice:
	        return None, "Please provide both text and voice selection."

	    # Row index into the module-level embeddings_dataset used for each voice.
	    speaker_dict = {"male": 2000, "female": 7000}
	    speaker_id = speaker_dict.get(str(voice).lower())
	    if speaker_id is None:
	        # Reject unknown voices with an explicit message instead of letting a
	        # KeyError surface as "Error generating speech: 'xyz'".
	        supported = ", ".join(name.capitalize() for name in speaker_dict)
	        return None, f"Unsupported voice '{voice}'. Choose one of: {supported}."

	    try:
	        speaker_embedding = torch.tensor(embeddings_dataset[speaker_id]["xvector"]).unsqueeze(0)
	        inputs = processor(text=text, return_tensors="pt")

	        with torch.no_grad():
	            speech = model.generate_speech(
	                inputs["input_ids"],
	                speaker_embeddings=speaker_embedding,
	                vocoder=vocoder,
	                attention_mask=inputs.get("attention_mask"),
	            )

	        sf.write(output_path, speech.numpy(), samplerate=16000)
	        print(f"Audio saved to {output_path}")
	        return output_path, None
	    except Exception as e:
	        # The caller displays the second tuple element, so failures are
	        # reported there rather than raised.
	        message = f"Error generating speech: {e}"
	        print(message)
	        return None, message

	def load_existing_audio(text, voice, output_audio_path):
	    """Echo an example's fields back together with a status message.

	    Args:
	        text: Example text, returned unchanged.
	        voice: Example voice, returned unchanged.
	        output_audio_path: Audio value of the example; a falsy value means
	            nothing was selected.

	    Returns:
	        ``(text, voice, audio, status)`` where ``audio`` is
	        *output_audio_path* when it is truthy and ``None`` otherwise.
	    """
	    print(f"Load: {output_audio_path}")
	    if output_audio_path:
	        audio, status = output_audio_path, "Successfully Loaded Example Audio"
	    else:
	        audio, status = None, "Please select an existing audio file."
	    return text, voice, audio, status

	# Load examples
	print("Loading examples...")
	examples = load_configs(CONFIG_DIR)
	if not examples:
	    print("Warning: No examples loaded. Check CONFIG_DIR and config.json files.")

	# Create Gradio interface
	with gr.Blocks() as iface:
	    gr.Markdown("# Vietnamese Text-to-Speech")
	    gr.Markdown("Generate speech from Vietnamese text using SpeechT5 or load existing audio from examples.")

	    # Inputs in the first column, results in the second
	    with gr.Row():
	        with gr.Column():
	            text_input = gr.Textbox(label="Vietnamese Text", placeholder="Enter text here...", lines=5)
	            voice_input = gr.Radio(choices=["Male", "Female"], label="Voice", value="Male")

	        with gr.Column():
	            output = gr.Audio(label="Generated Speech")
	            error_output = gr.Textbox(label="Error Message")

	    # Button to generate speech. Only text and voice are passed, so
	    # generate_speech() writes to its default output path.
	    generate_button = gr.Button("Generate")
	    generate_button.click(
	        fn=generate_speech,
	        inputs=[text_input, voice_input],
	        outputs=[output, error_output],
	    )

	    # Examples component. Each row is [text, voice, audio path], which is why
	    # the audio output component also appears in `inputs`. It is only built
	    # when at least one example was found, so a missing or empty CONFIG_DIR
	    # does not hand an empty list to gr.Examples.
	    if examples:
	        gr.Examples(
	            examples=examples,
	            fn=load_existing_audio,
	            inputs=[text_input, voice_input, output],
	            outputs=[text_input, voice_input, output, error_output],
	            label="Examples (Loads existing audio)",
	        )

	# Launch app
	if __name__ == "__main__":
	    print("Launching Gradio interface...")
	    try:
	        # launch() blocks the main thread while the server is running, so no
	        # "launched" message is printed after it: that line would only be
	        # reached once the server has already stopped.
	        iface.launch()
	    except Exception as e:
	        print(f"Error launching Gradio interface: {e}")
	        # Re-raise, as the model and embedding loaders do, so a failed start
	        # ends with a traceback and a non-zero exit status.
	        raise