---
license: mit
language:
- vi
base_model:
- microsoft/speecht5_tts
pipeline_tag: text-to-speech
tags:
- tts
---

## Usage

Install the required libraries:

```bash
pip install torch soundfile transformers "datasets>=3.5.0,<4.0.0" "numpy==1.26.4" "sentencepiece>=0.2.0"
```

Use the Python script below for inference:

```python
import torch
import soundfile as sf
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import numpy as np
import json
import os

# Load processor, model, and vocoder
processor = SpeechT5Processor.from_pretrained("danhtran2mind/Viet-SpeechT5-TTS-finetuning")
model = SpeechT5ForTextToSpeech.from_pretrained("danhtran2mind/Viet-SpeechT5-TTS-finetuning")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Load the speaker x-vectors that generate_speech() indexes by speaker id.
# NOTE(review): the dataset name is assumed from the standard SpeechT5 examples
# and the speaker ids used below -- confirm it matches the fine-tuning setup.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")


def generate_speech(text, voice, output_path="tests/test_output/tts_output.wav"):
    """Synthesize `text` with the selected voice and write it to a WAV file.

    Args:
        text: The text to convert to speech. Must be non-empty.
        voice: "Male" or "Female" (case-insensitive).
        output_path: Destination of the generated 16 kHz WAV file. Missing
            parent directories are created.

    Returns:
        A tuple `(output_path, None)` on success, or `(None, error_message)`
        when the input is invalid or generation fails.
    """
    print(f"Generating speech for text: {text}, voice: {voice}, output: {output_path}")
    if not text or not voice:
        return None, "Please provide both text and voice selection."

    # Row indices into `embeddings_dataset` used as the two reference voices.
    speaker_dict = {"male": 2000, "female": 7000}
    voice_key = voice.strip().lower()
    if voice_key not in speaker_dict:
        return None, f"Unknown voice '{voice}'. Please choose 'Male' or 'Female'."

    try:
        speaker_id = speaker_dict[voice_key]
        # Add a batch dimension: the model expects one embedding per input.
        speaker_embedding = torch.tensor(embeddings_dataset[speaker_id]["xvector"]).unsqueeze(0)

        inputs = processor(text=text, return_tensors="pt")
        with torch.no_grad():
            speech = model.generate_speech(
                inputs["input_ids"],
                speaker_embeddings=speaker_embedding,
                vocoder=vocoder,
                attention_mask=inputs.get("attention_mask"),
            )

        # soundfile does not create missing directories, so do it up front.
        os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
        sf.write(output_path, speech.numpy(), samplerate=16000)
        print(f"Audio saved to {output_path}")
        return output_path, None
    except Exception as e:
        # Report failures to the caller instead of raising, so the function
        # always returns a (path, error) pair.
        print(f"Error generating speech: {str(e)}")
        return None, f"Error generating speech: {str(e)}"


text = "Xin chào, đây là giọng nói được tạo bởi mô hình SpeechT5."  # replace with your own text
voice = "Female"  # choose "Male" or "Female"
output_path, error = generate_speech(text, voice, output_path="tests/test_output/tts_output.wav")
if error:
    print(error)
```

Play the output TTS audio:

```python
from IPython.display import Audio

Audio("tests/test_output/tts_output.wav")
```