---
license: mit
language:
- vi
base_model:
- microsoft/speecht5_tts
pipeline_tag: text-to-speech
tags:
- tts
---

## Usage

Install the required libraries:

```bash
pip install torch soundfile transformers "datasets>=3.5.0,<4.0.0" "numpy==1.26.4" "sentencepiece>=0.2.0"
```

Use the Python script below for inference:

```python
import json
import os

import numpy as np
import soundfile as sf
import torch
from datasets import load_dataset
from transformers import (
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
    SpeechT5Processor,
)

# Load processor, model, and vocoder.
# The processor and the acoustic model come from the fine-tuned Vietnamese
# checkpoint; the vocoder is the stock SpeechT5 HiFi-GAN.
processor = SpeechT5Processor.from_pretrained("danhtran2mind/Viet-SpeechT5-TTS-finetuning")
model = SpeechT5ForTextToSpeech.from_pretrained("danhtran2mind/Viet-SpeechT5-TTS-finetuning")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Speaker x-vectors: generate_speech() indexes this dataset by row to obtain the
# speaker embedding, so it must exist before the function is called.
# NOTE(review): CMU ARCTIC x-vectors is the set used in the SpeechT5
# documentation examples -- confirm it is the one used during fine-tuning.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")


def generate_speech(text, voice, output_path="tests/test_output/tts_output.wav"):
    """Synthesize ``text`` with the selected voice and write it to a WAV file.

    Reads the module-level ``processor``, ``model``, ``vocoder`` and
    ``embeddings_dataset`` objects.

    Args:
        text: Sentence to synthesize.
        voice: ``"male"`` or ``"female"``; letter case and surrounding
            whitespace are ignored.
        output_path: Destination WAV file. Missing parent directories are
            created before writing.

    Returns:
        ``(output_path, None)`` on success, or ``(None, error_message)`` when
        the input is invalid or generation fails. Failures are reported through
        the return value instead of being raised.
    """
    print(f"Generating speech for text: {text}, voice: {voice}, output: {output_path}")
    if not text or not voice:
        return None, "Please provide both text and voice selection."

    # Row indices into embeddings_dataset used as the two speaker presets.
    speaker_dict = {"male": 2000, "female": 7000}
    speaker_id = speaker_dict.get(str(voice).strip().lower())
    if speaker_id is None:
        # Report an unknown voice explicitly instead of leaking a bare KeyError repr.
        message = (
            f"Error generating speech: unsupported voice {voice!r}; "
            'choose "Male" or "Female".'
        )
        print(message)
        return None, message

    try:
        speaker_embedding = torch.tensor(embeddings_dataset[speaker_id]["xvector"]).unsqueeze(0)
        inputs = processor(text=text, return_tensors="pt")

        with torch.no_grad():
            speech = model.generate_speech(
                inputs["input_ids"],
                speaker_embeddings=speaker_embedding,
                vocoder=vocoder,
                attention_mask=inputs.get("attention_mask"),
            )

        # Create the target directory first so the default path works even when
        # tests/test_output/ does not exist yet.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        sf.write(output_path, speech.numpy(), samplerate=16000)
        print(f"Audio saved to {output_path}")
        return output_path, None
    except Exception as e:
        # Deliberately broad: this function's contract is to hand any failure
        # back to the caller as an error string.
        message = f"Error generating speech: {e}"
        print(message)
        return None, message


# Example run: replace the placeholder with the sentence to synthesize.
text = "<input_text>"
voice = "Female"  # choose "Male" or "Female"
generate_speech(
    text=text,
    voice=voice,
    output_path="tests/test_output/tts_output.wav",
)
```

Play the generated TTS audio (for example, in a Jupyter notebook):

```python
from IPython.display import Audio

# Same path that the inference script passes as output_path.
Audio("tests/test_output/tts_output.wav")
```