Update README.md

pipeline_tag: text-to-speech
tags:
- tts
---

## Usage

Install the required libraries:

```bash
pip install torch soundfile transformers "datasets>=3.5.0,<4.0.0" numpy==1.26.4 "sentencepiece>=0.2.0"
```
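
If you prefer a pinned file, the same versions can be kept in a `requirements.txt` (the file name is just the usual convention, not a file shipped with this repo):

```text
torch
soundfile
transformers
datasets>=3.5.0,<4.0.0
numpy==1.26.4
sentencepiece>=0.2.0
```

and installed with `pip install -r requirements.txt`.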

Use the Python script below for inference:

```python
import torch
import soundfile as sf
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import os

# Load processor, model, and vocoder
processor = SpeechT5Processor.from_pretrained("danhtran2mind/Viet-SpeechT5-TTS-finetuning")
model = SpeechT5ForTextToSpeech.from_pretrained("danhtran2mind/Viet-SpeechT5-TTS-finetuning")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Speaker x-vectors; the CMU Arctic x-vector set is the usual source of
# SpeechT5 speaker embeddings, and indices 2000/7000 select the two voices below
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")

def generate_speech(text, voice, output_path="tests/test_output/tts_output.wav"):
    print(f"Generating speech for text: {text}, voice: {voice}, output: {output_path}")
    if not text or not voice:
        return None, "Please provide both text and voice selection."

    speaker_dict = {"male": 2000, "female": 7000}
    try:
        speaker_id = speaker_dict[voice.lower()]
        speaker_embedding = torch.tensor(embeddings_dataset[speaker_id]["xvector"]).unsqueeze(0)
        inputs = processor(text=text, return_tensors="pt")

        with torch.no_grad():
            speech = model.generate_speech(
                inputs["input_ids"],
                speaker_embeddings=speaker_embedding,
                vocoder=vocoder,
                attention_mask=inputs.get("attention_mask"),
            )

        # Create the output directory if needed, then save 16 kHz audio
        os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
        sf.write(output_path, speech.numpy(), samplerate=16000)
        print(f"Audio saved to {output_path}")
        return output_path, None
    except Exception as e:
        print(f"Error generating speech: {str(e)}")
        return None, f"Error generating speech: {str(e)}"

text = "<input_text>"
voice = "Female"  # choose "Male" or "Female"
generate_speech(text, voice, output_path="tests/test_output/tts_output.wav")
```
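
The script above runs on CPU. As a minimal sketch (not part of the original instructions), inference can be moved to a GPU when one is available, assuming the `model`, `vocoder`, `processor`, and `speaker_embedding` objects from the script are already in scope:

```python
import torch

# Move the model and vocoder to GPU if available (sketch; reuses objects
# created by the script above)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
vocoder.to(device)

inputs = processor(text="<input_text>", return_tensors="pt").to(device)
with torch.no_grad():
    speech = model.generate_speech(
        inputs["input_ids"],
        speaker_embeddings=speaker_embedding.to(device),
        vocoder=vocoder,
    )
speech = speech.cpu()  # move back to CPU before saving with soundfile
```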

Play the output TTS audio in a notebook:

```python
from IPython.display import Audio

# Render an inline audio player for the generated file (Jupyter/Colab)
Audio("tests/test_output/tts_output.wav")
```
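
Outside a notebook, the saved file can be sanity-checked with soundfile, which is already installed above (a small sketch, not from the original card):

```python
import soundfile as sf

# Read the generated file back and report duration and sample rate
data, sample_rate = sf.read("tests/test_output/tts_output.wav")
print(f"{len(data) / sample_rate:.2f} s of audio at {sample_rate} Hz")
```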
|