danhtran2mind committed
Commit 1b8d2da · verified · 1 Parent(s): fcab2d9

Update README.md

Files changed (1)
1. README.md +64 -1
README.md CHANGED
@@ -7,4 +7,67 @@ base_model:
pipeline_tag: text-to-speech
tags:
- tts
---

## Usage

Install the required libraries:
```bash
pip install torch soundfile transformers "datasets>=3.5.0,<4.0.0" numpy==1.26.4 "sentencepiece>=0.2.0"
```
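Optionally, a quick sanity check (a minimal sketch, not required) confirms the packages import and reports whether a GPU is visible:

```python
# Optional sanity check: confirm the key packages import and report versions.
import torch
import transformers
import datasets
import soundfile as sf

print("torch:", torch.__version__, "| CUDA available:", torch.cuda.is_available())
print("transformers:", transformers.__version__)
print("datasets:", datasets.__version__)
print("soundfile:", sf.__version__)
```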

Use the Python script below for inference. The male/female voices are taken from a speaker x-vector dataset; the widely used `Matthijs/cmu-arctic-xvectors` set is assumed here:
```python
import os

import torch
import soundfile as sf
from datasets import load_dataset
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

# Load processor, model, and vocoder
processor = SpeechT5Processor.from_pretrained("danhtran2mind/Viet-SpeechT5-TTS-finetuning")
model = SpeechT5ForTextToSpeech.from_pretrained("danhtran2mind/Viet-SpeechT5-TTS-finetuning")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Speaker x-vectors for the "male"/"female" voices below.
# Assumption: the CMU ARCTIC x-vector dataset is used; swap in your own
# embeddings dataset if you fine-tuned with different speakers.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")

def generate_speech(text, voice, output_path="tests/test_output/tts_output.wav"):
    print(f"Generating speech for text: {text}, voice: {voice}, output: {output_path}")
    if not text or not voice:
        return None, "Please provide both text and voice selection."

    # Indices into the x-vector dataset for the two supported voices
    speaker_dict = {"male": 2000, "female": 7000}
    try:
        speaker_id = speaker_dict[voice.lower()]
        speaker_embedding = torch.tensor(embeddings_dataset[speaker_id]["xvector"]).unsqueeze(0)
        inputs = processor(text=text, return_tensors="pt")

        with torch.no_grad():
            speech = model.generate_speech(
                inputs["input_ids"],
                speaker_embeddings=speaker_embedding,
                vocoder=vocoder,
                attention_mask=inputs.get("attention_mask")
            )

        # Ensure the output directory exists, then write 16 kHz audio
        os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
        sf.write(output_path, speech.numpy(), samplerate=16000)
        print(f"Audio saved to {output_path}")
        return output_path, None
    except Exception as e:
        print(f"Error generating speech: {str(e)}")
        return None, f"Error generating speech: {str(e)}"

text = "<input_text>"  # the Vietnamese text to synthesize
voice = "Female"  # choose "Male" or "Female"
generate_speech(text, voice, output_path="tests/test_output/tts_output.wav")
```
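As an illustrative sketch, the `generate_speech` helper above can also render both voices in one pass; the output file names here are arbitrary:

```python
# Illustrative only: synthesize the same text with both available voices,
# reusing the generate_speech() function defined above.
sample_text = "<input_text>"  # replace with the Vietnamese text to synthesize
for voice_name in ("Male", "Female"):
    path, error = generate_speech(
        sample_text,
        voice_name,
        output_path=f"tests/test_output/tts_{voice_name.lower()}.wav",
    )
    if error:
        print(f"{voice_name}: failed with {error}")
    else:
        print(f"{voice_name}: saved to {path}")
```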

Play the output TTS audio (for example, in a Jupyter notebook):
```python
from IPython.display import Audio

Audio("tests/test_output/tts_output.wav")
```
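Outside a notebook, a quick optional check is to read the generated file back with `soundfile` and report its properties:

```python
# Optional check: read the generated file back and report its properties.
import soundfile as sf

audio, sample_rate = sf.read("tests/test_output/tts_output.wav")
print(f"Sample rate: {sample_rate} Hz")
print(f"Duration: {len(audio) / sample_rate:.2f} s")
```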