Spaces · Running on Zero
Commit f01210e · Parent: ac26d96 · add more examples
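This commit drops the `advanced_model` code path from `VieNeuTTS`: the constructor no longer records whether `backbone_repo` ends in `"1000h"`, and the three call sites that branched on that flag (`_apply_chat_template`, `_infer_ggml`, and `_infer_stream_ggml`) now phonemize with `phonemize_with_dict` unconditionally.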
utils/__pycache__/__init__.cpython-312.pyc CHANGED
Binary files a/utils/__pycache__/__init__.cpython-312.pyc and b/utils/__pycache__/__init__.cpython-312.pyc differ

utils/__pycache__/normalize_text.cpython-312.pyc CHANGED
Binary files a/utils/__pycache__/normalize_text.cpython-312.pyc and b/utils/__pycache__/normalize_text.cpython-312.pyc differ

utils/__pycache__/phonemize_text.cpython-312.pyc CHANGED
Binary files a/utils/__pycache__/phonemize_text.cpython-312.pyc and b/utils/__pycache__/phonemize_text.cpython-312.pyc differ
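The three entries above are CPython 3.12 bytecode caches for the `utils` package; because they are committed to the repo, they show up as changed binaries whenever the interpreter regenerates them after a source edit.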
vieneu_tts.py CHANGED
@@ -37,7 +37,7 @@ def _linear_overlap_add(frames: list[np.ndarray], stride: int) -> np.ndarray:
 class VieNeuTTS:
     def __init__(
         self,
-        backbone_repo="pnnbao-ump/VieNeu-TTS
+        backbone_repo="pnnbao-ump/VieNeu-TTS",
         backbone_device="cpu",
         codec_repo="neuphonic/neucodec",
         codec_device="cpu",
@@ -57,9 +57,6 @@ class VieNeuTTS:
         self._is_quantized_model = False
         self._is_onnx_codec = False
 
-        # backbone repo
-        self.advanced_model = backbone_repo.endswith("1000h")
-
         # HF tokenizer
         self.tokenizer = None
 
@@ -194,10 +191,7 @@ class VieNeuTTS:
         return recon[0, 0, :]
 
     def _apply_chat_template(self, ref_codes: list[int], ref_text: str, input_text: str) -> list[int]:
-        if self.advanced_model:
-            input_text = phonemize_with_dict(ref_text) + " " + phonemize_with_dict(input_text)
-        else:
-            input_text = phonemize_text(ref_text) + " " + phonemize_text(input_text)
+        input_text = phonemize_with_dict(ref_text) + " " + phonemize_with_dict(input_text)
 
         speech_replace = self.tokenizer.convert_tokens_to_ids("<|SPEECH_REPLACE|>")
         speech_gen_start = self.tokenizer.convert_tokens_to_ids("<|SPEECH_GENERATION_START|>")
@@ -246,12 +240,8 @@ class VieNeuTTS:
         return output_str
 
     def _infer_ggml(self, ref_codes: list[int], ref_text: str, input_text: str) -> str:
-        if self.advanced_model:
-            ref_text = phonemize_with_dict(ref_text)
-            input_text = phonemize_with_dict(input_text)
-        else:
-            ref_text = phonemize_text(ref_text)
-            input_text = phonemize_text(input_text)
+        ref_text = phonemize_with_dict(ref_text)
+        input_text = phonemize_with_dict(input_text)
 
         codes_str = "".join([f"<|speech_{idx}|>" for idx in ref_codes])
         prompt = (
@@ -269,12 +259,8 @@ class VieNeuTTS:
         return output_str
 
     def _infer_stream_ggml(self, ref_codes: torch.Tensor, ref_text: str, input_text: str) -> Generator[np.ndarray, None, None]:
-        if self.advanced_model:
-            ref_text = phonemize_with_dict(ref_text)
-            input_text = phonemize_with_dict(input_text)
-        else:
-            ref_text = phonemize_text(ref_text)
-            input_text = phonemize_text(input_text)
+        ref_text = phonemize_with_dict(ref_text)
+        input_text = phonemize_with_dict(input_text)
 
         codes_str = "".join([f"<|speech_{idx}|>" for idx in ref_codes])
         prompt = (
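All three hunks in vieneu_tts.py collapse the same two-branch pattern. A minimal before/after sketch of that pattern, assuming `phonemize_text` and `phonemize_with_dict` are string-to-string helpers in the repo's `utils.phonemize_text` module (an assumption inferred from the changed `.pyc` files above), not the repo's actual code:

    # Sketch of the behavior change only. Both helpers are assumed to map
    # str -> str, as their use in the diff suggests.
    from utils.phonemize_text import phonemize_text, phonemize_with_dict


    def phonemize_pair_old(ref_text: str, input_text: str, advanced_model: bool) -> tuple[str, str]:
        # Before f01210e: the dictionary-backed phonemizer was reserved for
        # backbones whose repo name ended in "1000h".
        if advanced_model:
            return phonemize_with_dict(ref_text), phonemize_with_dict(input_text)
        return phonemize_text(ref_text), phonemize_text(input_text)


    def phonemize_pair_new(ref_text: str, input_text: str) -> tuple[str, str]:
        # After f01210e: phonemize_with_dict is applied unconditionally.
        return phonemize_with_dict(ref_text), phonemize_with_dict(input_text)

With the flag gone, every backbone takes the same preprocessing path, which is what lets the constructor's `endswith("1000h")` check disappear as well.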