Spaces · Running on Zero
Commit f01210e · Parent: ac26d96 · add more examples
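This commit drops the `advanced_model` code path from `VieNeuTTS`: the constructor no longer records whether `backbone_repo` ends in `"1000h"`, and the three call sites that branched on that flag (`_apply_chat_template`, `_infer_ggml`, and `_infer_stream_ggml`) now phonemize with `phonemize_with_dict` unconditionally.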
utils/__pycache__/__init__.cpython-312.pyc CHANGED
Binary files a/utils/__pycache__/__init__.cpython-312.pyc and b/utils/__pycache__/__init__.cpython-312.pyc differ

utils/__pycache__/normalize_text.cpython-312.pyc CHANGED
Binary files a/utils/__pycache__/normalize_text.cpython-312.pyc and b/utils/__pycache__/normalize_text.cpython-312.pyc differ

utils/__pycache__/phonemize_text.cpython-312.pyc CHANGED
Binary files a/utils/__pycache__/phonemize_text.cpython-312.pyc and b/utils/__pycache__/phonemize_text.cpython-312.pyc differ
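The three entries above are CPython 3.12 bytecode caches for the `utils` package; because they are committed to the repo, they show up as changed binaries whenever the interpreter regenerates them after a source edit.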
vieneu_tts.py CHANGED
@@ -37,7 +37,7 @@ def _linear_overlap_add(frames: list[np.ndarray], stride: int) -> np.ndarray:
 class VieNeuTTS:
     def __init__(
         self,
-        backbone_repo="pnnbao-ump/VieNeu-TTS
+        backbone_repo="pnnbao-ump/VieNeu-TTS",
         backbone_device="cpu",
         codec_repo="neuphonic/neucodec",
         codec_device="cpu",
@@ -57,9 +57,6 @@ class VieNeuTTS:
         self._is_quantized_model = False
         self._is_onnx_codec = False
 
-        # backbone repo
-        self.advanced_model = backbone_repo.endswith("1000h")
-
         # HF tokenizer
         self.tokenizer = None
 
@@ -194,10 +191,7 @@ class VieNeuTTS:
         return recon[0, 0, :]
 
     def _apply_chat_template(self, ref_codes: list[int], ref_text: str, input_text: str) -> list[int]:
-        if self.advanced_model:
-            input_text = phonemize_with_dict(ref_text) + " " + phonemize_with_dict(input_text)
-        else:
-            input_text = phonemize_text(ref_text) + " " + phonemize_text(input_text)
+        input_text = phonemize_with_dict(ref_text) + " " + phonemize_with_dict(input_text)
 
         speech_replace = self.tokenizer.convert_tokens_to_ids("<|SPEECH_REPLACE|>")
         speech_gen_start = self.tokenizer.convert_tokens_to_ids("<|SPEECH_GENERATION_START|>")
@@ -246,12 +240,8 @@ class VieNeuTTS:
         return output_str
 
     def _infer_ggml(self, ref_codes: list[int], ref_text: str, input_text: str) -> str:
-        if self.advanced_model:
-            ref_text = phonemize_with_dict(ref_text)
-            input_text = phonemize_with_dict(input_text)
-        else:
-            ref_text = phonemize_text(ref_text)
-            input_text = phonemize_text(input_text)
+        ref_text = phonemize_with_dict(ref_text)
+        input_text = phonemize_with_dict(input_text)
 
         codes_str = "".join([f"<|speech_{idx}|>" for idx in ref_codes])
         prompt = (
@@ -269,12 +259,8 @@ class VieNeuTTS:
         return output_str
 
     def _infer_stream_ggml(self, ref_codes: torch.Tensor, ref_text: str, input_text: str) -> Generator[np.ndarray, None, None]:
-        if self.advanced_model:
-            ref_text = phonemize_with_dict(ref_text)
-            input_text = phonemize_with_dict(input_text)
-        else:
-            ref_text = phonemize_text(ref_text)
-            input_text = phonemize_text(input_text)
+        ref_text = phonemize_with_dict(ref_text)
+        input_text = phonemize_with_dict(input_text)
 
         codes_str = "".join([f"<|speech_{idx}|>" for idx in ref_codes])
         prompt = (
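All three hunks in vieneu_tts.py collapse the same two-branch pattern. A minimal before/after sketch of that pattern, assuming `phonemize_text` and `phonemize_with_dict` are string-to-string helpers in the repo's `utils.phonemize_text` module (an assumption inferred from the changed `.pyc` files above), not the repo's actual code:

    # Sketch of the behavior change only. Both helpers are assumed to map
    # str -> str, as their use in the diff suggests.
    from utils.phonemize_text import phonemize_text, phonemize_with_dict


    def phonemize_pair_old(ref_text: str, input_text: str, advanced_model: bool) -> tuple[str, str]:
        # Before f01210e: the dictionary-backed phonemizer was reserved for
        # backbones whose repo name ended in "1000h".
        if advanced_model:
            return phonemize_with_dict(ref_text), phonemize_with_dict(input_text)
        return phonemize_text(ref_text), phonemize_text(input_text)


    def phonemize_pair_new(ref_text: str, input_text: str) -> tuple[str, str]:
        # After f01210e: phonemize_with_dict is applied unconditionally.
        return phonemize_with_dict(ref_text), phonemize_with_dict(input_text)

With the flag gone, every backbone takes the same preprocessing path, which is what lets the constructor's `endswith("1000h")` check disappear as well.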