# Hugging Face Spaces deployment note: this app runs on a T4 GPU.
# src/tts.py
from __future__ import annotations
from typing import Tuple, Union
import numpy as np
from transformers import pipeline
# We use the text-to-speech pipeline with XTTS v2 (zero-shot cloning)
# Example forward params: {"speaker_wav": "/path/to/ref.wav", "language": "en"}
def get_tts_pipeline(model_id: str):
    """Build a Hugging Face text-to-speech pipeline for the given model id.

    XTTS v2 is a good default here: it supports zero-shot voice cloning
    and is available on the Hub.
    """
    # NOTE: Add device selection similar to ASR if needed
    tts_pipe = pipeline("text-to-speech", model=model_id)
    return tts_pipe
def run_tts_clone(
    ref_audio_path: str,
    text_to_speak: str,
    model_id: str = "coqui/XTTS-v2",
    language: str = "en",
) -> Union[Tuple[int, np.ndarray], Exception]:
    """
    Synthesize `text_to_speak` in the voice cloned from `ref_audio_path`.

    Args:
        ref_audio_path: Path to the reference speaker audio used for cloning.
        text_to_speak: Text to synthesize.
        model_id: Hub id of the TTS model (defaults to XTTS v2, zero-shot cloning).
        language: Language code forwarded to the model.

    Returns:
        (sampling_rate, waveform) on success, or the raised Exception on
        failure. Callers are expected to check ``isinstance(result, Exception)``
        — this error-as-value contract is deliberate (e.g. so a UI layer can
        display the message instead of crashing).
    """
    try:
        tts = get_tts_pipeline(model_id)
        result = tts(
            text_to_speak,
            forward_params={"speaker_wav": ref_audio_path, "language": language},
        )
        # The transformers text-to-speech pipeline documents a FLAT output:
        #   {"audio": np.ndarray, "sampling_rate": int}
        # Some model integrations instead nest it as
        #   {"audio": {"array": np.ndarray, "sampling_rate": int}}.
        # The original code only handled the nested shape and would raise a
        # TypeError/KeyError on the documented flat shape; support both.
        audio = result["audio"]
        if isinstance(audio, dict):
            sr = int(audio["sampling_rate"])
            wav = np.asarray(audio["array"], dtype=np.float32)
        else:
            sr = int(result["sampling_rate"])
            wav = np.asarray(audio, dtype=np.float32)
        return sr, wav
    except Exception as e:
        # Intentionally broad: any failure (model load, synthesis, unpacking)
        # is returned to the caller rather than raised.
        return e