meg-huggingface
Merge in frimelle in-progress work.
e3f1c3d
raw
history blame
1.41 kB
# src/tts.py
from __future__ import annotations
from typing import Tuple, Union
import numpy as np
from transformers import pipeline
# We use the text-to-speech pipeline with XTTS v2 (zero-shot cloning)
# Example forward params: {"speaker_wav": "/path/to/ref.wav", "language": "en"}
def get_tts_pipeline(model_id: str, device: int | str | None = None):
    """Create a text-to-speech pipeline for the given model.

    XTTS v2 works well for zero-shot cloning and is available on the Hub.

    Args:
        model_id: Hub model id (or local path) of the TTS model.
        device: Optional device spec forwarded to ``pipeline`` (e.g. ``0``,
            ``"cuda:0"``, ``"cpu"``). ``None`` keeps transformers' default
            placement, preserving the original behavior.

    Returns:
        A transformers ``"text-to-speech"`` pipeline instance.
    """
    # Pass `device` only when explicitly requested so the default call is
    # byte-identical to the previous behavior.
    kwargs = {"model": model_id}
    if device is not None:
        kwargs["device"] = device
    return pipeline("text-to-speech", **kwargs)
def run_tts_clone(
    ref_audio_path: str,
    text_to_speak: str,
    model_id: str = "coqui/XTTS-v2",
    language: str = "en",
) -> Union[Tuple[int, np.ndarray], Exception]:
    """Synthesize ``text_to_speak`` in the voice cloned from ``ref_audio_path``.

    Args:
        ref_audio_path: Path to the reference audio used for zero-shot cloning.
        text_to_speak: Text to synthesize.
        model_id: Hub id of the TTS model (XTTS v2 by default).
        language: Language code passed through ``forward_params``.

    Returns:
        ``(sampling_rate, waveform)`` on success, where ``waveform`` is a
        float32 array; or the raised ``Exception`` on failure (callers are
        expected to check ``isinstance(result, Exception)``).
    """
    try:
        tts = get_tts_pipeline(model_id)
        result = tts(
            text_to_speak,
            forward_params={"speaker_wav": ref_audio_path, "language": language},
        )
        # Some pipelines return a list (one dict per input text); unwrap it.
        if isinstance(result, list):
            result = result[0]
        audio = result["audio"]
        if isinstance(audio, dict):
            # Nested layout assumed by the original code:
            # {"audio": {"array": np.ndarray, "sampling_rate": int}}
            sr = int(audio["sampling_rate"])
            wav = np.asarray(audio["array"], dtype=np.float32)
        else:
            # Documented TextToAudioPipeline layout:
            # {"audio": np.ndarray of shape (nb_channels, n_samples),
            #  "sampling_rate": int}
            # The original indexed audio["sampling_rate"] here, which raises
            # on an ndarray — so the call always fell into `except`.
            sr = int(result["sampling_rate"])
            wav = np.asarray(audio, dtype=np.float32)
        # Collapse a size-1 channel axis so callers receive a 1-D signal.
        if wav.ndim == 2 and wav.shape[0] == 1:
            wav = wav[0]
        return sr, wav
    except Exception as e:
        # Deliberate best-effort contract: UI callers display the error
        # instead of crashing, so we return rather than re-raise.
        return e