meg-huggingface
Merge in frimelle in-progress work.
e3f1c3d
raw
history blame
1.41 kB
# src/tts.py
from __future__ import annotations
from typing import Tuple, Union
import numpy as np
from transformers import pipeline
# We use the text-to-speech pipeline with XTTS v2 (zero-shot cloning)
# Example forward params: {"speaker_wav": "/path/to/ref.wav", "language": "en"}
def get_tts_pipeline(model_id: str, device: int | str | None = None):
    """Create a text-to-speech pipeline for the given model.

    XTTS v2 works well for zero-shot cloning and is available on the Hub.

    Args:
        model_id: Hub model id (or local path) of the TTS model.
        device: Optional device spec forwarded to ``pipeline`` (e.g. ``0``,
            ``"cuda:0"``, ``"cpu"``). ``None`` keeps transformers' default
            placement, preserving the original behavior.

    Returns:
        A transformers ``"text-to-speech"`` pipeline instance.
    """
    # Pass `device` only when explicitly requested so the default call is
    # byte-identical to the previous behavior.
    kwargs = {"model": model_id}
    if device is not None:
        kwargs["device"] = device
    return pipeline("text-to-speech", **kwargs)
def run_tts_clone(
    ref_audio_path: str,
    text_to_speak: str,
    model_id: str = "coqui/XTTS-v2",
    language: str = "en",
) -> Union[Tuple[int, np.ndarray], Exception]:
    """Synthesize ``text_to_speak`` in the voice cloned from ``ref_audio_path``.

    Args:
        ref_audio_path: Path to the reference audio used for zero-shot cloning.
        text_to_speak: Text to synthesize.
        model_id: Hub id of the TTS model (XTTS v2 by default).
        language: Language code passed through ``forward_params``.

    Returns:
        ``(sampling_rate, waveform)`` on success, where ``waveform`` is a
        float32 array; or the raised ``Exception`` on failure (callers are
        expected to check ``isinstance(result, Exception)``).
    """
    try:
        tts = get_tts_pipeline(model_id)
        result = tts(
            text_to_speak,
            forward_params={"speaker_wav": ref_audio_path, "language": language},
        )
        # Some pipelines return a list (one dict per input text); unwrap it.
        if isinstance(result, list):
            result = result[0]
        audio = result["audio"]
        if isinstance(audio, dict):
            # Nested layout assumed by the original code:
            # {"audio": {"array": np.ndarray, "sampling_rate": int}}
            sr = int(audio["sampling_rate"])
            wav = np.asarray(audio["array"], dtype=np.float32)
        else:
            # Documented TextToAudioPipeline layout:
            # {"audio": np.ndarray of shape (nb_channels, n_samples),
            #  "sampling_rate": int}
            # The original indexed audio["sampling_rate"] here, which raises
            # on an ndarray — so the call always fell into `except`.
            sr = int(result["sampling_rate"])
            wav = np.asarray(audio, dtype=np.float32)
        # Collapse a size-1 channel axis so callers receive a 1-D signal.
        if wav.ndim == 2 and wav.shape[0] == 1:
            wav = wav[0]
        return sr, wav
    except Exception as e:
        # Deliberate best-effort contract: UI callers display the error
        # instead of crashing, so we return rather than re-raise.
        return e