use crate::{
    audio::{load_audio, save_audio, AudioConfig, AudioData},
    config::Config,
    model::{EmotionEncoder, SamplingStrategy, SemanticEncoder, SpeakerEncoder},
    text::{TextNormalizer, TextTokenizer, TokenizerConfig},
    vocoder::{BigVGAN, BigVGANConfig, Vocoder},
    Result,
};
use ndarray::Array1;
use std::path::{Path, PathBuf};
use std::time::Instant;

/// Options controlling a single synthesis run.
#[derive(Debug, Clone)]
pub struct SynthesisOptions {
    /// Explicit emotion vector, one weight per configured emotion dimension.
    pub emotion_vector: Option<Vec<f32>>,
    /// Reference audio from which emotion conditioning can be derived.
    pub emotion_audio: Option<PathBuf>,
    /// Strength of the emotion conditioning (1.0 applies it unchanged).
    pub emotion_alpha: f32,
    /// Token sampling strategy used during generation.
    pub sampling: SamplingStrategy,
    /// Penalty applied to repeated tokens while sampling.
    pub repetition_penalty: f32,
    /// Maximum generation length, in tokens.
    pub max_length: usize,
    /// Silence inserted between segments in long-form synthesis, in milliseconds.
    pub segment_silence_ms: u32,
}

impl Default for SynthesisOptions {
    fn default() -> Self {
        Self {
            emotion_vector: None,
            emotion_audio: None,
            emotion_alpha: 1.0,
            sampling: SamplingStrategy::TopKP { k: 50, p: 0.95 },
            repetition_penalty: 1.1,
            max_length: 250,
            segment_silence_ms: 200,
        }
    }
}
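
// A minimal usage sketch (the values below are arbitrary, not recommendations):
// start from the defaults and override individual fields with struct-update
// syntax.
//
//     let options = SynthesisOptions {
//         emotion_alpha: 0.8,
//         sampling: SamplingStrategy::TopKP { k: 30, p: 0.9 },
//         ..Default::default()
//     };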

/// The output of a synthesis run.
#[derive(Debug)]
pub struct SynthesisResult {
    /// Synthesized audio samples.
    pub audio: Vec<f32>,
    /// Sample rate of `audio`, in Hz.
    pub sample_rate: u32,
    /// Duration of `audio`, in seconds.
    pub duration: f32,
    /// Wall-clock processing time, in seconds.
    pub processing_time: f32,
    /// Real-time factor: processing time divided by audio duration (lower is faster).
    pub rtf: f32,
}

impl SynthesisResult {
    /// Saves the synthesized audio to `path`.
    pub fn save<P: AsRef<Path>>(&self, path: P) -> Result<()> {
        let audio_data = AudioData::new(self.audio.clone(), self.sample_rate);
        save_audio(path, &audio_data)
    }

    /// Formats the duration as `MM:SS`.
    pub fn duration_formatted(&self) -> String {
        let minutes = (self.duration / 60.0) as u32;
        let seconds = (self.duration % 60.0) as u32;
        format!("{:02}:{:02}", minutes, seconds)
    }
}
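
// Sketch of consuming a result, given a pipeline `tts` and some `options`
// (paths are hypothetical):
//
//     let result = tts.synthesize("Hello!", "speaker.wav", &options)?;
//     result.save("output.wav")?;
//     println!("{} of audio, RTF {:.3}", result.duration_formatted(), result.rtf);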

/// End-to-end IndexTTS synthesis pipeline.
pub struct IndexTTS {
    /// Text normalization stage.
    normalizer: TextNormalizer,
    /// BPE tokenizer.
    tokenizer: TextTokenizer,
    /// Speaker embedding extractor.
    speaker_encoder: SpeakerEncoder,
    /// Emotion embedding extractor.
    emotion_encoder: EmotionEncoder,
    /// Semantic code extractor.
    semantic_encoder: SemanticEncoder,
    /// Neural vocoder (mel spectrogram to waveform).
    vocoder: BigVGAN,
    /// Audio and mel-extraction settings shared across stages.
    audio_config: AudioConfig,
    /// Full model configuration.
    config: Config,
}

impl IndexTTS {
    /// Builds the pipeline from a validated configuration.
    pub fn new(config: Config) -> Result<Self> {
        config.validate()?;

        log::info!("Initializing IndexTTS...");

        // Text front end: normalization followed by BPE tokenization.
        let normalizer = TextNormalizer::new();
        let tokenizer = TextTokenizer::new(TokenizerConfig {
            model_path: config.dataset.bpe_model.display().to_string(),
            vocab_size: config.dataset.vocab_size,
            ..Default::default()
        })?;

        // Conditioning encoders. These are placeholder implementations until
        // real model weights are integrated.
        let speaker_encoder = SpeakerEncoder::new_placeholder(192);
        let emotion_encoder = EmotionEncoder::new(
            config.emotions.num_dims,
            config.emotions.num.clone(),
            256,
        );
        let semantic_encoder = SemanticEncoder::new_placeholder();

        // Vocoder configured to match the mel front end.
        let vocoder_config = BigVGANConfig {
            sample_rate: config.s2mel.preprocess.sr,
            num_mels: config.s2mel.preprocess.n_mels,
            ..Default::default()
        };
        let vocoder = BigVGAN::new_fallback(vocoder_config);

        // Audio settings shared by loading, mel extraction, and post-processing.
        let audio_config = AudioConfig {
            sample_rate: config.s2mel.preprocess.sr,
            n_fft: config.s2mel.preprocess.n_fft,
            hop_length: config.s2mel.preprocess.hop_length,
            win_length: config.s2mel.preprocess.win_length,
            n_mels: config.s2mel.preprocess.n_mels,
            fmin: config.s2mel.preprocess.fmin,
            fmax: config.s2mel.preprocess.fmax,
        };

        log::info!("IndexTTS initialized successfully");

        Ok(Self {
            normalizer,
            tokenizer,
            speaker_encoder,
            emotion_encoder,
            semantic_encoder,
            vocoder,
            audio_config,
            config,
        })
    }

    /// Loads the pipeline from a configuration file.
    pub fn load<P: AsRef<Path>>(config_path: P) -> Result<Self> {
        let config = Config::load(config_path)?;
        Self::new(config)
    }
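
    // End-to-end sketch (all paths are hypothetical):
    //
    //     let tts = IndexTTS::load("checkpoints/config.yaml")?;
    //     tts.synthesize_to_file(
    //         "Hello from IndexTTS!",
    //         "speaker.wav",
    //         "output.wav",
    //         &SynthesisOptions::default(),
    //     )?;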

    /// Synthesizes speech for `text`, cloning the voice from the reference
    /// recording at `speaker_audio_path`.
    pub fn synthesize(
        &self,
        text: &str,
        speaker_audio_path: &str,
        options: &SynthesisOptions,
    ) -> Result<SynthesisResult> {
        let start_time = Instant::now();

        // Truncate the log preview on a character boundary; slicing the byte
        // range `..50` could panic on multi-byte UTF-8 input.
        let preview: String = text.chars().take(50).collect();
        log::info!("Starting synthesis for: {}", preview);

        log::debug!("Normalizing text...");
        let normalized_text = self.normalizer.normalize(text)?;

        log::debug!("Tokenizing text...");
        let tokens = self.tokenizer.encode(&normalized_text)?;
        log::debug!("Generated {} tokens", tokens.len());

        log::debug!("Loading speaker audio...");
        let speaker_audio = load_audio(speaker_audio_path, Some(self.audio_config.sample_rate))?;

        log::debug!("Extracting speaker embedding...");
        let speaker_mel = crate::audio::mel_spectrogram(&speaker_audio.samples, &self.audio_config)?;
        let speaker_embedding = self.speaker_encoder.encode(&speaker_mel)?;

        log::debug!("Extracting semantic codes...");
        let semantic_codes = self
            .semantic_encoder
            .encode(&speaker_audio.samples, self.audio_config.sample_rate)?;

        // Note: `options.emotion_audio` is not consulted here yet; only an
        // explicit emotion vector or the neutral default is used.
        log::debug!("Preparing emotion conditioning...");
        let emotion_embedding = if let Some(ref emo_vec) = options.emotion_vector {
            let emo = self.emotion_encoder.apply_strength(emo_vec, options.emotion_alpha);
            self.emotion_encoder.encode(&emo)?
        } else {
            let neutral = self.emotion_encoder.neutral();
            self.emotion_encoder.encode(&neutral)?
        };

        log::debug!("Generating mel spectrogram...");
        // Length heuristic: roughly 2.5 mel frames per text token.
        let mel_length = (tokens.len() as f32 * 2.5) as usize;
        let mel_spec = self.generate_mel_spectrogram(
            &tokens,
            &semantic_codes,
            &speaker_embedding,
            &emotion_embedding,
            mel_length,
        )?;

        log::debug!("Running vocoder...");
        let audio = self.vocoder.synthesize(&mel_spec)?;

        log::debug!("Post-processing...");
        let audio = self.post_process(&audio);

        let processing_time = start_time.elapsed().as_secs_f32();
        let duration = audio.len() as f32 / self.vocoder.sample_rate() as f32;
        let rtf = processing_time / duration;

        log::info!(
            "Synthesis complete: {:.2}s audio in {:.2}s (RTF: {:.3})",
            duration,
            processing_time,
            rtf
        );

        Ok(SynthesisResult {
            audio,
            sample_rate: self.vocoder.sample_rate(),
            duration,
            processing_time,
            rtf,
        })
    }

    /// Synthesizes speech and writes it directly to `output_path`.
    pub fn synthesize_to_file(
        &self,
        text: &str,
        speaker_audio_path: &str,
        output_path: &str,
        options: &SynthesisOptions,
    ) -> Result<SynthesisResult> {
        let result = self.synthesize(text, speaker_audio_path, options)?;
        result.save(output_path)?;
        log::info!("Saved audio to: {}", output_path);
        Ok(result)
    }

    /// Generates a mel spectrogram from the conditioning inputs.
    ///
    /// This is currently a placeholder that produces structured noise; the
    /// conditioning inputs (and the sampling-related fields of
    /// `SynthesisOptions`) are accepted but unused until the real
    /// autoregressive model is integrated.
    fn generate_mel_spectrogram(
        &self,
        _tokens: &[i64],
        _semantic_codes: &[i64],
        _speaker_embedding: &Array1<f32>,
        _emotion_embedding: &Array1<f32>,
        mel_length: usize,
    ) -> Result<ndarray::Array2<f32>> {
        use rand::Rng;
        let mut rng = rand::thread_rng();

        let n_mels = self.audio_config.n_mels;
        let mut mel = ndarray::Array2::zeros((n_mels, mel_length));

        // Fill each frame with a frequency-dependent base level, a slow
        // sinusoidal modulation over time, and uniform noise.
        for t in 0..mel_length {
            for freq in 0..n_mels {
                let base_value = -4.0 + (freq as f32 / n_mels as f32) * 2.0;
                let time_mod = ((t as f32 * 0.1).sin() + 1.0) * 0.5;
                let noise = rng.gen_range(-0.5..0.5);
                mel[[freq, t]] = base_value + time_mod + noise;
            }
        }

        Ok(mel)
    }

    /// Applies peak normalization and short edge fades to the raw waveform.
    fn post_process(&self, audio: &[f32]) -> Vec<f32> {
        use crate::audio::{apply_fade, normalize_audio_peak};

        // Normalize so the loudest sample sits at 0.89 of full scale.
        let normalized = normalize_audio_peak(audio, 0.89);

        // 5 ms fade-in and fade-out to avoid clicks at the edges; at a
        // 22 050 Hz sample rate this works out to 110 samples.
        let fade_samples = (self.audio_config.sample_rate as f32 * 0.005) as usize;
        apply_fade(&normalized, fade_samples, fade_samples)
    }

    /// Synthesizes long-form text by splitting it into segments, synthesizing
    /// each segment, and joining the results with silence in between.
    pub fn synthesize_long(
        &self,
        text: &str,
        speaker_audio_path: &str,
        options: &SynthesisOptions,
    ) -> Result<SynthesisResult> {
        let start_time = Instant::now();

        let segments = super::segment_text(text, 100);
        log::info!("Split text into {} segments", segments.len());

        // Synthesize each segment independently with the same speaker reference.
        let mut audio_segments = Vec::new();
        for (i, segment) in segments.iter().enumerate() {
            log::info!("Synthesizing segment {}/{}", i + 1, segments.len());
            let result = self.synthesize(segment, speaker_audio_path, options)?;
            audio_segments.push(result.audio);
        }

        // Join the segments, inserting `segment_silence_ms` of silence between them.
        let audio = super::concatenate_audio(
            &audio_segments,
            options.segment_silence_ms,
            self.vocoder.sample_rate(),
        );

        let processing_time = start_time.elapsed().as_secs_f32();
        let duration = audio.len() as f32 / self.vocoder.sample_rate() as f32;
        let rtf = processing_time / duration;

        Ok(SynthesisResult {
            audio,
            sample_rate: self.vocoder.sample_rate(),
            duration,
            processing_time,
            rtf,
        })
    }
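
    // Long-form sketch (paths hypothetical): the same options drive every
    // segment, and `segment_silence_ms` controls the gap between them.
    //
    //     let result = tts.synthesize_long(&chapter_text, "speaker.wav", &options)?;
    //     log::info!("Total: {}", result.duration_formatted());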

    /// Returns the output sample rate, in Hz.
    pub fn sample_rate(&self) -> u32 {
        self.vocoder.sample_rate()
    }

    /// Returns the loaded configuration.
    pub fn config(&self) -> &Config {
        &self.config
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_synthesis_options_default() {
        let options = SynthesisOptions::default();
        assert_eq!(options.emotion_alpha, 1.0);
        assert!(matches!(options.sampling, SamplingStrategy::TopKP { .. }));
    }
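
    // Added test: verifies that struct-update syntax composes with the
    // defaults, as in the usage sketch near `SynthesisOptions`.
    #[test]
    fn test_synthesis_options_override() {
        let options = SynthesisOptions {
            emotion_alpha: 0.5,
            max_length: 100,
            ..Default::default()
        };
        assert_eq!(options.emotion_alpha, 0.5);
        assert_eq!(options.max_length, 100);
        assert!(options.emotion_vector.is_none());
        assert_eq!(options.repetition_penalty, 1.1);
    }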

    #[test]
    fn test_synthesis_result_duration() {
        let result = SynthesisResult {
            audio: vec![0.0; 22050 * 125],
            sample_rate: 22050,
            duration: 125.0,
            processing_time: 10.0,
            rtf: 0.08,
        };

        assert_eq!(result.duration_formatted(), "02:05");
    }

    #[test]
    fn test_segment_text() {
        let text = "This is sentence one. This is sentence two. This is sentence three.";
        let segments = super::super::segment_text(text, 50);
        assert!(segments.len() >= 2);
    }

    #[test]
    fn test_concatenate_audio() {
        let seg1 = vec![1.0f32; 100];
        let seg2 = vec![2.0f32; 100];
        // 100 samples + 10 ms of silence at 1000 Hz (10 samples) + 100 samples.
        let result = super::super::concatenate_audio(&[seg1, seg2], 10, 1000);

        assert_eq!(result.len(), 210);
    }
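
    // Added test: with zero inter-segment silence the segments should simply
    // be concatenated back to back.
    #[test]
    fn test_concatenate_audio_no_silence() {
        let seg1 = vec![1.0f32; 50];
        let seg2 = vec![2.0f32; 50];
        let result = super::super::concatenate_audio(&[seg1, seg2], 0, 1000);
        assert_eq!(result.len(), 100);
    }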
}