//! IndexTTS CLI - High-performance Text-to-Speech in Rust //! //! Command-line interface for IndexTTS synthesizer use clap::{Parser, Subcommand}; use indextts::{ pipeline::{IndexTTS, SynthesisOptions}, Config, Result, }; use std::path::PathBuf; #[derive(Parser)] #[command( name = "indextts", about = "High-performance Text-to-Speech engine in Rust", version, author )] struct Cli { #[command(subcommand)] command: Commands, } #[derive(Subcommand)] enum Commands { /// Synthesize speech from text Synthesize { /// Text to synthesize #[arg(short, long)] text: String, /// Speaker reference audio file #[arg(short = 'v', long)] voice: PathBuf, /// Output audio file path #[arg(short, long, default_value = "output.wav")] output: PathBuf, /// Configuration file path #[arg(short, long)] config: Option, /// Model directory #[arg(short, long, default_value = "models")] model_dir: PathBuf, /// Emotion vector (comma-separated, 8 values 0-1) #[arg(long)] emotion: Option, /// Emotion strength (0-1) #[arg(long, default_value = "1.0")] emotion_alpha: f32, /// Top-k sampling parameter #[arg(long, default_value = "50")] top_k: usize, /// Top-p sampling parameter #[arg(long, default_value = "0.95")] top_p: f32, /// Repetition penalty #[arg(long, default_value = "1.1")] repetition_penalty: f32, /// Use FP16 inference #[arg(long)] fp16: bool, /// Device (cpu, cuda:0, etc.) #[arg(short, long, default_value = "cpu")] device: String, }, /// Synthesize from a text file SynthesizeFile { /// Input text file #[arg(short, long)] input: PathBuf, /// Speaker reference audio file #[arg(short = 'v', long)] voice: PathBuf, /// Output audio file path #[arg(short, long, default_value = "output.wav")] output: PathBuf, /// Configuration file path #[arg(short, long)] config: Option, /// Model directory #[arg(short, long, default_value = "models")] model_dir: PathBuf, /// Silence between segments (milliseconds) #[arg(long, default_value = "200")] silence_ms: u32, }, /// Generate default configuration file InitConfig { /// Output path for config file #[arg(short, long, default_value = "config.yaml")] output: PathBuf, }, /// Show information about the system Info, /// Run benchmarks Benchmark { /// Number of iterations #[arg(short, long, default_value = "10")] iterations: usize, }, } fn main() -> Result<()> { // Initialize logger env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info")).init(); let cli = Cli::parse(); match cli.command { Commands::Synthesize { text, voice, output, config, model_dir, emotion, emotion_alpha, top_k, top_p, repetition_penalty, fp16: _, device: _, } => { log::info!("IndexTTS Synthesizer"); log::info!("===================="); // Load or create config let cfg = if let Some(config_path) = config { Config::load(config_path)? } else { let mut cfg = Config::default(); cfg.model_dir = model_dir; cfg }; // Create TTS instance let tts = IndexTTS::new(cfg)?; // Parse emotion vector let emotion_vec = emotion.map(|s| { s.split(',') .filter_map(|v| v.trim().parse::().ok()) .collect::>() }); // Create synthesis options let options = SynthesisOptions { emotion_vector: emotion_vec, emotion_alpha, sampling: indextts::model::SamplingStrategy::TopKP { k: top_k, p: top_p }, repetition_penalty, ..Default::default() }; // Synthesize log::info!("Text: {}", &text[..text.len().min(100)]); log::info!("Voice: {}", voice.display()); log::info!("Output: {}", output.display()); let result = tts.synthesize_to_file( &text, voice.to_str().unwrap(), output.to_str().unwrap(), &options, )?; log::info!("Duration: {}", result.duration_formatted()); log::info!("Processing time: {:.2}s", result.processing_time); log::info!("Real-time factor: {:.3}x", result.rtf); println!("✓ Synthesis complete: {}", output.display()); } Commands::SynthesizeFile { input, voice, output, config, model_dir, silence_ms, } => { log::info!("IndexTTS File Synthesizer"); log::info!("=========================="); // Read text file let text = std::fs::read_to_string(&input)?; // Load or create config let cfg = if let Some(config_path) = config { Config::load(config_path)? } else { let mut cfg = Config::default(); cfg.model_dir = model_dir; cfg }; // Create TTS instance let tts = IndexTTS::new(cfg)?; // Create synthesis options let options = SynthesisOptions { segment_silence_ms: silence_ms, ..Default::default() }; // Synthesize log::info!("Input file: {}", input.display()); log::info!("Text length: {} characters", text.len()); let result = tts.synthesize_long( &text, voice.to_str().unwrap(), &options, )?; result.save(&output)?; log::info!("Duration: {}", result.duration_formatted()); log::info!("Processing time: {:.2}s", result.processing_time); log::info!("Real-time factor: {:.3}x", result.rtf); println!("✓ Synthesis complete: {}", output.display()); } Commands::InitConfig { output } => { log::info!("Creating default configuration..."); let config = Config::default(); config.save(&output)?; println!("✓ Configuration saved to: {}", output.display()); } Commands::Info => { println!("IndexTTS - High-performance Text-to-Speech Engine"); println!("=================================================="); println!("Version: {}", indextts::VERSION); println!("Platform: {}", std::env::consts::OS); println!("Architecture: {}", std::env::consts::ARCH); println!(); println!("Features:"); println!(" - Multi-language support (Chinese, English, mixed)"); println!(" - Zero-shot voice cloning"); println!(" - 8-dimensional emotion control"); println!(" - High-quality neural vocoding (BigVGAN)"); println!(" - SIMD-optimized audio processing"); println!(" - Parallel processing with Rayon"); println!(); println!("Sample Rate: {} Hz", indextts::SAMPLE_RATE); println!("Mel Bands: {}", indextts::N_MELS); println!("FFT Size: {}", indextts::N_FFT); println!("Hop Length: {}", indextts::HOP_LENGTH); println!(); println!("CPU Cores: {}", num_cpus::get()); println!("Physical Cores: {}", num_cpus::get_physical()); } Commands::Benchmark { iterations } => { log::info!("Running benchmarks ({} iterations)...", iterations); // Benchmark mel-spectrogram computation benchmark_mel_spectrogram(iterations); // Benchmark tokenization benchmark_tokenization(iterations); // Benchmark vocoder benchmark_vocoder(iterations); println!("✓ Benchmarks complete"); } } Ok(()) } fn benchmark_mel_spectrogram(iterations: usize) { use indextts::audio::{mel_spectrogram, AudioConfig}; use std::time::Instant; println!("\nMel-Spectrogram Benchmark"); println!("-------------------------"); let config = AudioConfig::default(); let num_samples = config.sample_rate as usize; // 1 second of audio let signal: Vec = (0..num_samples) .map(|i| (i as f32 * 0.01).sin()) .collect(); let start = Instant::now(); for _ in 0..iterations { let _ = mel_spectrogram(&signal, &config); } let elapsed = start.elapsed(); let per_iter = elapsed.as_secs_f32() / iterations as f32; println!(" Signal length: {} samples ({:.2}s)", num_samples, num_samples as f32 / config.sample_rate as f32); println!(" Iterations: {}", iterations); println!(" Total time: {:.3}s", elapsed.as_secs_f32()); println!(" Per iteration: {:.3}ms", per_iter * 1000.0); println!(" Throughput: {:.1}x real-time", 1.0 / per_iter); } fn benchmark_tokenization(iterations: usize) { use indextts::text::{TextNormalizer, TextTokenizer, TokenizerConfig}; use std::time::Instant; println!("\nTokenization Benchmark"); println!("----------------------"); let normalizer = TextNormalizer::new(); let tokenizer = TextTokenizer::new(TokenizerConfig::default()).unwrap(); let test_texts = vec![ "Hello world, this is a test of the text-to-speech system.", "The quick brown fox jumps over the lazy dog.", "你好世界,这是一个测试。", "Mixed language: Hello 世界 and 你好 world.", ]; let start = Instant::now(); for _ in 0..iterations { for text in &test_texts { let normalized = normalizer.normalize(text).unwrap(); let _tokens = tokenizer.encode(&normalized).unwrap(); } } let elapsed = start.elapsed(); let total_chars: usize = test_texts.iter().map(|t| t.len()).sum(); let per_iter = elapsed.as_secs_f32() / iterations as f32; println!(" Texts: {}", test_texts.len()); println!(" Total characters: {}", total_chars); println!(" Iterations: {}", iterations); println!(" Total time: {:.3}s", elapsed.as_secs_f32()); println!(" Per iteration: {:.3}ms", per_iter * 1000.0); println!( " Throughput: {:.0} chars/sec", (total_chars * iterations) as f32 / elapsed.as_secs_f32() ); } fn benchmark_vocoder(iterations: usize) { use indextts::vocoder::{create_bigvgan_22k, Vocoder}; use ndarray::Array2; use std::time::Instant; println!("\nVocoder Benchmark"); println!("-----------------"); let vocoder = create_bigvgan_22k(); let num_frames = 100; // ~2.5 seconds of audio let mel = Array2::zeros((80, num_frames)); let start = Instant::now(); for _ in 0..iterations { let _ = vocoder.synthesize(&mel); } let elapsed = start.elapsed(); let audio_duration = num_frames as f32 * vocoder.hop_length() as f32 / vocoder.sample_rate() as f32; let per_iter = elapsed.as_secs_f32() / iterations as f32; println!(" Mel frames: {}", num_frames); println!(" Audio duration: {:.2}s", audio_duration); println!(" Iterations: {}", iterations); println!(" Total time: {:.3}s", elapsed.as_secs_f32()); println!(" Per iteration: {:.3}ms", per_iter * 1000.0); println!(" RTF: {:.3}x", per_iter / audio_duration); }