//! Marine Prosody Conditioner - Extract 8D interpretable emotion vectors //! //! Uses Marine salience to extract prosodic features from reference audio. //! These features are interpretable and can be directly edited for control. //! //! The 8D vector captures: //! 1. Period jitter (mean & std) - pitch stability //! 2. Amplitude jitter (mean & std) - roughness/strain //! 3. Harmonic alignment - voiced vs noisy //! 4. Overall salience - authenticity score //! 5. Peak density - speech rate/intensity //! 6. Energy - loudness use crate::error::{Error, Result}; /// 8-dimensional prosody vector extracted from audio /// /// These features capture the "emotional signature" of speech: /// - Low jitter + high energy = confident/happy /// - High jitter + low energy = nervous/uneasy /// - Stable patterns = calm, unstable = agitated #[derive(Debug, Clone, Copy, PartialEq)] pub struct MarineProsodyVector { /// Mean period jitter (pitch stability) /// Lower = more stable pitch, Higher = more variation pub jp_mean: f32, /// Standard deviation of period jitter /// Captures consistency of pitch patterns pub jp_std: f32, /// Mean amplitude jitter (volume stability) /// Lower = consistent volume, Higher = erratic pub ja_mean: f32, /// Standard deviation of amplitude jitter /// Captures volume pattern consistency pub ja_std: f32, /// Mean harmonic alignment score /// 1.0 = perfectly voiced, 0.0 = noise pub h_mean: f32, /// Mean overall salience score /// Overall authenticity/quality rating pub s_mean: f32, /// Peak density (peaks per second) /// Related to speech rate and intensity pub peak_density: f32, /// Mean energy level /// Average loudness of detected peaks pub energy_mean: f32, } impl MarineProsodyVector { /// Create zero vector (baseline) pub fn zeros() -> Self { Self { jp_mean: 0.0, jp_std: 0.0, ja_mean: 0.0, ja_std: 0.0, h_mean: 1.0, s_mean: 1.0, peak_density: 0.0, energy_mean: 0.0, } } /// Convert to f32 array for neural network input pub fn to_array(&self) -> [f32; 8] { [ self.jp_mean, self.jp_std, self.ja_mean, self.ja_std, self.h_mean, self.s_mean, self.peak_density, self.energy_mean, ] } /// Create from f32 array pub fn from_array(arr: [f32; 8]) -> Self { Self { jp_mean: arr[0], jp_std: arr[1], ja_mean: arr[2], ja_std: arr[3], h_mean: arr[4], s_mean: arr[5], peak_density: arr[6], energy_mean: arr[7], } } /// Get combined jitter (average of period and amplitude) pub fn combined_jitter(&self) -> f32 { (self.jp_mean + self.ja_mean) / 2.0 } /// Estimate emotional valence from prosody /// Returns value from -1.0 (negative) to 1.0 (positive) pub fn estimate_valence(&self) -> f32 { // High energy + low jitter = positive // Low energy + high jitter = negative let jitter_factor = 1.0 / (1.0 + self.combined_jitter()); let energy_factor = self.energy_mean.sqrt(); // Combine factors, normalize to -1..1 range (jitter_factor * energy_factor * 2.0 - 1.0).clamp(-1.0, 1.0) } /// Estimate arousal/intensity level /// Returns value from 0.0 (calm) to 1.0 (excited) pub fn estimate_arousal(&self) -> f32 { // High peak density + high energy + some jitter variance = high arousal let density_factor = (self.peak_density / 100.0).clamp(0.0, 1.0); let energy_factor = self.energy_mean.sqrt(); let variance_factor = (self.jp_std + self.ja_std).clamp(0.0, 1.0); ((density_factor + energy_factor + variance_factor) / 3.0).clamp(0.0, 1.0) } } impl Default for MarineProsodyVector { fn default() -> Self { Self::zeros() } } /// Marine-based prosody conditioner for TTS /// /// Replaces heavy Conformer-style extractors with lightweight, interpretable /// Marine salience features. This gives you: /// - 8D interpretable emotion vector /// - Direct editability for control /// - Biologically plausible processing /// - O(n) linear time extraction pub struct MarineProsodyConditioner { sample_rate: u32, jitter_low: f32, jitter_high: f32, min_period: u32, max_period: u32, ema_alpha: f32, } impl MarineProsodyConditioner { /// Create new prosody conditioner for given sample rate pub fn new(sample_rate: u32) -> Self { // F0 range: ~60Hz (low male) to ~4kHz (includes harmonics) let min_period = sample_rate / 4000; let max_period = sample_rate / 60; Self { sample_rate, jitter_low: 0.02, jitter_high: 0.60, min_period, max_period, ema_alpha: 0.01, } } /// Extract prosody vector from audio samples /// /// Analyzes the audio to produce an 8D prosody vector capturing /// the emotional/stylistic characteristics of the speech. /// /// # Arguments /// * `samples` - Audio samples (typically -1.0 to 1.0 range) /// /// # Returns /// * `Ok(MarineProsodyVector)` - Extracted prosody features /// * `Err` - If insufficient peaks detected pub fn from_samples(&self, samples: &[f32]) -> Result { if samples.is_empty() { return Err(Error::Audio("Empty audio buffer".into())); } // Detect peaks and collect jitter measurements let mut peaks: Vec = Vec::new(); let clip_threshold = 1e-3; // Simple peak detection for i in 1..samples.len().saturating_sub(1) { let prev = samples[i - 1].abs(); let curr = samples[i].abs(); let next = samples[i + 1].abs(); if curr > prev && curr > next && curr > clip_threshold { peaks.push(PeakInfo { index: i, amplitude: curr, }); } } if peaks.len() < 3 { // Not enough peaks for meaningful analysis return Ok(MarineProsodyVector::zeros()); } // Calculate inter-peak periods and jitter let mut periods: Vec = Vec::new(); let mut amplitudes: Vec = Vec::new(); let mut jp_values: Vec = Vec::new(); let mut ja_values: Vec = Vec::new(); // Use EMA for tracking let mut ema_period = 0.0f32; let mut ema_amp = 0.0f32; let mut ema_initialized = false; for i in 1..peaks.len() { let period = (peaks[i].index - peaks[i - 1].index) as f32; let amp = peaks[i].amplitude; // Check if period is in valid range if period > self.min_period as f32 && period < self.max_period as f32 { periods.push(period); amplitudes.push(amp); if !ema_initialized { ema_period = period; ema_amp = amp; ema_initialized = true; } else { // Calculate jitter let jp = (period - ema_period).abs() / ema_period; let ja = (amp - ema_amp).abs() / ema_amp; jp_values.push(jp); ja_values.push(ja); // Update EMA ema_period = self.ema_alpha * period + (1.0 - self.ema_alpha) * ema_period; ema_amp = self.ema_alpha * amp + (1.0 - self.ema_alpha) * ema_amp; } } } if jp_values.is_empty() { return Ok(MarineProsodyVector::zeros()); } // Compute statistics let n = jp_values.len() as f32; let duration_sec = samples.len() as f32 / self.sample_rate as f32; // Mean calculations let jp_mean = jp_values.iter().sum::() / n; let ja_mean = ja_values.iter().sum::() / n; let energy_mean = amplitudes.iter().map(|a| a * a).sum::() / amplitudes.len() as f32; // Std calculations let jp_var = jp_values.iter().map(|x| (x - jp_mean).powi(2)).sum::() / n; let ja_var = ja_values.iter().map(|x| (x - ja_mean).powi(2)).sum::() / n; let jp_std = jp_var.sqrt(); let ja_std = ja_var.sqrt(); // Harmonic score (simplified - assume voiced content) let h_mean = 1.0; // Overall salience score let s_mean = 1.0 / (1.0 + jp_mean + ja_mean); // Peak density let peak_density = peaks.len() as f32 / duration_sec; Ok(MarineProsodyVector { jp_mean, jp_std, ja_mean, ja_std, h_mean, s_mean, peak_density, energy_mean, }) } /// Validate TTS output quality using Marine salience /// /// Returns quality score and potential issues detected pub fn validate_tts_output(&self, samples: &[f32]) -> Result { let prosody = self.from_samples(samples)?; let mut issues = Vec::new(); // Check for common TTS problems if prosody.jp_mean < 0.005 { issues.push("Too perfect - sounds robotic (add natural variation)"); } if prosody.jp_mean > 0.3 { issues.push("High period jitter - possible artifacts"); } if prosody.ja_mean > 0.4 { issues.push("High amplitude jitter - volume inconsistency"); } if prosody.s_mean < 0.4 { issues.push("Low salience - audio quality issues"); } if prosody.peak_density < 10.0 { issues.push("Low peak density - missing speech energy"); } let quality_score = prosody.s_mean * 100.0; Ok(TTSQualityReport { prosody, quality_score, issues, }) } /// Get the configured sample rate pub fn sample_rate(&self) -> u32 { self.sample_rate } } /// Internal peak information struct PeakInfo { index: usize, amplitude: f32, } /// TTS quality validation report #[derive(Debug, Clone)] pub struct TTSQualityReport { /// Extracted prosody vector pub prosody: MarineProsodyVector, /// Overall quality score (0-100) pub quality_score: f32, /// List of detected issues pub issues: Vec<&'static str>, } impl TTSQualityReport { /// Check if quality passes threshold pub fn passes(&self, threshold: f32) -> bool { self.quality_score >= threshold && self.issues.is_empty() } } #[cfg(test)] mod tests { use super::*; #[test] fn test_prosody_vector_array_conversion() { let vec = MarineProsodyVector { jp_mean: 0.1, jp_std: 0.05, ja_mean: 0.2, ja_std: 0.1, h_mean: 0.9, s_mean: 0.8, peak_density: 50.0, energy_mean: 0.3, }; let arr = vec.to_array(); let reconstructed = MarineProsodyVector::from_array(arr); assert_eq!(vec.jp_mean, reconstructed.jp_mean); assert_eq!(vec.s_mean, reconstructed.s_mean); } #[test] fn test_conditioner_empty_buffer() { let conditioner = MarineProsodyConditioner::new(22050); let result = conditioner.from_samples(&[]); assert!(result.is_err()); } #[test] fn test_conditioner_silence() { let conditioner = MarineProsodyConditioner::new(22050); let silence = vec![0.0; 1000]; let prosody = conditioner.from_samples(&silence).unwrap(); // Should return zeros for silence assert_eq!(prosody.peak_density, 0.0); } #[test] fn test_estimate_valence() { let positive = MarineProsodyVector { jp_mean: 0.01, jp_std: 0.01, ja_mean: 0.01, ja_std: 0.01, h_mean: 1.0, s_mean: 0.95, peak_density: 100.0, energy_mean: 0.8, }; let negative = MarineProsodyVector { jp_mean: 0.5, jp_std: 0.3, ja_mean: 0.4, ja_std: 0.2, h_mean: 0.7, s_mean: 0.4, peak_density: 30.0, energy_mean: 0.1, }; // Higher energy + lower jitter should give more positive valence assert!(positive.estimate_valence() > negative.estimate_valence()); } }