File size: 1,422 Bytes
2bbfbb7
 
 
 
 
 
0393dfa
2bbfbb7
 
0393dfa
 
 
 
2bbfbb7
0393dfa
2bbfbb7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0393dfa
2bbfbb7
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
//! Audio processing module for IndexTTS
//!
//! Provides mel-spectrogram computation, audio I/O, and DSP operations.

mod dsp;
mod io;
pub mod mel;
mod resample;

pub use dsp::{
    apply_fade, apply_preemphasis, dynamic_range_compression, dynamic_range_decompression,
    normalize_audio, normalize_audio_peak,
};
pub use io::{load_audio, save_audio, AudioData};
pub use mel::{mel_spectrogram, mel_to_linear, MelFilterbank};
pub use resample::resample;

use crate::Result;

/// Audio processing configuration.
///
/// Groups the parameters that the routines in this module take as a single
/// value; for example, `compute_mel_from_file` hands `sample_rate` to
/// `load_audio` and the whole configuration to `mel_spectrogram`.
///
/// `PartialEq` is derived so configurations can be compared directly (handy in
/// tests and for cache checks). `Eq` is intentionally not derived because
/// `fmin` and `fmax` are `f32`.
#[derive(Debug, Clone, PartialEq)]
pub struct AudioConfig {
    /// Sample rate (samples per second).
    pub sample_rate: u32,
    /// FFT size.
    pub n_fft: usize,
    /// Hop length for the STFT (presumably counted in samples; confirm against `mel`).
    pub hop_length: usize,
    /// Window length (presumably counted in samples; confirm against `mel`).
    pub win_length: usize,
    /// Number of mel bands.
    pub n_mels: usize,
    /// Minimum frequency (presumably in Hz; confirm against `mel`).
    pub fmin: f32,
    /// Maximum frequency (presumably in Hz; confirm against `mel`).
    pub fmax: f32,
}

impl Default for AudioConfig {
    fn default() -> Self {
        Self {
            sample_rate: 22050,
            n_fft: 1024,
            hop_length: 256,
            win_length: 1024,
            n_mels: 80,
            fmin: 0.0,
            fmax: 8000.0,
        }
    }
}

/// Load an audio file and compute its mel spectrogram.
///
/// The file at `path` is read with `load_audio`, which receives
/// `config.sample_rate` as its optional rate argument (presumably the rate the
/// audio is resampled to; confirm in `io`). The loaded samples are then passed
/// to `mel_spectrogram` together with `config`.
///
/// # Errors
///
/// Propagates any error returned by `load_audio` or `mel_spectrogram`.
pub fn compute_mel_from_file(path: &str, config: &AudioConfig) -> Result<ndarray::Array2<f32>> {
    let requested_rate = Some(config.sample_rate);
    let loaded = load_audio(path, requested_rate)?;

    let samples = &loaded.samples;
    mel_spectrogram(samples, config)
}