"""Example usage of CNN ONNX voicemail detector.
This script demonstrates how to use the fast CNN voicemail detection model
for both single file inference and real-time streaming scenarios.
"""
from pathlib import Path
from typing import Optional, Tuple
import librosa
import numpy as np
import onnxruntime as ort
class CNNVoicemailDetector:
"""Fast CNN-based voicemail detector using ONNX."""
def __init__(self, model_path: str = "model.onnx"):
"""Initialize the detector.
Args:
model_path: Path to the ONNX model file
"""
# Load ONNX model with optimizations
sess_options = ort.SessionOptions()
sess_options.graph_optimization_level = (
ort.GraphOptimizationLevel.ORT_ENABLE_ALL
)
self.session = ort.InferenceSession(model_path, sess_options)
self.sample_rate = 16000
self.duration = 4.0 # seconds
self.expected_samples = int(self.sample_rate * self.duration)
# Mel-spectrogram parameters
self.n_fft = 512
self.hop_length = 256
self.n_mels = 128
self.fmin = 0
self.fmax = 8000
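        # Note: fmax of 8000 Hz equals the Nyquist frequency for 16 kHz audio.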
# Class labels
self.id2label = {0: "live_human", 1: "voicemail"}
def extract_mel_spectrogram(self, audio: np.ndarray) -> np.ndarray:
"""Extract mel-spectrogram features from audio.
Args:
audio: Audio array of shape (64000,) - 4 seconds at 16kHz
Returns:
Mel-spectrogram of shape (1, 1, 128, 251)
"""
# Compute mel-spectrogram
mel_spec = librosa.feature.melspectrogram(
y=audio,
sr=self.sample_rate,
n_fft=self.n_fft,
hop_length=self.hop_length,
n_mels=self.n_mels,
fmin=self.fmin,
fmax=self.fmax,
)
# Convert to log scale (dB)
mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
# Normalize to [0, 1]
mel_spec_normalized = (mel_spec_db - mel_spec_db.min()) / (
mel_spec_db.max() - mel_spec_db.min() + 1e-8
)
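        # With 64,000 input samples and hop_length=256, librosa produces
        # 1 + 64000 // 256 = 251 frames, matching the model's expected input.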
# Reshape to (1, 1, 128, 251)
return mel_spec_normalized.reshape(1, 1, 128, -1).astype(np.float32)
def preprocess_audio(self, audio_path: str) -> np.ndarray:
"""Load and preprocess audio file.
Args:
audio_path: Path to audio file
Returns:
Preprocessed mel-spectrogram ready for inference
"""
# Load audio (mono, 16kHz)
audio, sr = librosa.load(audio_path, sr=self.sample_rate, mono=True)
# Take first 4 seconds (64,000 samples)
audio_segment = audio[: self.expected_samples]
# Pad if shorter than 4 seconds
if len(audio_segment) < self.expected_samples:
audio_segment = np.pad(
audio_segment, (0, self.expected_samples - len(audio_segment))
)
# Extract mel-spectrogram
return self.extract_mel_spectrogram(audio_segment)
def predict(self, audio_path: str) -> Tuple[str, float, dict]:
"""Detect voicemail from audio file.
Args:
audio_path: Path to audio file
Returns:
Tuple of (prediction, confidence, probabilities_dict)
"""
# Preprocess audio
mel_spec = self.preprocess_audio(audio_path)
# Run inference
outputs = self.session.run(None, {"input": mel_spec})
logits = outputs[0]
# Get prediction
prediction_idx = np.argmax(logits, axis=-1)[0]
prediction = self.id2label[prediction_idx]
        # Calculate probabilities with a numerically stable softmax
        exp_logits = np.exp(logits - np.max(logits, axis=-1, keepdims=True))
        probabilities = exp_logits / np.sum(exp_logits, axis=-1, keepdims=True)
confidence = probabilities[0][prediction_idx]
probs_dict = {
"live_human": float(probabilities[0][0]),
"voicemail": float(probabilities[0][1]),
}
return prediction, float(confidence), probs_dict
def predict_from_array(self, audio_array: np.ndarray) -> Tuple[str, float, dict]:
"""Detect voicemail from audio array.
Args:
audio_array: Audio array (4 seconds @ 16kHz = 64,000 samples)
Returns:
Tuple of (prediction, confidence, probabilities_dict)
"""
# Ensure correct length
if len(audio_array) < self.expected_samples:
audio_array = np.pad(
audio_array, (0, self.expected_samples - len(audio_array))
)
elif len(audio_array) > self.expected_samples:
audio_array = audio_array[: self.expected_samples]
# Extract mel-spectrogram
mel_spec = self.extract_mel_spectrogram(audio_array)
# Run inference
outputs = self.session.run(None, {"input": mel_spec})
logits = outputs[0]
# Get prediction
prediction_idx = np.argmax(logits, axis=-1)[0]
prediction = self.id2label[prediction_idx]
        # Calculate probabilities with a numerically stable softmax
        exp_logits = np.exp(logits - np.max(logits, axis=-1, keepdims=True))
        probabilities = exp_logits / np.sum(exp_logits, axis=-1, keepdims=True)
confidence = probabilities[0][prediction_idx]
probs_dict = {
"live_human": float(probabilities[0][0]),
"voicemail": float(probabilities[0][1]),
}
return prediction, float(confidence), probs_dict
class StreamingVoicemailDetector:
"""Real-time streaming voicemail detector with rolling buffer."""
def __init__(self, model_path: str = "model.onnx", sample_rate: int = 16000):
"""Initialize streaming detector.
Args:
model_path: Path to the ONNX model file
sample_rate: Audio sample rate (default: 16000)
"""
self.detector = CNNVoicemailDetector(model_path)
self.sample_rate = sample_rate
self.buffer_duration = 4.0 # seconds
self.buffer_size = int(sample_rate * self.buffer_duration)
self.audio_buffer = np.zeros(self.buffer_size, dtype=np.float32)
self.is_ready = False
self.samples_received = 0
def add_audio(self, audio_chunk: np.ndarray) -> None:
"""Add audio chunk to rolling buffer.
Args:
audio_chunk: New audio samples to add
"""
        chunk_size = len(audio_chunk)
        if chunk_size == 0:
            return
        # Shift buffer and add new audio; oversized chunks keep only the newest samples
        if chunk_size >= self.buffer_size:
            self.audio_buffer = np.asarray(audio_chunk[-self.buffer_size:], dtype=np.float32)
        else:
            self.audio_buffer = np.roll(self.audio_buffer, -chunk_size)
            self.audio_buffer[-chunk_size:] = audio_chunk
        # Track total samples received
        self.samples_received += chunk_size
# Mark as ready once we have enough samples
if self.samples_received >= self.buffer_size:
self.is_ready = True
    def detect(self) -> Optional[Tuple[str, float, dict]]:
"""Detect voicemail from current buffer.
Returns:
Tuple of (prediction, confidence, probabilities) or None if not ready
"""
if not self.is_ready:
return None
return self.detector.predict_from_array(self.audio_buffer)
def reset(self) -> None:
"""Reset the detector buffer."""
self.audio_buffer = np.zeros(self.buffer_size, dtype=np.float32)
self.is_ready = False
self.samples_received = 0
def example_single_file():
"""Example: Detect voicemail from a single audio file."""
print("=" * 60)
print("Example 1: Single File Detection")
print("=" * 60)
detector = CNNVoicemailDetector("model.onnx")
# Replace with your audio file path
audio_path = "test_audio.wav"
try:
prediction, confidence, probs = detector.predict(audio_path)
print(f"\nAudio: {audio_path}")
print(f"Prediction: {prediction}")
print(f"Confidence: {confidence:.2%}")
print("\nProbabilities:")
print(f" Live Human: {probs['live_human']:.2%}")
print(f" Voicemail: {probs['voicemail']:.2%}")
except FileNotFoundError:
print(f"\n⚠️ File not found: {audio_path}")
print("Please provide a valid audio file path.")
def example_batch_processing():
"""Example: Process multiple audio files."""
print("\n" + "=" * 60)
print("Example 2: Batch Processing")
print("=" * 60)
detector = CNNVoicemailDetector("model.onnx")
# Replace with your audio directory
audio_dir = Path("test_audios")
if not audio_dir.exists():
print(f"\n⚠️ Directory not found: {audio_dir}")
print("Please create a directory with audio files.")
return
audio_files = list(audio_dir.glob("*.wav")) + list(audio_dir.glob("*.mp3"))
if not audio_files:
print(f"\n⚠️ No audio files found in {audio_dir}")
return
print(f"\nProcessing {len(audio_files)} files...\n")
import time
results = []
total_time = 0
for audio_path in audio_files:
try:
start_time = time.perf_counter()
prediction, confidence, probs = detector.predict(str(audio_path))
inference_time = (time.perf_counter() - start_time) * 1000 # ms
total_time += inference_time
results.append(
{
"file": audio_path.name,
"prediction": prediction,
"confidence": confidence,
"time_ms": inference_time,
"probs": probs,
}
)
except Exception as e:
print(f"❌ Error processing {audio_path.name}: {e}")
# Print results table
print(f"{'File':<30} {'Prediction':<15} {'Confidence':<12} {'Time (ms)':<10}")
print("-" * 70)
for result in results:
print(
f"{result['file']:<30} "
f"{result['prediction']:<15} "
f"{result['confidence']:<12.2%} "
f"{result['time_ms']:<10.2f}"
)
# Summary
voicemail_count = sum(1 for r in results if r["prediction"] == "voicemail")
live_human_count = sum(1 for r in results if r["prediction"] == "live_human")
avg_time = total_time / len(results) if results else 0
print("\nSummary:")
print(f" Total files: {len(results)}")
print(f" Voicemail: {voicemail_count}")
print(f" Live Human: {live_human_count}")
print(f" Average inference time: {avg_time:.2f}ms")
print(f" Total processing time: {total_time:.2f}ms")
def example_streaming():
"""Example: Real-time streaming audio detection."""
print("\n" + "=" * 60)
print("Example 3: Real-time Streaming Detection")
print("=" * 60)
detector = StreamingVoicemailDetector("model.onnx")
sample_rate = 16000
chunk_duration = 0.5 # Process every 0.5 seconds
chunk_size = int(sample_rate * chunk_duration)
print(f"\nBuffer duration: {detector.buffer_duration}s")
print(f"Chunk duration: {chunk_duration}s")
print(f"Chunk size: {chunk_size} samples")
print("\nSimulating audio stream...\n")
# Simulate 10 chunks of audio (5 seconds total)
for i in range(10):
# Simulate incoming audio chunk (in practice, from microphone/stream)
audio_chunk = np.random.randn(chunk_size).astype(np.float32)
# Add to detector buffer
detector.add_audio(audio_chunk)
# Try to detect (will return None until buffer is filled)
result = detector.detect()
if result:
prediction, confidence, _ = result
status = "✅" if prediction == "voicemail" else "👤"
print(
f"Chunk {i + 1:2d}: {status} {prediction:<12} "
f"(confidence: {confidence:.2%})"
)
else:
samples_needed = detector.buffer_size - detector.samples_received
print(
f"Chunk {i + 1:2d}: ⏳ Buffering... ({samples_needed} samples needed)"
)
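
def example_microphone_streaming():
    """Sketch: stream microphone audio into StreamingVoicemailDetector.

    Illustrative sketch only (not run by __main__ because it requires audio
    hardware). It assumes the optional third-party ``sounddevice`` package is
    installed and that the default input device can capture at 16 kHz.
    """
    import sounddevice as sd  # optional dependency, assumed to be installed

    detector = StreamingVoicemailDetector("model.onnx")
    sample_rate = 16000
    chunk_size = int(sample_rate * 0.5)  # read 0.5 s of audio per iteration

    with sd.InputStream(samplerate=sample_rate, channels=1, dtype="float32") as stream:
        for _ in range(20):  # roughly 10 seconds of audio
            chunk, _overflowed = stream.read(chunk_size)
            detector.add_audio(chunk[:, 0])  # drop the channel dimension
            result = detector.detect()
            if result:
                prediction, confidence, _ = result
                print(f"{prediction} (confidence: {confidence:.2%})")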
def example_performance_benchmark():
"""Example: Benchmark inference performance."""
print("\n" + "=" * 60)
print("Example 4: Performance Benchmark")
print("=" * 60)
detector = CNNVoicemailDetector("model.onnx")
# Generate test audio
sample_rate = 16000
duration = 4.0
audio_array = np.random.randn(int(sample_rate * duration)).astype(np.float32)
# Warm-up run
detector.predict_from_array(audio_array)
# Benchmark
import time
num_iterations = 100
times = []
print(f"\nRunning {num_iterations} iterations...\n")
for i in range(num_iterations):
start_time = time.perf_counter()
prediction, confidence, _ = detector.predict_from_array(audio_array)
elapsed = (time.perf_counter() - start_time) * 1000 # ms
times.append(elapsed)
# Statistics
times = np.array(times)
print("Performance Statistics:")
print(f" Iterations: {num_iterations}")
print(f" Mean: {times.mean():.2f}ms")
print(f" Median: {np.median(times):.2f}ms")
print(f" Min: {times.min():.2f}ms")
print(f" Max: {times.max():.2f}ms")
print(f" Std Dev: {times.std():.2f}ms")
print(f"\n Throughput: {1000 / times.mean():.1f} inferences/second")
print(f" Real-time factor: {(duration * 1000) / times.mean():.1f}x")
def example_comparison_with_threshold():
"""Example: Using confidence thresholds for decision making."""
print("\n" + "=" * 60)
print("Example 5: Confidence Threshold Strategy")
print("=" * 60)
    # This example uses simulated confidence values, so no model inference is run.
test_cases = [
("high_confidence_voicemail", 0.95),
("medium_confidence_voicemail", 0.75),
("low_confidence_voicemail", 0.55),
("uncertain", 0.50),
("low_confidence_human", 0.45),
]
thresholds = {
"high": 0.90,
"medium": 0.70,
"low": 0.60,
}
print("\nConfidence Thresholds:")
print(f" High confidence: ≥ {thresholds['high']:.0%}")
print(f" Medium confidence: ≥ {thresholds['medium']:.0%}")
print(f" Low confidence: ≥ {thresholds['low']:.0%}")
print(f" Uncertain: < {thresholds['low']:.0%}")
print()
print(f"{'Case':<30} {'Confidence':<12} {'Action':<30}")
print("-" * 75)
for case_name, simulated_confidence in test_cases:
# Determine action based on threshold
if simulated_confidence >= thresholds["high"]:
action = "Immediate hangup (very sure)"
elif simulated_confidence >= thresholds["medium"]:
action = "Hangup (confident)"
elif simulated_confidence >= thresholds["low"]:
action = "Hangup with logging"
else:
action = "Wait for more audio / human verify"
print(f"{case_name:<30} {simulated_confidence:<12.2%} {action:<30}")
if __name__ == "__main__":
print("\n⚡ CNN Voicemail Detector - Usage Examples\n")
# Run all examples
example_single_file()
example_batch_processing()
example_streaming()
example_performance_benchmark()
example_comparison_with_threshold()
print("\n" + "=" * 60)
print("✅ All examples completed!")
print("=" * 60)