#!/usr/bin/env python3 """ Audio Emotion & Mental Health Detection Model Lightweight version for Hugging Face Spaces Using scikit-learn instead of PyTorch """ import os import numpy as np import gradio as gr from typing import Dict import warnings import pickle warnings.filterwarnings('ignore') # Audio processing try: import librosa LIBROSA_AVAILABLE = True except ImportError: LIBROSA_AVAILABLE = False print("⚠️ Librosa not available, using scipy") from scipy.io import wavfile import scipy.signal as signal from scipy import fft # Machine Learning from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor from sklearn.preprocessing import StandardScaler from sklearn.neural_network import MLPClassifier, MLPRegressor # ============================================ # AUDIO PROCESSING # ============================================ class AudioFeatureExtractor: """Extract audio features without heavy dependencies""" def __init__(self, sr=16000, n_mfcc=20): self.sr = sr self.n_mfcc = n_mfcc def load_audio(self, audio_path): """Load audio file""" try: if LIBROSA_AVAILABLE: y, sr = librosa.load(audio_path, sr=self.sr, duration=3) return y, sr else: # Use scipy sr, y = wavfile.read(audio_path) # Convert to mono if len(y.shape) > 1: y = y.mean(axis=1) # Normalize y = y.astype(np.float32) if np.max(np.abs(y)) > 0: y = y / np.max(np.abs(y)) # Resample if needed if sr != self.sr: num_samples = int(len(y) * self.sr / sr) y = signal.resample(y, num_samples) # Limit to 3 seconds max_len = 3 * self.sr if len(y) > max_len: y = y[:max_len] return y, self.sr except Exception as e: print(f"Error loading audio: {e}") return np.random.randn(self.sr * 3) * 0.1, self.sr def get_mfcc_simple(self, y): """Simplified MFCC extraction""" # Pre-emphasis y_emphasized = np.append(y[0], y[1:] - 0.97 * y[:-1]) # Framing frame_length = int(0.025 * self.sr) frame_step = int(0.01 * self.sr) num_frames = 1 + int((len(y_emphasized) - frame_length) / frame_step) frames = np.zeros((num_frames, frame_length)) for i in range(num_frames): start = i * frame_step frames[i] = y_emphasized[start:start + frame_length] # Apply window frames *= np.hamming(frame_length) # FFT mag_frames = np.absolute(np.fft.rfft(frames, frame_length)) pow_frames = ((1.0 / frame_length) * (mag_frames ** 2)) # Mel filterbank nfft = frame_length nfilt = 26 low_freq_mel = 0 high_freq_mel = 2595 * np.log10(1 + (self.sr / 2) / 700) mel_points = np.linspace(low_freq_mel, high_freq_mel, nfilt + 2) hz_points = 700 * (10**(mel_points / 2595) - 1) bin_points = np.floor((nfft + 1) * hz_points / self.sr).astype(int) fbank = np.zeros((nfilt, int(nfft / 2 + 1))) for m in range(1, nfilt + 1): f_m_minus = bin_points[m - 1] f_m = bin_points[m] f_m_plus = bin_points[m + 1] for k in range(f_m_minus, f_m): fbank[m - 1, k] = (k - bin_points[m - 1]) / (bin_points[m] - bin_points[m - 1]) for k in range(f_m, f_m_plus): fbank[m - 1, k] = (bin_points[m + 1] - k) / (bin_points[m + 1] - bin_points[m]) filter_banks = np.dot(pow_frames, fbank.T) filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks) filter_banks = 20 * np.log10(filter_banks) # DCT mfcc = fft.dct(filter_banks, type=2, axis=1, norm='ortho')[:, :self.n_mfcc] return mfcc.T def extract_pitch(self, y): """Extract pitch using autocorrelation""" pitch_values = [] frame_length = int(0.03 * self.sr) hop_length = int(0.01 * self.sr) for i in range(0, len(y) - frame_length, hop_length): frame = y[i:i+frame_length] # Autocorrelation corr = np.correlate(frame, frame, mode='full') corr = corr[len(corr)//2:] # Find first peak after lag 0 d = np.diff(corr) start = int(self.sr / 400) # Min 400 Hz peak = np.where(d[start:] < 0)[0] if len(peak) > 0: peak_idx = peak[0] + start if peak_idx > 0: freq = self.sr / peak_idx if 50 < freq < 400: pitch_values.append(freq) return pitch_values if pitch_values else [150.0] def extract_energy(self, y): """Extract RMS energy""" frame_length = int(0.025 * self.sr) hop_length = int(0.01 * self.sr) rms = [] for i in range(0, len(y) - frame_length, hop_length): frame = y[i:i+frame_length] rms.append(np.sqrt(np.mean(frame**2))) return np.array(rms) def extract_zcr(self, y): """Zero crossing rate""" frame_length = int(0.025 * self.sr) hop_length = int(0.01 * self.sr) zcr = [] for i in range(0, len(y) - frame_length, hop_length): frame = y[i:i+frame_length] crossings = np.sum(np.abs(np.diff(np.sign(frame)))) / 2 zcr.append(crossings / frame_length) return np.array(zcr) def extract_spectral_features(self, y): """Spectral features""" spectrum = np.fft.rfft(y) magnitude = np.abs(spectrum) freq = np.fft.rfftfreq(len(y), 1.0/self.sr) # Spectral centroid centroid = np.sum(freq * magnitude) / (np.sum(magnitude) + 1e-6) # Spectral rolloff cumsum = np.cumsum(magnitude) rolloff_idx = np.where(cumsum >= 0.85 * cumsum[-1])[0] rolloff = freq[rolloff_idx[0]] if len(rolloff_idx) > 0 else 0 # Spectral bandwidth bandwidth = np.sqrt(np.sum(((freq - centroid)**2) * magnitude) / (np.sum(magnitude) + 1e-6)) return centroid, rolloff, bandwidth def extract_all_features(self, audio_path): """Extract all features""" try: y, sr = self.load_audio(audio_path) # MFCCs mfcc = self.get_mfcc_simple(y) mfcc_mean = np.mean(mfcc, axis=1) mfcc_std = np.std(mfcc, axis=1) # Pitch pitch_values = self.extract_pitch(y) pitch_mean = np.mean(pitch_values) pitch_std = np.std(pitch_values) pitch_min = np.min(pitch_values) pitch_max = np.max(pitch_values) monotone_score = 1.0 / (1.0 + pitch_std/10.0) # Energy rms = self.extract_energy(y) energy_mean = np.mean(rms) energy_std = np.std(rms) energy_max = np.max(rms) # ZCR zcr = self.extract_zcr(y) zcr_mean = np.mean(zcr) zcr_std = np.std(zcr) # Spectral spec_centroid, spec_rolloff, spec_bandwidth = self.extract_spectral_features(y) # Tempo estimation onset_env = rms tempo = 120.0 # Default if len(onset_env) > 10: autocorr = np.correlate(onset_env, onset_env, mode='full') autocorr = autocorr[len(autocorr)//2:] peaks = signal.find_peaks(autocorr)[0] if len(peaks) > 0 and peaks[0] > 0: tempo = 60.0 / (peaks[0] * 0.01) tempo = np.clip(tempo, 60, 180) # Combine features features = np.concatenate([ mfcc_mean, mfcc_std, [pitch_mean, pitch_std, pitch_min, pitch_max, monotone_score], [energy_mean, energy_std, energy_max], [zcr_mean, zcr_std], [spec_centroid, spec_rolloff, spec_bandwidth], [tempo] ]) # Derived scores vocal_affect = self._calc_affect(pitch_std, energy_std, spec_centroid) vocal_energy = self._calc_energy(energy_mean, tempo, zcr_mean) return { 'features': features.astype(np.float32), 'vocal_affect_score': float(vocal_affect), 'monotone_score': float(monotone_score), 'vocal_energy_score': float(vocal_energy), 'pitch_variability': float(pitch_std), 'energy_level': float(energy_mean) } except Exception as e: print(f"Error: {e}") return self._default_features() def _calc_affect(self, pitch_std, energy_std, spec_centroid): """Calculate vocal affect score""" pitch_comp = min(pitch_std / 50.0, 1.0) energy_comp = min(energy_std / 0.3, 1.0) spec_comp = min(spec_centroid / 2000.0, 1.0) return np.clip(pitch_comp * 0.4 + energy_comp * 0.4 + spec_comp * 0.2, 0, 1) def _calc_energy(self, energy_mean, tempo, zcr_mean): """Calculate vocal energy score""" energy_comp = min(energy_mean / 0.5, 1.0) tempo_comp = min(tempo / 150.0, 1.0) zcr_comp = min(zcr_mean / 0.15, 1.0) return np.clip(energy_comp * 0.5 + tempo_comp * 0.3 + zcr_comp * 0.2, 0, 1) def _default_features(self): """Default features for errors""" n_features = self.n_mfcc * 2 + 14 return { 'features': np.random.randn(n_features).astype(np.float32) * 0.1, 'vocal_affect_score': 0.5, 'monotone_score': 0.5, 'vocal_energy_score': 0.5, 'pitch_variability': 30.0, 'energy_level': 0.3 } # ============================================ # EMOTION PREDICTOR # ============================================ class EmotionPredictor: """Lightweight emotion predictor using sklearn""" def __init__(self): self.extractor = AudioFeatureExtractor(sr=16000, n_mfcc=20) # Emotion mapping self.emotions = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised'] # Initialize models self._initialize_models() def _initialize_models(self): """Initialize pre-trained or demo models""" # Try to load pre-trained models if os.path.exists('emotion_classifier.pkl'): try: with open('emotion_classifier.pkl', 'rb') as f: self.emotion_model = pickle.load(f) with open('affect_model.pkl', 'rb') as f: self.affect_model = pickle.load(f) with open('monotone_model.pkl', 'rb') as f: self.monotone_model = pickle.load(f) with open('energy_model.pkl', 'rb') as f: self.energy_model = pickle.load(f) with open('scaler.pkl', 'rb') as f: self.scaler = pickle.load(f) print("✅ Loaded pre-trained models") return except: pass # Create demo models (for demonstration without training) print("ℹ️ Creating demo models (for demonstration)") n_features = 54 # 20*2 MFCC + 14 other features # Emotion classifier self.emotion_model = RandomForestClassifier( n_estimators=100, max_depth=10, random_state=42 ) # Regression models self.affect_model = GradientBoostingRegressor(n_estimators=50, random_state=42) self.monotone_model = GradientBoostingRegressor(n_estimators=50, random_state=42) self.energy_model = GradientBoostingRegressor(n_estimators=50, random_state=42) # Scaler self.scaler = StandardScaler() # Fit with dummy data (for demo purposes) X_dummy = np.random.randn(100, n_features) y_emotion_dummy = np.random.randint(0, 8, 100) y_reg_dummy = np.random.rand(100) self.scaler.fit(X_dummy) self.emotion_model.fit(X_dummy, y_emotion_dummy) self.affect_model.fit(X_dummy, y_reg_dummy) self.monotone_model.fit(X_dummy, y_reg_dummy) self.energy_model.fit(X_dummy, y_reg_dummy) def predict(self, audio_path): """Predict emotion and mental health indicators""" # Extract features feature_dict = self.extractor.extract_all_features(audio_path) features = feature_dict['features'].reshape(1, -1) # Scale features features_scaled = self.scaler.transform(features) # Predict emotion emotion_probs = self.emotion_model.predict_proba(features_scaled)[0] emotion_idx = np.argmax(emotion_probs) emotion = self.emotions[emotion_idx] confidence = emotion_probs[emotion_idx] # Predict regression outputs vocal_affect = np.clip(self.affect_model.predict(features_scaled)[0], 0, 1) monotone_score = np.clip(self.monotone_model.predict(features_scaled)[0], 0, 1) vocal_energy = np.clip(self.energy_model.predict(features_scaled)[0], 0, 1) # Adjust with extracted features for better estimates vocal_affect = (vocal_affect + feature_dict['vocal_affect_score']) / 2 monotone_score = (monotone_score + feature_dict['monotone_score']) / 2 vocal_energy = (vocal_energy + feature_dict['vocal_energy_score']) / 2 # Mental health interpretation indicators = self._interpret_mental_health(monotone_score, vocal_affect, vocal_energy) return { 'emotion': emotion, 'confidence': confidence, 'emotion_probabilities': { self.emotions[i]: prob for i, prob in enumerate(emotion_probs) }, 'vocal_affect_score': vocal_affect, 'monotone_speech_score': monotone_score, 'vocal_energy_score': vocal_energy, 'pitch_variability': feature_dict['pitch_variability'], 'energy_level': feature_dict['energy_level'], 'mental_health_indicators': indicators } def _interpret_mental_health(self, monotone, affect, energy): """Interpret mental health indicators""" indicators = [] if monotone > 0.7: indicators.append("⚠️ High monotone score - possible depression indicator") if affect > 0.7 and energy > 0.7: indicators.append("⚠️ High vocal affect and energy - possible anxiety/stress") if energy < 0.3: indicators.append("⚠️ Low vocal energy - possible low motivation/depression") if affect > 0.6 and monotone < 0.4: indicators.append("⚠️ High vocal affect - possible emotional stress") if 0.35 <= monotone <= 0.65 and 0.35 <= affect <= 0.65 and 0.35 <= energy <= 0.65: indicators.append("✅ Balanced vocal characteristics") if not indicators: indicators.append("ℹ️ Vocal patterns within normal range") return indicators # ============================================ # GRADIO INTERFACE # ============================================ def create_app(): """Create Gradio app""" predictor = EmotionPredictor() def analyze_audio(audio): """Analysis function""" if audio is None: return "❌ Please upload an audio file", "", "", "", "", "" try: results = predictor.predict(audio) # Format emotion output emotion_text = f"## 🎭 **{results['emotion'].upper()}**\n\n" emotion_text += f"**Confidence:** {results['confidence']*100:.1f}%\n\n" emotion_text += "### Probability Distribution:\n" for emotion, prob in sorted(results['emotion_probabilities'].items(), key=lambda x: x[1], reverse=True): bar = "█" * int(prob * 20) + "░" * (20 - int(prob * 20)) emotion_text += f"**{emotion.title()}:** {bar} {prob*100:.1f}%\n" # Format scores affect = f"**Score:** {results['vocal_affect_score']:.3f}\n\n" if results['vocal_affect_score'] > 0.7: affect += "🔴 High intensity" elif results['vocal_affect_score'] < 0.3: affect += "🟢 Low intensity" else: affect += "🟡 Moderate" monotone = f"**Score:** {results['monotone_speech_score']:.3f}\n\n" if results['monotone_speech_score'] > 0.7: monotone += "🔴 Very flat speech" elif results['monotone_speech_score'] < 0.3: monotone += "🟢 Varied pitch" else: monotone += "🟡 Moderate variation" energy = f"**Score:** {results['vocal_energy_score']:.3f}\n\n" if results['vocal_energy_score'] > 0.7: energy += "🔴 High energy" elif results['vocal_energy_score'] < 0.3: energy += "🔴 Low energy" else: energy += "🟢 Normal energy" details = f"**Pitch Variability:** {results['pitch_variability']:.2f} Hz\n" details += f"**Energy Level:** {results['energy_level']:.3f}" mental = "\n".join(results['mental_health_indicators']) return emotion_text, affect, monotone, energy, details, mental except Exception as e: return f"❌ Error: {str(e)}", "", "", "", "", "" # Create interface with gr.Blocks(theme=gr.themes.Soft()) as demo: gr.Markdown(""" # 🎙️ Audio Emotion & Mental Health Detection Analyze emotional state and mental health indicators from speech audio. """) with gr.Row(): with gr.Column(): audio = gr.Audio(type="filepath", label="Upload Audio") btn = gr.Button("🔍 Analyze", variant="primary", size="lg") with gr.Column(): emotion_out = gr.Markdown() with gr.Row(): affect_out = gr.Markdown() monotone_out = gr.Markdown() energy_out = gr.Markdown() details_out = gr.Markdown() mental_out = gr.Markdown() gr.Markdown(""" ### 📊 Interpretation - **Vocal Affect:** Emotional intensity (0=calm, 1=intense) - **Monotone Score:** Pitch flatness (high=depression risk) - **Vocal Energy:** Speaking energy (low=low motivation) ⚠️ **Disclaimer:** For research only, not medical diagnosis. """) btn.click( analyze_audio, inputs=audio, outputs=[emotion_out, affect_out, monotone_out, energy_out, details_out, mental_out] ) return demo # ============================================ # MAIN # ============================================ if __name__ == "__main__": app = create_app() app.launch()