Update app.py
app.py
CHANGED
|
@@ -1,39 +1,43 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
Audio Emotion & Mental Health Detection Model
|
| 4 |
-
|
|
|
|
| 5 |
"""
|
| 6 |
|
| 7 |
import os
|
| 8 |
import numpy as np
|
| 9 |
-
import torch
|
| 10 |
-
import torch.nn as nn
|
| 11 |
-
import torch.nn.functional as F
|
| 12 |
import gradio as gr
|
| 13 |
-
from typing import Dict
|
| 14 |
import warnings
|
|
|
|
| 15 |
warnings.filterwarnings('ignore')
|
| 16 |
|
| 17 |
-
#
|
| 18 |
try:
|
| 19 |
import librosa
|
| 20 |
LIBROSA_AVAILABLE = True
|
| 21 |
except ImportError:
|
| 22 |
LIBROSA_AVAILABLE = False
|
| 23 |
-
print("β οΈ Librosa not available, using
|
| 24 |
|
| 25 |
-
import scipy.signal as signal
|
| 26 |
from scipy.io import wavfile
|
| 27 |
-
import scipy.
|
| 28 |
|
| 29 |
# ============================================
|
| 30 |
-
#
|
| 31 |
# ============================================
|
| 32 |
|
| 33 |
-
class
|
| 34 |
-
"""
|
| 35 |
|
| 36 |
-
def __init__(self, sr=16000, n_mfcc=
|
| 37 |
self.sr = sr
|
| 38 |
self.n_mfcc = n_mfcc
|
| 39 |
|
|
@@ -42,238 +46,200 @@ class LightweightAudioProcessor:
|
|
| 42 |
try:
|
| 43 |
if LIBROSA_AVAILABLE:
|
| 44 |
y, sr = librosa.load(audio_path, sr=self.sr, duration=3)
|
|
|
|
| 45 |
else:
|
| 46 |
-
#
|
| 47 |
sr, y = wavfile.read(audio_path)
|
|
|
|
|
|
|
| 48 |
if len(y.shape) > 1:
|
| 49 |
-
y = y.mean(axis=1)
|
| 50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
|
| 52 |
# Resample if needed
|
| 53 |
if sr != self.sr:
|
| 54 |
num_samples = int(len(y) * self.sr / sr)
|
| 55 |
y = signal.resample(y, num_samples)
|
| 56 |
|
| 57 |
-
# Limit
|
| 58 |
max_len = 3 * self.sr
|
| 59 |
if len(y) > max_len:
|
| 60 |
y = y[:max_len]
|
| 61 |
-
|
| 62 |
-
|
| 63 |
except Exception as e:
|
| 64 |
print(f"Error loading audio: {e}")
|
| 65 |
-
return np.random.randn(self.sr * 3), self.sr
|
| 66 |
|
| 67 |
-
def
|
| 68 |
-
"""
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
else:
|
| 72 |
-
# Simplified MFCC calculation
|
| 73 |
-
# Apply pre-emphasis
|
| 74 |
-
emphasized = np.append(y[0], y[1:] - 0.97 * y[:-1])
|
| 75 |
-
|
| 76 |
-
# Frame the signal
|
| 77 |
-
frame_size = int(0.025 * self.sr)
|
| 78 |
-
frame_stride = int(0.01 * self.sr)
|
| 79 |
-
frames = self._frame_signal(emphasized, frame_size, frame_stride)
|
| 80 |
-
|
| 81 |
-
# Apply FFT
|
| 82 |
-
mag_frames = np.absolute(np.fft.rfft(frames, frame_size))
|
| 83 |
-
pow_frames = ((1.0 / frame_size) * (mag_frames ** 2))
|
| 84 |
-
|
| 85 |
-
# Mel filter banks (simplified)
|
| 86 |
-
mel_filters = self._create_mel_filters(26, frame_size, self.sr)
|
| 87 |
-
filter_banks = np.dot(pow_frames, mel_filters.T)
|
| 88 |
-
filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks)
|
| 89 |
-
filter_banks = 20 * np.log10(filter_banks)
|
| 90 |
-
|
| 91 |
-
# DCT to get MFCCs
|
| 92 |
-
mfccs = fft.dct(filter_banks, type=2, axis=1, norm='ortho')[:, :self.n_mfcc].T
|
| 93 |
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
low_freq_mel = 0
|
| 116 |
-
high_freq_mel = 2595 * np.log10(1 + (
|
| 117 |
-
mel_points = np.linspace(low_freq_mel, high_freq_mel,
|
| 118 |
hz_points = 700 * (10**(mel_points / 2595) - 1)
|
| 119 |
-
bin_points = np.floor((
|
| 120 |
|
| 121 |
-
fbank = np.zeros((
|
| 122 |
-
for m in range(1,
|
| 123 |
-
f_m_minus =
|
| 124 |
-
f_m =
|
| 125 |
-
f_m_plus =
|
| 126 |
|
| 127 |
for k in range(f_m_minus, f_m):
|
| 128 |
fbank[m - 1, k] = (k - bin_points[m - 1]) / (bin_points[m] - bin_points[m - 1])
|
| 129 |
for k in range(f_m, f_m_plus):
|
| 130 |
fbank[m - 1, k] = (bin_points[m + 1] - k) / (bin_points[m + 1] - bin_points[m])
|
| 131 |
|
| 132 |
-
|
|
|
|
| 133 |
|
| 134 |
def extract_pitch(self, y):
|
| 135 |
-
"""Extract pitch
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
pitch_values.append(pitch)
|
| 161 |
-
|
| 162 |
-
return pitch_values if pitch_values else [0]
|
| 163 |
|
| 164 |
def extract_energy(self, y):
|
| 165 |
-
"""Extract energy
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
for i in range(0, len(y) - frame_length, hop_length):
|
| 174 |
-
frame = y[i:i+frame_length]
|
| 175 |
-
rms.append(np.sqrt(np.mean(frame**2)))
|
| 176 |
-
|
| 177 |
-
rms = np.array(rms)
|
| 178 |
|
| 179 |
-
return rms
|
| 180 |
|
| 181 |
def extract_zcr(self, y):
|
| 182 |
-
"""
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
frame = y[i:i+frame_length]
|
| 192 |
-
zero_crossings = np.sum(np.abs(np.diff(np.sign(frame)))) / 2
|
| 193 |
-
zcr.append(zero_crossings / frame_length)
|
| 194 |
-
|
| 195 |
-
zcr = np.array(zcr)
|
| 196 |
|
| 197 |
-
return zcr
|
| 198 |
|
| 199 |
def extract_spectral_features(self, y):
|
| 200 |
-
"""
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
magnitude = np.abs(fft_spectrum)
|
| 204 |
freq = np.fft.rfftfreq(len(y), 1.0/self.sr)
|
| 205 |
|
| 206 |
# Spectral centroid
|
| 207 |
-
|
| 208 |
|
| 209 |
-
# Spectral rolloff
|
| 210 |
cumsum = np.cumsum(magnitude)
|
| 211 |
rolloff_idx = np.where(cumsum >= 0.85 * cumsum[-1])[0]
|
| 212 |
-
|
| 213 |
|
| 214 |
# Spectral bandwidth
|
| 215 |
-
|
| 216 |
-
spectral_bandwidth = np.sqrt(np.sum((deviation**2) * magnitude) / np.sum(magnitude))
|
| 217 |
|
| 218 |
-
return
|
| 219 |
-
|
| 220 |
-
def estimate_tempo(self, y):
|
| 221 |
-
"""Estimate tempo"""
|
| 222 |
-
if LIBROSA_AVAILABLE:
|
| 223 |
-
tempo, _ = librosa.beat.beat_track(y=y, sr=self.sr)
|
| 224 |
-
return tempo
|
| 225 |
-
else:
|
| 226 |
-
# Simplified tempo estimation
|
| 227 |
-
onset_env = self.extract_energy(y)
|
| 228 |
-
autocorr = np.correlate(onset_env, onset_env, mode='full')
|
| 229 |
-
autocorr = autocorr[len(autocorr)//2:]
|
| 230 |
-
|
| 231 |
-
# Find tempo peaks
|
| 232 |
-
peaks = signal.find_peaks(autocorr)[0]
|
| 233 |
-
if len(peaks) > 0:
|
| 234 |
-
tempo = 60.0 / (peaks[0] * 0.01) if peaks[0] > 0 else 120
|
| 235 |
-
return np.clip(tempo, 60, 180)
|
| 236 |
-
return 120
|
| 237 |
|
| 238 |
def extract_all_features(self, audio_path):
|
| 239 |
-
"""Extract
|
| 240 |
try:
|
| 241 |
-
# Load audio
|
| 242 |
y, sr = self.load_audio(audio_path)
|
| 243 |
|
| 244 |
-
#
|
| 245 |
-
|
| 246 |
-
mfcc_mean = np.mean(
|
| 247 |
-
mfcc_std = np.std(
|
| 248 |
|
| 249 |
-
#
|
| 250 |
pitch_values = self.extract_pitch(y)
|
| 251 |
pitch_mean = np.mean(pitch_values)
|
| 252 |
pitch_std = np.std(pitch_values)
|
| 253 |
pitch_min = np.min(pitch_values)
|
| 254 |
pitch_max = np.max(pitch_values)
|
| 255 |
-
monotone_score = 1 / (1 + pitch_std
|
| 256 |
|
| 257 |
-
#
|
| 258 |
rms = self.extract_energy(y)
|
| 259 |
energy_mean = np.mean(rms)
|
| 260 |
energy_std = np.std(rms)
|
| 261 |
energy_max = np.max(rms)
|
| 262 |
|
| 263 |
-
#
|
| 264 |
zcr = self.extract_zcr(y)
|
| 265 |
zcr_mean = np.mean(zcr)
|
| 266 |
zcr_std = np.std(zcr)
|
| 267 |
|
| 268 |
-
#
|
| 269 |
-
|
| 270 |
-
self.extract_spectral_features(y)
|
| 271 |
|
| 272 |
-
#
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 277 |
|
| 278 |
# Combine features
|
| 279 |
features = np.concatenate([
|
|
@@ -282,231 +248,164 @@ class LightweightAudioProcessor:
|
|
| 282 |
[pitch_mean, pitch_std, pitch_min, pitch_max, monotone_score],
|
| 283 |
[energy_mean, energy_std, energy_max],
|
| 284 |
[zcr_mean, zcr_std],
|
| 285 |
-
[
|
| 286 |
-
[chroma_mean],
|
| 287 |
[tempo]
|
| 288 |
])
|
| 289 |
|
| 290 |
-
#
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
)
|
| 294 |
-
vocal_energy_score = self._calculate_vocal_energy(
|
| 295 |
-
energy_mean, tempo, zcr_mean
|
| 296 |
-
)
|
| 297 |
|
| 298 |
return {
|
| 299 |
'features': features.astype(np.float32),
|
| 300 |
-
'vocal_affect_score': float(
|
| 301 |
'monotone_score': float(monotone_score),
|
| 302 |
-
'vocal_energy_score': float(
|
| 303 |
'pitch_variability': float(pitch_std),
|
| 304 |
'energy_level': float(energy_mean)
|
| 305 |
}
|
| 306 |
|
| 307 |
except Exception as e:
|
| 308 |
-
print(f"Error
|
| 309 |
-
|
| 310 |
-
return self._get_default_features()
|
| 311 |
|
| 312 |
-
def
|
| 313 |
-
"""Calculate
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
affect_score = (pitch_component * 0.4 +
|
| 319 |
-
energy_component * 0.4 +
|
| 320 |
-
spectral_component * 0.2)
|
| 321 |
-
|
| 322 |
-
return np.clip(affect_score, 0, 1)
|
| 323 |
|
| 324 |
-
def
|
| 325 |
-
"""Calculate vocal energy
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
energy_score = (energy_component * 0.5 +
|
| 331 |
-
tempo_component * 0.3 +
|
| 332 |
-
zcr_component * 0.2)
|
| 333 |
-
|
| 334 |
-
return np.clip(energy_score, 0, 1)
|
| 335 |
|
| 336 |
-
def
|
| 337 |
-
"""
|
| 338 |
-
n_features = self.n_mfcc * 2 +
|
| 339 |
return {
|
| 340 |
-
'features': np.random.randn(n_features).astype(np.float32),
|
| 341 |
'vocal_affect_score': 0.5,
|
| 342 |
'monotone_score': 0.5,
|
| 343 |
'vocal_energy_score': 0.5,
|
| 344 |
-
'pitch_variability':
|
| 345 |
-
'energy_level': 0.
|
| 346 |
}
|
| 347 |
|
| 348 |
|
| 349 |
# ============================================
|
| 350 |
-
#
|
| 351 |
-
# ============================================
|
| 352 |
-
|
| 353 |
-
class MultiTaskEmotionModel(nn.Module):
|
| 354 |
-
"""Multi-task emotion and mental health detection model"""
|
| 355 |
-
|
| 356 |
-
def __init__(self, input_dim, num_emotions=8, dropout=0.5):
|
| 357 |
-
super(MultiTaskEmotionModel, self).__init__()
|
| 358 |
-
|
| 359 |
-
# Shared layers
|
| 360 |
-
self.shared_layers = nn.Sequential(
|
| 361 |
-
nn.Linear(input_dim, 512),
|
| 362 |
-
nn.BatchNorm1d(512),
|
| 363 |
-
nn.ReLU(),
|
| 364 |
-
nn.Dropout(dropout),
|
| 365 |
-
|
| 366 |
-
nn.Linear(512, 256),
|
| 367 |
-
nn.BatchNorm1d(256),
|
| 368 |
-
nn.ReLU(),
|
| 369 |
-
nn.Dropout(dropout),
|
| 370 |
-
|
| 371 |
-
nn.Linear(256, 128),
|
| 372 |
-
nn.BatchNorm1d(128),
|
| 373 |
-
nn.ReLU(),
|
| 374 |
-
nn.Dropout(dropout/2)
|
| 375 |
-
)
|
| 376 |
-
|
| 377 |
-
# Emotion classification head
|
| 378 |
-
self.emotion_head = nn.Sequential(
|
| 379 |
-
nn.Linear(128, 64),
|
| 380 |
-
nn.ReLU(),
|
| 381 |
-
nn.Dropout(dropout/2),
|
| 382 |
-
nn.Linear(64, num_emotions)
|
| 383 |
-
)
|
| 384 |
-
|
| 385 |
-
# Regression heads
|
| 386 |
-
self.affect_head = nn.Sequential(
|
| 387 |
-
nn.Linear(128, 32),
|
| 388 |
-
nn.ReLU(),
|
| 389 |
-
nn.Linear(32, 1),
|
| 390 |
-
nn.Sigmoid()
|
| 391 |
-
)
|
| 392 |
-
|
| 393 |
-
self.monotone_head = nn.Sequential(
|
| 394 |
-
nn.Linear(128, 32),
|
| 395 |
-
nn.ReLU(),
|
| 396 |
-
nn.Linear(32, 1),
|
| 397 |
-
nn.Sigmoid()
|
| 398 |
-
)
|
| 399 |
-
|
| 400 |
-
self.energy_head = nn.Sequential(
|
| 401 |
-
nn.Linear(128, 32),
|
| 402 |
-
nn.ReLU(),
|
| 403 |
-
nn.Linear(32, 1),
|
| 404 |
-
nn.Sigmoid()
|
| 405 |
-
)
|
| 406 |
-
|
| 407 |
-
def forward(self, x):
|
| 408 |
-
shared = self.shared_layers(x)
|
| 409 |
-
|
| 410 |
-
return {
|
| 411 |
-
'emotion_logits': self.emotion_head(shared),
|
| 412 |
-
'vocal_affect': self.affect_head(shared),
|
| 413 |
-
'monotone_score': self.monotone_head(shared),
|
| 414 |
-
'vocal_energy': self.energy_head(shared)
|
| 415 |
-
}
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
# ============================================
|
| 419 |
-
# PREDICTOR CLASS
|
| 420 |
# ============================================
|
| 421 |
|
| 422 |
class EmotionPredictor:
|
| 423 |
-
"""
|
| 424 |
|
| 425 |
def __init__(self):
|
| 426 |
-
self.
|
| 427 |
-
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
| 428 |
|
| 429 |
# Emotion mapping
|
| 430 |
-
self.
|
| 431 |
-
'neutral': 0, 'calm': 1, 'happy': 2, 'sad': 3,
|
| 432 |
-
'angry': 4, 'fearful': 5, 'disgust': 6, 'surprised': 7
|
| 433 |
-
}
|
| 434 |
-
self.reverse_emotion_map = {v: k for k, v in self.emotion_map.items()}
|
| 435 |
-
|
| 436 |
-
# Initialize model with pre-trained weights
|
| 437 |
-
input_dim = 98 # 40*2 (MFCC mean+std) + 18 other features
|
| 438 |
-
self.model = MultiTaskEmotionModel(
|
| 439 |
-
input_dim=input_dim,
|
| 440 |
-
num_emotions=len(self.emotion_map),
|
| 441 |
-
dropout=0.3
|
| 442 |
-
)
|
| 443 |
-
|
| 444 |
-
# Load pre-trained weights if available, otherwise use initialized weights
|
| 445 |
-
self._load_or_initialize_model()
|
| 446 |
|
| 447 |
-
|
| 448 |
-
self.
|
| 449 |
|
| 450 |
-
def
|
| 451 |
-
"""
|
| 452 |
-
model_path = 'emotion_model.pth'
|
| 453 |
|
| 454 |
-
|
|
|
|
| 455 |
try:
|
| 456 |
-
|
| 457 |
-
|
| 458 |
-
|
| 459 |
-
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
|
| 464 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 465 |
|
| 466 |
-
def predict(self, audio_path
|
| 467 |
"""Predict emotion and mental health indicators"""
|
| 468 |
|
| 469 |
# Extract features
|
| 470 |
-
feature_dict = self.
|
| 471 |
-
features =
|
| 472 |
-
features = features.to(self.device)
|
| 473 |
|
| 474 |
-
#
|
| 475 |
-
|
| 476 |
-
outputs = self.model(features)
|
| 477 |
|
| 478 |
-
#
|
| 479 |
-
emotion_probs =
|
| 480 |
-
emotion_idx =
|
| 481 |
-
emotion = self.
|
| 482 |
-
confidence = emotion_probs[emotion_idx]
|
| 483 |
|
| 484 |
-
#
|
| 485 |
-
vocal_affect =
|
| 486 |
-
monotone_score =
|
| 487 |
-
vocal_energy =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 488 |
|
| 489 |
# Mental health interpretation
|
| 490 |
-
|
| 491 |
-
monotone_score, vocal_affect, vocal_energy
|
| 492 |
-
)
|
| 493 |
|
| 494 |
-
|
| 495 |
'emotion': emotion,
|
| 496 |
'confidence': confidence,
|
| 497 |
'emotion_probabilities': {
|
| 498 |
-
self.
|
| 499 |
-
for i, prob in enumerate(emotion_probs)
|
| 500 |
},
|
| 501 |
'vocal_affect_score': vocal_affect,
|
| 502 |
'monotone_speech_score': monotone_score,
|
| 503 |
'vocal_energy_score': vocal_energy,
|
| 504 |
'pitch_variability': feature_dict['pitch_variability'],
|
| 505 |
'energy_level': feature_dict['energy_level'],
|
| 506 |
-
'mental_health_indicators':
|
| 507 |
}
|
| 508 |
-
|
| 509 |
-
return results
|
| 510 |
|
| 511 |
def _interpret_mental_health(self, monotone, affect, energy):
|
| 512 |
"""Interpret mental health indicators"""
|
|
@@ -524,8 +423,8 @@ class EmotionPredictor:
|
|
| 524 |
if affect > 0.6 and monotone < 0.4:
|
| 525 |
indicators.append("β οΈ High vocal affect - possible emotional stress")
|
| 526 |
|
| 527 |
-
if 0.
|
| 528 |
-
indicators.append("β
Balanced vocal characteristics
|
| 529 |
|
| 530 |
if not indicators:
|
| 531 |
indicators.append("βΉοΈ Vocal patterns within normal range")
|
|
@@ -537,188 +436,111 @@ class EmotionPredictor:
|
|
| 537 |
# GRADIO INTERFACE
|
| 538 |
# ============================================
|
| 539 |
|
| 540 |
-
def
|
| 541 |
-
"""Create Gradio
|
| 542 |
|
| 543 |
-
# Initialize predictor
|
| 544 |
-
print("Initializing emotion predictor...")
|
| 545 |
predictor = EmotionPredictor()
|
| 546 |
-
print("β
Predictor ready!")
|
| 547 |
|
| 548 |
-
def
|
| 549 |
-
"""
|
| 550 |
if audio is None:
|
| 551 |
-
return
|
| 552 |
-
emotion_output: "β Please upload an audio file",
|
| 553 |
-
affect_output: "",
|
| 554 |
-
monotone_output: "",
|
| 555 |
-
energy_output: "",
|
| 556 |
-
pitch_output: "",
|
| 557 |
-
mental_health_output: ""
|
| 558 |
-
}
|
| 559 |
|
| 560 |
try:
|
| 561 |
-
# Run prediction
|
| 562 |
results = predictor.predict(audio)
|
| 563 |
|
| 564 |
# Format emotion output
|
| 565 |
-
emotion_text = f"## π
|
| 566 |
emotion_text += f"**Confidence:** {results['confidence']*100:.1f}%\n\n"
|
| 567 |
-
emotion_text += "###
|
| 568 |
|
| 569 |
for emotion, prob in sorted(results['emotion_probabilities'].items(),
|
| 570 |
key=lambda x: x[1], reverse=True):
|
| 571 |
-
|
| 572 |
-
|
| 573 |
-
emotion_text += f"**{emotion.capitalize()}:** {bar} {prob*100:.1f}%\n"
|
| 574 |
|
| 575 |
# Format scores
|
| 576 |
-
|
| 577 |
if results['vocal_affect_score'] > 0.7:
|
| 578 |
-
|
| 579 |
elif results['vocal_affect_score'] < 0.3:
|
| 580 |
-
|
| 581 |
else:
|
| 582 |
-
|
| 583 |
|
| 584 |
-
|
| 585 |
if results['monotone_speech_score'] > 0.7:
|
| 586 |
-
|
| 587 |
elif results['monotone_speech_score'] < 0.3:
|
| 588 |
-
|
| 589 |
else:
|
| 590 |
-
|
| 591 |
|
| 592 |
-
|
| 593 |
if results['vocal_energy_score'] > 0.7:
|
| 594 |
-
|
| 595 |
elif results['vocal_energy_score'] < 0.3:
|
| 596 |
-
|
| 597 |
else:
|
| 598 |
-
|
| 599 |
|
| 600 |
-
|
| 601 |
-
|
| 602 |
|
| 603 |
-
|
| 604 |
|
| 605 |
-
return
|
| 606 |
-
emotion_output: emotion_text,
|
| 607 |
-
affect_output: affect_text,
|
| 608 |
-
monotone_output: monotone_text,
|
| 609 |
-
energy_output: energy_text,
|
| 610 |
-
pitch_output: pitch_text,
|
| 611 |
-
mental_health_output: mental_health_text
|
| 612 |
-
}
|
| 613 |
|
| 614 |
except Exception as e:
|
| 615 |
-
|
| 616 |
-
return {
|
| 617 |
-
emotion_output: error_msg,
|
| 618 |
-
affect_output: "",
|
| 619 |
-
monotone_output: "",
|
| 620 |
-
energy_output: "",
|
| 621 |
-
pitch_output: "",
|
| 622 |
-
mental_health_output: ""
|
| 623 |
-
}
|
| 624 |
|
| 625 |
# Create interface
|
| 626 |
-
with gr.Blocks(theme=gr.themes.Soft()
|
| 627 |
-
|
| 628 |
gr.Markdown("""
|
| 629 |
# 🎙️ Audio Emotion & Mental Health Detection
|
| 630 |
|
| 631 |
-
|
| 632 |
-
|
| 633 |
-
**Features:**
|
| 634 |
-
- π Emotion Recognition (8 emotions)
|
| 635 |
-
- π Vocal Affect Score (emotional intensity)
|
| 636 |
-
- π Monotone Speech Detection (depression indicator)
|
| 637 |
-
- β‘ Vocal Energy Analysis (mood disorder indicator)
|
| 638 |
""")
|
| 639 |
|
| 640 |
with gr.Row():
|
| 641 |
-
with gr.Column(
|
| 642 |
-
|
| 643 |
-
|
| 644 |
-
label="Upload Audio File (WAV, MP3, etc.)"
|
| 645 |
-
)
|
| 646 |
-
|
| 647 |
-
analyze_btn = gr.Button("π Analyze Audio", variant="primary", size="lg")
|
| 648 |
-
|
| 649 |
-
gr.Markdown("""
|
| 650 |
-
### π Instructions:
|
| 651 |
-
1. Upload an audio file (WAV, MP3, etc.)
|
| 652 |
-
2. Click "Analyze Audio"
|
| 653 |
-
3. View results on the right
|
| 654 |
-
|
| 655 |
-
**Note:** Works best with clear speech recordings (3-10 seconds)
|
| 656 |
-
""")
|
| 657 |
|
| 658 |
-
with gr.Column(
|
| 659 |
-
|
| 660 |
|
| 661 |
with gr.Row():
|
| 662 |
-
|
| 663 |
-
|
| 664 |
-
|
| 665 |
-
monotone_output = gr.Markdown(label="Monotone Score")
|
| 666 |
-
with gr.Column():
|
| 667 |
-
energy_output = gr.Markdown(label="Vocal Energy")
|
| 668 |
|
| 669 |
-
|
| 670 |
-
|
| 671 |
|
| 672 |
gr.Markdown("""
|
| 673 |
-
|
| 674 |
-
|
| 675 |
-
|
| 676 |
-
|
| 677 |
-
|
| 678 |
-
|
| 679 |
-
|
| 680 |
-
| | 0.7-1.0 | High emotional intensity (stress/anxiety) |
|
| 681 |
-
| **Monotone Score** | 0.0-0.3 | High pitch variation (normal) |
|
| 682 |
-
| | 0.3-0.7 | Moderate pitch variation |
|
| 683 |
-
| | 0.7-1.0 | Very flat speech (possible depression) |
|
| 684 |
-
| **Vocal Energy** | 0.0-0.3 | Low energy (possible low motivation) |
|
| 685 |
-
| | 0.3-0.7 | Normal energy level |
|
| 686 |
-
| | 0.7-1.0 | High energy (possible anxiety/mania) |
|
| 687 |
-
|
| 688 |
-
---
|
| 689 |
-
|
| 690 |
-
**⚠️ Disclaimer:** This tool is for research and informational purposes only.
|
| 691 |
-
It should not be used as a substitute for professional medical or psychological diagnosis.
|
| 692 |
-
Always consult qualified healthcare professionals for mental health concerns.
|
| 693 |
-
|
| 694 |
-
**🔬 Model Info:** Multi-task Deep Neural Network trained on emotional speech datasets (RAVDESS, TESS, CREMA-D)
|
| 695 |
""")
|
| 696 |
|
| 697 |
-
|
| 698 |
-
|
| 699 |
-
|
| 700 |
-
|
| 701 |
-
outputs=[emotion_output, affect_output, monotone_output,
|
| 702 |
-
energy_output, pitch_output, mental_health_output]
|
| 703 |
)
|
| 704 |
|
| 705 |
return demo
|
| 706 |
|
| 707 |
|
| 708 |
# ============================================
|
| 709 |
-
# MAIN
|
| 710 |
# ============================================
|
| 711 |
|
| 712 |
if __name__ == "__main__":
|
| 713 |
-
|
| 714 |
-
|
| 715 |
-
print("="*60)
|
| 716 |
-
print("\nStarting Gradio interface...")
|
| 717 |
-
|
| 718 |
-
# Create and launch app
|
| 719 |
-
app = create_gradio_app()
|
| 720 |
-
app.launch(
|
| 721 |
-
server_name="0.0.0.0",
|
| 722 |
-
server_port=7860,
|
| 723 |
-
share=False
|
| 724 |
-
)
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
Audio Emotion & Mental Health Detection Model
|
| 4 |
+
Lightweight version for Hugging Face Spaces
|
| 5 |
+
Using scikit-learn instead of PyTorch
|
| 6 |
"""
|
| 7 |
|
| 8 |
import os
|
| 9 |
import numpy as np
|
|
|
|
|
|
|
|
|
|
| 10 |
import gradio as gr
|
| 11 |
+
from typing import Dict
|
| 12 |
import warnings
|
| 13 |
+
import pickle
|
| 14 |
warnings.filterwarnings('ignore')
|
| 15 |
|
| 16 |
+
# Audio processing
|
| 17 |
try:
|
| 18 |
import librosa
|
| 19 |
LIBROSA_AVAILABLE = True
|
| 20 |
except ImportError:
|
| 21 |
LIBROSA_AVAILABLE = False
|
| 22 |
+
print("β οΈ Librosa not available, using scipy")
|
| 23 |
|
|
|
|
| 24 |
from scipy.io import wavfile
|
| 25 |
+
import scipy.signal as signal
|
| 26 |
+
from scipy import fft
|
| 27 |
+
|
| 28 |
+
# Machine Learning
|
| 29 |
+
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
|
| 30 |
+
from sklearn.preprocessing import StandardScaler
|
| 31 |
+
from sklearn.neural_network import MLPClassifier, MLPRegressor
|
| 32 |
|
| 33 |
# ============================================
|
| 34 |
+
# AUDIO PROCESSING
|
| 35 |
# ============================================
|
| 36 |
|
| 37 |
+
class AudioFeatureExtractor:
|
| 38 |
+
"""Extract audio features without heavy dependencies"""
|
| 39 |
|
| 40 |
+
def __init__(self, sr=16000, n_mfcc=20):
|
| 41 |
self.sr = sr
|
| 42 |
self.n_mfcc = n_mfcc
|
| 43 |
|
|
|
|
| 46 |
try:
|
| 47 |
if LIBROSA_AVAILABLE:
|
| 48 |
y, sr = librosa.load(audio_path, sr=self.sr, duration=3)
|
| 49 |
+
return y, sr
|
| 50 |
else:
|
| 51 |
+
# Use scipy
|
| 52 |
sr, y = wavfile.read(audio_path)
|
| 53 |
+
|
| 54 |
+
# Convert to mono
|
| 55 |
if len(y.shape) > 1:
|
| 56 |
+
y = y.mean(axis=1)
|
| 57 |
+
|
| 58 |
+
# Normalize
|
| 59 |
+
y = y.astype(np.float32)
|
| 60 |
+
if np.max(np.abs(y)) > 0:
|
| 61 |
+
y = y / np.max(np.abs(y))
|
| 62 |
|
| 63 |
# Resample if needed
|
| 64 |
if sr != self.sr:
|
| 65 |
num_samples = int(len(y) * self.sr / sr)
|
| 66 |
y = signal.resample(y, num_samples)
|
| 67 |
|
| 68 |
+
# Limit to 3 seconds
|
| 69 |
max_len = 3 * self.sr
|
| 70 |
if len(y) > max_len:
|
| 71 |
y = y[:max_len]
|
| 72 |
+
|
| 73 |
+
return y, self.sr
|
| 74 |
except Exception as e:
|
| 75 |
print(f"Error loading audio: {e}")
|
| 76 |
+
return np.random.randn(self.sr * 3) * 0.1, self.sr
|
| 77 |
|
| 78 |
+
def get_mfcc_simple(self, y):
|
| 79 |
+
"""Simplified MFCC extraction"""
|
| 80 |
+
# Pre-emphasis
|
| 81 |
+
y_emphasized = np.append(y[0], y[1:] - 0.97 * y[:-1])
|
| 82 |
|
| 83 |
+
# Framing
|
| 84 |
+
frame_length = int(0.025 * self.sr)
|
| 85 |
+
frame_step = int(0.01 * self.sr)
|
| 86 |
+
|
| 87 |
+
num_frames = 1 + int((len(y_emphasized) - frame_length) / frame_step)
|
| 88 |
+
frames = np.zeros((num_frames, frame_length))
|
| 89 |
+
|
| 90 |
+
for i in range(num_frames):
|
| 91 |
+
start = i * frame_step
|
| 92 |
+
frames[i] = y_emphasized[start:start + frame_length]
|
| 93 |
+
|
| 94 |
+
# Apply window
|
| 95 |
+
frames *= np.hamming(frame_length)
|
| 96 |
+
|
| 97 |
+
# FFT
|
| 98 |
+
mag_frames = np.absolute(np.fft.rfft(frames, frame_length))
|
| 99 |
+
pow_frames = ((1.0 / frame_length) * (mag_frames ** 2))
|
| 100 |
+
|
| 101 |
+
# Mel filterbank
|
| 102 |
+
nfft = frame_length
|
| 103 |
+
nfilt = 26
|
| 104 |
low_freq_mel = 0
|
| 105 |
+
high_freq_mel = 2595 * np.log10(1 + (self.sr / 2) / 700)
|
| 106 |
+
mel_points = np.linspace(low_freq_mel, high_freq_mel, nfilt + 2)
|
| 107 |
hz_points = 700 * (10**(mel_points / 2595) - 1)
|
| 108 |
+
bin_points = np.floor((nfft + 1) * hz_points / self.sr).astype(int)
|
| 109 |
|
| 110 |
+
fbank = np.zeros((nfilt, int(nfft / 2 + 1)))
|
| 111 |
+
for m in range(1, nfilt + 1):
|
| 112 |
+
f_m_minus = bin_points[m - 1]
|
| 113 |
+
f_m = bin_points[m]
|
| 114 |
+
f_m_plus = bin_points[m + 1]
|
| 115 |
|
| 116 |
for k in range(f_m_minus, f_m):
|
| 117 |
fbank[m - 1, k] = (k - bin_points[m - 1]) / (bin_points[m] - bin_points[m - 1])
|
| 118 |
for k in range(f_m, f_m_plus):
|
| 119 |
fbank[m - 1, k] = (bin_points[m + 1] - k) / (bin_points[m + 1] - bin_points[m])
|
| 120 |
|
| 121 |
+
filter_banks = np.dot(pow_frames, fbank.T)
|
| 122 |
+
filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks)
|
| 123 |
+
filter_banks = 20 * np.log10(filter_banks)
|
| 124 |
+
|
| 125 |
+
# DCT
|
| 126 |
+
mfcc = fft.dct(filter_banks, type=2, axis=1, norm='ortho')[:, :self.n_mfcc]
|
| 127 |
+
|
| 128 |
+
return mfcc.T
|
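# --- Illustrative sketch (not part of this commit) ---
# Rough sanity check for get_mfcc_simple, assuming app.py is importable as `app`
# and a synthetic tone stands in for real speech. With sr=16000, 25 ms frames and
# a 10 ms hop, a 3 s clip gives 1 + (48000 - 400) // 160 = 298 frames, so the
# returned matrix should have shape (n_mfcc, 298).
import numpy as np
from app import AudioFeatureExtractor  # assumed import path

extractor = AudioFeatureExtractor(sr=16000, n_mfcc=20)
t = np.linspace(0, 3, 3 * 16000, endpoint=False)
tone = (0.5 * np.sin(2 * np.pi * 220 * t)).astype(np.float32)  # 220 Hz test tone
mfcc = extractor.get_mfcc_simple(tone)
print(mfcc.shape)  # expected: (20, 298)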
| 129 |
|
| 130 |
def extract_pitch(self, y):
|
| 131 |
+
"""Extract pitch using autocorrelation"""
|
| 132 |
+
pitch_values = []
|
| 133 |
+
frame_length = int(0.03 * self.sr)
|
| 134 |
+
hop_length = int(0.01 * self.sr)
|
| 135 |
+
|
| 136 |
+
for i in range(0, len(y) - frame_length, hop_length):
|
| 137 |
+
frame = y[i:i+frame_length]
|
| 138 |
+
|
| 139 |
+
# Autocorrelation
|
| 140 |
+
corr = np.correlate(frame, frame, mode='full')
|
| 141 |
+
corr = corr[len(corr)//2:]
|
| 142 |
+
|
| 143 |
+
# Find first peak after lag 0
|
| 144 |
+
d = np.diff(corr)
|
| 145 |
+
start = int(self.sr / 400)  # minimum lag, caps the pitch search at 400 Hz
|
| 146 |
+
peak = np.where(d[start:] < 0)[0]
|
| 147 |
+
|
| 148 |
+
if len(peak) > 0:
|
| 149 |
+
peak_idx = peak[0] + start
|
| 150 |
+
if peak_idx > 0:
|
| 151 |
+
freq = self.sr / peak_idx
|
| 152 |
+
if 50 < freq < 400:
|
| 153 |
+
pitch_values.append(freq)
|
| 154 |
+
|
| 155 |
+
return pitch_values if pitch_values else [150.0]
|
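# Worked example (illustrative): for a pure 220 Hz tone sampled at 16 kHz the
# autocorrelation of each 30 ms frame peaks near lag 16000/220 ≈ 73 samples, so the
# detector above reports roughly 16000/73 ≈ 219 Hz, inside the accepted 50-400 Hz band.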
|
|
|
|
|
|
|
|
|
| 156 |
|
| 157 |
def extract_energy(self, y):
|
| 158 |
+
"""Extract RMS energy"""
|
| 159 |
+
frame_length = int(0.025 * self.sr)
|
| 160 |
+
hop_length = int(0.01 * self.sr)
|
| 161 |
+
|
| 162 |
+
rms = []
|
| 163 |
+
for i in range(0, len(y) - frame_length, hop_length):
|
| 164 |
+
frame = y[i:i+frame_length]
|
| 165 |
+
rms.append(np.sqrt(np.mean(frame**2)))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
|
| 167 |
+
return np.array(rms)
|
| 168 |
|
| 169 |
def extract_zcr(self, y):
|
| 170 |
+
"""Zero crossing rate"""
|
| 171 |
+
frame_length = int(0.025 * self.sr)
|
| 172 |
+
hop_length = int(0.01 * self.sr)
|
| 173 |
+
|
| 174 |
+
zcr = []
|
| 175 |
+
for i in range(0, len(y) - frame_length, hop_length):
|
| 176 |
+
frame = y[i:i+frame_length]
|
| 177 |
+
crossings = np.sum(np.abs(np.diff(np.sign(frame)))) / 2
|
| 178 |
+
zcr.append(crossings / frame_length)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
|
| 180 |
+
return np.array(zcr)
|
| 181 |
|
| 182 |
def extract_spectral_features(self, y):
|
| 183 |
+
"""Spectral features"""
|
| 184 |
+
spectrum = np.fft.rfft(y)
|
| 185 |
+
magnitude = np.abs(spectrum)
|
|
|
|
| 186 |
freq = np.fft.rfftfreq(len(y), 1.0/self.sr)
|
| 187 |
|
| 188 |
# Spectral centroid
|
| 189 |
+
centroid = np.sum(freq * magnitude) / (np.sum(magnitude) + 1e-6)
|
| 190 |
|
| 191 |
+
# Spectral rolloff
|
| 192 |
cumsum = np.cumsum(magnitude)
|
| 193 |
rolloff_idx = np.where(cumsum >= 0.85 * cumsum[-1])[0]
|
| 194 |
+
rolloff = freq[rolloff_idx[0]] if len(rolloff_idx) > 0 else 0
|
| 195 |
|
| 196 |
# Spectral bandwidth
|
| 197 |
+
bandwidth = np.sqrt(np.sum(((freq - centroid)**2) * magnitude) / (np.sum(magnitude) + 1e-6))
|
|
|
|
| 198 |
|
| 199 |
+
return centroid, rolloff, bandwidth
|
| 200 |
|
| 201 |
def extract_all_features(self, audio_path):
|
| 202 |
+
"""Extract all features"""
|
| 203 |
try:
|
|
|
|
| 204 |
y, sr = self.load_audio(audio_path)
|
| 205 |
|
| 206 |
+
# MFCCs
|
| 207 |
+
mfcc = self.get_mfcc_simple(y)
|
| 208 |
+
mfcc_mean = np.mean(mfcc, axis=1)
|
| 209 |
+
mfcc_std = np.std(mfcc, axis=1)
|
| 210 |
|
| 211 |
+
# Pitch
|
| 212 |
pitch_values = self.extract_pitch(y)
|
| 213 |
pitch_mean = np.mean(pitch_values)
|
| 214 |
pitch_std = np.std(pitch_values)
|
| 215 |
pitch_min = np.min(pitch_values)
|
| 216 |
pitch_max = np.max(pitch_values)
|
| 217 |
+
monotone_score = 1.0 / (1.0 + pitch_std/10.0)
|
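# Worked example (illustrative): with monotone_score = 1 / (1 + pitch_std / 10),
# a lively voice with pitch_std ≈ 30 Hz scores 1 / 4 = 0.25, while a flat voice with
# pitch_std ≈ 2 Hz scores 1 / 1.2 ≈ 0.83, i.e. a higher score means flatter speech.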
| 218 |
|
| 219 |
+
# Energy
|
| 220 |
rms = self.extract_energy(y)
|
| 221 |
energy_mean = np.mean(rms)
|
| 222 |
energy_std = np.std(rms)
|
| 223 |
energy_max = np.max(rms)
|
| 224 |
|
| 225 |
+
# ZCR
|
| 226 |
zcr = self.extract_zcr(y)
|
| 227 |
zcr_mean = np.mean(zcr)
|
| 228 |
zcr_std = np.std(zcr)
|
| 229 |
|
| 230 |
+
# Spectral
|
| 231 |
+
spec_centroid, spec_rolloff, spec_bandwidth = self.extract_spectral_features(y)
|
|
|
|
| 232 |
|
| 233 |
+
# Tempo estimation
|
| 234 |
+
onset_env = rms
|
| 235 |
+
tempo = 120.0 # Default
|
| 236 |
+
if len(onset_env) > 10:
|
| 237 |
+
autocorr = np.correlate(onset_env, onset_env, mode='full')
|
| 238 |
+
autocorr = autocorr[len(autocorr)//2:]
|
| 239 |
+
peaks = signal.find_peaks(autocorr)[0]
|
| 240 |
+
if len(peaks) > 0 and peaks[0] > 0:
|
| 241 |
+
tempo = 60.0 / (peaks[0] * 0.01)
|
| 242 |
+
tempo = np.clip(tempo, 60, 180)
|
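# Worked example (illustrative): the RMS envelope uses a 10 ms hop, so an
# autocorrelation peak at lag 50 frames corresponds to a 0.5 s beat period and
# 60 / 0.5 = 120 BPM, which the np.clip above leaves unchanged (range 60-180).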
| 243 |
|
| 244 |
# Combine features
|
| 245 |
features = np.concatenate([
|
|
|
|
| 248 |
[pitch_mean, pitch_std, pitch_min, pitch_max, monotone_score],
|
| 249 |
[energy_mean, energy_std, energy_max],
|
| 250 |
[zcr_mean, zcr_std],
|
| 251 |
+
[spec_centroid, spec_rolloff, spec_bandwidth],
|
|
|
|
| 252 |
[tempo]
|
| 253 |
])
|
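# Feature count check (illustrative): 20 MFCC means + 20 MFCC stds + 5 pitch stats
# + 3 energy stats + 2 ZCR stats + 3 spectral values + 1 tempo = 54 values,
# matching the n_features = 54 assumed by the models in _initialize_models below.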
| 254 |
|
| 255 |
+
# Derived scores
|
| 256 |
+
vocal_affect = self._calc_affect(pitch_std, energy_std, spec_centroid)
|
| 257 |
+
vocal_energy = self._calc_energy(energy_mean, tempo, zcr_mean)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 258 |
|
| 259 |
return {
|
| 260 |
'features': features.astype(np.float32),
|
| 261 |
+
'vocal_affect_score': float(vocal_affect),
|
| 262 |
'monotone_score': float(monotone_score),
|
| 263 |
+
'vocal_energy_score': float(vocal_energy),
|
| 264 |
'pitch_variability': float(pitch_std),
|
| 265 |
'energy_level': float(energy_mean)
|
| 266 |
}
|
| 267 |
|
| 268 |
except Exception as e:
|
| 269 |
+
print(f"Error: {e}")
|
| 270 |
+
return self._default_features()
|
|
|
|
| 271 |
|
| 272 |
+
def _calc_affect(self, pitch_std, energy_std, spec_centroid):
|
| 273 |
+
"""Calculate vocal affect score"""
|
| 274 |
+
pitch_comp = min(pitch_std / 50.0, 1.0)
|
| 275 |
+
energy_comp = min(energy_std / 0.3, 1.0)
|
| 276 |
+
spec_comp = min(spec_centroid / 2000.0, 1.0)
|
| 277 |
+
return np.clip(pitch_comp * 0.4 + energy_comp * 0.4 + spec_comp * 0.2, 0, 1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 278 |
|
| 279 |
+
def _calc_energy(self, energy_mean, tempo, zcr_mean):
|
| 280 |
+
"""Calculate vocal energy score"""
|
| 281 |
+
energy_comp = min(energy_mean / 0.5, 1.0)
|
| 282 |
+
tempo_comp = min(tempo / 150.0, 1.0)
|
| 283 |
+
zcr_comp = min(zcr_mean / 0.15, 1.0)
|
| 284 |
+
return np.clip(energy_comp * 0.5 + tempo_comp * 0.3 + zcr_comp * 0.2, 0, 1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 285 |
|
| 286 |
+
def _default_features(self):
|
| 287 |
+
"""Default features for errors"""
|
| 288 |
+
n_features = self.n_mfcc * 2 + 14
|
| 289 |
return {
|
| 290 |
+
'features': np.random.randn(n_features).astype(np.float32) * 0.1,
|
| 291 |
'vocal_affect_score': 0.5,
|
| 292 |
'monotone_score': 0.5,
|
| 293 |
'vocal_energy_score': 0.5,
|
| 294 |
+
'pitch_variability': 30.0,
|
| 295 |
+
'energy_level': 0.3
|
| 296 |
}
|
| 297 |
|
| 298 |
|
| 299 |
# ============================================
|
| 300 |
+
# EMOTION PREDICTOR
|
| 301 |
# ============================================
|
| 302 |
|
| 303 |
class EmotionPredictor:
|
| 304 |
+
"""Lightweight emotion predictor using sklearn"""
|
| 305 |
|
| 306 |
def __init__(self):
|
| 307 |
+
self.extractor = AudioFeatureExtractor(sr=16000, n_mfcc=20)
|
|
|
|
| 308 |
|
| 309 |
# Emotion mapping
|
| 310 |
+
self.emotions = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']
|
| 311 |
|
| 312 |
+
# Initialize models
|
| 313 |
+
self._initialize_models()
|
| 314 |
|
| 315 |
+
def _initialize_models(self):
|
| 316 |
+
"""Initialize pre-trained or demo models"""
|
|
|
|
| 317 |
|
| 318 |
+
# Try to load pre-trained models
|
| 319 |
+
if os.path.exists('emotion_classifier.pkl'):
|
| 320 |
try:
|
| 321 |
+
with open('emotion_classifier.pkl', 'rb') as f:
|
| 322 |
+
self.emotion_model = pickle.load(f)
|
| 323 |
+
with open('affect_model.pkl', 'rb') as f:
|
| 324 |
+
self.affect_model = pickle.load(f)
|
| 325 |
+
with open('monotone_model.pkl', 'rb') as f:
|
| 326 |
+
self.monotone_model = pickle.load(f)
|
| 327 |
+
with open('energy_model.pkl', 'rb') as f:
|
| 328 |
+
self.energy_model = pickle.load(f)
|
| 329 |
+
with open('scaler.pkl', 'rb') as f:
|
| 330 |
+
self.scaler = pickle.load(f)
|
| 331 |
+
print("β
Loaded pre-trained models")
|
| 332 |
+
return
|
| 333 |
+
except Exception:
|
| 334 |
+
pass
|
| 335 |
+
|
| 336 |
+
# Create demo models (for demonstration without training)
|
| 337 |
+
print("βΉοΈ Creating demo models (for demonstration)")
|
| 338 |
+
|
| 339 |
+
n_features = 54 # 20*2 MFCC + 14 other features
|
| 340 |
+
|
| 341 |
+
# Emotion classifier
|
| 342 |
+
self.emotion_model = RandomForestClassifier(
|
| 343 |
+
n_estimators=100,
|
| 344 |
+
max_depth=10,
|
| 345 |
+
random_state=42
|
| 346 |
+
)
|
| 347 |
+
|
| 348 |
+
# Regression models
|
| 349 |
+
self.affect_model = GradientBoostingRegressor(n_estimators=50, random_state=42)
|
| 350 |
+
self.monotone_model = GradientBoostingRegressor(n_estimators=50, random_state=42)
|
| 351 |
+
self.energy_model = GradientBoostingRegressor(n_estimators=50, random_state=42)
|
| 352 |
+
|
| 353 |
+
# Scaler
|
| 354 |
+
self.scaler = StandardScaler()
|
| 355 |
+
|
| 356 |
+
# Fit with dummy data (for demo purposes)
|
| 357 |
+
X_dummy = np.random.randn(100, n_features)
|
| 358 |
+
y_emotion_dummy = np.random.randint(0, 8, 100)
|
| 359 |
+
y_reg_dummy = np.random.rand(100)
|
| 360 |
+
|
| 361 |
+
self.scaler.fit(X_dummy)
|
| 362 |
+
self.emotion_model.fit(X_dummy, y_emotion_dummy)
|
| 363 |
+
self.affect_model.fit(X_dummy, y_reg_dummy)
|
| 364 |
+
self.monotone_model.fit(X_dummy, y_reg_dummy)
|
| 365 |
+
self.energy_model.fit(X_dummy, y_reg_dummy)
|
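# --- Illustrative sketch (not part of this commit) ---
# A possible offline training script that would produce the pickles looked for in
# _initialize_models, assuming you already have a labelled feature matrix X
# (n_samples x 54) built with AudioFeatureExtractor plus emotion/affect/monotone/
# energy targets; the output file names match what the loader expects, everything
# else (input .npy files, targets) is assumed.
import pickle
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler

X = np.load("features.npy")                      # assumed: (n_samples, 54)
y_emotion = np.load("emotion_labels.npy")        # assumed: ints 0-7
targets = {name: np.load(f"{name}_targets.npy")  # assumed: floats in [0, 1]
           for name in ("affect", "monotone", "energy")}

scaler = StandardScaler().fit(X)
Xs = scaler.transform(X)

artifacts = {"scaler.pkl": scaler,
             "emotion_classifier.pkl": RandomForestClassifier(
                 n_estimators=100, max_depth=10, random_state=42).fit(Xs, y_emotion)}
for name, y in targets.items():
    artifacts[f"{name}_model.pkl"] = GradientBoostingRegressor(
        n_estimators=50, random_state=42).fit(Xs, y)

for filename, obj in artifacts.items():
    with open(filename, "wb") as f:
        pickle.dump(obj, f)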
| 366 |
|
| 367 |
+
def predict(self, audio_path):
|
| 368 |
"""Predict emotion and mental health indicators"""
|
| 369 |
|
| 370 |
# Extract features
|
| 371 |
+
feature_dict = self.extractor.extract_all_features(audio_path)
|
| 372 |
+
features = feature_dict['features'].reshape(1, -1)
|
|
|
|
| 373 |
|
| 374 |
+
# Scale features
|
| 375 |
+
features_scaled = self.scaler.transform(features)
|
|
|
|
| 376 |
|
| 377 |
+
# Predict emotion
|
| 378 |
+
emotion_probs = self.emotion_model.predict_proba(features_scaled)[0]
|
| 379 |
+
emotion_idx = np.argmax(emotion_probs)
|
| 380 |
+
emotion = self.emotions[emotion_idx]
|
| 381 |
+
confidence = emotion_probs[emotion_idx]
|
| 382 |
|
| 383 |
+
# Predict regression outputs
|
| 384 |
+
vocal_affect = np.clip(self.affect_model.predict(features_scaled)[0], 0, 1)
|
| 385 |
+
monotone_score = np.clip(self.monotone_model.predict(features_scaled)[0], 0, 1)
|
| 386 |
+
vocal_energy = np.clip(self.energy_model.predict(features_scaled)[0], 0, 1)
|
| 387 |
+
|
| 388 |
+
# Adjust with extracted features for better estimates
|
| 389 |
+
vocal_affect = (vocal_affect + feature_dict['vocal_affect_score']) / 2
|
| 390 |
+
monotone_score = (monotone_score + feature_dict['monotone_score']) / 2
|
| 391 |
+
vocal_energy = (vocal_energy + feature_dict['vocal_energy_score']) / 2
|
| 392 |
|
| 393 |
# Mental health interpretation
|
| 394 |
+
indicators = self._interpret_mental_health(monotone_score, vocal_affect, vocal_energy)
|
|
|
|
|
|
|
| 395 |
|
| 396 |
+
return {
|
| 397 |
'emotion': emotion,
|
| 398 |
'confidence': confidence,
|
| 399 |
'emotion_probabilities': {
|
| 400 |
+
self.emotions[i]: prob for i, prob in enumerate(emotion_probs)
|
|
|
|
| 401 |
},
|
| 402 |
'vocal_affect_score': vocal_affect,
|
| 403 |
'monotone_speech_score': monotone_score,
|
| 404 |
'vocal_energy_score': vocal_energy,
|
| 405 |
'pitch_variability': feature_dict['pitch_variability'],
|
| 406 |
'energy_level': feature_dict['energy_level'],
|
| 407 |
+
'mental_health_indicators': indicators
|
| 408 |
}
|
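# --- Illustrative usage sketch (not part of this commit) ---
# Assuming app.py is importable and a local speech clip sample.wav exists:
from app import EmotionPredictor

predictor = EmotionPredictor()            # falls back to demo models without .pkl files
results = predictor.predict("sample.wav")
print(results["emotion"], f"{results['confidence']:.2f}")
for line in results["mental_health_indicators"]:
    print(line)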
|
|
|
|
|
|
| 409 |
|
| 410 |
def _interpret_mental_health(self, monotone, affect, energy):
|
| 411 |
"""Interpret mental health indicators"""
|
|
|
|
| 423 |
if affect > 0.6 and monotone < 0.4:
|
| 424 |
indicators.append("β οΈ High vocal affect - possible emotional stress")
|
| 425 |
|
| 426 |
+
if 0.35 <= monotone <= 0.65 and 0.35 <= affect <= 0.65 and 0.35 <= energy <= 0.65:
|
| 427 |
+
indicators.append("β
Balanced vocal characteristics")
|
| 428 |
|
| 429 |
if not indicators:
|
| 430 |
indicators.append("βΉοΈ Vocal patterns within normal range")
|
|
|
|
| 436 |
# GRADIO INTERFACE
|
| 437 |
# ============================================
|
| 438 |
|
| 439 |
+
def create_app():
|
| 440 |
+
"""Create Gradio app"""
|
| 441 |
|
|
|
|
|
|
|
| 442 |
predictor = EmotionPredictor()
|
|
|
|
| 443 |
|
| 444 |
+
def analyze_audio(audio):
|
| 445 |
+
"""Analysis function"""
|
| 446 |
if audio is None:
|
| 447 |
+
return "β Please upload an audio file", "", "", "", "", ""
|
| 448 |
|
| 449 |
try:
|
|
|
|
| 450 |
results = predictor.predict(audio)
|
| 451 |
|
| 452 |
# Format emotion output
|
| 453 |
+
emotion_text = f"## π **{results['emotion'].upper()}**\n\n"
|
| 454 |
emotion_text += f"**Confidence:** {results['confidence']*100:.1f}%\n\n"
|
| 455 |
+
emotion_text += "### Probability Distribution:\n"
|
| 456 |
|
| 457 |
for emotion, prob in sorted(results['emotion_probabilities'].items(),
|
| 458 |
key=lambda x: x[1], reverse=True):
|
| 459 |
+
bar = "β" * int(prob * 20) + "β" * (20 - int(prob * 20))
|
| 460 |
+
emotion_text += f"**{emotion.title()}:** {bar} {prob*100:.1f}%\n"
|
|
|
|
| 461 |
|
| 462 |
# Format scores
|
| 463 |
+
affect = f"**Score:** {results['vocal_affect_score']:.3f}\n\n"
|
| 464 |
if results['vocal_affect_score'] > 0.7:
|
| 465 |
+
affect += "π΄ High intensity"
|
| 466 |
elif results['vocal_affect_score'] < 0.3:
|
| 467 |
+
affect += "π’ Low intensity"
|
| 468 |
else:
|
| 469 |
+
affect += "π‘ Moderate"
|
| 470 |
|
| 471 |
+
monotone = f"**Score:** {results['monotone_speech_score']:.3f}\n\n"
|
| 472 |
if results['monotone_speech_score'] > 0.7:
|
| 473 |
+
monotone += "π΄ Very flat speech"
|
| 474 |
elif results['monotone_speech_score'] < 0.3:
|
| 475 |
+
monotone += "π’ Varied pitch"
|
| 476 |
else:
|
| 477 |
+
monotone += "π‘ Moderate variation"
|
| 478 |
|
| 479 |
+
energy = f"**Score:** {results['vocal_energy_score']:.3f}\n\n"
|
| 480 |
if results['vocal_energy_score'] > 0.7:
|
| 481 |
+
energy += "π΄ High energy"
|
| 482 |
elif results['vocal_energy_score'] < 0.3:
|
| 483 |
+
energy += "π΄ Low energy"
|
| 484 |
else:
|
| 485 |
+
energy += "π’ Normal energy"
|
| 486 |
|
| 487 |
+
details = f"**Pitch Variability:** {results['pitch_variability']:.2f} Hz\n"
|
| 488 |
+
details += f"**Energy Level:** {results['energy_level']:.3f}"
|
| 489 |
|
| 490 |
+
mental = "\n".join(results['mental_health_indicators'])
|
| 491 |
|
| 492 |
+
return emotion_text, affect, monotone, energy, details, mental
|
| 493 |
|
| 494 |
except Exception as e:
|
| 495 |
+
return f"β Error: {str(e)}", "", "", "", "", ""
|
| 496 |
|
| 497 |
# Create interface
|
| 498 |
+
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
|
|
| 499 |
gr.Markdown("""
|
| 500 |
# 🎙️ Audio Emotion & Mental Health Detection
|
| 501 |
|
| 502 |
+
Analyze emotional state and mental health indicators from speech audio.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 503 |
""")
|
| 504 |
|
| 505 |
with gr.Row():
|
| 506 |
+
with gr.Column():
|
| 507 |
+
audio = gr.Audio(type="filepath", label="Upload Audio")
|
| 508 |
+
btn = gr.Button("π Analyze", variant="primary", size="lg")
|
| 509 |
|
| 510 |
+
with gr.Column():
|
| 511 |
+
emotion_out = gr.Markdown()
|
| 512 |
|
| 513 |
with gr.Row():
|
| 514 |
+
affect_out = gr.Markdown()
|
| 515 |
+
monotone_out = gr.Markdown()
|
| 516 |
+
energy_out = gr.Markdown()
|
|
|
|
|
|
|
|
|
|
| 517 |
|
| 518 |
+
details_out = gr.Markdown()
|
| 519 |
+
mental_out = gr.Markdown()
|
| 520 |
|
| 521 |
gr.Markdown("""
|
| 522 |
+
### 📊 Interpretation
|
| 523 |
+
|
| 524 |
+
- **Vocal Affect:** Emotional intensity (0=calm, 1=intense)
|
| 525 |
+
- **Monotone Score:** Pitch flatness (high=depression risk)
|
| 526 |
+
- **Vocal Energy:** Speaking energy (low=low motivation)
|
| 527 |
+
|
| 528 |
+
⚠️ **Disclaimer:** For research only, not medical diagnosis.
|
| 529 |
""")
|
| 530 |
|
| 531 |
+
btn.click(
|
| 532 |
+
analyze_audio,
|
| 533 |
+
inputs=audio,
|
| 534 |
+
outputs=[emotion_out, affect_out, monotone_out, energy_out, details_out, mental_out]
|
|
|
|
|
|
|
| 535 |
)
|
| 536 |
|
| 537 |
return demo
|
| 538 |
|
| 539 |
|
| 540 |
# ============================================
|
| 541 |
+
# MAIN
|
| 542 |
# ============================================
|
| 543 |
|
| 544 |
if __name__ == "__main__":
|
| 545 |
+
app = create_app()
|
| 546 |
+
app.launch()
|
|
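# --- Illustrative note (not part of this commit) ---
# A requirements.txt consistent with the imports above might list (unpinned,
# versions are an assumption): numpy, scipy, scikit-learn, gradio, and optionally
# librosa; the scipy fallback path above is used when librosa is absent.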