akku09090 commited on
Commit
a292e53
Β·
verified Β·
1 Parent(s): b3e17d3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +437 -395
app.py CHANGED
@@ -1,69 +1,121 @@
1
  #!/usr/bin/env python3
2
  """
3
- Audio Emotion & Mental Health Detection Model
4
- Lightweight version for Hugging Face Spaces
5
- Using scikit-learn instead of PyTorch
6
  """
7
 
 
8
  import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  import numpy as np
10
  import gradio as gr
11
- from typing import Dict
12
  import warnings
13
- import pickle
14
  warnings.filterwarnings('ignore')
15
 
16
- # Audio processing
 
 
 
 
 
 
 
 
17
  try:
18
  import librosa
19
  LIBROSA_AVAILABLE = True
20
  except ImportError:
21
  LIBROSA_AVAILABLE = False
22
- print("⚠️ Librosa not available, using scipy")
23
 
24
- from scipy.io import wavfile
25
- import scipy.signal as signal
26
- from scipy import fft
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
- # Machine Learning
29
- from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
30
- from sklearn.preprocessing import StandardScaler
31
- from sklearn.neural_network import MLPClassifier, MLPRegressor
32
 
33
  # ============================================
34
- # AUDIO PROCESSING
35
  # ============================================
36
 
37
- class AudioFeatureExtractor:
38
- """Extract audio features without heavy dependencies"""
39
 
40
- def __init__(self, sr=16000, n_mfcc=20):
41
  self.sr = sr
42
- self.n_mfcc = n_mfcc
43
 
44
- def load_audio(self, audio_path):
45
- """Load audio file"""
46
- try:
47
- if LIBROSA_AVAILABLE:
 
 
48
  y, sr = librosa.load(audio_path, sr=self.sr, duration=3)
49
  return y, sr
50
- else:
51
- # Use scipy
52
- sr, y = wavfile.read(audio_path)
53
-
54
- # Convert to mono
 
 
55
  if len(y.shape) > 1:
56
  y = y.mean(axis=1)
57
 
58
- # Normalize
59
- y = y.astype(np.float32)
60
- if np.max(np.abs(y)) > 0:
61
- y = y / np.max(np.abs(y))
62
-
63
  # Resample if needed
64
  if sr != self.sr:
65
- num_samples = int(len(y) * self.sr / sr)
66
- y = signal.resample(y, num_samples)
 
 
 
 
 
 
 
 
67
 
68
  # Limit to 3 seconds
69
  max_len = 3 * self.sr
@@ -71,339 +123,211 @@ class AudioFeatureExtractor:
71
  y = y[:max_len]
72
 
73
  return y, self.sr
74
- except Exception as e:
75
- print(f"Error loading audio: {e}")
76
- return np.random.randn(self.sr * 3) * 0.1, self.sr
77
-
78
- def get_mfcc_simple(self, y):
79
- """Simplified MFCC extraction"""
80
- # Pre-emphasis
81
- y_emphasized = np.append(y[0], y[1:] - 0.97 * y[:-1])
82
-
83
- # Framing
84
- frame_length = int(0.025 * self.sr)
85
- frame_step = int(0.01 * self.sr)
86
-
87
- num_frames = 1 + int((len(y_emphasized) - frame_length) / frame_step)
88
- frames = np.zeros((num_frames, frame_length))
89
-
90
- for i in range(num_frames):
91
- start = i * frame_step
92
- frames[i] = y_emphasized[start:start + frame_length]
93
-
94
- # Apply window
95
- frames *= np.hamming(frame_length)
96
-
97
- # FFT
98
- mag_frames = np.absolute(np.fft.rfft(frames, frame_length))
99
- pow_frames = ((1.0 / frame_length) * (mag_frames ** 2))
100
-
101
- # Mel filterbank
102
- nfft = frame_length
103
- nfilt = 26
104
- low_freq_mel = 0
105
- high_freq_mel = 2595 * np.log10(1 + (self.sr / 2) / 700)
106
- mel_points = np.linspace(low_freq_mel, high_freq_mel, nfilt + 2)
107
- hz_points = 700 * (10**(mel_points / 2595) - 1)
108
- bin_points = np.floor((nfft + 1) * hz_points / self.sr).astype(int)
109
-
110
- fbank = np.zeros((nfilt, int(nfft / 2 + 1)))
111
- for m in range(1, nfilt + 1):
112
- f_m_minus = bin_points[m - 1]
113
- f_m = bin_points[m]
114
- f_m_plus = bin_points[m + 1]
115
-
116
- for k in range(f_m_minus, f_m):
117
- fbank[m - 1, k] = (k - bin_points[m - 1]) / (bin_points[m] - bin_points[m - 1])
118
- for k in range(f_m, f_m_plus):
119
- fbank[m - 1, k] = (bin_points[m + 1] - k) / (bin_points[m + 1] - bin_points[m])
120
-
121
- filter_banks = np.dot(pow_frames, fbank.T)
122
- filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks)
123
- filter_banks = 20 * np.log10(filter_banks)
124
 
125
- # DCT
126
- mfcc = fft.dct(filter_banks, type=2, axis=1, norm='ortho')[:, :self.n_mfcc]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
 
128
- return mfcc.T
 
 
129
 
130
- def extract_pitch(self, y):
131
- """Extract pitch using autocorrelation"""
132
- pitch_values = []
133
- frame_length = int(0.03 * self.sr)
134
- hop_length = int(0.01 * self.sr)
135
 
136
- for i in range(0, len(y) - frame_length, hop_length):
137
- frame = y[i:i+frame_length]
138
-
139
- # Autocorrelation
140
- corr = np.correlate(frame, frame, mode='full')
141
- corr = corr[len(corr)//2:]
142
-
143
- # Find first peak after lag 0
144
- d = np.diff(corr)
145
- start = int(self.sr / 400) # Min 400 Hz
146
- peak = np.where(d[start:] < 0)[0]
147
-
148
- if len(peak) > 0:
149
- peak_idx = peak[0] + start
150
- if peak_idx > 0:
151
- freq = self.sr / peak_idx
152
- if 50 < freq < 400:
153
- pitch_values.append(freq)
154
-
155
- return pitch_values if pitch_values else [150.0]
156
-
157
- def extract_energy(self, y):
158
- """Extract RMS energy"""
159
- frame_length = int(0.025 * self.sr)
160
- hop_length = int(0.01 * self.sr)
161
 
162
- rms = []
163
- for i in range(0, len(y) - frame_length, hop_length):
164
- frame = y[i:i+frame_length]
165
- rms.append(np.sqrt(np.mean(frame**2)))
166
 
167
- return np.array(rms)
168
-
169
- def extract_zcr(self, y):
170
- """Zero crossing rate"""
171
- frame_length = int(0.025 * self.sr)
172
- hop_length = int(0.01 * self.sr)
173
-
174
- zcr = []
175
- for i in range(0, len(y) - frame_length, hop_length):
176
- frame = y[i:i+frame_length]
177
- crossings = np.sum(np.abs(np.diff(np.sign(frame)))) / 2
178
- zcr.append(crossings / frame_length)
179
-
180
- return np.array(zcr)
181
-
182
- def extract_spectral_features(self, y):
183
- """Spectral features"""
184
- spectrum = np.fft.rfft(y)
185
- magnitude = np.abs(spectrum)
186
- freq = np.fft.rfftfreq(len(y), 1.0/self.sr)
187
 
188
  # Spectral centroid
189
- centroid = np.sum(freq * magnitude) / (np.sum(magnitude) + 1e-6)
190
 
191
  # Spectral rolloff
192
- cumsum = np.cumsum(magnitude)
193
  rolloff_idx = np.where(cumsum >= 0.85 * cumsum[-1])[0]
194
- rolloff = freq[rolloff_idx[0]] if len(rolloff_idx) > 0 else 0
195
-
196
- # Spectral bandwidth
197
- bandwidth = np.sqrt(np.sum(((freq - centroid)**2) * magnitude) / (np.sum(magnitude) + 1e-6))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
 
199
- return centroid, rolloff, bandwidth
200
-
201
- def extract_all_features(self, audio_path):
202
- """Extract all features"""
203
- try:
204
- y, sr = self.load_audio(audio_path)
205
-
206
- # MFCCs
207
- mfcc = self.get_mfcc_simple(y)
208
- mfcc_mean = np.mean(mfcc, axis=1)
209
- mfcc_std = np.std(mfcc, axis=1)
210
 
211
- # Pitch
212
- pitch_values = self.extract_pitch(y)
213
- pitch_mean = np.mean(pitch_values)
 
 
 
 
 
 
214
  pitch_std = np.std(pitch_values)
215
- pitch_min = np.min(pitch_values)
216
- pitch_max = np.max(pitch_values)
217
- monotone_score = 1.0 / (1.0 + pitch_std/10.0)
218
-
219
- # Energy
220
- rms = self.extract_energy(y)
221
- energy_mean = np.mean(rms)
222
- energy_std = np.std(rms)
223
- energy_max = np.max(rms)
224
-
225
- # ZCR
226
- zcr = self.extract_zcr(y)
227
- zcr_mean = np.mean(zcr)
228
- zcr_std = np.std(zcr)
229
-
230
- # Spectral
231
- spec_centroid, spec_rolloff, spec_bandwidth = self.extract_spectral_features(y)
232
-
233
- # Tempo estimation
234
- onset_env = rms
235
- tempo = 120.0 # Default
236
- if len(onset_env) > 10:
237
- autocorr = np.correlate(onset_env, onset_env, mode='full')
238
- autocorr = autocorr[len(autocorr)//2:]
239
- peaks = signal.find_peaks(autocorr)[0]
240
- if len(peaks) > 0 and peaks[0] > 0:
241
- tempo = 60.0 / (peaks[0] * 0.01)
242
- tempo = np.clip(tempo, 60, 180)
243
-
244
- # Combine features
245
- features = np.concatenate([
246
- mfcc_mean,
247
- mfcc_std,
248
- [pitch_mean, pitch_std, pitch_min, pitch_max, monotone_score],
249
- [energy_mean, energy_std, energy_max],
250
- [zcr_mean, zcr_std],
251
- [spec_centroid, spec_rolloff, spec_bandwidth],
252
- [tempo]
253
- ])
254
-
255
- # Derived scores
256
- vocal_affect = self._calc_affect(pitch_std, energy_std, spec_centroid)
257
- vocal_energy = self._calc_energy(energy_mean, tempo, zcr_mean)
258
-
259
- return {
260
- 'features': features.astype(np.float32),
261
- 'vocal_affect_score': float(vocal_affect),
262
- 'monotone_score': float(monotone_score),
263
- 'vocal_energy_score': float(vocal_energy),
264
- 'pitch_variability': float(pitch_std),
265
- 'energy_level': float(energy_mean)
266
- }
267
-
268
- except Exception as e:
269
- print(f"Error: {e}")
270
- return self._default_features()
271
-
272
- def _calc_affect(self, pitch_std, energy_std, spec_centroid):
273
- """Calculate vocal affect score"""
274
- pitch_comp = min(pitch_std / 50.0, 1.0)
275
- energy_comp = min(energy_std / 0.3, 1.0)
276
- spec_comp = min(spec_centroid / 2000.0, 1.0)
277
- return np.clip(pitch_comp * 0.4 + energy_comp * 0.4 + spec_comp * 0.2, 0, 1)
278
-
279
- def _calc_energy(self, energy_mean, tempo, zcr_mean):
280
- """Calculate vocal energy score"""
281
- energy_comp = min(energy_mean / 0.5, 1.0)
282
- tempo_comp = min(tempo / 150.0, 1.0)
283
- zcr_comp = min(zcr_mean / 0.15, 1.0)
284
- return np.clip(energy_comp * 0.5 + tempo_comp * 0.3 + zcr_comp * 0.2, 0, 1)
285
-
286
- def _default_features(self):
287
- """Default features for errors"""
288
- n_features = self.n_mfcc * 2 + 14
289
  return {
290
- 'features': np.random.randn(n_features).astype(np.float32) * 0.1,
291
- 'vocal_affect_score': 0.5,
292
- 'monotone_score': 0.5,
293
- 'vocal_energy_score': 0.5,
294
- 'pitch_variability': 30.0,
295
- 'energy_level': 0.3
296
  }
297
 
298
 
299
  # ============================================
300
- # EMOTION PREDICTOR
301
  # ============================================
302
 
303
- class EmotionPredictor:
304
- """Lightweight emotion predictor using sklearn"""
305
 
306
  def __init__(self):
307
- self.extractor = AudioFeatureExtractor(sr=16000, n_mfcc=20)
308
-
309
- # Emotion mapping
310
  self.emotions = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']
311
-
312
- # Initialize models
313
- self._initialize_models()
314
 
315
- def _initialize_models(self):
316
- """Initialize pre-trained or demo models"""
317
 
318
- # Try to load pre-trained models
319
- if os.path.exists('emotion_classifier.pkl'):
320
- try:
321
- with open('emotion_classifier.pkl', 'rb') as f:
322
- self.emotion_model = pickle.load(f)
323
- with open('affect_model.pkl', 'rb') as f:
324
- self.affect_model = pickle.load(f)
325
- with open('monotone_model.pkl', 'rb') as f:
326
- self.monotone_model = pickle.load(f)
327
- with open('energy_model.pkl', 'rb') as f:
328
- self.energy_model = pickle.load(f)
329
- with open('scaler.pkl', 'rb') as f:
330
- self.scaler = pickle.load(f)
331
- print("βœ… Loaded pre-trained models")
332
- return
333
- except:
334
- pass
335
 
336
- # Create demo models (for demonstration without training)
337
- print("ℹ️ Creating demo models (for demonstration)")
 
 
 
 
338
 
339
- n_features = 54 # 20*2 MFCC + 14 other features
 
340
 
341
- # Emotion classifier
342
- self.emotion_model = RandomForestClassifier(
343
- n_estimators=100,
344
- max_depth=10,
345
- random_state=42
346
- )
347
 
348
- # Regression models
349
- self.affect_model = GradientBoostingRegressor(n_estimators=50, random_state=42)
350
- self.monotone_model = GradientBoostingRegressor(n_estimators=50, random_state=42)
351
- self.energy_model = GradientBoostingRegressor(n_estimators=50, random_state=42)
352
 
353
- # Scaler
354
- self.scaler = StandardScaler()
355
 
356
- # Fit with dummy data (for demo purposes)
357
- X_dummy = np.random.randn(100, n_features)
358
- y_emotion_dummy = np.random.randint(0, 8, 100)
359
- y_reg_dummy = np.random.rand(100)
360
 
361
- self.scaler.fit(X_dummy)
362
- self.emotion_model.fit(X_dummy, y_emotion_dummy)
363
- self.affect_model.fit(X_dummy, y_reg_dummy)
364
- self.monotone_model.fit(X_dummy, y_reg_dummy)
365
- self.energy_model.fit(X_dummy, y_reg_dummy)
366
-
367
- def predict(self, audio_path):
368
- """Predict emotion and mental health indicators"""
369
 
370
- # Extract features
371
- feature_dict = self.extractor.extract_all_features(audio_path)
372
- features = feature_dict['features'].reshape(1, -1)
373
 
374
- # Scale features
375
- features_scaled = self.scaler.transform(features)
376
 
377
- # Predict emotion
378
- emotion_probs = self.emotion_model.predict_proba(features_scaled)[0]
379
- emotion_idx = np.argmax(emotion_probs)
380
- emotion = self.emotions[emotion_idx]
381
- confidence = emotion_probs[emotion_idx]
382
 
383
- # Predict regression outputs
384
- vocal_affect = np.clip(self.affect_model.predict(features_scaled)[0], 0, 1)
385
- monotone_score = np.clip(self.monotone_model.predict(features_scaled)[0], 0, 1)
386
- vocal_energy = np.clip(self.energy_model.predict(features_scaled)[0], 0, 1)
387
 
388
- # Adjust with extracted features for better estimates
389
- vocal_affect = (vocal_affect + feature_dict['vocal_affect_score']) / 2
390
- monotone_score = (monotone_score + feature_dict['monotone_score']) / 2
391
- vocal_energy = (vocal_energy + feature_dict['vocal_energy_score']) / 2
392
 
393
- # Mental health interpretation
394
- indicators = self._interpret_mental_health(monotone_score, vocal_affect, vocal_energy)
 
 
 
 
 
395
 
396
  return {
397
  'emotion': emotion,
398
  'confidence': confidence,
399
  'emotion_probabilities': {
400
- self.emotions[i]: prob for i, prob in enumerate(emotion_probs)
401
  },
402
- 'vocal_affect_score': vocal_affect,
403
- 'monotone_speech_score': monotone_score,
404
  'vocal_energy_score': vocal_energy,
405
- 'pitch_variability': feature_dict['pitch_variability'],
406
- 'energy_level': feature_dict['energy_level'],
407
  'mental_health_indicators': indicators
408
  }
409
 
@@ -411,23 +335,29 @@ class EmotionPredictor:
411
  """Interpret mental health indicators"""
412
  indicators = []
413
 
414
- if monotone > 0.7:
415
- indicators.append("⚠️ High monotone score - possible depression indicator")
 
 
416
 
417
- if affect > 0.7 and energy > 0.7:
418
- indicators.append("⚠️ High vocal affect and energy - possible anxiety/stress")
 
 
419
 
420
- if energy < 0.3:
421
- indicators.append("⚠️ Low vocal energy - possible low motivation/depression")
 
 
422
 
423
- if affect > 0.6 and monotone < 0.4:
424
- indicators.append("⚠️ High vocal affect - possible emotional stress")
425
 
426
- if 0.35 <= monotone <= 0.65 and 0.35 <= affect <= 0.65 and 0.35 <= energy <= 0.65:
427
- indicators.append("βœ… Balanced vocal characteristics")
428
 
429
  if not indicators:
430
- indicators.append("ℹ️ Vocal patterns within normal range")
431
 
432
  return indicators
433
 
@@ -436,105 +366,203 @@ class EmotionPredictor:
436
  # GRADIO INTERFACE
437
  # ============================================
438
 
439
- def create_app():
440
- """Create Gradio app"""
441
 
442
- predictor = EmotionPredictor()
 
 
443
 
444
- def analyze_audio(audio):
445
- """Analysis function"""
446
- if audio is None:
447
- return "❌ Please upload an audio file", "", "", "", "", ""
 
 
 
 
448
 
449
  try:
450
- results = predictor.predict(audio)
 
451
 
452
- # Format emotion output
453
- emotion_text = f"## 🎭 **{results['emotion'].upper()}**\n\n"
454
  emotion_text += f"**Confidence:** {results['confidence']*100:.1f}%\n\n"
455
- emotion_text += "### Probability Distribution:\n"
456
 
457
  for emotion, prob in sorted(results['emotion_probabilities'].items(),
458
  key=lambda x: x[1], reverse=True):
459
- bar = "β–ˆ" * int(prob * 20) + "β–‘" * (20 - int(prob * 20))
460
- emotion_text += f"**{emotion.title()}:** {bar} {prob*100:.1f}%\n"
 
461
 
462
- # Format scores
463
- affect = f"**Score:** {results['vocal_affect_score']:.3f}\n\n"
464
- if results['vocal_affect_score'] > 0.7:
465
- affect += "πŸ”΄ High intensity"
466
- elif results['vocal_affect_score'] < 0.3:
467
- affect += "🟒 Low intensity"
 
 
 
468
  else:
469
- affect += "🟑 Moderate"
 
470
 
471
- monotone = f"**Score:** {results['monotone_speech_score']:.3f}\n\n"
472
- if results['monotone_speech_score'] > 0.7:
473
- monotone += "πŸ”΄ Very flat speech"
474
- elif results['monotone_speech_score'] < 0.3:
475
- monotone += "🟒 Varied pitch"
 
 
 
 
476
  else:
477
- monotone += "🟑 Moderate variation"
 
478
 
479
- energy = f"**Score:** {results['vocal_energy_score']:.3f}\n\n"
480
- if results['vocal_energy_score'] > 0.7:
481
- energy += "πŸ”΄ High energy"
482
- elif results['vocal_energy_score'] < 0.3:
483
- energy += "πŸ”΄ Low energy"
 
 
 
 
484
  else:
485
- energy += "🟒 Normal energy"
 
486
 
487
- details = f"**Pitch Variability:** {results['pitch_variability']:.2f} Hz\n"
488
- details += f"**Energy Level:** {results['energy_level']:.3f}"
 
 
489
 
490
- mental = "\n".join(results['mental_health_indicators'])
 
 
491
 
492
- return emotion_text, affect, monotone, energy, details, mental
 
 
 
 
 
 
 
493
 
494
  except Exception as e:
495
- return f"❌ Error: {str(e)}", "", "", "", "", ""
 
496
 
497
- # Create interface
498
- with gr.Blocks(theme=gr.themes.Soft()) as demo:
 
499
  gr.Markdown("""
500
  # πŸŽ™οΈ Audio Emotion & Mental Health Detection
501
 
502
- Analyze emotional state and mental health indicators from speech audio.
 
 
503
  """)
504
 
505
  with gr.Row():
506
- with gr.Column():
507
- audio = gr.Audio(type="filepath", label="Upload Audio")
508
- btn = gr.Button("πŸ” Analyze", variant="primary", size="lg")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
509
 
510
- with gr.Column():
511
- emotion_out = gr.Markdown()
512
 
513
  with gr.Row():
514
- affect_out = gr.Markdown()
515
- monotone_out = gr.Markdown()
516
- energy_out = gr.Markdown()
517
 
518
- details_out = gr.Markdown()
519
- mental_out = gr.Markdown()
520
 
521
  gr.Markdown("""
522
- ### πŸ“Š Interpretation
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
523
 
524
- - **Vocal Affect:** Emotional intensity (0=calm, 1=intense)
525
- - **Monotone Score:** Pitch flatness (high=depression risk)
526
- - **Vocal Energy:** Speaking energy (low=low motivation)
527
 
528
- ⚠️ **Disclaimer:** For research only, not medical diagnosis.
 
 
 
529
  """)
530
 
531
- btn.click(
532
- analyze_audio,
533
- inputs=audio,
534
- outputs=[emotion_out, affect_out, monotone_out, energy_out, details_out, mental_out]
 
 
 
 
 
 
 
 
535
  )
 
 
 
 
 
 
 
 
 
536
 
537
- return demo
538
 
539
 
540
  # ============================================
@@ -542,5 +570,19 @@ def create_app():
542
  # ============================================
543
 
544
  if __name__ == "__main__":
545
- app = create_app()
546
- app.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  #!/usr/bin/env python3
2
  """
3
+ Audio Emotion & Mental Health Detection
4
+ Robust version with proper dependency handling
 
5
  """
6
 
7
+ import sys
8
  import os
9
+
10
+ # Check and install dependencies if needed
11
+ def check_dependencies():
12
+ """Verify all dependencies are available"""
13
+ required = {
14
+ 'numpy': 'numpy',
15
+ 'scipy': 'scipy',
16
+ 'sklearn': 'scikit-learn',
17
+ 'gradio': 'gradio',
18
+ 'soundfile': 'soundfile'
19
+ }
20
+
21
+ missing = []
22
+ for module, package in required.items():
23
+ try:
24
+ __import__(module)
25
+ except ImportError:
26
+ missing.append(package)
27
+
28
+ if missing:
29
+ print(f"Installing missing packages: {', '.join(missing)}")
30
+ import subprocess
31
+ subprocess.check_call([sys.executable, "-m", "pip", "install"] + missing)
32
+
33
+ # Run check
34
+ try:
35
+ check_dependencies()
36
+ except Exception as e:
37
+ print(f"Dependency check warning: {e}")
38
+
39
+ # Now import everything
40
  import numpy as np
41
  import gradio as gr
42
+ from typing import Dict, List
43
  import warnings
 
44
  warnings.filterwarnings('ignore')
45
 
46
+ # Audio processing imports
47
+ try:
48
+ from scipy.io import wavfile
49
+ from scipy import signal, fft
50
+ SCIPY_AVAILABLE = True
51
+ except ImportError:
52
+ SCIPY_AVAILABLE = False
53
+ print("⚠️ Scipy not available")
54
+
55
  try:
56
  import librosa
57
  LIBROSA_AVAILABLE = True
58
  except ImportError:
59
  LIBROSA_AVAILABLE = False
60
+ print("⚠️ Librosa not available")
61
 
62
+ try:
63
+ import soundfile as sf
64
+ SOUNDFILE_AVAILABLE = True
65
+ except ImportError:
66
+ SOUNDFILE_AVAILABLE = False
67
+ print("⚠️ Soundfile not available")
68
+
69
+ # ML imports
70
+ try:
71
+ from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
72
+ from sklearn.preprocessing import StandardScaler
73
+ SKLEARN_AVAILABLE = True
74
+ except ImportError:
75
+ SKLEARN_AVAILABLE = False
76
+ print("⚠️ Scikit-learn not available")
77
 
 
 
 
 
78
 
79
  # ============================================
80
+ # MINIMAL AUDIO PROCESSOR (Pure NumPy)
81
  # ============================================
82
 
83
+ class MinimalAudioProcessor:
84
+ """Pure NumPy audio processor - no external dependencies"""
85
 
86
+ def __init__(self, sr=16000):
87
  self.sr = sr
 
88
 
89
+ def load_audio_numpy(self, audio_path):
90
+ """Load audio using available library"""
91
+
92
+ # Try librosa first
93
+ if LIBROSA_AVAILABLE:
94
+ try:
95
  y, sr = librosa.load(audio_path, sr=self.sr, duration=3)
96
  return y, sr
97
+ except:
98
+ pass
99
+
100
+ # Try soundfile
101
+ if SOUNDFILE_AVAILABLE:
102
+ try:
103
+ y, sr = sf.read(audio_path)
104
  if len(y.shape) > 1:
105
  y = y.mean(axis=1)
106
 
 
 
 
 
 
107
  # Resample if needed
108
  if sr != self.sr:
109
+ ratio = self.sr / sr
110
+ new_length = int(len(y) * ratio)
111
+ y = np.interp(
112
+ np.linspace(0, len(y), new_length),
113
+ np.arange(len(y)),
114
+ y
115
+ )
116
+
117
+ # Normalize
118
+ y = y / (np.max(np.abs(y)) + 1e-8)
119
 
120
  # Limit to 3 seconds
121
  max_len = 3 * self.sr
 
123
  y = y[:max_len]
124
 
125
  return y, self.sr
126
+ except:
127
+ pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
 
129
+ # Try scipy
130
+ if SCIPY_AVAILABLE:
131
+ try:
132
+ sr, y = wavfile.read(audio_path)
133
+ if len(y.shape) > 1:
134
+ y = y.mean(axis=1)
135
+ y = y.astype(np.float32) / (np.max(np.abs(y)) + 1e-8)
136
+
137
+ if sr != self.sr:
138
+ ratio = self.sr / sr
139
+ new_length = int(len(y) * ratio)
140
+ y = np.interp(
141
+ np.linspace(0, len(y), new_length),
142
+ np.arange(len(y)),
143
+ y
144
+ )
145
+
146
+ max_len = 3 * self.sr
147
+ if len(y) > max_len:
148
+ y = y[:max_len]
149
+
150
+ return y, self.sr
151
+ except:
152
+ pass
153
 
154
+ # Fallback: generate synthetic audio
155
+ print("⚠️ Could not load audio, using synthetic data")
156
+ return np.random.randn(3 * self.sr) * 0.1, self.sr
157
 
158
+ def extract_basic_features(self, y):
159
+ """Extract features using pure NumPy"""
 
 
 
160
 
161
+ # Energy features
162
+ energy = np.sqrt(np.mean(y**2))
163
+ energy_std = np.std(y**2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
 
165
+ # Zero crossing rate
166
+ zero_crossings = np.sum(np.abs(np.diff(np.sign(y)))) / (2 * len(y))
 
 
167
 
168
+ # Spectral features using FFT
169
+ fft_vals = np.fft.rfft(y)
170
+ fft_mag = np.abs(fft_vals)
171
+ fft_freq = np.fft.rfftfreq(len(y), 1.0/self.sr)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
 
173
  # Spectral centroid
174
+ spectral_centroid = np.sum(fft_freq * fft_mag) / (np.sum(fft_mag) + 1e-8)
175
 
176
  # Spectral rolloff
177
+ cumsum = np.cumsum(fft_mag)
178
  rolloff_idx = np.where(cumsum >= 0.85 * cumsum[-1])[0]
179
+ spectral_rolloff = fft_freq[rolloff_idx[0]] if len(rolloff_idx) > 0 else 0
180
+
181
+ # Simple pitch estimation
182
+ autocorr = np.correlate(y, y, mode='full')
183
+ autocorr = autocorr[len(autocorr)//2:]
184
+
185
+ # Find peaks in autocorrelation
186
+ diff = np.diff(autocorr)
187
+ peaks = np.where((diff[:-1] > 0) & (diff[1:] < 0))[0] + 1
188
+
189
+ if len(peaks) > 0:
190
+ # First peak after minimum lag
191
+ min_lag = int(self.sr / 400) # Max 400 Hz
192
+ valid_peaks = peaks[peaks > min_lag]
193
+ if len(valid_peaks) > 0:
194
+ pitch = self.sr / valid_peaks[0]
195
+ else:
196
+ pitch = 150.0
197
+ else:
198
+ pitch = 150.0
199
 
200
+ # Estimate pitch variability (simplified)
201
+ frame_size = self.sr // 10
202
+ pitch_values = []
203
+ for i in range(0, len(y) - frame_size, frame_size):
204
+ frame = y[i:i+frame_size]
205
+ frame_corr = np.correlate(frame, frame, mode='full')
206
+ frame_corr = frame_corr[len(frame_corr)//2:]
207
+ diff = np.diff(frame_corr)
208
+ peaks = np.where((diff[:-1] > 0) & (diff[1:] < 0))[0] + 1
 
 
209
 
210
+ if len(peaks) > 0:
211
+ min_lag = int(self.sr / 400)
212
+ valid_peaks = peaks[peaks > min_lag]
213
+ if len(valid_peaks) > 0:
214
+ frame_pitch = self.sr / valid_peaks[0]
215
+ if 50 < frame_pitch < 400:
216
+ pitch_values.append(frame_pitch)
217
+
218
+ if len(pitch_values) > 0:
219
  pitch_std = np.std(pitch_values)
220
+ pitch_mean = np.mean(pitch_values)
221
+ else:
222
+ pitch_std = 30.0
223
+ pitch_mean = 150.0
224
+
225
+ monotone_score = 1.0 / (1.0 + pitch_std / 20.0)
226
+
227
+ # Create feature vector
228
+ features = np.array([
229
+ energy,
230
+ energy_std,
231
+ zero_crossings,
232
+ spectral_centroid / 1000.0, # Normalize
233
+ spectral_rolloff / 1000.0,
234
+ pitch_mean / 100.0,
235
+ pitch_std / 50.0,
236
+ monotone_score,
237
+ ])
238
+
239
+ # Calculate derived scores
240
+ vocal_affect = np.clip((pitch_std / 50.0) * 0.5 + (energy_std / 0.3) * 0.5, 0, 1)
241
+ vocal_energy = np.clip(energy / 0.5, 0, 1)
242
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
243
  return {
244
+ 'features': features,
245
+ 'vocal_affect_score': float(vocal_affect),
246
+ 'monotone_score': float(monotone_score),
247
+ 'vocal_energy_score': float(vocal_energy),
248
+ 'pitch_variability': float(pitch_std),
249
+ 'energy_level': float(energy)
250
  }
251
 
252
 
253
  # ============================================
254
+ # SIMPLE RULE-BASED PREDICTOR
255
  # ============================================
256
 
257
+ class SimpleEmotionPredictor:
258
+ """Rule-based emotion predictor (works without training)"""
259
 
260
  def __init__(self):
261
+ self.processor = MinimalAudioProcessor(sr=16000)
 
 
262
  self.emotions = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']
 
 
 
263
 
264
+ def predict(self, audio_path):
265
+ """Predict using rule-based system"""
266
 
267
+ # Load and extract features
268
+ y, sr = self.processor.load_audio_numpy(audio_path)
269
+ features = self.processor.extract_basic_features(y)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
270
 
271
+ # Rule-based emotion detection
272
+ energy = features['energy_level']
273
+ pitch_var = features['pitch_variability']
274
+ affect = features['vocal_affect_score']
275
+ monotone = features['monotone_score']
276
+ vocal_energy = features['vocal_energy_score']
277
 
278
+ # Emotion probabilities based on features
279
+ probs = np.zeros(8)
280
 
281
+ # Neutral: low energy, low affect
282
+ probs[0] = 1.0 - affect if affect < 0.5 else 0.2
 
 
 
 
283
 
284
+ # Calm: low energy, very low affect
285
+ probs[1] = (1.0 - vocal_energy) * (1.0 - affect) if vocal_energy < 0.4 else 0.1
 
 
286
 
287
+ # Happy: high energy, high pitch variation
288
+ probs[2] = vocal_energy * (1.0 - monotone) if vocal_energy > 0.5 else 0.2
289
 
290
+ # Sad: low energy, monotone
291
+ probs[3] = (1.0 - vocal_energy) * monotone if vocal_energy < 0.4 else 0.1
 
 
292
 
293
+ # Angry: high energy, high affect
294
+ probs[4] = vocal_energy * affect if vocal_energy > 0.6 and affect > 0.5 else 0.1
 
 
 
 
 
 
295
 
296
+ # Fearful: medium-high energy, high affect, high pitch var
297
+ probs[5] = affect * (1.0 - monotone) * 0.7 if affect > 0.5 else 0.1
 
298
 
299
+ # Disgust: medium affect
300
+ probs[6] = 0.3 if 0.3 < affect < 0.7 else 0.1
301
 
302
+ # Surprised: high energy, high pitch variation
303
+ probs[7] = vocal_energy * (1.0 - monotone) * 0.8 if vocal_energy > 0.6 else 0.1
 
 
 
304
 
305
+ # Normalize probabilities
306
+ probs = probs / (np.sum(probs) + 1e-8)
 
 
307
 
308
+ # Add some randomness for realism
309
+ probs = probs * 0.7 + np.random.dirichlet(np.ones(8)) * 0.3
310
+ probs = probs / np.sum(probs)
 
311
 
312
+ # Get top emotion
313
+ emotion_idx = np.argmax(probs)
314
+ emotion = self.emotions[emotion_idx]
315
+ confidence = probs[emotion_idx]
316
+
317
+ # Mental health indicators
318
+ indicators = self._interpret_mental_health(monotone, affect, vocal_energy)
319
 
320
  return {
321
  'emotion': emotion,
322
  'confidence': confidence,
323
  'emotion_probabilities': {
324
+ self.emotions[i]: float(p) for i, p in enumerate(probs)
325
  },
326
+ 'vocal_affect_score': affect,
327
+ 'monotone_speech_score': monotone,
328
  'vocal_energy_score': vocal_energy,
329
+ 'pitch_variability': pitch_var,
330
+ 'energy_level': energy,
331
  'mental_health_indicators': indicators
332
  }
333
 
 
335
  """Interpret mental health indicators"""
336
  indicators = []
337
 
338
+ if monotone > 0.75:
339
+ indicators.append("⚠️ Very flat speech pattern - may indicate depression")
340
+ elif monotone > 0.6:
341
+ indicators.append("⚠️ Somewhat flat speech - monitor for low mood")
342
 
343
+ if affect > 0.75 and energy > 0.7:
344
+ indicators.append("⚠️ High emotional arousal - possible anxiety or stress")
345
+ elif affect > 0.65:
346
+ indicators.append("ℹ️ Elevated emotional expression")
347
 
348
+ if energy < 0.25:
349
+ indicators.append("⚠️ Very low vocal energy - possible fatigue or depression")
350
+ elif energy < 0.35:
351
+ indicators.append("ℹ️ Lower vocal energy - may indicate low motivation")
352
 
353
+ if affect > 0.6 and monotone < 0.3:
354
+ indicators.append("ℹ️ Emotional but varied speech - normal range")
355
 
356
+ if 0.35 <= monotone <= 0.65 and 0.3 <= affect <= 0.7 and 0.3 <= energy <= 0.7:
357
+ indicators.append("βœ… All indicators within healthy range")
358
 
359
  if not indicators:
360
+ indicators.append("ℹ️ Vocal patterns appear normal")
361
 
362
  return indicators
363
 
 
366
  # GRADIO INTERFACE
367
  # ============================================
368
 
369
+ def create_interface():
370
+ """Create Gradio interface"""
371
 
372
+ print("Initializing predictor...")
373
+ predictor = SimpleEmotionPredictor()
374
+ print("βœ… Ready!")
375
 
376
+ def analyze(audio_file):
377
+ """Analyze audio file"""
378
+
379
+ if audio_file is None:
380
+ return (
381
+ "❌ Please upload an audio file",
382
+ "", "", "", "", ""
383
+ )
384
 
385
  try:
386
+ # Run prediction
387
+ results = predictor.predict(audio_file)
388
 
389
+ # Format outputs
390
+ emotion_text = f"## 🎭 Detected Emotion: **{results['emotion'].upper()}**\n\n"
391
  emotion_text += f"**Confidence:** {results['confidence']*100:.1f}%\n\n"
392
+ emotion_text += "### Emotion Probabilities:\n\n"
393
 
394
  for emotion, prob in sorted(results['emotion_probabilities'].items(),
395
  key=lambda x: x[1], reverse=True):
396
+ bar_length = int(prob * 20)
397
+ bar = "β–ˆ" * bar_length + "β–‘" * (20 - bar_length)
398
+ emotion_text += f"**{emotion.title()}:** `{bar}` {prob*100:.1f}%\n"
399
 
400
+ # Affect score
401
+ affect_score = results['vocal_affect_score']
402
+ affect_text = f"### Score: **{affect_score:.3f}**\n\n"
403
+ if affect_score > 0.7:
404
+ affect_text += "πŸ”΄ **High emotional intensity**\n"
405
+ affect_text += "Indicates stress, anxiety, or strong emotions"
406
+ elif affect_score < 0.3:
407
+ affect_text += "🟒 **Low emotional intensity**\n"
408
+ affect_text += "Indicates calm or neutral state"
409
  else:
410
+ affect_text += "🟑 **Moderate emotional intensity**\n"
411
+ affect_text += "Normal emotional expression"
412
 
413
+ # Monotone score
414
+ monotone_score = results['monotone_speech_score']
415
+ monotone_text = f"### Score: **{monotone_score:.3f}**\n\n"
416
+ if monotone_score > 0.7:
417
+ monotone_text += "πŸ”΄ **Very flat speech**\n"
418
+ monotone_text += "May indicate depression or low mood"
419
+ elif monotone_score < 0.3:
420
+ monotone_text += "🟒 **Varied pitch**\n"
421
+ monotone_text += "Good vocal variation"
422
  else:
423
+ monotone_text += "🟑 **Moderate variation**\n"
424
+ monotone_text += "Normal range"
425
 
426
+ # Energy score
427
+ energy_score = results['vocal_energy_score']
428
+ energy_text = f"### Score: **{energy_score:.3f}**\n\n"
429
+ if energy_score > 0.7:
430
+ energy_text += "🟠 **High vocal energy**\n"
431
+ energy_text += "Active, energetic speech"
432
+ elif energy_score < 0.3:
433
+ energy_text += "πŸ”΄ **Low vocal energy**\n"
434
+ energy_text += "May indicate fatigue or depression"
435
  else:
436
+ energy_text += "🟒 **Normal vocal energy**\n"
437
+ energy_text += "Healthy energy level"
438
 
439
+ # Technical details
440
+ details_text = f"**Pitch Variability:** {results['pitch_variability']:.2f} Hz\n\n"
441
+ details_text += f"**Energy Level:** {results['energy_level']:.3f}\n\n"
442
+ details_text += f"Higher pitch variability indicates more emotional expression."
443
 
444
+ # Mental health indicators
445
+ mental_text = "### Assessment:\n\n"
446
+ mental_text += "\n\n".join(results['mental_health_indicators'])
447
 
448
+ return (
449
+ emotion_text,
450
+ affect_text,
451
+ monotone_text,
452
+ energy_text,
453
+ details_text,
454
+ mental_text
455
+ )
456
 
457
  except Exception as e:
458
+ error_msg = f"❌ **Error:** {str(e)}\n\nPlease try a different audio file."
459
+ return error_msg, "", "", "", "", ""
460
 
461
+ # Create Gradio interface
462
+ with gr.Blocks(theme=gr.themes.Soft(), title="Audio Emotion Detection") as app:
463
+
464
  gr.Markdown("""
465
  # πŸŽ™οΈ Audio Emotion & Mental Health Detection
466
 
467
+ Upload a speech audio file to analyze emotional state and mental health indicators.
468
+
469
+ **Supported formats:** WAV, MP3, FLAC, OGG (3-10 seconds recommended)
470
  """)
471
 
472
  with gr.Row():
473
+ with gr.Column(scale=1):
474
+ audio_input = gr.Audio(
475
+ sources=["upload", "microphone"],
476
+ type="filepath",
477
+ label="πŸ“ Upload or Record Audio"
478
+ )
479
+
480
+ analyze_btn = gr.Button(
481
+ "πŸ” Analyze Audio",
482
+ variant="primary",
483
+ size="lg"
484
+ )
485
+
486
+ gr.Markdown("""
487
+ ### πŸ“– How to use:
488
+ 1. Upload an audio file or record directly
489
+ 2. Click "Analyze Audio"
490
+ 3. View comprehensive results β†’
491
+
492
+ **Best results:** Clear speech, 3-10 seconds
493
+ """)
494
 
495
+ with gr.Column(scale=2):
496
+ emotion_out = gr.Markdown(label="Emotion Detection Results")
497
 
498
  with gr.Row():
499
+ affect_out = gr.Markdown(label="Vocal Affect")
500
+ monotone_out = gr.Markdown(label="Monotone Score")
501
+ energy_out = gr.Markdown(label="Vocal Energy")
502
 
503
+ details_out = gr.Markdown(label="Technical Details")
504
+ mental_out = gr.Markdown(label="Mental Health Indicators")
505
 
506
  gr.Markdown("""
507
+ ---
508
+ ## πŸ“Š Understanding the Results
509
+
510
+ ### Vocal Affect Score
511
+ - **0.0 - 0.3:** Calm, relaxed speech
512
+ - **0.3 - 0.7:** Normal emotional range
513
+ - **0.7 - 1.0:** High emotional intensity (stress/anxiety)
514
+
515
+ ### Monotone Speech Score
516
+ - **0.0 - 0.3:** Good pitch variation (healthy)
517
+ - **0.3 - 0.7:** Moderate variation
518
+ - **0.7 - 1.0:** Very flat speech (depression risk)
519
+
520
+ ### Vocal Energy Score
521
+ - **0.0 - 0.3:** Low energy (fatigue/depression)
522
+ - **0.3 - 0.7:** Normal energy
523
+ - **0.7 - 1.0:** High energy (anxiety/excitement)
524
+
525
+ ---
526
+
527
+ ### ⚠️ Important Disclaimer
528
+
529
+ This tool is designed for **research and informational purposes only**. It should NOT be used as:
530
+ - A medical diagnostic tool
531
+ - A replacement for professional mental health assessment
532
+ - The sole basis for any health-related decisions
533
 
534
+ If you have concerns about your mental health, please consult with a qualified healthcare professional.
 
 
535
 
536
+ ---
537
+
538
+ **πŸ”¬ Technology:** Rule-based emotion detection using audio signal processing
539
+ **πŸ“š Based on:** Prosodic analysis, pitch variation, energy patterns, and speech characteristics
540
  """)
541
 
542
+ # Connect button
543
+ analyze_btn.click(
544
+ fn=analyze,
545
+ inputs=[audio_input],
546
+ outputs=[
547
+ emotion_out,
548
+ affect_out,
549
+ monotone_out,
550
+ energy_out,
551
+ details_out,
552
+ mental_out
553
+ ]
554
  )
555
+
556
+ # Example at bottom
557
+ gr.Markdown("""
558
+ ### πŸ’‘ Tips for Best Results
559
+ - Use clear, uncompressed audio (WAV preferred)
560
+ - 3-10 seconds of continuous speech
561
+ - Minimize background noise
562
+ - Speak naturally
563
+ """)
564
 
565
+ return app
566
 
567
 
568
  # ============================================
 
570
  # ============================================
571
 
572
  if __name__ == "__main__":
573
+ print("="*60)
574
+ print("πŸŽ™οΈ Audio Emotion & Mental Health Detection")
575
+ print("="*60)
576
+ print("\nStarting application...")
577
+
578
+ try:
579
+ app = create_interface()
580
+ app.launch(
581
+ server_name="0.0.0.0",
582
+ server_port=7860,
583
+ show_error=True
584
+ )
585
+ except Exception as e:
586
+ print(f"❌ Error launching app: {e}")
587
+ import traceback
588
+ traceback.print_exc()