Update app.py
app.py
CHANGED
|
@@ -1,39 +1,43 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
Audio Emotion & Mental Health Detection Model
|
| 4 |
-
|
|
|
|
| 5 |
"""
|
| 6 |
|
| 7 |
import os
|
| 8 |
import numpy as np
|
| 9 |
-
import torch
|
| 10 |
-
import torch.nn as nn
|
| 11 |
-
import torch.nn.functional as F
|
| 12 |
import gradio as gr
|
| 13 |
-
from typing import Dict
|
| 14 |
import warnings
|
|
|
|
| 15 |
warnings.filterwarnings('ignore')
|
| 16 |
|
| 17 |
-
#
|
| 18 |
try:
|
| 19 |
import librosa
|
| 20 |
LIBROSA_AVAILABLE = True
|
| 21 |
except ImportError:
|
| 22 |
LIBROSA_AVAILABLE = False
|
| 23 |
-
print("β οΈ Librosa not available, using
|
| 24 |
|
| 25 |
-
import scipy.signal as signal
|
| 26 |
from scipy.io import wavfile
|
| 27 |
-
import scipy.
|
| 28 |
|
| 29 |
# ============================================
|
| 30 |
-
#
|
| 31 |
# ============================================
|
| 32 |
|
| 33 |
-
class
|
| 34 |
-
"""
|
| 35 |
|
| 36 |
-
def __init__(self, sr=16000, n_mfcc=
|
| 37 |
self.sr = sr
|
| 38 |
self.n_mfcc = n_mfcc
|
| 39 |
|
|
@@ -42,238 +46,200 @@ class LightweightAudioProcessor:
|
|
| 42 |
try:
|
| 43 |
if LIBROSA_AVAILABLE:
|
| 44 |
y, sr = librosa.load(audio_path, sr=self.sr, duration=3)
|
|
|
|
| 45 |
else:
|
| 46 |
-
#
|
| 47 |
sr, y = wavfile.read(audio_path)
|
|
|
|
|
|
|
| 48 |
if len(y.shape) > 1:
|
| 49 |
-
y = y.mean(axis=1)
|
| 50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
|
| 52 |
# Resample if needed
|
| 53 |
if sr != self.sr:
|
| 54 |
num_samples = int(len(y) * self.sr / sr)
|
| 55 |
y = signal.resample(y, num_samples)
|
| 56 |
|
| 57 |
-
# Limit
|
| 58 |
max_len = 3 * self.sr
|
| 59 |
if len(y) > max_len:
|
| 60 |
y = y[:max_len]
|
| 61 |
-
|
| 62 |
-
|
| 63 |
except Exception as e:
|
| 64 |
print(f"Error loading audio: {e}")
|
| 65 |
-
return np.random.randn(self.sr * 3), self.sr
|
| 66 |
|
| 67 |
-
def
|
| 68 |
-
"""
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
else:
|
| 72 |
-
# Simplified MFCC calculation
|
| 73 |
-
# Apply pre-emphasis
|
| 74 |
-
emphasized = np.append(y[0], y[1:] - 0.97 * y[:-1])
|
| 75 |
-
|
| 76 |
-
# Frame the signal
|
| 77 |
-
frame_size = int(0.025 * self.sr)
|
| 78 |
-
frame_stride = int(0.01 * self.sr)
|
| 79 |
-
frames = self._frame_signal(emphasized, frame_size, frame_stride)
|
| 80 |
-
|
| 81 |
-
# Apply FFT
|
| 82 |
-
mag_frames = np.absolute(np.fft.rfft(frames, frame_size))
|
| 83 |
-
pow_frames = ((1.0 / frame_size) * (mag_frames ** 2))
|
| 84 |
-
|
| 85 |
-
# Mel filter banks (simplified)
|
| 86 |
-
mel_filters = self._create_mel_filters(26, frame_size, self.sr)
|
| 87 |
-
filter_banks = np.dot(pow_frames, mel_filters.T)
|
| 88 |
-
filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks)
|
| 89 |
-
filter_banks = 20 * np.log10(filter_banks)
|
| 90 |
-
|
| 91 |
-
# DCT to get MFCCs
|
| 92 |
-
mfccs = fft.dct(filter_banks, type=2, axis=1, norm='ortho')[:, :self.n_mfcc].T
|
| 93 |
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
low_freq_mel = 0
|
| 116 |
-
high_freq_mel = 2595 * np.log10(1 + (
|
| 117 |
-
mel_points = np.linspace(low_freq_mel, high_freq_mel,
|
| 118 |
hz_points = 700 * (10**(mel_points / 2595) - 1)
|
| 119 |
-
bin_points = np.floor((
|
| 120 |
|
| 121 |
-
fbank = np.zeros((
|
| 122 |
-
for m in range(1,
|
| 123 |
-
f_m_minus =
|
| 124 |
-
f_m =
|
| 125 |
-
f_m_plus =
|
| 126 |
|
| 127 |
for k in range(f_m_minus, f_m):
|
| 128 |
fbank[m - 1, k] = (k - bin_points[m - 1]) / (bin_points[m] - bin_points[m - 1])
|
| 129 |
for k in range(f_m, f_m_plus):
|
| 130 |
fbank[m - 1, k] = (bin_points[m + 1] - k) / (bin_points[m + 1] - bin_points[m])
|
| 131 |
|
| 132 |
-
|
|
|
|
| 133 |
|
| 134 |
def extract_pitch(self, y):
|
| 135 |
-
"""Extract pitch
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
pitch_values.append(pitch)
|
| 161 |
-
|
| 162 |
-
return pitch_values if pitch_values else [0]
|
| 163 |
|
| 164 |
def extract_energy(self, y):
|
| 165 |
-
"""Extract energy
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
for i in range(0, len(y) - frame_length, hop_length):
|
| 174 |
-
frame = y[i:i+frame_length]
|
| 175 |
-
rms.append(np.sqrt(np.mean(frame**2)))
|
| 176 |
-
|
| 177 |
-
rms = np.array(rms)
|
| 178 |
|
| 179 |
-
return rms
|
| 180 |
|
| 181 |
def extract_zcr(self, y):
|
| 182 |
-
"""
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
frame = y[i:i+frame_length]
|
| 192 |
-
zero_crossings = np.sum(np.abs(np.diff(np.sign(frame)))) / 2
|
| 193 |
-
zcr.append(zero_crossings / frame_length)
|
| 194 |
-
|
| 195 |
-
zcr = np.array(zcr)
|
| 196 |
|
| 197 |
-
return zcr
|
| 198 |
|
| 199 |
def extract_spectral_features(self, y):
|
| 200 |
-
"""
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
magnitude = np.abs(fft_spectrum)
|
| 204 |
freq = np.fft.rfftfreq(len(y), 1.0/self.sr)
|
| 205 |
|
| 206 |
# Spectral centroid
|
| 207 |
-
|
| 208 |
|
| 209 |
-
# Spectral rolloff
|
| 210 |
cumsum = np.cumsum(magnitude)
|
| 211 |
rolloff_idx = np.where(cumsum >= 0.85 * cumsum[-1])[0]
|
| 212 |
-
|
| 213 |
|
| 214 |
# Spectral bandwidth
|
| 215 |
-
|
| 216 |
-
spectral_bandwidth = np.sqrt(np.sum((deviation**2) * magnitude) / np.sum(magnitude))
|
| 217 |
|
| 218 |
-
return
|
| 219 |
-
|
| 220 |
-
def estimate_tempo(self, y):
|
| 221 |
-
"""Estimate tempo"""
|
| 222 |
-
if LIBROSA_AVAILABLE:
|
| 223 |
-
tempo, _ = librosa.beat.beat_track(y=y, sr=self.sr)
|
| 224 |
-
return tempo
|
| 225 |
-
else:
|
| 226 |
-
# Simplified tempo estimation
|
| 227 |
-
onset_env = self.extract_energy(y)
|
| 228 |
-
autocorr = np.correlate(onset_env, onset_env, mode='full')
|
| 229 |
-
autocorr = autocorr[len(autocorr)//2:]
|
| 230 |
-
|
| 231 |
-
# Find tempo peaks
|
| 232 |
-
peaks = signal.find_peaks(autocorr)[0]
|
| 233 |
-
if len(peaks) > 0:
|
| 234 |
-
tempo = 60.0 / (peaks[0] * 0.01) if peaks[0] > 0 else 120
|
| 235 |
-
return np.clip(tempo, 60, 180)
|
| 236 |
-
return 120
|
| 237 |
|
| 238 |
def extract_all_features(self, audio_path):
|
| 239 |
-
"""Extract
|
| 240 |
try:
|
| 241 |
-
# Load audio
|
| 242 |
y, sr = self.load_audio(audio_path)
|
| 243 |
|
| 244 |
-
#
|
| 245 |
-
|
| 246 |
-
mfcc_mean = np.mean(
|
| 247 |
-
mfcc_std = np.std(
|
| 248 |
|
| 249 |
-
#
|
| 250 |
pitch_values = self.extract_pitch(y)
|
| 251 |
pitch_mean = np.mean(pitch_values)
|
| 252 |
pitch_std = np.std(pitch_values)
|
| 253 |
pitch_min = np.min(pitch_values)
|
| 254 |
pitch_max = np.max(pitch_values)
|
| 255 |
-
monotone_score = 1 / (1 + pitch_std
|
| 256 |
|
| 257 |
-
#
|
| 258 |
rms = self.extract_energy(y)
|
| 259 |
energy_mean = np.mean(rms)
|
| 260 |
energy_std = np.std(rms)
|
| 261 |
energy_max = np.max(rms)
|
| 262 |
|
| 263 |
-
#
|
| 264 |
zcr = self.extract_zcr(y)
|
| 265 |
zcr_mean = np.mean(zcr)
|
| 266 |
zcr_std = np.std(zcr)
|
| 267 |
|
| 268 |
-
#
|
| 269 |
-
|
| 270 |
-
self.extract_spectral_features(y)
|
| 271 |
|
| 272 |
-
#
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 277 |
|
| 278 |
# Combine features
|
| 279 |
features = np.concatenate([
|
|
@@ -282,231 +248,164 @@ class LightweightAudioProcessor:
|
|
| 282 |
[pitch_mean, pitch_std, pitch_min, pitch_max, monotone_score],
|
| 283 |
[energy_mean, energy_std, energy_max],
|
| 284 |
[zcr_mean, zcr_std],
|
| 285 |
-
[
|
| 286 |
-
[chroma_mean],
|
| 287 |
[tempo]
|
| 288 |
])
|
| 289 |
|
| 290 |
-
#
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
)
|
| 294 |
-
vocal_energy_score = self._calculate_vocal_energy(
|
| 295 |
-
energy_mean, tempo, zcr_mean
|
| 296 |
-
)
|
| 297 |
|
| 298 |
return {
|
| 299 |
'features': features.astype(np.float32),
|
| 300 |
-
'vocal_affect_score': float(
|
| 301 |
'monotone_score': float(monotone_score),
|
| 302 |
-
'vocal_energy_score': float(
|
| 303 |
'pitch_variability': float(pitch_std),
|
| 304 |
'energy_level': float(energy_mean)
|
| 305 |
}
|
| 306 |
|
| 307 |
except Exception as e:
|
| 308 |
-
print(f"Error
|
| 309 |
-
|
| 310 |
-
return self._get_default_features()
|
| 311 |
|
| 312 |
-
def
|
| 313 |
-
"""Calculate
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
affect_score = (pitch_component * 0.4 +
|
| 319 |
-
energy_component * 0.4 +
|
| 320 |
-
spectral_component * 0.2)
|
| 321 |
-
|
| 322 |
-
return np.clip(affect_score, 0, 1)
|
| 323 |
|
| 324 |
-
def
|
| 325 |
-
"""Calculate vocal energy
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
energy_score = (energy_component * 0.5 +
|
| 331 |
-
tempo_component * 0.3 +
|
| 332 |
-
zcr_component * 0.2)
|
| 333 |
-
|
| 334 |
-
return np.clip(energy_score, 0, 1)
|
| 335 |
|
| 336 |
-
def
|
| 337 |
-
"""
|
| 338 |
-
n_features = self.n_mfcc * 2 +
|
| 339 |
return {
|
| 340 |
-
'features': np.random.randn(n_features).astype(np.float32),
|
| 341 |
'vocal_affect_score': 0.5,
|
| 342 |
'monotone_score': 0.5,
|
| 343 |
'vocal_energy_score': 0.5,
|
| 344 |
-
'pitch_variability':
|
| 345 |
-
'energy_level': 0.
|
| 346 |
}
|
| 347 |
|
| 348 |
|
| 349 |
# ============================================
|
| 350 |
-
#
|
| 351 |
-
# ============================================
|
| 352 |
-
|
| 353 |
-
class MultiTaskEmotionModel(nn.Module):
|
| 354 |
-
"""Multi-task emotion and mental health detection model"""
|
| 355 |
-
|
| 356 |
-
def __init__(self, input_dim, num_emotions=8, dropout=0.5):
|
| 357 |
-
super(MultiTaskEmotionModel, self).__init__()
|
| 358 |
-
|
| 359 |
-
# Shared layers
|
| 360 |
-
self.shared_layers = nn.Sequential(
|
| 361 |
-
nn.Linear(input_dim, 512),
|
| 362 |
-
nn.BatchNorm1d(512),
|
| 363 |
-
nn.ReLU(),
|
| 364 |
-
nn.Dropout(dropout),
|
| 365 |
-
|
| 366 |
-
nn.Linear(512, 256),
|
| 367 |
-
nn.BatchNorm1d(256),
|
| 368 |
-
nn.ReLU(),
|
| 369 |
-
nn.Dropout(dropout),
|
| 370 |
-
|
| 371 |
-
nn.Linear(256, 128),
|
| 372 |
-
nn.BatchNorm1d(128),
|
| 373 |
-
nn.ReLU(),
|
| 374 |
-
nn.Dropout(dropout/2)
|
| 375 |
-
)
|
| 376 |
-
|
| 377 |
-
# Emotion classification head
|
| 378 |
-
self.emotion_head = nn.Sequential(
|
| 379 |
-
nn.Linear(128, 64),
|
| 380 |
-
nn.ReLU(),
|
| 381 |
-
nn.Dropout(dropout/2),
|
| 382 |
-
nn.Linear(64, num_emotions)
|
| 383 |
-
)
|
| 384 |
-
|
| 385 |
-
# Regression heads
|
| 386 |
-
self.affect_head = nn.Sequential(
|
| 387 |
-
nn.Linear(128, 32),
|
| 388 |
-
nn.ReLU(),
|
| 389 |
-
nn.Linear(32, 1),
|
| 390 |
-
nn.Sigmoid()
|
| 391 |
-
)
|
| 392 |
-
|
| 393 |
-
self.monotone_head = nn.Sequential(
|
| 394 |
-
nn.Linear(128, 32),
|
| 395 |
-
nn.ReLU(),
|
| 396 |
-
nn.Linear(32, 1),
|
| 397 |
-
nn.Sigmoid()
|
| 398 |
-
)
|
| 399 |
-
|
| 400 |
-
self.energy_head = nn.Sequential(
|
| 401 |
-
nn.Linear(128, 32),
|
| 402 |
-
nn.ReLU(),
|
| 403 |
-
nn.Linear(32, 1),
|
| 404 |
-
nn.Sigmoid()
|
| 405 |
-
)
|
| 406 |
-
|
| 407 |
-
def forward(self, x):
|
| 408 |
-
shared = self.shared_layers(x)
|
| 409 |
-
|
| 410 |
-
return {
|
| 411 |
-
'emotion_logits': self.emotion_head(shared),
|
| 412 |
-
'vocal_affect': self.affect_head(shared),
|
| 413 |
-
'monotone_score': self.monotone_head(shared),
|
| 414 |
-
'vocal_energy': self.energy_head(shared)
|
| 415 |
-
}
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
# ============================================
|
| 419 |
-
# PREDICTOR CLASS
|
| 420 |
# ============================================
|
| 421 |
|
| 422 |
class EmotionPredictor:
|
| 423 |
-
"""
|
| 424 |
|
| 425 |
def __init__(self):
|
| 426 |
-
self.
|
| 427 |
-
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
| 428 |
|
| 429 |
# Emotion mapping
|
| 430 |
-
self.
|
| 431 |
-
'neutral': 0, 'calm': 1, 'happy': 2, 'sad': 3,
|
| 432 |
-
'angry': 4, 'fearful': 5, 'disgust': 6, 'surprised': 7
|
| 433 |
-
}
|
| 434 |
-
self.reverse_emotion_map = {v: k for k, v in self.emotion_map.items()}
|
| 435 |
-
|
| 436 |
-
# Initialize model with pre-trained weights
|
| 437 |
-
input_dim = 98 # 40*2 (MFCC mean+std) + 18 other features
|
| 438 |
-
self.model = MultiTaskEmotionModel(
|
| 439 |
-
input_dim=input_dim,
|
| 440 |
-
num_emotions=len(self.emotion_map),
|
| 441 |
-
dropout=0.3
|
| 442 |
-
)
|
| 443 |
-
|
| 444 |
-
# Load pre-trained weights if available, otherwise use initialized weights
|
| 445 |
-
self._load_or_initialize_model()
|
| 446 |
|
| 447 |
-
|
| 448 |
-
self.
|
| 449 |
|
| 450 |
-
def
|
| 451 |
-
"""
|
| 452 |
-
model_path = 'emotion_model.pth'
|
| 453 |
|
| 454 |
-
|
|
|
|
| 455 |
try:
|
| 456 |
-
|
| 457 |
-
|
| 458 |
-
|
| 459 |
-
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
|
| 464 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 465 |
|
| 466 |
-
def predict(self, audio_path
|
| 467 |
"""Predict emotion and mental health indicators"""
|
| 468 |
|
| 469 |
# Extract features
|
| 470 |
-
feature_dict = self.
|
| 471 |
-
features =
|
| 472 |
-
features = features.to(self.device)
|
| 473 |
|
| 474 |
-
#
|
| 475 |
-
|
| 476 |
-
outputs = self.model(features)
|
| 477 |
|
| 478 |
-
#
|
| 479 |
-
emotion_probs =
|
| 480 |
-
emotion_idx =
|
| 481 |
-
emotion = self.
|
| 482 |
-
confidence = emotion_probs[emotion_idx]
|
| 483 |
|
| 484 |
-
#
|
| 485 |
-
vocal_affect =
|
| 486 |
-
monotone_score =
|
| 487 |
-
vocal_energy =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 488 |
|
| 489 |
# Mental health interpretation
|
| 490 |
-
|
| 491 |
-
monotone_score, vocal_affect, vocal_energy
|
| 492 |
-
)
|
| 493 |
|
| 494 |
-
|
| 495 |
'emotion': emotion,
|
| 496 |
'confidence': confidence,
|
| 497 |
'emotion_probabilities': {
|
| 498 |
-
self.
|
| 499 |
-
for i, prob in enumerate(emotion_probs)
|
| 500 |
},
|
| 501 |
'vocal_affect_score': vocal_affect,
|
| 502 |
'monotone_speech_score': monotone_score,
|
| 503 |
'vocal_energy_score': vocal_energy,
|
| 504 |
'pitch_variability': feature_dict['pitch_variability'],
|
| 505 |
'energy_level': feature_dict['energy_level'],
|
| 506 |
-
'mental_health_indicators':
|
| 507 |
}
|
| 508 |
-
|
| 509 |
-
return results
|
| 510 |
|
| 511 |
def _interpret_mental_health(self, monotone, affect, energy):
|
| 512 |
"""Interpret mental health indicators"""
|
|
@@ -524,8 +423,8 @@ class EmotionPredictor:
|
|
| 524 |
if affect > 0.6 and monotone < 0.4:
|
| 525 |
indicators.append("β οΈ High vocal affect - possible emotional stress")
|
| 526 |
|
| 527 |
-
if 0.
|
| 528 |
-
indicators.append("β
Balanced vocal characteristics
|
| 529 |
|
| 530 |
if not indicators:
|
| 531 |
indicators.append("βΉοΈ Vocal patterns within normal range")
|
|
@@ -537,188 +436,111 @@ class EmotionPredictor:
|
|
| 537 |
# GRADIO INTERFACE
|
| 538 |
# ============================================
|
| 539 |
|
| 540 |
-
def
|
| 541 |
-
"""Create Gradio
|
| 542 |
|
| 543 |
-
# Initialize predictor
|
| 544 |
-
print("Initializing emotion predictor...")
|
| 545 |
predictor = EmotionPredictor()
|
| 546 |
-
print("β
Predictor ready!")
|
| 547 |
|
| 548 |
-
def
|
| 549 |
-
"""
|
| 550 |
if audio is None:
|
| 551 |
-
return
|
| 552 |
-
emotion_output: "β Please upload an audio file",
|
| 553 |
-
affect_output: "",
|
| 554 |
-
monotone_output: "",
|
| 555 |
-
energy_output: "",
|
| 556 |
-
pitch_output: "",
|
| 557 |
-
mental_health_output: ""
|
| 558 |
-
}
|
| 559 |
|
| 560 |
try:
|
| 561 |
-
# Run prediction
|
| 562 |
results = predictor.predict(audio)
|
| 563 |
|
| 564 |
# Format emotion output
|
| 565 |
-
emotion_text = f"## π
|
| 566 |
emotion_text += f"**Confidence:** {results['confidence']*100:.1f}%\n\n"
|
| 567 |
-
emotion_text += "###
|
| 568 |
|
| 569 |
for emotion, prob in sorted(results['emotion_probabilities'].items(),
|
| 570 |
key=lambda x: x[1], reverse=True):
|
| 571 |
-
|
| 572 |
-
|
| 573 |
-
emotion_text += f"**{emotion.capitalize()}:** {bar} {prob*100:.1f}%\n"
|
| 574 |
|
| 575 |
# Format scores
|
| 576 |
-
|
| 577 |
if results['vocal_affect_score'] > 0.7:
|
| 578 |
-
|
| 579 |
elif results['vocal_affect_score'] < 0.3:
|
| 580 |
-
|
| 581 |
else:
|
| 582 |
-
|
| 583 |
|
| 584 |
-
|
| 585 |
if results['monotone_speech_score'] > 0.7:
|
| 586 |
-
|
| 587 |
elif results['monotone_speech_score'] < 0.3:
|
| 588 |
-
|
| 589 |
else:
|
| 590 |
-
|
| 591 |
|
| 592 |
-
|
| 593 |
if results['vocal_energy_score'] > 0.7:
|
| 594 |
-
|
| 595 |
elif results['vocal_energy_score'] < 0.3:
|
| 596 |
-
|
| 597 |
else:
|
| 598 |
-
|
| 599 |
|
| 600 |
-
|
| 601 |
-
|
| 602 |
|
| 603 |
-
|
| 604 |
|
| 605 |
-
return
|
| 606 |
-
emotion_output: emotion_text,
|
| 607 |
-
affect_output: affect_text,
|
| 608 |
-
monotone_output: monotone_text,
|
| 609 |
-
energy_output: energy_text,
|
| 610 |
-
pitch_output: pitch_text,
|
| 611 |
-
mental_health_output: mental_health_text
|
| 612 |
-
}
|
| 613 |
|
| 614 |
except Exception as e:
|
| 615 |
-
|
| 616 |
-
return {
|
| 617 |
-
emotion_output: error_msg,
|
| 618 |
-
affect_output: "",
|
| 619 |
-
monotone_output: "",
|
| 620 |
-
energy_output: "",
|
| 621 |
-
pitch_output: "",
|
| 622 |
-
mental_health_output: ""
|
| 623 |
-
}
|
| 624 |
|
| 625 |
# Create interface
|
| 626 |
-
with gr.Blocks(theme=gr.themes.Soft()
|
| 627 |
-
|
| 628 |
gr.Markdown("""
|
| 629 |
# 🎙️ Audio Emotion & Mental Health Detection
|
| 630 |
|
| 631 |
-
|
| 632 |
-
|
| 633 |
-
**Features:**
|
| 634 |
-
- π Emotion Recognition (8 emotions)
|
| 635 |
-
- π Vocal Affect Score (emotional intensity)
|
| 636 |
-
- π Monotone Speech Detection (depression indicator)
|
| 637 |
-
- β‘ Vocal Energy Analysis (mood disorder indicator)
|
| 638 |
""")
|
| 639 |
|
| 640 |
with gr.Row():
|
| 641 |
-
with gr.Column(
|
| 642 |
-
|
| 643 |
-
|
| 644 |
-
label="Upload Audio File (WAV, MP3, etc.)"
|
| 645 |
-
)
|
| 646 |
-
|
| 647 |
-
analyze_btn = gr.Button("π Analyze Audio", variant="primary", size="lg")
|
| 648 |
-
|
| 649 |
-
gr.Markdown("""
|
| 650 |
-
### π Instructions:
|
| 651 |
-
1. Upload an audio file (WAV, MP3, etc.)
|
| 652 |
-
2. Click "Analyze Audio"
|
| 653 |
-
3. View results on the right
|
| 654 |
-
|
| 655 |
-
**Note:** Works best with clear speech recordings (3-10 seconds)
|
| 656 |
-
""")
|
| 657 |
|
| 658 |
-
with gr.Column(
|
| 659 |
-
|
| 660 |
|
| 661 |
with gr.Row():
|
| 662 |
-
|
| 663 |
-
|
| 664 |
-
|
| 665 |
-
monotone_output = gr.Markdown(label="Monotone Score")
|
| 666 |
-
with gr.Column():
|
| 667 |
-
energy_output = gr.Markdown(label="Vocal Energy")
|
| 668 |
|
| 669 |
-
|
| 670 |
-
|
| 671 |
|
| 672 |
gr.Markdown("""
|
| 673 |
-
|
| 674 |
-
|
| 675 |
-
|
| 676 |
-
|
| 677 |
-
|
| 678 |
-
|
| 679 |
-
|
| 680 |
-
| | 0.7-1.0 | High emotional intensity (stress/anxiety) |
|
| 681 |
-
| **Monotone Score** | 0.0-0.3 | High pitch variation (normal) |
|
| 682 |
-
| | 0.3-0.7 | Moderate pitch variation |
|
| 683 |
-
| | 0.7-1.0 | Very flat speech (possible depression) |
|
| 684 |
-
| **Vocal Energy** | 0.0-0.3 | Low energy (possible low motivation) |
|
| 685 |
-
| | 0.3-0.7 | Normal energy level |
|
| 686 |
-
| | 0.7-1.0 | High energy (possible anxiety/mania) |
|
| 687 |
-
|
| 688 |
-
---
|
| 689 |
-
|
| 690 |
-
**⚠️ Disclaimer:** This tool is for research and informational purposes only.
|
| 691 |
-
It should not be used as a substitute for professional medical or psychological diagnosis.
|
| 692 |
-
Always consult qualified healthcare professionals for mental health concerns.
|
| 693 |
-
|
| 694 |
-
**🔬 Model Info:** Multi-task Deep Neural Network trained on emotional speech datasets (RAVDESS, TESS, CREMA-D)
|
| 695 |
""")
|
| 696 |
|
| 697 |
-
|
| 698 |
-
|
| 699 |
-
|
| 700 |
-
|
| 701 |
-
outputs=[emotion_output, affect_output, monotone_output,
|
| 702 |
-
energy_output, pitch_output, mental_health_output]
|
| 703 |
)
|
| 704 |
|
| 705 |
return demo
|
| 706 |
|
| 707 |
|
| 708 |
# ============================================
|
| 709 |
-
# MAIN
|
| 710 |
# ============================================
|
| 711 |
|
| 712 |
if __name__ == "__main__":
|
| 713 |
-
|
| 714 |
-
|
| 715 |
-
print("="*60)
|
| 716 |
-
print("\nStarting Gradio interface...")
|
| 717 |
-
|
| 718 |
-
# Create and launch app
|
| 719 |
-
app = create_gradio_app()
|
| 720 |
-
app.launch(
|
| 721 |
-
server_name="0.0.0.0",
|
| 722 |
-
server_port=7860,
|
| 723 |
-
share=False
|
| 724 |
-
)
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
Audio Emotion & Mental Health Detection Model
|
| 4 |
+
Lightweight version for Hugging Face Spaces
|
| 5 |
+
Using scikit-learn instead of PyTorch
|
| 6 |
"""
|
| 7 |
|
| 8 |
import os
|
| 9 |
import numpy as np
|
|
|
|
|
|
|
|
|
|
| 10 |
import gradio as gr
|
| 11 |
+
from typing import Dict
|
| 12 |
import warnings
|
| 13 |
+
import pickle
|
| 14 |
warnings.filterwarnings('ignore')
|
| 15 |
|
| 16 |
+
# Audio processing
|
| 17 |
try:
|
| 18 |
import librosa
|
| 19 |
LIBROSA_AVAILABLE = True
|
| 20 |
except ImportError:
|
| 21 |
LIBROSA_AVAILABLE = False
|
| 22 |
+
print("β οΈ Librosa not available, using scipy")
|
| 23 |
|
|
|
|
| 24 |
from scipy.io import wavfile
|
| 25 |
+
import scipy.signal as signal
|
| 26 |
+
from scipy import fft
|
| 27 |
+
|
| 28 |
+
# Machine Learning
|
| 29 |
+
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
|
| 30 |
+
from sklearn.preprocessing import StandardScaler
|
| 31 |
+
from sklearn.neural_network import MLPClassifier, MLPRegressor
|
| 32 |
|
| 33 |
# ============================================
|
| 34 |
+
# AUDIO PROCESSING
|
| 35 |
# ============================================
|
| 36 |
|
| 37 |
+
class AudioFeatureExtractor:
|
| 38 |
+
"""Extract audio features without heavy dependencies"""
|
| 39 |
|
| 40 |
+
def __init__(self, sr=16000, n_mfcc=20):
|
| 41 |
self.sr = sr
|
| 42 |
self.n_mfcc = n_mfcc
|
| 43 |
|
|
|
|
| 46 |
try:
|
| 47 |
if LIBROSA_AVAILABLE:
|
| 48 |
y, sr = librosa.load(audio_path, sr=self.sr, duration=3)
|
| 49 |
+
return y, sr
|
| 50 |
else:
|
| 51 |
+
# Use scipy
|
| 52 |
sr, y = wavfile.read(audio_path)
|
| 53 |
+
|
| 54 |
+
# Convert to mono
|
| 55 |
if len(y.shape) > 1:
|
| 56 |
+
y = y.mean(axis=1)
|
| 57 |
+
|
| 58 |
+
# Normalize
|
| 59 |
+
y = y.astype(np.float32)
|
| 60 |
+
if np.max(np.abs(y)) > 0:
|
| 61 |
+
y = y / np.max(np.abs(y))
|
| 62 |
|
| 63 |
# Resample if needed
|
| 64 |
if sr != self.sr:
|
| 65 |
num_samples = int(len(y) * self.sr / sr)
|
| 66 |
y = signal.resample(y, num_samples)
|
| 67 |
|
| 68 |
+
# Limit to 3 seconds
|
| 69 |
max_len = 3 * self.sr
|
| 70 |
if len(y) > max_len:
|
| 71 |
y = y[:max_len]
|
| 72 |
+
|
| 73 |
+
return y, self.sr
|
| 74 |
except Exception as e:
|
| 75 |
print(f"Error loading audio: {e}")
|
| 76 |
+
return np.random.randn(self.sr * 3) * 0.1, self.sr
|
| 77 |
|
| 78 |
+
def get_mfcc_simple(self, y):
|
| 79 |
+
"""Simplified MFCC extraction"""
|
| 80 |
+
# Pre-emphasis
|
| 81 |
+
y_emphasized = np.append(y[0], y[1:] - 0.97 * y[:-1])
|
| 82 |
|
| 83 |
+
# Framing
|
| 84 |
+
frame_length = int(0.025 * self.sr)
|
| 85 |
+
frame_step = int(0.01 * self.sr)
|
| 86 |
+
|
| 87 |
+
num_frames = 1 + int((len(y_emphasized) - frame_length) / frame_step)
|
| 88 |
+
frames = np.zeros((num_frames, frame_length))
|
| 89 |
+
|
| 90 |
+
for i in range(num_frames):
|
| 91 |
+
start = i * frame_step
|
| 92 |
+
frames[i] = y_emphasized[start:start + frame_length]
|
| 93 |
+
|
| 94 |
+
# Apply window
|
| 95 |
+
frames *= np.hamming(frame_length)
|
| 96 |
+
|
| 97 |
+
# FFT
|
| 98 |
+
mag_frames = np.absolute(np.fft.rfft(frames, frame_length))
|
| 99 |
+
pow_frames = ((1.0 / frame_length) * (mag_frames ** 2))
|
| 100 |
+
|
| 101 |
+
# Mel filterbank
|
| 102 |
+
nfft = frame_length
|
| 103 |
+
nfilt = 26
|
| 104 |
low_freq_mel = 0
|
| 105 |
+
high_freq_mel = 2595 * np.log10(1 + (self.sr / 2) / 700)
|
| 106 |
+
mel_points = np.linspace(low_freq_mel, high_freq_mel, nfilt + 2)
|
| 107 |
hz_points = 700 * (10**(mel_points / 2595) - 1)
|
| 108 |
+
bin_points = np.floor((nfft + 1) * hz_points / self.sr).astype(int)
|
| 109 |
|
| 110 |
+
fbank = np.zeros((nfilt, int(nfft / 2 + 1)))
|
| 111 |
+
for m in range(1, nfilt + 1):
|
| 112 |
+
f_m_minus = bin_points[m - 1]
|
| 113 |
+
f_m = bin_points[m]
|
| 114 |
+
f_m_plus = bin_points[m + 1]
|
| 115 |
|
| 116 |
for k in range(f_m_minus, f_m):
|
| 117 |
fbank[m - 1, k] = (k - bin_points[m - 1]) / (bin_points[m] - bin_points[m - 1])
|
| 118 |
for k in range(f_m, f_m_plus):
|
| 119 |
fbank[m - 1, k] = (bin_points[m + 1] - k) / (bin_points[m + 1] - bin_points[m])
|
| 120 |
|
| 121 |
+
filter_banks = np.dot(pow_frames, fbank.T)
|
| 122 |
+
filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks)
|
| 123 |
+
filter_banks = 20 * np.log10(filter_banks)
|
| 124 |
+
|
| 125 |
+
# DCT
|
| 126 |
+
mfcc = fft.dct(filter_banks, type=2, axis=1, norm='ortho')[:, :self.n_mfcc]
|
| 127 |
+
|
| 128 |
+
return mfcc.T
|
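# --- Illustrative sketch (not part of this commit) ---
# Rough sanity check for get_mfcc_simple, assuming app.py is importable as `app`
# and a synthetic tone stands in for real speech. With sr=16000, 25 ms frames and
# a 10 ms hop, a 3 s clip gives 1 + (48000 - 400) // 160 = 298 frames, so the
# returned matrix should have shape (n_mfcc, 298).
import numpy as np
from app import AudioFeatureExtractor  # assumed import path

extractor = AudioFeatureExtractor(sr=16000, n_mfcc=20)
t = np.linspace(0, 3, 3 * 16000, endpoint=False)
tone = (0.5 * np.sin(2 * np.pi * 220 * t)).astype(np.float32)  # 220 Hz test tone
mfcc = extractor.get_mfcc_simple(tone)
print(mfcc.shape)  # expected: (20, 298)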
| 129 |
|
| 130 |
def extract_pitch(self, y):
|
| 131 |
+
"""Extract pitch using autocorrelation"""
|
| 132 |
+
pitch_values = []
|
| 133 |
+
frame_length = int(0.03 * self.sr)
|
| 134 |
+
hop_length = int(0.01 * self.sr)
|
| 135 |
+
|
| 136 |
+
for i in range(0, len(y) - frame_length, hop_length):
|
| 137 |
+
frame = y[i:i+frame_length]
|
| 138 |
+
|
| 139 |
+
# Autocorrelation
|
| 140 |
+
corr = np.correlate(frame, frame, mode='full')
|
| 141 |
+
corr = corr[len(corr)//2:]
|
| 142 |
+
|
| 143 |
+
# Find first peak after lag 0
|
| 144 |
+
d = np.diff(corr)
|
| 145 |
+
start = int(self.sr / 400)  # minimum lag, caps the pitch search at 400 Hz
|
| 146 |
+
peak = np.where(d[start:] < 0)[0]
|
| 147 |
+
|
| 148 |
+
if len(peak) > 0:
|
| 149 |
+
peak_idx = peak[0] + start
|
| 150 |
+
if peak_idx > 0:
|
| 151 |
+
freq = self.sr / peak_idx
|
| 152 |
+
if 50 < freq < 400:
|
| 153 |
+
pitch_values.append(freq)
|
| 154 |
+
|
| 155 |
+
return pitch_values if pitch_values else [150.0]
|
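# Worked example (illustrative): for a pure 220 Hz tone sampled at 16 kHz the
# autocorrelation of each 30 ms frame peaks near lag 16000/220 ≈ 73 samples, so the
# detector above reports roughly 16000/73 ≈ 219 Hz, inside the accepted 50-400 Hz band.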
|
|
|
|
|
|
|
|
|
| 156 |
|
| 157 |
def extract_energy(self, y):
|
| 158 |
+
"""Extract RMS energy"""
|
| 159 |
+
frame_length = int(0.025 * self.sr)
|
| 160 |
+
hop_length = int(0.01 * self.sr)
|
| 161 |
+
|
| 162 |
+
rms = []
|
| 163 |
+
for i in range(0, len(y) - frame_length, hop_length):
|
| 164 |
+
frame = y[i:i+frame_length]
|
| 165 |
+
rms.append(np.sqrt(np.mean(frame**2)))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
|
| 167 |
+
return np.array(rms)
|
| 168 |
|
| 169 |
def extract_zcr(self, y):
|
| 170 |
+
"""Zero crossing rate"""
|
| 171 |
+
frame_length = int(0.025 * self.sr)
|
| 172 |
+
hop_length = int(0.01 * self.sr)
|
| 173 |
+
|
| 174 |
+
zcr = []
|
| 175 |
+
for i in range(0, len(y) - frame_length, hop_length):
|
| 176 |
+
frame = y[i:i+frame_length]
|
| 177 |
+
crossings = np.sum(np.abs(np.diff(np.sign(frame)))) / 2
|
| 178 |
+
zcr.append(crossings / frame_length)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
|
| 180 |
+
return np.array(zcr)
|
| 181 |
|
| 182 |
def extract_spectral_features(self, y):
|
| 183 |
+
"""Spectral features"""
|
| 184 |
+
spectrum = np.fft.rfft(y)
|
| 185 |
+
magnitude = np.abs(spectrum)
|
|
|
|
| 186 |
freq = np.fft.rfftfreq(len(y), 1.0/self.sr)
|
| 187 |
|
| 188 |
# Spectral centroid
|
| 189 |
+
centroid = np.sum(freq * magnitude) / (np.sum(magnitude) + 1e-6)
|
| 190 |
|
| 191 |
+
# Spectral rolloff
|
| 192 |
cumsum = np.cumsum(magnitude)
|
| 193 |
rolloff_idx = np.where(cumsum >= 0.85 * cumsum[-1])[0]
|
| 194 |
+
rolloff = freq[rolloff_idx[0]] if len(rolloff_idx) > 0 else 0
|
| 195 |
|
| 196 |
# Spectral bandwidth
|
| 197 |
+
bandwidth = np.sqrt(np.sum(((freq - centroid)**2) * magnitude) / (np.sum(magnitude) + 1e-6))
|
|
|
|
| 198 |
|
| 199 |
+
return centroid, rolloff, bandwidth
|
| 200 |
|
| 201 |
def extract_all_features(self, audio_path):
|
| 202 |
+
"""Extract all features"""
|
| 203 |
try:
|
|
|
|
| 204 |
y, sr = self.load_audio(audio_path)
|
| 205 |
|
| 206 |
+
# MFCCs
|
| 207 |
+
mfcc = self.get_mfcc_simple(y)
|
| 208 |
+
mfcc_mean = np.mean(mfcc, axis=1)
|
| 209 |
+
mfcc_std = np.std(mfcc, axis=1)
|
| 210 |
|
| 211 |
+
# Pitch
|
| 212 |
pitch_values = self.extract_pitch(y)
|
| 213 |
pitch_mean = np.mean(pitch_values)
|
| 214 |
pitch_std = np.std(pitch_values)
|
| 215 |
pitch_min = np.min(pitch_values)
|
| 216 |
pitch_max = np.max(pitch_values)
|
| 217 |
+
monotone_score = 1.0 / (1.0 + pitch_std/10.0)
|
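# Worked example (illustrative): with monotone_score = 1 / (1 + pitch_std / 10),
# a lively voice with pitch_std ≈ 30 Hz scores 1 / 4 = 0.25, while a flat voice with
# pitch_std ≈ 2 Hz scores 1 / 1.2 ≈ 0.83, i.e. a higher score means flatter speech.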
| 218 |
|
| 219 |
+
# Energy
|
| 220 |
rms = self.extract_energy(y)
|
| 221 |
energy_mean = np.mean(rms)
|
| 222 |
energy_std = np.std(rms)
|
| 223 |
energy_max = np.max(rms)
|
| 224 |
|
| 225 |
+
# ZCR
|
| 226 |
zcr = self.extract_zcr(y)
|
| 227 |
zcr_mean = np.mean(zcr)
|
| 228 |
zcr_std = np.std(zcr)
|
| 229 |
|
| 230 |
+
# Spectral
|
| 231 |
+
spec_centroid, spec_rolloff, spec_bandwidth = self.extract_spectral_features(y)
|
|
|
|
| 232 |
|
| 233 |
+
# Tempo estimation
|
| 234 |
+
onset_env = rms
|
| 235 |
+
tempo = 120.0 # Default
|
| 236 |
+
if len(onset_env) > 10:
|
| 237 |
+
autocorr = np.correlate(onset_env, onset_env, mode='full')
|
| 238 |
+
autocorr = autocorr[len(autocorr)//2:]
|
| 239 |
+
peaks = signal.find_peaks(autocorr)[0]
|
| 240 |
+
if len(peaks) > 0 and peaks[0] > 0:
|
| 241 |
+
tempo = 60.0 / (peaks[0] * 0.01)
|
| 242 |
+
tempo = np.clip(tempo, 60, 180)
|
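# Worked example (illustrative): the RMS envelope uses a 10 ms hop, so an
# autocorrelation peak at lag 50 frames corresponds to a 0.5 s beat period and
# 60 / 0.5 = 120 BPM, which the np.clip above leaves unchanged (range 60-180).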
| 243 |
|
| 244 |
# Combine features
|
| 245 |
features = np.concatenate([
|
|
|
|
| 248 |
[pitch_mean, pitch_std, pitch_min, pitch_max, monotone_score],
|
| 249 |
[energy_mean, energy_std, energy_max],
|
| 250 |
[zcr_mean, zcr_std],
|
| 251 |
+
[spec_centroid, spec_rolloff, spec_bandwidth],
|
|
|
|
| 252 |
[tempo]
|
| 253 |
])
|
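# Feature count check (illustrative): 20 MFCC means + 20 MFCC stds + 5 pitch stats
# + 3 energy stats + 2 ZCR stats + 3 spectral values + 1 tempo = 54 values,
# matching the n_features = 54 assumed by the models in _initialize_models below.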
| 254 |
|
| 255 |
+
# Derived scores
|
| 256 |
+
vocal_affect = self._calc_affect(pitch_std, energy_std, spec_centroid)
|
| 257 |
+
vocal_energy = self._calc_energy(energy_mean, tempo, zcr_mean)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 258 |
|
| 259 |
return {
|
| 260 |
'features': features.astype(np.float32),
|
| 261 |
+
'vocal_affect_score': float(vocal_affect),
|
| 262 |
'monotone_score': float(monotone_score),
|
| 263 |
+
'vocal_energy_score': float(vocal_energy),
|
| 264 |
'pitch_variability': float(pitch_std),
|
| 265 |
'energy_level': float(energy_mean)
|
| 266 |
}
|
| 267 |
|
| 268 |
except Exception as e:
|
| 269 |
+
print(f"Error: {e}")
|
| 270 |
+
return self._default_features()
|
|
|
|
| 271 |
|
| 272 |
+
def _calc_affect(self, pitch_std, energy_std, spec_centroid):
|
| 273 |
+
"""Calculate vocal affect score"""
|
| 274 |
+
pitch_comp = min(pitch_std / 50.0, 1.0)
|
| 275 |
+
energy_comp = min(energy_std / 0.3, 1.0)
|
| 276 |
+
spec_comp = min(spec_centroid / 2000.0, 1.0)
|
| 277 |
+
return np.clip(pitch_comp * 0.4 + energy_comp * 0.4 + spec_comp * 0.2, 0, 1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 278 |
|
| 279 |
+
def _calc_energy(self, energy_mean, tempo, zcr_mean):
|
| 280 |
+
"""Calculate vocal energy score"""
|
| 281 |
+
energy_comp = min(energy_mean / 0.5, 1.0)
|
| 282 |
+
tempo_comp = min(tempo / 150.0, 1.0)
|
| 283 |
+
zcr_comp = min(zcr_mean / 0.15, 1.0)
|
| 284 |
+
return np.clip(energy_comp * 0.5 + tempo_comp * 0.3 + zcr_comp * 0.2, 0, 1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 285 |
|
| 286 |
+
def _default_features(self):
|
| 287 |
+
"""Default features for errors"""
|
| 288 |
+
n_features = self.n_mfcc * 2 + 14
|
| 289 |
return {
|
| 290 |
+
'features': np.random.randn(n_features).astype(np.float32) * 0.1,
|
| 291 |
'vocal_affect_score': 0.5,
|
| 292 |
'monotone_score': 0.5,
|
| 293 |
'vocal_energy_score': 0.5,
|
| 294 |
+
'pitch_variability': 30.0,
|
| 295 |
+
'energy_level': 0.3
|
| 296 |
}
|
| 297 |
|
| 298 |
|
| 299 |
# ============================================
|
| 300 |
+
# EMOTION PREDICTOR
|
| 301 |
# ============================================
|
| 302 |
|
| 303 |
class EmotionPredictor:
|
| 304 |
+
"""Lightweight emotion predictor using sklearn"""
|
| 305 |
|
| 306 |
def __init__(self):
|
| 307 |
+
self.extractor = AudioFeatureExtractor(sr=16000, n_mfcc=20)
|
|
|
|
| 308 |
|
| 309 |
# Emotion mapping
|
| 310 |
+
self.emotions = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']
|
| 311 |
|
| 312 |
+
# Initialize models
|
| 313 |
+
self._initialize_models()
|
| 314 |
|
| 315 |
+
def _initialize_models(self):
|
| 316 |
+
"""Initialize pre-trained or demo models"""
|
|
|
|
| 317 |
|
| 318 |
+
# Try to load pre-trained models
|
| 319 |
+
if os.path.exists('emotion_classifier.pkl'):
|
| 320 |
try:
|
| 321 |
+
with open('emotion_classifier.pkl', 'rb') as f:
|
| 322 |
+
self.emotion_model = pickle.load(f)
|
| 323 |
+
with open('affect_model.pkl', 'rb') as f:
|
| 324 |
+
self.affect_model = pickle.load(f)
|
| 325 |
+
with open('monotone_model.pkl', 'rb') as f:
|
| 326 |
+
self.monotone_model = pickle.load(f)
|
| 327 |
+
with open('energy_model.pkl', 'rb') as f:
|
| 328 |
+
self.energy_model = pickle.load(f)
|
| 329 |
+
with open('scaler.pkl', 'rb') as f:
|
| 330 |
+
self.scaler = pickle.load(f)
|
| 331 |
+
print("β
Loaded pre-trained models")
|
| 332 |
+
return
|
| 333 |
+
except Exception:
|
| 334 |
+
pass
|
| 335 |
+
|
| 336 |
+
# Create demo models (for demonstration without training)
|
| 337 |
+
print("βΉοΈ Creating demo models (for demonstration)")
|
| 338 |
+
|
| 339 |
+
n_features = 54 # 20*2 MFCC + 14 other features
|
| 340 |
+
|
| 341 |
+
# Emotion classifier
|
| 342 |
+
self.emotion_model = RandomForestClassifier(
|
| 343 |
+
n_estimators=100,
|
| 344 |
+
max_depth=10,
|
| 345 |
+
random_state=42
|
| 346 |
+
)
|
| 347 |
+
|
| 348 |
+
# Regression models
|
| 349 |
+
self.affect_model = GradientBoostingRegressor(n_estimators=50, random_state=42)
|
| 350 |
+
self.monotone_model = GradientBoostingRegressor(n_estimators=50, random_state=42)
|
| 351 |
+
self.energy_model = GradientBoostingRegressor(n_estimators=50, random_state=42)
|
| 352 |
+
|
| 353 |
+
# Scaler
|
| 354 |
+
self.scaler = StandardScaler()
|
| 355 |
+
|
| 356 |
+
# Fit with dummy data (for demo purposes)
|
| 357 |
+
X_dummy = np.random.randn(100, n_features)
|
| 358 |
+
y_emotion_dummy = np.random.randint(0, 8, 100)
|
| 359 |
+
y_reg_dummy = np.random.rand(100)
|
| 360 |
+
|
| 361 |
+
self.scaler.fit(X_dummy)
|
| 362 |
+
self.emotion_model.fit(X_dummy, y_emotion_dummy)
|
| 363 |
+
self.affect_model.fit(X_dummy, y_reg_dummy)
|
| 364 |
+
self.monotone_model.fit(X_dummy, y_reg_dummy)
|
| 365 |
+
self.energy_model.fit(X_dummy, y_reg_dummy)
|
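# --- Illustrative sketch (not part of this commit) ---
# A possible offline training script that would produce the pickles looked for in
# _initialize_models, assuming you already have a labelled feature matrix X
# (n_samples x 54) built with AudioFeatureExtractor plus emotion/affect/monotone/
# energy targets; the output file names match what the loader expects, everything
# else (input .npy files, targets) is assumed.
import pickle
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler

X = np.load("features.npy")                      # assumed: (n_samples, 54)
y_emotion = np.load("emotion_labels.npy")        # assumed: ints 0-7
targets = {name: np.load(f"{name}_targets.npy")  # assumed: floats in [0, 1]
           for name in ("affect", "monotone", "energy")}

scaler = StandardScaler().fit(X)
Xs = scaler.transform(X)

artifacts = {"scaler.pkl": scaler,
             "emotion_classifier.pkl": RandomForestClassifier(
                 n_estimators=100, max_depth=10, random_state=42).fit(Xs, y_emotion)}
for name, y in targets.items():
    artifacts[f"{name}_model.pkl"] = GradientBoostingRegressor(
        n_estimators=50, random_state=42).fit(Xs, y)

for filename, obj in artifacts.items():
    with open(filename, "wb") as f:
        pickle.dump(obj, f)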
| 366 |
|
| 367 |
+
def predict(self, audio_path):
|
| 368 |
"""Predict emotion and mental health indicators"""
|
| 369 |
|
| 370 |
# Extract features
|
| 371 |
+
feature_dict = self.extractor.extract_all_features(audio_path)
|
| 372 |
+
features = feature_dict['features'].reshape(1, -1)
|
|
|
|
| 373 |
|
| 374 |
+
# Scale features
|
| 375 |
+
features_scaled = self.scaler.transform(features)
|
|
|
|
| 376 |
|
| 377 |
+
# Predict emotion
|
| 378 |
+
emotion_probs = self.emotion_model.predict_proba(features_scaled)[0]
|
| 379 |
+
emotion_idx = np.argmax(emotion_probs)
|
| 380 |
+
emotion = self.emotions[emotion_idx]
|
| 381 |
+
confidence = emotion_probs[emotion_idx]
|
| 382 |
|
| 383 |
+
# Predict regression outputs
|
| 384 |
+
vocal_affect = np.clip(self.affect_model.predict(features_scaled)[0], 0, 1)
|
| 385 |
+
monotone_score = np.clip(self.monotone_model.predict(features_scaled)[0], 0, 1)
|
| 386 |
+
vocal_energy = np.clip(self.energy_model.predict(features_scaled)[0], 0, 1)
|
| 387 |
+
|
| 388 |
+
# Adjust with extracted features for better estimates
|
| 389 |
+
vocal_affect = (vocal_affect + feature_dict['vocal_affect_score']) / 2
|
| 390 |
+
monotone_score = (monotone_score + feature_dict['monotone_score']) / 2
|
| 391 |
+
vocal_energy = (vocal_energy + feature_dict['vocal_energy_score']) / 2
|
| 392 |
|
| 393 |
# Mental health interpretation
|
| 394 |
+
indicators = self._interpret_mental_health(monotone_score, vocal_affect, vocal_energy)
|
|
|
|
|
|
|
| 395 |
|
| 396 |
+
return {
|
| 397 |
'emotion': emotion,
|
| 398 |
'confidence': confidence,
|
| 399 |
'emotion_probabilities': {
|
| 400 |
+
self.emotions[i]: prob for i, prob in enumerate(emotion_probs)
|
|
|
|
| 401 |
},
|
| 402 |
'vocal_affect_score': vocal_affect,
|
| 403 |
'monotone_speech_score': monotone_score,
|
| 404 |
'vocal_energy_score': vocal_energy,
|
| 405 |
'pitch_variability': feature_dict['pitch_variability'],
|
| 406 |
'energy_level': feature_dict['energy_level'],
|
| 407 |
+
'mental_health_indicators': indicators
|
| 408 |
}
|
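# --- Illustrative usage sketch (not part of this commit) ---
# Assuming app.py is importable and a local speech clip sample.wav exists:
from app import EmotionPredictor

predictor = EmotionPredictor()            # falls back to demo models without .pkl files
results = predictor.predict("sample.wav")
print(results["emotion"], f"{results['confidence']:.2f}")
for line in results["mental_health_indicators"]:
    print(line)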
|
|
|
|
|
|
| 409 |
|
| 410 |
def _interpret_mental_health(self, monotone, affect, energy):
|
| 411 |
"""Interpret mental health indicators"""
|
|
|
|
| 423 |
if affect > 0.6 and monotone < 0.4:
|
| 424 |
indicators.append("β οΈ High vocal affect - possible emotional stress")
|
| 425 |
|
| 426 |
+
if 0.35 <= monotone <= 0.65 and 0.35 <= affect <= 0.65 and 0.35 <= energy <= 0.65:
|
| 427 |
+
indicators.append("β
Balanced vocal characteristics")
|
| 428 |
|
| 429 |
if not indicators:
|
| 430 |
indicators.append("βΉοΈ Vocal patterns within normal range")
|
|
|
|
| 436 |
# GRADIO INTERFACE
|
| 437 |
# ============================================
|
| 438 |
|
| 439 |
+
def create_app():
|
| 440 |
+
"""Create Gradio app"""
|
| 441 |
|
|
|
|
|
|
|
| 442 |
predictor = EmotionPredictor()
|
|
|
|
| 443 |
|
| 444 |
+
def analyze_audio(audio):
|
| 445 |
+
"""Analysis function"""
|
| 446 |
if audio is None:
|
| 447 |
+
return "β Please upload an audio file", "", "", "", "", ""
|
| 448 |
|
| 449 |
try:
|
|
|
|
| 450 |
results = predictor.predict(audio)
|
| 451 |
|
| 452 |
# Format emotion output
|
| 453 |
+
emotion_text = f"## π **{results['emotion'].upper()}**\n\n"
|
| 454 |
emotion_text += f"**Confidence:** {results['confidence']*100:.1f}%\n\n"
|
| 455 |
+
emotion_text += "### Probability Distribution:\n"
|
| 456 |
|
| 457 |
for emotion, prob in sorted(results['emotion_probabilities'].items(),
|
| 458 |
key=lambda x: x[1], reverse=True):
|
| 459 |
+
bar = "β" * int(prob * 20) + "β" * (20 - int(prob * 20))
|
| 460 |
+
emotion_text += f"**{emotion.title()}:** {bar} {prob*100:.1f}%\n"
|
|
|
|
| 461 |
|
| 462 |
# Format scores
|
| 463 |
+
affect = f"**Score:** {results['vocal_affect_score']:.3f}\n\n"
|
| 464 |
if results['vocal_affect_score'] > 0.7:
|
| 465 |
+
affect += "π΄ High intensity"
|
| 466 |
elif results['vocal_affect_score'] < 0.3:
|
| 467 |
+
affect += "π’ Low intensity"
|
| 468 |
else:
|
| 469 |
+
affect += "π‘ Moderate"
|
| 470 |
|
| 471 |
+
monotone = f"**Score:** {results['monotone_speech_score']:.3f}\n\n"
|
| 472 |
if results['monotone_speech_score'] > 0.7:
|
| 473 |
+
monotone += "π΄ Very flat speech"
|
| 474 |
elif results['monotone_speech_score'] < 0.3:
|
| 475 |
+
monotone += "π’ Varied pitch"
|
| 476 |
else:
|
| 477 |
+
monotone += "π‘ Moderate variation"
|
| 478 |
|
| 479 |
+
energy = f"**Score:** {results['vocal_energy_score']:.3f}\n\n"
|
| 480 |
if results['vocal_energy_score'] > 0.7:
|
| 481 |
+
energy += "π΄ High energy"
|
| 482 |
elif results['vocal_energy_score'] < 0.3:
|
| 483 |
+
energy += "π΄ Low energy"
|
| 484 |
else:
|
| 485 |
+
energy += "π’ Normal energy"
|
| 486 |
|
| 487 |
+
details = f"**Pitch Variability:** {results['pitch_variability']:.2f} Hz\n"
|
| 488 |
+
details += f"**Energy Level:** {results['energy_level']:.3f}"
|
| 489 |
|
| 490 |
+
mental = "\n".join(results['mental_health_indicators'])
|
| 491 |
|
| 492 |
+
return emotion_text, affect, monotone, energy, details, mental
|
| 493 |
|
| 494 |
except Exception as e:
|
| 495 |
+
return f"β Error: {str(e)}", "", "", "", "", ""
|
| 496 |
|
| 497 |
# Create interface
|
| 498 |
+
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
|
|
| 499 |
gr.Markdown("""
|
| 500 |
# 🎙️ Audio Emotion & Mental Health Detection
|
| 501 |
|
| 502 |
+
Analyze emotional state and mental health indicators from speech audio.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 503 |
""")
|
| 504 |
|
| 505 |
with gr.Row():
|
| 506 |
+
with gr.Column():
|
| 507 |
+
audio = gr.Audio(type="filepath", label="Upload Audio")
|
| 508 |
+
btn = gr.Button("π Analyze", variant="primary", size="lg")
|
| 509 |
|
| 510 |
+
with gr.Column():
|
| 511 |
+
emotion_out = gr.Markdown()
|
| 512 |
|
| 513 |
with gr.Row():
|
| 514 |
+
affect_out = gr.Markdown()
|
| 515 |
+
monotone_out = gr.Markdown()
|
| 516 |
+
energy_out = gr.Markdown()
|
|
|
|
|
|
|
|
|
|
| 517 |
|
| 518 |
+
details_out = gr.Markdown()
|
| 519 |
+
mental_out = gr.Markdown()
|
| 520 |
|
| 521 |
gr.Markdown("""
|
| 522 |
+
### 📊 Interpretation
|
| 523 |
+
|
| 524 |
+
- **Vocal Affect:** Emotional intensity (0=calm, 1=intense)
|
| 525 |
+
- **Monotone Score:** Pitch flatness (high=depression risk)
|
| 526 |
+
- **Vocal Energy:** Speaking energy (low=low motivation)
|
| 527 |
+
|
| 528 |
+
⚠️ **Disclaimer:** For research only, not medical diagnosis.
|
| 529 |
""")
|
| 530 |
|
| 531 |
+
btn.click(
|
| 532 |
+
analyze_audio,
|
| 533 |
+
inputs=audio,
|
| 534 |
+
outputs=[emotion_out, affect_out, monotone_out, energy_out, details_out, mental_out]
|
|
|
|
|
|
|
| 535 |
)
|
| 536 |
|
| 537 |
return demo
|
| 538 |
|
| 539 |
|
| 540 |
# ============================================
|
| 541 |
+
# MAIN
|
| 542 |
# ============================================
|
| 543 |
|
| 544 |
if __name__ == "__main__":
|
| 545 |
+
app = create_app()
|
| 546 |
+
app.launch()
|
|
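# --- Illustrative note (not part of this commit) ---
# A requirements.txt consistent with the imports above might list (unpinned,
# versions are an assumption): numpy, scipy, scikit-learn, gradio, and optionally
# librosa; the scipy fallback path above is used when librosa is absent.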