#!/usr/bin/env python3
"""
Audio Emotion & Mental Health Detection Model
Lightweight version for Hugging Face Spaces
Using scikit-learn instead of PyTorch
"""

import os
import numpy as np
import gradio as gr
from typing import Dict
import warnings
import pickle
warnings.filterwarnings('ignore')

# Audio processing
try:
    import librosa
    LIBROSA_AVAILABLE = True
except ImportError:
    LIBROSA_AVAILABLE = False
    print("⚠️ Librosa not available, using scipy")

from scipy.io import wavfile
import scipy.signal as signal
from scipy import fft

# Machine Learning
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier, MLPRegressor

# ============================================
# AUDIO PROCESSING
# ============================================

class AudioFeatureExtractor:
    """Extract audio features without heavy dependencies"""
    
    def __init__(self, sr=16000, n_mfcc=20):
        self.sr = sr
        self.n_mfcc = n_mfcc
    
    def load_audio(self, audio_path):
        """Load audio file"""
        try:
            if LIBROSA_AVAILABLE:
                y, sr = librosa.load(audio_path, sr=self.sr, duration=3)
                return y, sr
            else:
                # Use scipy
                sr, y = wavfile.read(audio_path)
                
                # Convert to mono
                if len(y.shape) > 1:
                    y = y.mean(axis=1)
                
                # Normalize
                y = y.astype(np.float32)
                if np.max(np.abs(y)) > 0:
                    y = y / np.max(np.abs(y))
                
                # Resample if needed
                if sr != self.sr:
                    num_samples = int(len(y) * self.sr / sr)
                    y = signal.resample(y, num_samples)
                
                # Limit to 3 seconds
                max_len = 3 * self.sr
                if len(y) > max_len:
                    y = y[:max_len]
                
                return y, self.sr
        except Exception as e:
            print(f"Error loading audio: {e}")
            return np.random.randn(self.sr * 3) * 0.1, self.sr
    
    def get_mfcc_simple(self, y):
        """Simplified MFCC extraction"""
        # Pre-emphasis
        y_emphasized = np.append(y[0], y[1:] - 0.97 * y[:-1])
        
        # Framing
        frame_length = int(0.025 * self.sr)
        frame_step = int(0.01 * self.sr)
        
        num_frames = 1 + int((len(y_emphasized) - frame_length) / frame_step)
        frames = np.zeros((num_frames, frame_length))
        
        for i in range(num_frames):
            start = i * frame_step
            frames[i] = y_emphasized[start:start + frame_length]
        
        # Apply window
        frames *= np.hamming(frame_length)
        
        # FFT
        mag_frames = np.absolute(np.fft.rfft(frames, frame_length))
        pow_frames = ((1.0 / frame_length) * (mag_frames ** 2))
        
        # Mel filterbank
        nfft = frame_length
        nfilt = 26
        low_freq_mel = 0
        high_freq_mel = 2595 * np.log10(1 + (self.sr / 2) / 700)
        mel_points = np.linspace(low_freq_mel, high_freq_mel, nfilt + 2)
        hz_points = 700 * (10**(mel_points / 2595) - 1)
        bin_points = np.floor((nfft + 1) * hz_points / self.sr).astype(int)
        
        fbank = np.zeros((nfilt, int(nfft / 2 + 1)))
        for m in range(1, nfilt + 1):
            f_m_minus = bin_points[m - 1]
            f_m = bin_points[m]
            f_m_plus = bin_points[m + 1]
            
            for k in range(f_m_minus, f_m):
                fbank[m - 1, k] = (k - bin_points[m - 1]) / (bin_points[m] - bin_points[m - 1])
            for k in range(f_m, f_m_plus):
                fbank[m - 1, k] = (bin_points[m + 1] - k) / (bin_points[m + 1] - bin_points[m])
        
        filter_banks = np.dot(pow_frames, fbank.T)
        filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks)
        filter_banks = 20 * np.log10(filter_banks)
        
        # DCT
        mfcc = fft.dct(filter_banks, type=2, axis=1, norm='ortho')[:, :self.n_mfcc]
        
        return mfcc.T
    
    def extract_pitch(self, y):
        """Extract pitch using autocorrelation"""
        pitch_values = []
        frame_length = int(0.03 * self.sr)
        hop_length = int(0.01 * self.sr)
        
        for i in range(0, len(y) - frame_length, hop_length):
            frame = y[i:i+frame_length]
            
            # Autocorrelation
            corr = np.correlate(frame, frame, mode='full')
            corr = corr[len(corr)//2:]
            
            # Find first peak after lag 0
            d = np.diff(corr)
            start = int(self.sr / 400)  # Min 400 Hz
            peak = np.where(d[start:] < 0)[0]
            
            if len(peak) > 0:
                peak_idx = peak[0] + start
                if peak_idx > 0:
                    freq = self.sr / peak_idx
                    if 50 < freq < 400:
                        pitch_values.append(freq)
        
        return pitch_values if pitch_values else [150.0]
    
    def extract_energy(self, y):
        """Extract RMS energy"""
        frame_length = int(0.025 * self.sr)
        hop_length = int(0.01 * self.sr)
        
        rms = []
        for i in range(0, len(y) - frame_length, hop_length):
            frame = y[i:i+frame_length]
            rms.append(np.sqrt(np.mean(frame**2)))
        
        return np.array(rms)
    
    def extract_zcr(self, y):
        """Zero crossing rate"""
        frame_length = int(0.025 * self.sr)
        hop_length = int(0.01 * self.sr)
        
        zcr = []
        for i in range(0, len(y) - frame_length, hop_length):
            frame = y[i:i+frame_length]
            crossings = np.sum(np.abs(np.diff(np.sign(frame)))) / 2
            zcr.append(crossings / frame_length)
        
        return np.array(zcr)
    
    def extract_spectral_features(self, y):
        """Spectral features"""
        spectrum = np.fft.rfft(y)
        magnitude = np.abs(spectrum)
        freq = np.fft.rfftfreq(len(y), 1.0/self.sr)
        
        # Spectral centroid
        centroid = np.sum(freq * magnitude) / (np.sum(magnitude) + 1e-6)
        
        # Spectral rolloff
        cumsum = np.cumsum(magnitude)
        rolloff_idx = np.where(cumsum >= 0.85 * cumsum[-1])[0]
        rolloff = freq[rolloff_idx[0]] if len(rolloff_idx) > 0 else 0
        
        # Spectral bandwidth
        bandwidth = np.sqrt(np.sum(((freq - centroid)**2) * magnitude) / (np.sum(magnitude) + 1e-6))
        
        return centroid, rolloff, bandwidth
    
    def extract_all_features(self, audio_path):
        """Extract all features"""
        try:
            y, sr = self.load_audio(audio_path)
            
            # MFCCs
            mfcc = self.get_mfcc_simple(y)
            mfcc_mean = np.mean(mfcc, axis=1)
            mfcc_std = np.std(mfcc, axis=1)
            
            # Pitch
            pitch_values = self.extract_pitch(y)
            pitch_mean = np.mean(pitch_values)
            pitch_std = np.std(pitch_values)
            pitch_min = np.min(pitch_values)
            pitch_max = np.max(pitch_values)
            monotone_score = 1.0 / (1.0 + pitch_std/10.0)
            
            # Energy
            rms = self.extract_energy(y)
            energy_mean = np.mean(rms)
            energy_std = np.std(rms)
            energy_max = np.max(rms)
            
            # ZCR
            zcr = self.extract_zcr(y)
            zcr_mean = np.mean(zcr)
            zcr_std = np.std(zcr)
            
            # Spectral
            spec_centroid, spec_rolloff, spec_bandwidth = self.extract_spectral_features(y)
            
            # Tempo estimation
            onset_env = rms
            tempo = 120.0  # Default
            if len(onset_env) > 10:
                autocorr = np.correlate(onset_env, onset_env, mode='full')
                autocorr = autocorr[len(autocorr)//2:]
                peaks = signal.find_peaks(autocorr)[0]
                if len(peaks) > 0 and peaks[0] > 0:
                    tempo = 60.0 / (peaks[0] * 0.01)
                    tempo = np.clip(tempo, 60, 180)
            
            # Combine features
            features = np.concatenate([
                mfcc_mean,
                mfcc_std,
                [pitch_mean, pitch_std, pitch_min, pitch_max, monotone_score],
                [energy_mean, energy_std, energy_max],
                [zcr_mean, zcr_std],
                [spec_centroid, spec_rolloff, spec_bandwidth],
                [tempo]
            ])
            
            # Derived scores
            vocal_affect = self._calc_affect(pitch_std, energy_std, spec_centroid)
            vocal_energy = self._calc_energy(energy_mean, tempo, zcr_mean)
            
            return {
                'features': features.astype(np.float32),
                'vocal_affect_score': float(vocal_affect),
                'monotone_score': float(monotone_score),
                'vocal_energy_score': float(vocal_energy),
                'pitch_variability': float(pitch_std),
                'energy_level': float(energy_mean)
            }
            
        except Exception as e:
            print(f"Error: {e}")
            return self._default_features()
    
    def _calc_affect(self, pitch_std, energy_std, spec_centroid):
        """Calculate vocal affect score"""
        pitch_comp = min(pitch_std / 50.0, 1.0)
        energy_comp = min(energy_std / 0.3, 1.0)
        spec_comp = min(spec_centroid / 2000.0, 1.0)
        return np.clip(pitch_comp * 0.4 + energy_comp * 0.4 + spec_comp * 0.2, 0, 1)
    
    def _calc_energy(self, energy_mean, tempo, zcr_mean):
        """Calculate vocal energy score"""
        energy_comp = min(energy_mean / 0.5, 1.0)
        tempo_comp = min(tempo / 150.0, 1.0)
        zcr_comp = min(zcr_mean / 0.15, 1.0)
        return np.clip(energy_comp * 0.5 + tempo_comp * 0.3 + zcr_comp * 0.2, 0, 1)
    
    def _default_features(self):
        """Default features for errors"""
        n_features = self.n_mfcc * 2 + 14
        return {
            'features': np.random.randn(n_features).astype(np.float32) * 0.1,
            'vocal_affect_score': 0.5,
            'monotone_score': 0.5,
            'vocal_energy_score': 0.5,
            'pitch_variability': 30.0,
            'energy_level': 0.3
        }


# ============================================
# EMOTION PREDICTOR
# ============================================

class EmotionPredictor:
    """Lightweight emotion predictor using sklearn"""
    
    def __init__(self):
        self.extractor = AudioFeatureExtractor(sr=16000, n_mfcc=20)
        
        # Emotion mapping
        self.emotions = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']
        
        # Initialize models
        self._initialize_models()
    
    def _initialize_models(self):
        """Initialize pre-trained or demo models"""
        
        # Try to load pre-trained models
        if os.path.exists('emotion_classifier.pkl'):
            try:
                with open('emotion_classifier.pkl', 'rb') as f:
                    self.emotion_model = pickle.load(f)
                with open('affect_model.pkl', 'rb') as f:
                    self.affect_model = pickle.load(f)
                with open('monotone_model.pkl', 'rb') as f:
                    self.monotone_model = pickle.load(f)
                with open('energy_model.pkl', 'rb') as f:
                    self.energy_model = pickle.load(f)
                with open('scaler.pkl', 'rb') as f:
                    self.scaler = pickle.load(f)
                print("✅ Loaded pre-trained models")
                return
            except:
                pass
        
        # Create demo models (for demonstration without training)
        print("ℹ️ Creating demo models (for demonstration)")
        
        n_features = 54  # 20*2 MFCC + 14 other features
        
        # Emotion classifier
        self.emotion_model = RandomForestClassifier(
            n_estimators=100,
            max_depth=10,
            random_state=42
        )
        
        # Regression models
        self.affect_model = GradientBoostingRegressor(n_estimators=50, random_state=42)
        self.monotone_model = GradientBoostingRegressor(n_estimators=50, random_state=42)
        self.energy_model = GradientBoostingRegressor(n_estimators=50, random_state=42)
        
        # Scaler
        self.scaler = StandardScaler()
        
        # Fit with dummy data (for demo purposes)
        X_dummy = np.random.randn(100, n_features)
        y_emotion_dummy = np.random.randint(0, 8, 100)
        y_reg_dummy = np.random.rand(100)
        
        self.scaler.fit(X_dummy)
        self.emotion_model.fit(X_dummy, y_emotion_dummy)
        self.affect_model.fit(X_dummy, y_reg_dummy)
        self.monotone_model.fit(X_dummy, y_reg_dummy)
        self.energy_model.fit(X_dummy, y_reg_dummy)
    
    def predict(self, audio_path):
        """Predict emotion and mental health indicators"""
        
        # Extract features
        feature_dict = self.extractor.extract_all_features(audio_path)
        features = feature_dict['features'].reshape(1, -1)
        
        # Scale features
        features_scaled = self.scaler.transform(features)
        
        # Predict emotion
        emotion_probs = self.emotion_model.predict_proba(features_scaled)[0]
        emotion_idx = np.argmax(emotion_probs)
        emotion = self.emotions[emotion_idx]
        confidence = emotion_probs[emotion_idx]
        
        # Predict regression outputs
        vocal_affect = np.clip(self.affect_model.predict(features_scaled)[0], 0, 1)
        monotone_score = np.clip(self.monotone_model.predict(features_scaled)[0], 0, 1)
        vocal_energy = np.clip(self.energy_model.predict(features_scaled)[0], 0, 1)
        
        # Adjust with extracted features for better estimates
        vocal_affect = (vocal_affect + feature_dict['vocal_affect_score']) / 2
        monotone_score = (monotone_score + feature_dict['monotone_score']) / 2
        vocal_energy = (vocal_energy + feature_dict['vocal_energy_score']) / 2
        
        # Mental health interpretation
        indicators = self._interpret_mental_health(monotone_score, vocal_affect, vocal_energy)
        
        return {
            'emotion': emotion,
            'confidence': confidence,
            'emotion_probabilities': {
                self.emotions[i]: prob for i, prob in enumerate(emotion_probs)
            },
            'vocal_affect_score': vocal_affect,
            'monotone_speech_score': monotone_score,
            'vocal_energy_score': vocal_energy,
            'pitch_variability': feature_dict['pitch_variability'],
            'energy_level': feature_dict['energy_level'],
            'mental_health_indicators': indicators
        }
    
    def _interpret_mental_health(self, monotone, affect, energy):
        """Interpret mental health indicators"""
        indicators = []
        
        if monotone > 0.7:
            indicators.append("⚠️ High monotone score - possible depression indicator")
        
        if affect > 0.7 and energy > 0.7:
            indicators.append("⚠️ High vocal affect and energy - possible anxiety/stress")
        
        if energy < 0.3:
            indicators.append("⚠️ Low vocal energy - possible low motivation/depression")
        
        if affect > 0.6 and monotone < 0.4:
            indicators.append("⚠️ High vocal affect - possible emotional stress")
        
        if 0.35 <= monotone <= 0.65 and 0.35 <= affect <= 0.65 and 0.35 <= energy <= 0.65:
            indicators.append("✅ Balanced vocal characteristics")
        
        if not indicators:
            indicators.append("ℹ️ Vocal patterns within normal range")
        
        return indicators


# ============================================
# GRADIO INTERFACE
# ============================================

def create_app():
    """Create Gradio app"""
    
    predictor = EmotionPredictor()
    
    def analyze_audio(audio):
        """Analysis function"""
        if audio is None:
            return "❌ Please upload an audio file", "", "", "", "", ""
        
        try:
            results = predictor.predict(audio)
            
            # Format emotion output
            emotion_text = f"## 🎭 **{results['emotion'].upper()}**\n\n"
            emotion_text += f"**Confidence:** {results['confidence']*100:.1f}%\n\n"
            emotion_text += "### Probability Distribution:\n"
            
            for emotion, prob in sorted(results['emotion_probabilities'].items(), 
                                       key=lambda x: x[1], reverse=True):
                bar = "█" * int(prob * 20) + "░" * (20 - int(prob * 20))
                emotion_text += f"**{emotion.title()}:** {bar} {prob*100:.1f}%\n"
            
            # Format scores
            affect = f"**Score:** {results['vocal_affect_score']:.3f}\n\n"
            if results['vocal_affect_score'] > 0.7:
                affect += "🔴 High intensity"
            elif results['vocal_affect_score'] < 0.3:
                affect += "🟢 Low intensity"
            else:
                affect += "🟡 Moderate"
            
            monotone = f"**Score:** {results['monotone_speech_score']:.3f}\n\n"
            if results['monotone_speech_score'] > 0.7:
                monotone += "🔴 Very flat speech"
            elif results['monotone_speech_score'] < 0.3:
                monotone += "🟢 Varied pitch"
            else:
                monotone += "🟡 Moderate variation"
            
            energy = f"**Score:** {results['vocal_energy_score']:.3f}\n\n"
            if results['vocal_energy_score'] > 0.7:
                energy += "🔴 High energy"
            elif results['vocal_energy_score'] < 0.3:
                energy += "🔴 Low energy"
            else:
                energy += "🟢 Normal energy"
            
            details = f"**Pitch Variability:** {results['pitch_variability']:.2f} Hz\n"
            details += f"**Energy Level:** {results['energy_level']:.3f}"
            
            mental = "\n".join(results['mental_health_indicators'])
            
            return emotion_text, affect, monotone, energy, details, mental
            
        except Exception as e:
            return f"❌ Error: {str(e)}", "", "", "", "", ""
    
    # Create interface
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # 🎙️ Audio Emotion & Mental Health Detection
        
        Analyze emotional state and mental health indicators from speech audio.
        """)
        
        with gr.Row():
            with gr.Column():
                audio = gr.Audio(type="filepath", label="Upload Audio")
                btn = gr.Button("🔍 Analyze", variant="primary", size="lg")
            
            with gr.Column():
                emotion_out = gr.Markdown()
                
                with gr.Row():
                    affect_out = gr.Markdown()
                    monotone_out = gr.Markdown()
                    energy_out = gr.Markdown()
                
                details_out = gr.Markdown()
                mental_out = gr.Markdown()
        
        gr.Markdown("""
        ### 📊 Interpretation
        
        - **Vocal Affect:** Emotional intensity (0=calm, 1=intense)
        - **Monotone Score:** Pitch flatness (high=depression risk)
        - **Vocal Energy:** Speaking energy (low=low motivation)
        
        ⚠️ **Disclaimer:** For research only, not medical diagnosis.
        """)
        
        btn.click(
            analyze_audio,
            inputs=audio,
            outputs=[emotion_out, affect_out, monotone_out, energy_out, details_out, mental_out]
        )
    
    return demo


# ============================================
# MAIN
# ============================================

if __name__ == "__main__":
    app = create_app()
    app.launch()