"""Streamlit app that classifies the English accent heard in a video's audio.

The app downloads a video (direct file link or any site yt-dlp supports),
extracts a WAV track, runs the SpeechBrain XLSR Wav2Vec 2.0 accent classifier
on it and renders the result.
"""

import os
import streamlit as st
import tempfile
import subprocess
import requests
from urllib.parse import urlparse
import json
import torch
import torchaudio

# Prefer the soundfile backend when torchaudio still exposes the global
# switch; guard the call so a release without it does not crash at import.
if hasattr(torchaudio, "set_audio_backend"):
    torchaudio.set_audio_backend("soundfile")

# Set cache directories for HuggingFace models.
# /tmp is a commonly writable location on Linux/Docker.
cache_dir = "/tmp/hf_cache"
os.makedirs(cache_dir, exist_ok=True)
os.environ["HF_HOME"] = cache_dir
os.environ["HUGGINGFACE_HUB_CACHE"] = cache_dir

# --- Accent Model Cache Directory ---
# Dedicated directory for SpeechBrain models inside the writable cache.
speechbrain_model_cache_base = os.path.join(cache_dir, "speechbrain_models_accent_id")
os.makedirs(speechbrain_model_cache_base, exist_ok=True)

# Full path that ``savedir`` points to for this specific model.
model_save_path = os.path.join(
    speechbrain_model_cache_base, "accent-id-commonaccent_xlsr-en-english"
)
os.makedirs(model_save_path, exist_ok=True)
# --- End Accent Model Cache Directory ---

# Try importing the accent detection model loader. The newer
# ``speechbrain.inference`` path is tried first, then the legacy
# ``speechbrain.pretrained`` path this file originally used.
# No Streamlit call is made here: the "not installed" message is emitted from
# main(), after st.set_page_config().
try:
    try:
        from speechbrain.inference.interfaces import foreign_class
    except ImportError:
        from speechbrain.pretrained.interfaces import foreign_class
    MODEL_AVAILABLE = True
except ImportError:
    MODEL_AVAILABLE = False

# Display names for the accent classes. Used as a fallback label order when
# the model's own label encoder cannot be read.
ACCENT_CATEGORIES = [
    "US", "England", "Australia", "Indian", "Canada", "Scotland", "Ireland",
    "Wales", "African", "NewZealand", "Bermuda", "Malaysia", "Philippines",
    "Singapore", "HongKong", "SouthAtlantic",
]

# File extensions treated as a directly downloadable video file.
_DIRECT_VIDEO_EXTENSIONS = ('.mp4', '.avi', '.mov', '.mkv')


@st.cache_resource
def _load_cached_accent_model():
    """Load the classifier once per process; raises on failure.

    Raising (instead of returning None) matters: a None return value would be
    stored by ``st.cache_resource`` and every later rerun would keep failing
    even after the underlying problem was fixed.
    """
    st.info(f"Attempting to load model from: {model_save_path}")
    return foreign_class(
        source="Jzuluaga/accent-id-commonaccent_xlsr-en-english",
        pymodule_file="custom_interface.py",
        classname="CustomEncoderWav2vec2Classifier",
        savedir=model_save_path,
    )


def load_accent_model():
    """Load the XLSR Wav2Vec 2.0 accent classification model.

    Returns:
        The loaded classifier, or None when loading failed (the error is
        reported through ``st.error``).
    """
    try:
        return _load_cached_accent_model()
    except Exception as e:  # UI boundary: report any failure and carry on
        st.error(
            f"Failed to load accent model: {e} - "
            f"Please ensure '{model_save_path}' is writable."
        )
        return None


def download_video_audio(url, output_path):
    """Download a video and extract its audio track to ``output_path``.

    Direct video links are fetched with ``requests`` and converted with
    ffmpeg (16 kHz, mono). Everything else is delegated to yt-dlp.

    Args:
        url: Video URL entered by the user.
        output_path: Destination WAV file path.

    Returns:
        True when a non-empty audio file was produced, False otherwise
        (the reason is reported through ``st.error``).
    """
    stem, _ = os.path.splitext(output_path)
    try:
        # Look at the URL path as well as the raw URL so links with a query
        # string or upper-case extension are still recognised as direct files.
        url_path = urlparse(url).path.lower()
        is_direct = (
            url.endswith(_DIRECT_VIDEO_EXTENSIONS)
            or url_path.endswith(_DIRECT_VIDEO_EXTENSIONS)
        )

        if is_direct:
            temp_video = stem + '.mp4'
            try:
                with requests.get(url, stream=True, timeout=30) as response:
                    response.raise_for_status()
                    with open(temp_video, 'wb') as f:
                        for chunk in response.iter_content(chunk_size=8192):
                            f.write(chunk)

                # Extract audio using ffmpeg
                cmd = [
                    'ffmpeg', '-i', temp_video,
                    '-ar', '16000', '-ac', '1', '-y', output_path,
                ]
                subprocess.run(cmd, check=True, capture_output=True)
            finally:
                # Remove the intermediate video even if download/ffmpeg failed.
                if os.path.exists(temp_video):
                    os.remove(temp_video)
            return True

        # Use yt-dlp for other video platforms (Loom, YouTube, etc.).
        # "--" ends option parsing so a URL starting with "-" cannot be
        # interpreted as a yt-dlp option.
        cmd = [
            'yt-dlp',
            '--extract-audio',
            '--audio-format', 'wav',
            '--audio-quality', '0',
            '--output', stem + '.%(ext)s',
            '--', url,
        ]
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode != 0:
            st.error(f"yt-dlp error: {result.stderr}")
            return False
        if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
            st.error("yt-dlp error: no audio file was produced")
            return False
        return True

    except subprocess.CalledProcessError as e:
        stderr = e.stderr
        if isinstance(stderr, bytes):
            stderr = stderr.decode(errors="replace")
        st.error(f"Download failed: {e}\n{stderr or ''}")
        return False
    except Exception as e:  # UI boundary: network, filesystem, missing tools
        st.error(f"Download failed: {e}")
        return False


def _class_labels(model, count):
    """Return ``count`` class labels ordered by the model's output index.

    The index-to-label mapping is read from ``model.hparams.label_encoder``
    when it is available, so probabilities are not paired with a hard-coded
    order that may differ from the one the model was trained with. Labels are
    mapped to the ACCENT_CATEGORIES display spelling when they match
    case-insensitively. Falls back to ACCENT_CATEGORIES order otherwise.
    """
    encoder = getattr(getattr(model, "hparams", None), "label_encoder", None)
    ind2lab = getattr(encoder, "ind2lab", None)
    if isinstance(ind2lab, dict) and all(i in ind2lab for i in range(count)):
        pretty = {name.lower(): name for name in ACCENT_CATEGORIES}
        raw = [str(ind2lab[i]) for i in range(count)]
        return [pretty.get(label.lower(), label) for label in raw]
    return ACCENT_CATEGORIES[:count]


def analyze_accent(audio_file_path, model):
    """Analyze accent using the XLSR Wav2Vec 2.0 model.

    Args:
        audio_file_path: Path of the audio file to classify.
        model: Classifier exposing ``classify_file``.

    Returns:
        ``(predicted_accent, confidence, accent_scores)`` where confidence and
        the per-accent scores are the model outputs multiplied by 100, or
        ``(None, None, None)`` when classification failed.
    """
    try:
        out_prob, score, index, text_lab = model.classify_file(audio_file_path)

        # detach/cpu so this also works for tensors that track gradients or
        # live on an accelerator.
        probs = out_prob.detach().cpu().reshape(-1).tolist()
        labels = _class_labels(model, len(probs))
        accent_scores = {
            label: float(prob) * 100 for label, prob in zip(labels, probs)
        }

        # classify_file may hand back the label wrapped in a list; the UI
        # needs a plain string.
        if isinstance(text_lab, (list, tuple)):
            predicted_accent = str(text_lab[0]) if text_lab else None
        else:
            predicted_accent = text_lab
        confidence = float(score) * 100

        return predicted_accent, confidence, accent_scores

    except Exception as e:  # UI boundary: report and signal failure
        st.error(f"Accent analysis failed: {e}")
        return None, None, None


def generate_summary(accent, confidence, top_scores):
    """Generate a Markdown summary of the accent analysis.

    Args:
        accent: Label of the predicted accent.
        confidence: Confidence of the prediction, in percent.
        top_scores: Mapping of accent label to score in percent.

    Returns:
        Markdown text with the primary accent, a confidence level, up to
        three best-scoring accents and a recommendation line.
    """
    if confidence > 80:
        confidence_level = "Very High"
    elif confidence > 60:
        confidence_level = "High"
    elif confidence > 40:
        confidence_level = "Moderate"
    else:
        confidence_level = "Low"

    # Up to three best accents; fewer entries no longer raise IndexError.
    top_3 = sorted(top_scores.items(), key=lambda x: x[1], reverse=True)[:3]
    ranking = "\n".join(
        f"{rank}. {name}: {value:.1f}%"
        for rank, (name, value) in enumerate(top_3, start=1)
    )

    if confidence > 70:
        recommendation = "✅ Strong English accent detected - Suitable for English-speaking roles"
    elif confidence > 50:
        recommendation = "⚠️ Moderate English accent detected - May require accent assessment"
    else:
        recommendation = "❌ Weak English accent signal - Further evaluation recommended"

    return (
        f"\n**Primary Accent:** {accent} ({confidence:.1f}% confidence)\n\n"
        f"**Confidence Level:** {confidence_level}\n\n"
        f"**Top 3 Detected Accents:**\n{ranking}\n\n"
        f"**Hiring Recommendation:**\n{recommendation}"
    )


def _render_results(accent, confidence, accent_scores):
    """Render the metrics, probability bars, summary and JSON for one result."""
    st.markdown("---")
    st.subheader("📊 Analysis Results")

    # Main result
    col1, col2 = st.columns(2)
    with col1:
        st.metric(
            label="🎯 Detected Accent",
            value=accent,
            help="Primary English accent classification",
        )
    with col2:
        st.metric(
            label="🎪 Confidence Score",
            value=f"{confidence:.1f}%",
            help="Model confidence in the prediction",
        )

    ranked = sorted(accent_scores.items(), key=lambda x: x[1], reverse=True)

    # Detailed breakdown: top 8 accents
    st.subheader("📈 Accent Probability Breakdown")
    for accent_name, score in ranked[:8]:
        # st.progress only accepts values in [0, 1]; clamp to stay in range.
        fraction = min(max(score / 100, 0.0), 1.0)
        st.progress(fraction, text=f"{accent_name}: {score:.1f}%")

    # Summary
    st.subheader("📝 Assessment Summary")
    st.markdown(generate_summary(accent, confidence, accent_scores))

    # JSON output for API integration
    with st.expander("🔧 JSON Output (for API integration)"):
        if confidence > 70:
            recommendation = "suitable"
        elif confidence > 50:
            recommendation = "assessment_needed"
        else:
            recommendation = "further_evaluation"
        result_json = {
            "primary_accent": accent,
            "confidence_score": round(confidence, 1),
            "accent_probabilities": {
                k: round(v, 1) for k, v in accent_scores.items()
            },
            "top_3_accents": [
                {"accent": k, "probability": round(v, 1)} for k, v in ranked[:3]
            ],
            "recommendation": recommendation,
        }
        st.json(result_json)


def main():
    """Run the Streamlit page: load the model, take a URL, show the analysis."""
    st.set_page_config(
        page_title="English Accent Detector",
        page_icon="🗣️",
        layout="wide",
    )

    st.title("🗣️ English Accent Detection Tool")
    st.subheader("For Hiring & Language Assessment")

    st.markdown("""
    **Purpose:** Analyze spoken English accents from video URLs to assist in hiring decisions.

    **Supported:** Loom videos, direct MP4 links, YouTube, and other video platforms.
    """)

    # Load model
    if not MODEL_AVAILABLE:
        st.error("SpeechBrain not available. Install with: pip install speechbrain")
        st.stop()

    with st.spinner("Loading XLSR Wav2Vec 2.0 model..."):
        model = load_accent_model()

    if model is None:
        st.error("❌ Could not load accent detection model")
        st.stop()

    st.success("✅ Accent detection model loaded successfully!")

    # Input section
    st.markdown("---")
    st.subheader("📥 Video Input")

    video_url = st.text_input(
        "Enter Video URL",
        placeholder="https://www.loom.com/share/... or direct MP4 link",
        help="Supports Loom, YouTube, direct video files, and most video platforms",
    )

    if video_url:
        st.info(f"🔗 **URL:** {video_url}")

        if st.button("🎯 Analyze Accent", type="primary"):
            # Reserve a temporary .wav path; the file is closed immediately so
            # ffmpeg / yt-dlp can write to it.
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
                audio_path = tmp_file.name

            try:
                # Step 1: Download and extract audio
                with st.spinner("📥 Downloading video and extracting audio..."):
                    if download_video_audio(video_url, audio_path):
                        st.success("✅ Audio extracted successfully")

                        # Play the extracted audio
                        with open(audio_path, 'rb') as audio_file:
                            st.audio(audio_file.read(), format="audio/wav")
                    else:
                        st.error("❌ Failed to extract audio")
                        st.stop()

                # Step 2: Analyze accent
                with st.spinner("🧠 Analyzing accent with XLSR Wav2Vec 2.0..."):
                    accent, confidence, accent_scores = analyze_accent(audio_path, model)

                if accent:
                    _render_results(accent, confidence, accent_scores)
                else:
                    st.error("❌ Accent analysis failed")

            finally:
                # Cleanup (also runs when st.stop() interrupts the script)
                if os.path.exists(audio_path):
                    os.remove(audio_path)

    # Footer
    st.markdown("---")
    st.markdown("""
    **Technical Details:**
    - Model: XLSR Wav2Vec 2.0 (95% accuracy on English accents)
    - Supports: 16 English accent varieties
    - Processing: Automatic audio extraction and resampling to 16kHz

    **Built for hiring teams to assess English language proficiency**
    """)


if __name__ == "__main__":
    main()