# accent-classification / src/streamlit_app.py
# Page header residue: Mohamedenzeyad - "Update src/streamlit_app.py" (50843ef, verified)
import os
import streamlit as st
import tempfile
import subprocess
import requests
from urllib.parse import urlparse
import json
import torch
import torchaudio
# Select the "soundfile" audio backend where torchaudio still exposes the
# global backend switch. ``set_audio_backend`` is a deprecated API; the guard
# keeps start-up from failing with AttributeError on torchaudio builds that
# no longer export it.
_set_audio_backend = getattr(torchaudio, "set_audio_backend", None)
if callable(_set_audio_backend):
    _set_audio_backend("soundfile")

# Point the Hugging Face caches at a directory that is created here, so it is
# known to exist before any model download starts.
cache_dir = "/tmp/hf_cache"  # commonly writable on Linux/Docker hosts
os.makedirs(cache_dir, exist_ok=True)
os.environ["HF_HOME"] = cache_dir
os.environ["HUGGINGFACE_HUB_CACHE"] = cache_dir

# --- Accent Model Cache Directory ---
# Dedicated directory for SpeechBrain models inside the cache above.
speechbrain_model_cache_base = os.path.join(cache_dir, "speechbrain_models_accent_id")
os.makedirs(speechbrain_model_cache_base, exist_ok=True)

# Full path of this specific model's sub-directory; this is the value passed
# as ``savedir`` when the model is loaded.
model_save_path = os.path.join(
    speechbrain_model_cache_base, "accent-id-commonaccent_xlsr-en-english"
)
os.makedirs(model_save_path, exist_ok=True)
# --- End Accent Model Cache Directory ---
# Try importing the accent detection model. SpeechBrain is treated as optional
# at import time so the app can show an install hint instead of crashing.
try:
    from speechbrain.pretrained.interfaces import foreign_class
except ImportError:
    MODEL_AVAILABLE = False
    st.error("SpeechBrain not available. Install with: pip install speechbrain")
else:
    MODEL_AVAILABLE = True

    @st.cache_resource
    def _load_accent_model_cached():
        """Build the classifier once per process; raises on failure.

        Kept separate from ``load_accent_model`` so that only a successfully
        built model is stored by ``st.cache_resource``. An exception
        propagates uncached, which lets the next rerun retry the load.
        """
        return foreign_class(
            source="Jzuluaga/accent-id-commonaccent_xlsr-en-english",
            pymodule_file="custom_interface.py",
            # Class defined in the model repository's custom_interface.py.
            classname="CustomEncoderWav2vec2Classifier",
            savedir=model_save_path,  # pre-created model cache directory
        )

    def load_accent_model():
        """Load the XLSR Wav2Vec 2.0 accent classification model.

        Returns:
            The loaded classifier, or ``None`` when loading fails. Failures
            are reported in the Streamlit UI rather than raised.
        """
        try:
            st.info(f"Attempting to load model from: {model_save_path}")
            return _load_accent_model_cached()
        except Exception as e:  # UI boundary: report instead of crashing
            st.error(f"Failed to load accent model: {e}")
            if isinstance(e, PermissionError):
                # Only mention permissions when that is the actual cause.
                st.error(f"Please ensure '{model_save_path}' is writable.")
            return None
# Accent labels reported by the app. analyze_accent() pairs entry i of this
# list with probability i of the model output.
# NOTE(review): assumes this order matches the model's label encoder - confirm.
ACCENT_CATEGORIES = [
    "US",
    "England",
    "Australia",
    "Indian",
    "Canada",
    "Scotland",
    "Ireland",
    "Wales",
    "African",
    "NewZealand",
    "Bermuda",
    "Malaysia",
    "Philippines",
    "Singapore",
    "HongKong",
    "SouthAtlantic",
]
def download_video_audio(url, output_path):
    """Download a video from *url* and write its audio track to *output_path*.

    Direct video links (``.mp4``, ``.avi``, ``.mov``, ``.mkv``) are fetched
    with ``requests`` and converted with ``ffmpeg`` to 16 kHz mono audio.
    Every other URL is handed to ``yt-dlp``, which extracts the audio as WAV.

    Args:
        url: Address of the video to fetch.
        output_path: Destination audio file; expected to end in ``.wav``.

    Returns:
        ``True`` when a non-empty audio file exists at *output_path*,
        otherwise ``False``. Errors are reported through ``st.error`` and
        never raised to the caller.
    """
    direct_video_suffixes = ('.mp4', '.avi', '.mov', '.mkv')
    output_root = os.path.splitext(output_path)[0]

    try:
        # Match on the URL path only, case-insensitively, so direct links
        # carrying a query string or an upper-case extension are recognised.
        url_path = urlparse(url).path.lower()

        if url_path.endswith(direct_video_suffixes):
            # Keep the real container extension and a distinct stem so the
            # temporary video can never collide with output_path.
            temp_video = output_root + '_src' + os.path.splitext(url_path)[1]
            try:
                with requests.get(url, stream=True, timeout=30) as response:
                    response.raise_for_status()
                    with open(temp_video, 'wb') as f:
                        for chunk in response.iter_content(chunk_size=8192):
                            f.write(chunk)

                # Extract audio using ffmpeg (16 kHz, mono).
                cmd = [
                    'ffmpeg', '-i', temp_video, '-ar', '16000',
                    '-ac', '1', '-y', output_path
                ]
                subprocess.run(cmd, check=True, capture_output=True)
            finally:
                # Remove the downloaded video even when ffmpeg fails.
                if os.path.exists(temp_video):
                    os.remove(temp_video)
        else:
            # Use yt-dlp for other video platforms (Loom, YouTube, etc.).
            cmd = [
                'yt-dlp', '--extract-audio', '--audio-format', 'wav',
                '--audio-quality', '0',
                # The caller may have pre-created output_path as an empty
                # placeholder; without this flag yt-dlp may treat that file
                # as an already-finished download.
                '--force-overwrites',
                '--output', output_root + '.%(ext)s',
                # End of options: a URL starting with '-' must not be parsed
                # as a command-line flag.
                '--', url
            ]
            result = subprocess.run(cmd, capture_output=True, text=True)
            if result.returncode != 0:
                st.error(f"yt-dlp error: {result.stderr}")
                return False

        # A zero exit status alone does not prove that audio was produced.
        if not os.path.isfile(output_path) or os.path.getsize(output_path) == 0:
            st.error("Download failed: no audio was written to the output file")
            return False
        return True
    except subprocess.CalledProcessError as e:
        # Surface ffmpeg's own diagnostics, not only the exit status.
        stderr = e.stderr
        if isinstance(stderr, bytes):
            stderr = stderr.decode(errors='replace')
        st.error(f"Download failed: {e}\n{stderr or ''}")
        return False
    except Exception as e:  # UI boundary: report instead of crashing
        st.error(f"Download failed: {e}")
        return False
def analyze_accent(audio_file_path, model):
    """Classify the accent of the audio file with the given model.

    Args:
        audio_file_path: Path of the audio file passed to
            ``model.classify_file``.
        model: Classifier whose ``classify_file`` returns
            ``(out_prob, score, index, text_lab)``.

    Returns:
        ``(predicted_accent, confidence, accent_scores)`` where *confidence*
        and the values of *accent_scores* are the model outputs multiplied by
        100. On any failure the error is shown via ``st.error`` and
        ``(None, None, None)`` is returned.
    """
    try:
        out_prob, score, _index, text_lab = model.classify_file(audio_file_path)

        # Detach and move to CPU before converting, so the conversion also
        # works for tensors that track gradients or live on another device.
        probs = out_prob.detach().cpu().reshape(-1).tolist()

        # NOTE(review): assumes ACCENT_CATEGORIES is in the same order as the
        # model's output classes - confirm against the model's label encoder.
        accent_scores = {
            label: float(prob) * 100
            for label, prob in zip(ACCENT_CATEGORIES, probs)
        }

        # The label may come back wrapped in a list (one entry per batch
        # item); hand callers the plain label of the single analysed file.
        predicted_accent = text_lab
        if isinstance(text_lab, (list, tuple)):
            predicted_accent = text_lab[0] if text_lab else None

        confidence = float(score) * 100
        return predicted_accent, confidence, accent_scores
    except Exception as e:  # UI boundary: report instead of crashing
        st.error(f"Accent analysis failed: {e}")
        return None, None, None
def generate_summary(accent, confidence, top_scores):
    """Generate a Markdown summary of the accent analysis.

    Args:
        accent: Label of the primary accent.
        confidence: Confidence in percent (0-100) for that accent.
        top_scores: Mapping of accent label to score in percent. Up to the
            three highest entries are listed; fewer are accepted.

    Returns:
        Markdown text with the primary accent, a confidence level, the top
        accents and a recommendation. Sections are separated by blank lines
        so that Markdown renders them as separate paragraphs.
    """
    if confidence > 80:
        confidence_level = "Very High"
    elif confidence > 60:
        confidence_level = "High"
    elif confidence > 40:
        confidence_level = "Moderate"
    else:
        confidence_level = "Low"

    # Rank by score and keep at most three; enumerate() avoids the IndexError
    # that fixed [0]/[1]/[2] indexing raises when fewer scores are supplied.
    ranked = sorted(top_scores.items(), key=lambda item: item[1], reverse=True)[:3]
    ranking_lines = [
        f"{position}. {name}: {value:.1f}%"
        for position, (name, value) in enumerate(ranked, start=1)
    ]

    if confidence > 70:
        recommendation = "✅ Strong English accent detected - Suitable for English-speaking roles"
    elif confidence > 50:
        recommendation = "⚠️ Moderate English accent detected - May require accent assessment"
    else:
        recommendation = "❌ Weak English accent signal - Further evaluation recommended"

    sections = [
        f"**Primary Accent:** {accent} ({confidence:.1f}% confidence)",
        f"**Confidence Level:** {confidence_level}",
        "**Top 3 Detected Accents:**",
        "\n".join(ranking_lines),
        f"**Hiring Recommendation:**\n{recommendation}",
    ]
    # Skip the ranking section entirely when there is nothing to list.
    return "\n\n".join(section for section in sections if section)
def _render_accent_results(accent, confidence, accent_scores):
    """Render metrics, probability bars, summary and JSON for one analysis."""
    # The label can arrive wrapped in a list; st.metric needs a plain value.
    if isinstance(accent, (list, tuple)):
        accent = accent[0]
    accent = str(accent)

    ranked = sorted(accent_scores.items(), key=lambda item: item[1], reverse=True)

    st.markdown("---")
    st.subheader("📊 Analysis Results")

    # Main result
    col1, col2 = st.columns(2)
    with col1:
        st.metric(
            label="🎯 Detected Accent",
            value=accent,
            help="Primary English accent classification"
        )
    with col2:
        st.metric(
            label="🎪 Confidence Score",
            value=f"{confidence:.1f}%",
            help="Model confidence in the prediction"
        )

    # Detailed breakdown: the eight highest-scoring accents.
    st.subheader("📈 Accent Probability Breakdown")
    for accent_name, score in ranked[:8]:
        # st.progress rejects values outside [0.0, 1.0]; clamp so rounding
        # noise or unexpected model output cannot abort the page.
        fraction = min(max(score / 100, 0.0), 1.0)
        st.progress(fraction, text=f"{accent_name}: {score:.1f}%")

    # Summary
    st.subheader("📝 Assessment Summary")
    st.markdown(generate_summary(accent, confidence, accent_scores))

    # JSON output for API integration
    if confidence > 70:
        recommendation = "suitable"
    elif confidence > 50:
        recommendation = "assessment_needed"
    else:
        recommendation = "further_evaluation"

    with st.expander("🔧 JSON Output (for API integration)"):
        st.json({
            "primary_accent": accent,
            "confidence_score": round(confidence, 1),
            "accent_probabilities": {k: round(v, 1) for k, v in accent_scores.items()},
            "top_3_accents": [
                {"accent": k, "probability": round(v, 1)}
                for k, v in ranked[:3]
            ],
            "recommendation": recommendation,
        })


def main():
    """Build the Streamlit page: load the model, take a URL, show the result.

    The flow is: configure the page, load the accent model (stopping the
    script if it is unavailable), read a video URL, and on button press
    download the audio, classify it and render the outcome. The temporary
    audio file is removed afterwards in all cases.
    """
    st.set_page_config(
        page_title="English Accent Detector",
        page_icon="🗣️",
        layout="wide"
    )
    st.title("🗣️ English Accent Detection Tool")
    st.subheader("For Hiring & Language Assessment")
    st.markdown(
        "**Purpose:** Analyze spoken English accents from video URLs to assist in hiring decisions.\n\n"
        "**Supported:** Loom videos, direct MP4 links, YouTube, and other video platforms."
    )

    # Load model
    if not MODEL_AVAILABLE:
        st.stop()
    with st.spinner("Loading XLSR Wav2Vec 2.0 model..."):
        model = load_accent_model()
    # Compare against None explicitly: the loader signals failure with None.
    if model is None:
        st.error("❌ Could not load accent detection model")
        st.stop()
    st.success("✅ Accent detection model loaded successfully!")

    # Input section
    st.markdown("---")
    st.subheader("📥 Video Input")
    video_url = st.text_input(
        "Enter Video URL",
        placeholder="https://www.loom.com/share/... or direct MP4 link",
        help="Supports Loom, YouTube, direct video files, and most video platforms"
    )

    if video_url:
        st.info(f"🔗 **URL:** {video_url}")
        if st.button("🎯 Analyze Accent", type="primary"):
            # Reserve a temporary .wav path; the handle is closed right away
            # so the download step can write to the file.
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
                audio_path = tmp_file.name
            try:
                # Step 1: Download and extract audio
                with st.spinner("📥 Downloading video and extracting audio..."):
                    if download_video_audio(video_url, audio_path):
                        st.success("✅ Audio extracted successfully")
                        # Play the extracted audio
                        with open(audio_path, 'rb') as audio_file:
                            st.audio(audio_file.read(), format="audio/wav")
                    else:
                        st.error("❌ Failed to extract audio")
                        st.stop()

                # Step 2: Analyze accent
                with st.spinner("🧠 Analyzing accent with XLSR Wav2Vec 2.0..."):
                    accent, confidence, accent_scores = analyze_accent(audio_path, model)

                if accent:
                    _render_accent_results(accent, confidence, accent_scores)
                else:
                    st.error("❌ Accent analysis failed")
            finally:
                # Cleanup: runs even when st.stop() interrupts the script.
                if os.path.exists(audio_path):
                    os.remove(audio_path)

    # Footer
    st.markdown("---")
    st.markdown(
        "**Technical Details:**\n"
        "- Model: XLSR Wav2Vec 2.0 (95% accuracy on English accents)\n"
        "- Supports: 16 English accent varieties\n"
        "- Processing: Automatic audio extraction and resampling to 16kHz\n\n"
        "**Built for hiring teams to assess English language proficiency**"
    )
# Script entry point: build the page only when this file is run as a script.
if __name__ == "__main__":
    main()