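"""
Accent classification utilities.

Wraps the Hugging Face "dima806/english_accents_classification" Wav2Vec2 model
(lazy-loaded on first use) and exposes helpers to predict the most likely
English accent in an audio file and to return a full per-accent probability
breakdown.
"""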
import torch
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor
import librosa
import os

# Model loading function to allow lazy loading
def load_model():
    """
    Load the accent classification model and feature extractor.

    Returns:
        tuple: (model, feature_extractor)
    """
    model_name = "dima806/english_accents_classification"
    model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
    return model, feature_extractor

# Global variables for lazy loading
_model = None
_feature_extractor = None

def get_model_and_extractor():
    """
    Get the model and feature extractor, loading them if necessary.

    Returns:
        tuple: (model, feature_extractor)
    """
    global _model, _feature_extractor
    if _model is None or _feature_extractor is None:
        _model, _feature_extractor = load_model()
    return _model, _feature_extractor

# Load and preprocess the audio
def load_audio(file_path):
    """
    Load and preprocess audio file for accent analysis.

    Parameters:
        file_path (str): Path to the audio file

    Returns:
        tuple: (audio_data, sample_rate)
    """
    # Load audio with librosa, resampled to 16 kHz as expected by Wav2Vec2
    audio, sr = librosa.load(file_path, sr=16000)
    return audio, sr

# Predict accent
def predict_accent(file_path):
    """
    Predict the accent in an audio file.

    Parameters:
        file_path (str): Path to the audio file

    Returns:
        tuple: (accent_label, confidence_score)
    """
    # Check if file exists
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Audio file not found: {file_path}")

    # Get or load model
    model, feature_extractor = get_model_and_extractor()

    # Load and process audio
    audio, sr = load_audio(file_path)
    inputs = feature_extractor(audio, sampling_rate=sr, return_tensors="pt", padding=True)

    # Make prediction
    with torch.no_grad():
        logits = model(**inputs).logits

    # Get results
    predicted_class_id = torch.argmax(logits).item()
    predicted_label = model.config.id2label[predicted_class_id]
    confidence = torch.softmax(logits, dim=1)[0][predicted_class_id].item()

    return predicted_label, confidence

# Get detailed accent analysis
def get_detailed_accent_analysis(file_path):
    """
    Get detailed accent analysis including all possible accents and their probabilities.

    Parameters:
        file_path (str): Path to the audio file

    Returns:
        dict: Detailed accent analysis results
    """
    # Get or load model
    model, feature_extractor = get_model_and_extractor()

    # Load and process audio
    audio, sr = load_audio(file_path)
    inputs = feature_extractor(audio, sampling_rate=sr, return_tensors="pt", padding=True)

    # Make prediction
    with torch.no_grad():
        logits = model(**inputs).logits

    # Get top prediction
    predicted_class_id = torch.argmax(logits).item()
    predicted_label = model.config.id2label[predicted_class_id]
    confidence = torch.softmax(logits, dim=1)[0][predicted_class_id].item()

    # Get all accent probabilities
    all_probs = torch.softmax(logits, dim=1)[0].tolist()
    all_accents = {model.config.id2label[i]: float(prob) for i, prob in enumerate(all_probs)}

    # Sort accents by probability (highest first)
    sorted_accents = sorted(all_accents.items(), key=lambda x: x[1], reverse=True)

    return {
        "top_accent": predicted_label,
        "confidence": confidence,
        "confidence_percent": confidence * 100,
        "all_accents": sorted_accents,
        "file_analyzed": os.path.basename(file_path)
    }
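
# Usage sketch: a minimal command-line entry point showing how predict_accent
# and get_detailed_accent_analysis might be called. The module file name in the
# usage string is an assumption; it is not given in this file.
if __name__ == "__main__":
    import sys

    if len(sys.argv) < 2:
        print("Usage: python accent_model.py <path-to-audio-file>")
        sys.exit(1)

    audio_path = sys.argv[1]

    # Top prediction only
    label, score = predict_accent(audio_path)
    print(f"Predicted accent: {label} ({score * 100:.1f}% confidence)")

    # Full per-accent probability breakdown, sorted highest first
    analysis = get_detailed_accent_analysis(audio_path)
    for accent, prob in analysis["all_accents"]:
        print(f"  {accent}: {prob * 100:.1f}%")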