File size: 4,166 Bytes
3ae1bf1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import torch
import torchaudio
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor
import librosa
import numpy as np
import os

# Model loading function to allow lazy loading
def load_model():
    """
    Build the accent classifier and its matching feature extractor.

    Both objects are created with ``from_pretrained`` from the
    "dima806/english_accents_classification" checkpoint.

    Returns:
        tuple: (model, feature_extractor)
    """
    checkpoint = "dima806/english_accents_classification"
    # The model is instantiated first, then the extractor, from the same checkpoint.
    return (
        Wav2Vec2ForSequenceClassification.from_pretrained(checkpoint),
        Wav2Vec2FeatureExtractor.from_pretrained(checkpoint),
    )

# Global variables for lazy loading
# Module-level cache slots; both stay None until the first successful load.
_model = _feature_extractor = None

def get_model_and_extractor():
    """
    Return the cached model and feature extractor, loading them on first use.

    ``load_model()`` is called whenever either cache slot is still ``None``;
    its result is stored in the module-level variables and reused afterwards.

    Returns:
        tuple: (model, feature_extractor)
    """
    global _model, _feature_extractor
    cache_ready = _model is not None and _feature_extractor is not None
    if not cache_ready:
        _model, _feature_extractor = load_model()
    return _model, _feature_extractor

# Load and preprocess the audio
def load_audio(file_path):
    """
    Read an audio file for accent analysis.

    Parameters:
        file_path (str): Path to the audio file

    Returns:
        tuple: (audio_data, sample_rate) as produced by ``librosa.load``
        when called with ``sr=16000``
    """
    waveform, sample_rate = librosa.load(file_path, sr=16000)
    return waveform, sample_rate

# Predict accent
def predict_accent(file_path):
    """
    Predict the accent in an audio file.

    Parameters:
        file_path (str): Path to the audio file

    Returns:
        tuple: (accent_label, confidence_score), where accent_label comes from
        ``model.config.id2label`` and confidence_score is the softmax
        probability of the predicted class.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    # Fail fast on a missing file, before the model is loaded.
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Audio file not found: {file_path}")

    # Get or load model
    model, feature_extractor = get_model_and_extractor()

    # Load and process audio
    audio, sr = load_audio(file_path)
    inputs = feature_extractor(audio, sampling_rate=sr, return_tensors="pt", padding=True)

    # Inference only: no gradients are needed
    with torch.no_grad():
        logits = model(**inputs).logits

    # One clip is passed in, so only row 0 of the class probabilities is used.
    # The softmax is computed once; the per-accent dictionary that used to be
    # built here was never returned and has been dropped.
    probabilities = torch.softmax(logits, dim=1)[0]
    predicted_class_id = torch.argmax(logits).item()
    predicted_label = model.config.id2label[predicted_class_id]
    confidence = probabilities[predicted_class_id].item()

    return predicted_label, confidence

# Get detailed accent analysis
def get_detailed_accent_analysis(file_path):
    """
    Get detailed accent analysis including all possible accents and their probabilities.

    Parameters:
        file_path (str): Path to the audio file

    Returns:
        dict: Detailed accent analysis results with the keys
            "top_accent" (label of the most probable accent),
            "confidence" (its softmax probability),
            "confidence_percent" (confidence multiplied by 100),
            "all_accents" (list of (label, probability) pairs, highest first),
            "file_analyzed" (base name of ``file_path``).

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    # Same guard as predict_accent: reject a missing file before the model is
    # loaded, with the same error message.
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Audio file not found: {file_path}")

    # Get or load model
    model, feature_extractor = get_model_and_extractor()

    # Load and process audio
    audio, sr = load_audio(file_path)
    inputs = feature_extractor(audio, sampling_rate=sr, return_tensors="pt", padding=True)

    # Inference only: no gradients are needed
    with torch.no_grad():
        logits = model(**inputs).logits

    # One clip is passed in, so only row 0 of the class probabilities is used;
    # the softmax is computed once and reused for every value below.
    probabilities = torch.softmax(logits, dim=1)[0]

    # Get top prediction
    predicted_class_id = torch.argmax(logits).item()
    predicted_label = model.config.id2label[predicted_class_id]
    confidence = probabilities[predicted_class_id].item()

    # Map every accent label to its probability
    all_accents = {
        model.config.id2label[i]: float(prob)
        for i, prob in enumerate(probabilities.tolist())
    }

    # Sort accents by probability (highest first)
    sorted_accents = sorted(all_accents.items(), key=lambda x: x[1], reverse=True)

    return {
        "top_accent": predicted_label,
        "confidence": confidence,
        "confidence_percent": confidence * 100,
        "all_accents": sorted_accents,
        "file_analyzed": os.path.basename(file_path)
    }