Spaces:

Mohamedenzeyad
/

accent-classification

Build error

App Files Files Community

Mohamedenzeyad committed on May 23

Commit

02f871d

verified ·

1 Parent(s): a3783dd

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +311 -40

src/streamlit_app.py CHANGED Viewed

@@ -1,40 +1,311 @@
-import altair as alt
-import numpy as np
-import pandas as pd
-import streamlit as st
-"""
-# Welcome to Streamlit!
-Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
-In the meantime, below is an example of what you can do with just a few lines of code:
-"""
-num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
-num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-indices = np.linspace(0, 1, num_points)
-theta = 2 * np.pi * num_turns * indices
-radius = indices
-x = radius * np.cos(theta)
-y = radius * np.sin(theta)
-df = pd.DataFrame({
-    "x": x,
-    "y": y,
-    "idx": indices,
-    "rand": np.random.randn(num_points),
-})
-st.altair_chart(alt.Chart(df, height=700, width=700)
-    .mark_point(filled=True)
-    .encode(
-        x=alt.X("x", axis=None),
-        y=alt.Y("y", axis=None),
-        color=alt.Color("idx", legend=None, scale=alt.Scale()),
-        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-    ))

+import os
+ import io
+ import csv
+ import subprocess
+ import streamlit as st
+ import numpy as np
+ import pandas as pd
+ import tensorflow as tf
+ import tensorflow_hub as hub
+ import matplotlib.pyplot as plt
+ from tensorflow import keras
+ from huggingface_hub import from_pretrained_keras
+ from audio_recorder_streamlit import audio_recorder
+ import yt_dlp
+ import torch
+ import torchaudio
+ torchaudio.set_audio_backend("soundfile")
+ import speechbrain
+ # Check if SpeechBrain is installed, if not display a message
+ try:
+  from speechbrain.pretrained import EncoderClassifier
+  from speechbrain.pretrained.interfaces import foreign_class
+  speechbrain_available = True
+ except ImportError:
+  speechbrain_available = False
+ # Page-level Streamlit settings: browser tab title, tab icon and wide layout.
+ # NOTE(review): Streamlit expects set_page_config to run before any other
+ # st.* command — keep this call ahead of the rest of the UI code.
+ st.set_page_config(
+  page_title="English Accent Classification",
+  page_icon="🎙️",
+  layout="wide"
+ )
+ # Configuration
+ # Display names of the accent classes. predict_accent_with_xlsr pairs entry i
+ # of this list with probability i of the model output, so the order matters.
+ # NOTE(review): presumably this order mirrors the model's label encoder —
+ # confirm against the model card before reordering or adding entries.
+ xlsr_accent_classes = [
+  "US",
+  "England",
+  "Australia",
+  "Indian",
+  "Canada",
+  "Bermuda",
+  "Scotland",
+  "African",
+  "Ireland",
+  "NewZealand",
+  "Wales",
+  "Malaysia",
+  "Philippines",
+  "Singapore",
+  "HongKong",
+  "SouthAtlantic"
+ ]
+ @st.cache_resource
+ def load_models():
+  xlsr_model = None
+  try:
+  # Show loading message for XLSR
+   with st.spinner("Loading XLSR-based accent classifier..."):
+    xlsr_model = foreign_class(
+     source="Jzuluaga/accent-id-commonaccent_xlsr-en-english",
+     pymodule_file="custom_interface.py",
+     classname="CustomEncoderWav2vec2Classifier",
+     savedir="pretrained_models/accent-id-commonaccent_xlsr-en-english"
+    )
+  except Exception as e:
+   st.warning(f"Could not load XLSR model: {e}")
+   xlsr_model = None
+  return xlsr_model
+ # Function to check if ffmpeg is installed
+ def is_ffmpeg_installed():
+  """Checks if ffmpeg is installed and in the PATH."""
+  try:
+   subprocess.run(['ffmpeg', '-version'], capture_output=True, check=True)
+   return True
+  except (subprocess.CalledProcessError, FileNotFoundError) as e:
+   st.error(f"FFmpeg check failed: {e}")
+   return False
+ # Function to extract audio from YouTube URL
+ def extract_audio(video_url, output_audio_path="audio.wav"):
+  """
+  Downloads video from URL, extracts audio using ffmpeg, and saves it as a WAV file.
+  """
+  if not is_ffmpeg_installed():
+   st.error("FFmpeg is not installed or not in your system's PATH.")
+   st.info("Please install FFmpeg. You can download it from [FFmpeg](https://ffmpeg.org/download.html)")
+   return False
+  ydl_opts = {
+   'format': 'bestaudio/best',
+   'postprocessors': [{
+    'key': 'FFmpegExtractAudio',
+    'preferredcodec': 'wav',
+   }],
+   'outtmpl': 'temp_video.%(ext)s',
+  }
+  try:
+   with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+    info_dict = ydl.extract_info(video_url, download=True)
+    video_filepath = ydl.prepare_filename(info_dict)
+    # yt-dlp with FFmpegExtractAudio should directly output the audio file
+    # The output file will have the same name as the video but with .wav extension
+    base, _ = os.path.splitext(video_filepath)
+    audio_filepath = base + '.wav'
+   # Rename the output file to the desired output_audio_path
+   if os.path.exists(audio_filepath):
+    # Use copy instead of rename to avoid issues if files are on different file systems
+    import shutil
+    shutil.copy2(audio_filepath, output_audio_path)
+    os.remove(audio_filepath)  # Remove the original after copying
+    st.success(f"Audio extracted successfully to {output_audio_path}")
+   else:
+    st.error(f"Error: Audio file not found after extraction.")
+    return False
+   # Clean up the temporary video file if it still exists (sometimes it doesn't)
+   if os.path.exists(video_filepath):
+    os.remove(video_filepath)
+    print(f"Cleaned up temporary video file {video_filepath}")
+   return True
+  except Exception as e:
+   st.error(f"An error occurred during audio extraction: {e}")
+   return False
+ # Function that reads a wav audio file - without tensorflow-io
+ def load_16k_audio_wav(filename):
+  """Read and resample audio file to 16kHz without using tensorflow-io."""
+  # Use ffmpeg to resample the audio file to 16kHz
+  output_filename = "resampled_16k.wav"
+  try:
+   subprocess.run([
+    'ffmpeg', '-y', '-i', filename, '-ar', '16000', '-ac', '1', output_filename
+   ], check=True, capture_output=True)
+   # Read the resampled file
+   audio, sample_rate = tf.audio.decode_wav(tf.io.read_file(output_filename))
+   audio = tf.squeeze(audio, axis=-1)
+   # Clean up
+   if os.path.exists(output_filename):
+    os.remove(output_filename)
+   return audio
+  except Exception as e:
+   st.error(f"Error resampling audio: {e}")
+   # Fallback to just decoding without resampling
+   audio, _ = tf.audio.decode_wav(tf.io.read_file(filename))
+   audio = tf.squeeze(audio, axis=-1)
+   return audio
+ # Function that takes a recorded audio array and returns a tensor
+ def recorded_audio_to_tensor(audio_bytes):
+  # Save the audio bytes to a temporary file
+  temp_path = "temp_recorded_audio.wav"
+  with open(temp_path, "wb") as f:
+   f.write(audio_bytes)
+  # Load the audio file as a tensor
+  audio_tensor = load_16k_audio_wav(temp_path)
+  # Clean up
+  if os.path.exists(temp_path):
+   os.remove(temp_path)
+  return audio_tensor
+ # Function to use XLSR model for accent classification
+ def predict_accent_with_xlsr(audio_file_path, xlsr_model):
+  try:
+   # Classify the audio file
+   out_prob, score, index, text_lab = xlsr_model.classify_file(audio_file_path)
+   # Convert the prediction tensor to numpy for easier handling
+   probs = out_prob.squeeze().numpy()
+   # Create a dictionary of accent probabilities
+   accent_probs = {xlsr_accent_classes[i]: float(probs[i]) for i in range(len(xlsr_accent_classes))}
+   # Get the predicted accent
+   predicted_accent = text_lab
+   confidence = float(score)
+   return predicted_accent, confidence, accent_probs
+  except Exception as e:
+   st.error(f"Error with XLSR prediction: {e}")
+   return None, None, None
+ def main():
+  st.title("English Speaker Accent Recognition")
+  st.subheader("Classify English accents using XLSR Wav2Vec 2.0")
+  st.write("""
+  This application detects and classifies English accents using the XLSR Wav2Vec 2.0 model.
+  """)
+  # Load models
+  xlsr_model = load_models()
+  # Check if ffmpeg is installed
+  if not is_ffmpeg_installed():
+   st.warning("FFmpeg is not installed. You won't be able to use YouTube URLs or process some audio files correctly.")
+   st.info("Please install FFmpeg. You can download it from [FFmpeg](https://ffmpeg.org/download.html)")
+  # Create tabs for different input methods
+  tab3 = st.tabs(["YouTube URL"])[0]
+  with tab3:
+   youtube_url = st.text_input("Enter YouTube URL", placeholder="https://www.youtube.com/watch?v=...")
+   if youtube_url:
+    if st.button("Extract Audio from YouTube", key="extract_btn"):
+     with st.spinner("Extracting audio from YouTube..."):
+      output_path = "youtube_audio.wav"
+      if extract_audio(youtube_url, output_path):
+       st.success("Audio extracted successfully!")
+       st.audio(output_path, format="audio/wav")
+       st.session_state.youtube_audio_path = output_path
+      else:
+       st.error("Failed to extract audio from YouTube URL.")
+  # Process and analyze the audio when the button is clicked
+  if st.button("Predict Accent", type="primary"):
+   audio_file_path = None
+   # Check which audio source we have
+   if 'youtube_audio_path' in st.session_state and os.path.exists(st.session_state.youtube_audio_path):
+    audio_file_path = st.session_state.youtube_audio_path
+   else:
+    st.warning("Please provide a YouTube URL.")
+    st.stop()
+   # Run prediction based on selected model
+   if xlsr_model is not None:
+    with st.spinner("Analyzing audio with XLSR Wav2Vec 2.0..."):
+     xlsr_predicted_accent, xlsr_confidence, xlsr_accent_probs = predict_accent_with_xlsr(
+      audio_file_path, xlsr_model
+     )
+    if xlsr_predicted_accent:
+     st.success(f"🎯 **Predicted Accent: {xlsr_predicted_accent}** (Confidence: {xlsr_confidence:.2f})")
+     # Create visualization for XLSR results
+     sorted_probs = {k: v for k, v in sorted(xlsr_accent_probs.items(), key=lambda item: item[1], reverse=True)}
+     # Create a bar chart
+     fig, ax = plt.subplots(figsize=(10, 6))
+     accents = list(sorted_probs.keys())
+     probabilities = list(sorted_probs.values())
+     ax.bar(accents, probabilities, color='lightcoral')
+     ax.set_ylabel('Probability')
+     ax.set_title('XLSR Wav2Vec 2.0 Accent Probabilities (16 English Accents)')
+     plt.xticks(rotation=45)
+     plt.tight_layout()
+     st.pyplot(fig)
+     # Also display as a table
+     df = pd.DataFrame({
+      'Accent': accents,
+      'Probability': [f"{p:.2%}" for p in probabilities]
+     })
+     st.dataframe(df, hide_index=True)
+     # Add information about XLSR model
+     st.info("""
+     🚀 **XLSR Wav2Vec 2.0 Model**: This state-of-the-art model achieves up to 95% accuracy
+     and can distinguish between 16 different English accent regions including specialized
+     accents like Bermuda, Hong Kong, and South Atlantic varieties.
+     """)
+    else:
+     st.error("XLSR model failed to classify the accent.")
+   # Clean up temporary files
+   if audio_file_path and audio_file_path.startswith("temp_") and os.path.exists(audio_file_path):
+    os.remove(audio_file_path)
+  # Add information about the models
+  st.markdown("---")
+  st.subheader("About the Model")
+  st.markdown("### XLSR Wav2Vec 2.0 ⭐")
+  st.write("""
+  **State-of-the-art** model with 95% accuracy for English accent classification.
+  **Supported accents:**
+  - US, England, Australia, India
+  - Canada, Bermuda, Scotland, Africa
+  - Ireland, New Zealand, Wales
+  - Malaysia, Philippines, Singapore
+  - Hong Kong, South Atlantic
+  Based on self-supervised Wav2Vec 2.0 with cross-lingual representations.
+  """)
+  # Credits
+  st.markdown("---")
+  st.markdown("""
+  **Credits:** - **XLSR Model**: [Jzuluaga/accent-id-commonaccent_xlsr-en-english](https://huggingface.co/Jzuluaga/accent-id-commonaccent_xlsr-en-english) by Juan Zuluaga-Gomez et al.
+  - All SpeechBrain models by [SpeechBrain](https://speechbrain.github.io/)
+  """)
+ # Entry point: build the page only when this file is executed as a script,
+ # not when it is imported as a module.
+ if __name__ == "__main__":
+  main()