Thanh-Lam committed on
Commit f9308ba · 1 Parent(s): c3418e9

Add multi-model support with PhoWhisper and model selection dropdown

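For context, a minimal usage sketch of the reworked app (class and function names are taken from the new app.py in the diff below; the checkpoint folders referenced in MODELS_CONFIG and the path sample.wav are assumptions for illustration):

    from app import MultiModelProfiler, create_interface

    # Pre-loads every checkpoint found under model/ ("Wav2Vec2 Vietnamese", "PhoWhisper")
    profiler = MultiModelProfiler()
    print(profiler.get_available_models())

    # Returns formatted strings such as "Female (97.3%)" and "Northern (88.1%)" (values illustrative)
    gender, dialect = profiler.predict("sample.wav", "PhoWhisper")

    # Or launch the full Gradio UI with the model-selection dropdown
    create_interface().launch(server_name="0.0.0.0", server_port=7860)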
app.py CHANGED
@@ -1,316 +1,285 @@
  """
- Gradio Web Interface for Speaker Profiling
-
- Usage:
-     python app.py
-     python app.py --config configs/infer.yaml --share
  """

  import os
- import argparse
- import tempfile
- import time
- import numpy as np
  import torch
- import librosa
  import gradio as gr
  from pathlib import Path
-
- from src.models import MultiTaskSpeakerModel
- from src.utils import (
-     setup_logging,
-     get_logger,
-     load_config,
-     get_device,
-     load_model_checkpoint,
-     preprocess_audio
- )
-
-
- class SpeakerProfilerApp:
-     """Gradio application for speaker profiling"""
-
-     def __init__(self, config_path: str):
-         self.logger = setup_logging(name="gradio_app")
-         self.config = load_config(config_path)
-         self.device = get_device(self.config['inference']['device'])
-
-         self.sampling_rate = self.config['audio']['sampling_rate']
-         self.max_duration = self.config['audio']['max_duration']
-
-         self.gender_labels = self.config['labels']['gender']
-         self.dialect_labels = self.config['labels']['dialect']
-
-         self._load_model()
-
-     def _load_model(self):
-         """Load model and feature extractor"""
-         from transformers import Wav2Vec2FeatureExtractor, WhisperFeatureExtractor
-
-         self.logger.info("Loading model...")
-
-         model_name = self.config['model']['name']
-         is_ecapa = 'ecapa' in model_name.lower() or 'speechbrain' in model_name.lower()
-
-         # Check if this is a Whisper/PhoWhisper model
-         self.is_whisper = 'whisper' in model_name.lower() or 'phowhisper' in model_name.lower()
-
-         if is_ecapa:
-             # ECAPA-TDNN: use Wav2Vec2 feature extractor for audio normalization
-             self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
-                 "facebook/wav2vec2-base"
-             )
-         elif self.is_whisper:
-             # Whisper/PhoWhisper: use WhisperFeatureExtractor
-             self.feature_extractor = WhisperFeatureExtractor.from_pretrained(
-                 model_name
-             )
-         else:
-             self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
-                 self.config['model']['checkpoint']
-             )
-
-         self.model = MultiTaskSpeakerModel(model_name)
-         self.model = load_model_checkpoint(
-             self.model,
-             self.config['model']['checkpoint'],
-             str(self.device)
-         )
-
-         self.model.to(self.device)
-         self.model.eval()
-
-         self.logger.info(f"Model loaded on {self.device}")
-
-     def predict(self, audio_input):
-         """
-         Predict gender and dialect from audio
-
-         Args:
-             audio_input: Tuple of (sample_rate, audio_array) from Gradio
-
-         Returns:
-             Tuple of (gender_result, dialect_result, details)
-         """
-         if audio_input is None:
-             return "No audio", "No audio", "Please upload or record audio"
-
          try:
-             sr, audio = audio_input
-
-             if len(audio.shape) > 1:
-                 audio = audio.mean(axis=1)
-
-             audio = audio.astype(np.float32)
-             if audio.max() > 1.0:
-                 audio = audio / 32768.0
-
-             if sr != self.sampling_rate:
-                 audio = librosa.resample(audio, orig_sr=sr, target_sr=self.sampling_rate)
-
-             # Calculate original audio duration BEFORE preprocessing
-             audio_duration = len(audio) / self.sampling_rate
-
-             # Whisper requires 30 seconds of audio
-             if self.is_whisper:
-                 max_duration = 30
              else:
-                 max_duration = self.max_duration
-
-             audio = preprocess_audio(
-                 audio,
-                 sampling_rate=self.sampling_rate,
-                 max_duration=max_duration
-             )
-
-             # Whisper needs exactly 30 seconds - pad if necessary
-             if self.is_whisper:
-                 target_len = self.sampling_rate * 30
-                 if len(audio) < target_len:
-                     audio = np.pad(audio, (0, target_len - len(audio)))
-
-             inputs = self.feature_extractor(
-                 audio,
-                 sampling_rate=self.sampling_rate,
-                 return_tensors="pt",
-                 padding=True
              )
-
-             # Whisper uses 'input_features', WavLM/HuBERT/Wav2Vec2 use 'input_values'
-             if self.is_whisper:
-                 input_values = inputs.input_features.to(self.device)
              else:
-                 input_values = inputs.input_values.to(self.device)
-
-             # Measure inference time
-             start_time = time.perf_counter()
-
-             with torch.no_grad():
-                 outputs = self.model(input_values)
-                 gender_logits = outputs['gender_logits']
-                 dialect_logits = outputs['dialect_logits']
-
-             # Calculate inference time
-             infer_time = (time.perf_counter() - start_time) * 1000  # Convert to ms
-
-             gender_probs = torch.softmax(gender_logits, dim=-1).cpu().numpy()[0]
-             dialect_probs = torch.softmax(dialect_logits, dim=-1).cpu().numpy()[0]
-
-             gender_pred = int(np.argmax(gender_probs))
-             dialect_pred = int(np.argmax(dialect_probs))
-
-             gender_name = self.gender_labels[gender_pred]
-             dialect_name = self.dialect_labels[dialect_pred]
-
-             gender_conf = gender_probs[gender_pred] * 100
-             dialect_conf = dialect_probs[dialect_pred] * 100
-
-             gender_result = f"{gender_name} ({gender_conf:.1f}%)"
-             dialect_result = f"{dialect_name} ({dialect_conf:.1f}%)"
-
-             details = self._format_details(gender_probs, dialect_probs, infer_time, audio_duration)
-
-             self.logger.info(f"Prediction: Gender={gender_name}, Dialect={dialect_name} | Inference time: {infer_time:.2f}ms | Audio: {audio_duration:.2f}s")
-
-             return gender_result, dialect_result, details
-
          except Exception as e:
-             self.logger.error(f"Prediction error: {e}")
-             return "Error", "Error", f"Error: {str(e)}"
-
-     def _format_details(self, gender_probs: np.ndarray, dialect_probs: np.ndarray, infer_time: float = None, audio_duration: float = None) -> str:
-         """Format detailed prediction results"""
-         # Gender label names
-         gender_names = ['Female', 'Male']
-         # Dialect label names
-         dialect_names = ['North', 'Central', 'South']
-
-         lines = []
-         lines.append("Gender Probabilities:")
-         for i, name in enumerate(gender_names):
-             lines.append(f" {name}: {gender_probs[i]*100:.2f}%")
-
-         lines.append("")
-         lines.append("Dialect Probabilities:")
-         for i, name in enumerate(dialect_names):
-             lines.append(f" {name}: {dialect_probs[i]*100:.2f}%")
-
-         lines.append("")
-         lines.append("─" * 30)
-
-         if audio_duration is not None:
-             lines.append(f"Audio Duration: {audio_duration:.2f} s")
-
-         if infer_time is not None:
-             lines.append(f"Inference Time: {infer_time:.2f} ms")
-
-         return "\n".join(lines)
-
-     def create_interface(self) -> gr.Blocks:
-         """Create Gradio interface"""
-
-         # Gradio < 4.0 doesn't support theme in Blocks
-         with gr.Blocks(title="Vietnamese Speaker Profiling") as demo:
-
-             gr.Markdown(
-                 """
-                 # Vietnamese Speaker Profiling
-
-                 Identify gender and dialect from Vietnamese speech audio.
-
-                 **Model:** Encoder + Attentive Pooling + LayerNorm + MultiHead Classifier
-
-                 **Supported dialects:** North, Central, South
-                 """
-             )
-
-             with gr.Row():
-                 with gr.Column(scale=1):
-                     audio_input = gr.Audio(
-                         label="Input Audio",
-                         type="numpy",
-                         sources=["upload", "microphone"]
-                     )
-
-                     submit_btn = gr.Button("Analyze", variant="primary")
-                     clear_btn = gr.Button("Clear")
-
-                 with gr.Column(scale=1):
-                     gender_output = gr.Textbox(
-                         label="Gender",
-                         interactive=False
-                     )
-                     dialect_output = gr.Textbox(
-                         label="Dialect",
-                         interactive=False
-                     )
-                     details_output = gr.Textbox(
-                         label="Details",
-                         lines=8,
-                         interactive=False
-                     )
-
-             gr.Markdown(
-                 """
-                 ---
-                 **Notes:**
-                 - Supported formats: WAV, MP3
-                 - Recommended duration: 3-10 seconds
-                 """
-             )
-
-             submit_btn.click(
-                 fn=self.predict,
-                 inputs=[audio_input],
-                 outputs=[gender_output, dialect_output, details_output]
-             )
-
-             clear_btn.click(
-                 fn=lambda: (None, "", "", ""),
-                 inputs=[],
-                 outputs=[audio_input, gender_output, dialect_output, details_output]
-             )
-
-         return demo
-
-
- def main():
-     """Main function"""
-     parser = argparse.ArgumentParser(description="Speaker Profiling Web Interface")
-     parser.add_argument(
-         "--config",
-         type=str,
-         default="configs/infer.yaml",
-         help="Path to config file"
-     )
-     parser.add_argument(
-         "--share",
-         action="store_true",
-         help="Create public link"
-     )
-     parser.add_argument(
-         "--port",
-         type=int,
-         default=7860,
-         help="Port number (default: 7860)"
-     )
-     parser.add_argument(
-         "--server_name",
-         type=str,
-         default="0.0.0.0",
-         help="Server name (default: 0.0.0.0)"
-     )
-     args = parser.parse_args()
-
-     app = SpeakerProfilerApp(args.config)
-     demo = app.create_interface()
-
-     demo.launch(
-         server_name=args.server_name,
-         server_port=args.port,
-         share=args.share
-     )
-
-
  if __name__ == "__main__":
-     main()

  """
+ Vietnamese Speaker Profiling - Multi-Model Gradio Interface
+ Supports: Vietnamese Wav2Vec2 and PhoWhisper encoders
  """

  import os
  import torch
+ import torchaudio
  import gradio as gr
  from pathlib import Path
+ from safetensors.torch import load_file as load_safetensors
+
+ # Model configurations
+ MODELS_CONFIG = {
+     "Wav2Vec2 Vietnamese": {
+         "path": "model/vulehuubinh",
+         "encoder_name": "nguyenvulebinh/wav2vec2-base-vi-vlsp2020",
+         "is_whisper": False,
+         "description": "Vietnamese Wav2Vec2 pretrained model - Fast inference"
+     },
+     "PhoWhisper": {
+         "path": "model/pho",
+         "encoder_name": "vinai/PhoWhisper-base",
+         "is_whisper": True,
+         "description": "Vietnamese Whisper model - Higher accuracy"
+     }
+ }
+
+ # Labels
+ GENDER_LABELS = ["Male", "Female"]
+ DIALECT_LABELS = ["Northern", "Central", "Southern"]
+
+
+ class MultiModelProfiler:
+     """Speaker Profiler supporting multiple encoder models."""
+
+     def __init__(self):
+         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         self.sampling_rate = 16000
+         self.models = {}
+         self.processors = {}
+         self.current_model = None
+
+         print(f"Using device: {self.device}")
+
+         # Pre-load all models
+         self._load_all_models()
+
+     def _load_all_models(self):
+         """Load all available models."""
+         for model_name, config in MODELS_CONFIG.items():
+             model_path = Path(config["path"])
+             if model_path.exists():
+                 print(f"Loading {model_name}...")
+                 self._load_single_model(model_name, config)
+             else:
+                 print(f"Model not found: {model_path}")
+
+     def _load_single_model(self, model_name: str, config: dict):
+         """Load a specific model."""
          try:
+             model_path = Path(config["path"])
+             is_whisper = config["is_whisper"]
+             encoder_name = config["encoder_name"]
+
+             # Load processor
+             if is_whisper:
+                 from transformers import WhisperFeatureExtractor
+                 processor = WhisperFeatureExtractor.from_pretrained(encoder_name)
              else:
+                 from transformers import Wav2Vec2FeatureExtractor
+                 processor = Wav2Vec2FeatureExtractor.from_pretrained(encoder_name)
+
+             # Load model
+             from src.models import SpeakerProfileModel
+
+             model = SpeakerProfileModel(
+                 model_name=encoder_name,
+                 num_gender_classes=2,
+                 num_dialect_classes=3,
+                 dropout=0.1,
+                 freeze_encoder=True
              )
+
+             # Load checkpoint from safetensors
+             checkpoint_path = model_path / "model.safetensors"
+             if checkpoint_path.exists():
+                 state_dict = load_safetensors(str(checkpoint_path))
+                 model.load_state_dict(state_dict)
+                 print(f"Loaded checkpoint: {checkpoint_path}")
              else:
+                 # Try loading from .pt file
+                 pt_path = model_path / "best_model.pt"
+                 if pt_path.exists():
+                     checkpoint = torch.load(pt_path, map_location=self.device, weights_only=False)
+                     if "model_state_dict" in checkpoint:
+                         model.load_state_dict(checkpoint["model_state_dict"])
+                     else:
+                         model.load_state_dict(checkpoint)
+                     print(f"Loaded checkpoint: {pt_path}")
+
+             model.to(self.device)
+             model.eval()
+
+             self.models[model_name] = model
+             self.processors[model_name] = processor
+
+             if self.current_model is None:
+                 self.current_model = model_name
+
+             print(f"✓ {model_name} loaded successfully")
+
+         except Exception as e:
+             print(f"✗ Error loading {model_name}: {e}")
+             import traceback
+             traceback.print_exc()
+
+     def predict(self, audio_path: str, model_name: str):
+         """Predict gender and dialect from audio."""
+         if model_name not in self.models:
+             available = list(self.models.keys())
+             if not available:
+                 return "No models available", "No models available"
+             model_name = available[0]
+
+         try:
+             model = self.models[model_name]
+             processor = self.processors[model_name]
+             is_whisper = MODELS_CONFIG[model_name]["is_whisper"]
+
+             # Load audio
+             waveform, sr = torchaudio.load(audio_path)
+
+             # Convert to mono
+             if waveform.shape[0] > 1:
+                 waveform = waveform.mean(dim=0, keepdim=True)
+
+             # Resample if needed
+             if sr != self.sampling_rate:
+                 resampler = torchaudio.transforms.Resample(sr, self.sampling_rate)
+                 waveform = resampler(waveform)
+
+             waveform = waveform.squeeze(0).numpy()
+
+             # Process based on model type
+             if is_whisper:
+                 # Whisper requires exactly 30 seconds of audio
+                 whisper_length = self.sampling_rate * 30  # 480000 samples
+                 if len(waveform) < whisper_length:
+                     waveform_padded = torch.nn.functional.pad(
+                         torch.tensor(waveform),
+                         (0, whisper_length - len(waveform))
+                     ).numpy()
+                 else:
+                     waveform_padded = waveform[:whisper_length]
+
+                 inputs = processor(
+                     waveform_padded,
+                     sampling_rate=self.sampling_rate,
+                     return_tensors="pt"
+                 )
+                 input_tensor = inputs.input_features.to(self.device)
+             else:
+                 # Wav2Vec2 uses raw waveform
+                 inputs = processor(
+                     waveform,
+                     sampling_rate=self.sampling_rate,
+                     return_tensors="pt",
+                     padding=True
+                 )
+                 input_tensor = inputs.input_values.to(self.device)
+
+             # Inference
+             with torch.no_grad():
+                 gender_logits, dialect_logits = model(input_tensor)
+
+             gender_probs = torch.softmax(gender_logits, dim=-1)
+             dialect_probs = torch.softmax(dialect_logits, dim=-1)
+
+             gender_idx = gender_probs.argmax(dim=-1).item()
+             dialect_idx = dialect_probs.argmax(dim=-1).item()
+
+             gender_conf = gender_probs[0, gender_idx].item() * 100
+             dialect_conf = dialect_probs[0, dialect_idx].item() * 100
+
+             gender_result = f"{GENDER_LABELS[gender_idx]} ({gender_conf:.1f}%)"
+             dialect_result = f"{DIALECT_LABELS[dialect_idx]} ({dialect_conf:.1f}%)"
+
+             return gender_result, dialect_result
+
          except Exception as e:
+             import traceback
+             traceback.print_exc()
+             return f"Error: {str(e)}", f"Error: {str(e)}"
+
+     def get_available_models(self):
+         """Get list of available models."""
+         return list(self.models.keys())
+
+
+ def create_interface():
+     """Create Gradio interface with model selection."""
+
+     profiler = MultiModelProfiler()
+     available_models = profiler.get_available_models()
+
+     if not available_models:
+         available_models = ["No models available"]
+
+     def predict_wrapper(audio, model_name):
+         if audio is None:
+             return "Please upload audio", "Please upload audio"
+         return profiler.predict(audio, model_name)
+
+     # Create model info text
+     model_info = ""
+     for name, config in MODELS_CONFIG.items():
+         status = "✓" if name in profiler.models else "✗"
+         model_info += f"{status} **{name}**: {config['description']}\n"
+
+     with gr.Blocks(title="Vietnamese Speaker Profiling", theme=gr.themes.Soft()) as demo:
+         gr.Markdown(
+             """
+             # 🎙️ Vietnamese Speaker Profiling
+
+             Analyze Vietnamese speech to predict **Gender** and **Dialect Region**.
+
+             Supports multiple AI models - choose the one that works best for you!
+             """
+         )
+
+         with gr.Row():
+             with gr.Column(scale=1):
+                 gr.Markdown("### 📤 Input")
+                 audio_input = gr.Audio(
+                     label="Upload or Record Audio",
+                     type="filepath",
+                     sources=["upload", "microphone"]
+                 )
+
+                 model_dropdown = gr.Dropdown(
+                     choices=available_models,
+                     value=available_models[0] if available_models else None,
+                     label="🤖 Select Model",
+                     info="Choose the AI model for analysis"
+                 )
+
+                 submit_btn = gr.Button("🔍 Analyze", variant="primary", size="lg")
+
+                 gr.Markdown("### ℹ️ Available Models")
+                 gr.Markdown(model_info)
+
+             with gr.Column(scale=1):
+                 gr.Markdown("### 📊 Results")
+                 gender_output = gr.Textbox(label="👤 Gender", interactive=False)
+                 dialect_output = gr.Textbox(label="🗣️ Dialect Region", interactive=False)
+
+                 gr.Markdown(
+                     """
+                     ### 📖 Dialect Regions
+                     - **Northern**: Hanoi and surrounding areas
+                     - **Central**: Huế, Đà Nẵng, and Central Vietnam
+                     - **Southern**: Ho Chi Minh City and Southern Vietnam
+                     """
+                 )
+
+         submit_btn.click(
+             fn=predict_wrapper,
+             inputs=[audio_input, model_dropdown],
+             outputs=[gender_output, dialect_output]
+         )
+
+         gr.Markdown(
+             """
+             ---
+             *Made with ❤️ for Vietnamese Speech Processing Research*
+             """
+         )
+
+     return demo
+
+
  if __name__ == "__main__":
+     demo = create_interface()
+     demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
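The new code imports SpeakerProfileModel from src.models, which is not part of this commit; app.py only assumes its constructor arguments and that the forward pass returns a (gender_logits, dialect_logits) pair. Below is a minimal illustrative stand-in for the Wav2Vec2 case (the PhoWhisper case would wrap the Whisper encoder and consume input_features instead), not the repository's actual implementation:

    import torch
    import torch.nn as nn
    from transformers import Wav2Vec2Model

    class SpeakerProfileModelSketch(nn.Module):
        # Hypothetical stand-in: mirrors only the interface app.py relies on.
        def __init__(self, model_name, num_gender_classes=2, num_dialect_classes=3,
                     dropout=0.1, freeze_encoder=True):
            super().__init__()
            self.encoder = Wav2Vec2Model.from_pretrained(model_name)
            if freeze_encoder:
                for p in self.encoder.parameters():
                    p.requires_grad = False
            hidden = self.encoder.config.hidden_size
            self.dropout = nn.Dropout(dropout)
            self.gender_head = nn.Linear(hidden, num_gender_classes)
            self.dialect_head = nn.Linear(hidden, num_dialect_classes)

        def forward(self, input_values):
            frames = self.encoder(input_values).last_hidden_state  # (batch, time, hidden)
            pooled = self.dropout(frames.mean(dim=1))               # mean pooling as a simple stand-in
            return self.gender_head(pooled), self.dialect_head(pooled)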
model/pho/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0e14ed1dd391d230ba74d231d164e626c1a9e9b865d0c56a87af4351e92b9557
+ size 292648364
model/pho/preprocessor_config.json ADDED
@@ -0,0 +1,14 @@
+ {
+   "chunk_length": 30,
+   "feature_extractor_type": "WhisperFeatureExtractor",
+   "feature_size": 80,
+   "hop_length": 160,
+   "n_fft": 400,
+   "n_samples": 480000,
+   "nb_max_frames": 3000,
+   "padding_side": "right",
+   "padding_value": 0.0,
+   "processor_class": "WhisperProcessor",
+   "return_attention_mask": false,
+   "sampling_rate": 16000
+ }
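The derived fields in this feature-extractor config follow directly from the basic Whisper settings; a quick consistency check (not part of the app):

    chunk_length, sampling_rate, hop_length, feature_size = 30, 16000, 160, 80
    n_samples = chunk_length * sampling_rate   # 30 s * 16 kHz = 480000, matches "n_samples"
    nb_max_frames = n_samples // hop_length    # 480000 / 160 = 3000, matches "nb_max_frames"
    # WhisperFeatureExtractor therefore yields input_features of shape (1, 80, 3000) per clip
    print(n_samples, nb_max_frames, (1, feature_size, nb_max_frames))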
model/pho/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8581872846c31ff58536a0780aed646dd2c25671e9318390007d5784c62dc39d
+ size 5176