cool committed
Update app.py
app.py CHANGED
@@ -1,45 +1,39 @@
-import torch
 import gradio as gr
-[… old lines 3–40 were also removed; their content is not recoverable from this view …]
-    inputs=[microphone_input],
-    outputs=gr.Textbox(label="Transcription"),
-)
-
-demo.launch()
+import nemo.collections.asr as nemo_asr
+import numpy as np
+import torch
+
+# Load the pre-trained Kabyle ASR model
+asr_model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained("nvidia/stt_kab_conformer_transducer_large")
+
+# Function to transcribe the audio input
+def transcribe(audio):
+    # Print the raw audio input
+    print(f"Raw audio input: {audio}")
+
+    # Audio in Gradio is returned as a tuple (sample_rate, audio_data)
+    sample_rate, audio_data = audio
+
+    # Print to check the types
+    print(f"Audio data type: {type(audio_data)}")
+    print(f"Sample rate type: {type(sample_rate)}")
+
+    # Ensure the audio data is in numpy array format
+    if isinstance(audio_data, np.ndarray):
+        # If it's already numpy, we pass it directly
+        audio_data = np.array(audio_data)
+    elif isinstance(audio_data, torch.Tensor):
+        # If it's a tensor, convert to numpy array
+        audio_data = audio_data.numpy()
+    else:
+        print("Error: Audio data is neither a numpy array nor a tensor.")
+        return "Invalid audio format"
+
+    # Now transcribe the audio
+    return asr_model.transcribe([audio_data])
+
+# Create the Gradio interface with audio input and text output
+iface = gr.Interface(fn=transcribe, inputs="audio", outputs="text")
+
+# Launch the Gradio interface
+iface.launch()
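Note on the new transcribe(): the committed code forwards the raw Gradio buffer to asr_model.transcribe() as-is. Gradio's microphone component delivers int16 PCM at whatever rate the browser recorded, while NVIDIA's Conformer checkpoints are trained on 16 kHz mono audio, and some NeMo releases only accept file paths in transcribe(). Below is a minimal, hedged sketch of the same function with that preprocessing added; transcribe_safe, TARGET_SR, and the soundfile/librosa dependencies are illustrative assumptions, not part of this commit, and asr_model is the handle loaded above.

import tempfile

import numpy as np
import soundfile as sf  # assumed available in the Space

TARGET_SR = 16_000  # NeMo Conformer checkpoints expect 16 kHz mono audio

def transcribe_safe(audio):
    # Same Gradio contract as transcribe(): audio is (sample_rate, audio_data)
    sample_rate, audio_data = audio

    # Gradio microphone input arrives as int16 PCM; scale to float32 in [-1, 1]
    if audio_data.dtype == np.int16:
        audio_data = audio_data.astype(np.float32) / 32768.0

    # Downmix stereo to mono if the browser recorded two channels
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)

    # Resample if the recording rate differs from what the model expects
    if sample_rate != TARGET_SR:
        import librosa  # assumed available; any resampler would do
        audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=TARGET_SR)

    # Write a temporary WAV and pass the path, which NeMo versions accept broadly
    with tempfile.NamedTemporaryFile(suffix=".wav") as tmp:
        sf.write(tmp.name, audio_data, TARGET_SR)
        result = asr_model.transcribe([tmp.name])
        # RNNT models in some NeMo versions return a (best_hyps, all_hyps) tuple
        if isinstance(result, tuple):
            result = result[0]
        # result[0] may be a plain string or a Hypothesis object whose .text
        # holds the transcript, depending on the NeMo version
        return result[0]

Wiring this in would only change the interface line to iface = gr.Interface(fn=transcribe_safe, inputs="audio", outputs="text").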