Spaces:
Running
on
Zero
Running
on
Zero
| import gradio as gr | |
| import spaces | |
| import torch | |
| from datasets import load_dataset, Audio | |
| from transformers import MoonshineForConditionalGeneration, AutoProcessor | |
| from datasets import load_dataset, Audio | |
# Run on the first CUDA device when one is visible to torch; otherwise use the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Half precision speeds up GPU inference; the CPU path stays in full precision.
torch_dtype = torch.float32 if device == "cpu" else torch.float16

# Fetch the pretrained Moonshine checkpoint, then place it on the chosen
# device and cast it to the chosen dtype (same order as device -> dtype).
model = MoonshineForConditionalGeneration.from_pretrained('UsefulSensors/moonshine-tiny')
model = model.to(device)
model = model.to(torch_dtype)

# The processor bundles the feature extractor (audio -> model inputs) and the
# tokenizer used to decode generated ids back into text.
processor = AutoProcessor.from_pretrained('UsefulSensors/moonshine-tiny')
# Function to transcribe audio
# On ZeroGPU Spaces a GPU is only attached while a `spaces.GPU`-decorated
# function is running, so the inference entry point must carry the decorator.
@spaces.GPU
def transcribe_audio(audio_file):
    """Transcribe the audio stored at ``audio_file`` with the Moonshine model.

    Args:
        audio_file: Filesystem path of the uploaded or recorded clip, as
            delivered by the ``gr.Audio(type="filepath")`` components. It is
            ``None`` when the user clicks the button without providing audio.

    Returns:
        The decoded transcription as a string, with special tokens removed.

    Raises:
        gr.Error: If no audio was provided.
    """
    if not audio_file:
        # Surface a readable message in the UI instead of a decoding traceback.
        raise gr.Error("Please upload or record audio before transcribing.")

    sampling_rate = processor.feature_extractor.sampling_rate

    # Decode the user's file (previously a fixed sample from a dummy dataset
    # was transcribed regardless of the input) at the rate the model expects.
    sample = Audio(sampling_rate=sampling_rate).decode_example(
        {"path": audio_file, "bytes": None}
    )

    inputs = processor(
        sample["array"],
        return_tensors="pt",
        sampling_rate=sampling_rate,
    )
    inputs = inputs.to(device, torch_dtype)

    # To avoid hallucination loops, cap the generated length based on the
    # expected number of tokens per second of audio (at most 6.5 tokens/s).
    # The attention mask sum is the number of audio samples, so dividing the
    # factor by the sampling rate converts samples to seconds.
    token_limit_factor = 6.5 / sampling_rate
    seq_lens = inputs.attention_mask.sum(dim=-1)
    max_length = int((seq_lens * token_limit_factor).max().item())

    generated_ids = model.generate(**inputs, max_length=max_length)
    return processor.decode(generated_ids[0], skip_special_tokens=True)
# Build the Gradio UI: a title, one tab per audio source (file upload and
# microphone), each wired to the same transcription function, then usage notes.
with gr.Blocks() as demo:
    gr.Markdown("## Audio Transcription App")

    with gr.Tabs():
        # Tab 1: transcribe an audio file picked from disk.
        with gr.TabItem("Upload Audio"):
            audio_file = gr.Audio(label="Upload Audio File", type="filepath", sources=["upload"])
            output_text1 = gr.Textbox(
                placeholder="Transcription will appear here...",
                label="Transcription",
            )
            upload_button = gr.Button("Transcribe Uploaded Audio")
            upload_button.click(transcribe_audio, inputs=audio_file, outputs=output_text1)

        # Tab 2: transcribe a clip recorded with the microphone.
        with gr.TabItem("Record Audio"):
            audio_mic = gr.Audio(label="Record Audio", type="filepath", sources=["microphone"])
            output_text2 = gr.Textbox(
                placeholder="Transcription will appear here...",
                label="Transcription",
            )
            record_button = gr.Button("Transcribe Recorded Audio")
            record_button.click(transcribe_audio, inputs=audio_mic, outputs=output_text2)

    # Short usage guide shown below the tabs.
    gr.Markdown("""
    ### Instructions:
    1. Choose either 'Upload Audio' or 'Record Audio' tab
    2. Upload an audio file or record using your microphone
    3. Click the respective 'Transcribe' button
    4. Wait for the transcription to appear
    """)

# Start the web server only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()