Spaces: Running on Zero
File size: 3,331 Bytes
import gradio as gr
import spaces
import torch
import soundfile as sf
import numpy as np
import librosa
import math
from transformers import MoonshineForConditionalGeneration, AutoProcessor
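# Use the GPU in half precision when available; otherwise fall back to CPU fp32.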
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
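# Moonshine tiny (~27M parameters) speech-to-text model and its matching processor.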
model = MoonshineForConditionalGeneration.from_pretrained("UsefulSensors/moonshine-tiny").to(device).to(torch_dtype)
processor = AutoProcessor.from_pretrained("UsefulSensors/moonshine-tiny")
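# Token budget heuristic: scale max_new_tokens with clip length (~12 tokens/sec),
# with a floor so very short clips aren't truncated and a cap to bound generation time.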
TOKENS_PER_SEC = 12.0
MIN_NEW_TOKENS = 48
MAX_NEW_TOKENS_CAP = 1600
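# On ZeroGPU Spaces, @spaces.GPU attaches a GPU to this function for the duration of each call.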
@spaces.GPU
def transcribe_audio(audio_file):
    if not audio_file:
        return "No audio provided."
    audio_array, sr = sf.read(audio_file)
    if audio_array.ndim > 1:
        audio_array = np.mean(audio_array, axis=1)
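    # Resample to the feature extractor's expected rate (16 kHz for Moonshine).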
    target_sr = processor.feature_extractor.sampling_rate
    if sr != target_sr:
        audio_array = librosa.resample(audio_array, orig_sr=sr, target_sr=target_sr)
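    # Featurize and move tensors to the model's device; cast only floating-point
    # tensors to torch_dtype so any integer masks keep their dtype.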
    inputs = processor(audio_array, sampling_rate=target_sr, return_tensors="pt")
    inputs = {
        k: v.to(device=device, dtype=torch_dtype) if v.is_floating_point() else v.to(device)
        for k, v in inputs.items()
    }
    duration_sec = len(audio_array) / float(target_sr)
    max_new_tokens = min(MAX_NEW_TOKENS_CAP, max(MIN_NEW_TOKENS, int(math.ceil(duration_sec * TOKENS_PER_SEC))))
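    # Greedy decoding; n-gram blocking and a mild repetition penalty curb loops.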
    generated_ids = model.generate(**inputs, do_sample=False, max_new_tokens=max_new_tokens, no_repeat_ngram_size=4, repetition_penalty=1.05)
    return processor.decode(generated_ids[0], skip_special_tokens=True)
theme = gr.themes.Ocean(primary_hue="indigo", secondary_hue="fuchsia", neutral_hue="slate").set(button_large_radius="*radius_sm")
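# Two-tab Gradio UI: upload a file or record from the microphone.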
with gr.Blocks(theme=theme) as demo:
    gr.Markdown("## Moonshine Tiny STT - 27M Parameters")
    gr.HTML("""
    <div style="width: 100%; margin-bottom: 20px;">
        <img src="https://huggingface.co/spaces/ACloudCenter/moonshine-tiny-STT/resolve/main/public/images/banner.png"
             style="width: 100%; height: auto; border-radius: 15px; box-shadow: 0 10px 40px rgba(0,0,0,0.2);"
             alt="Moonshine Banner">
    </div>
    """)
    with gr.Tabs():
        with gr.TabItem("Upload Audio"):
            audio_file = gr.Audio(sources=["upload"], type="filepath", label="Upload Audio File")
            output_text1 = gr.Textbox(label="Transcription", placeholder="Transcription will appear here...", lines=10, autoscroll=True)
            upload_button = gr.Button("Transcribe Uploaded Audio")
            upload_button.click(fn=transcribe_audio, inputs=audio_file, outputs=output_text1)
with gr.TabItem("Record Audio"):
audio_mic = gr.Audio(sources=["microphone"], type="filepath", label="Record Audio")
output_text2 = gr.Textbox(label="Transcription", placeholder="Transcription will appear here...", lines=10, autoscroll=True)
record_button = gr.Button("Transcribe Recorded Audio")
record_button.click(fn=transcribe_audio, inputs=audio_mic, outputs=output_text2)
gr.Markdown("""
### Instructions:
1. Choose either 'Upload Audio' or 'Record Audio' tab
2. Upload an audio file or record using your microphone
3. Click the respective 'Transcribe' button
4. Wait for the transcription to appear
""")
if __name__ == "__main__":
    demo.launch()