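"""Gradio demo: speech-to-text transcription with Moonshine Tiny (UsefulSensors) on Hugging Face Spaces."""
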
import gradio as gr
import spaces
import torch
import soundfile as sf
import numpy as np
import librosa
import math
from transformers import MoonshineForConditionalGeneration, AutoProcessor

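# Prefer GPU with half precision when available; otherwise fall back to CPU/float32.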
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

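# Load the ~27M-parameter Moonshine Tiny checkpoint and its matching processor.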
model = MoonshineForConditionalGeneration.from_pretrained("UsefulSensors/moonshine-tiny").to(device).to(torch_dtype)
processor = AutoProcessor.from_pretrained("UsefulSensors/moonshine-tiny")

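# Generation budget: scale max_new_tokens with clip duration so long recordings
# are not truncated, clamped between a floor for very short clips and a hard cap.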
TOKENS_PER_SEC = 12.0
MIN_NEW_TOKENS = 48
MAX_NEW_TOKENS_CAP = 1600

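# Request a GPU allocation per call when running on ZeroGPU Spaces.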
@spaces.GPU
def transcribe_audio(audio_file):
    if not audio_file:
        return "No audio provided."
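    # Load the file and downmix multi-channel audio to mono.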
    audio_array, sr = sf.read(audio_file)
    if audio_array.ndim > 1:
        audio_array = np.mean(audio_array, axis=1)
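    # Resample to the rate the feature extractor expects (16 kHz for Moonshine).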
    target_sr = processor.feature_extractor.sampling_rate
    if sr != target_sr:
        audio_array = librosa.resample(audio_array, orig_sr=sr, target_sr=target_sr)
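    # Convert the waveform into model inputs.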
    inputs = processor(audio_array, sampling_rate=target_sr, return_tensors="pt")
    # Move tensors to the device; cast only floating-point tensors to torch_dtype
    # so integer tensors (e.g., an attention mask) keep their dtype.
    inputs = {
        k: v.to(device=device, dtype=torch_dtype) if v.is_floating_point() else v.to(device)
        for k, v in inputs.items()
    }
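    # Derive the token budget from the clip length (see constants above).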
    duration_sec = len(audio_array) / float(target_sr)
    max_new_tokens = min(MAX_NEW_TOKENS_CAP, max(MIN_NEW_TOKENS, int(math.ceil(duration_sec * TOKENS_PER_SEC))))
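    # Greedy decoding; n-gram blocking and a mild repetition penalty guard
    # against the transcript looping on long or noisy inputs.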
    generated_ids = model.generate(
        **inputs,
        do_sample=False,
        max_new_tokens=max_new_tokens,
        no_repeat_ngram_size=4,
        repetition_penalty=1.05,
    )
    return processor.decode(generated_ids[0], skip_special_tokens=True)

theme = gr.themes.Ocean(primary_hue="indigo", secondary_hue="fuchsia", neutral_hue="slate").set(button_large_radius="*radius_sm")

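# Two tabs (upload vs. microphone) share the same transcription handler.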
with gr.Blocks(theme=theme) as demo:
    gr.Markdown("## Moonshine Tiny STT - 27M Parameters")
    gr.HTML("""
        <div style="width: 100%; margin-bottom: 20px;">
            <img src="https://huggingface.co/spaces/ACloudCenter/moonshine-tiny-STT/resolve/main/public/images/banner.png" 
                style="width: 100%; height: auto; border-radius: 15px; box-shadow: 0 10px 40px rgba(0,0,0,0.2);"
                alt="Moonshine Tiny STT Banner">
        </div>
        """)
    with gr.Tabs():
        with gr.TabItem("Upload Audio"):
            audio_file = gr.Audio(sources=["upload"], type="filepath", label="Upload Audio File")
            output_text1 = gr.Textbox(label="Transcription", placeholder="Transcription will appear here...", lines=10, autoscroll=True)
            upload_button = gr.Button("Transcribe Uploaded Audio")
            upload_button.click(fn=transcribe_audio, inputs=audio_file, outputs=output_text1)
        with gr.TabItem("Record Audio"):
            audio_mic = gr.Audio(sources=["microphone"], type="filepath", label="Record Audio")
            output_text2 = gr.Textbox(label="Transcription", placeholder="Transcription will appear here...", lines=10, autoscroll=True)
            record_button = gr.Button("Transcribe Recorded Audio")
            record_button.click(fn=transcribe_audio, inputs=audio_mic, outputs=output_text2)
    gr.Markdown("""
    ### Instructions:
    1. Choose either the 'Upload Audio' or the 'Record Audio' tab
    2. Upload an audio file or record using your microphone
    3. Click the respective 'Transcribe' button
    4. Wait for the transcription to appear
    """)

if __name__ == "__main__":
    demo.launch()