Spaces:

ACloudCenter
/

moonshine-tiny-STT

Running on Zero

App Files Community

ACloudCenter committed on Oct 14

Commit

879352b

1 Parent(s): 270d2e5

Modify readme icon and gradio theme

Browse files

Files changed (2) hide show

README.md +1 -1
app.py +21 -4

README.md CHANGED Viewed

@@ -1,6 +1,6 @@
 ---
 title: Moonshine Tiny STT
-emoji: 🏢
 colorFrom: blue
 colorTo: red
 sdk: gradio

 ---
 title: Moonshine Tiny STT
+emoji:  🏆
 colorFrom: blue
 colorTo: red
 sdk: gradio

app.py CHANGED Viewed

@@ -7,39 +7,56 @@ import librosa
 import math
 from transformers import MoonshineForConditionalGeneration, AutoProcessor
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
 model = MoonshineForConditionalGeneration.from_pretrained('UsefulSensors/moonshine-tiny').to(device).to(torch_dtype)
 processor = AutoProcessor.from_pretrained('UsefulSensors/moonshine-tiny')
 @spaces.GPU
 def transcribe_audio(audio_file):
     if not audio_file:
         return "No audio provided."
     audio_array, sr = sf.read(audio_file)
     if audio_array.ndim > 1:
-        audio_array = np.mean(audio_array, axis=1)
     target_sr = processor.feature_extractor.sampling_rate
     if sr != target_sr:
         audio_array = librosa.resample(audio_array, orig_sr=sr, target_sr=target_sr)
     inputs = processor(
         audio_array,
         sampling_rate=target_sr,
         return_tensors="pt"
     ).to(device, torch_dtype)
     duration_sec = len(audio_array) / float(target_sr)
     max_new_tokens = max(24, int(math.ceil(duration_sec * 7.0)))
     generated_ids = model.generate(**inputs, do_sample=False, max_new_tokens=max_new_tokens)
-    return processor.decode(generated_ids[0], skip_special_tokens=True)
-with gr.Blocks() as demo:
     gr.Markdown("## Audio Transcription App")
     with gr.Tabs():

 import math
 from transformers import MoonshineForConditionalGeneration, AutoProcessor
+# Use GPU if available and set appropriate dtype
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+# Load model and processor - Moonshine Tiny
 model = MoonshineForConditionalGeneration.from_pretrained('UsefulSensors/moonshine-tiny').to(device).to(torch_dtype)
 processor = AutoProcessor.from_pretrained('UsefulSensors/moonshine-tiny')
+# Define transcription function using HF Zero GPU
 @spaces.GPU
 def transcribe_audio(audio_file):
     if not audio_file:
         return "No audio provided."
+    # Load and preprocess audio
     audio_array, sr = sf.read(audio_file)
     if audio_array.ndim > 1:
+        audio_array = np.mean(audio_array, axis=1)
+    # Resample if necessary in case the audio file has a different sampling rate
     target_sr = processor.feature_extractor.sampling_rate
     if sr != target_sr:
         audio_array = librosa.resample(audio_array, orig_sr=sr, target_sr=target_sr)
+    # Prepare inputs for the model - ensure correct dtype and device
     inputs = processor(
         audio_array,
         sampling_rate=target_sr,
         return_tensors="pt"
     ).to(device, torch_dtype)
+    # Duration-based max_new_tokens calculation
     duration_sec = len(audio_array) / float(target_sr)
     max_new_tokens = max(24, int(math.ceil(duration_sec * 7.0)))
+    # Generate transcription with adjusted max_new_tokens
     generated_ids = model.generate(**inputs, do_sample=False, max_new_tokens=max_new_tokens)
+    return processor.decode(generated_ids[0], skip_special_tokens=True) # Decode the generated IDs to text
+# Set Gradio theme
+theme = gr.themes.Ocean(
+    primary_hue="indigo",
+    secondary_hue="fuchsia",
+    neutral_hue="slate",
+).set(
+    button_large_radius='*radius_sm'
+)
+# Create Gradio interface
+with gr.Blocks(theme=theme) as demo:
     gr.Markdown("## Audio Transcription App")
     with gr.Tabs():