ACloudCenter committed
Commit ec2f83b · verified · 1 Parent(s): 1f6640c

Update app.py

Files changed (1): app.py +10 -2
app.py CHANGED

@@ -15,6 +15,11 @@ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
 model = MoonshineForConditionalGeneration.from_pretrained('UsefulSensors/moonshine-tiny').to(device).to(torch_dtype)
 processor = AutoProcessor.from_pretrained('UsefulSensors/moonshine-tiny')
 
+# --- Longer token limits (simple) ---
+TOKENS_PER_SEC = 12.0        # was ~7.0 before
+MIN_NEW_TOKENS = 48          # was 24; gives short clips more room
+MAX_NEW_TOKENS_CAP = 3200    # generous cap to avoid runaway
+
 # Define transcription function using HF Zero GPU
 @spaces.GPU
 def transcribe_audio(audio_file):
@@ -38,9 +43,12 @@ def transcribe_audio(audio_file):
         return_tensors="pt"
     ).to(device, torch_dtype)
 
-    # Duration-based max_new_tokens calculation
+    # Duration-based max_new_tokens calculation (longer limits)
     duration_sec = len(audio_array) / float(target_sr)
-    max_new_tokens = max(24, int(math.ceil(duration_sec * 7.0)))
+    max_new_tokens = min(
+        MAX_NEW_TOKENS_CAP,
+        max(MIN_NEW_TOKENS, int(math.ceil(duration_sec * TOKENS_PER_SEC)))
+    )
 
     # Generate transcription with adjusted max_new_tokens
     generated_ids = model.generate(**inputs, do_sample=False, max_new_tokens=max_new_tokens)
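
For reference, a minimal standalone sketch of the clamped token-budget logic this commit introduces. The helper name estimate_max_new_tokens and the 16 kHz sample rate used in the sanity checks are illustrative assumptions, not part of the commit itself:

import math

# Constants as introduced by this commit
TOKENS_PER_SEC = 12.0        # token budget per second of audio
MIN_NEW_TOKENS = 48          # floor so short clips still get room
MAX_NEW_TOKENS_CAP = 3200    # ceiling to avoid runaway generation

def estimate_max_new_tokens(num_samples: int, sample_rate: int) -> int:
    """Clamp a duration-proportional token budget to [MIN_NEW_TOKENS, MAX_NEW_TOKENS_CAP]."""
    duration_sec = num_samples / float(sample_rate)
    return min(MAX_NEW_TOKENS_CAP,
               max(MIN_NEW_TOKENS, int(math.ceil(duration_sec * TOKENS_PER_SEC))))

# Sanity checks (assuming 16 kHz audio):
assert estimate_max_new_tokens(2 * 16000, 16000) == 48      # 2 s: ceil(2 * 12) = 24 -> floor 48
assert estimate_max_new_tokens(10 * 16000, 16000) == 120    # 10 s: ceil(10 * 12) = 120
assert estimate_max_new_tokens(600 * 16000, 16000) == 3200  # 10 min: 7200 -> capped at 3200

The floor keeps clips shorter than 4 s (48 tokens at 12 tokens per second) from being truncated, while the cap bounds generation time on very long uploads.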