ACloudCenter committed on
Commit
879352b
·
1 Parent(s): 270d2e5

Modify readme icon and gradio theme

Browse files
Files changed (2) hide show
  1. README.md +1 -1
  2. app.py +21 -4
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
  title: Moonshine Tiny STT
3
- emoji: 🏒
4
  colorFrom: blue
5
  colorTo: red
6
  sdk: gradio
 
1
  ---
2
  title: Moonshine Tiny STT
3
+ emoji: πŸ†
4
  colorFrom: blue
5
  colorTo: red
6
  sdk: gradio
app.py CHANGED
@@ -7,39 +7,56 @@ import librosa
7
  import math
8
  from transformers import MoonshineForConditionalGeneration, AutoProcessor
9
 
10
-
11
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
12
  torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
13
 
 
14
  model = MoonshineForConditionalGeneration.from_pretrained('UsefulSensors/moonshine-tiny').to(device).to(torch_dtype)
15
  processor = AutoProcessor.from_pretrained('UsefulSensors/moonshine-tiny')
16
 
 
17
  @spaces.GPU
18
  def transcribe_audio(audio_file):
19
  if not audio_file:
20
  return "No audio provided."
21
 
 
22
  audio_array, sr = sf.read(audio_file)
23
  if audio_array.ndim > 1:
24
- audio_array = np.mean(audio_array, axis=1)
25
 
 
26
  target_sr = processor.feature_extractor.sampling_rate
27
  if sr != target_sr:
28
  audio_array = librosa.resample(audio_array, orig_sr=sr, target_sr=target_sr)
29
 
 
30
  inputs = processor(
31
  audio_array,
32
  sampling_rate=target_sr,
33
  return_tensors="pt"
34
  ).to(device, torch_dtype)
35
 
 
36
  duration_sec = len(audio_array) / float(target_sr)
37
  max_new_tokens = max(24, int(math.ceil(duration_sec * 7.0)))
38
 
 
39
  generated_ids = model.generate(**inputs, do_sample=False, max_new_tokens=max_new_tokens)
40
- return processor.decode(generated_ids[0], skip_special_tokens=True)
 
 
 
 
 
 
 
 
 
41
 
42
- with gr.Blocks() as demo:
 
43
  gr.Markdown("## Audio Transcription App")
44
 
45
  with gr.Tabs():
 
7
  import math
8
  from transformers import MoonshineForConditionalGeneration, AutoProcessor
9
 
10
+ # Use GPU if available and set appropriate dtype
11
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
12
  torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
13
 
14
+ # Load model and processor - Moonshine Tiny
15
  model = MoonshineForConditionalGeneration.from_pretrained('UsefulSensors/moonshine-tiny').to(device).to(torch_dtype)
16
  processor = AutoProcessor.from_pretrained('UsefulSensors/moonshine-tiny')
17
 
18
+ # Define transcription function using HF Zero GPU
19
  @spaces.GPU
20
  def transcribe_audio(audio_file):
21
  if not audio_file:
22
  return "No audio provided."
23
 
24
+ # Load and preprocess audio
25
  audio_array, sr = sf.read(audio_file)
26
  if audio_array.ndim > 1:
27
+ audio_array = np.mean(audio_array, axis=1)
28
 
29
+ # Resample if necessary in case the audio file has a different sampling rate
30
  target_sr = processor.feature_extractor.sampling_rate
31
  if sr != target_sr:
32
  audio_array = librosa.resample(audio_array, orig_sr=sr, target_sr=target_sr)
33
 
34
+ # Prepare inputs for the model - ensure correct dtype and device
35
  inputs = processor(
36
  audio_array,
37
  sampling_rate=target_sr,
38
  return_tensors="pt"
39
  ).to(device, torch_dtype)
40
 
41
+ # Duration-based max_new_tokens calculation
42
  duration_sec = len(audio_array) / float(target_sr)
43
  max_new_tokens = max(24, int(math.ceil(duration_sec * 7.0)))
44
 
45
+ # Generate transcription with adjusted max_new_tokens
46
  generated_ids = model.generate(**inputs, do_sample=False, max_new_tokens=max_new_tokens)
47
+ return processor.decode(generated_ids[0], skip_special_tokens=True) # Decode the generated IDs to text
48
+
49
+ # Set Gradio theme
50
+ theme = gr.themes.Ocean(
51
+ primary_hue="indigo",
52
+ secondary_hue="fuchsia",
53
+ neutral_hue="slate",
54
+ ).set(
55
+ button_large_radius='*radius_sm'
56
+ )
57
 
58
+ # Create Gradio interface
59
+ with gr.Blocks(theme=theme) as demo:
60
  gr.Markdown("## Audio Transcription App")
61
 
62
  with gr.Tabs():