frimelle HF Staff committed on
Commit
63d0469
·
1 Parent(s): 182a1f7

English-only setup

Browse files
Files changed (1) hide show
  1. app.py +56 -37
app.py CHANGED
@@ -6,7 +6,7 @@ import torch
6
  from functools import lru_cache
7
  from transformers import pipeline
8
 
9
- # -------- Sentences to practice (customize freely) ----------
10
  SENTENCE_BANK = [
11
  "The quick brown fox jumps over the lazy dog.",
12
  "I promise to speak clearly and at a steady pace.",
@@ -20,12 +20,11 @@ SENTENCE_BANK = [
20
  "This microphone test checks my pronunciation accuracy.",
21
  ]
22
 
23
- # -------- Utilities ----------
24
  def normalize_text(t: str) -> str:
 
25
  t = t.lower()
26
- # keep letters and numbers, replace anything else with space
27
- t = re.sub(r"[^a-z0-9'äöüßçéèêáàóòúùîïôñ\-]+", " ", t)
28
- # collapse whitespace
29
  t = re.sub(r"\s+", " ", t).strip()
30
  return t
31
 
@@ -36,22 +35,29 @@ def similarity_and_diff(ref: str, hyp: str):
36
  sm = difflib.SequenceMatcher(a=ref_tokens, b=hyp_tokens)
37
  ratio = sm.ratio()
38
 
39
- # Build HTML with insertions/deletions highlighted
40
  out = []
41
  for op, i1, i2, j1, j2 in sm.get_opcodes():
42
  if op == "equal":
43
  out.append(" " + " ".join(ref_tokens[i1:i2]))
44
  elif op == "delete":
45
- out.append(' <span style="background:#ffe0e0;text-decoration:line-through;">'
46
- + " ".join(ref_tokens[i1:i2]) + "</span>")
 
 
47
  elif op == "insert":
48
- out.append(' <span style="background:#e0ffe0;">'
49
- + " ".join(hyp_tokens[j1:j2]) + "</span>")
 
 
50
  elif op == "replace":
51
- out.append(' <span style="background:#ffe0e0;text-decoration:line-through;">'
52
- + " ".join(ref_tokens[i1:i2]) + "</span>")
53
- out.append(' <span style="background:#e0ffe0;">'
54
- + " ".join(hyp_tokens[j1:j2]) + "</span>")
 
 
 
 
55
  html = '<div style="line-height:1.6;font-size:1rem;">' + "".join(out).strip() + "</div>"
56
  return ratio, html
57
 
@@ -66,7 +72,7 @@ def get_asr(model_id: str, device_preference: str):
66
  device = -1
67
  return pipeline(
68
  "automatic-speech-recognition",
69
- model=model_id,
70
  device=device,
71
  chunk_length_s=30,
72
  return_timestamps=False,
@@ -75,14 +81,20 @@ def get_asr(model_id: str, device_preference: str):
75
  def gen_sentence():
76
  return random.choice(SENTENCE_BANK)
77
 
 
 
 
 
 
78
  def check_pronunciation(audio_path, target_sentence, model_id, device_pref, pass_threshold):
79
  if not target_sentence:
80
- return gr.update(value=""), gr.update(value=""), gr.update(value=""), gr.update(value="Please generate a sentence first.")
81
 
82
  asr = get_asr(model_id, device_pref)
83
 
84
  try:
85
- result = asr(audio_path) # no language/task args for English-only models
 
86
  hyp_raw = result["text"].strip()
87
  except Exception as e:
88
  return "", "", "", f"Transcription failed: {e}"
@@ -102,36 +114,43 @@ def check_pronunciation(audio_path, target_sentence, model_id, device_pref, pass
102
 
103
  return hyp_raw, score, diff_html, summary
104
 
105
- with gr.Blocks(title="Say the Sentence") as demo:
 
106
  gr.Markdown(
107
  """
108
- # 🎤 Say the Sentence
109
  1) Generate a sentence.
110
- 2) Press the mic to record yourself reading it.
111
- 3) Transcribe & check.
112
  """
113
  )
114
 
115
  with gr.Row():
116
  target = gr.Textbox(label="Target sentence", interactive=False, placeholder="Click 'Generate sentence'")
 
117
  with gr.Row():
118
  btn_gen = gr.Button("🎲 Generate sentence", variant="primary")
119
  btn_clear = gr.Button("🧹 Clear")
120
 
121
  with gr.Row():
122
  audio = gr.Audio(sources=["microphone"], type="filepath", label="Record your voice")
 
123
  with gr.Accordion("Advanced settings", open=False):
124
- model_id = gr.Dropdown(
125
- choices=[
126
- "openai/whisper-tiny.en", # fastest
127
- "openai/whisper-base.en", # slightly better accuracy
128
- "distil-whisper/distil-small.en", # optional
129
- ],
130
- value="openai/whisper-tiny.en",
131
- label="ASR model (English only)",
132
- )
133
- device_pref = gr.Radio(choices=["auto", "cpu", "cuda"], value="auto", label="Device preference")
134
- pass_threshold = gr.Slider(0.50, 1.00, value=0.85, step=0.01, label="Match threshold")
 
 
 
 
135
 
136
  with gr.Row():
137
  btn_check = gr.Button("✅ Transcribe & Check", variant="primary")
@@ -145,12 +164,12 @@ with gr.Blocks(title="Say the Sentence") as demo:
145
 
146
  # Events
147
  btn_gen.click(fn=gen_sentence, outputs=target)
148
- btn_clear.click(fn=lambda: ("", "", "", "", ""), outputs=[target, hyp_out, score_out, diff_out, summary_out])
149
  btn_check.click(
150
- fn=check_pronunciation,
151
- inputs=[audio, target, model_id, device_pref, pass_threshold],
152
- outputs=[hyp_out, score_out, diff_out, summary_out]
153
  )
154
 
155
  if __name__ == "__main__":
156
- demo.launch()
 
6
  from functools import lru_cache
7
  from transformers import pipeline
8
 
9
+ # ------------------- Sentence Bank (customize freely) -------------------
10
  SENTENCE_BANK = [
11
  "The quick brown fox jumps over the lazy dog.",
12
  "I promise to speak clearly and at a steady pace.",
 
20
  "This microphone test checks my pronunciation accuracy.",
21
  ]
22
 
23
+ # ------------------- Utilities -------------------
24
  def normalize_text(t: str) -> str:
25
+ # English-only normalization: lowercase, keep letters/digits/' and -
26
  t = t.lower()
27
+ t = re.sub(r"[^a-z0-9'\-]+", " ", t)
 
 
28
  t = re.sub(r"\s+", " ", t).strip()
29
  return t
30
 
 
35
  sm = difflib.SequenceMatcher(a=ref_tokens, b=hyp_tokens)
36
  ratio = sm.ratio()
37
 
 
38
  out = []
39
  for op, i1, i2, j1, j2 in sm.get_opcodes():
40
  if op == "equal":
41
  out.append(" " + " ".join(ref_tokens[i1:i2]))
42
  elif op == "delete":
43
+ out.append(
44
+ ' <span style="background:#ffe0e0;text-decoration:line-through;">'
45
+ + " ".join(ref_tokens[i1:i2]) + "</span>"
46
+ )
47
  elif op == "insert":
48
+ out.append(
49
+ ' <span style="background:#e0ffe0;">'
50
+ + " ".join(hyp_tokens[j1:j2]) + "</span>"
51
+ )
52
  elif op == "replace":
53
+ out.append(
54
+ ' <span style="background:#ffe0e0;text-decoration:line-through;">'
55
+ + " ".join(ref_tokens[i1:i2]) + "</span>"
56
+ )
57
+ out.append(
58
+ ' <span style="background:#e0ffe0;">'
59
+ + " ".join(hyp_tokens[j1:j2]) + "</span>"
60
+ )
61
  html = '<div style="line-height:1.6;font-size:1rem;">' + "".join(out).strip() + "</div>"
62
  return ratio, html
63
 
 
72
  device = -1
73
  return pipeline(
74
  "automatic-speech-recognition",
75
+ model=model_id, # use English-only Whisper models (.en)
76
  device=device,
77
  chunk_length_s=30,
78
  return_timestamps=False,
 
81
  def gen_sentence():
82
  return random.choice(SENTENCE_BANK)
83
 
84
def clear_all():
    """Return an empty string for each of the five outputs reset by Clear."""
    # One slot per component, in the order the Clear button lists them.
    cleared_fields = ("target", "hyp_out", "score_out", "diff_out", "summary_out")
    return tuple("" for _ in cleared_fields)
87
+
88
+ # ------------------- Core Check (English-only) -------------------
89
  def check_pronunciation(audio_path, target_sentence, model_id, device_pref, pass_threshold):
90
  if not target_sentence:
91
+ return "", "", "", "Please generate a sentence first."
92
 
93
  asr = get_asr(model_id, device_pref)
94
 
95
  try:
96
+ # IMPORTANT: For English-only Whisper (.en), do NOT pass language/task args.
97
+ result = asr(audio_path)
98
  hyp_raw = result["text"].strip()
99
  except Exception as e:
100
  return "", "", "", f"Transcription failed: {e}"
 
114
 
115
  return hyp_raw, score, diff_html, summary
116
 
117
+ # ------------------- UI -------------------
118
+ with gr.Blocks(title="Say the Sentence (English)") as demo:
119
  gr.Markdown(
120
  """
121
+ # 🎤 Say the Sentence (English)
122
  1) Generate a sentence.
123
+ 2) Record yourself reading it.
124
+ 3) Transcribe & check your accuracy.
125
  """
126
  )
127
 
128
  with gr.Row():
129
  target = gr.Textbox(label="Target sentence", interactive=False, placeholder="Click 'Generate sentence'")
130
+
131
  with gr.Row():
132
  btn_gen = gr.Button("🎲 Generate sentence", variant="primary")
133
  btn_clear = gr.Button("🧹 Clear")
134
 
135
  with gr.Row():
136
  audio = gr.Audio(sources=["microphone"], type="filepath", label="Record your voice")
137
+
138
  with gr.Accordion("Advanced settings", open=False):
139
+ model_id = gr.Dropdown(
140
+ choices=[
141
+ "openai/whisper-tiny.en", # fastest (CPU-friendly)
142
+ "openai/whisper-base.en", # better accuracy, a bit slower
143
+ "distil-whisper/distil-small.en" # optional distil English model
144
+ ],
145
+ value="openai/whisper-tiny.en",
146
+ label="ASR model (English only)",
147
+ )
148
+ device_pref = gr.Radio(
149
+ choices=["auto", "cpu", "cuda"],
150
+ value="auto",
151
+ label="Device preference"
152
+ )
153
+ pass_threshold = gr.Slider(0.50, 1.00, value=0.85, step=0.01, label="Match threshold")
154
 
155
  with gr.Row():
156
  btn_check = gr.Button("✅ Transcribe & Check", variant="primary")
 
164
 
165
  # Events
166
  btn_gen.click(fn=gen_sentence, outputs=target)
167
+ btn_clear.click(fn=clear_all, outputs=[target, hyp_out, score_out, diff_out, summary_out])
168
  btn_check.click(
169
+ fn=check_pronunciation,
170
+ inputs=[audio, target, model_id, device_pref, pass_threshold],
171
+ outputs=[hyp_out, score_out, diff_out, summary_out]
172
  )
173
 
174
  if __name__ == "__main__":
175
+ demo.launch()