Text generation with Qwen and Llama Instruct model options

#2
opened by frimelle (HF Staff)
Files changed (3)
  1. app.py +55 -106
  2. src/generate.py +24 -172
  3. src/prompts.py +34 -46
app.py CHANGED
@@ -1,25 +1,26 @@
 import gradio as gr
-
+# import spaces
 from gradio_client import Client, handle_file
 
 import src.generate as generate
 import src.process as process
 
+# TODO: Abusing the 'global' notation for now so we can be flexible to multiple clients.
 global client
 
 # TODO: Ideally, instead of the Client method we're using for an external voice cloning app, we use the .load() function and pass in arguments to it directly while displaying the developer's desired UI.
 #chatterbox_space = gr.load("spaces/ResembleAI/Chatterbox")
 # ------------------- UI printing functions -------------------
 def clear_all():
-    # target, user_transcript, score_html, result_html, diff_html, tts_ui
-    return "", "", "", "", "", gr.Row.update(visible=False)
+    # target, user_transcript, score_html, diff_html, result_html,
+    # TODO(?): Add tts_text, tts_audio, clone_status (Maybe? Was there before.)
+    return "", "", "", "", "", "", "", None,
 
 
 def make_result_html(pass_threshold, passed, ratio):
     """Returns HTML summarizing results.
     Parameters:
-      pass_threshold: Minimum percentage of match between target and
-        recognized user utterance that counts as passing.
+      pass_threshold: Minimum percentage of match between target and recognized user utterance that counts as passing.
       passed: Whether the recognized user utterance is >= `pass_threshold`.
       ratio: Sequence match ratio.
     """
@@ -78,16 +79,16 @@ def make_html(sentence_match):
     return score_html, result_html, diff_html
 
 
-# ------------------- Core Check (Currently English-only) -------------------
+# ------------------- Core Check (English-only) -------------------
 # @spaces.GPU
 def get_user_transcript(audio_path: gr.Audio, target_sentence: str,
-                        asr_model_id: str, device_pref: str) -> (str, str):
+                        model_id: str, device_pref: str) -> (str, str):
     """ASR for the input audio and basic validation.
-    Uses the selected ASR model `asr_model_id` to recognize words in the input `audio_path`.
+    Uses the selected ASR model `model_id` to recognize words in the input `audio_path`.
     Parameters:
       audio_path: Processed audio file returned from gradio Audio component.
       target_sentence: Sentence the user needs to say.
-      asr_model_id: Desired ASR model.
+      model_id: Desired ASR model.
      device_pref: Preferred ASR processing device. Can be "auto", "cpu", "cuda".
     Returns:
       error_msg: If there's an error, a string describing what happened.
@@ -101,7 +102,7 @@ def get_user_transcript(audio_path: gr.Audio, target_sentence: str,
         return "Please start, record, then stop the audio recording before trying to transcribe.", ""
 
     # Runs the automatic speech recognition
-    user_transcript = process.run_asr(audio_path, asr_model_id, device_pref)
+    user_transcript = process.run_asr(audio_path, model_id, device_pref)
 
     # Handles processing errors.
     if isinstance(user_transcript, Exception):
@@ -109,13 +110,13 @@ def get_user_transcript(audio_path: gr.Audio, target_sentence: str,
     return "", user_transcript
 
 
-def transcribe_check(audio_path, target_sentence, asr_model_id, device_pref,
+def transcribe_check(audio_path, target_sentence, model_id, device_pref,
                      pass_threshold):
     """Transcribe user, calculate match to target sentence, create results HTML.
     Parameters:
       audio_path: Local path to recorded audio.
       target_sentence: Sentence the user needs to say.
-      asr_model_id: Desired ASR model.
+      model_id: Desired ASR model.
       device_pref: Preferred ASR processing device. Can be "auto", "cpu", "cuda".
     Returns:
       user_transcript: The recognized user utterance
@@ -127,8 +128,7 @@ def transcribe_check(audio_path, target_sentence, model_id, device_pref,
     clone_audio = False
     # Transcribe user input
     error_msg, user_transcript = get_user_transcript(audio_path,
-                                                     target_sentence,
-                                                     asr_model_id,
+                                                     target_sentence, model_id,
                                                      device_pref)
     if error_msg:
         score_html = ""
@@ -144,74 +144,33 @@ def transcribe_check(audio_path, target_sentence, model_id, device_pref,
     # Create the output to print out
     score_html, result_html, diff_html = make_html(sentence_match)
 
-    return (user_transcript, score_html, result_html, diff_html,
-            gr.Row(visible=clone_audio))
+    return user_transcript, score_html, result_html, diff_html, gr.Row(visible=clone_audio)
 
-def clone_voice(audio_input, text_input, exaggeration_input, cfgw_input,
-                seed_num_input, temperature_input):
+def clone_voice(audio_input, text_input):
+    # TODO: Note that this is the 'global' hack to pass in the client.
     global client
     # Additional specifications for Chatterbox include:
     # exaggeration_input=0.5,
     # temperature_input=0.8,
-    # seed_num_input=0,z
+    # seed_num_input=0,
     # cfgw_input=0.5,
     # api_name="/generate_tts_audio"
     return client.predict(text_input=text_input,
-                          audio_prompt_path_input=handle_file(audio_input),
-                          exaggeration_input=exaggeration_input,
-                          cfgw_input=cfgw_input,
-                          seed_num_input=seed_num_input,
-                          temperature_input=temperature_input)
+                          audio_prompt_path_input=handle_file(audio_input))
 
 
 # ------------------- UI -------------------
-with gr.Blocks(title="Voice Consent Gate") as demo:
-    gr.Markdown("# Voice Consent Gate: Demo")
-    with gr.Row():
-        with gr.Column():
-            with gr.Accordion(
-                    label="Click for further information on this demo",
-                    open=False):
-                gr.Markdown("""
-
-                To create a basic voice cloning system with a voice consent gate, you need three parts:
-                1. A way of generating novel consent sentences for the person whose voice will be cloned – the “speaker” – to say, making sure the sentence isn’t part of a previous recording but instead uniquely references the current consent context.
-                2. An _automatic speech recognition (ASR) system_ that recognizes the sentence conveying consent.
-                3. A _voice-cloning text-to-speech (TTS) system_ that takes as input text and the voice clonee’s speech snippets to generate speech.
-                Some voice-cloning TTS systems can now generate speech similar to a speaker’s voice using _just one sentence_. This means that a sentence used for consent can **also** be used for voice cloning. We demonstrate one way to do that here.
-                """)
-    with gr.Row():
-        with gr.Column(scale=2):
-            gr.Markdown(
-                """# 🎤 Say the Sentence (English)"""
-            )
-            gr.Markdown(
-                """
-                ## 1) Generate a sentence.
-                ## 2) Record yourself reading it.
-                ## 3) Transcribe & check your accuracy.
-                ## 4) If matched, clone your voice to speak any sentence you enter.
-                """
-            )
-        with gr.Column():
-            consent_method = gr.Dropdown(label="Sentence generation method",
-                                         choices=["Llama 3.2 3B Instruct",
-                                                  "Pre-written"],
-                                         value="Pre-written")
-            asr_model = gr.Dropdown(label="Speech recognition model",
-                                    choices=["openai/whisper-tiny.en",  # fastest (CPU-friendly)
-                                             "openai/whisper-base.en",  # better accuracy, a bit slower
-                                             "distil-whisper/distil-small.en"
-                                             # optional distil English model
-                                             ],
-                                    value="openai/whisper-tiny.en",
-                                    )
-            voice_clone_model = gr.Dropdown(
-                label="Voice cloning model",
-                choices=["Chatterbox", ], value="Chatterbox")
-        #with gr.Column():
-        #    pass  # Just for spacing
+with gr.Blocks(title="Say the Sentence (English)") as demo:
+    gr.Markdown(
+        """
+        # 🎤 Say the Sentence (English)
+        1) Generate a sentence.
+        2) Record yourself reading it.
+        3) Transcribe & check your accuracy.
+        4) If matched, clone your voice to speak any sentence you enter.
+        """
+    )
     with gr.Row():
         target = gr.Textbox(label="Target sentence", interactive=False,
                             placeholder="Click 'Generate sentence'")
@@ -221,10 +180,19 @@ with gr.Blocks(title="Voice Consent Gate") as demo:
     btn_clear = gr.Button("🧹 Clear")
 
     with gr.Row():
-        consent_audio = gr.Audio(sources=["microphone"], type="filepath",
-                                 label="Record your voice", key='consent_audio')
-
-    with gr.Accordion("Advanced ASR settings", open=False):
+        consent_audio = gr.Audio(sources=["microphone"], type="filepath", label="Record your voice", key='consent_audio')
+
+    with gr.Accordion("Advanced settings", open=False):
+        model_id = gr.Dropdown(
+            choices=[
+                "openai/whisper-tiny.en",   # fastest (CPU-friendly)
+                "openai/whisper-base.en",   # better accuracy, a bit slower
+                "distil-whisper/distil-small.en",  # optional distil English model
+            ],
+            value="openai/whisper-tiny.en",
+            label="ASR model (English only)",
+        )
         device_pref = gr.Radio(
             choices=["auto", "cpu", "cuda"],
             value="auto",
@@ -243,66 +211,47 @@ with gr.Blocks(title="Voice Consent Gate") as demo:
     diff_html = gr.HTML(
         label="Word-level diff (red = expected but missing / green = extra or replacement)")
 
-    gr.Markdown("## 🔁 Voice Consent Gate (opens upon consent)")
     # TODO: Ideally this is gr.Blocks, but that seems to have a visibility-change bug.
     with gr.Row(visible=False) as tts_ui:
-        # Using the render decorator so that we can access consent audio after it's recorded.
+        # Using the render decorator so that we can easily pass in the consent audio after it's recorded.
        @gr.render(inputs=consent_audio)
        def show_tts(audio_input):
+            # TODO: Abusing global, since we can't send a Client as a component to a function.
            global client
            if audio_input:
                client = Client("ResembleAI/Chatterbox")
+            with gr.Row():
+                gr.Markdown("# 🔁 Voice cloning")
            with gr.Row():
                with gr.Column():
                    gr.Markdown("## Audio input")
                    # Prepopulating with the consent audio.
-                    # Set interactive=True to be able to change.
-                    tts_audio = gr.Audio(audio_input, type="filepath")
+                    tts_audio = gr.Audio(audio_input, interactive=True, type="filepath")
            with gr.Row():
                with gr.Column():
                    gr.Markdown("## Text input")
                    tts_text = gr.Textbox(
                        "Now let's make my mum's favourite. So three mars bars into the pan. Then we add the tuna and just stir for a bit, just let the chocolate and fish infuse. A sprinkle of olive oil and some tomato ketchup. Now smell that. Oh boy this is going to be incredible.", interactive=True)
-            with gr.Row():
-                # TODO: Ideally, these options aren't hardcoded -- e.g., using .load(), where they're imported, allowing for different options depending on the client.
-                with gr.Accordion("More options", open=False):
-                    exaggeration = gr.Slider(
-                        0.25, 2, step=.05,
-                        label="Exaggeration (Neutral = 0.5, extreme values can be unstable)",
-                        value=.5
-                    )
-                    cfg_weight = gr.Slider(
-                        0.2, 1, step=.05, label="CFG/Pace", value=0.5
-                    )
-                    seed_num = gr.Number(value=0,
-                                         label="Random seed (0 for random)")
-                    temp = gr.Slider(0.05, 5, step=.05,
-                                     label="Temperature", value=.8)
            with gr.Row():
                clone_btn = gr.Button("Clone!")
                cloned_audio = gr.Audio()
-            clone_btn.click(fn=clone_voice,
-                            inputs=[tts_audio, tts_text, exaggeration,
-                                    cfg_weight, seed_num, temp],
-                            outputs=[cloned_audio])
+            clone_btn.click(fn=clone_voice, inputs=[tts_audio, tts_text], outputs=[cloned_audio])
 
     # -------- Events --------
-    # Generate sentence: including model name + detailed prompt
-    btn_gen.click(
-        fn=generate.gen_sentence,
-        inputs=[consent_method, voice_clone_model],
-        outputs=target
-    )
+    # Use pre-specified sentence bank by default
+    btn_gen.click(fn=generate.gen_sentence_set, outputs=target)
+    # Or use LLM generation:
+    # btn_gen.click(fn=generate.gen_sentence_llm, outputs=target)
 
+    # TODO(?): clearing tts_text, tts_audio, clone_status (not sure what that was)
    btn_clear.click(
        fn=clear_all,
-        outputs=[target, user_transcript, score_html, result_html, diff_html,
-                 tts_ui]
+        outputs=[target, user_transcript, score_html, result_html, diff_html]
    )
 
    btn_check.click(
        fn=transcribe_check,
-        inputs=[consent_audio, target, asr_model, device_pref, pass_threshold],
+        inputs=[consent_audio, target, model_id, device_pref, pass_threshold],
        outputs=[user_transcript, score_html, result_html, diff_html, tts_ui]
    )
 
src/generate.py CHANGED
@@ -1,32 +1,16 @@
-# src/generate.py
-"""
-Module: generate
-----------------
-Handles the generation of "consent sentences" for the Voice Consent Gate demo.
-
-This module connects to an external language model (in this case, the public
-Hugging Face Space for Llama 3.2 3B Instruct) to generate natural-sounding
-sentences that users can read aloud to give informed consent for voice cloning.
-
-If the model call fails (e.g., due to rate limits or network issues),
-a fallback sentence is chosen from a small built-in sentence bank.
-
-Functions:
-- _extract_llama_text(): Normalize the API output from the Llama demo.
-- gen_sentence_llm(): Generate a consent sentence from the Llama model Space.
-- gen_sentence_set(): Select a random prewritten sentence (for fallback/testing).
-"""
-
-import os
 import random
-from typing import Any
-from gradio_client import Client
+
+from transformers import pipeline, AutoTokenizer
 
 import src.process as process
-from src.prompts import get_consent_generation_prompt
 
+# You can choose to use either:
+# (1) a list of pre-specified sentences, in SENTENCE_BANK
+# (2) an LLM-generated sentence.
+# SENTENCE_BANK is used in the `gen_sentence_set` function.
+# LLM generation is used in the `gen_sentence_llm` function.
 
-# ------------------- Sentence Bank (unchanged) -------------------
+# ------------------- Sentence Bank (customize freely) -------------------
 SENTENCE_BANK = [
     "The quick brown fox jumps over the lazy dog.",
     "I promise to speak clearly and at a steady pace.",
@@ -41,153 +25,21 @@ SENTENCE_BANK = [
 ]
 
 
-# ------------------- Model / Space Configuration -------------------
-# The demo connects to the Llama 3.2 3B Instruct Space on Hugging Face.
-# You can override these defaults by setting environment variables in your Space.
-LLAMA_SPACE_ID = os.getenv(
-    "LLAMA_SPACE_ID", "huggingface-projects/llama-3.2-3B-Instruct"
-)
-LLAMA_API_NAME = "/chat"  # The Space exposes a single /chat endpoint.
-HF_TOKEN = os.getenv("HF_TOKEN")  # Optional; not required for public Spaces.
-
-
-def _extract_llama_text(result: Any) -> str:
-    """
-    Normalize the API response from the Llama 3.2 3B demo Space into plain text.
-
-    The Space’s `/chat` endpoint may return different shapes depending on how
-    the Gradio app is structured — sometimes a string, other times a dictionary
-    or list. This function recursively traverses and extracts the first
-    meaningful text string it finds.
-
-    Parameters
-    ----------
-    result : Any
-        The raw output returned by `client.predict()`.
-
-    Returns
-    -------
-    str
-        Cleaned text output (may be empty string if extraction fails).
-    """
-    if isinstance(result, str):
-        return result.strip()
-    if isinstance(result, (int, float, bool)):
-        return str(result)
-    if isinstance(result, list):
-        # If multiple segments are returned (e.g., multiple sentences),
-        # join them into one string.
-        parts = []
-        for x in result:
-            s = _extract_llama_text(x)
-            if s:
-                parts.append(s)
-        return " ".join(parts).strip()
-    if isinstance(result, dict):
-        # Common key names used in Gradio JSON responses
-        for key in ("text", "response", "content", "generated_text", "message"):
-            v = result.get(key)
-            if isinstance(v, str) and v.strip():
-                return v.strip()
-    return ""
-
-
-def gen_sentence(sentence_method="Pre-written", audio_model_name="Chatterbox"):
-    # chatterbox model name, detailed prompt (short_prompt=False)
-    if sentence_method == "Pre-written":
-        return gen_sentence_set()
-    else:
-        try:
-            return gen_sentence_llm(sentence_method,
-                                    audio_model_name,
-                                    fallback_on_error=False  # ← show errors during testing
-                                    )
-        except Exception as e:
-            # Show a helpful message directly in the Target sentence box
-            return f"[ERROR calling LLM] {type(e).__name__}: {e}"
-
-
-# TODO: Support more than just Llama 3.2 3B Instruct
-def gen_sentence_llm(sentence_method="Llama 3.2 3B Instruct", audio_model_name: str = "Chatterbox", *, fallback_on_error: bool = False  # Set True for production to avoid crashes
-                     ) -> str:
-    """
-    Generate a consent sentence using the Llama 3.2 3B Instruct demo Space.
-
-    This function constructs a prompt describing the linguistic and ethical
-    requirements for a consent sentence (via `get_consent_generation_prompt`)
-    and sends it to the Llama demo hosted on Hugging Face Spaces.
-
-    The response is normalized into a single English sentence suitable
-    for reading aloud.
-
-    Parameters
-    ----------
-    audio_model_name : str, optional
-        The name of the voice-cloning model to mention in the sentence.
-        Defaults to "Chatterbox".
-    fallback_on_error : bool, optional
-        If True, return a random fallback sentence instead of raising
-        an error when the Space call fails. Default is False for debugging.
-
-    Returns
-    -------
-    str
-        A clean, human-readable consent sentence.
-
-    Raises
-    ------
-    Exception
-        Re-raises the underlying error if `fallback_on_error` is False.
-    """
-    # Generate the full natural-language prompt that the LLM will receive
-    prompt = get_consent_generation_prompt(audio_model_name)
-
-    try:
-        # Initialize Gradio client for the Llama demo Space
-        client = Client(LLAMA_SPACE_ID, hf_token=HF_TOKEN)
-
-        # The Llama demo exposes a simple /chat endpoint with standard decoding params
-        result = client.predict(
-            message=prompt,
-            max_new_tokens=128,
-            temperature=0.6,
-            top_p=0.9,
-            top_k=50,
-            repetition_penalty=1.2,
-            api_name=LLAMA_API_NAME,
-        )
-
-        # Normalize and clean up model output
-        text = _extract_llama_text(result)
-        text = process.normalize_text(text, lower=False)
-
-        # Handle empty or malformed outputs
-        if not text:
-            raise ValueError("Empty response from Llama Space")
-
-        # In case the model produces multiple lines or options, pick the first full sentence
-        first_line = next((ln.strip() for ln in text.splitlines() if ln.strip()), "")
-        return first_line or text
-
-    except Exception as e:
-        print(f"[gen_sentence_llm] Llama Space call failed: {type(e).__name__}: {e}")
-        if fallback_on_error:
-            # If fallback is enabled, use a predefined sentence instead
-            return random.choice(SENTENCE_BANK)
-        # Otherwise propagate the exception so the UI displays it
-        raise
-
-
-def gen_sentence_set() -> str:
-    """
-    Return a sentence from a predefined static list.
-
-    This is used as a simple fallback generator when model-based
-    generation is unavailable or for testing the ASR pipeline
-    without network access.
-
-    Returns
-    -------
-    str
-        A single English sentence from the fallback bank.
+def gen_sentence_llm():
+    """Generates a sentence using an LLM.
+    Returns:
+        Normalized text string to display in the UI.
     """
+    prompt = ""
+    tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
+    generator = pipeline('text-generation', model='gpt2')
+    result = generator(prompt, stop_strings=[".", ], num_return_sequences=1,
+                       tokenizer=tokenizer, pad_token_id=tokenizer.eos_token_id)
+    display_text = process.normalize_text(result[0]["generated_text"],
+                                          lower=False)
+    return display_text
+
+
+def gen_sentence_set():
+    """Returns a sentence for the user to say using a prespecified set of options."""
     return random.choice(SENTENCE_BANK)
src/prompts.py CHANGED
@@ -1,59 +1,47 @@
-import random
+# src/utils/prompts.py
 
-def get_consent_generation_prompt(audio_model_name: str) -> str:
+def get_consent_generation_prompt(audio_model_name: str, short_prompt: bool = False) -> str:
     """
     Returns a text prompt instructing the model to generate a natural-sounding
     consent sentence for voice cloning with the specified model.
 
     Args:
         audio_model_name (str): Name of the audio model to mention in the prompt.
+        short_prompt (bool): If True, returns a concise one-line prompt suitable
+            for direct model input. If False (default), returns the full detailed prompt.
 
     Returns:
-        str: The prompt text, with a randomized topic for the second sentence.
+        str: The prompt text.
     """
 
-    # Possible neutral or everyday topics to diversify phonetic variety
-    topics = [
-        "the weather",
-        "daily routines",
-        "travel or commuting",
-        "food or cooking",
-        "music",
-        "nature or seasons",
-        "time of day",
-        "a calm place like a park or café",
-        "light exercise or relaxation",
-        "reading or learning something new",
-        "a pleasant conversation with a friend",
-        "observing surroundings like streets or sky",
-        "working or focusing quietly"
-    ]
-
-    # Randomly choose one for this prompt instance
-    topic = random.choice(topics)
+    if short_prompt:
+        return (
+            f"Generate one natural, spoken-style English sentence (10–20 words) in which a person "
+            f"clearly gives informed consent to use their voice for generating synthetic audio "
+            f"with the model {audio_model_name}. The sentence should sound conversational, include "
+            f"a clear consent phrase like 'I give my consent' or 'I agree', mention {audio_model_name} "
+            f"by name, and be phonetically varied but neutral in tone. Output only the final sentence."
+        )
 
     return f"""
-Generate exactly two short, natural-sounding English sentences (10-15 words each) that a person could say aloud, using everyday language.
-
-Sentence 1 (Consent sentence):
-* Clearly states informed consent to use their voice for generating synthetic audio with an AI model called {audio_model_name}.
-* Must explicitly include a consent phrase such as “I give my consent,” “I agree,” or “I allow.”
-* Must clearly mention the model name {audio_model_name} in the sentence.
-* Should sound fluent, polite, and natural to read aloud.
-* Should have a neutral or positive tone and be self-contained.
-
-Sentence 2 (Phonetic variety sentence):
-* Should not repeat the consent content.
-* Adds phonetic variety with a neutral descriptive clause, for example about {topic}.
-* Should be fluent, natural, and comfortable to read aloud.
-* Should sound polite and neutral, without emotional extremes.
-* Should include diverse vowels and consonants naturally for clear pronunciation.
-
-FORMAT:
-* Output EXACTLY two sentences.
-* No numbering, no quotes, no bullet points, and no introductory text.
-* Use standard punctuation.
-
-Example format (don’t copy text, just the format):
-I give my consent to use my voice for generating audio with the model {audio_model_name}. The weather is clear and calm this afternoon, and I’m speaking at an even pace.
-"""
+Generate a short, natural-sounding English sentence (10–20 words) that a person could say aloud
+to clearly state their informed consent to use their voice for generating synthetic audio with
+an AI model called {audio_model_name}.
+
+The sentence should:
+- Sound natural and conversational, not like legal text.
+- Explicitly include a consent phrase, such as “I give my consent,” “I agree,” or “I allow.”
+- Mention the model name ({audio_model_name}) clearly in the sentence.
+- Include a neutral descriptive clause before or after the consent phrase to add phonetic variety
+  (e.g., “The weather today is bright and calm” or “This recording is made clearly and freely.”)
+- Have a neutral or polite tone (no emotional extremes).
+- Be comfortable to read aloud and phonetically rich, covering diverse vowels and consonants naturally.
+- Be self-contained, so the full sentence can serve as an independent audio clip.
+
+Examples of structure to follow:
+- “The weather is clear and warm today. I give my consent to use my voice for generating audio with the model {audio_model_name}.”
+- “I give my consent to use my voice for generating audio with the model {audio_model_name}. This statement is made freely and clearly.”
+- “Good afternoon. I agree to the use of my recorded voice for audio generation with the model {audio_model_name}.”
+
+The output should be a single, natural sentence ready to be spoken aloud for recording purposes.
+"""