Spaces: Running on T4

Text generation with the options of Qwen and Llama Instruct models #2
by frimelle (HF Staff) - opened
- app.py +55 -106
- src/generate.py +24 -172
- src/prompts.py +34 -46
app.py
CHANGED
@@ -1,25 +1,26 @@
import gradio as gr
-
+# import spaces
from gradio_client import Client, handle_file

import src.generate as generate
import src.process as process

+# TODO: Abusing the 'global' notation for now so we can be flexible to multiple clients.
global client

# TODO: Ideally, instead of the Client method we're using for an external voice cloning app, we use the .load() function and pass in arguments to it directly while displaying the developer's desired UI.
#chatterbox_space = gr.load("spaces/ResembleAI/Chatterbox")
# ------------------- UI printing functions -------------------
def clear_all():
-    # target, user_transcript, score_html,
-
+    # target, user_transcript, score_html, diff_html, result_html,
+    # TODO(?): Add tts_text, tts_audio, clone_status (Maybe? Was there before.)
+    return "", "", "", "", "", "", "", None,


def make_result_html(pass_threshold, passed, ratio):
    """Returns HTML summarizing results.
    Parameters:
-      pass_threshold: Minimum percentage of match between target and
-        recognized user utterance that counts as passing.
+      pass_threshold: Minimum percentage of match between target and recognized user utterance that counts as passing.
      passed: Whether the recognized user utterance is >= `pass_threshold`.
      ratio: Sequence match ratio.
    """

@@ -78,16 +79,16 @@ def make_html(sentence_match):
    return score_html, result_html, diff_html


-# ------------------- Core Check (
+# ------------------- Core Check (English-only) -------------------
# @spaces.GPU
def get_user_transcript(audio_path: gr.Audio, target_sentence: str,
-
+                        model_id: str, device_pref: str) -> (str, str):
    """ASR for the input audio and basic validation.
-    Uses the selected ASR model `
+    Uses the selected ASR model `model_id` to recognize words in the input `audio_path`.
    Parameters:
      audio_path: Processed audio file returned from gradio Audio component.
      target_sentence: Sentence the user needs to say.
-
+      model_id: Desired ASR model.
      device_pref: Preferred ASR processing device. Can be "auto", "cpu", "cuda".
    Returns:
      error_msg: If there's an error, a string describing what happened.

@@ -101,7 +102,7 @@ def get_user_transcript(audio_path: gr.Audio, target_sentence: str,
        return "Please start, record, then stop the audio recording before trying to transcribe.", ""

    # Runs the automatic speech recognition
-    user_transcript = process.run_asr(audio_path,
+    user_transcript = process.run_asr(audio_path, model_id, device_pref)

    # Handles processing errors.
    if isinstance(user_transcript, Exception):

@@ -109,13 +110,13 @@ def get_user_transcript(audio_path: gr.Audio, target_sentence: str,
        return "", user_transcript


-def transcribe_check(audio_path, target_sentence, asr_model_id, device_pref,
+def transcribe_check(audio_path, target_sentence, model_id, device_pref,
                     pass_threshold):
    """Transcribe user, calculate match to target sentence, create results HTML.
    Parameters:
      audio_path: Local path to recorded audio.
      target_sentence: Sentence the user needs to say.
-
+      model_id: Desired ASR model.
      device_pref: Preferred ASR processing device. Can be "auto", "cpu", "cuda".
    Returns:
      user_transcript: The recognized user utterance

@@ -127,8 +128,7 @@ def transcribe_check(audio_path, target_sentence, asr_model_id, device_pref,
    clone_audio = False
    # Transcribe user input
    error_msg, user_transcript = get_user_transcript(audio_path,
-                                                     target_sentence,
-                                                     asr_model_id,
+                                                     target_sentence, model_id,
                                                     device_pref)
    if error_msg:
        score_html = ""

@@ -144,74 +144,33 @@
    # Create the output to print out
    score_html, result_html, diff_html = make_html(sentence_match)

-    return
-           gr.Row(visible=clone_audio))
+    return user_transcript, score_html, result_html, diff_html, gr.Row(visible=clone_audio)

-def clone_voice(audio_input, text_input
-
+def clone_voice(audio_input, text_input):
+    # TODO: Note that this is the 'global' hack to pass in the client.
    global client
    # Additional specifications for Chatterbox include:
    # exaggeration_input=0.5,
    # temperature_input=0.8,
    # seed_num_input=0,
    # cfgw_input=0.5,
    # api_name="/generate_tts_audio"
    return client.predict(text_input=text_input,
-
-                          exaggeration_input=exaggeration_input,
-                          cfgw_input=cfgw_input,
-                          seed_num_input=seed_num_input,
-                          temperature_input=temperature_input)
+                          audio_prompt_path_input=handle_file(audio_input))


# ------------------- UI -------------------
-with gr.Blocks(title="Voice Consent Gate") as demo:
-    gr.Markdown(
-
-
-
-
-
-
-

-    To create a basic voice cloning system with a voice consent gate, you need three parts:
-    1. A way of generating novel consent sentences for the person whose voice will be cloned – the “speaker” – to say, making sure the sentence isn’t part of a previous recording but instead uniquely references the current consent context.
-    2. An _automatic speech recognition (ASR) system_ that recognizes the sentence conveying consent.
-    3. A _voice-cloning text-to-speech (TTS) system_ that takes as input text and the voice clonee’s speech snippets to generate speech.
-    Some voice-cloning TTS systems can now generate speech similar to a speaker’s voice using _just one sentence_. This means that a sentence used for consent can **also** be used for voice cloning. We demonstrate one way to do that here.
-    """)
-    with gr.Row():
-        with gr.Column(scale=2):
-            gr.Markdown(
-                """# 🎤 Say the Sentence (English)"""
-            )
-            gr.Markdown(
-                """
-                ## 1) Generate a sentence.
-                ## 2) Record yourself reading it.
-                ## 3) Transcribe & check your accuracy.
-                ## 4) If matched, clone your voice to speak any sentence you enter.
-                """
-            )
-        with gr.Column():
-            consent_method = gr.Dropdown(label="Sentence generation method",
-                                         choices=["Llama 3.2 3B Instruct",
-                                                  "Pre-written"],
-                                         value="Pre-written")
-            asr_model = gr.Dropdown(label="Speech recognition model",
-                                    choices=["openai/whisper-tiny.en",  # fastest (CPU-friendly)
-                                             "openai/whisper-base.en",  # better accuracy, a bit slower
-                                             "distil-whisper/distil-small.en"
-                                             # optional distil English model
-                                             ],
-                                    value="openai/whisper-tiny.en",
-                                    )
-            voice_clone_model = gr.Dropdown(
-                label="Voice cloning model",
-                choices=["Chatterbox", ], value="Chatterbox")
-    #with gr.Column():
-    #    pass  # Just for spacing
+with gr.Blocks(title="Say the Sentence (English)") as demo:
+    gr.Markdown(
+        """
+        # 🎤 Say the Sentence (English)
+        1) Generate a sentence.
+        2) Record yourself reading it.
+        3) Transcribe & check your accuracy.
+        4) If matched, clone your voice to speak any sentence you enter.
+        """
+    )

    with gr.Row():
        target = gr.Textbox(label="Target sentence", interactive=False,
                            placeholder="Click 'Generate sentence'")

@@ -221,10 +180,19 @@ with gr.Blocks(title="Voice Consent Gate") as demo:
    btn_clear = gr.Button("🧹 Clear")

    with gr.Row():
-        consent_audio = gr.Audio(sources=["microphone"], type="filepath",
-
-
-
+        consent_audio = gr.Audio(sources=["microphone"], type="filepath", label="Record your voice", key='consent_audio')
+
+    with gr.Accordion("Advanced settings", open=False):
+        model_id = gr.Dropdown(
+            choices=[
+                "openai/whisper-tiny.en",  # fastest (CPU-friendly)
+                "openai/whisper-base.en",  # better accuracy, a bit slower
+                "distil-whisper/distil-small.en",  # optional distil English model
+            ],
+            value="openai/whisper-tiny.en",
+            label="ASR model (English only)",
+        )
        device_pref = gr.Radio(
            choices=["auto", "cpu", "cuda"],
            value="auto",

@@ -243,66 +211,47 @@ with gr.Blocks(title="Voice Consent Gate") as demo:
    diff_html = gr.HTML(
        label="Word-level diff (red = expected but missing / green = extra or replacement)")

-    gr.Markdown("## 🔁 Voice Consent Gate (opens upon consent)")
    # TODO: Ideally this is gr.Blocks, but that seems to have a visibility-change bug.
    with gr.Row(visible=False) as tts_ui:
-        # Using the render decorator so that we can
+        # Using the render decorator so that we can easily pass in the consent audio after it's recorded.
        @gr.render(inputs=consent_audio)
        def show_tts(audio_input):
+            # TODO: Abusing global, since we can't send a Client as a component to a function.
            global client
            if audio_input:
                client = Client("ResembleAI/Chatterbox")
+                with gr.Row():
+                    gr.Markdown("# 🔁 Voice cloning")
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("## Audio input")
                        # Prepopulating with the consent audio.
-
-                        tts_audio = gr.Audio(audio_input, type="filepath")
+                        tts_audio = gr.Audio(audio_input, interactive=True, type="filepath")
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("## Text input")
                        tts_text = gr.Textbox(
                            "Now let's make my mum's favourite. So three mars bars into the pan. Then we add the tuna and just stir for a bit, just let the chocolate and fish infuse. A sprinkle of olive oil and some tomato ketchup. Now smell that. Oh boy this is going to be incredible.", interactive=True)
-                with gr.Row():
-                    # TODO: Ideally, these options aren't hardcoded -- e.g., using .load(), where they're imported, allowing for different options depending on the client.
-                    with gr.Accordion("More options", open=False):
-                        exaggeration = gr.Slider(
-                            0.25, 2, step=.05,
-                            label="Exaggeration (Neutral = 0.5, extreme values can be unstable)",
-                            value=.5
-                        )
-                        cfg_weight = gr.Slider(
-                            0.2, 1, step=.05, label="CFG/Pace", value=0.5
-                        )
-                        seed_num = gr.Number(value=0,
-                                             label="Random seed (0 for random)")
-                        temp = gr.Slider(0.05, 5, step=.05,
-                                         label="Temperature", value=.8)
                with gr.Row():
                    clone_btn = gr.Button("Clone!")
                    cloned_audio = gr.Audio()
-                clone_btn.click(fn=clone_voice,
-                                inputs=[tts_audio, tts_text, exaggeration,
-                                        cfg_weight, seed_num, temp],
-                                outputs=[cloned_audio])
+                clone_btn.click(fn=clone_voice, inputs=[tts_audio, tts_text], outputs=[cloned_audio])

    # -------- Events --------
-    #
-    btn_gen.click(
-
-
-        outputs=target
-    )
+    # Use pre-specified sentence bank by default
+    btn_gen.click(fn=generate.gen_sentence_set, outputs=target)
+    # Or use LLM generation:
+    # btn_gen.click(fn=generate.gen_sentence_llm, outputs=target)

+    # TODO(?): clearing tts_text, tts_audio, clone_status (not sure what that was)
    btn_clear.click(
        fn=clear_all,
-        outputs=[target, user_transcript, score_html, result_html, diff_html,
-                 tts_ui]
+        outputs=[target, user_transcript, score_html, result_html, diff_html]
    )

    btn_check.click(
        fn=transcribe_check,
-        inputs=[consent_audio, target,
+        inputs=[consent_audio, target, model_id, device_pref, pass_threshold],
        outputs=[user_transcript, score_html, result_html, diff_html, tts_ui]
    )
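The Markdown removed above describes the three parts of a voice consent gate: generating a novel consent sentence, recognizing it with ASR, and passing the same recording to a voice-cloning TTS. A minimal sketch of that flow outside Gradio, assuming `process.run_asr` and `generate.gen_sentence_set` keep the signatures used in this diff; `PASS_THRESHOLD`, the two wrapper functions, and the use of `difflib.SequenceMatcher` (the docstrings only say "sequence match ratio") are illustrative assumptions, not part of the PR:

```python
# Sketch of the three-part consent gate described in the removed Markdown.
from difflib import SequenceMatcher

from gradio_client import Client, handle_file

import src.generate as generate
import src.process as process

PASS_THRESHOLD = 0.8  # assumption: the app exposes this as a user-set value


def consent_gate(target: str, audio_path: str) -> bool:
    """ASR the recording and check it against the consent sentence (parts 1 and 2)."""
    # In this PR, run_asr returns an Exception instance on failure; a real
    # caller should check isinstance(transcript, Exception) first.
    transcript = process.run_asr(audio_path, "openai/whisper-tiny.en", "auto")
    ratio = SequenceMatcher(None, target.lower(), str(transcript).lower()).ratio()
    return ratio >= PASS_THRESHOLD


def clone_voice_gated(target: str, audio_path: str, text: str):
    """Call the voice-cloning TTS Space only once the gate passes (part 3)."""
    if not consent_gate(target, audio_path):
        raise PermissionError("Consent sentence not recognized; refusing to clone.")
    client = Client("ResembleAI/Chatterbox")
    return client.predict(text_input=text,
                          audio_prompt_path_input=handle_file(audio_path))
```

Here `generate.gen_sentence_set()` would supply `target`, and the same recording both opens the gate and serves as the cloning prompt, which is exactly the one-sentence property the removed text calls out.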
src/generate.py
CHANGED
@@ -1,32 +1,16 @@
-# src/generate.py
-"""
-Module: generate
-----------------
-Handles the generation of "consent sentences" for the Voice Consent Gate demo.
-
-This module connects to an external language model (in this case, the public
-Hugging Face Space for Llama 3.2 3B Instruct) to generate natural-sounding
-sentences that users can read aloud to give informed consent for voice cloning.
-
-If the model call fails (e.g., due to rate limits or network issues),
-a fallback sentence is chosen from a small built-in sentence bank.
-
-Functions:
-    - _extract_llama_text(): Normalize the API output from the Llama demo.
-    - gen_sentence_llm(): Generate a consent sentence from the Llama model Space.
-    - gen_sentence_set(): Select a random prewritten sentence (for fallback/testing).
-"""
-
-import os
import random
-
-from
+
+from transformers import pipeline, AutoTokenizer

import src.process as process
-from src.prompts import get_consent_generation_prompt

+# You can choose to use either:
+# (1) a list of pre-specified sentences, in SENTENCE_BANK
+# (2) an LLM-generated sentence.
+# SENTENCE_BANK is used in the `gen_sentence_set` function.
+# LLM generation is used in the `gen_sentence_llm` function.

-# ------------------- Sentence Bank (
+# ------------------- Sentence Bank (customize freely) -------------------
SENTENCE_BANK = [
    "The quick brown fox jumps over the lazy dog.",
    "I promise to speak clearly and at a steady pace.",

@@ -41,153 +25,21 @@ SENTENCE_BANK = [
]


-
-
-
-
-        "LLAMA_SPACE_ID", "huggingface-projects/llama-3.2-3B-Instruct"
-)
-LLAMA_API_NAME = "/chat"  # The Space exposes a single /chat endpoint.
-HF_TOKEN = os.getenv("HF_TOKEN")  # Optional; not required for public Spaces.
-
-
-def _extract_llama_text(result: Any) -> str:
-    """
-    Normalize the API response from the Llama 3.2 3B demo Space into plain text.
-
-    The Space’s `/chat` endpoint may return different shapes depending on how
-    the Gradio app is structured — sometimes a string, other times a dictionary
-    or list. This function recursively traverses and extracts the first
-    meaningful text string it finds.
-
-    Parameters
-    ----------
-    result : Any
-        The raw output returned by `client.predict()`.
-
-    Returns
-    -------
-    str
-        Cleaned text output (may be empty string if extraction fails).
-    """
-    if isinstance(result, str):
-        return result.strip()
-    if isinstance(result, (int, float, bool)):
-        return str(result)
-    if isinstance(result, list):
-        # If multiple segments are returned (e.g., multiple sentences),
-        # join them into one string.
-        parts = []
-        for x in result:
-            s = _extract_llama_text(x)
-            if s:
-                parts.append(s)
-        return " ".join(parts).strip()
-    if isinstance(result, dict):
-        # Common key names used in Gradio JSON responses
-        for key in ("text", "response", "content", "generated_text", "message"):
-            v = result.get(key)
-            if isinstance(v, str) and v.strip():
-                return v.strip()
-    return ""
-
-
-def gen_sentence(sentence_method="Pre-written", audio_model_name="Chatterbox"):
-    # chatterbox model name, detailed prompt (short_prompt=False)
-    if sentence_method == "Pre-written":
-        return gen_sentence_set()
-    else:
-        try:
-            return gen_sentence_llm(sentence_method,
-                                    audio_model_name,
-                                    fallback_on_error=False  # ← show errors during testing
-                                    )
-        except Exception as e:
-            # Show a helpful message directly in the Target sentence box
-            return f"[ERROR calling LLM] {type(e).__name__}: {e}"
-
-# TODO: Support more than just Llama 3.2 3B Instruct
-def gen_sentence_llm(sentence_method="Llama 3.2 3B Instruct", audio_model_name: str = "Chatterbox", *, fallback_on_error: bool = False  # Set True for production to avoid crashes
-                     ) -> str:
-    """
-    Generate a consent sentence using the Llama 3.2 3B Instruct demo Space.
-
-    This function constructs a prompt describing the linguistic and ethical
-    requirements for a consent sentence (via `get_consent_generation_prompt`)
-    and sends it to the Llama demo hosted on Hugging Face Spaces.
-
-    The response is normalized into a single English sentence suitable
-    for reading aloud.
-
-    Parameters
-    ----------
-    audio_model_name : str, optional
-        The name of the voice-cloning model to mention in the sentence.
-        Defaults to "Chatterbox".
-    fallback_on_error : bool, optional
-        If True, return a random fallback sentence instead of raising
-        an error when the Space call fails. Default is False for debugging.
-
-    Returns
-    -------
-    str
-        A clean, human-readable consent sentence.
-
-    Raises
-    ------
-    Exception
-        Re-raises the underlying error if `fallback_on_error` is False.
-    """
-    # Generate the full natural-language prompt that the LLM will receive
-    prompt = get_consent_generation_prompt(audio_model_name)
-
-    try:
-        # Initialize Gradio client for the Llama demo Space
-        client = Client(LLAMA_SPACE_ID, hf_token=HF_TOKEN)
-
-        # The Llama demo exposes a simple /chat endpoint with standard decoding params
-        result = client.predict(
-            message=prompt,
-            max_new_tokens=128,
-            temperature=0.6,
-            top_p=0.9,
-            top_k=50,
-            repetition_penalty=1.2,
-            api_name=LLAMA_API_NAME,
-        )
-
-        # Normalize and clean up model output
-        text = _extract_llama_text(result)
-        text = process.normalize_text(text, lower=False)
-
-        # Handle empty or malformed outputs
-        if not text:
-            raise ValueError("Empty response from Llama Space")
-
-        # In case the model produces multiple lines or options, pick the first full sentence
-        first_line = next((ln.strip() for ln in text.splitlines() if ln.strip()), "")
-        return first_line or text
-
-    except Exception as e:
-        print(f"[gen_sentence_llm] Llama Space call failed: {type(e).__name__}: {e}")
-        if fallback_on_error:
-            # If fallback is enabled, use a predefined sentence instead
-            return random.choice(SENTENCE_BANK)
-        # Otherwise propagate the exception so the UI displays it
-        raise
-
-
-def gen_sentence_set() -> str:
-    """
-    Return a sentence from a predefined static list.
-
-    This is used as a simple fallback generator when model-based
-    generation is unavailable or for testing the ASR pipeline
-    without network access.
-
-    Returns
-    -------
-    str
-        A single English sentence from the fallback bank.
+def gen_sentence_llm():
+    """Generates a sentence using an LLM.
+    Returns:
+        Normalized text string to display in the UI.
    """
+    prompt = ""
+    tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
+    generator = pipeline('text-generation', model='gpt2')
+    result = generator(prompt, stop_strings=[".", ], num_return_sequences=1,
+                       tokenizer=tokenizer, pad_token_id=tokenizer.eos_token_id)
+    display_text = process.normalize_text(result[0]["generated_text"],
+                                          lower=False)
+    return display_text
+
+
+def gen_sentence_set():
+    """Returns a sentence for the user to say using a prespecified set of options."""
    return random.choice(SENTENCE_BANK)
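The rewritten `gen_sentence_llm` swaps the Llama Space client for a local GPT-2 `text-generation` pipeline and relies on `stop_strings` to cut generation at the first period, which is why the tokenizer is passed through to the call. A standalone sketch of the same pattern; the fixed seed and the non-empty prompt are added assumptions (the PR itself generates from an empty prompt):

```python
# Standalone exercise of the new GPT-2 generation path; stop_strings requires
# the tokenizer to be forwarded to generate(), matching the diff above.
from transformers import AutoTokenizer, pipeline, set_seed

set_seed(0)  # assumption: pin the seed so repeated runs are comparable
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
generator = pipeline("text-generation", model="gpt2")

result = generator("I give my consent to",  # assumption: non-empty seed text
                   stop_strings=["."], num_return_sequences=1,
                   tokenizer=tokenizer, pad_token_id=tokenizer.eos_token_id)
print(result[0]["generated_text"])  # generation halts at the first period
```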
src/prompts.py
CHANGED
@@ -1,59 +1,47 @@
-
+# src/utils/prompts.py

-def get_consent_generation_prompt(audio_model_name: str) -> str:
+def get_consent_generation_prompt(audio_model_name: str, short_prompt: bool = False) -> str:
    """
    Returns a text prompt instructing the model to generate a natural-sounding
    consent sentence for voice cloning with the specified model.

    Args:
        audio_model_name (str): Name of the audio model to mention in the prompt.
+        short_prompt (bool): If True, returns a concise one-line prompt suitable
+            for direct model input. If False (default), returns the full detailed prompt.

    Returns:
-        str: The prompt text
+        str: The prompt text.
    """

-
-
-
-
-
-
-
-
-        "time of day",
-        "a calm place like a park or café",
-        "light exercise or relaxation",
-        "reading or learning something new",
-        "a pleasant conversation with a friend",
-        "observing surroundings like streets or sky",
-        "working or focusing quietly"
-    ]
-
-    # Randomly choose one for this prompt instance
-    topic = random.choice(topics)
+    if short_prompt:
+        return (
+            f"Generate one natural, spoken-style English sentence (10–20 words) in which a person "
+            f"clearly gives informed consent to use their voice for generating synthetic audio "
+            f"with the model {audio_model_name}. The sentence should sound conversational, include "
+            f"a clear consent phrase like 'I give my consent' or 'I agree', mention {audio_model_name} "
+            f"by name, and be phonetically varied but neutral in tone. Output only the final sentence."
+        )

    return f"""
-    Generate
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    Example format (don’t copy text, just the format):
-    I give my consent to use my voice for generating audio with the model {audio_model_name}. The weather is clear and calm this afternoon, and I’m speaking at an even pace.
-    """
+    Generate a short, natural-sounding English sentence (10–20 words) that a person could say aloud
+    to clearly state their informed consent to use their voice for generating synthetic audio with
+    an AI model called {audio_model_name}.
+
+    The sentence should:
+    - Sound natural and conversational, not like legal text.
+    - Explicitly include a consent phrase, such as “I give my consent,” “I agree,” or “I allow.”
+    - Mention the model name ({audio_model_name}) clearly in the sentence.
+    - Include a neutral descriptive clause before or after the consent phrase to add phonetic variety
+      (e.g., “The weather today is bright and calm” or “This recording is made clearly and freely.”)
+    - Have a neutral or polite tone (no emotional extremes).
+    - Be comfortable to read aloud and phonetically rich, covering diverse vowels and consonants naturally.
+    - Be self-contained, so the full sentence can serve as an independent audio clip.
+
+    Examples of structure to follow:
+    - “The weather is clear and warm today. I give my consent to use my voice for generating audio with the model {audio_model_name}.”
+    - “I give my consent to use my voice for generating audio with the model {audio_model_name}. This statement is made freely and clearly.”
+    - “Good afternoon. I agree to the use of my recorded voice for audio generation with the model {audio_model_name}.”
+
+    The output should be a single, natural sentence ready to be spoken aloud for recording purposes.
+    """
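A short usage sketch for the updated helper, assuming it is imported as `src.prompts` as elsewhere in this PR (note the new header comment reads `src/utils/prompts.py` even though the diff lists the file at `src/prompts.py`):

```python
# Build both prompt variants for the model named in the UI dropdown.
from src.prompts import get_consent_generation_prompt

short = get_consent_generation_prompt("Chatterbox", short_prompt=True)
full = get_consent_generation_prompt("Chatterbox")  # detailed multi-requirement prompt

print(short)           # one-line instruction, ends with "Output only the final sentence."
print(full[:80], "…")  # preview of the long-form prompt
```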