Spaces:
Running
on
T4
Running
on
T4
Remove Sentence Bank and pre-written option
#3
by
frimelle
HF Staff
- opened
- .gitattributes +0 -2
- app.py +20 -18
- assets/voice_consent_gate.png +0 -3
- assets/voice_consent_gate_50.png +0 -0
- src/generate.py +35 -37
.gitattributes
CHANGED
|
@@ -33,5 +33,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
-
voice_consent_gate.png filter=lfs diff=lfs merge=lfs -text
|
| 37 |
-
assets/voice_consent_gate.png filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
app.py
CHANGED
|
@@ -7,8 +7,6 @@ import src.process as process
|
|
| 7 |
|
| 8 |
global client
|
| 9 |
|
| 10 |
-
GATE_IMAGE_PATH = "./assets/voice_consent_gate_50.png"
|
| 11 |
-
|
| 12 |
# TODO: Ideally, instead of the Client method we're using for an external voice cloning app, we use the .load() function and pass in arguments to it directly while displaying the developer's desired UI.
|
| 13 |
#chatterbox_space = gr.load("spaces/ResembleAI/Chatterbox")
|
| 14 |
# ------------------- UI printing functions -------------------
|
|
@@ -170,20 +168,19 @@ def clone_voice(audio_input, text_input, exaggeration_input, cfgw_input,
|
|
| 170 |
with gr.Blocks(title="Voice Consent Gate") as demo:
|
| 171 |
gr.Markdown("# Voice Consent Gate: Demo")
|
| 172 |
with gr.Row():
|
| 173 |
-
with gr.Column():
|
| 174 |
-
gr.Image(GATE_IMAGE_PATH, interactive=False, show_download_button=False)
|
| 175 |
with gr.Column():
|
| 176 |
with gr.Accordion(
|
| 177 |
label="Click for further information on this demo",
|
| 178 |
open=False):
|
| 179 |
gr.Markdown("""
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
|
|
|
| 187 |
with gr.Row():
|
| 188 |
with gr.Column(scale=2):
|
| 189 |
gr.Markdown(
|
|
@@ -199,11 +196,11 @@ with gr.Blocks(title="Voice Consent Gate") as demo:
|
|
| 199 |
)
|
| 200 |
with gr.Column():
|
| 201 |
consent_method = gr.Dropdown(
|
| 202 |
-
label="Sentence generation method
|
| 203 |
choices=["Llama 3.2 3B Instruct"],
|
| 204 |
value="Llama 3.2 3B Instruct"
|
| 205 |
)
|
| 206 |
-
asr_model = gr.Dropdown(label="Speech recognition model
|
| 207 |
choices=["openai/whisper-tiny.en", # fastest (CPU-friendly)
|
| 208 |
"openai/whisper-base.en", # better accuracy, a bit slower
|
| 209 |
"distil-whisper/distil-small.en"
|
|
@@ -212,26 +209,31 @@ with gr.Blocks(title="Voice Consent Gate") as demo:
|
|
| 212 |
value="openai/whisper-tiny.en",
|
| 213 |
)
|
| 214 |
voice_clone_model = gr.Dropdown(
|
| 215 |
-
label="Voice cloning model
|
| 216 |
choices=["Chatterbox", ], value="Chatterbox")
|
|
|
|
|
|
|
| 217 |
with gr.Row():
|
| 218 |
target = gr.Textbox(label="Target sentence", interactive=False,
|
| 219 |
placeholder="Click 'Generate sentence'")
|
|
|
|
| 220 |
with gr.Row():
|
| 221 |
btn_gen = gr.Button("🎲 Generate sentence", variant="primary")
|
| 222 |
btn_clear = gr.Button("🧹 Clear")
|
|
|
|
| 223 |
with gr.Row():
|
| 224 |
consent_audio = gr.Audio(sources=["microphone"], type="filepath",
|
| 225 |
label="Record your voice", key='consent_audio')
|
|
|
|
| 226 |
with gr.Accordion("Advanced ASR settings", open=False):
|
| 227 |
device_pref = gr.Radio(
|
| 228 |
choices=["auto", "cpu", "cuda"],
|
| 229 |
value="auto",
|
| 230 |
label="Device preference"
|
| 231 |
)
|
| 232 |
-
# In your own code, do not provide users with the option to change this: Set it yourself.
|
| 233 |
pass_threshold = gr.Slider(0.50, 1.00, value=0.85, step=0.01,
|
| 234 |
label="Match threshold")
|
|
|
|
| 235 |
with gr.Row():
|
| 236 |
btn_check = gr.Button("✅ Transcribe & Check", variant="primary")
|
| 237 |
with gr.Row():
|
|
@@ -255,8 +257,8 @@ with gr.Blocks(title="Voice Consent Gate") as demo:
|
|
| 255 |
with gr.Column():
|
| 256 |
gr.Markdown("## Audio input")
|
| 257 |
# Prepopulating with the consent audio.
|
| 258 |
-
#
|
| 259 |
-
tts_audio = gr.Audio(audio_input, type="filepath"
|
| 260 |
with gr.Row():
|
| 261 |
with gr.Column():
|
| 262 |
gr.Markdown("## Text input")
|
|
@@ -279,7 +281,7 @@ with gr.Blocks(title="Voice Consent Gate") as demo:
|
|
| 279 |
label="Temperature", value=.8)
|
| 280 |
with gr.Row():
|
| 281 |
clone_btn = gr.Button("Clone!")
|
| 282 |
-
cloned_audio = gr.Audio(
|
| 283 |
clone_btn.click(fn=clone_voice,
|
| 284 |
inputs=[tts_audio, tts_text, exaggeration,
|
| 285 |
cfg_weight, seed_num, temp],
|
|
|
|
| 7 |
|
| 8 |
global client
|
| 9 |
|
|
|
|
|
|
|
| 10 |
# TODO: Ideally, instead of the Client method we're using for an external voice cloning app, we use the .load() function and pass in arguments to it directly while displaying the developer's desired UI.
|
| 11 |
#chatterbox_space = gr.load("spaces/ResembleAI/Chatterbox")
|
| 12 |
# ------------------- UI printing functions -------------------
|
|
|
|
| 168 |
with gr.Blocks(title="Voice Consent Gate") as demo:
|
| 169 |
gr.Markdown("# Voice Consent Gate: Demo")
|
| 170 |
with gr.Row():
|
|
|
|
|
|
|
| 171 |
with gr.Column():
|
| 172 |
with gr.Accordion(
|
| 173 |
label="Click for further information on this demo",
|
| 174 |
open=False):
|
| 175 |
gr.Markdown("""
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
To create a basic voice cloning system with a voice consent gate, you need three parts:
|
| 179 |
+
1. A way of generating novel consent sentences for the person whose voice will be cloned — the “speaker” — to say, making sure the sentence isn’t part of a previous recording but instead uniquely references the current consent context.
|
| 180 |
+
2. An _automatic speech recognition (ASR) system_ that recognizes the sentence conveying consent.
|
| 181 |
+
3. A _voice-cloning text-to-speech (TTS) system_ that takes as input text and the voice clonee’s speech snippets to generate speech.
|
| 182 |
+
Some voice-cloning TTS systems can now generate speech similar to a speaker’s voice using _just one sentence_. This means that a sentence used for consent can **also** be used for voice cloning. We demonstrate one way to do that here.
|
| 183 |
+
""")
|
| 184 |
with gr.Row():
|
| 185 |
with gr.Column(scale=2):
|
| 186 |
gr.Markdown(
|
|
|
|
| 196 |
)
|
| 197 |
with gr.Column():
|
| 198 |
consent_method = gr.Dropdown(
|
| 199 |
+
label="Sentence generation method",
|
| 200 |
choices=["Llama 3.2 3B Instruct"],
|
| 201 |
value="Llama 3.2 3B Instruct"
|
| 202 |
)
|
| 203 |
+
asr_model = gr.Dropdown(label="Speech recognition model",
|
| 204 |
choices=["openai/whisper-tiny.en", # fastest (CPU-friendly)
|
| 205 |
"openai/whisper-base.en", # better accuracy, a bit slower
|
| 206 |
"distil-whisper/distil-small.en"
|
|
|
|
| 209 |
value="openai/whisper-tiny.en",
|
| 210 |
)
|
| 211 |
voice_clone_model = gr.Dropdown(
|
| 212 |
+
label="Voice cloning model",
|
| 213 |
choices=["Chatterbox", ], value="Chatterbox")
|
| 214 |
+
#with gr.Column():
|
| 215 |
+
# pass # Just for spacing
|
| 216 |
with gr.Row():
|
| 217 |
target = gr.Textbox(label="Target sentence", interactive=False,
|
| 218 |
placeholder="Click 'Generate sentence'")
|
| 219 |
+
|
| 220 |
with gr.Row():
|
| 221 |
btn_gen = gr.Button("🎲 Generate sentence", variant="primary")
|
| 222 |
btn_clear = gr.Button("🧹 Clear")
|
| 223 |
+
|
| 224 |
with gr.Row():
|
| 225 |
consent_audio = gr.Audio(sources=["microphone"], type="filepath",
|
| 226 |
label="Record your voice", key='consent_audio')
|
| 227 |
+
|
| 228 |
with gr.Accordion("Advanced ASR settings", open=False):
|
| 229 |
device_pref = gr.Radio(
|
| 230 |
choices=["auto", "cpu", "cuda"],
|
| 231 |
value="auto",
|
| 232 |
label="Device preference"
|
| 233 |
)
|
|
|
|
| 234 |
pass_threshold = gr.Slider(0.50, 1.00, value=0.85, step=0.01,
|
| 235 |
label="Match threshold")
|
| 236 |
+
|
| 237 |
with gr.Row():
|
| 238 |
btn_check = gr.Button("✅ Transcribe & Check", variant="primary")
|
| 239 |
with gr.Row():
|
|
|
|
| 257 |
with gr.Column():
|
| 258 |
gr.Markdown("## Audio input")
|
| 259 |
# Prepopulating with the consent audio.
|
| 260 |
+
# Set interactive=True to be able to change.
|
| 261 |
+
tts_audio = gr.Audio(audio_input, type="filepath")
|
| 262 |
with gr.Row():
|
| 263 |
with gr.Column():
|
| 264 |
gr.Markdown("## Text input")
|
|
|
|
| 281 |
label="Temperature", value=.8)
|
| 282 |
with gr.Row():
|
| 283 |
clone_btn = gr.Button("Clone!")
|
| 284 |
+
cloned_audio = gr.Audio()
|
| 285 |
clone_btn.click(fn=clone_voice,
|
| 286 |
inputs=[tts_audio, tts_text, exaggeration,
|
| 287 |
cfg_weight, seed_num, temp],
|
assets/voice_consent_gate.png
DELETED
Git LFS Details
|
assets/voice_consent_gate_50.png
DELETED
|
Binary file (90 kB)
|
|
|
src/generate.py
CHANGED
|
@@ -10,7 +10,6 @@ sentences that users can read aloud to give informed consent for voice cloning.
|
|
| 10 |
|
| 11 |
Functions:
|
| 12 |
- _extract_llama_text(): Normalize the API output from the Llama demo.
|
| 13 |
-
- gen_sentence(): Wrapper for gen_sentence_llm(); previously supported other options.
|
| 14 |
- gen_sentence_llm(): Generate a consent sentence from the Llama model Space.
|
| 15 |
"""
|
| 16 |
|
|
@@ -42,9 +41,14 @@ def _extract_llama_text(result: Any) -> str:
|
|
| 42 |
meaningful text string it finds.
|
| 43 |
|
| 44 |
Parameters
|
| 45 |
-
|
|
|
|
|
|
|
| 46 |
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
| 48 |
"""
|
| 49 |
if isinstance(result, str):
|
| 50 |
return result.strip()
|
|
@@ -68,55 +72,49 @@ def _extract_llama_text(result: Any) -> str:
|
|
| 68 |
return ""
|
| 69 |
|
| 70 |
|
| 71 |
-
def gen_sentence(
|
| 72 |
"""
|
| 73 |
-
Always
|
| 74 |
-
Parameters
|
| 75 |
-
consent_method: str
|
| 76 |
-
The language model used to generate a consent sentence
|
| 77 |
-
voice_clone_model: str
|
| 78 |
-
The voice cloning model
|
| 79 |
"""
|
| 80 |
try:
|
| 81 |
-
return gen_sentence_llm(
|
| 82 |
except Exception as e:
|
| 83 |
# Show a helpful message directly in the Target sentence box
|
| 84 |
return f"[ERROR calling LLM] {type(e).__name__}: {e}"
|
| 85 |
|
| 86 |
# TODO: Support more than just Llama 3.2 3B Instruct
|
| 87 |
-
def gen_sentence_llm(
|
| 88 |
-
""
|
| 89 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
|
|
|
|
|
|
|
|
|
| 94 |
|
| 95 |
-
The response is normalized into a single English sentence suitable
|
| 96 |
-
for reading aloud.
|
| 97 |
Parameters
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
The name of the voice-cloning model to mention in the sentence.
|
| 103 |
-
Defaults to "Chatterbox".
|
| 104 |
|
| 105 |
Returns
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
|
|
|
| 109 |
# Generate the full natural-language prompt that the LLM will receive
|
| 110 |
-
prompt = get_consent_generation_prompt(
|
| 111 |
-
space_id = LLAMA_SPACE_ID
|
| 112 |
-
api_name = LLAMA_API_NAME
|
| 113 |
|
| 114 |
try:
|
| 115 |
-
#
|
| 116 |
-
|
| 117 |
-
print("Not currently implemented for %s; using Llama 3.2 3B Instruct" % consent_method)
|
| 118 |
-
# Initialize Gradio client for the language model Space
|
| 119 |
-
client = Client(space_id, hf_token=HF_TOKEN)
|
| 120 |
|
| 121 |
# The Llama demo exposes a simple /chat endpoint with standard decoding params
|
| 122 |
result = client.predict(
|
|
@@ -126,7 +124,7 @@ def gen_sentence_llm(consent_method="Llama 3.2 3B Instruct", voice_clone_model="
|
|
| 126 |
top_p=0.9,
|
| 127 |
top_k=50,
|
| 128 |
repetition_penalty=1.2,
|
| 129 |
-
api_name=
|
| 130 |
)
|
| 131 |
|
| 132 |
# Normalize and clean up model output
|
|
|
|
| 10 |
|
| 11 |
Functions:
|
| 12 |
- _extract_llama_text(): Normalize the API output from the Llama demo.
|
|
|
|
| 13 |
- gen_sentence_llm(): Generate a consent sentence from the Llama model Space.
|
| 14 |
"""
|
| 15 |
|
|
|
|
| 41 |
meaningful text string it finds.
|
| 42 |
|
| 43 |
Parameters
|
| 44 |
+
----------
|
| 45 |
+
result : Any
|
| 46 |
+
The raw output returned by `client.predict()`.
|
| 47 |
|
| 48 |
+
Returns
|
| 49 |
+
-------
|
| 50 |
+
str
|
| 51 |
+
Cleaned text output (may be empty string if extraction fails).
|
| 52 |
"""
|
| 53 |
if isinstance(result, str):
|
| 54 |
return result.strip()
|
|
|
|
| 72 |
return ""
|
| 73 |
|
| 74 |
|
| 75 |
+
def gen_sentence(audio_model_name="Chatterbox"):
|
| 76 |
"""
|
| 77 |
+
Always generate a sentence via the LLM.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
"""
|
| 79 |
try:
|
| 80 |
+
return gen_sentence_llm(audio_model_name=audio_model_name)
|
| 81 |
except Exception as e:
|
| 82 |
# Show a helpful message directly in the Target sentence box
|
| 83 |
return f"[ERROR calling LLM] {type(e).__name__}: {e}"
|
| 84 |
|
| 85 |
# TODO: Support more than just Llama 3.2 3B Instruct
|
| 86 |
+
def gen_sentence_llm(
|
| 87 |
+
sentence_method: str = "Llama 3.2 3B Instruct",
|
| 88 |
+
audio_model_name: str = "Chatterbox",
|
| 89 |
+
*
|
| 90 |
+
) -> str:
|
| 91 |
+
"""
|
| 92 |
+
Generate a consent sentence using the Llama 3.2 3B Instruct demo Space.
|
| 93 |
|
| 94 |
+
This function constructs a prompt describing the linguistic and ethical
|
| 95 |
+
requirements for a consent sentence (via `get_consent_generation_prompt`)
|
| 96 |
+
and sends it to the Llama demo hosted on Hugging Face Spaces.
|
| 97 |
+
|
| 98 |
+
The response is normalized into a single English sentence suitable
|
| 99 |
+
for reading aloud.
|
| 100 |
|
|
|
|
|
|
|
| 101 |
Parameters
|
| 102 |
+
----------
|
| 103 |
+
audio_model_name : str, optional
|
| 104 |
+
The name of the voice-cloning model to mention in the sentence.
|
| 105 |
+
Defaults to "Chatterbox".
|
|
|
|
|
|
|
| 106 |
|
| 107 |
Returns
|
| 108 |
+
-------
|
| 109 |
+
str
|
| 110 |
+
A clean, human-readable consent sentence.
|
| 111 |
+
"""
|
| 112 |
# Generate the full natural-language prompt that the LLM will receive
|
| 113 |
+
prompt = get_consent_generation_prompt(audio_model_name)
|
|
|
|
|
|
|
| 114 |
|
| 115 |
try:
|
| 116 |
+
# Initialize Gradio client for the Llama demo Space
|
| 117 |
+
client = Client(LLAMA_SPACE_ID, hf_token=HF_TOKEN)
|
|
|
|
|
|
|
|
|
|
| 118 |
|
| 119 |
# The Llama demo exposes a simple /chat endpoint with standard decoding params
|
| 120 |
result = client.predict(
|
|
|
|
| 124 |
top_p=0.9,
|
| 125 |
top_k=50,
|
| 126 |
repetition_penalty=1.2,
|
| 127 |
+
api_name=LLAMA_API_NAME,
|
| 128 |
)
|
| 129 |
|
| 130 |
# Normalize and clean up model output
|