Remove Sentence Bank and pre-written option

#3
opened by frimelle (HF Staff)
.gitattributes CHANGED
@@ -33,5 +33,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
- voice_consent_gate.png filter=lfs diff=lfs merge=lfs -text
37
- assets/voice_consent_gate.png filter=lfs diff=lfs merge=lfs -text
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
app.py CHANGED
@@ -7,8 +7,6 @@ import src.process as process
7
 
8
  global client
9
 
10
- GATE_IMAGE_PATH = "./assets/voice_consent_gate_50.png"
11
-
12
  # TODO: Ideally, instead of the Client method we're using for an external voice cloning app, we use the .load() function and pass in arguments to it directly while displaying the developer's desired UI.
13
  #chatterbox_space = gr.load("spaces/ResembleAI/Chatterbox")
14
  # ------------------- UI printing functions -------------------
@@ -170,20 +168,19 @@ def clone_voice(audio_input, text_input, exaggeration_input, cfgw_input,
170
  with gr.Blocks(title="Voice Consent Gate") as demo:
171
  gr.Markdown("# Voice Consent Gate: Demo")
172
  with gr.Row():
173
- with gr.Column():
174
- gr.Image(GATE_IMAGE_PATH, interactive=False, show_download_button=False)
175
  with gr.Column():
176
  with gr.Accordion(
177
  label="Click for further information on this demo",
178
  open=False):
179
  gr.Markdown("""
180
- To create a basic voice cloning system with a voice consent gate, you need three parts:
181
- 1. A way of generating novel consent sentences for the person whose voice will be cloned – the β€œspeaker” – to say, uniquely referencing the current consent context.
182
- 2. An _automatic speech recognition (ASR) system_ that recognizes the sentence conveying consent.
183
- 3. A _voice-cloning text-to-speech (TTS) system_ that takes as input text and the speaker's speech snippets to generate speech.
184
-
185
- Since some voice-cloning TTS systems can now generate speech similar to a speaker’s voice using _just one sentence_, a sentence used for consent can **also** be used for voice cloning.
186
- """)
 
187
  with gr.Row():
188
  with gr.Column(scale=2):
189
  gr.Markdown(
@@ -199,11 +196,11 @@ with gr.Blocks(title="Voice Consent Gate") as demo:
199
  )
200
  with gr.Column():
201
  consent_method = gr.Dropdown(
202
- label="Sentence generation method (currently limited to Llama 3.2 3B Instruct)",
203
  choices=["Llama 3.2 3B Instruct"],
204
  value="Llama 3.2 3B Instruct"
205
  )
206
- asr_model = gr.Dropdown(label="Speech recognition model (currently limited to Whisper)",
207
  choices=["openai/whisper-tiny.en", # fastest (CPU-friendly)
208
  "openai/whisper-base.en", # better accuracy, a bit slower
209
  "distil-whisper/distil-small.en"
@@ -212,26 +209,31 @@ with gr.Blocks(title="Voice Consent Gate") as demo:
212
  value="openai/whisper-tiny.en",
213
  )
214
  voice_clone_model = gr.Dropdown(
215
- label="Voice cloning model (currently limited to Chatterbox)",
216
  choices=["Chatterbox", ], value="Chatterbox")
 
 
217
  with gr.Row():
218
  target = gr.Textbox(label="Target sentence", interactive=False,
219
  placeholder="Click 'Generate sentence'")
 
220
  with gr.Row():
221
  btn_gen = gr.Button("🎲 Generate sentence", variant="primary")
222
  btn_clear = gr.Button("🧹 Clear")
 
223
  with gr.Row():
224
  consent_audio = gr.Audio(sources=["microphone"], type="filepath",
225
  label="Record your voice", key='consent_audio')
 
226
  with gr.Accordion("Advanced ASR settings", open=False):
227
  device_pref = gr.Radio(
228
  choices=["auto", "cpu", "cuda"],
229
  value="auto",
230
  label="Device preference"
231
  )
232
- # In your own code, do not provide users with the option to change this: Set it yourself.
233
  pass_threshold = gr.Slider(0.50, 1.00, value=0.85, step=0.01,
234
  label="Match threshold")
 
235
  with gr.Row():
236
  btn_check = gr.Button("βœ… Transcribe & Check", variant="primary")
237
  with gr.Row():
@@ -255,8 +257,8 @@ with gr.Blocks(title="Voice Consent Gate") as demo:
255
  with gr.Column():
256
  gr.Markdown("## Audio input")
257
  # Prepopulating with the consent audio.
258
- # Setting interactive=False keeps it from being possible to upload something else.
259
- tts_audio = gr.Audio(audio_input, type="filepath", interactive=False)
260
  with gr.Row():
261
  with gr.Column():
262
  gr.Markdown("## Text input")
@@ -279,7 +281,7 @@ with gr.Blocks(title="Voice Consent Gate") as demo:
279
  label="Temperature", value=.8)
280
  with gr.Row():
281
  clone_btn = gr.Button("Clone!")
282
- cloned_audio = gr.Audio(show_download_button=True)
283
  clone_btn.click(fn=clone_voice,
284
  inputs=[tts_audio, tts_text, exaggeration,
285
  cfg_weight, seed_num, temp],
 
7
 
8
  global client
9
 
 
 
10
  # TODO: Ideally, instead of the Client method we're using for an external voice cloning app, we use the .load() function and pass in arguments to it directly while displaying the developer's desired UI.
11
  #chatterbox_space = gr.load("spaces/ResembleAI/Chatterbox")
12
  # ------------------- UI printing functions -------------------
 
168
  with gr.Blocks(title="Voice Consent Gate") as demo:
169
  gr.Markdown("# Voice Consent Gate: Demo")
170
  with gr.Row():
 
 
171
  with gr.Column():
172
  with gr.Accordion(
173
  label="Click for further information on this demo",
174
  open=False):
175
  gr.Markdown("""
176
+
177
+
178
+ To create a basic voice cloning system with a voice consent gate, you need three parts:
179
+ 1. A way of generating novel consent sentences for the person whose voice will be cloned – the β€œspeaker” – to say, making sure the sentence isn’t part of a previous recording but instead uniquely references the current consent context.
180
+ 2. An _automatic speech recognition (ASR) system_ that recognizes the sentence conveying consent.
181
+ 3. A _voice-cloning text-to-speech (TTS) system_ that takes as input text and the voice clonee’s speech snippets to generate speech.
182
+ Some voice-cloning TTS systems can now generate speech similar to a speaker’s voice using _just one sentence_. This means that a sentence used for consent can **also** be used for voice cloning. We demonstrate one way to do that here.
183
+ """)
184
  with gr.Row():
185
  with gr.Column(scale=2):
186
  gr.Markdown(
 
196
  )
197
  with gr.Column():
198
  consent_method = gr.Dropdown(
199
+ label="Sentence generation method",
200
  choices=["Llama 3.2 3B Instruct"],
201
  value="Llama 3.2 3B Instruct"
202
  )
203
+ asr_model = gr.Dropdown(label="Speech recognition model",
204
  choices=["openai/whisper-tiny.en", # fastest (CPU-friendly)
205
  "openai/whisper-base.en", # better accuracy, a bit slower
206
  "distil-whisper/distil-small.en"
 
209
  value="openai/whisper-tiny.en",
210
  )
211
  voice_clone_model = gr.Dropdown(
212
+ label="Voice cloning model",
213
  choices=["Chatterbox", ], value="Chatterbox")
214
+ #with gr.Column():
215
+ # pass # Just for spacing
216
  with gr.Row():
217
  target = gr.Textbox(label="Target sentence", interactive=False,
218
  placeholder="Click 'Generate sentence'")
219
+
220
  with gr.Row():
221
  btn_gen = gr.Button("🎲 Generate sentence", variant="primary")
222
  btn_clear = gr.Button("🧹 Clear")
223
+
224
  with gr.Row():
225
  consent_audio = gr.Audio(sources=["microphone"], type="filepath",
226
  label="Record your voice", key='consent_audio')
227
+
228
  with gr.Accordion("Advanced ASR settings", open=False):
229
  device_pref = gr.Radio(
230
  choices=["auto", "cpu", "cuda"],
231
  value="auto",
232
  label="Device preference"
233
  )
 
234
  pass_threshold = gr.Slider(0.50, 1.00, value=0.85, step=0.01,
235
  label="Match threshold")
236
+
237
  with gr.Row():
238
  btn_check = gr.Button("βœ… Transcribe & Check", variant="primary")
239
  with gr.Row():
 
257
  with gr.Column():
258
  gr.Markdown("## Audio input")
259
  # Prepopulating with the consent audio.
260
+ # Set interactive=True so the prepopulated audio can be replaced.
261
+ tts_audio = gr.Audio(audio_input, type="filepath")
262
  with gr.Row():
263
  with gr.Column():
264
  gr.Markdown("## Text input")
 
281
  label="Temperature", value=.8)
282
  with gr.Row():
283
  clone_btn = gr.Button("Clone!")
284
+ cloned_audio = gr.Audio()
285
  clone_btn.click(fn=clone_voice,
286
  inputs=[tts_audio, tts_text, exaggeration,
287
  cfg_weight, seed_num, temp],
assets/voice_consent_gate.png DELETED

Git LFS Details

  • SHA256: 1692551c8bace0152f60ef5039731e990e12b1429fdf004aefe328ef976d55b4
  • Pointer size: 131 Bytes
  • Size of remote file: 209 kB
assets/voice_consent_gate_50.png DELETED
Binary file (90 kB)
 
src/generate.py CHANGED
@@ -10,7 +10,6 @@ sentences that users can read aloud to give informed consent for voice cloning.
10
 
11
  Functions:
12
  - _extract_llama_text(): Normalize the API output from the Llama demo.
13
- - gen_sentence(): Wrapper for gen_sentence_llm(); previously supported other options.
14
  - gen_sentence_llm(): Generate a consent sentence from the Llama model Space.
15
  """
16
 
@@ -42,9 +41,14 @@ def _extract_llama_text(result: Any) -> str:
42
  meaningful text string it finds.
43
 
44
  Parameters
45
- result : The raw output returned by `client.predict()`.
 
 
46
 
47
- str : Cleaned text output (may be empty string if extraction fails).
 
 
 
48
  """
49
  if isinstance(result, str):
50
  return result.strip()
@@ -68,55 +72,49 @@ def _extract_llama_text(result: Any) -> str:
68
  return ""
69
 
70
 
71
- def gen_sentence(consent_method="Llama 3.2 3B Instruct", voice_clone_model="Chatterbox"):
72
  """
73
- Always generates a sentence via the LLM.
74
- Parameters
75
- consent_method: str
76
- The language model used to generate a consent sentence
77
- voice_clone_model: str
78
- The voice cloning model
79
  """
80
  try:
81
- return gen_sentence_llm(consent_method, voice_clone_model)
82
  except Exception as e:
83
  # Show a helpful message directly in the Target sentence box
84
  return f"[ERROR calling LLM] {type(e).__name__}: {e}"
85
 
86
  # TODO: Support more than just Llama 3.2 3B Instruct
87
- def gen_sentence_llm(consent_method="Llama 3.2 3B Instruct", voice_clone_model="Chatterbox") -> str:
88
- """
89
- Generate a consent sentence using the Llama 3.2 3B Instruct demo Space.
 
 
 
 
90
 
91
- This function constructs a prompt describing the linguistic and ethical
92
- requirements for a consent sentence (via `get_consent_generation_prompt`)
93
- and sends it to the Llama demo hosted on Hugging Face Spaces.
 
 
 
94
 
95
- The response is normalized into a single English sentence suitable
96
- for reading aloud.
97
  Parameters
98
- consent_method : str
99
- The name of the language model used to generate the consent utterance.
100
- Currently just implemented for Llama 3.2 3B Instruct.
101
- audio_model_name : str
102
- The name of the voice-cloning model to mention in the sentence.
103
- Defaults to "Chatterbox".
104
 
105
  Returns
106
- str
107
- A clean, human-readable consent sentence.
108
- """
 
109
  # Generate the full natural-language prompt that the LLM will receive
110
- prompt = get_consent_generation_prompt(voice_clone_model)
111
- space_id = LLAMA_SPACE_ID
112
- api_name = LLAMA_API_NAME
113
 
114
  try:
115
- # Currently always true.
116
- if consent_method != "Llama 3.2 3B Instruct":
117
- print("Not currently implemented for %s; using Llama 3.2 3B Instruct" % consent_method)
118
- # Initialize Gradio client for the language model Space
119
- client = Client(space_id, hf_token=HF_TOKEN)
120
 
121
  # The Llama demo exposes a simple /chat endpoint with standard decoding params
122
  result = client.predict(
@@ -126,7 +124,7 @@ def gen_sentence_llm(consent_method="Llama 3.2 3B Instruct", voice_clone_model="
126
  top_p=0.9,
127
  top_k=50,
128
  repetition_penalty=1.2,
129
- api_name=api_name,
130
  )
131
 
132
  # Normalize and clean up model output
 
10
 
11
  Functions:
12
  - _extract_llama_text(): Normalize the API output from the Llama demo.
 
13
  - gen_sentence_llm(): Generate a consent sentence from the Llama model Space.
14
  """
15
 
 
41
  meaningful text string it finds.
42
 
43
  Parameters
44
+ ----------
45
+ result : Any
46
+ The raw output returned by `client.predict()`.
47
 
48
+ Returns
49
+ -------
50
+ str
51
+ Cleaned text output (may be empty string if extraction fails).
52
  """
53
  if isinstance(result, str):
54
  return result.strip()
 
72
  return ""
73
 
74
 
75
+ def gen_sentence(audio_model_name="Chatterbox"):
76
  """
77
+ Always generate a sentence via the LLM.
 
 
 
 
 
78
  """
79
  try:
80
+ return gen_sentence_llm(audio_model_name=audio_model_name)
81
  except Exception as e:
82
  # Show a helpful message directly in the Target sentence box
83
  return f"[ERROR calling LLM] {type(e).__name__}: {e}"
84
 
85
  # TODO: Support more than just Llama 3.2 3B Instruct
86
+ def gen_sentence_llm(
87
+ sentence_method: str = "Llama 3.2 3B Instruct",
88
+ audio_model_name: str = "Chatterbox",
89
+ ) -> str:
91
+ """
92
+ Generate a consent sentence using the Llama 3.2 3B Instruct demo Space.
93
 
94
+ This function constructs a prompt describing the linguistic and ethical
95
+ requirements for a consent sentence (via `get_consent_generation_prompt`)
96
+ and sends it to the Llama demo hosted on Hugging Face Spaces.
97
+
98
+ The response is normalized into a single English sentence suitable
99
+ for reading aloud.
100
 
 
 
101
  Parameters
102
+ ----------
103
+ audio_model_name : str, optional
104
+ The name of the voice-cloning model to mention in the sentence.
105
+ Defaults to "Chatterbox".
 
 
106
 
107
  Returns
108
+ -------
109
+ str
110
+ A clean, human-readable consent sentence.
111
+ """
112
  # Generate the full natural-language prompt that the LLM will receive
113
+ prompt = get_consent_generation_prompt(audio_model_name)
 
 
114
 
115
  try:
116
+ # Initialize Gradio client for the Llama demo Space
117
+ client = Client(LLAMA_SPACE_ID, hf_token=HF_TOKEN)
 
 
 
118
 
119
  # The Llama demo exposes a simple /chat endpoint with standard decoding params
120
  result = client.predict(
 
124
  top_p=0.9,
125
  top_k=50,
126
  repetition_penalty=1.2,
127
+ api_name=LLAMA_API_NAME,
128
  )
129
 
130
  # Normalize and clean up model output