RepeatAfterMe

Running on T4

App Files Community

Remove Sentence Bank and pre-written option

by frimelle HF Staff - opened 3 days ago

base: refs/heads/main

←

from: refs/pr/3

Discussion Files changed

+55

-60

Files changed (5) hide show

.gitattributes +0 -2
app.py +20 -18
assets/voice_consent_gate.png +0 -3
assets/voice_consent_gate_50.png +0 -0
src/generate.py +35 -37

.gitattributes CHANGED Viewed

@@ -33,5 +33,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
-voice_consent_gate.png filter=lfs diff=lfs merge=lfs -text
-assets/voice_consent_gate.png filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

app.py CHANGED Viewed

@@ -7,8 +7,6 @@ import src.process as process
 global client
-GATE_IMAGE_PATH = "./assets/voice_consent_gate_50.png"
 # TODO: Ideally, instead of the Client method we're using for an external voice cloning app, we use the .load() function and pass in arguments to it directly while displaying the developer's desired UI.
 #chatterbox_space = gr.load("spaces/ResembleAI/Chatterbox")
 # ------------------- UI printing functions -------------------
@@ -170,20 +168,19 @@ def clone_voice(audio_input, text_input, exaggeration_input, cfgw_input,
 with gr.Blocks(title="Voice Consent Gate") as demo:
     gr.Markdown("# Voice Consent Gate: Demo")
     with gr.Row():
-        with gr.Column():
-            gr.Image(GATE_IMAGE_PATH, interactive=False, show_download_button=False)
         with gr.Column():
             with gr.Accordion(
                     label="Click for further information on this demo",
                     open=False):
                 gr.Markdown("""
-                    To create a basic voice cloning system with a voice consent gate, you need three parts:
-                    1. A way of generating novel consent sentences for the person whose voice will be cloned – the “speaker” – to say, uniquely referencing the current consent context.
-                    2. An _automatic speech recognition (ASR) system_ that recognizes the sentence conveying consent.
-                    3. A _voice-cloning text-to-speech (TTS) system_ that takes as input text and the speaker's speech snippets to generate speech.
-                    Since some voice-cloning TTS systems can now generate speech similar to a speaker’s voice using _just one sentence_, a sentence used for consent can **also** be used for voice cloning.
-                    """)
     with gr.Row():
         with gr.Column(scale=2):
             gr.Markdown(
@@ -199,11 +196,11 @@ with gr.Blocks(title="Voice Consent Gate") as demo:
             )
         with gr.Column():
             consent_method = gr.Dropdown(
-                label="Sentence generation method (currently limited to Llama 3.2 3B Instruct)",
                 choices=["Llama 3.2 3B Instruct"],
                 value="Llama 3.2 3B Instruct"
             )
-            asr_model = gr.Dropdown(label="Speech recognition model (currently limited to Whisper)",
                                     choices=["openai/whisper-tiny.en",  # fastest (CPU-friendly)
                                             "openai/whisper-base.en",  # better accuracy, a bit slower
                                             "distil-whisper/distil-small.en"
@@ -212,26 +209,31 @@ with gr.Blocks(title="Voice Consent Gate") as demo:
                                     value="openai/whisper-tiny.en",
                                     )
             voice_clone_model = gr.Dropdown(
-                label="Voice cloning model (currently limited to Chatterbox)",
                 choices=["Chatterbox", ], value="Chatterbox")
     with gr.Row():
         target = gr.Textbox(label="Target sentence", interactive=False,
                             placeholder="Click 'Generate sentence'")
     with gr.Row():
         btn_gen = gr.Button("🎲 Generate sentence", variant="primary")
         btn_clear = gr.Button("🧹 Clear")
     with gr.Row():
         consent_audio = gr.Audio(sources=["microphone"], type="filepath",
                                  label="Record your voice", key='consent_audio')
     with gr.Accordion("Advanced ASR settings", open=False):
         device_pref = gr.Radio(
             choices=["auto", "cpu", "cuda"],
             value="auto",
             label="Device preference"
         )
-        # In your own code, do not provide users with the option to change this: Set it yourself.
         pass_threshold = gr.Slider(0.50, 1.00, value=0.85, step=0.01,
                                    label="Match threshold")
     with gr.Row():
         btn_check = gr.Button("✅ Transcribe & Check", variant="primary")
     with gr.Row():
@@ -255,8 +257,8 @@ with gr.Blocks(title="Voice Consent Gate") as demo:
                     with gr.Column():
                         gr.Markdown("## Audio input")
                         # Prepopulating with the consent audio.
-                        # Setting interactive=False keeps it from being possible to upload something else.
-                        tts_audio = gr.Audio(audio_input, type="filepath", interactive=False)
                 with gr.Row():
                     with gr.Column():
                         gr.Markdown("## Text input")
@@ -279,7 +281,7 @@ with gr.Blocks(title="Voice Consent Gate") as demo:
                                          label="Temperature", value=.8)
                 with gr.Row():
                     clone_btn = gr.Button("Clone!")
-                    cloned_audio = gr.Audio(show_download_button=True)
                     clone_btn.click(fn=clone_voice,
                                     inputs=[tts_audio, tts_text, exaggeration,
                                             cfg_weight, seed_num, temp],

 global client
 # TODO: Ideally, instead of the Client method we're using for an external voice cloning app, we use the .load() function and pass in arguments to it directly while displaying the developer's desired UI.
 #chatterbox_space = gr.load("spaces/ResembleAI/Chatterbox")
 # ------------------- UI printing functions -------------------
 with gr.Blocks(title="Voice Consent Gate") as demo:
     gr.Markdown("# Voice Consent Gate: Demo")
     with gr.Row():
         with gr.Column():
             with gr.Accordion(
                     label="Click for further information on this demo",
                     open=False):
                 gr.Markdown("""
+                To create a basic voice cloning system with a voice consent gate, you need three parts:
+                1.  A way of generating novel consent sentences for the person whose voice will be cloned – the “speaker” – to say, making sure the sentence isn’t part of a previous recording but instead uniquely references the current consent context.
+                2. An _automatic speech recognition (ASR) system_ that recognizes the sentence conveying consent.
+                3. A _voice-cloning text-to-speech (TTS) system_ that takes as input text and the voice clonee’s speech snippets to generate speech.
+                Some voice-cloning TTS systems can now generate speech similar to a speaker’s voice using _just one sentence_. This means that a sentence used for consent can **also** be used for voice cloning. We demonstrate one way to do that here.
+                """)
     with gr.Row():
         with gr.Column(scale=2):
             gr.Markdown(
             )
         with gr.Column():
             consent_method = gr.Dropdown(
+                label="Sentence generation method",
                 choices=["Llama 3.2 3B Instruct"],
                 value="Llama 3.2 3B Instruct"
             )
+            asr_model = gr.Dropdown(label="Speech recognition model",
                                     choices=["openai/whisper-tiny.en",  # fastest (CPU-friendly)
                                             "openai/whisper-base.en",  # better accuracy, a bit slower
                                             "distil-whisper/distil-small.en"
                                     value="openai/whisper-tiny.en",
                                     )
             voice_clone_model = gr.Dropdown(
+                label="Voice cloning model",
                 choices=["Chatterbox", ], value="Chatterbox")
+        #with gr.Column():
+        #    pass # Just for spacing
     with gr.Row():
         target = gr.Textbox(label="Target sentence", interactive=False,
                             placeholder="Click 'Generate sentence'")
     with gr.Row():
         btn_gen = gr.Button("🎲 Generate sentence", variant="primary")
         btn_clear = gr.Button("🧹 Clear")
     with gr.Row():
         consent_audio = gr.Audio(sources=["microphone"], type="filepath",
                                  label="Record your voice", key='consent_audio')
     with gr.Accordion("Advanced ASR settings", open=False):
         device_pref = gr.Radio(
             choices=["auto", "cpu", "cuda"],
             value="auto",
             label="Device preference"
         )
         pass_threshold = gr.Slider(0.50, 1.00, value=0.85, step=0.01,
                                    label="Match threshold")
     with gr.Row():
         btn_check = gr.Button("✅ Transcribe & Check", variant="primary")
     with gr.Row():
                     with gr.Column():
                         gr.Markdown("## Audio input")
                         # Prepopulating with the consent audio.
+                        # Set interactive=True to be able to change.
+                        tts_audio = gr.Audio(audio_input, type="filepath")
                 with gr.Row():
                     with gr.Column():
                         gr.Markdown("## Text input")
                                          label="Temperature", value=.8)
                 with gr.Row():
                     clone_btn = gr.Button("Clone!")
+                    cloned_audio = gr.Audio()
                     clone_btn.click(fn=clone_voice,
                                     inputs=[tts_audio, tts_text, exaggeration,
                                             cfg_weight, seed_num, temp],

assets/voice_consent_gate.png DELETED Viewed

Git LFS Details

SHA256: 1692551c8bace0152f60ef5039731e990e12b1429fdf004aefe328ef976d55b4
Pointer size: 131 Bytes
Size of remote file: 209 kB

assets/voice_consent_gate_50.png DELETED Viewed

Binary file (90 kB)

src/generate.py CHANGED Viewed

@@ -10,7 +10,6 @@ sentences that users can read aloud to give informed consent for voice cloning.
 Functions:
     - _extract_llama_text(): Normalize the API output from the Llama demo.
-    - gen_sentence(): Wrapper for gen_sentence_llm(); previously supported other options.
     - gen_sentence_llm(): Generate a consent sentence from the Llama model Space.
 """
@@ -42,9 +41,14 @@ def _extract_llama_text(result: Any) -> str:
     meaningful text string it finds.
     Parameters
-        result : The raw output returned by `client.predict()`.
-        str : Cleaned text output (may be empty string if extraction fails).
     """
     if isinstance(result, str):
         return result.strip()
@@ -68,55 +72,49 @@ def _extract_llama_text(result: Any) -> str:
     return ""
-def gen_sentence(consent_method="Llama 3.2 3B Instruct", voice_clone_model="Chatterbox"):
     """
-    Always generates a sentence via the LLM.
-    Parameters
-        consent_method: str
-            The language model used to generate a consent sentence
-        voice_clone_model: str
-            The voice cloning model
     """
     try:
-        return gen_sentence_llm(consent_method, voice_clone_model)
     except Exception as e:
         # Show a helpful message directly in the Target sentence box
         return f"[ERROR calling LLM] {type(e).__name__}: {e}"
 # TODO: Support more than just Llama 3.2 3B Instruct
-def gen_sentence_llm(consent_method="Llama 3.2 3B Instruct", voice_clone_model="Chatterbox") -> str:
-    """
-   Generate a consent sentence using the Llama 3.2 3B Instruct demo Space.
-   This function constructs a prompt describing the linguistic and ethical
-   requirements for a consent sentence (via `get_consent_generation_prompt`)
-   and sends it to the Llama demo hosted on Hugging Face Spaces.
-   The response is normalized into a single English sentence suitable
-   for reading aloud.
     Parameters
-        consent_method : str
-            The name of the language model used to generate the consent utterance.
-            Currently just implemented for Llama 3.2 3B Instruct.
-        audio_model_name : str
-            The name of the voice-cloning model to mention in the sentence.
-            Defaults to "Chatterbox".
     Returns
-        str
-            A clean, human-readable consent sentence.
-   """
     # Generate the full natural-language prompt that the LLM will receive
-    prompt = get_consent_generation_prompt(voice_clone_model)
-    space_id = LLAMA_SPACE_ID
-    api_name = LLAMA_API_NAME
     try:
-        # Currently always true.
-        if consent_method != "Llama 3.2 3B Instruct":
-            print("Not currently implemented for %s; using Llama 3.2 3B Instruct" % consent_method)
-        # Initialize Gradio client for the language model Space
-        client = Client(space_id, hf_token=HF_TOKEN)
         # The Llama demo exposes a simple /chat endpoint with standard decoding params
         result = client.predict(
@@ -126,7 +124,7 @@ def gen_sentence_llm(consent_method="Llama 3.2 3B Instruct", voice_clone_model="
             top_p=0.9,
             top_k=50,
             repetition_penalty=1.2,
-            api_name=api_name,
         )
         # Normalize and clean up model output

 Functions:
     - _extract_llama_text(): Normalize the API output from the Llama demo.
     - gen_sentence_llm(): Generate a consent sentence from the Llama model Space.
 """
     meaningful text string it finds.
     Parameters
+    ----------
+    result : Any
+        The raw output returned by `client.predict()`.
+    Returns
+    -------
+    str
+        Cleaned text output (may be empty string if extraction fails).
     """
     if isinstance(result, str):
         return result.strip()
     return ""
+def gen_sentence(audio_model_name="Chatterbox"):
     """
+    Always generate a sentence via the LLM.
     """
     try:
+        return gen_sentence_llm(audio_model_name=audio_model_name)
     except Exception as e:
         # Show a helpful message directly in the Target sentence box
         return f"[ERROR calling LLM] {type(e).__name__}: {e}"
 # TODO: Support more than just Llama 3.2 3B Instruct
+def gen_sentence_llm(
+    sentence_method: str = "Llama 3.2 3B Instruct",
+    audio_model_name: str = "Chatterbox",
+    *
+) -> str:
+     """
+    Generate a consent sentence using the Llama 3.2 3B Instruct demo Space.
+    This function constructs a prompt describing the linguistic and ethical
+    requirements for a consent sentence (via `get_consent_generation_prompt`)
+    and sends it to the Llama demo hosted on Hugging Face Spaces.
+    The response is normalized into a single English sentence suitable
+    for reading aloud.
     Parameters
+    ----------
+    audio_model_name : str, optional
+        The name of the voice-cloning model to mention in the sentence.
+        Defaults to "Chatterbox".
     Returns
+    -------
+    str
+        A clean, human-readable consent sentence.
+    """
     # Generate the full natural-language prompt that the LLM will receive
+    prompt = get_consent_generation_prompt(audio_model_name)
     try:
+        # Initialize Gradio client for the Llama demo Space
+        client = Client(LLAMA_SPACE_ID, hf_token=HF_TOKEN)
         # The Llama demo exposes a simple /chat endpoint with standard decoding params
         result = client.predict(
             top_p=0.9,
             top_k=50,
             repetition_penalty=1.2,
+            api_name=LLAMA_API_NAME,
         )
         # Normalize and clean up model output