JotunnBurton committed on
Commit
6c3d54e
·
verified ·
1 Parent(s): da52ef1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -46
app.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  import sys
2
  import logging
3
  import os
@@ -144,60 +146,33 @@ if __name__ == "__main__":
144
  noise_scale_w = gr.Slider(0.1, 2.0, 0.8, 0.1, label="Noise_W")
145
  length_scale = gr.Slider(0.1, 2.0, 1.0, 0.1, label="Length")
146
  language = gr.Dropdown(choices=["JP", "ZH", "EN", "mix", "auto"], value="JP", label="Language")
147
- style_text = gr.Textbox(label="Style Text", placeholder="(leave blank for none)")
148
- style_weight = gr.Slider(0, 1, 0.7, 0.1, label="Style Weight")
149
  btn = gr.Button("Generate Audio", variant="primary")
150
 
151
- with gr.Column():
152
- output_msg = gr.Textbox(label="Output Message")
153
- output_audio = gr.Audio(label="Output Audio")
154
-
155
- with gr.Column():
156
- with gr.Accordion("Semantic Fusion", open=False):
157
- gr.Markdown(
158
- value="Use auxiliary text semantics to assist speech generation (the language remains the same as the main text)\n\n"
159
- "**Note**: Do not use **command-style text** (e.g., 'Happy'), instead use **emotionally expressive text** (e.g., 'I'm so happy!!!')\n\n"
160
- "Effectiveness is uncertain; leave it blank to disable this feature\n\n"
161
- "**If the main text is mispronounced, try replacing the mispronounced characters with phonetically correct ones, and input the original text here with weight set to max to retain the original semantic intent while correcting pronunciation.**"
162
- )
163
- style_text = gr.Textbox(label="Auxiliary Text")
164
- style_weight = gr.Slider(
165
- minimum=0,
166
- maximum=1,
167
- value=0.7,
168
- step=0.1,
169
- label="Weight",
170
- info="Mixing ratio between main text and auxiliary text in BERT embedding. 0 means main text only, 1 means auxiliary text only.",
171
- )
172
- with gr.Row():
173
- with gr.Column():
174
- interval_between_sent = gr.Slider(
175
- minimum=0,
176
- maximum=5,
177
- value=0.2,
178
- step=0.1,
179
- label="Pause between sentences (seconds). Effective only when sentence splitting is enabled.",
180
- )
181
- interval_between_para = gr.Slider(
182
- minimum=0,
183
- maximum=10,
184
- value=1,
185
- step=0.1,
186
- label="Pause between paragraphs (seconds). Must be longer than sentence pause.",
187
- )
188
- opt_cut_by_sent = gr.Checkbox(
189
- label="Split by sentence — further splits text by sentence in addition to paragraph splitting"
190
- )
191
- slicer = gr.Button("Split and Generate", variant="primary")
192
-
193
-
194
  with gr.Column():
195
  output_msg = gr.Textbox(label="Output Message")
196
  output_audio = gr.Audio(label="Output Audio")
197
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
  prompt_mode.change(lambda x: gr_util(x), inputs=[prompt_mode], outputs=[text_prompt, audio_prompt])
199
  audio_prompt.upload(lambda x: load_audio(x), inputs=[audio_prompt], outputs=[audio_prompt])
200
  btn.click(tts_fn, inputs=[input_text, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale, language, audio_prompt, text_prompt, prompt_mode, style_text, style_weight], outputs=[output_msg, output_audio])
201
- slicer.click(lambda: ("Slicing logic not fully supported yet in this patch", None), inputs=[], outputs=[output_msg, output_audio])
202
 
203
  app.queue().launch(share=args.share)
 
1
+ # app.py (fully patched)
2
+
3
  import sys
4
  import logging
5
  import os
 
146
  noise_scale_w = gr.Slider(0.1, 2.0, 0.8, 0.1, label="Noise_W")
147
  length_scale = gr.Slider(0.1, 2.0, 1.0, 0.1, label="Length")
148
  language = gr.Dropdown(choices=["JP", "ZH", "EN", "mix", "auto"], value="JP", label="Language")
 
 
149
  btn = gr.Button("Generate Audio", variant="primary")
150
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
  with gr.Column():
152
  output_msg = gr.Textbox(label="Output Message")
153
  output_audio = gr.Audio(label="Output Audio")
154
 
155
+ with gr.Column():
156
+ with gr.Accordion("Semantic Fusion", open=False):
157
+ gr.Markdown(
158
+ value="Use auxiliary text semantics to assist speech generation (language remains same as main text)\n\n"
159
+ "**Note**: Avoid using *command-style text* (e.g., 'Happy'). Use *emotionally rich text* (e.g., 'I'm so happy!!!')\n\n"
160
+ "Leave it blank to disable. \n\n"
161
+ "**If mispronunciations occur, try replacing characters and inputting the original here with weight set to 1.0 for semantic retention.**"
162
+ )
163
+ style_text = gr.Textbox(label="Auxiliary Text")
164
+ style_weight = gr.Slider(0, 1, 0.7, 0.1, label="Weight", info="Ratio between main and auxiliary BERT embeddings")
165
+
166
+ with gr.Row():
167
+ with gr.Column():
168
+ interval_between_sent = gr.Slider(0, 5, 0.2, 0.1, label="Pause between sentences (sec)")
169
+ interval_between_para = gr.Slider(0, 10, 1, 0.1, label="Pause between paragraphs (sec)")
170
+ opt_cut_by_sent = gr.Checkbox(label="Split by sentence")
171
+ slicer = gr.Button("Split and Generate", variant="primary")
172
+
173
  prompt_mode.change(lambda x: gr_util(x), inputs=[prompt_mode], outputs=[text_prompt, audio_prompt])
174
  audio_prompt.upload(lambda x: load_audio(x), inputs=[audio_prompt], outputs=[audio_prompt])
175
  btn.click(tts_fn, inputs=[input_text, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale, language, audio_prompt, text_prompt, prompt_mode, style_text, style_weight], outputs=[output_msg, output_audio])
176
+ slicer.click(lambda: ("Slicing logic not yet implemented in this version", None), inputs=[], outputs=[output_msg, output_audio])
177
 
178
  app.queue().launch(share=args.share)