Spaces: Running on Zero
Update gradio_app.py
gradio_app.py +25 -22
gradio_app.py CHANGED
@@ -1,13 +1,25 @@
+# Requirements for Hugging Face Spaces:
+# torch
+# torchaudio
+# gradio
+# numpy
+# einops
+# vocos
+# pydub
+# cached_path
+# transformers
+# librosa
+# soundfile
+# spaces
+
 import os
 import re
 import torch
 import torchaudio
 import gradio as gr
-import warnings
 import numpy as np
 import tempfile
 from einops import rearrange
-warnings.filterwarnings("ignore", message=".*weights_only=False.*", category=FutureWarning)
 from vocos import Vocos
 from pydub import AudioSegment, silence
 from model import CFM, UNetT, DiT, MMDiT
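The comment block added at the top of the file only documents dependencies; a Space installs whatever is listed in requirements.txt, so the header presumably mirrors a file along these lines (package names are taken directly from the comments above; version pins are not part of this commit):

    # requirements.txt (assumed counterpart of the comment header)
    torch
    torchaudio
    gradio
    numpy
    einops
    vocos
    pydub
    cached_path
    transformers
    librosa
    soundfile
    spaces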
@@ -20,7 +32,6 @@ from model.utils import (
 )
 from transformers import pipeline
 import librosa
-import click
 import soundfile as sf
 
 try:
@@ -79,6 +90,7 @@ speed = 1.0
 fix_duration = None
 
 
+@gpu_decorator
 def load_model(repo_name, exp_name, model_cls, model_cfg, ckpt_step):
     ckpt_path = str(cached_path(f"hf://SWivid/{repo_name}/{exp_name}/model_{ckpt_step}.safetensors"))
     # ckpt_path = f"ckpts/{exp_name}/model_{ckpt_step}.pt"  # .pt | .safetensors
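This hunk and the four that follow wrap every model-touching function in @gpu_decorator. The decorator's definition is not part of the diff; on a ZeroGPU Space the conventional shape is a thin wrapper that applies spaces.GPU when the spaces SDK is importable (which the try: block in the import section suggests) and leaves the function untouched otherwise. A minimal sketch of that convention, where the USING_SPACES flag and the local fallback are assumptions rather than code from this commit:

    try:
        import spaces  # ZeroGPU SDK, available inside Hugging Face Spaces
        USING_SPACES = True
    except ImportError:
        USING_SPACES = False

    def gpu_decorator(func):
        # On ZeroGPU, spaces.GPU attaches a GPU to the process for the duration
        # of each call to func; locally the function is returned unchanged.
        if USING_SPACES:
            return spaces.GPU(func)
        return func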
@@ -212,6 +224,7 @@ def split_text_into_batches(text, max_chars=200, split_words=SPLIT_WORDS):
 
     return batches
 
+@gpu_decorator
 def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence, progress=gr.Progress()):
     if exp_name == "F5-TTS":
         ema_model = F5TTS_ema_model
@@ -296,6 +309,7 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence,
 
     return (target_sample_rate, final_wave), spectrogram_path
 
+@gpu_decorator
 def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence, custom_split_words=''):
     if not custom_split_words.strip():
         custom_words = [word.strip() for word in custom_split_words.split(',')]
@@ -345,6 +359,7 @@ def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence, custom_s
     gr.Info(f"Generating audio using {exp_name} in {len(gen_text_batches)} batches")
     return infer_batch((audio, sr), ref_text, gen_text_batches, exp_name, remove_silence)
 
+@gpu_decorator
 def generate_podcast(script, speaker1_name, ref_audio1, ref_text1, speaker2_name, ref_audio2, ref_text2, exp_name, remove_silence):
     # Split the script into speaker blocks
     speaker_pattern = re.compile(f"^({re.escape(speaker1_name)}|{re.escape(speaker2_name)}):", re.MULTILINE)
@@ -681,6 +696,7 @@ with gr.Blocks() as app_emotional:
     # Output audio
     audio_output_emotional = gr.Audio(label="Synthesized Audio")
 
+    @gpu_decorator
     def generate_emotional_speech(
         regular_audio,
         regular_ref_text,
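Note that generate_emotional_speech is defined inside the with gr.Blocks() context, so the decorator is applied at definition time, before Gradio registers the callback; that ordering is what lets ZeroGPU manage the call. For callbacks that can outlive the default ZeroGPU time slice, spaces.GPU also accepts a duration budget; a hypothetical variant (the function name and the 120-second figure are illustrative only):

    import spaces

    @spaces.GPU(duration=120)  # request a longer GPU slot; 120 s is an example value
    def generate_long_form_speech(script: str):
        ...  # a real handler would run model inference here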
@@ -803,24 +819,11 @@ If you're having issues, try converting your reference audio to WAV or MP3, clip
 )
 gr.TabbedInterface([app_tts, app_podcast, app_emotional, app_credits], ["TTS", "Podcast", "Multi-Style", "Credits"])
 
-@click.command()
-@click.option("--port", "-p", default=None, type=int, help="Port to run the app on")
-@click.option("--host", "-H", default=None, help="Host to run the app on")
-@click.option(
-    "--share",
-    "-s",
-    default=False,
-    is_flag=True,
-    help="Share the app via Gradio share link",
-)
-@click.option("--api", "-a", default=True, is_flag=True, help="Allow API access")
-def main(port, host, share, api):
-    global app
-    print(f"Starting app...")
-    app.queue(api_open=api).launch(
-        server_name=host, server_port=port, share=share, show_api=api
+print(f"Starting app on Hugging Face Spaces...")
+app.queue(api_open=True).launch(
+    server_name="0.0.0.0",
+    server_port=7860,
+    share=False,
+    show_api=True
 )
 
-
-if __name__ == "__main__":
-    main()
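With the click CLI removed, the launch parameters are fixed to what a Space expects: Gradio served on 0.0.0.0:7860, no share link. If running the same file locally still matters, one way to keep both paths without reintroducing click is to branch on the SPACE_ID environment variable, which Hugging Face sets inside a Space; a sketch under that assumption, with illustrative fallback values:

    import os

    # SPACE_ID is set by Hugging Face inside a Space and absent locally.
    on_spaces = os.getenv("SPACE_ID") is not None

    app.queue(api_open=True).launch(
        server_name="0.0.0.0" if on_spaces else "127.0.0.1",  # Spaces proxies port 7860
        server_port=7860,
        share=False,
        show_api=True,
    )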