#!/usr/bin/env python3
import os
import gradio as gr
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
import spaces
from inference_cli import InferenceCLI
# Initialize inference CLI
cli = InferenceCLI()
# Available model choices
MODEL_CHOICES = [
"UniFlow-Audio-large", "UniFlow-Audio-medium", "UniFlow-Audio-small"
]
# Default model name
DEFAULT_MODEL = "UniFlow-Audio-large"
# Pre-initialize models
print("Initializing models, please wait...")
print(f"Loading main model: {DEFAULT_MODEL}")
cli.init_model(DEFAULT_MODEL)
print("Loading speaker model for TTS...")
cli.init_speaker_model()
print("Loading SVS processor for singing voice synthesis...")
cli.init_svs_processor()
print("Loading video preprocessor for V2A...")
cli.init_video_preprocessor()
print("All models loaded successfully!")
@spaces.GPU(duration=60)
def text_to_audio(
caption,
model_name,
guidance_scale,
num_steps,
progress=gr.Progress(track_tqdm=True)
):
"""Text to Audio generation"""
output_path = "./outputs/t2a_output.wav"
os.makedirs("./outputs", exist_ok=True)
try:
cli.t2a(
caption=caption,
model_name=model_name,
guidance_scale=guidance_scale,
num_steps=num_steps,
output_path=output_path
)
return output_path, "Generation successful!"
except Exception as e:
return None, f"Error: {str(e)}"
@spaces.GPU(duration=60)
def text_to_music(
caption,
model_name,
guidance_scale,
num_steps,
progress=gr.Progress(track_tqdm=True)
):
"""Text to Music generation"""
output_path = "./outputs/t2m_output.wav"
os.makedirs("./outputs", exist_ok=True)
try:
cli.t2m(
caption=caption,
model_name=model_name,
guidance_scale=guidance_scale,
num_steps=num_steps,
output_path=output_path
)
return output_path, "Generation successful!"
except Exception as e:
return None, f"Error: {str(e)}"
@spaces.GPU(duration=60)
def text_to_speech(
transcript,
ref_speaker_audio,
model_name,
guidance_scale,
num_steps,
progress=gr.Progress(track_tqdm=True)
):
"""Text to Speech synthesis"""
output_path = "./outputs/tts_output.wav"
os.makedirs("./outputs", exist_ok=True)
try:
cli.tts(
transcript=transcript,
ref_speaker_speech=ref_speaker_audio,
model_name=model_name,
guidance_scale=guidance_scale,
num_steps=num_steps,
output_path=output_path
)
return output_path, "Generation successful!"
except Exception as e:
return None, f"Error: {str(e)}"
@spaces.GPU(duration=60)
def singing_voice_synthesis(
singer,
lyric,
notes,
note_durations,
model_name,
guidance_scale,
num_steps,
progress=gr.Progress(track_tqdm=True)
):
"""Singing Voice Synthesis"""
output_path = "./outputs/svs_output.wav"
os.makedirs("./outputs", exist_ok=True)
try:
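        # Pack lyrics, notes and note durations into one score string joined by "<sep>"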
music_score = f"{lyric}<sep>{notes}<sep>{note_durations}"
cli.svs(
singer=singer,
music_score=music_score,
model_name=model_name,
guidance_scale=guidance_scale,
num_steps=num_steps,
output_path=output_path
)
return output_path, "Generation successful!"
except Exception as e:
return None, f"Error: {str(e)}"
@spaces.GPU(duration=60)
def speech_enhancement(
noisy_audio,
model_name,
guidance_scale,
num_steps,
progress=gr.Progress(track_tqdm=True)
):
"""Speech Enhancement"""
output_path = "./outputs/se_output.wav"
os.makedirs("./outputs", exist_ok=True)
try:
cli.se(
noisy_speech=noisy_audio,
model_name=model_name,
guidance_scale=guidance_scale,
num_steps=num_steps,
output_path=output_path
)
return output_path, "Enhancement successful!"
except Exception as e:
return None, f"Error: {str(e)}"
def generate_spectrogram(audio_path, title="Spectrogram"):
"""Generate spectrogram from audio file"""
try:
# Load audio file
y, sr = librosa.load(audio_path, sr=None)
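        # sr=None preserves the file's native sample rate, so the frequency axis shows the true bandwidth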
# Create figure
fig, ax = plt.subplots(figsize=(10, 4))
        # Compute the dB-scaled magnitude spectrogram (linear-frequency STFT)
D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)
# Display spectrogram
img = librosa.display.specshow(
D, y_axis='hz', x_axis='time', sr=sr, ax=ax
)
ax.set_title(f'{title} (Sample Rate: {sr} Hz)')
fig.colorbar(img, ax=ax, format='%+2.0f dB')
# Save to file
        # Derive the image path from the audio path (also handles non-.wav inputs)
        spec_path = os.path.splitext(audio_path)[0] + '_spec.png'
plt.tight_layout()
fig.savefig(spec_path, dpi=100, bbox_inches='tight')
plt.close(fig)
return spec_path
except Exception as e:
print(f"Error generating spectrogram: {str(e)}")
return None
@spaces.GPU(duration=60)
def audio_super_resolution(
low_sr_audio,
model_name,
guidance_scale,
num_steps,
progress=gr.Progress(track_tqdm=True)
):
"""Audio Super Resolution"""
output_path = "./outputs/sr_output.wav"
os.makedirs("./outputs", exist_ok=True)
try:
cli.sr(
low_sr_audio=low_sr_audio,
model_name=model_name,
guidance_scale=guidance_scale,
num_steps=num_steps,
output_path=output_path
)
# Generate spectrograms for input and output
input_spec = generate_spectrogram(
low_sr_audio, "Input Audio Spectrogram"
)
output_spec = generate_spectrogram(
output_path, "Output Audio Spectrogram"
)
return output_path, "Super-resolution successful!", input_spec, output_spec
except Exception as e:
return None, f"Error: {str(e)}", None, None
@spaces.GPU(duration=60)
def video_to_audio(
video,
model_name,
guidance_scale,
num_steps,
progress=gr.Progress(track_tqdm=True)
):
"""Video to Audio generation"""
output_path = "./outputs/v2a_output.mp4"
os.makedirs("./outputs", exist_ok=True)
try:
cli.v2a(
video=video,
model_name=model_name,
guidance_scale=guidance_scale,
num_steps=num_steps,
output_path=output_path
)
return output_path, "Generation successful!"
except Exception as e:
return None, f"Error: {str(e)}"
# Custom CSS for better tab display
custom_css = """
.tab-nav button {
font-size: 14px !important;
padding: 8px 12px !important;
min-width: fit-content !important;
}
.tab-nav {
overflow-x: auto !important;
flex-wrap: nowrap !important;
}
"""
# Create Gradio Interface
with gr.Blocks(
title="UniFlow-Audio Inference Demo",
theme=gr.themes.Soft(),
css=custom_css
) as demo:
gr.Markdown("# πŸ”Š UniFlow-Audio Inference Demo")
gr.Markdown(
"Multi-task Audio Generation System based on [UniFlow-Audio](https://arxiv.org/abs/2509.24391)"
)
gr.HTML("""
<div style="padding: 10px; background-color: #fffbcc; border: 1px solid #ffe564; border-radius:4px;">
<strong>Note: </strong>For TTS, due to the restriction of HuggingFace Space, the g2p phonemizer used here is inconsistant with the one used during training, so there may be problems. Please refer to <a href="https://github.com/wsntxxn/UniFlow-Audio/blob/master/docs/INFERENCE_CLI.md">INFERENCE_CLI.md</a> for CLI calling guidance.
</div>
""")
with gr.Tabs():
# Tab 1: Text to Audio
with gr.Tab("πŸ“’ Text to Audio"):
with gr.Row():
with gr.Column():
t2a_caption = gr.Textbox(
label="Audio Caption",
placeholder="e.g., a man is speaking while a dog barks",
lines=3
)
t2a_model = gr.Dropdown(
label="Model Name",
choices=MODEL_CHOICES,
value=DEFAULT_MODEL
)
with gr.Row():
t2a_guidance = gr.Slider(
label="Guidance Scale",
minimum=1.0,
maximum=10.0,
value=5.0,
step=0.5
)
t2a_steps = gr.Slider(
label="Sampling Steps",
minimum=1,
maximum=100,
value=25,
step=1
)
t2a_button = gr.Button("Generate Audio", variant="primary")
with gr.Column():
t2a_output = gr.Audio(
label="Generated Audio", type="filepath"
)
t2a_status = gr.Textbox(label="Status")
t2a_button.click(
fn=text_to_audio,
inputs=[t2a_caption, t2a_model, t2a_guidance, t2a_steps],
outputs=[t2a_output, t2a_status]
)
gr.Examples(
examples=[
["a man is speaking while a dog barks", 5.0, 25],
["footsteps on wooden floor", 5.0, 25],
],
inputs=[t2a_caption, t2a_guidance, t2a_steps]
)
# Tab 2: Text to Music
with gr.Tab("🎼 Text to Music"):
with gr.Row():
with gr.Column():
t2m_caption = gr.Textbox(
label="Music Caption",
placeholder="e.g., pop music with a male singing rap",
lines=3
)
t2m_model = gr.Dropdown(
label="Model Name",
choices=MODEL_CHOICES,
value=DEFAULT_MODEL
)
with gr.Row():
t2m_guidance = gr.Slider(
label="Guidance Scale",
minimum=1.0,
maximum=10.0,
value=5.0,
step=0.5
)
t2m_steps = gr.Slider(
label="Sampling Steps",
minimum=1,
maximum=100,
value=25,
step=1
)
t2m_button = gr.Button("Generate Music", variant="primary")
with gr.Column():
t2m_output = gr.Audio(
label="Generated Music", type="filepath"
)
t2m_status = gr.Textbox(label="Status")
t2m_button.click(
fn=text_to_music,
inputs=[t2m_caption, t2m_model, t2m_guidance, t2m_steps],
outputs=[t2m_output, t2m_status]
)
gr.Examples(
examples=[
["pop music with a male singing rap", 5.0, 25],
["classical piano solo", 5.0, 25],
],
inputs=[t2m_caption, t2m_guidance, t2m_steps]
)
# Tab 3: Text to Speech
with gr.Tab("πŸ—£οΈ Text to Speech"):
with gr.Row():
with gr.Column():
tts_transcript = gr.Textbox(
label="Text to Synthesize",
placeholder="e.g., Hello this is a special sentence",
lines=3
)
tts_ref_audio = gr.Audio(
label="Reference Speaker Audio", type="filepath"
)
tts_model = gr.Dropdown(
label="Model Name",
choices=MODEL_CHOICES,
value=DEFAULT_MODEL
)
with gr.Row():
tts_guidance = gr.Slider(
label="Guidance Scale",
minimum=1.0,
maximum=10.0,
value=5.0,
step=0.5
)
tts_steps = gr.Slider(
label="Sampling Steps",
minimum=1,
maximum=100,
value=25,
step=1
)
tts_button = gr.Button(
"Synthesize Speech", variant="primary"
)
with gr.Column():
tts_output = gr.Audio(
label="Synthesized Speech", type="filepath"
)
tts_status = gr.Textbox(label="Status")
tts_button.click(
fn=text_to_speech,
inputs=[
tts_transcript, tts_ref_audio, tts_model, tts_guidance,
tts_steps
],
outputs=[tts_output, tts_status]
)
gr.Examples(
examples=[
[
"Hello this is a special sentence with zyloph",
"./data/egs/tts_speaker_ref.wav", 5.0, 25
],
],
inputs=[
tts_transcript, tts_ref_audio, tts_guidance, tts_steps
]
)
# Tab 4: Singing Voice Synthesis
with gr.Tab("🎀 Singing Voice Synthesis"):
with gr.Row():
with gr.Column():
svs_singer = gr.Dropdown(
label="Singer",
choices=[
"Alto-1", "Alto-2", "Alto-3", "Alto-4", "Alto-5",
"Alto-6", "Alto-7", "Bass-1", "Bass-2", "Bass-3",
"Soprano-1", "Soprano-2", "Soprano-3", "Tenor-1",
"Tenor-2", "Tenor-3", "Tenor-4", "Tenor-5",
"Tenor-6", "Tenor-7"
],
value="Alto-2"
)
svs_lyric = gr.Textbox(
label="Lyrics",
placeholder="e.g., AP你要相俑APη›ΈδΏ‘ζˆ‘δ»¬δΌšεƒη«₯θ―ζ•…δΊ‹ι‡ŒAP",
lines=2
)
svs_notes = gr.Textbox(
label="Note Sequence",
placeholder="e.g., rest | G#3 | A#3 C4 | D#4 | ...",
lines=2
)
svs_durations = gr.Textbox(
label="Note Durations",
placeholder=
"e.g., 0.14 | 0.47 | 0.1905 0.1895 | 0.41 | ...",
lines=2
)
svs_model = gr.Dropdown(
label="Model Name",
choices=MODEL_CHOICES,
value=DEFAULT_MODEL
)
with gr.Row():
svs_guidance = gr.Slider(
label="Guidance Scale",
minimum=1.0,
maximum=10.0,
value=5.0,
step=0.5
)
svs_steps = gr.Slider(
label="Sampling Steps",
minimum=1,
maximum=100,
value=25,
step=1
)
svs_button = gr.Button(
"Synthesize Singing", variant="primary"
)
with gr.Column():
svs_output = gr.Audio(
label="Synthesized Singing", type="filepath"
)
svs_status = gr.Textbox(label="Status")
svs_button.click(
fn=singing_voice_synthesis,
inputs=[
svs_singer, svs_lyric, svs_notes, svs_durations, svs_model,
svs_guidance, svs_steps
],
outputs=[svs_output, svs_status]
)
gr.Examples(
examples=[
[
"Alto-2", "AP你要相俑APη›ΈδΏ‘ζˆ‘δ»¬δΌšεƒη«₯θ―ζ•…δΊ‹ι‡ŒAP",
"rest | G#3 | A#3 C4 | D#4 | D#4 F4 | rest | E4 F4 | F4 | D#4 A#3 | A#3 | A#3 | C#4 | B3 C4 | C#4 | B3 C4 | A#3 | G#3 | rest",
"0.14 | 0.47 | 0.1905 0.1895 | 0.41 | 0.3005 0.3895 | 0.21 | 0.2391 0.1809 | 0.32 | 0.4105 0.2095 | 0.35 | 0.43 | 0.45 | 0.2309 0.2291 | 0.48 | 0.225 0.195 | 0.29 | 0.71 | 0.14",
5.0, 25
],
],
inputs=[
svs_singer, svs_lyric, svs_notes, svs_durations,
svs_guidance, svs_steps
]
)
gr.Markdown(
"""
### Usage Instructions
- **Lyrics Format**: Use AP for pauses, e.g., `AP你要相俑APη›ΈδΏ‘ζˆ‘δ»¬δΌšεƒη«₯θ―ζ•…δΊ‹ι‡ŒAP`
            - **Note Format**: Separate syllables with `|`; when one syllable spans several notes, separate those notes with spaces; use `rest` for rests
            - **Duration Format**: Note durations in seconds, separated by `|`, matching the note sequence one-to-one
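            - **Music Score**: internally, the three fields are joined as `lyric<sep>notes<sep>note_durations` before being passed to `cli.svs`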
"""
)
# Tab 5: Speech Enhancement
with gr.Tab("πŸ”Š Speech Enhancement"):
with gr.Row():
with gr.Column():
se_input = gr.Audio(label="Noisy Speech", type="filepath")
se_model = gr.Dropdown(
label="Model Name",
choices=MODEL_CHOICES,
value=DEFAULT_MODEL
)
with gr.Row():
se_guidance = gr.Slider(
label="Guidance Scale",
minimum=1.0,
maximum=10.0,
value=1.0,
step=0.5
)
se_steps = gr.Slider(
label="Sampling Steps",
minimum=1,
maximum=100,
value=25,
step=1
)
se_button = gr.Button("Enhance Speech", variant="primary")
with gr.Column():
se_output = gr.Audio(
label="Enhanced Speech", type="filepath"
)
se_status = gr.Textbox(label="Status")
se_button.click(
fn=speech_enhancement,
inputs=[se_input, se_model, se_guidance, se_steps],
outputs=[se_output, se_status]
)
gr.Examples(
examples=[
["./data/egs/se_noisy_sample.wav", 1.0, 25],
],
inputs=[se_input, se_guidance, se_steps]
)
# Tab 6: Audio Super Resolution
with gr.Tab("⬆️ Audio SR"):
with gr.Row():
with gr.Column():
sr_input = gr.Audio(
label="Low Sample Rate Audio", type="filepath"
)
sr_model = gr.Dropdown(
label="Model Name",
choices=MODEL_CHOICES,
value=DEFAULT_MODEL
)
with gr.Row():
sr_guidance = gr.Slider(
label="Guidance Scale",
minimum=1.0,
maximum=10.0,
value=1.0,
step=0.5
)
sr_steps = gr.Slider(
label="Sampling Steps",
minimum=1,
maximum=100,
value=25,
step=1
)
sr_button = gr.Button(
"Super-Resolve Audio", variant="primary"
)
with gr.Column():
sr_output = gr.Audio(
label="High Sample Rate Audio", type="filepath"
)
sr_status = gr.Textbox(label="Status")
# Spectrograms display
with gr.Row():
with gr.Column():
sr_input_spec = gr.Image(
label="Input Spectrogram", type="filepath"
)
with gr.Column():
sr_output_spec = gr.Image(
label="Output Spectrogram", type="filepath"
)
sr_button.click(
fn=audio_super_resolution,
inputs=[sr_input, sr_model, sr_guidance, sr_steps],
outputs=[sr_output, sr_status, sr_input_spec, sr_output_spec]
)
gr.Examples(
examples=[
["./data/egs/sr_low_sr_sample.wav", 1.0, 25],
],
inputs=[sr_input, sr_guidance, sr_steps]
)
# Tab 7: Video to Audio
with gr.Tab("🎬 Video to Audio"):
with gr.Row():
with gr.Column():
v2a_input = gr.Video(label="Input Video")
v2a_model = gr.Dropdown(
label="Model Name",
choices=MODEL_CHOICES,
value=DEFAULT_MODEL
)
with gr.Row():
v2a_guidance = gr.Slider(
label="Guidance Scale",
minimum=1.0,
maximum=10.0,
value=5.0,
step=0.5
)
v2a_steps = gr.Slider(
label="Sampling Steps",
minimum=1,
maximum=100,
value=25,
step=1
)
v2a_button = gr.Button("Generate Audio", variant="primary")
with gr.Column():
v2a_output = gr.Video(label="Video with Audio")
v2a_status = gr.Textbox(label="Status")
v2a_button.click(
fn=video_to_audio,
inputs=[v2a_input, v2a_model, v2a_guidance, v2a_steps],
outputs=[v2a_output, v2a_status]
)
gr.Examples(
examples=[
["./data/egs/v2a_video_sample.mp4", 5.0, 25],
],
inputs=[v2a_input, v2a_guidance, v2a_steps]
)
gr.Markdown(
"""
---
### πŸ“ Notes
- **Model Name**: Choose from `UniFlow-Audio-large`, `UniFlow-Audio-medium`, or `UniFlow-Audio-small`
    - **Guidance Scale**: Controls how strongly the output follows the input condition; higher values adhere to the condition more closely
- **Sampling Steps**: Number of flow matching sampling steps
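    - **Defaults**: Speech enhancement and audio super-resolution default to a guidance scale of 1.0; the other tasks default to 5.0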
    πŸ’‘ Tip: Models are downloaded automatically on the first run, so the initial startup may take a while.
"""
)
if __name__ == "__main__":
demo.launch(server_name="0.0.0.0", server_port=7860, share=False)