#!/usr/bin/env python3
import os

import matplotlib

# Use a non-interactive backend so plotting works in headless environments
# (an assumption for server deployments; harmless where Agg is the default)
matplotlib.use("Agg")

import gradio as gr
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
import spaces

from inference_cli import InferenceCLI

# Initialize inference CLI
cli = InferenceCLI()

# Available model choices
MODEL_CHOICES = [
    "UniFlow-Audio-large", "UniFlow-Audio-medium", "UniFlow-Audio-small"
]

# Default model name
DEFAULT_MODEL = "UniFlow-Audio-large"

# Pre-initialize models at startup so requests do not pay the loading cost
print("Initializing models, please wait...")
print(f"Loading main model: {DEFAULT_MODEL}")
cli.init_model(DEFAULT_MODEL)
print("Loading speaker model for TTS...")
cli.init_speaker_model()
print("Loading SVS processor for singing voice synthesis...")
cli.init_svs_processor()
print("Loading video preprocessor for V2A...")
cli.init_video_preprocessor()
print("All models loaded successfully!")
@spaces.GPU
def text_to_audio(
    caption,
    model_name,
    guidance_scale,
    num_steps,
    progress=gr.Progress(track_tqdm=True)
):
    """Text to Audio generation"""
    output_path = "./outputs/t2a_output.wav"
    os.makedirs("./outputs", exist_ok=True)
    try:
        cli.t2a(
            caption=caption,
            model_name=model_name,
            guidance_scale=guidance_scale,
            num_steps=num_steps,
            output_path=output_path
        )
        return output_path, "Generation successful!"
    except Exception as e:
        return None, f"Error: {str(e)}"
@spaces.GPU
def text_to_music(
    caption,
    model_name,
    guidance_scale,
    num_steps,
    progress=gr.Progress(track_tqdm=True)
):
    """Text to Music generation"""
    output_path = "./outputs/t2m_output.wav"
    os.makedirs("./outputs", exist_ok=True)
    try:
        cli.t2m(
            caption=caption,
            model_name=model_name,
            guidance_scale=guidance_scale,
            num_steps=num_steps,
            output_path=output_path
        )
        return output_path, "Generation successful!"
    except Exception as e:
        return None, f"Error: {str(e)}"
@spaces.GPU
def text_to_speech(
    transcript,
    ref_speaker_audio,
    model_name,
    guidance_scale,
    num_steps,
    progress=gr.Progress(track_tqdm=True)
):
    """Text to Speech synthesis"""
    output_path = "./outputs/tts_output.wav"
    os.makedirs("./outputs", exist_ok=True)
    try:
        cli.tts(
            transcript=transcript,
            ref_speaker_speech=ref_speaker_audio,
            model_name=model_name,
            guidance_scale=guidance_scale,
            num_steps=num_steps,
            output_path=output_path
        )
        return output_path, "Generation successful!"
    except Exception as e:
        return None, f"Error: {str(e)}"
@spaces.GPU
def singing_voice_synthesis(
    singer,
    lyric,
    notes,
    note_durations,
    model_name,
    guidance_scale,
    num_steps,
    progress=gr.Progress(track_tqdm=True)
):
    """Singing Voice Synthesis"""
    output_path = "./outputs/svs_output.wav"
    os.makedirs("./outputs", exist_ok=True)
    try:
        music_score = f"{lyric}<sep>{notes}<sep>{note_durations}"
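        # Example assembled score (abbreviated from the SVS demo example below):
        #   "AP你要相信AP...<sep>rest | G#3 | A#3 C4 | ...<sep>0.14 | 0.47 | ..."
        # The lyric syllables, "|"-separated note slots, and duration slots
        # are aligned one-to-one.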
        cli.svs(
            singer=singer,
            music_score=music_score,
            model_name=model_name,
            guidance_scale=guidance_scale,
            num_steps=num_steps,
            output_path=output_path
        )
        return output_path, "Generation successful!"
    except Exception as e:
        return None, f"Error: {str(e)}"
@spaces.GPU
def speech_enhancement(
    noisy_audio,
    model_name,
    guidance_scale,
    num_steps,
    progress=gr.Progress(track_tqdm=True)
):
    """Speech Enhancement"""
    output_path = "./outputs/se_output.wav"
    os.makedirs("./outputs", exist_ok=True)
    try:
        cli.se(
            noisy_speech=noisy_audio,
            model_name=model_name,
            guidance_scale=guidance_scale,
            num_steps=num_steps,
            output_path=output_path
        )
        return output_path, "Enhancement successful!"
    except Exception as e:
        return None, f"Error: {str(e)}"
def generate_spectrogram(audio_path, title="Spectrogram"):
    """Generate a spectrogram image from an audio file"""
    try:
        # Load audio file at its native sample rate
        y, sr = librosa.load(audio_path, sr=None)
        # Create figure
        fig, ax = plt.subplots(figsize=(10, 4))
        # Compute the dB-scaled magnitude spectrogram from the STFT
        D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)
        # Display spectrogram
        img = librosa.display.specshow(
            D, y_axis='hz', x_axis='time', sr=sr, ax=ax
        )
        ax.set_title(f'{title} (Sample Rate: {sr} Hz)')
        fig.colorbar(img, ax=ax, format='%+2.0f dB')
        # Save next to the audio file, handling non-.wav extensions too
        spec_path = os.path.splitext(audio_path)[0] + '_spec.png'
        plt.tight_layout()
        fig.savefig(spec_path, dpi=100, bbox_inches='tight')
        plt.close(fig)
        return spec_path
    except Exception as e:
        print(f"Error generating spectrogram: {str(e)}")
        return None
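
# Usage sketch for generate_spectrogram (the path below is hypothetical):
#   spec = generate_spectrogram("./outputs/sr_output.wav", "Output Audio Spectrogram")
# Returns the path of the saved PNG on success, or None if plotting fails.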
@spaces.GPU
def audio_super_resolution(
    low_sr_audio,
    model_name,
    guidance_scale,
    num_steps,
    progress=gr.Progress(track_tqdm=True)
):
    """Audio Super Resolution"""
    output_path = "./outputs/sr_output.wav"
    os.makedirs("./outputs", exist_ok=True)
    try:
        cli.sr(
            low_sr_audio=low_sr_audio,
            model_name=model_name,
            guidance_scale=guidance_scale,
            num_steps=num_steps,
            output_path=output_path
        )
        # Generate spectrograms for input and output
        input_spec = generate_spectrogram(
            low_sr_audio, "Input Audio Spectrogram"
        )
        output_spec = generate_spectrogram(
            output_path, "Output Audio Spectrogram"
        )
        return output_path, "Super-resolution successful!", input_spec, output_spec
    except Exception as e:
        return None, f"Error: {str(e)}", None, None
@spaces.GPU
def video_to_audio(
    video,
    model_name,
    guidance_scale,
    num_steps,
    progress=gr.Progress(track_tqdm=True)
):
    """Video to Audio generation"""
    output_path = "./outputs/v2a_output.mp4"
    os.makedirs("./outputs", exist_ok=True)
    try:
        cli.v2a(
            video=video,
            model_name=model_name,
            guidance_scale=guidance_scale,
            num_steps=num_steps,
            output_path=output_path
        )
        return output_path, "Generation successful!"
    except Exception as e:
        return None, f"Error: {str(e)}"
# Custom CSS for better tab display
custom_css = """
.tab-nav button {
    font-size: 14px !important;
    padding: 8px 12px !important;
    min-width: fit-content !important;
}
.tab-nav {
    overflow-x: auto !important;
    flex-wrap: nowrap !important;
}
"""

# Create Gradio Interface
with gr.Blocks(
    title="UniFlow-Audio Inference Demo",
    theme=gr.themes.Soft(),
    css=custom_css
) as demo:
    gr.Markdown("# 🚀 UniFlow-Audio Inference Demo")
    gr.Markdown(
        "Multi-task Audio Generation System based on [UniFlow-Audio](https://arxiv.org/abs/2509.24391)"
    )
| gr.HTML(""" | |
| <div style="padding: 10px; background-color: #fffbcc; border: 1px solid #ffe564; border-radius:4px;"> | |
| <strong>Note: </strong>For TTS, due to the restriction of HuggingFace Space, the g2p phonemizer used here is inconsistant with the one used during training, so there may be problems. Please refer to <a href="https://github.com/wsntxxn/UniFlow-Audio/blob/master/docs/INFERENCE_CLI.md">INFERENCE_CLI.md</a> for CLI calling guidance. | |
| </div> | |
| """) | |
    with gr.Tabs():
        # Tab 1: Text to Audio
        with gr.Tab("📢 Text to Audio"):
            with gr.Row():
                with gr.Column():
                    t2a_caption = gr.Textbox(
                        label="Audio Caption",
                        placeholder="e.g., a man is speaking while a dog barks",
                        lines=3
                    )
                    t2a_model = gr.Dropdown(
                        label="Model Name",
                        choices=MODEL_CHOICES,
                        value=DEFAULT_MODEL
                    )
                    with gr.Row():
                        t2a_guidance = gr.Slider(
                            label="Guidance Scale",
                            minimum=1.0,
                            maximum=10.0,
                            value=5.0,
                            step=0.5
                        )
                        t2a_steps = gr.Slider(
                            label="Sampling Steps",
                            minimum=1,
                            maximum=100,
                            value=25,
                            step=1
                        )
                    t2a_button = gr.Button("Generate Audio", variant="primary")
                with gr.Column():
                    t2a_output = gr.Audio(
                        label="Generated Audio", type="filepath"
                    )
                    t2a_status = gr.Textbox(label="Status")
            t2a_button.click(
                fn=text_to_audio,
                inputs=[t2a_caption, t2a_model, t2a_guidance, t2a_steps],
                outputs=[t2a_output, t2a_status]
            )
            gr.Examples(
                examples=[
                    ["a man is speaking while a dog barks", 5.0, 25],
                    ["footsteps on wooden floor", 5.0, 25],
                ],
                inputs=[t2a_caption, t2a_guidance, t2a_steps]
            )
        # Tab 2: Text to Music
        with gr.Tab("🎼 Text to Music"):
            with gr.Row():
                with gr.Column():
                    t2m_caption = gr.Textbox(
                        label="Music Caption",
                        placeholder="e.g., pop music with a male singing rap",
                        lines=3
                    )
                    t2m_model = gr.Dropdown(
                        label="Model Name",
                        choices=MODEL_CHOICES,
                        value=DEFAULT_MODEL
                    )
                    with gr.Row():
                        t2m_guidance = gr.Slider(
                            label="Guidance Scale",
                            minimum=1.0,
                            maximum=10.0,
                            value=5.0,
                            step=0.5
                        )
                        t2m_steps = gr.Slider(
                            label="Sampling Steps",
                            minimum=1,
                            maximum=100,
                            value=25,
                            step=1
                        )
                    t2m_button = gr.Button("Generate Music", variant="primary")
                with gr.Column():
                    t2m_output = gr.Audio(
                        label="Generated Music", type="filepath"
                    )
                    t2m_status = gr.Textbox(label="Status")
            t2m_button.click(
                fn=text_to_music,
                inputs=[t2m_caption, t2m_model, t2m_guidance, t2m_steps],
                outputs=[t2m_output, t2m_status]
            )
            gr.Examples(
                examples=[
                    ["pop music with a male singing rap", 5.0, 25],
                    ["classical piano solo", 5.0, 25],
                ],
                inputs=[t2m_caption, t2m_guidance, t2m_steps]
            )
        # Tab 3: Text to Speech
        with gr.Tab("🗣️ Text to Speech"):
            with gr.Row():
                with gr.Column():
                    tts_transcript = gr.Textbox(
                        label="Text to Synthesize",
                        placeholder="e.g., Hello this is a special sentence",
                        lines=3
                    )
                    tts_ref_audio = gr.Audio(
                        label="Reference Speaker Audio", type="filepath"
                    )
                    tts_model = gr.Dropdown(
                        label="Model Name",
                        choices=MODEL_CHOICES,
                        value=DEFAULT_MODEL
                    )
                    with gr.Row():
                        tts_guidance = gr.Slider(
                            label="Guidance Scale",
                            minimum=1.0,
                            maximum=10.0,
                            value=5.0,
                            step=0.5
                        )
                        tts_steps = gr.Slider(
                            label="Sampling Steps",
                            minimum=1,
                            maximum=100,
                            value=25,
                            step=1
                        )
                    tts_button = gr.Button(
                        "Synthesize Speech", variant="primary"
                    )
                with gr.Column():
                    tts_output = gr.Audio(
                        label="Synthesized Speech", type="filepath"
                    )
                    tts_status = gr.Textbox(label="Status")
            tts_button.click(
                fn=text_to_speech,
                inputs=[
                    tts_transcript, tts_ref_audio, tts_model, tts_guidance,
                    tts_steps
                ],
                outputs=[tts_output, tts_status]
            )
            gr.Examples(
                examples=[
                    [
                        "Hello this is a special sentence with zyloph",
                        "./data/egs/tts_speaker_ref.wav", 5.0, 25
                    ],
                ],
                inputs=[
                    tts_transcript, tts_ref_audio, tts_guidance, tts_steps
                ]
            )
        # Tab 4: Singing Voice Synthesis
        with gr.Tab("🎤 Singing Voice Synthesis"):
            with gr.Row():
                with gr.Column():
                    svs_singer = gr.Dropdown(
                        label="Singer",
                        choices=[
                            "Alto-1", "Alto-2", "Alto-3", "Alto-4", "Alto-5",
                            "Alto-6", "Alto-7", "Bass-1", "Bass-2", "Bass-3",
                            "Soprano-1", "Soprano-2", "Soprano-3", "Tenor-1",
                            "Tenor-2", "Tenor-3", "Tenor-4", "Tenor-5",
                            "Tenor-6", "Tenor-7"
                        ],
                        value="Alto-2"
                    )
                    svs_lyric = gr.Textbox(
                        label="Lyrics",
                        placeholder="e.g., AP你要相信AP相信我们会像童话故事里AP",
                        lines=2
                    )
                    svs_notes = gr.Textbox(
                        label="Note Sequence",
                        placeholder="e.g., rest | G#3 | A#3 C4 | D#4 | ...",
                        lines=2
                    )
                    svs_durations = gr.Textbox(
                        label="Note Durations",
                        placeholder="e.g., 0.14 | 0.47 | 0.1905 0.1895 | 0.41 | ...",
                        lines=2
                    )
                    svs_model = gr.Dropdown(
                        label="Model Name",
                        choices=MODEL_CHOICES,
                        value=DEFAULT_MODEL
                    )
                    with gr.Row():
                        svs_guidance = gr.Slider(
                            label="Guidance Scale",
                            minimum=1.0,
                            maximum=10.0,
                            value=5.0,
                            step=0.5
                        )
                        svs_steps = gr.Slider(
                            label="Sampling Steps",
                            minimum=1,
                            maximum=100,
                            value=25,
                            step=1
                        )
                    svs_button = gr.Button(
                        "Synthesize Singing", variant="primary"
                    )
                with gr.Column():
                    svs_output = gr.Audio(
                        label="Synthesized Singing", type="filepath"
                    )
                    svs_status = gr.Textbox(label="Status")
            svs_button.click(
                fn=singing_voice_synthesis,
                inputs=[
                    svs_singer, svs_lyric, svs_notes, svs_durations, svs_model,
                    svs_guidance, svs_steps
                ],
                outputs=[svs_output, svs_status]
            )
            gr.Examples(
                examples=[
                    [
                        "Alto-2", "AP你要相信AP相信我们会像童话故事里AP",
                        "rest | G#3 | A#3 C4 | D#4 | D#4 F4 | rest | E4 F4 | F4 | D#4 A#3 | A#3 | A#3 | C#4 | B3 C4 | C#4 | B3 C4 | A#3 | G#3 | rest",
                        "0.14 | 0.47 | 0.1905 0.1895 | 0.41 | 0.3005 0.3895 | 0.21 | 0.2391 0.1809 | 0.32 | 0.4105 0.2095 | 0.35 | 0.43 | 0.45 | 0.2309 0.2291 | 0.48 | 0.225 0.195 | 0.29 | 0.71 | 0.14",
                        5.0, 25
                    ],
                ],
                inputs=[
                    svs_singer, svs_lyric, svs_notes, svs_durations,
                    svs_guidance, svs_steps
                ]
            )
            gr.Markdown(
                """
                ### Usage Instructions
                - **Lyrics Format**: Use AP for pauses, e.g., `AP你要相信AP相信我们会像童话故事里AP`
                - **Note Format**: Separate note slots with `|`, use spaces when one syllable spans multiple notes, and use `rest` for rests
                - **Duration Format**: Note durations in seconds, separated by `|`
                """
            )
        # Tab 5: Speech Enhancement
        with gr.Tab("🔊 Speech Enhancement"):
            with gr.Row():
                with gr.Column():
                    se_input = gr.Audio(label="Noisy Speech", type="filepath")
                    se_model = gr.Dropdown(
                        label="Model Name",
                        choices=MODEL_CHOICES,
                        value=DEFAULT_MODEL
                    )
                    with gr.Row():
                        se_guidance = gr.Slider(
                            label="Guidance Scale",
                            minimum=1.0,
                            maximum=10.0,
                            value=1.0,
                            step=0.5
                        )
                        se_steps = gr.Slider(
                            label="Sampling Steps",
                            minimum=1,
                            maximum=100,
                            value=25,
                            step=1
                        )
                    se_button = gr.Button("Enhance Speech", variant="primary")
                with gr.Column():
                    se_output = gr.Audio(
                        label="Enhanced Speech", type="filepath"
                    )
                    se_status = gr.Textbox(label="Status")
            se_button.click(
                fn=speech_enhancement,
                inputs=[se_input, se_model, se_guidance, se_steps],
                outputs=[se_output, se_status]
            )
            gr.Examples(
                examples=[
                    ["./data/egs/se_noisy_sample.wav", 1.0, 25],
                ],
                inputs=[se_input, se_guidance, se_steps]
            )
        # Tab 6: Audio Super Resolution
        with gr.Tab("⬆️ Audio SR"):
            with gr.Row():
                with gr.Column():
                    sr_input = gr.Audio(
                        label="Low Sample Rate Audio", type="filepath"
                    )
                    sr_model = gr.Dropdown(
                        label="Model Name",
                        choices=MODEL_CHOICES,
                        value=DEFAULT_MODEL
                    )
                    with gr.Row():
                        sr_guidance = gr.Slider(
                            label="Guidance Scale",
                            minimum=1.0,
                            maximum=10.0,
                            value=1.0,
                            step=0.5
                        )
                        sr_steps = gr.Slider(
                            label="Sampling Steps",
                            minimum=1,
                            maximum=100,
                            value=25,
                            step=1
                        )
                    sr_button = gr.Button(
                        "Super-Resolve Audio", variant="primary"
                    )
                with gr.Column():
                    sr_output = gr.Audio(
                        label="High Sample Rate Audio", type="filepath"
                    )
                    sr_status = gr.Textbox(label="Status")
            # Spectrograms display
            with gr.Row():
                with gr.Column():
                    sr_input_spec = gr.Image(
                        label="Input Spectrogram", type="filepath"
                    )
                with gr.Column():
                    sr_output_spec = gr.Image(
                        label="Output Spectrogram", type="filepath"
                    )
            sr_button.click(
                fn=audio_super_resolution,
                inputs=[sr_input, sr_model, sr_guidance, sr_steps],
                outputs=[sr_output, sr_status, sr_input_spec, sr_output_spec]
            )
            gr.Examples(
                examples=[
                    ["./data/egs/sr_low_sr_sample.wav", 1.0, 25],
                ],
                inputs=[sr_input, sr_guidance, sr_steps]
            )
        # Tab 7: Video to Audio
        with gr.Tab("🎬 Video to Audio"):
            with gr.Row():
                with gr.Column():
                    v2a_input = gr.Video(label="Input Video")
                    v2a_model = gr.Dropdown(
                        label="Model Name",
                        choices=MODEL_CHOICES,
                        value=DEFAULT_MODEL
                    )
                    with gr.Row():
                        v2a_guidance = gr.Slider(
                            label="Guidance Scale",
                            minimum=1.0,
                            maximum=10.0,
                            value=5.0,
                            step=0.5
                        )
                        v2a_steps = gr.Slider(
                            label="Sampling Steps",
                            minimum=1,
                            maximum=100,
                            value=25,
                            step=1
                        )
                    v2a_button = gr.Button("Generate Audio", variant="primary")
                with gr.Column():
                    v2a_output = gr.Video(label="Video with Audio")
                    v2a_status = gr.Textbox(label="Status")
            v2a_button.click(
                fn=video_to_audio,
                inputs=[v2a_input, v2a_model, v2a_guidance, v2a_steps],
                outputs=[v2a_output, v2a_status]
            )
            gr.Examples(
                examples=[
                    ["./data/egs/v2a_video_sample.mp4", 5.0, 25],
                ],
                inputs=[v2a_input, v2a_guidance, v2a_steps]
            )
    gr.Markdown(
        """
        ---
        ### 📝 Notes
        - **Model Name**: Choose from `UniFlow-Audio-large`, `UniFlow-Audio-medium`, or `UniFlow-Audio-small`
        - **Guidance Scale**: Controls how strongly the input condition guides the generated output
        - **Sampling Steps**: Number of flow matching sampling steps

        💡 Tip: Models are downloaded automatically on the first run, so please be patient
        """
    )
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
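    # Local usage sketch (assumes inference_cli and the model weights are
    # available): run `python app.py`, then open http://localhost:7860.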