#!/usr/bin/env python3 import os import gradio as gr import numpy as np import matplotlib.pyplot as plt import librosa import librosa.display import spaces from inference_cli import InferenceCLI # Initialize inference CLI cli = InferenceCLI() # Available model choices MODEL_CHOICES = [ "UniFlow-Audio-large", "UniFlow-Audio-medium", "UniFlow-Audio-small" ] # Default model name DEFAULT_MODEL = "UniFlow-Audio-large" # Pre-initialize models print("Initializing models, please wait...") print(f"Loading main model: {DEFAULT_MODEL}") cli.init_model(DEFAULT_MODEL) print("Loading speaker model for TTS...") cli.init_speaker_model() print("Loading SVS processor for singing voice synthesis...") cli.init_svs_processor() print("Loading video preprocessor for V2A...") cli.init_video_preprocessor() print("All models loaded successfully!") @spaces.GPU(duration=60) def text_to_audio( caption, model_name, guidance_scale, num_steps, progress=gr.Progress(track_tqdm=True) ): """Text to Audio generation""" output_path = "./outputs/t2a_output.wav" os.makedirs("./outputs", exist_ok=True) try: cli.t2a( caption=caption, model_name=model_name, guidance_scale=guidance_scale, num_steps=num_steps, output_path=output_path ) return output_path, "Generation successful!" except Exception as e: return None, f"Error: {str(e)}" @spaces.GPU(duration=60) def text_to_music( caption, model_name, guidance_scale, num_steps, progress=gr.Progress(track_tqdm=True) ): """Text to Music generation""" output_path = "./outputs/t2m_output.wav" os.makedirs("./outputs", exist_ok=True) try: cli.t2m( caption=caption, model_name=model_name, guidance_scale=guidance_scale, num_steps=num_steps, output_path=output_path ) return output_path, "Generation successful!" except Exception as e: return None, f"Error: {str(e)}" @spaces.GPU(duration=60) def text_to_speech( transcript, ref_speaker_audio, model_name, guidance_scale, num_steps, progress=gr.Progress(track_tqdm=True) ): """Text to Speech synthesis""" output_path = "./outputs/tts_output.wav" os.makedirs("./outputs", exist_ok=True) try: cli.tts( transcript=transcript, ref_speaker_speech=ref_speaker_audio, model_name=model_name, guidance_scale=guidance_scale, num_steps=num_steps, output_path=output_path ) return output_path, "Generation successful!" except Exception as e: return None, f"Error: {str(e)}" @spaces.GPU(duration=60) def singing_voice_synthesis( singer, lyric, notes, note_durations, model_name, guidance_scale, num_steps, progress=gr.Progress(track_tqdm=True) ): """Singing Voice Synthesis""" output_path = "./outputs/svs_output.wav" os.makedirs("./outputs", exist_ok=True) try: music_score = f"{lyric}{notes}{note_durations}" cli.svs( singer=singer, music_score=music_score, model_name=model_name, guidance_scale=guidance_scale, num_steps=num_steps, output_path=output_path ) return output_path, "Generation successful!" except Exception as e: return None, f"Error: {str(e)}" @spaces.GPU(duration=60) def speech_enhancement( noisy_audio, model_name, guidance_scale, num_steps, progress=gr.Progress(track_tqdm=True) ): """Speech Enhancement""" output_path = "./outputs/se_output.wav" os.makedirs("./outputs", exist_ok=True) try: cli.se( noisy_speech=noisy_audio, model_name=model_name, guidance_scale=guidance_scale, num_steps=num_steps, output_path=output_path ) return output_path, "Enhancement successful!" except Exception as e: return None, f"Error: {str(e)}" def generate_spectrogram(audio_path, title="Spectrogram"): """Generate spectrogram from audio file""" try: # Load audio file y, sr = librosa.load(audio_path, sr=None) # Create figure fig, ax = plt.subplots(figsize=(10, 4)) # Generate mel spectrogram D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max) # Display spectrogram img = librosa.display.specshow( D, y_axis='hz', x_axis='time', sr=sr, ax=ax ) ax.set_title(f'{title} (Sample Rate: {sr} Hz)') fig.colorbar(img, ax=ax, format='%+2.0f dB') # Save to file spec_path = audio_path.replace('.wav', '_spec.png') plt.tight_layout() fig.savefig(spec_path, dpi=100, bbox_inches='tight') plt.close(fig) return spec_path except Exception as e: print(f"Error generating spectrogram: {str(e)}") return None @spaces.GPU(duration=60) def audio_super_resolution( low_sr_audio, model_name, guidance_scale, num_steps, progress=gr.Progress(track_tqdm=True) ): """Audio Super Resolution""" output_path = "./outputs/sr_output.wav" os.makedirs("./outputs", exist_ok=True) try: cli.sr( low_sr_audio=low_sr_audio, model_name=model_name, guidance_scale=guidance_scale, num_steps=num_steps, output_path=output_path ) # Generate spectrograms for input and output input_spec = generate_spectrogram( low_sr_audio, "Input Audio Spectrogram" ) output_spec = generate_spectrogram( output_path, "Output Audio Spectrogram" ) return output_path, "Super-resolution successful!", input_spec, output_spec except Exception as e: return None, f"Error: {str(e)}", None, None @spaces.GPU(duration=60) def video_to_audio( video, model_name, guidance_scale, num_steps, progress=gr.Progress(track_tqdm=True) ): """Video to Audio generation""" output_path = "./outputs/v2a_output.mp4" os.makedirs("./outputs", exist_ok=True) try: cli.v2a( video=video, model_name=model_name, guidance_scale=guidance_scale, num_steps=num_steps, output_path=output_path ) return output_path, "Generation successful!" except Exception as e: return None, f"Error: {str(e)}" # Custom CSS for better tab display custom_css = """ .tab-nav button { font-size: 14px !important; padding: 8px 12px !important; min-width: fit-content !important; } .tab-nav { overflow-x: auto !important; flex-wrap: nowrap !important; } """ # Create Gradio Interface with gr.Blocks( title="UniFlow-Audio Inference Demo", theme=gr.themes.Soft(), css=custom_css ) as demo: gr.Markdown("# 🔊 UniFlow-Audio Inference Demo") gr.Markdown( "Multi-task Audio Generation System based on [UniFlow-Audio](https://arxiv.org/abs/2509.24391)" ) gr.HTML("""
Note: For TTS, due to the restriction of HuggingFace Space, the g2p phonemizer used here is inconsistant with the one used during training, so there may be problems. Please refer to INFERENCE_CLI.md for CLI calling guidance.
""") with gr.Tabs(): # Tab 1: Text to Audio with gr.Tab("📢 Text to Audio"): with gr.Row(): with gr.Column(): t2a_caption = gr.Textbox( label="Audio Caption", placeholder="e.g., a man is speaking while a dog barks", lines=3 ) t2a_model = gr.Dropdown( label="Model Name", choices=MODEL_CHOICES, value=DEFAULT_MODEL ) with gr.Row(): t2a_guidance = gr.Slider( label="Guidance Scale", minimum=1.0, maximum=10.0, value=5.0, step=0.5 ) t2a_steps = gr.Slider( label="Sampling Steps", minimum=1, maximum=100, value=25, step=1 ) t2a_button = gr.Button("Generate Audio", variant="primary") with gr.Column(): t2a_output = gr.Audio( label="Generated Audio", type="filepath" ) t2a_status = gr.Textbox(label="Status") t2a_button.click( fn=text_to_audio, inputs=[t2a_caption, t2a_model, t2a_guidance, t2a_steps], outputs=[t2a_output, t2a_status] ) gr.Examples( examples=[ ["a man is speaking while a dog barks", 5.0, 25], ["footsteps on wooden floor", 5.0, 25], ], inputs=[t2a_caption, t2a_guidance, t2a_steps] ) # Tab 2: Text to Music with gr.Tab("🎼 Text to Music"): with gr.Row(): with gr.Column(): t2m_caption = gr.Textbox( label="Music Caption", placeholder="e.g., pop music with a male singing rap", lines=3 ) t2m_model = gr.Dropdown( label="Model Name", choices=MODEL_CHOICES, value=DEFAULT_MODEL ) with gr.Row(): t2m_guidance = gr.Slider( label="Guidance Scale", minimum=1.0, maximum=10.0, value=5.0, step=0.5 ) t2m_steps = gr.Slider( label="Sampling Steps", minimum=1, maximum=100, value=25, step=1 ) t2m_button = gr.Button("Generate Music", variant="primary") with gr.Column(): t2m_output = gr.Audio( label="Generated Music", type="filepath" ) t2m_status = gr.Textbox(label="Status") t2m_button.click( fn=text_to_music, inputs=[t2m_caption, t2m_model, t2m_guidance, t2m_steps], outputs=[t2m_output, t2m_status] ) gr.Examples( examples=[ ["pop music with a male singing rap", 5.0, 25], ["classical piano solo", 5.0, 25], ], inputs=[t2m_caption, t2m_guidance, t2m_steps] ) # Tab 3: Text to Speech with gr.Tab("🗣️ Text to Speech"): with gr.Row(): with gr.Column(): tts_transcript = gr.Textbox( label="Text to Synthesize", placeholder="e.g., Hello this is a special sentence", lines=3 ) tts_ref_audio = gr.Audio( label="Reference Speaker Audio", type="filepath" ) tts_model = gr.Dropdown( label="Model Name", choices=MODEL_CHOICES, value=DEFAULT_MODEL ) with gr.Row(): tts_guidance = gr.Slider( label="Guidance Scale", minimum=1.0, maximum=10.0, value=5.0, step=0.5 ) tts_steps = gr.Slider( label="Sampling Steps", minimum=1, maximum=100, value=25, step=1 ) tts_button = gr.Button( "Synthesize Speech", variant="primary" ) with gr.Column(): tts_output = gr.Audio( label="Synthesized Speech", type="filepath" ) tts_status = gr.Textbox(label="Status") tts_button.click( fn=text_to_speech, inputs=[ tts_transcript, tts_ref_audio, tts_model, tts_guidance, tts_steps ], outputs=[tts_output, tts_status] ) gr.Examples( examples=[ [ "Hello this is a special sentence with zyloph", "./data/egs/tts_speaker_ref.wav", 5.0, 25 ], ], inputs=[ tts_transcript, tts_ref_audio, tts_guidance, tts_steps ] ) # Tab 4: Singing Voice Synthesis with gr.Tab("🎤 Singing Voice Synthesis"): with gr.Row(): with gr.Column(): svs_singer = gr.Dropdown( label="Singer", choices=[ "Alto-1", "Alto-2", "Alto-3", "Alto-4", "Alto-5", "Alto-6", "Alto-7", "Bass-1", "Bass-2", "Bass-3", "Soprano-1", "Soprano-2", "Soprano-3", "Tenor-1", "Tenor-2", "Tenor-3", "Tenor-4", "Tenor-5", "Tenor-6", "Tenor-7" ], value="Alto-2" ) svs_lyric = gr.Textbox( label="Lyrics", placeholder="e.g., AP你要相信AP相信我们会像童话故事里AP", lines=2 ) svs_notes = gr.Textbox( label="Note Sequence", placeholder="e.g., rest | G#3 | A#3 C4 | D#4 | ...", lines=2 ) svs_durations = gr.Textbox( label="Note Durations", placeholder= "e.g., 0.14 | 0.47 | 0.1905 0.1895 | 0.41 | ...", lines=2 ) svs_model = gr.Dropdown( label="Model Name", choices=MODEL_CHOICES, value=DEFAULT_MODEL ) with gr.Row(): svs_guidance = gr.Slider( label="Guidance Scale", minimum=1.0, maximum=10.0, value=5.0, step=0.5 ) svs_steps = gr.Slider( label="Sampling Steps", minimum=1, maximum=100, value=25, step=1 ) svs_button = gr.Button( "Synthesize Singing", variant="primary" ) with gr.Column(): svs_output = gr.Audio( label="Synthesized Singing", type="filepath" ) svs_status = gr.Textbox(label="Status") svs_button.click( fn=singing_voice_synthesis, inputs=[ svs_singer, svs_lyric, svs_notes, svs_durations, svs_model, svs_guidance, svs_steps ], outputs=[svs_output, svs_status] ) gr.Examples( examples=[ [ "Alto-2", "AP你要相信AP相信我们会像童话故事里AP", "rest | G#3 | A#3 C4 | D#4 | D#4 F4 | rest | E4 F4 | F4 | D#4 A#3 | A#3 | A#3 | C#4 | B3 C4 | C#4 | B3 C4 | A#3 | G#3 | rest", "0.14 | 0.47 | 0.1905 0.1895 | 0.41 | 0.3005 0.3895 | 0.21 | 0.2391 0.1809 | 0.32 | 0.4105 0.2095 | 0.35 | 0.43 | 0.45 | 0.2309 0.2291 | 0.48 | 0.225 0.195 | 0.29 | 0.71 | 0.14", 5.0, 25 ], ], inputs=[ svs_singer, svs_lyric, svs_notes, svs_durations, svs_guidance, svs_steps ] ) gr.Markdown( """ ### Usage Instructions - **Lyrics Format**: Use AP for pauses, e.g., `AP你要相信AP相信我们会像童话故事里AP` - **Note Format**: Separate with `|`, use spaces for simultaneous notes, use `rest` for rests - **Duration Format**: Note durations in seconds, separated by `|` """ ) # Tab 5: Speech Enhancement with gr.Tab("🔊 Speech Enhancement"): with gr.Row(): with gr.Column(): se_input = gr.Audio(label="Noisy Speech", type="filepath") se_model = gr.Dropdown( label="Model Name", choices=MODEL_CHOICES, value=DEFAULT_MODEL ) with gr.Row(): se_guidance = gr.Slider( label="Guidance Scale", minimum=1.0, maximum=10.0, value=1.0, step=0.5 ) se_steps = gr.Slider( label="Sampling Steps", minimum=1, maximum=100, value=25, step=1 ) se_button = gr.Button("Enhance Speech", variant="primary") with gr.Column(): se_output = gr.Audio( label="Enhanced Speech", type="filepath" ) se_status = gr.Textbox(label="Status") se_button.click( fn=speech_enhancement, inputs=[se_input, se_model, se_guidance, se_steps], outputs=[se_output, se_status] ) gr.Examples( examples=[ ["./data/egs/se_noisy_sample.wav", 1.0, 25], ], inputs=[se_input, se_guidance, se_steps] ) # Tab 6: Audio Super Resolution with gr.Tab("⬆️ Audio SR"): with gr.Row(): with gr.Column(): sr_input = gr.Audio( label="Low Sample Rate Audio", type="filepath" ) sr_model = gr.Dropdown( label="Model Name", choices=MODEL_CHOICES, value=DEFAULT_MODEL ) with gr.Row(): sr_guidance = gr.Slider( label="Guidance Scale", minimum=1.0, maximum=10.0, value=1.0, step=0.5 ) sr_steps = gr.Slider( label="Sampling Steps", minimum=1, maximum=100, value=25, step=1 ) sr_button = gr.Button( "Super-Resolve Audio", variant="primary" ) with gr.Column(): sr_output = gr.Audio( label="High Sample Rate Audio", type="filepath" ) sr_status = gr.Textbox(label="Status") # Spectrograms display with gr.Row(): with gr.Column(): sr_input_spec = gr.Image( label="Input Spectrogram", type="filepath" ) with gr.Column(): sr_output_spec = gr.Image( label="Output Spectrogram", type="filepath" ) sr_button.click( fn=audio_super_resolution, inputs=[sr_input, sr_model, sr_guidance, sr_steps], outputs=[sr_output, sr_status, sr_input_spec, sr_output_spec] ) gr.Examples( examples=[ ["./data/egs/sr_low_sr_sample.wav", 1.0, 25], ], inputs=[sr_input, sr_guidance, sr_steps] ) # Tab 7: Video to Audio with gr.Tab("🎬 Video to Audio"): with gr.Row(): with gr.Column(): v2a_input = gr.Video(label="Input Video") v2a_model = gr.Dropdown( label="Model Name", choices=MODEL_CHOICES, value=DEFAULT_MODEL ) with gr.Row(): v2a_guidance = gr.Slider( label="Guidance Scale", minimum=1.0, maximum=10.0, value=5.0, step=0.5 ) v2a_steps = gr.Slider( label="Sampling Steps", minimum=1, maximum=100, value=25, step=1 ) v2a_button = gr.Button("Generate Audio", variant="primary") with gr.Column(): v2a_output = gr.Video(label="Video with Audio") v2a_status = gr.Textbox(label="Status") v2a_button.click( fn=video_to_audio, inputs=[v2a_input, v2a_model, v2a_guidance, v2a_steps], outputs=[v2a_output, v2a_status] ) gr.Examples( examples=[ ["./data/egs/v2a_video_sample.mp4", 5.0, 25], ], inputs=[v2a_input, v2a_guidance, v2a_steps] ) gr.Markdown( """ --- ### 📝 Notes - **Model Name**: Choose from `UniFlow-Audio-large`, `UniFlow-Audio-medium`, or `UniFlow-Audio-small` - **Guidance Scale**: Controls the guidance strength of the input condition on the output - **Sampling Steps**: Number of flow matching sampling steps 💡 Tip: Models will be automatically downloaded on first run, please be patient """ ) if __name__ == "__main__": demo.launch(server_name="0.0.0.0", server_port=7860, share=False)