Spaces:

wsntxxn
/

UniFlow-Audio

Running on Zero

File size: 24,241 Bytes

#!/usr/bin/env python3

import os
import gradio as gr
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display

import spaces

from inference_cli import InferenceCLI

# Shared inference front-end used by every task handler in this file.
cli = InferenceCLI()

# Model names offered in each tab's "Model Name" dropdown.
MODEL_CHOICES = [
    "UniFlow-Audio-large",
    "UniFlow-Audio-medium",
    "UniFlow-Audio-small",
]

# Model that is pre-selected in the dropdowns and loaded at start-up.
DEFAULT_MODEL = "UniFlow-Audio-large"


def _preload_models():
    """Initialize the default model and the auxiliary components once.

    The steps run in a fixed order: main model, speaker model, SVS
    processor, video preprocessor. A progress line is printed before each
    step and a final line once all of them have returned.
    """
    print("Initializing models, please wait...")
    print(f"Loading main model: {DEFAULT_MODEL}")
    cli.init_model(DEFAULT_MODEL)

    # (progress message, name of the zero-argument initializer on `cli`)
    stages = (
        ("Loading speaker model for TTS...", "init_speaker_model"),
        (
            "Loading SVS processor for singing voice synthesis...",
            "init_svs_processor",
        ),
        ("Loading video preprocessor for V2A...", "init_video_preprocessor"),
    )
    for announcement, initializer in stages:
        print(announcement)
        getattr(cli, initializer)()

    print("All models loaded successfully!")


# Runs at import time so everything is ready before the UI is built.
_preload_models()


@spaces.GPU(duration=60)
def text_to_audio(
    caption,
    model_name,
    guidance_scale,
    num_steps,
    progress=gr.Progress(track_tqdm=True)
):
    """Run the text-to-audio task and report the outcome to the UI.

    Args:
        caption: Text description forwarded to ``cli.t2a``.
        model_name: Model identifier forwarded to ``cli.t2a``.
        guidance_scale: Guidance scale forwarded to ``cli.t2a``.
        num_steps: Number of sampling steps forwarded to ``cli.t2a``.
        progress: Gradio progress object (default built with
            ``track_tqdm=True``); not referenced in the body.

    Returns:
        ``("./outputs/t2a_output.wav", "Generation successful!")`` when
        ``cli.t2a`` returns normally, otherwise ``(None, "Error: ...")``
        carrying the exception text.
    """
    wav_path = "./outputs/t2a_output.wav"
    os.makedirs("./outputs", exist_ok=True)

    request = {
        "caption": caption,
        "model_name": model_name,
        "guidance_scale": guidance_scale,
        "num_steps": num_steps,
        # cli.t2a is expected to write its result here (not verified in this file).
        "output_path": wav_path,
    }
    try:
        cli.t2a(**request)
    except Exception as err:
        # Report the failure through the status output instead of raising.
        return None, f"Error: {str(err)}"
    return wav_path, "Generation successful!"


@spaces.GPU(duration=60)
def text_to_music(
    caption,
    model_name,
    guidance_scale,
    num_steps,
    progress=gr.Progress(track_tqdm=True)
):
    """Run the text-to-music task and report the outcome to the UI.

    Args:
        caption: Music description forwarded to ``cli.t2m``.
        model_name: Model identifier forwarded to ``cli.t2m``.
        guidance_scale: Guidance scale forwarded to ``cli.t2m``.
        num_steps: Number of sampling steps forwarded to ``cli.t2m``.
        progress: Gradio progress object (default built with
            ``track_tqdm=True``); not referenced in the body.

    Returns:
        ``("./outputs/t2m_output.wav", "Generation successful!")`` when
        ``cli.t2m`` returns normally, otherwise ``(None, "Error: ...")``
        carrying the exception text.
    """
    wav_path = "./outputs/t2m_output.wav"
    os.makedirs("./outputs", exist_ok=True)

    request = {
        "caption": caption,
        "model_name": model_name,
        "guidance_scale": guidance_scale,
        "num_steps": num_steps,
        # cli.t2m is expected to write its result here (not verified in this file).
        "output_path": wav_path,
    }
    try:
        cli.t2m(**request)
    except Exception as err:
        # Report the failure through the status output instead of raising.
        return None, f"Error: {str(err)}"
    return wav_path, "Generation successful!"


@spaces.GPU(duration=60)
def text_to_speech(
    transcript,
    ref_speaker_audio,
    model_name,
    guidance_scale,
    num_steps,
    progress=gr.Progress(track_tqdm=True)
):
    """Run the text-to-speech task and report the outcome to the UI.

    Args:
        transcript: Text to synthesize, forwarded to ``cli.tts``.
        ref_speaker_audio: Reference speaker audio, forwarded to ``cli.tts``
            as ``ref_speaker_speech``.
        model_name: Model identifier forwarded to ``cli.tts``.
        guidance_scale: Guidance scale forwarded to ``cli.tts``.
        num_steps: Number of sampling steps forwarded to ``cli.tts``.
        progress: Gradio progress object (default built with
            ``track_tqdm=True``); not referenced in the body.

    Returns:
        ``("./outputs/tts_output.wav", "Generation successful!")`` when
        ``cli.tts`` returns normally, otherwise ``(None, "Error: ...")``
        carrying the exception text.
    """
    wav_path = "./outputs/tts_output.wav"
    os.makedirs("./outputs", exist_ok=True)

    request = {
        "transcript": transcript,
        "ref_speaker_speech": ref_speaker_audio,
        "model_name": model_name,
        "guidance_scale": guidance_scale,
        "num_steps": num_steps,
        # cli.tts is expected to write its result here (not verified in this file).
        "output_path": wav_path,
    }
    try:
        cli.tts(**request)
    except Exception as err:
        # Report the failure through the status output instead of raising.
        return None, f"Error: {str(err)}"
    return wav_path, "Generation successful!"


@spaces.GPU(duration=60)
def singing_voice_synthesis(
    singer,
    lyric,
    notes,
    note_durations,
    model_name,
    guidance_scale,
    num_steps,
    progress=gr.Progress(track_tqdm=True)
):
    """Run singing voice synthesis and report the outcome to the UI.

    The lyric, note sequence and note durations are combined into one
    ``<sep>``-delimited music score string before calling ``cli.svs``.

    Args:
        singer: Singer identifier forwarded to ``cli.svs``.
        lyric: Lyrics text (first field of the music score).
        notes: Note sequence text (second field of the music score).
        note_durations: Note duration text (third field of the music score).
        model_name: Model identifier forwarded to ``cli.svs``.
        guidance_scale: Guidance scale forwarded to ``cli.svs``.
        num_steps: Number of sampling steps forwarded to ``cli.svs``.
        progress: Gradio progress object (default built with
            ``track_tqdm=True``); not referenced in the body.

    Returns:
        ``("./outputs/svs_output.wav", "Generation successful!")`` when
        ``cli.svs`` returns normally, otherwise ``(None, "Error: ...")``
        carrying the exception text.
    """
    wav_path = "./outputs/svs_output.wav"
    os.makedirs("./outputs", exist_ok=True)

    try:
        # Score layout: "<lyric><sep><notes><sep><durations>".
        score_fields = (lyric, notes, note_durations)
        music_score = "<sep>".join(str(field) for field in score_fields)
        cli.svs(
            singer=singer,
            music_score=music_score,
            model_name=model_name,
            guidance_scale=guidance_scale,
            num_steps=num_steps,
            output_path=wav_path,
        )
    except Exception as err:
        # Report the failure through the status output instead of raising.
        return None, f"Error: {str(err)}"
    return wav_path, "Generation successful!"


@spaces.GPU(duration=60)
def speech_enhancement(
    noisy_audio,
    model_name,
    guidance_scale,
    num_steps,
    progress=gr.Progress(track_tqdm=True)
):
    """Run speech enhancement and report the outcome to the UI.

    Args:
        noisy_audio: Noisy input audio, forwarded to ``cli.se`` as
            ``noisy_speech``.
        model_name: Model identifier forwarded to ``cli.se``.
        guidance_scale: Guidance scale forwarded to ``cli.se``.
        num_steps: Number of sampling steps forwarded to ``cli.se``.
        progress: Gradio progress object (default built with
            ``track_tqdm=True``); not referenced in the body.

    Returns:
        ``("./outputs/se_output.wav", "Enhancement successful!")`` when
        ``cli.se`` returns normally, otherwise ``(None, "Error: ...")``
        carrying the exception text.
    """
    wav_path = "./outputs/se_output.wav"
    os.makedirs("./outputs", exist_ok=True)

    request = {
        "noisy_speech": noisy_audio,
        "model_name": model_name,
        "guidance_scale": guidance_scale,
        "num_steps": num_steps,
        # cli.se is expected to write its result here (not verified in this file).
        "output_path": wav_path,
    }
    try:
        cli.se(**request)
    except Exception as err:
        # Report the failure through the status output instead of raising.
        return None, f"Error: {str(err)}"
    return wav_path, "Enhancement successful!"


def generate_spectrogram(audio_path, title="Spectrogram"):
    """Render a linear-frequency STFT spectrogram of an audio file as a PNG.

    The image is written next to the audio file as ``<stem>_spec.png``,
    where ``<stem>`` is ``audio_path`` without its final extension. This
    works for any audio extension (``.wav``, ``.WAV``, ``.flac``, ...) and
    never touches ``.wav`` occurring elsewhere in the path.

    Args:
        audio_path: Path of the audio file to analyse.
        title: Heading of the plot; the file's sample rate is appended.

    Returns:
        Path of the saved PNG, or ``None`` if anything failed (the error
        is printed, not raised, so the caller can still return its audio).
    """
    fig = None
    try:
        # sr=None keeps the file's native sample rate, which is also shown
        # in the plot title.
        y, sr = librosa.load(audio_path, sr=None)

        fig, ax = plt.subplots(figsize=(10, 4))

        # Magnitude STFT in dB relative to the peak (linear Hz axis, not mel).
        D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)

        img = librosa.display.specshow(
            D, y_axis='hz', x_axis='time', sr=sr, ax=ax
        )
        ax.set_title(f'{title} (Sample Rate: {sr} Hz)')
        fig.colorbar(img, ax=ax, format='%+2.0f dB')

        # Swap only the final extension so the PNG never collides with the
        # audio file itself.
        stem, _ = os.path.splitext(audio_path)
        spec_path = f"{stem}_spec.png"

        # Use the figure's own method rather than pyplot's global "current
        # figure", which may belong to another request.
        fig.tight_layout()
        fig.savefig(spec_path, dpi=100, bbox_inches='tight')

        return spec_path
    except Exception as e:
        print(f"Error generating spectrogram: {str(e)}")
        return None
    finally:
        # Always release the figure, including on failure, so repeated
        # requests do not accumulate open figures.
        if fig is not None:
            plt.close(fig)


@spaces.GPU(duration=60)
def audio_super_resolution(
    low_sr_audio,
    model_name,
    guidance_scale,
    num_steps,
    progress=gr.Progress(track_tqdm=True)
):
    """Run audio super-resolution and build before/after spectrograms.

    Args:
        low_sr_audio: Path of the low sample rate input, forwarded to
            ``cli.sr`` and also used for the input spectrogram.
        model_name: Model identifier forwarded to ``cli.sr``.
        guidance_scale: Guidance scale forwarded to ``cli.sr``.
        num_steps: Number of sampling steps forwarded to ``cli.sr``.
        progress: Gradio progress object (default built with
            ``track_tqdm=True``); not referenced in the body.

    Returns:
        A 4-tuple ``(output_path, status, input_spec, output_spec)``. On
        success the path is ``./outputs/sr_output.wav`` and the two
        spectrogram entries are whatever ``generate_spectrogram`` returned
        for the input and the output. If an exception is raised the tuple
        is ``(None, "Error: ...", None, None)``.
    """
    hi_res_path = "./outputs/sr_output.wav"
    os.makedirs("./outputs", exist_ok=True)

    try:
        cli.sr(
            low_sr_audio=low_sr_audio,
            model_name=model_name,
            guidance_scale=guidance_scale,
            num_steps=num_steps,
            output_path=hi_res_path,
        )

        # Input first, then output, so the two plots can be compared.
        before_png = generate_spectrogram(
            low_sr_audio, "Input Audio Spectrogram"
        )
        after_png = generate_spectrogram(
            hi_res_path, "Output Audio Spectrogram"
        )
    except Exception as err:
        # Report the failure through the status output instead of raising.
        return None, f"Error: {str(err)}", None, None
    return hi_res_path, "Super-resolution successful!", before_png, after_png


@spaces.GPU(duration=60)
def video_to_audio(
    video,
    model_name,
    guidance_scale,
    num_steps,
    progress=gr.Progress(track_tqdm=True)
):
    """Run the video-to-audio task and report the outcome to the UI.

    Args:
        video: Input video forwarded to ``cli.v2a``.
        model_name: Model identifier forwarded to ``cli.v2a``.
        guidance_scale: Guidance scale forwarded to ``cli.v2a``.
        num_steps: Number of sampling steps forwarded to ``cli.v2a``.
        progress: Gradio progress object (default built with
            ``track_tqdm=True``); not referenced in the body.

    Returns:
        ``("./outputs/v2a_output.mp4", "Generation successful!")`` when
        ``cli.v2a`` returns normally, otherwise ``(None, "Error: ...")``
        carrying the exception text.
    """
    mp4_path = "./outputs/v2a_output.mp4"
    os.makedirs("./outputs", exist_ok=True)

    request = {
        "video": video,
        "model_name": model_name,
        "guidance_scale": guidance_scale,
        "num_steps": num_steps,
        # cli.v2a is expected to write its result here (not verified in this file).
        "output_path": mp4_path,
    }
    try:
        cli.v2a(**request)
    except Exception as err:
        # Report the failure through the status output instead of raising.
        return None, f"Error: {str(err)}"
    return mp4_path, "Generation successful!"


# Custom CSS handed to gr.Blocks(css=...) when the interface is built.
# Rule 1: buttons under `.tab-nav` get a 14px font, tighter padding and a
#         minimum width that fits their label.
# Rule 2: `.tab-nav` itself stays on one line and scrolls horizontally
#         instead of wrapping.
# NOTE(review): `.tab-nav` is presumably the class Gradio assigns to the tab
# bar — confirm against the Gradio version in use.
custom_css = """
.tab-nav button {
    font-size: 14px !important;
    padding: 8px 12px !important;
    min-width: fit-content !important;
}
.tab-nav {
    overflow-x: auto !important;
    flex-wrap: nowrap !important;
}
"""

# Create Gradio Interface
with gr.Blocks(
    title="UniFlow-Audio Inference Demo",
    theme=gr.themes.Soft(),
    css=custom_css
) as demo:
    # Page heading and one-line description with a link to the paper.
    gr.Markdown("# 🔊 UniFlow-Audio Inference Demo")
    gr.Markdown(
        "Multi-task Audio Generation System based on [UniFlow-Audio](https://arxiv.org/abs/2509.24391)"
    )
    # Highlighted notice about the TTS phonemizer limitation on this Space
    # (user-facing text; spelling of "inconsistent" corrected).
    gr.HTML("""
    <div style="padding: 10px; background-color: #fffbcc; border: 1px solid #ffe564; border-radius:4px;">
        <strong>Note: </strong>For TTS, due to the restriction of HuggingFace Space, the g2p phonemizer used here is inconsistent with the one used during training, so there may be problems. Please refer to <a href="https://github.com/wsntxxn/UniFlow-Audio/blob/master/docs/INFERENCE_CLI.md">INFERENCE_CLI.md</a> for CLI calling guidance.
    </div>
    """)

    with gr.Tabs():
        # Tab 1: Text to Audio
        with gr.Tab("📢 Text to Audio"):
            with gr.Row():
                # Left column: prompt, model choice and sampling controls.
                with gr.Column():
                    t2a_caption = gr.Textbox(
                        lines=3,
                        label="Audio Caption",
                        placeholder="e.g., a man is speaking while a dog barks",
                    )
                    t2a_model = gr.Dropdown(
                        choices=MODEL_CHOICES,
                        value=DEFAULT_MODEL,
                        label="Model Name",
                    )
                    with gr.Row():
                        t2a_guidance = gr.Slider(
                            minimum=1.0, maximum=10.0, step=0.5, value=5.0,
                            label="Guidance Scale",
                        )
                        t2a_steps = gr.Slider(
                            minimum=1, maximum=100, step=1, value=25,
                            label="Sampling Steps",
                        )
                    t2a_button = gr.Button("Generate Audio", variant="primary")

                # Right column: generated audio and status message.
                with gr.Column():
                    t2a_output = gr.Audio(
                        type="filepath", label="Generated Audio"
                    )
                    t2a_status = gr.Textbox(label="Status")

            # Inputs are listed in the order of text_to_audio's leading
            # parameters; outputs match its (path, status) return value.
            t2a_button.click(
                fn=text_to_audio,
                inputs=[t2a_caption, t2a_model, t2a_guidance, t2a_steps],
                outputs=[t2a_output, t2a_status],
            )

            # Preset rows: caption, guidance scale, sampling steps
            # (the model dropdown is not part of the examples).
            gr.Examples(
                inputs=[t2a_caption, t2a_guidance, t2a_steps],
                examples=[
                    ["a man is speaking while a dog barks", 5.0, 25],
                    ["footsteps on wooden floor", 5.0, 25],
                ],
            )

        # Tab 2: Text to Music
        with gr.Tab("🎼 Text to Music"):
            with gr.Row():
                # Left column: prompt, model choice and sampling controls.
                with gr.Column():
                    t2m_caption = gr.Textbox(
                        lines=3,
                        label="Music Caption",
                        placeholder="e.g., pop music with a male singing rap",
                    )
                    t2m_model = gr.Dropdown(
                        choices=MODEL_CHOICES,
                        value=DEFAULT_MODEL,
                        label="Model Name",
                    )
                    with gr.Row():
                        t2m_guidance = gr.Slider(
                            minimum=1.0, maximum=10.0, step=0.5, value=5.0,
                            label="Guidance Scale",
                        )
                        t2m_steps = gr.Slider(
                            minimum=1, maximum=100, step=1, value=25,
                            label="Sampling Steps",
                        )
                    t2m_button = gr.Button("Generate Music", variant="primary")

                # Right column: generated music and status message.
                with gr.Column():
                    t2m_output = gr.Audio(
                        type="filepath", label="Generated Music"
                    )
                    t2m_status = gr.Textbox(label="Status")

            # Inputs are listed in the order of text_to_music's leading
            # parameters; outputs match its (path, status) return value.
            t2m_button.click(
                fn=text_to_music,
                inputs=[t2m_caption, t2m_model, t2m_guidance, t2m_steps],
                outputs=[t2m_output, t2m_status],
            )

            # Preset rows: caption, guidance scale, sampling steps
            # (the model dropdown is not part of the examples).
            gr.Examples(
                inputs=[t2m_caption, t2m_guidance, t2m_steps],
                examples=[
                    ["pop music with a male singing rap", 5.0, 25],
                    ["classical piano solo", 5.0, 25],
                ],
            )

        # Tab 3: Text to Speech
        with gr.Tab("🗣️ Text to Speech"):
            with gr.Row():
                # Left column: text, reference speaker, model choice and
                # sampling controls.
                with gr.Column():
                    tts_transcript = gr.Textbox(
                        lines=3,
                        label="Text to Synthesize",
                        placeholder="e.g., Hello this is a special sentence",
                    )
                    tts_ref_audio = gr.Audio(
                        type="filepath", label="Reference Speaker Audio"
                    )
                    tts_model = gr.Dropdown(
                        choices=MODEL_CHOICES,
                        value=DEFAULT_MODEL,
                        label="Model Name",
                    )
                    with gr.Row():
                        tts_guidance = gr.Slider(
                            minimum=1.0, maximum=10.0, step=0.5, value=5.0,
                            label="Guidance Scale",
                        )
                        tts_steps = gr.Slider(
                            minimum=1, maximum=100, step=1, value=25,
                            label="Sampling Steps",
                        )
                    tts_button = gr.Button(
                        "Synthesize Speech", variant="primary"
                    )

                # Right column: synthesized speech and status message.
                with gr.Column():
                    tts_output = gr.Audio(
                        type="filepath", label="Synthesized Speech"
                    )
                    tts_status = gr.Textbox(label="Status")

            # Inputs are listed in the order of text_to_speech's leading
            # parameters; outputs match its (path, status) return value.
            tts_button.click(
                fn=text_to_speech,
                inputs=[
                    tts_transcript,
                    tts_ref_audio,
                    tts_model,
                    tts_guidance,
                    tts_steps,
                ],
                outputs=[tts_output, tts_status],
            )

            # Preset row: transcript, reference audio, guidance scale,
            # sampling steps (the model dropdown is not part of the examples).
            gr.Examples(
                inputs=[
                    tts_transcript, tts_ref_audio, tts_guidance, tts_steps
                ],
                examples=[
                    [
                        "Hello this is a special sentence with zyloph",
                        "./data/egs/tts_speaker_ref.wav",
                        5.0,
                        25,
                    ],
                ],
            )

        # Tab 4: Singing Voice Synthesis
        with gr.Tab("🎤 Singing Voice Synthesis"):
            with gr.Row():
                # Left column: singer, score fields, model choice and
                # sampling controls.
                with gr.Column():
                    svs_singer = gr.Dropdown(
                        value="Alto-2",
                        label="Singer",
                        choices=[
                            "Alto-1", "Alto-2", "Alto-3", "Alto-4",
                            "Alto-5", "Alto-6", "Alto-7",
                            "Bass-1", "Bass-2", "Bass-3",
                            "Soprano-1", "Soprano-2", "Soprano-3",
                            "Tenor-1", "Tenor-2", "Tenor-3", "Tenor-4",
                            "Tenor-5", "Tenor-6", "Tenor-7",
                        ],
                    )
                    svs_lyric = gr.Textbox(
                        lines=2,
                        label="Lyrics",
                        placeholder="e.g., AP你要相信AP相信我们会像童话故事里AP",
                    )
                    svs_notes = gr.Textbox(
                        lines=2,
                        label="Note Sequence",
                        placeholder="e.g., rest | G#3 | A#3 C4 | D#4 | ...",
                    )
                    svs_durations = gr.Textbox(
                        lines=2,
                        label="Note Durations",
                        placeholder="e.g., 0.14 | 0.47 | 0.1905 0.1895 | 0.41 | ...",
                    )
                    svs_model = gr.Dropdown(
                        choices=MODEL_CHOICES,
                        value=DEFAULT_MODEL,
                        label="Model Name",
                    )
                    with gr.Row():
                        svs_guidance = gr.Slider(
                            minimum=1.0, maximum=10.0, step=0.5, value=5.0,
                            label="Guidance Scale",
                        )
                        svs_steps = gr.Slider(
                            minimum=1, maximum=100, step=1, value=25,
                            label="Sampling Steps",
                        )
                    svs_button = gr.Button(
                        "Synthesize Singing", variant="primary"
                    )

                # Right column: synthesized singing and status message.
                with gr.Column():
                    svs_output = gr.Audio(
                        type="filepath", label="Synthesized Singing"
                    )
                    svs_status = gr.Textbox(label="Status")

            # Inputs are listed in the order of singing_voice_synthesis's
            # leading parameters; outputs match its (path, status) return.
            svs_button.click(
                fn=singing_voice_synthesis,
                inputs=[
                    svs_singer,
                    svs_lyric,
                    svs_notes,
                    svs_durations,
                    svs_model,
                    svs_guidance,
                    svs_steps,
                ],
                outputs=[svs_output, svs_status],
            )

            # Preset row: singer, lyrics, notes, durations, guidance scale,
            # sampling steps (the model dropdown is not part of the examples).
            gr.Examples(
                inputs=[
                    svs_singer, svs_lyric, svs_notes, svs_durations,
                    svs_guidance, svs_steps
                ],
                examples=[
                    [
                        "Alto-2",
                        "AP你要相信AP相信我们会像童话故事里AP",
                        "rest | G#3 | A#3 C4 | D#4 | D#4 F4 | rest | E4 F4 | F4 | D#4 A#3 | A#3 | A#3 | C#4 | B3 C4 | C#4 | B3 C4 | A#3 | G#3 | rest",
                        "0.14 | 0.47 | 0.1905 0.1895 | 0.41 | 0.3005 0.3895 | 0.21 | 0.2391 0.1809 | 0.32 | 0.4105 0.2095 | 0.35 | 0.43 | 0.45 | 0.2309 0.2291 | 0.48 | 0.225 0.195 | 0.29 | 0.71 | 0.14",
                        5.0,
                        25,
                    ],
                ],
            )

            # Input format help shown under the examples.
            gr.Markdown(
                """
            ### Usage Instructions
            - **Lyrics Format**: Use AP for pauses, e.g., `AP你要相信AP相信我们会像童话故事里AP`
            - **Note Format**: Separate with `|`, use spaces for simultaneous notes, use `rest` for rests
            - **Duration Format**: Note durations in seconds, separated by `|`
            """
            )

        # Tab 5: Speech Enhancement
        with gr.Tab("🔊 Speech Enhancement"):
            with gr.Row():
                # Left column: noisy input, model choice and sampling
                # controls (guidance defaults to 1.0 in this tab).
                with gr.Column():
                    se_input = gr.Audio(type="filepath", label="Noisy Speech")
                    se_model = gr.Dropdown(
                        choices=MODEL_CHOICES,
                        value=DEFAULT_MODEL,
                        label="Model Name",
                    )
                    with gr.Row():
                        se_guidance = gr.Slider(
                            minimum=1.0, maximum=10.0, step=0.5, value=1.0,
                            label="Guidance Scale",
                        )
                        se_steps = gr.Slider(
                            minimum=1, maximum=100, step=1, value=25,
                            label="Sampling Steps",
                        )
                    se_button = gr.Button("Enhance Speech", variant="primary")

                # Right column: enhanced speech and status message.
                with gr.Column():
                    se_output = gr.Audio(
                        type="filepath", label="Enhanced Speech"
                    )
                    se_status = gr.Textbox(label="Status")

            # Inputs are listed in the order of speech_enhancement's leading
            # parameters; outputs match its (path, status) return value.
            se_button.click(
                fn=speech_enhancement,
                inputs=[se_input, se_model, se_guidance, se_steps],
                outputs=[se_output, se_status],
            )

            # Preset row: noisy audio, guidance scale, sampling steps
            # (the model dropdown is not part of the examples).
            gr.Examples(
                inputs=[se_input, se_guidance, se_steps],
                examples=[
                    ["./data/egs/se_noisy_sample.wav", 1.0, 25],
                ],
            )

        # Tab 6: Audio Super Resolution
        with gr.Tab("⬆️ Audio SR"):
            with gr.Row():
                # Left column: low-rate input, model choice and sampling
                # controls (guidance defaults to 1.0 in this tab).
                with gr.Column():
                    sr_input = gr.Audio(
                        type="filepath", label="Low Sample Rate Audio"
                    )
                    sr_model = gr.Dropdown(
                        choices=MODEL_CHOICES,
                        value=DEFAULT_MODEL,
                        label="Model Name",
                    )
                    with gr.Row():
                        sr_guidance = gr.Slider(
                            minimum=1.0, maximum=10.0, step=0.5, value=1.0,
                            label="Guidance Scale",
                        )
                        sr_steps = gr.Slider(
                            minimum=1, maximum=100, step=1, value=25,
                            label="Sampling Steps",
                        )
                    sr_button = gr.Button(
                        "Super-Resolve Audio", variant="primary"
                    )

                # Right column: super-resolved audio and status message.
                with gr.Column():
                    sr_output = gr.Audio(
                        type="filepath", label="High Sample Rate Audio"
                    )
                    sr_status = gr.Textbox(label="Status")

            # Second row: input and output spectrograms side by side.
            with gr.Row():
                with gr.Column():
                    sr_input_spec = gr.Image(
                        type="filepath", label="Input Spectrogram"
                    )
                with gr.Column():
                    sr_output_spec = gr.Image(
                        type="filepath", label="Output Spectrogram"
                    )

            # Outputs follow audio_super_resolution's 4-tuple:
            # (audio path, status, input spectrogram, output spectrogram).
            sr_button.click(
                fn=audio_super_resolution,
                inputs=[sr_input, sr_model, sr_guidance, sr_steps],
                outputs=[sr_output, sr_status, sr_input_spec, sr_output_spec],
            )

            # Preset row: low-rate audio, guidance scale, sampling steps
            # (the model dropdown is not part of the examples).
            gr.Examples(
                inputs=[sr_input, sr_guidance, sr_steps],
                examples=[
                    ["./data/egs/sr_low_sr_sample.wav", 1.0, 25],
                ],
            )

        # Tab 7: Video to Audio
        with gr.Tab("🎬 Video to Audio"):
            with gr.Row():
                # Left column: input video, model choice and sampling controls.
                with gr.Column():
                    v2a_input = gr.Video(label="Input Video")
                    v2a_model = gr.Dropdown(
                        choices=MODEL_CHOICES,
                        value=DEFAULT_MODEL,
                        label="Model Name",
                    )
                    with gr.Row():
                        v2a_guidance = gr.Slider(
                            minimum=1.0, maximum=10.0, step=0.5, value=5.0,
                            label="Guidance Scale",
                        )
                        v2a_steps = gr.Slider(
                            minimum=1, maximum=100, step=1, value=25,
                            label="Sampling Steps",
                        )
                    v2a_button = gr.Button("Generate Audio", variant="primary")

                # Right column: resulting video and status message.
                with gr.Column():
                    v2a_output = gr.Video(label="Video with Audio")
                    v2a_status = gr.Textbox(label="Status")

            # Inputs are listed in the order of video_to_audio's leading
            # parameters; outputs match its (path, status) return value.
            v2a_button.click(
                fn=video_to_audio,
                inputs=[v2a_input, v2a_model, v2a_guidance, v2a_steps],
                outputs=[v2a_output, v2a_status],
            )

            # Preset row: video, guidance scale, sampling steps
            # (the model dropdown is not part of the examples).
            gr.Examples(
                inputs=[v2a_input, v2a_guidance, v2a_steps],
                examples=[
                    ["./data/egs/v2a_video_sample.mp4", 5.0, 25],
                ],
            )

    # Footer created after the tabs: a short glossary of the three controls
    # that appear in every tab (model name, guidance scale, sampling steps)
    # plus a tip about model downloads.
    gr.Markdown(
        """
    ---
    ### 📝 Notes
    - **Model Name**: Choose from `UniFlow-Audio-large`, `UniFlow-Audio-medium`, or `UniFlow-Audio-small`
    - **Guidance Scale**: Controls the guidance strength of the input condition on the output
    - **Sampling Steps**: Number of flow matching sampling steps
    
    💡 Tip: Models will be automatically downloaded on first run, please be patient
    """
    )

if __name__ == "__main__":
    # Only start the server when run as a script: listen on every network
    # interface at port 7860, without a public share link.
    demo.launch(
        share=False,
        server_port=7860,
        server_name="0.0.0.0",
    )