#!/usr/bin/env python3
import os
import gradio as gr
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
import spaces
from inference_cli import InferenceCLI
# Initialize inference CLI
cli = InferenceCLI()
# Available model choices
MODEL_CHOICES = [
"UniFlow-Audio-large", "UniFlow-Audio-medium", "UniFlow-Audio-small"
]
# Default model name
DEFAULT_MODEL = "UniFlow-Audio-large"
# Pre-initialize models
print("Initializing models, please wait...")
print(f"Loading main model: {DEFAULT_MODEL}")
cli.init_model(DEFAULT_MODEL)
print("Loading speaker model for TTS...")
cli.init_speaker_model()
print("Loading SVS processor for singing voice synthesis...")
cli.init_svs_processor()
print("Loading video preprocessor for V2A...")
cli.init_video_preprocessor()
print("All models loaded successfully!")
@spaces.GPU(duration=60)
def text_to_audio(
caption,
model_name,
guidance_scale,
num_steps,
progress=gr.Progress(track_tqdm=True)
):
"""Text to Audio generation"""
output_path = "./outputs/t2a_output.wav"
os.makedirs("./outputs", exist_ok=True)
try:
cli.t2a(
caption=caption,
model_name=model_name,
guidance_scale=guidance_scale,
num_steps=num_steps,
output_path=output_path
)
return output_path, "Generation successful!"
except Exception as e:
return None, f"Error: {str(e)}"
@spaces.GPU(duration=60)
def text_to_music(
caption,
model_name,
guidance_scale,
num_steps,
progress=gr.Progress(track_tqdm=True)
):
"""Text to Music generation"""
output_path = "./outputs/t2m_output.wav"
os.makedirs("./outputs", exist_ok=True)
try:
cli.t2m(
caption=caption,
model_name=model_name,
guidance_scale=guidance_scale,
num_steps=num_steps,
output_path=output_path
)
return output_path, "Generation successful!"
except Exception as e:
return None, f"Error: {str(e)}"
@spaces.GPU(duration=60)
def text_to_speech(
transcript,
ref_speaker_audio,
model_name,
guidance_scale,
num_steps,
progress=gr.Progress(track_tqdm=True)
):
"""Text to Speech synthesis"""
output_path = "./outputs/tts_output.wav"
os.makedirs("./outputs", exist_ok=True)
try:
cli.tts(
transcript=transcript,
ref_speaker_speech=ref_speaker_audio,
model_name=model_name,
guidance_scale=guidance_scale,
num_steps=num_steps,
output_path=output_path
)
return output_path, "Generation successful!"
except Exception as e:
return None, f"Error: {str(e)}"
@spaces.GPU(duration=60)
def singing_voice_synthesis(
singer,
lyric,
notes,
note_durations,
model_name,
guidance_scale,
num_steps,
progress=gr.Progress(track_tqdm=True)
):
"""Singing Voice Synthesis"""
output_path = "./outputs/svs_output.wav"
os.makedirs("./outputs", exist_ok=True)
try:
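        # Pack lyrics, notes and note durations into one score string joined by "<sep>"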
music_score = f"{lyric}<sep>{notes}<sep>{note_durations}"
cli.svs(
singer=singer,
music_score=music_score,
model_name=model_name,
guidance_scale=guidance_scale,
num_steps=num_steps,
output_path=output_path
)
return output_path, "Generation successful!"
except Exception as e:
return None, f"Error: {str(e)}"
@spaces.GPU(duration=60)
def speech_enhancement(
noisy_audio,
model_name,
guidance_scale,
num_steps,
progress=gr.Progress(track_tqdm=True)
):
"""Speech Enhancement"""
output_path = "./outputs/se_output.wav"
os.makedirs("./outputs", exist_ok=True)
try:
cli.se(
noisy_speech=noisy_audio,
model_name=model_name,
guidance_scale=guidance_scale,
num_steps=num_steps,
output_path=output_path
)
return output_path, "Enhancement successful!"
except Exception as e:
return None, f"Error: {str(e)}"
def generate_spectrogram(audio_path, title="Spectrogram"):
"""Generate spectrogram from audio file"""
try:
# Load audio file
y, sr = librosa.load(audio_path, sr=None)
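        # sr=None preserves the file's native sample rate, so the frequency axis shows the true bandwidth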
# Create figure
fig, ax = plt.subplots(figsize=(10, 4))
        # Compute the dB-scaled magnitude spectrogram (linear-frequency STFT)
D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)
# Display spectrogram
img = librosa.display.specshow(
D, y_axis='hz', x_axis='time', sr=sr, ax=ax
)
ax.set_title(f'{title} (Sample Rate: {sr} Hz)')
fig.colorbar(img, ax=ax, format='%+2.0f dB')
# Save to file
        # Derive the image path from the audio path (also handles non-.wav inputs)
        spec_path = os.path.splitext(audio_path)[0] + '_spec.png'
plt.tight_layout()
fig.savefig(spec_path, dpi=100, bbox_inches='tight')
plt.close(fig)
return spec_path
except Exception as e:
print(f"Error generating spectrogram: {str(e)}")
return None
@spaces.GPU(duration=60)
def audio_super_resolution(
low_sr_audio,
model_name,
guidance_scale,
num_steps,
progress=gr.Progress(track_tqdm=True)
):
"""Audio Super Resolution"""
output_path = "./outputs/sr_output.wav"
os.makedirs("./outputs", exist_ok=True)
try:
cli.sr(
low_sr_audio=low_sr_audio,
model_name=model_name,
guidance_scale=guidance_scale,
num_steps=num_steps,
output_path=output_path
)
# Generate spectrograms for input and output
input_spec = generate_spectrogram(
low_sr_audio, "Input Audio Spectrogram"
)
output_spec = generate_spectrogram(
output_path, "Output Audio Spectrogram"
)
return output_path, "Super-resolution successful!", input_spec, output_spec
except Exception as e:
return None, f"Error: {str(e)}", None, None
@spaces.GPU(duration=60)
def video_to_audio(
video,
model_name,
guidance_scale,
num_steps,
progress=gr.Progress(track_tqdm=True)
):
"""Video to Audio generation"""
output_path = "./outputs/v2a_output.mp4"
os.makedirs("./outputs", exist_ok=True)
try:
cli.v2a(
video=video,
model_name=model_name,
guidance_scale=guidance_scale,
num_steps=num_steps,
output_path=output_path
)
return output_path, "Generation successful!"
except Exception as e:
return None, f"Error: {str(e)}"
# Custom CSS for better tab display
custom_css = """
.tab-nav button {
font-size: 14px !important;
padding: 8px 12px !important;
min-width: fit-content !important;
}
.tab-nav {
overflow-x: auto !important;
flex-wrap: nowrap !important;
}
"""
# Create Gradio Interface
with gr.Blocks(
title="UniFlow-Audio Inference Demo",
theme=gr.themes.Soft(),
css=custom_css
) as demo:
gr.Markdown("# πŸ”Š UniFlow-Audio Inference Demo")
gr.Markdown(
"Multi-task Audio Generation System based on [UniFlow-Audio](https://arxiv.org/abs/2509.24391)"
)
gr.HTML("""
<div style="padding: 10px; background-color: #fffbcc; border: 1px solid #ffe564; border-radius:4px;">
<strong>Note: </strong>For TTS, due to the restriction of HuggingFace Space, the g2p phonemizer used here is inconsistant with the one used during training, so there may be problems. Please refer to <a href="https://github.com/wsntxxn/UniFlow-Audio/blob/master/docs/INFERENCE_CLI.md">INFERENCE_CLI.md</a> for CLI calling guidance.
</div>
""")
with gr.Tabs():
# Tab 1: Text to Audio
with gr.Tab("πŸ“’ Text to Audio"):
with gr.Row():
with gr.Column():
t2a_caption = gr.Textbox(
label="Audio Caption",
placeholder="e.g., a man is speaking while a dog barks",
lines=3
)
t2a_model = gr.Dropdown(
label="Model Name",
choices=MODEL_CHOICES,
value=DEFAULT_MODEL
)
with gr.Row():
t2a_guidance = gr.Slider(
label="Guidance Scale",
minimum=1.0,
maximum=10.0,
value=5.0,
step=0.5
)
t2a_steps = gr.Slider(
label="Sampling Steps",
minimum=1,
maximum=100,
value=25,
step=1
)
t2a_button = gr.Button("Generate Audio", variant="primary")
with gr.Column():
t2a_output = gr.Audio(
label="Generated Audio", type="filepath"
)
t2a_status = gr.Textbox(label="Status")
t2a_button.click(
fn=text_to_audio,
inputs=[t2a_caption, t2a_model, t2a_guidance, t2a_steps],
outputs=[t2a_output, t2a_status]
)
gr.Examples(
examples=[
["a man is speaking while a dog barks", 5.0, 25],
["footsteps on wooden floor", 5.0, 25],
],
inputs=[t2a_caption, t2a_guidance, t2a_steps]
)
# Tab 2: Text to Music
with gr.Tab("🎼 Text to Music"):
with gr.Row():
with gr.Column():
t2m_caption = gr.Textbox(
label="Music Caption",
placeholder="e.g., pop music with a male singing rap",
lines=3
)
t2m_model = gr.Dropdown(
label="Model Name",
choices=MODEL_CHOICES,
value=DEFAULT_MODEL
)
with gr.Row():
t2m_guidance = gr.Slider(
label="Guidance Scale",
minimum=1.0,
maximum=10.0,
value=5.0,
step=0.5
)
t2m_steps = gr.Slider(
label="Sampling Steps",
minimum=1,
maximum=100,
value=25,
step=1
)
t2m_button = gr.Button("Generate Music", variant="primary")
with gr.Column():
t2m_output = gr.Audio(
label="Generated Music", type="filepath"
)
t2m_status = gr.Textbox(label="Status")
t2m_button.click(
fn=text_to_music,
inputs=[t2m_caption, t2m_model, t2m_guidance, t2m_steps],
outputs=[t2m_output, t2m_status]
)
gr.Examples(
examples=[
["pop music with a male singing rap", 5.0, 25],
["classical piano solo", 5.0, 25],
],
inputs=[t2m_caption, t2m_guidance, t2m_steps]
)
# Tab 3: Text to Speech
with gr.Tab("πŸ—£οΈ Text to Speech"):
with gr.Row():
with gr.Column():
tts_transcript = gr.Textbox(
label="Text to Synthesize",
placeholder="e.g., Hello this is a special sentence",
lines=3
)
tts_ref_audio = gr.Audio(
label="Reference Speaker Audio", type="filepath"
)
tts_model = gr.Dropdown(
label="Model Name",
choices=MODEL_CHOICES,
value=DEFAULT_MODEL
)
with gr.Row():
tts_guidance = gr.Slider(
label="Guidance Scale",
minimum=1.0,
maximum=10.0,
value=5.0,
step=0.5
)
tts_steps = gr.Slider(
label="Sampling Steps",
minimum=1,
maximum=100,
value=25,
step=1
)
tts_button = gr.Button(
"Synthesize Speech", variant="primary"
)
with gr.Column():
tts_output = gr.Audio(
label="Synthesized Speech", type="filepath"
)
tts_status = gr.Textbox(label="Status")
tts_button.click(
fn=text_to_speech,
inputs=[
tts_transcript, tts_ref_audio, tts_model, tts_guidance,
tts_steps
],
outputs=[tts_output, tts_status]
)
gr.Examples(
examples=[
[
"Hello this is a special sentence with zyloph",
"./data/egs/tts_speaker_ref.wav", 5.0, 25
],
],
inputs=[
tts_transcript, tts_ref_audio, tts_guidance, tts_steps
]
)
# Tab 4: Singing Voice Synthesis
with gr.Tab("🎀 Singing Voice Synthesis"):
with gr.Row():
with gr.Column():
svs_singer = gr.Dropdown(
label="Singer",
choices=[
"Alto-1", "Alto-2", "Alto-3", "Alto-4", "Alto-5",
"Alto-6", "Alto-7", "Bass-1", "Bass-2", "Bass-3",
"Soprano-1", "Soprano-2", "Soprano-3", "Tenor-1",
"Tenor-2", "Tenor-3", "Tenor-4", "Tenor-5",
"Tenor-6", "Tenor-7"
],
value="Alto-2"
)
svs_lyric = gr.Textbox(
label="Lyrics",
placeholder="e.g., AP你要相俑APη›ΈδΏ‘ζˆ‘δ»¬δΌšεƒη«₯θ―ζ•…δΊ‹ι‡ŒAP",
lines=2
)
svs_notes = gr.Textbox(
label="Note Sequence",
placeholder="e.g., rest | G#3 | A#3 C4 | D#4 | ...",
lines=2
)
svs_durations = gr.Textbox(
label="Note Durations",
placeholder=
"e.g., 0.14 | 0.47 | 0.1905 0.1895 | 0.41 | ...",
lines=2
)
svs_model = gr.Dropdown(
label="Model Name",
choices=MODEL_CHOICES,
value=DEFAULT_MODEL
)
with gr.Row():
svs_guidance = gr.Slider(
label="Guidance Scale",
minimum=1.0,
maximum=10.0,
value=5.0,
step=0.5
)
svs_steps = gr.Slider(
label="Sampling Steps",
minimum=1,
maximum=100,
value=25,
step=1
)
svs_button = gr.Button(
"Synthesize Singing", variant="primary"
)
with gr.Column():
svs_output = gr.Audio(
label="Synthesized Singing", type="filepath"
)
svs_status = gr.Textbox(label="Status")
svs_button.click(
fn=singing_voice_synthesis,
inputs=[
svs_singer, svs_lyric, svs_notes, svs_durations, svs_model,
svs_guidance, svs_steps
],
outputs=[svs_output, svs_status]
)
gr.Examples(
examples=[
[
"Alto-2", "AP你要相俑APη›ΈδΏ‘ζˆ‘δ»¬δΌšεƒη«₯θ―ζ•…δΊ‹ι‡ŒAP",
"rest | G#3 | A#3 C4 | D#4 | D#4 F4 | rest | E4 F4 | F4 | D#4 A#3 | A#3 | A#3 | C#4 | B3 C4 | C#4 | B3 C4 | A#3 | G#3 | rest",
"0.14 | 0.47 | 0.1905 0.1895 | 0.41 | 0.3005 0.3895 | 0.21 | 0.2391 0.1809 | 0.32 | 0.4105 0.2095 | 0.35 | 0.43 | 0.45 | 0.2309 0.2291 | 0.48 | 0.225 0.195 | 0.29 | 0.71 | 0.14",
5.0, 25
],
],
inputs=[
svs_singer, svs_lyric, svs_notes, svs_durations,
svs_guidance, svs_steps
]
)
gr.Markdown(
"""
### Usage Instructions
- **Lyrics Format**: Use AP for pauses, e.g., `AP你要相俑APη›ΈδΏ‘ζˆ‘δ»¬δΌšεƒη«₯θ―ζ•…δΊ‹ι‡ŒAP`
            - **Note Format**: Separate syllables with `|`; when one syllable spans several notes, separate those notes with spaces; use `rest` for rests
            - **Duration Format**: Note durations in seconds, separated by `|`, matching the note sequence one-to-one
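            - **Music Score**: internally, the three fields are joined as `lyric<sep>notes<sep>note_durations` before being passed to `cli.svs`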
"""
)
# Tab 5: Speech Enhancement
with gr.Tab("πŸ”Š Speech Enhancement"):
with gr.Row():
with gr.Column():
se_input = gr.Audio(label="Noisy Speech", type="filepath")
se_model = gr.Dropdown(
label="Model Name",
choices=MODEL_CHOICES,
value=DEFAULT_MODEL
)
with gr.Row():
se_guidance = gr.Slider(
label="Guidance Scale",
minimum=1.0,
maximum=10.0,
value=1.0,
step=0.5
)
se_steps = gr.Slider(
label="Sampling Steps",
minimum=1,
maximum=100,
value=25,
step=1
)
se_button = gr.Button("Enhance Speech", variant="primary")
with gr.Column():
se_output = gr.Audio(
label="Enhanced Speech", type="filepath"
)
se_status = gr.Textbox(label="Status")
se_button.click(
fn=speech_enhancement,
inputs=[se_input, se_model, se_guidance, se_steps],
outputs=[se_output, se_status]
)
gr.Examples(
examples=[
["./data/egs/se_noisy_sample.wav", 1.0, 25],
],
inputs=[se_input, se_guidance, se_steps]
)
# Tab 6: Audio Super Resolution
with gr.Tab("⬆️ Audio SR"):
with gr.Row():
with gr.Column():
sr_input = gr.Audio(
label="Low Sample Rate Audio", type="filepath"
)
sr_model = gr.Dropdown(
label="Model Name",
choices=MODEL_CHOICES,
value=DEFAULT_MODEL
)
with gr.Row():
sr_guidance = gr.Slider(
label="Guidance Scale",
minimum=1.0,
maximum=10.0,
value=1.0,
step=0.5
)
sr_steps = gr.Slider(
label="Sampling Steps",
minimum=1,
maximum=100,
value=25,
step=1
)
sr_button = gr.Button(
"Super-Resolve Audio", variant="primary"
)
with gr.Column():
sr_output = gr.Audio(
label="High Sample Rate Audio", type="filepath"
)
sr_status = gr.Textbox(label="Status")
# Spectrograms display
with gr.Row():
with gr.Column():
sr_input_spec = gr.Image(
label="Input Spectrogram", type="filepath"
)
with gr.Column():
sr_output_spec = gr.Image(
label="Output Spectrogram", type="filepath"
)
sr_button.click(
fn=audio_super_resolution,
inputs=[sr_input, sr_model, sr_guidance, sr_steps],
outputs=[sr_output, sr_status, sr_input_spec, sr_output_spec]
)
gr.Examples(
examples=[
["./data/egs/sr_low_sr_sample.wav", 1.0, 25],
],
inputs=[sr_input, sr_guidance, sr_steps]
)
# Tab 7: Video to Audio
with gr.Tab("🎬 Video to Audio"):
with gr.Row():
with gr.Column():
v2a_input = gr.Video(label="Input Video")
v2a_model = gr.Dropdown(
label="Model Name",
choices=MODEL_CHOICES,
value=DEFAULT_MODEL
)
with gr.Row():
v2a_guidance = gr.Slider(
label="Guidance Scale",
minimum=1.0,
maximum=10.0,
value=5.0,
step=0.5
)
v2a_steps = gr.Slider(
label="Sampling Steps",
minimum=1,
maximum=100,
value=25,
step=1
)
v2a_button = gr.Button("Generate Audio", variant="primary")
with gr.Column():
v2a_output = gr.Video(label="Video with Audio")
v2a_status = gr.Textbox(label="Status")
v2a_button.click(
fn=video_to_audio,
inputs=[v2a_input, v2a_model, v2a_guidance, v2a_steps],
outputs=[v2a_output, v2a_status]
)
gr.Examples(
examples=[
["./data/egs/v2a_video_sample.mp4", 5.0, 25],
],
inputs=[v2a_input, v2a_guidance, v2a_steps]
)
gr.Markdown(
"""
---
### πŸ“ Notes
- **Model Name**: Choose from `UniFlow-Audio-large`, `UniFlow-Audio-medium`, or `UniFlow-Audio-small`
    - **Guidance Scale**: Controls how strongly the output follows the input condition; higher values adhere to the condition more closely
- **Sampling Steps**: Number of flow matching sampling steps
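    - **Defaults**: Speech enhancement and audio super-resolution default to a guidance scale of 1.0; the other tasks default to 5.0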
    πŸ’‘ Tip: Models are downloaded automatically on the first run, so the initial startup may take a while.
"""
)
if __name__ == "__main__":
demo.launch(server_name="0.0.0.0", server_port=7860, share=False)