import gradio as gr
import torch
import llava
import os
import spaces
from huggingface_hub import snapshot_download
import copy

# ---------------------------------
# SINGLE-TURN MODEL SETUP
# ---------------------------------
# Download the Audio Flamingo 3 checkpoint from the Hugging Face Hub and load it on GPU.
MODEL_BASE_SINGLE = snapshot_download(repo_id="nvidia/audio-flamingo-3")
MODEL_BASE_THINK = os.path.join(MODEL_BASE_SINGLE, 'stage35')

model_single = llava.load(MODEL_BASE_SINGLE, model_base=None)
model_single = model_single.to("cuda")
generation_config_single = model_single.default_generation_config

# ---------------------------------
# SINGLE-TURN INFERENCE FUNCTION
# ---------------------------------
@spaces.GPU
def single_turn_infer(audio_file, prompt_text):
    try:
        sound = llava.Sound(audio_file)
        # Prepend the <sound> placeholder so the audio clip is injected into the prompt.
        full_prompt = f"<sound>\n{prompt_text}"
        response = model_single.generate_content(
            [sound, full_prompt],
            generation_config=generation_config_single,
        )
        return response
    except Exception as e:
        return f"❌ Error: {str(e)}"
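# ---------------------------------
# OPTIONAL: PROGRAMMATIC USE (sketch)
# ---------------------------------
# A minimal sketch, assuming a local clip at the hypothetical path "sample.wav",
# of how the helper above can be called outside the Gradio UI:
#
#   response = single_turn_infer("sample.wav", "Describe the sounds you hear.")
#   print(response)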
# ---------------------------------
# INTERFACE
# ---------------------------------
with gr.Blocks(css="""
.gradio-container { max-width: 100% !important; width: 100% !important; margin: 0 !important; padding: 0 !important; }
#component-0, .gr-block.gr-box { width: 100% !important; }
.gr-block.gr-box, .gr-column, .gr-row { padding: 0 !important; margin: 0 !important; }
""") as demo:
    with gr.Column():
        gr.HTML("""
        <div align="center">
            <!-- Audio Flamingo 3 Logo -->
            <h2>Audio Understanding Demo (WeaveMuse Edition)</h2>
            <p>Powered by NVIDIA’s open Audio Flamingo 3 model</p>
            <p>
                This simple interface demonstrates audio–language reasoning and transcription
                using open models from NVIDIA. Integrated for use in the WeaveMuse framework.
            </p>
        </div>
        """)

    with gr.Tabs():
        # ---------------- SINGLE-TURN ----------------
        with gr.Tab("🎧 Audio Inference"):
            with gr.Row():
                with gr.Column():
                    audio_input_single = gr.Audio(type="filepath", label="Upload Audio Clip")
                    prompt_input_single = gr.Textbox(
                        label="Prompt",
                        placeholder="Ask about the audio (e.g. 'Describe the sound' or 'Transcribe any speech')",
                        lines=6,
                    )
                    btn_single = gr.Button("Generate Response")
                    gr.Examples(
                        examples=[
                            ["static/emergent/audio1.wav", "What is happening in this audio?"],
                            ["static/audio/audio2.wav", "Describe the sounds you hear."],
                            ["static/speech/audio3.wav", "Transcribe the spoken words."],
                        ],
                        inputs=[audio_input_single, prompt_input_single],
                        label="🧪 Example Prompts",
                    )
                with gr.Column():
                    output_single = gr.Textbox(label="Model Response", lines=15)

            btn_single.click(
                fn=single_turn_infer,
                inputs=[audio_input_single, prompt_input_single],
                outputs=output_single,
            )

        with gr.Tab("ℹ️ About"):
            gr.Markdown("""
### 🎶 Overview

This demo showcases a **simple single-turn audio–language interface** built for integration with the **WeaveMuse** framework.
It uses the open-source **NVIDIA Audio Flamingo 3** model for audio understanding, transcription, and sound reasoning.

You can upload an audio file and ask natural-language questions such as:
- “What kind of sound is this?”
- “Describe the scene.”
- “Transcribe any speech.”

**Acknowledgment:**
Model and research credit to **NVIDIA** for developing the open **Audio Flamingo 3** model and the datasets used to train it.
This interface is a simplified demonstration of how such models can be integrated into broader creative AI systems like WeaveMuse.

**Tech stack:** Gradio + PyTorch + llava + WeaveMuse Integration
""")

    gr.Markdown("© 2025 WeaveMuse | Model © NVIDIA — Audio Flamingo 3 | Built with ❤️ using Gradio & PyTorch")

# -----------------------
# Launch App
# -----------------------
if __name__ == "__main__":
    demo.launch(share=True)