	Update app.py
app.py CHANGED
@@ -5,6 +5,7 @@ import os
 import spaces
 from huggingface_hub import snapshot_download
 import copy
+
 # ---------------------------------
 # SINGLE-TURN MODEL SETUP
 # ---------------------------------
@@ -12,7 +13,6 @@ import copy
 MODEL_BASE_SINGLE = snapshot_download(repo_id="nvidia/audio-flamingo-3")
 MODEL_BASE_THINK = os.path.join(MODEL_BASE_SINGLE, 'stage35')
 
-# model_single = llava.load(MODEL_BASE_SINGLE, model_base=None, devices=[0])
 model_single = llava.load(MODEL_BASE_SINGLE, model_base=None)
 model_single = model_single.to("cuda")
 
@@ -53,67 +53,41 @@ with gr.Blocks(css="""
     with gr.Column():
         gr.HTML("""
 <div align="center">
-  <img src="https://raw.githubusercontent.com/NVIDIA/audio-flamingo/audio_flamingo_3/static/logo-no-bg.png" alt="Audio Flamingo 3 Logo" width="…
-  <h2><strong>Audio …
-  <p><em>…
+  <img src="https://raw.githubusercontent.com/NVIDIA/audio-flamingo/audio_flamingo_3/static/logo-no-bg.png" alt="Audio Flamingo 3 Logo" width="110" style="margin-bottom: 8px;">
+  <h2><strong>Audio Understanding Demo (WeaveMuse Edition)</strong></h2>
+  <p><em>Powered by NVIDIA’s open Audio Flamingo 3 model</em></p>
 </div>
 
 <div align="center" style="margin-top: 10px;">
-  <a href="https://arxiv.org/abs/2507.08128">
-    <img src="https://img.shields.io/badge/arXiv-2503.03983-AD1C18" alt="arXiv" style="display:inline;">
-  </a>
-  <a href="https://research.nvidia.com/labs/adlr/AF3/">
-    <img src="https://img.shields.io/badge/Demo%20page-228B22" alt="Demo Page" style="display:inline;">
-  </a>
-  <a href="https://github.com/NVIDIA/audio-flamingo">
-    <img src="https://img.shields.io/badge/Github-Audio_Flamingo_3-9C276A" alt="GitHub" style="display:inline;">
-  </a>
-  <a href="https://github.com/NVIDIA/audio-flamingo/stargazers">
-    <img src="https://img.shields.io/github/stars/NVIDIA/audio-flamingo.svg?style=social" alt="GitHub Stars" style="display:inline;">
-  </a>
-</div>
-<div align="center" style="display: flex; justify-content: center; margin-top: 10px; flex-wrap: wrap; gap: 5px;">
   <a href="https://huggingface.co/nvidia/audio-flamingo-3">
-    <img src="https://img.shields.io/badge/🤗-…
+    <img src="https://img.shields.io/badge/🤗-Model:_audio--flamingo--3-ED5A22.svg">
   </a>
-  <a href="https://…
-    <img src="https://img.shields.io/badge…
+  <a href="https://github.com/NVIDIA/audio-flamingo">
+    <img src="https://img.shields.io/badge/GitHub-NVIDIA%2Faudio--flamingo-9C276A.svg">
   </a>
 </div>
-<div align="center" style="…
-  <…
-    <img src="https://img.shields.io/badge/🤗-Dataset:_AudioSkills--XL-ED5A22.svg">
-  </a>
-  <a href="https://huggingface.co/datasets/nvidia/LongAudio">
-    <img src="https://img.shields.io/badge/🤗-Dataset:_LongAudio--XL-ED5A22.svg">
-  </a>
-  <a href="https://huggingface.co/datasets/nvidia/AF-Chat">
-    <img src="https://img.shields.io/badge/🤗-Dataset:_AF--Chat-ED5A22.svg">
-  </a>
-  <a href="https://huggingface.co/datasets/nvidia/AF-Think">
-    <img src="https://img.shields.io/badge/🤗-Dataset:_AF--Think-ED5A22.svg">
-  </a>
+<div align="center" style="margin-top: 8px;">
+  <p><small>This simple interface demonstrates audio–language reasoning and transcription using open models from NVIDIA. Integrated for use in the <strong>WeaveMuse</strong> framework.</small></p>
 </div>
 """)
-    # gr.Markdown("#### NVIDIA (2025)")
 
     with gr.Tabs():
         # ---------------- SINGLE-TURN ----------------
-        with gr.Tab("…
+        with gr.Tab("🎧 Audio Inference"):
             with gr.Row():
                 with gr.Column():
-                    audio_input_single = gr.Audio(type="filepath", label="Upload Audio")
-                    prompt_input_single = gr.Textbox(label="Prompt", placeholder="Ask …
-                    btn_single = gr.Button("Generate …
+                    audio_input_single = gr.Audio(type="filepath", label="Upload Audio Clip")
+                    prompt_input_single = gr.Textbox(label="Prompt", placeholder="Ask about the audio (e.g. 'Describe the sound' or 'Transcribe any speech')", lines=6)
+                    btn_single = gr.Button("Generate Response")
 
                     gr.Examples(
                         examples=[
-                            ["static/emergent/audio1.wav", "What is …
-                            ["static/audio/audio2.wav", "…
-                            ["static/speech/audio3.wav", "Transcribe …
+                            ["static/emergent/audio1.wav", "What is happening in this audio?"],
+                            ["static/audio/audio2.wav", "Describe the sounds you hear."],
+                            ["static/speech/audio3.wav", "Transcribe the spoken words."],
                         ],
                         inputs=[audio_input_single, prompt_input_single],
-                        label="🧪 …
+                        label="🧪 Example Prompts"
                     )
 
                 with gr.Column():
@@ -121,41 +95,26 @@ with gr.Blocks(css="""
 
             btn_single.click(fn=single_turn_infer, inputs=[audio_input_single, prompt_input_single], outputs=output_single)
 
-        with gr.Tab("…
+        with gr.Tab("ℹ️ About"):
             gr.Markdown("""
-### …
+### 🎶 Overview
 
-…
-
-(i) …
-
-(ii) …
-
-(iii) multi-turn, multi-audio chat;
-
-(iv) long audio understanding and reasoning (including speech) up to 10 minutes; and
-
-(v) voice-to-voice interaction.
-
-To enable these capabilities, we propose several large-scale training datasets curated using novel strategies, including AudioSkills-XL, LongAudio-XL, AF-Think, and AF-Chat, and train AF3 with a novel five-stage curriculum-based training strategy. Trained on only open-source audio data, AF3 achieves new SOTA results on over 20+ (long) audio understanding and reasoning benchmarks, surpassing both open-weight and closed-source models trained on much larger datasets.
-
-**Key Features:**
-
-💡 Audio Flamingo 3 has strong audio, music and speech understanding capabilities.
-
-💡 Audio Flamingo 3 supports on-demand thinking for chain-of-though reasoning.
-
-💡 Audio Flamingo 3 supports long audio and speech understanding for audios up to 10 minutes.
-
-💡 Audio Flamingo 3 can have multi-turn, multi-audio chat with users under complex context.
-
-💡 Audio Flamingo 3 has voice-to-voice conversation abilities.
+This demo showcases a **simple single-turn audio–language interface** built for integration with the **WeaveMuse** framework.
+It uses the open-source **NVIDIA Audio Flamingo 3** model for audio understanding, transcription, and sound reasoning.
+
+You can upload an audio file and ask natural-language questions such as:
+- “What kind of sound is this?”
+- “Describe the scene.”
+- “Transcribe any speech.”
+
+**Acknowledgment:**
+Model and research credit to **NVIDIA**, for the development of the open **Audio Flamingo 3** model and datasets used for training.
+This interface is a simplified demonstration of how such models can be integrated into broader creative AI systems like WeaveMuse.
+
+**Tech stack:** Gradio + PyTorch + llava + WeaveMuse Integration
 
 """)
 
-    gr.Markdown("© 2025 NVIDIA | Built with ❤️ using Gradio …
-…
+    gr.Markdown("© 2025 WeaveMuse | Model © NVIDIA — Audio Flamingo 3 | Built with ❤️ using Gradio & PyTorch")
 
 # -----------------------
 # Launch App
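Note: the handler wired to `btn_single.click`, `single_turn_infer`, is defined in a part of `app.py` this diff does not touch. For context, a minimal sketch of what such a handler typically looks like on a ZeroGPU Space, assuming the audio-flamingo `llava` package exposes a `Sound` media type and a `generate_content` method; the `full_prompt` template and the `<sound>` placeholder are illustrative assumptions, not the Space's confirmed implementation:

```python
import spaces                   # Hugging Face ZeroGPU helper (already imported in app.py)
from llava.media import Sound   # assumed import path in the audio-flamingo llava fork

@spaces.GPU                     # ZeroGPU: attach a GPU only while this function runs
def single_turn_infer(audio_path: str, prompt: str) -> str:
    """Answer one prompt about one uploaded audio file (sketch, not the actual app code)."""
    if not audio_path or not prompt:
        return "Please provide both an audio file and a prompt."
    sound = Sound(audio_path)            # wrap the uploaded file as an audio media object
    full_prompt = f"<sound>\n{prompt}"   # hypothetical audio-placeholder prompt template
    # model_single is the module-level model loaded via llava.load(...) in the diff above
    return model_single.generate_content([sound, full_prompt])
```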
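Design note: keeping `model_single.to("cuda")` at import time, as the diff does, works on ZeroGPU because the `spaces` package defers real CUDA initialization until a `@spaces.GPU`-decorated function executes, so the Space only holds a GPU during those calls. This is documented ZeroGPU behavior rather than something visible in the diff itself.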
 
			
