manoskary committed on
Commit
877491b
·
verified ·
1 Parent(s): eb7d238

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -71
app.py CHANGED
@@ -5,6 +5,7 @@ import os
5
  import spaces
6
  from huggingface_hub import snapshot_download
7
  import copy
 
8
  # ---------------------------------
9
  # SINGLE-TURN MODEL SETUP
10
  # ---------------------------------
@@ -12,7 +13,6 @@ import copy
12
  MODEL_BASE_SINGLE = snapshot_download(repo_id="nvidia/audio-flamingo-3")
13
  MODEL_BASE_THINK = os.path.join(MODEL_BASE_SINGLE, 'stage35')
14
 
15
- # model_single = llava.load(MODEL_BASE_SINGLE, model_base=None, devices=[0])
16
  model_single = llava.load(MODEL_BASE_SINGLE, model_base=None)
17
  model_single = model_single.to("cuda")
18
 
@@ -53,67 +53,41 @@ with gr.Blocks(css="""
53
  with gr.Column():
54
  gr.HTML("""
55
  <div align="center">
56
- <img src="https://raw.githubusercontent.com/NVIDIA/audio-flamingo/audio_flamingo_3/static/logo-no-bg.png" alt="Audio Flamingo 3 Logo" width="120" style="margin-bottom: 10px;">
57
- <h2><strong>Audio Flamingo 3</strong></h2>
58
- <p><em>Advancing Audio Intelligence with Fully Open Large Audio-Language Models</em></p>
59
  </div>
60
 
61
  <div align="center" style="margin-top: 10px;">
62
- <a href="https://arxiv.org/abs/2507.08128">
63
- <img src="https://img.shields.io/badge/arXiv-2503.03983-AD1C18" alt="arXiv" style="display:inline;">
64
- </a>
65
- <a href="https://research.nvidia.com/labs/adlr/AF3/">
66
- <img src="https://img.shields.io/badge/Demo%20page-228B22" alt="Demo Page" style="display:inline;">
67
- </a>
68
- <a href="https://github.com/NVIDIA/audio-flamingo">
69
- <img src="https://img.shields.io/badge/Github-Audio_Flamingo_3-9C276A" alt="GitHub" style="display:inline;">
70
- </a>
71
- <a href="https://github.com/NVIDIA/audio-flamingo/stargazers">
72
- <img src="https://img.shields.io/github/stars/NVIDIA/audio-flamingo.svg?style=social" alt="GitHub Stars" style="display:inline;">
73
- </a>
74
- </div>
75
- <div align="center" style="display: flex; justify-content: center; margin-top: 10px; flex-wrap: wrap; gap: 5px;">
76
  <a href="https://huggingface.co/nvidia/audio-flamingo-3">
77
- <img src="https://img.shields.io/badge/🤗-Checkpoints-ED5A22.svg">
78
  </a>
79
- <a href="https://huggingface.co/nvidia/audio-flamingo-3-chat">
80
- <img src="https://img.shields.io/badge/🤗-Checkpoints_(Chat)-ED5A22.svg">
81
  </a>
82
  </div>
83
- <div align="center" style="display: flex; justify-content: center; margin-top: 10px; flex-wrap: wrap; gap: 5px;">
84
- <a href="https://huggingface.co/datasets/nvidia/AudioSkills">
85
- <img src="https://img.shields.io/badge/🤗-Dataset:_AudioSkills--XL-ED5A22.svg">
86
- </a>
87
- <a href="https://huggingface.co/datasets/nvidia/LongAudio">
88
- <img src="https://img.shields.io/badge/🤗-Dataset:_LongAudio--XL-ED5A22.svg">
89
- </a>
90
- <a href="https://huggingface.co/datasets/nvidia/AF-Chat">
91
- <img src="https://img.shields.io/badge/🤗-Dataset:_AF--Chat-ED5A22.svg">
92
- </a>
93
- <a href="https://huggingface.co/datasets/nvidia/AF-Think">
94
- <img src="https://img.shields.io/badge/🤗-Dataset:_AF--Think-ED5A22.svg">
95
- </a>
96
  </div>
97
  """)
98
- # gr.Markdown("#### NVIDIA (2025)")
99
 
100
  with gr.Tabs():
101
  # ---------------- SINGLE-TURN ----------------
102
- with gr.Tab("🎯 Single-Turn Inference"):
103
  with gr.Row():
104
  with gr.Column():
105
- audio_input_single = gr.Audio(type="filepath", label="Upload Audio")
106
- prompt_input_single = gr.Textbox(label="Prompt", placeholder="Ask a question about the audio...", lines=8)
107
- btn_single = gr.Button("Generate Answer")
108
 
109
  gr.Examples(
110
  examples=[
111
- ["static/emergent/audio1.wav", "What is surprising about the relationship between the barking and the music?"],
112
- ["static/audio/audio2.wav", "Please describe the audio in detail."],
113
- ["static/speech/audio3.wav", "Transcribe any speech you hear."],
114
  ],
115
  inputs=[audio_input_single, prompt_input_single],
116
- label="🧪 Try Examples"
117
  )
118
 
119
  with gr.Column():
@@ -121,41 +95,26 @@ with gr.Blocks(css="""
121
 
122
  btn_single.click(fn=single_turn_infer, inputs=[audio_input_single, prompt_input_single], outputs=output_single)
123
 
124
- with gr.Tab("📄 About"):
125
  gr.Markdown("""
126
- ### 📚 Overview
127
 
128
- **Audio Flamingo 3** is a fully open state-of-the-art (SOTA) large audio-language model that advances reasoning and understanding across speech, sound, and music. AF3 introduces:
 
129
 
130
- (i) AF-Whisper, a unified audio encoder trained using a novel strategy for joint representation learning across all 3 modalities of speech, sound, and music;
131
-
132
- (ii) flexible, on-demand thinking, allowing the model to do chain-of-thought reasoning before answering;
133
-
134
- (iii) multi-turn, multi-audio chat;
135
-
136
- (iv) long audio understanding and reasoning (including speech) up to 10 minutes; and
137
-
138
- (v) voice-to-voice interaction.
139
-
140
- To enable these capabilities, we propose several large-scale training datasets curated using novel strategies, including AudioSkills-XL, LongAudio-XL, AF-Think, and AF-Chat, and train AF3 with a novel five-stage curriculum-based training strategy. Trained on only open-source audio data, AF3 achieves new SOTA results on over 20+ (long) audio understanding and reasoning benchmarks, surpassing both open-weight and closed-source models trained on much larger datasets.
141
-
142
- **Key Features:**
143
-
144
- 💡 Audio Flamingo 3 has strong audio, music and speech understanding capabilities.
145
-
146
- 💡 Audio Flamingo 3 supports on-demand thinking for chain-of-though reasoning.
147
-
148
- 💡 Audio Flamingo 3 supports long audio and speech understanding for audios up to 10 minutes.
149
-
150
- 💡 Audio Flamingo 3 can have multi-turn, multi-audio chat with users under complex context.
151
-
152
- 💡 Audio Flamingo 3 has voice-to-voice conversation abilities.
153
 
 
 
 
154
 
 
155
  """)
156
 
157
- gr.Markdown("© 2025 NVIDIA | Built with ❤️ using Gradio + PyTorch")
158
-
159
 
160
  # -----------------------
161
  # Launch App
 
5
  import spaces
6
  from huggingface_hub import snapshot_download
7
  import copy
8
+
9
  # ---------------------------------
10
  # SINGLE-TURN MODEL SETUP
11
  # ---------------------------------
 
13
  MODEL_BASE_SINGLE = snapshot_download(repo_id="nvidia/audio-flamingo-3")
14
  MODEL_BASE_THINK = os.path.join(MODEL_BASE_SINGLE, 'stage35')
15
 
 
16
  model_single = llava.load(MODEL_BASE_SINGLE, model_base=None)
17
  model_single = model_single.to("cuda")
18
 
 
53
  with gr.Column():
54
  gr.HTML("""
55
  <div align="center">
56
+ <img src="https://raw.githubusercontent.com/NVIDIA/audio-flamingo/audio_flamingo_3/static/logo-no-bg.png" alt="Audio Flamingo 3 Logo" width="110" style="margin-bottom: 8px;">
57
+ <h2><strong>Audio Understanding Demo (WeaveMuse Edition)</strong></h2>
58
+ <p><em>Powered by NVIDIA’s open Audio Flamingo 3 model</em></p>
59
  </div>
60
 
61
  <div align="center" style="margin-top: 10px;">
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  <a href="https://huggingface.co/nvidia/audio-flamingo-3">
63
+ <img src="https://img.shields.io/badge/🤗-Model:_audio--flamingo--3-ED5A22.svg">
64
  </a>
65
+ <a href="https://github.com/NVIDIA/audio-flamingo">
66
+ <img src="https://img.shields.io/badge/GitHub-NVIDIA%2Faudio--flamingo-9C276A.svg">
67
  </a>
68
  </div>
69
+ <div align="center" style="margin-top: 8px;">
70
+ <p><small>This simple interface demonstrates audio–language reasoning and transcription using open models from NVIDIA. Integrated for use in the <strong>WeaveMuse</strong> framework.</small></p>
 
 
 
 
 
 
 
 
 
 
 
71
  </div>
72
  """)
 
73
 
74
  with gr.Tabs():
75
  # ---------------- SINGLE-TURN ----------------
76
+ with gr.Tab("🎧 Audio Inference"):
77
  with gr.Row():
78
  with gr.Column():
79
+ audio_input_single = gr.Audio(type="filepath", label="Upload Audio Clip")
80
+ prompt_input_single = gr.Textbox(label="Prompt", placeholder="Ask about the audio (e.g. 'Describe the sound' or 'Transcribe any speech')", lines=6)
81
+ btn_single = gr.Button("Generate Response")
82
 
83
  gr.Examples(
84
  examples=[
85
+ ["static/emergent/audio1.wav", "What is happening in this audio?"],
86
+ ["static/audio/audio2.wav", "Describe the sounds you hear."],
87
+ ["static/speech/audio3.wav", "Transcribe the spoken words."],
88
  ],
89
  inputs=[audio_input_single, prompt_input_single],
90
+ label="🧪 Example Prompts"
91
  )
92
 
93
  with gr.Column():
 
95
 
96
  btn_single.click(fn=single_turn_infer, inputs=[audio_input_single, prompt_input_single], outputs=output_single)
97
 
98
+ with gr.Tab("ℹ️ About"):
99
  gr.Markdown("""
100
+ ### 🎶 Overview
101
 
102
+ This demo showcases a **simple single-turn audio–language interface** built for integration with the **WeaveMuse** framework.
103
+ It uses the open-source **NVIDIA Audio Flamingo 3** model for audio understanding, transcription, and sound reasoning.
104
 
105
+ You can upload an audio file and ask natural-language questions such as:
106
+ - “What kind of sound is this?”
107
+ - “Describe the scene.”
108
+ - “Transcribe any speech.”
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
 
110
+ **Acknowledgment:**
111
+ Model and research credit to **NVIDIA**, for the development of the open **Audio Flamingo 3** model and datasets used for training.
112
+ This interface is a simplified demonstration of how such models can be integrated into broader creative AI systems like WeaveMuse.
113
 
114
+ **Tech stack:** Gradio + PyTorch + llava + WeaveMuse Integration
115
  """)
116
 
117
+ gr.Markdown("© 2025 WeaveMuse | Model © NVIDIA — Audio Flamingo 3 | Built with ❤️ using Gradio & PyTorch")
 
118
 
119
  # -----------------------
120
  # Launch App