	Update app.py
app.py CHANGED
@@ -5,6 +5,7 @@ import os
 import spaces
 from huggingface_hub import snapshot_download
 import copy
+
 # ---------------------------------
 # SINGLE-TURN MODEL SETUP
 # ---------------------------------
@@ -12,7 +13,6 @@ import copy
 MODEL_BASE_SINGLE = snapshot_download(repo_id="nvidia/audio-flamingo-3")
 MODEL_BASE_THINK = os.path.join(MODEL_BASE_SINGLE, 'stage35')
 
-# model_single = llava.load(MODEL_BASE_SINGLE, model_base=None, devices=[0])
 model_single = llava.load(MODEL_BASE_SINGLE, model_base=None)
 model_single = model_single.to("cuda")
 
@@ -53,67 +53,41 @@ with gr.Blocks(css="""
     with gr.Column():
         gr.HTML("""
 <div align="center">
-  <img src="https://raw.githubusercontent.com/NVIDIA/audio-flamingo/audio_flamingo_3/static/logo-no-bg.png" alt="Audio Flamingo 3 Logo" width="…
-  <h2><strong>Audio …
-  <p><em>…
+  <img src="https://raw.githubusercontent.com/NVIDIA/audio-flamingo/audio_flamingo_3/static/logo-no-bg.png" alt="Audio Flamingo 3 Logo" width="110" style="margin-bottom: 8px;">
+  <h2><strong>Audio Understanding Demo (WeaveMuse Edition)</strong></h2>
+  <p><em>Powered by NVIDIA’s open Audio Flamingo 3 model</em></p>
 </div>
 
 <div align="center" style="margin-top: 10px;">
-  <a href="https://arxiv.org/abs/2507.08128">
-    <img src="https://img.shields.io/badge/arXiv-2503.03983-AD1C18" alt="arXiv" style="display:inline;">
-  </a>
-  <a href="https://research.nvidia.com/labs/adlr/AF3/">
-    <img src="https://img.shields.io/badge/Demo%20page-228B22" alt="Demo Page" style="display:inline;">
-  </a>
-  <a href="https://github.com/NVIDIA/audio-flamingo">
-    <img src="https://img.shields.io/badge/Github-Audio_Flamingo_3-9C276A" alt="GitHub" style="display:inline;">
-  </a>
-  <a href="https://github.com/NVIDIA/audio-flamingo/stargazers">
-    <img src="https://img.shields.io/github/stars/NVIDIA/audio-flamingo.svg?style=social" alt="GitHub Stars" style="display:inline;">
-  </a>
-</div>
-<div align="center" style="display: flex; justify-content: center; margin-top: 10px; flex-wrap: wrap; gap: 5px;">
   <a href="https://huggingface.co/nvidia/audio-flamingo-3">
-    <img src="https://img.shields.io/badge/🤗-…
+    <img src="https://img.shields.io/badge/🤗-Model:_audio--flamingo--3-ED5A22.svg">
   </a>
-  <a href="https://…
-    <img src="https://img.shields.io/badge…
+  <a href="https://github.com/NVIDIA/audio-flamingo">
+    <img src="https://img.shields.io/badge/GitHub-NVIDIA%2Faudio--flamingo-9C276A.svg">
   </a>
 </div>
-<div align="center" style="…
-  <…
-    <img src="https://img.shields.io/badge/🤗-Dataset:_AudioSkills--XL-ED5A22.svg">
-  </a>
-  <a href="https://huggingface.co/datasets/nvidia/LongAudio">
-    <img src="https://img.shields.io/badge/🤗-Dataset:_LongAudio--XL-ED5A22.svg">
-  </a>
-  <a href="https://huggingface.co/datasets/nvidia/AF-Chat">
-    <img src="https://img.shields.io/badge/🤗-Dataset:_AF--Chat-ED5A22.svg">
-  </a>
-  <a href="https://huggingface.co/datasets/nvidia/AF-Think">
-    <img src="https://img.shields.io/badge/🤗-Dataset:_AF--Think-ED5A22.svg">
-  </a>
+<div align="center" style="margin-top: 8px;">
+  <p><small>This simple interface demonstrates audio–language reasoning and transcription using open models from NVIDIA. Integrated for use in the <strong>WeaveMuse</strong> framework.</small></p>
 </div>
 """)
-    # gr.Markdown("#### NVIDIA (2025)")
 
     with gr.Tabs():
         # ---------------- SINGLE-TURN ----------------
-        with gr.Tab("…
+        with gr.Tab("🎧 Audio Inference"):
             with gr.Row():
                 with gr.Column():
-                    audio_input_single = gr.Audio(type="filepath", label="Upload Audio")
-                    prompt_input_single = gr.Textbox(label="Prompt", placeholder="Ask …
-                    btn_single = gr.Button("Generate …
+                    audio_input_single = gr.Audio(type="filepath", label="Upload Audio Clip")
+                    prompt_input_single = gr.Textbox(label="Prompt", placeholder="Ask about the audio (e.g. 'Describe the sound' or 'Transcribe any speech')", lines=6)
+                    btn_single = gr.Button("Generate Response")
 
                     gr.Examples(
                         examples=[
-                            ["static/emergent/audio1.wav", "What is …
-                            ["static/audio/audio2.wav", "…
-                            ["static/speech/audio3.wav", "Transcribe …
+                            ["static/emergent/audio1.wav", "What is happening in this audio?"],
+                            ["static/audio/audio2.wav", "Describe the sounds you hear."],
+                            ["static/speech/audio3.wav", "Transcribe the spoken words."],
                         ],
                         inputs=[audio_input_single, prompt_input_single],
-                        label="🧪 …
+                        label="🧪 Example Prompts"
                     )
 
                 with gr.Column():
@@ -121,41 +95,26 @@ with gr.Blocks(css="""
 
             btn_single.click(fn=single_turn_infer, inputs=[audio_input_single, prompt_input_single], outputs=output_single)
 
-        with gr.Tab("…
+        with gr.Tab("ℹ️ About"):
             gr.Markdown("""
-### …
+### 🎶 Overview
 
-…
-
-(i) …
-
-(ii) …
-
-(iii) multi-turn, multi-audio chat;
-
-(iv) long audio understanding and reasoning (including speech) up to 10 minutes; and
-
-(v) voice-to-voice interaction.
-
-To enable these capabilities, we propose several large-scale training datasets curated using novel strategies, including AudioSkills-XL, LongAudio-XL, AF-Think, and AF-Chat, and train AF3 with a novel five-stage curriculum-based training strategy. Trained on only open-source audio data, AF3 achieves new SOTA results on over 20+ (long) audio understanding and reasoning benchmarks, surpassing both open-weight and closed-source models trained on much larger datasets.
-
-**Key Features:**
-
-💡 Audio Flamingo 3 has strong audio, music and speech understanding capabilities.
-
-💡 Audio Flamingo 3 supports on-demand thinking for chain-of-though reasoning.
-
-💡 Audio Flamingo 3 supports long audio and speech understanding for audios up to 10 minutes.
-
-💡 Audio Flamingo 3 can have multi-turn, multi-audio chat with users under complex context.
-
-💡 Audio Flamingo 3 has voice-to-voice conversation abilities.
+This demo showcases a **simple single-turn audio–language interface** built for integration with the **WeaveMuse** framework.
+It uses the open-source **NVIDIA Audio Flamingo 3** model for audio understanding, transcription, and sound reasoning.
+
+You can upload an audio file and ask natural-language questions such as:
+- “What kind of sound is this?”
+- “Describe the scene.”
+- “Transcribe any speech.”
+
+**Acknowledgment:**
+Model and research credit to **NVIDIA**, for the development of the open **Audio Flamingo 3** model and datasets used for training.
+This interface is a simplified demonstration of how such models can be integrated into broader creative AI systems like WeaveMuse.
+
+**Tech stack:** Gradio + PyTorch + llava + WeaveMuse Integration
 
 """)
 
-    gr.Markdown("© 2025 NVIDIA | Built with ❤️ using Gradio …
-…
+    gr.Markdown("© 2025 WeaveMuse | Model © NVIDIA — Audio Flamingo 3 | Built with ❤️ using Gradio & PyTorch")
 
 # -----------------------
 # Launch App
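Note: the handler wired to `btn_single.click`, `single_turn_infer`, is defined in a part of `app.py` this diff does not touch. For context, a minimal sketch of what such a handler typically looks like on a ZeroGPU Space, assuming the audio-flamingo `llava` package exposes a `Sound` media type and a `generate_content` method; the `full_prompt` template and the `<sound>` placeholder are illustrative assumptions, not the Space's confirmed implementation:

```python
import spaces                   # Hugging Face ZeroGPU helper (already imported in app.py)
from llava.media import Sound   # assumed import path in the audio-flamingo llava fork

@spaces.GPU                     # ZeroGPU: attach a GPU only while this function runs
def single_turn_infer(audio_path: str, prompt: str) -> str:
    """Answer one prompt about one uploaded audio file (sketch, not the actual app code)."""
    if not audio_path or not prompt:
        return "Please provide both an audio file and a prompt."
    sound = Sound(audio_path)            # wrap the uploaded file as an audio media object
    full_prompt = f"<sound>\n{prompt}"   # hypothetical audio-placeholder prompt template
    # model_single is the module-level model loaded via llava.load(...) in the diff above
    return model_single.generate_content([sound, full_prompt])
```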
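Design note: keeping `model_single.to("cuda")` at import time, as the diff does, works on ZeroGPU because the `spaces` package defers real CUDA initialization until a `@spaces.GPU`-decorated function executes, so the Space only holds a GPU during those calls. This is documented ZeroGPU behavior rather than something visible in the diff itself.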
 
			
