import torch
import torchaudio
import gradio as gr

from liquid_audio import LFM2AudioModel, LFM2AudioProcessor, ChatState, LFMModality

HF_REPO = "LiquidAI/LFM2-Audio-1.5B"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load processor and model
processor = LFM2AudioProcessor.from_pretrained(HF_REPO)
model = LFM2AudioModel.from_pretrained(HF_REPO).to(device).eval()

# Persistent chat state
chat = ChatState(processor)


def reset_chat():
    global chat
    chat = ChatState(processor)
    return [], "Chat reset successfully."


def generate_response(audio_input, text_input, history):
    global chat

    # Initialize the system prompt on the first turn only
    if not history:
        chat.new_turn("system")
        chat.add_text(
            "You are a helpful multimodal AI assistant that can reply "
            "with both text and audio."
        )
        chat.end_turn()

    # New user turn: accept text, audio, or both
    chat.new_turn("user")
    if text_input:
        chat.add_text(text_input)
    if audio_input:
        wav, sr = torchaudio.load(audio_input)
        chat.add_audio(wav, sr)
    chat.end_turn()

    # Assistant generation: tokens stream back interleaved. Single-element
    # tensors are text tokens; multi-element tensors are Mimi audio codes.
    chat.new_turn("assistant")
    text_out, audio_out, modality_out = [], [], []
    for t in model.generate_interleaved(
        **chat,
        max_new_tokens=512,
        audio_temperature=1.0,
        audio_top_k=4,
    ):
        if t.numel() == 1:
            text_out.append(t)
            modality_out.append(LFMModality.TEXT)
        else:
            audio_out.append(t)
            modality_out.append(LFMModality.AUDIO_OUT)

    decoded_text, audio_path = "", None

    # Decode text output
    if text_out:
        tokens = torch.stack(text_out, 1)
        decoded_text = processor.text.decode(tokens)

    # Decode audio output (the final frame is an end-of-audio marker,
    # hence it is dropped before Mimi decoding)
    if audio_out:
        mimi_codes = torch.stack(audio_out[:-1], 1).unsqueeze(0)
        with torch.no_grad():
            waveform = processor.mimi.decode(mimi_codes)[0]
        audio_path = "assistant_reply.wav"
        torchaudio.save(audio_path, waveform.cpu(), 24000)  # Mimi runs at 24 kHz

    # Record the turn in the UI history and in the persistent chat state
    history.append((text_input or "[Audio Input]", decoded_text or "[Audio Output]"))
    chat.append(
        text=torch.stack(text_out, 1) if text_out else None,
        audio_out=torch.stack(audio_out, 1) if audio_out else None,
        modality_flag=torch.tensor(modality_out),
    )
    chat.end_turn()

    return history, decoded_text, audio_path


# === Gradio UI ===
with gr.Blocks(title="🎧 LFM2-Audio-1.5B Chat") as demo:
    gr.Markdown("## 🎧 LFM2-Audio-1.5B — Multimodal AI Chatbot")
    gr.Markdown("Chat using **text or voice** — get replies in **text and audio** form.")

    with gr.Row():
        text_inp = gr.Textbox(label="💬 Type your message", placeholder="Say something...")
        # `sources` replaces the older `source` kwarg in Gradio 4+
        audio_inp = gr.Audio(sources=["microphone", "upload"], type="filepath", label="🎙 Record / Upload Audio")

    with gr.Row():
        send_btn = gr.Button("Generate Response", variant="primary")
        reset_btn = gr.Button("🔁 Reset Chat")

    chatbox = gr.Chatbot(label="Conversation History", height=400)
    text_out = gr.Textbox(label="📝 Text Response")
    audio_out = gr.Audio(label="🔊 Audio Response", type="filepath")

    send_btn.click(
        generate_response,
        inputs=[audio_inp, text_inp, chatbox],
        outputs=[chatbox, text_out, audio_out],
    )
    reset_btn.click(reset_chat, outputs=[chatbox, text_out])

demo.queue().launch()
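
# ---------------------------------------------------------------------------
# Usage sketch (assumptions: the pip package name is inferred from the
# `liquid_audio` import above, and the filename is hypothetical):
#
#   pip install liquid-audio gradio torchaudio
#   python app.py
#
# The first launch downloads the LFM2-Audio-1.5B weights from Hugging Face;
# a CUDA GPU is recommended, though the script falls back to CPU.
# ---------------------------------------------------------------------------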