Existance committed on
Commit
5c2f019
Β·
verified Β·
1 Parent(s): d45cf80

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +107 -0
app.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Gradio demo for LiquidAI's LFM2-Audio-1.5B multimodal (text + audio) chat model.
import torch
import torchaudio
import gradio as gr
from liquid_audio import LFM2AudioModel, LFM2AudioProcessor, ChatState, LFMModality

# Hugging Face Hub repository the weights are pulled from.
HF_REPO = "LiquidAI/LFM2-Audio-1.5B"

# Prefer GPU inference when available; fall back to CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load processor and model (model is put in eval mode for inference only).
processor = LFM2AudioProcessor.from_pretrained(HF_REPO)
model = LFM2AudioModel.from_pretrained(HF_REPO).to(device).eval()

# Persistent chat state
# NOTE(review): a single module-level ChatState is shared across all browser
# sessions, so concurrent users interleave into one conversation — confirm
# whether per-session state (gr.State) is intended.
chat = ChatState(processor)
16
+
17
def reset_chat():
    """Discard the running conversation and start a fresh one.

    Rebinds the module-level ``chat`` to a brand-new ``ChatState`` and
    returns a ``(history, status)`` pair for the Gradio outputs: an empty
    chatbot history plus a confirmation message.
    """
    global chat
    fresh_state = ChatState(processor)
    chat = fresh_state
    cleared_history = []
    return cleared_history, "Chat reset successfully."
21
+
22
def generate_response(audio_input, text_input, history):
    """Run one chat turn through LFM2-Audio and return updated UI state.

    Parameters
    ----------
    audio_input : str | None
        Filepath of the recorded/uploaded user audio (Gradio ``type="filepath"``).
    text_input : str | None
        Typed user message; may be empty.
    history : list
        Gradio Chatbot history as ``(user, assistant)`` tuples.

    Returns
    -------
    tuple
        ``(updated_history, decoded_text, audio_path_or_None)``.
    """
    global chat

    # Initialize system prompt if first turn
    if not history:
        chat.new_turn("system")
        chat.add_text("You are a helpful multimodal AI assistant that can reply with both text and audio.")
        chat.end_turn()

    # New user turn: text and/or audio may be present.
    chat.new_turn("user")
    if text_input:
        chat.add_text(text_input)
    if audio_input:
        wav, sr = torchaudio.load(audio_input)
        chat.add_audio(wav, sr)
    chat.end_turn()

    # Assistant generation
    chat.new_turn("assistant")

    text_out, audio_out, modality_out = [], [], []

    # Fix: run generation under no_grad — this is pure inference, and the
    # original accumulated autograd state for every generated token.
    with torch.no_grad():
        for t in model.generate_interleaved(
            **chat, max_new_tokens=512, audio_temperature=1.0, audio_top_k=4
        ):
            # Scalar tensors are text token ids; vectors are audio codec frames.
            if t.numel() == 1:
                text_out.append(t)
                modality_out.append(LFMModality.TEXT)
            else:
                audio_out.append(t)
                modality_out.append(LFMModality.AUDIO_OUT)

    decoded_text, audio_path = "", None

    # Fix: stack each modality once and reuse (the original called
    # torch.stack on text_out twice and on audio_out twice).
    text_tokens = torch.stack(text_out, 1) if text_out else None
    audio_tokens = torch.stack(audio_out, 1) if audio_out else None

    # Decode text output
    if text_tokens is not None:
        decoded_text = processor.text.decode(text_tokens)

    # Decode audio output
    if audio_tokens is not None:
        # Drop the final frame (presumably an end-of-audio marker — TODO
        # confirm against liquid_audio docs) and add the batch dim Mimi expects.
        mimi_codes = audio_tokens[:, :-1].unsqueeze(0)
        with torch.no_grad():
            waveform = processor.mimi.decode(mimi_codes)[0]
        # Fix: write to a unique temp file instead of the fixed relative path
        # "assistant_reply.wav", so concurrent Gradio requests don't
        # overwrite each other's reply audio.
        import tempfile
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            audio_path = tmp.name
        torchaudio.save(audio_path, waveform.cpu(), 24000)  # 24 kHz output rate

    # Record the turn in the UI history and append the generated tokens to
    # the persistent chat state so the next turn can see them.
    history.append((text_input or "[Audio Input]", decoded_text or "[Audio Output]"))
    chat.append(
        text=text_tokens,
        audio_out=audio_tokens,
        modality_flag=torch.tensor(modality_out),
    )
    chat.end_turn()

    return history, decoded_text, audio_path
80
+
81
+
82
# === Gradio UI ===
# Declarative Gradio layout; the two buttons are wired to the handlers
# defined above. (Mojibake from the extraction repaired: em dashes and
# emoji restored to their intended characters.)
with gr.Blocks(title="🎧 LFM2-Audio-1.5B Chat") as demo:
    gr.Markdown("## 🎧 LFM2-Audio-1.5B — Multimodal AI Chatbot")
    gr.Markdown("Chat using **text or voice** — get replies in **text and audio** form.")

    with gr.Row():
        text_inp = gr.Textbox(label="💬 Type your message", placeholder="Say something...")
        # NOTE(review): `source=` is the Gradio 3.x kwarg; Gradio 4+ renamed
        # it to `sources=[...]` — confirm the pinned gradio version.
        audio_inp = gr.Audio(source="microphone", type="filepath", label="🎙 Record / Upload Audio")

    with gr.Row():
        send_btn = gr.Button("Generate Response", variant="primary")
        reset_btn = gr.Button("🔄 Reset Chat")

    chatbox = gr.Chatbot(label="Conversation History", height=400)
    text_out = gr.Textbox(label="📝 Text Response")
    audio_out = gr.Audio(label="🔊 Audio Response", type="filepath")

    # Generate: feeds audio/text/history in; updates history, text, and audio.
    send_btn.click(
        generate_response,
        inputs=[audio_inp, text_inp, chatbox],
        outputs=[chatbox, text_out, audio_out],
    )

    # Reset: clears the chatbot and shows a status message
    # (the audio output component is left untouched).
    reset_btn.click(reset_chat, outputs=[chatbox, text_out])

# queue() enables request queuing so long generations aren't dropped.
demo.queue().launch()