Trouter-Library committed
Commit ef0c6e1 · verified · 1 Parent(s): dccc9c1

Create inference/client.py

Files changed (1):
  1. inference/client.py  +309 -0
inference/client.py ADDED
@@ -0,0 +1,309 @@
#!/usr/bin/env python3
"""
Helion-2.5-Rnd Python Client
Easy-to-use client for interacting with Helion inference server
"""

import json
import requests
from typing import Dict, Generator, List, Optional, Union


class HelionClient:
    """Client for Helion-2.5-Rnd inference API"""

    def __init__(
        self,
        base_url: str = "http://localhost:8000",
        api_key: Optional[str] = None,
        timeout: int = 300
    ):
        """
        Initialize Helion client

        Args:
            base_url: Base URL of the inference server
            api_key: Optional API key for authentication
            timeout: Request timeout in seconds
        """
        self.base_url = base_url.rstrip('/')
        self.timeout = timeout
        self.headers = {
            "Content-Type": "application/json"
        }
        if api_key:
            self.headers["Authorization"] = f"Bearer {api_key}"

    def chat(
        self,
        messages: List[Dict[str, str]],
        temperature: float = 0.7,
        max_tokens: int = 4096,
        stream: bool = False,
        **kwargs
    ) -> Union[str, Generator[str, None, None]]:
        """
        Send a chat completion request

        Args:
            messages: List of message dicts with 'role' and 'content'
            temperature: Sampling temperature (0.0 to 2.0)
            max_tokens: Maximum tokens to generate
            stream: Whether to stream the response
            **kwargs: Additional parameters

        Returns:
            Generated text or generator for streaming
        """
        payload = {
            "messages": messages,
            "temperature": temperature,
            "max_tokens": max_tokens,
            "stream": stream,
            **kwargs
        }

        if stream:
            return self._stream_chat(payload)
        else:
            return self._complete_chat(payload)

    def _complete_chat(self, payload: Dict) -> str:
        """Non-streaming chat completion"""
        response = requests.post(
            f"{self.base_url}/v1/chat/completions",
            headers=self.headers,
            json=payload,
            timeout=self.timeout
        )
        response.raise_for_status()

        data = response.json()
        return data["choices"][0]["message"]["content"]

    def _stream_chat(self, payload: Dict) -> Generator[str, None, None]:
        """Streaming chat completion"""
        response = requests.post(
            f"{self.base_url}/v1/chat/completions",
            headers=self.headers,
            json=payload,
            stream=True,
            timeout=self.timeout
        )
        response.raise_for_status()

        for line in response.iter_lines():
            if line:
                line = line.decode('utf-8')
                if line.startswith('data: '):
                    data_str = line[6:]
                    if data_str == '[DONE]':
                        break

                    try:
                        data = json.loads(data_str)
                        delta = data["choices"][0]["delta"].get("content", "")
                        if delta:
                            yield delta
                    except json.JSONDecodeError:
                        continue

    def complete(
        self,
        prompt: str,
        temperature: float = 0.7,
        max_tokens: int = 4096,
        stream: bool = False,
        **kwargs
    ) -> Union[str, Generator[str, None, None]]:
        """
        Send a text completion request

        Args:
            prompt: Input text prompt
            temperature: Sampling temperature
            max_tokens: Maximum tokens to generate
            stream: Whether to stream the response
            **kwargs: Additional parameters

        Returns:
            Generated text or generator for streaming
        """
        messages = [{"role": "user", "content": prompt}]
        return self.chat(
            messages=messages,
            temperature=temperature,
            max_tokens=max_tokens,
            stream=stream,
            **kwargs
        )

    def health_check(self) -> Dict:
        """Check server health"""
        response = requests.get(
            f"{self.base_url}/health",
            headers=self.headers,
            timeout=10
        )
        response.raise_for_status()
        return response.json()

    def list_models(self) -> List[Dict]:
        """List available models"""
        response = requests.get(
            f"{self.base_url}/v1/models",
            headers=self.headers,
            timeout=10
        )
        response.raise_for_status()
        return response.json()["data"]


class HelionAssistant:
    """High-level assistant interface for Helion"""

    def __init__(
        self,
        base_url: str = "http://localhost:8000",
        system_prompt: Optional[str] = None,
        **client_kwargs
    ):
        """
        Initialize Helion assistant

        Args:
            base_url: Base URL of inference server
            system_prompt: System prompt to use for all conversations
            **client_kwargs: Additional arguments for HelionClient
        """
        self.client = HelionClient(base_url=base_url, **client_kwargs)
        self.system_prompt = system_prompt or (
            "You are Helion, an advanced AI assistant developed by DeepXR. "
            "You are helpful, harmless, and honest."
        )
        self.conversation_history: List[Dict[str, str]] = []

    def chat(
        self,
        message: str,
        temperature: float = 0.7,
        max_tokens: int = 4096,
        stream: bool = False,
        reset_history: bool = False
    ) -> Union[str, Generator[str, None, None]]:
        """
        Chat with the assistant

        Args:
            message: User message
            temperature: Sampling temperature
            max_tokens: Maximum tokens to generate
            stream: Whether to stream the response
            reset_history: Whether to reset conversation history

        Returns:
            Assistant response
        """
        if reset_history:
            self.conversation_history = []

        # Build messages
        messages = [{"role": "system", "content": self.system_prompt}]
        messages.extend(self.conversation_history)
        messages.append({"role": "user", "content": message})

        # Get response
        if stream:
            return self._stream_and_store(messages, temperature, max_tokens, message)
        else:
            response = self.client.chat(
                messages=messages,
                temperature=temperature,
                max_tokens=max_tokens,
                stream=False
            )

            # Update history
            self.conversation_history.append({"role": "user", "content": message})
            self.conversation_history.append({"role": "assistant", "content": response})

            return response

    def _stream_and_store(
        self,
        messages: List[Dict],
        temperature: float,
        max_tokens: int,
        user_message: str
    ) -> Generator[str, None, None]:
        """Stream response and store in history"""
        full_response = ""

        for chunk in self.client.chat(
            messages=messages,
            temperature=temperature,
            max_tokens=max_tokens,
            stream=True
        ):
            full_response += chunk
            yield chunk

        # Update history after streaming complete
        self.conversation_history.append({"role": "user", "content": user_message})
        self.conversation_history.append({"role": "assistant", "content": full_response})

    def reset(self):
        """Reset conversation history"""
        self.conversation_history = []

    def get_history(self) -> List[Dict[str, str]]:
        """Get conversation history"""
        return self.conversation_history.copy()


# Example usage
def example_usage():
    """Example usage of Helion client"""

    # Initialize client
    client = HelionClient(base_url="http://localhost:8000")

    # Check health
    health = client.health_check()
    print(f"Server status: {health['status']}")

    # Simple completion
    response = client.complete(
        "Explain quantum computing in simple terms:",
        temperature=0.7,
        max_tokens=500
    )
    print(f"\nResponse: {response}")

    # Chat with conversation
    messages = [
        {"role": "system", "content": "You are a helpful coding assistant."},
        {"role": "user", "content": "Write a Python function to calculate fibonacci numbers"}
    ]

    response = client.chat(messages=messages, temperature=0.3)
    print(f"\nCode: {response}")

    # Streaming example
    print("\nStreaming response:")
    for chunk in client.complete("Tell me a short story about AI:", stream=True):
        print(chunk, end='', flush=True)
    print()

    # Using assistant interface
    assistant = HelionAssistant()
    response = assistant.chat("What is machine learning?")
    print(f"\nAssistant: {response}")

    # Continue conversation
    response = assistant.chat("Can you give me an example?")
    print(f"\nAssistant: {response}")


if __name__ == "__main__":
    example_usage()
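
A minimal usage sketch of the new client with basic error handling, assuming `inference` is importable as a package (i.e. it has an `__init__.py`) and a server is listening at the default URL; the snippet is illustrative and not part of the committed file:

# Illustrative only: exercises HelionClient with basic error handling.
# The import path and server URL are assumptions based on the file layout above.
import requests

from inference.client import HelionClient

client = HelionClient(base_url="http://localhost:8000", timeout=60)

try:
    print(client.health_check())
    reply = client.complete("Say hello in one sentence.", max_tokens=64)
    print(reply)
except requests.exceptions.ConnectionError:
    print("Inference server is not reachable at the configured base_url.")
except requests.exceptions.Timeout:
    print("Request timed out; consider raising the client timeout.")
except requests.exceptions.HTTPError as err:
    print(f"Server returned an HTTP error: {err.response.status_code}")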