#!/usr/bin/env python3
"""
Helion-2.5-Rnd Python Client
An easy-to-use client for interacting with the Helion inference server.
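
Example (assuming a server is listening on http://localhost:8000):
    client = HelionClient()
    print(client.complete("Hello, Helion!"))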
"""
import json
import requests
from typing import Dict, Generator, List, Optional, Union


class HelionClient:
"""Client for Helion-2.5-Rnd inference API"""

    def __init__(
self,
base_url: str = "http://localhost:8000",
api_key: Optional[str] = None,
timeout: int = 300
):
"""
Initialize Helion client
Args:
base_url: Base URL of the inference server
api_key: Optional API key for authentication
timeout: Request timeout in seconds
"""
self.base_url = base_url.rstrip('/')
self.timeout = timeout
self.headers = {
"Content-Type": "application/json"
}
if api_key:
self.headers["Authorization"] = f"Bearer {api_key}"

    def chat(
self,
messages: List[Dict[str, str]],
temperature: float = 0.7,
max_tokens: int = 4096,
stream: bool = False,
**kwargs
) -> Union[str, Generator[str, None, None]]:
"""
Send a chat completion request
Args:
messages: List of message dicts with 'role' and 'content'
temperature: Sampling temperature (0.0 to 2.0)
max_tokens: Maximum tokens to generate
stream: Whether to stream the response
**kwargs: Additional parameters
Returns:
Generated text or generator for streaming
"""
payload = {
"messages": messages,
"temperature": temperature,
"max_tokens": max_tokens,
"stream": stream,
**kwargs
}
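        # Any extra **kwargs are forwarded verbatim, so server-specific
        # fields (e.g. "model" or "top_p", where the server supports them)
        # can be passed straight through.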
if stream:
return self._stream_chat(payload)
else:
return self._complete_chat(payload)

    def _complete_chat(self, payload: Dict) -> str:
"""Non-streaming chat completion"""
response = requests.post(
f"{self.base_url}/v1/chat/completions",
headers=self.headers,
json=payload,
timeout=self.timeout
)
response.raise_for_status()
data = response.json()
return data["choices"][0]["message"]["content"]

    def _stream_chat(self, payload: Dict) -> Generator[str, None, None]:
"""Streaming chat completion"""
response = requests.post(
f"{self.base_url}/v1/chat/completions",
headers=self.headers,
json=payload,
stream=True,
timeout=self.timeout
)
response.raise_for_status()
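        # The server is assumed to emit OpenAI-style server-sent events:
        # one JSON payload per "data: " line, terminated by a "[DONE]"
        # sentinel, e.g.
        #   data: {"choices": [{"delta": {"content": "Hel"}}]}
        #   data: [DONE]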
for line in response.iter_lines():
if line:
line = line.decode('utf-8')
if line.startswith('data: '):
data_str = line[6:]
if data_str == '[DONE]':
break
                    try:
                        data = json.loads(data_str)
                        # Tolerate keep-alive chunks and unexpected payload
                        # shapes instead of aborting the whole stream.
                        delta = data["choices"][0].get("delta", {}).get("content", "")
                        if delta:
                            yield delta
                    except (json.JSONDecodeError, KeyError, IndexError):
                        continue

    def complete(
self,
prompt: str,
temperature: float = 0.7,
max_tokens: int = 4096,
stream: bool = False,
**kwargs
) -> Union[str, Generator[str, None, None]]:
"""
        Send a single-turn completion request (wrapped as a chat message)
Args:
prompt: Input text prompt
temperature: Sampling temperature
max_tokens: Maximum tokens to generate
stream: Whether to stream the response
**kwargs: Additional parameters
Returns:
Generated text or generator for streaming
"""
messages = [{"role": "user", "content": prompt}]
return self.chat(
messages=messages,
temperature=temperature,
max_tokens=max_tokens,
stream=stream,
**kwargs
)

    def health_check(self) -> Dict:
"""Check server health"""
response = requests.get(
f"{self.base_url}/health",
headers=self.headers,
timeout=10
)
response.raise_for_status()
return response.json()

    def list_models(self) -> List[Dict]:
"""List available models"""
response = requests.get(
f"{self.base_url}/v1/models",
headers=self.headers,
timeout=10
)
response.raise_for_status()
return response.json()["data"]
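
# Every HTTP helper above raises requests.exceptions.HTTPError on a non-2xx
# response via raise_for_status(). A minimal sketch of defensive usage:
#
#   try:
#       HelionClient().health_check()
#   except requests.exceptions.RequestException as exc:
#       print(f"Server unreachable or unhealthy: {exc}")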


class HelionAssistant:
"""High-level assistant interface for Helion"""

    def __init__(
self,
base_url: str = "http://localhost:8000",
system_prompt: Optional[str] = None,
**client_kwargs
):
"""
Initialize Helion assistant
Args:
base_url: Base URL of inference server
system_prompt: System prompt to use for all conversations
**client_kwargs: Additional arguments for HelionClient
"""
self.client = HelionClient(base_url=base_url, **client_kwargs)
self.system_prompt = system_prompt or (
"You are Helion, an advanced AI assistant developed by DeepXR. "
"You are helpful, harmless, and honest."
)
self.conversation_history: List[Dict[str, str]] = []

    def chat(
self,
message: str,
temperature: float = 0.7,
max_tokens: int = 4096,
stream: bool = False,
reset_history: bool = False
) -> Union[str, Generator[str, None, None]]:
"""
Chat with the assistant
Args:
message: User message
temperature: Sampling temperature
max_tokens: Maximum tokens to generate
stream: Whether to stream the response
reset_history: Whether to reset conversation history
Returns:
Assistant response
"""
if reset_history:
self.conversation_history = []
# Build messages
messages = [{"role": "system", "content": self.system_prompt}]
messages.extend(self.conversation_history)
messages.append({"role": "user", "content": message})
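        # The full history is replayed on every request; nothing here trims
        # it, so very long conversations may eventually exceed the model's
        # context window.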
# Get response
if stream:
return self._stream_and_store(messages, temperature, max_tokens, message)
else:
response = self.client.chat(
messages=messages,
temperature=temperature,
max_tokens=max_tokens,
stream=False
)
# Update history
self.conversation_history.append({"role": "user", "content": message})
self.conversation_history.append({"role": "assistant", "content": response})
return response

    def _stream_and_store(
self,
messages: List[Dict],
temperature: float,
max_tokens: int,
user_message: str
) -> Generator[str, None, None]:
"""Stream response and store in history"""
full_response = ""
for chunk in self.client.chat(
messages=messages,
temperature=temperature,
max_tokens=max_tokens,
stream=True
):
full_response += chunk
yield chunk
        # Update history once streaming completes; this only runs if the
        # caller fully consumes the generator.
self.conversation_history.append({"role": "user", "content": user_message})
self.conversation_history.append({"role": "assistant", "content": full_response})

    def reset(self):
"""Reset conversation history"""
self.conversation_history = []

    def get_history(self) -> List[Dict[str, str]]:
"""Get conversation history"""
return self.conversation_history.copy()


# Example usage
def example_usage():
"""Example usage of Helion client"""
# Initialize client
client = HelionClient(base_url="http://localhost:8000")
# Check health
health = client.health_check()
print(f"Server status: {health['status']}")
# Simple completion
response = client.complete(
"Explain quantum computing in simple terms:",
temperature=0.7,
max_tokens=500
)
print(f"\nResponse: {response}")
# Chat with conversation
messages = [
{"role": "system", "content": "You are a helpful coding assistant."},
{"role": "user", "content": "Write a Python function to calculate fibonacci numbers"}
]
response = client.chat(messages=messages, temperature=0.3)
print(f"\nCode: {response}")
# Streaming example
print("\nStreaming response:")
for chunk in client.complete("Tell me a short story about AI:", stream=True):
print(chunk, end='', flush=True)
print()
# Using assistant interface
assistant = HelionAssistant()
response = assistant.chat("What is machine learning?")
print(f"\nAssistant: {response}")
# Continue conversation
response = assistant.chat("Can you give me an example?")
print(f"\nAssistant: {response}")


if __name__ == "__main__":
example_usage()