"""
Chat functionality handler for HF-Inferoxy AI Hub.
Handles chat completion requests with streaming responses.
"""

import os
import time
from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeoutError
from huggingface_hub import InferenceClient
from huggingface_hub.errors import HfHubHTTPError
from requests.exceptions import ConnectionError
from hf_token_utils import get_proxy_token, report_token_status
from utils import (
    validate_proxy_key, 
    parse_model_and_provider, 
    format_error_message
)

# Timeout configuration for inference requests
INFERENCE_TIMEOUT = 120  # 2 minutes max for inference


def chat_respond(
    message,
    history: list[dict[str, str]],
    system_message,
    model_name,
    max_tokens,
    temperature,
    top_p,
):
    """
    Chat completion function using HF-Inferoxy token management.
    """
    # Validate proxy API key
    is_valid, error_msg = validate_proxy_key()
    if not is_valid:
        yield error_msg
        return
    
    proxy_api_key = os.getenv("PROXY_KEY")
    
    token_id = None
    try:
        # Get a managed token from the HF-Inferoxy proxy server
        print("πŸ”‘ Chat: Requesting token from proxy...")
        token, token_id = get_proxy_token(api_key=proxy_api_key)
        print(f"βœ… Chat: Got token: {token_id}")
        
        # Parse model name and provider if specified
        model, provider = parse_model_and_provider(model_name)
        
        print(f"πŸ€– Chat: Using model='{model}', provider='{provider if provider else 'auto'}'")
        
        # Prepare messages first
        messages = [{"role": "system", "content": system_message}]
        messages.extend(history)
        messages.append({"role": "user", "content": message})

        print(f"πŸ’¬ Chat: Prepared {len(messages)} messages, creating client...")
        
        # Create client with provider (auto if none specified) and always pass model
        client = InferenceClient(
            provider=provider if provider else "auto", 
            api_key=token
        )
        
        print(f"πŸš€ Chat: Client created, starting inference with timeout...")
        
        chat_completion_kwargs = {
            "model": model,
            "messages": messages,
            "max_tokens": max_tokens,
            "stream": True,
            "temperature": temperature,
            "top_p": top_p,
        }

        response = ""
        
        print(f"πŸ“‘ Chat: Making streaming request with {INFERENCE_TIMEOUT}s timeout...")
        
        # Create streaming function for timeout handling
        def create_stream():
            return client.chat_completion(**chat_completion_kwargs)
        
        # Execute with timeout using ThreadPoolExecutor
        with ThreadPoolExecutor(max_workers=1) as executor:
            future = executor.submit(create_stream)
            
            try:
                # Get the stream with timeout
                stream = future.result(timeout=INFERENCE_TIMEOUT)
                print(f"πŸ”„ Chat: Got stream, starting to iterate...")

                # Track streaming time to detect hangs
                last_token_time = time.time()
                token_timeout = 30  # 30 seconds between tokens
                
                for message in stream:
                    current_time = time.time()
                    
                    # Check if we've been waiting too long for a token
                    if current_time - last_token_time > token_timeout:
                        raise TimeoutError(f"No response received for {token_timeout} seconds during streaming")
                    
                    choices = message.choices
                    token_content = ""
                    if len(choices) and choices[0].delta.content:
                        token_content = choices[0].delta.content
                        last_token_time = current_time  # Reset timer when we get content

                    response += token_content
                    yield response
                    
            except FutureTimeoutError:
                future.cancel()  # Best-effort cancel; has no effect if the task already started running
                raise TimeoutError(f"Chat request timed out after {INFERENCE_TIMEOUT} seconds")
        
        # Report successful token usage
        if token_id:
            report_token_status(token_id, "success", api_key=proxy_api_key)
            
    except ConnectionError as e:
        # Handle proxy connection errors
        error_msg = f"Cannot connect to HF-Inferoxy server: {str(e)}"
        print(f"πŸ”Œ Chat connection error: {error_msg}")
        if token_id:
            report_token_status(token_id, "error", error_msg, api_key=proxy_api_key)
        yield format_error_message("Connection Error", "Unable to connect to the proxy server. Please check if it's running.")
        
    except TimeoutError as e:
        # Handle timeout errors
        error_msg = f"Request timed out: {str(e)}"
        print(f"⏰ Chat timeout: {error_msg}")
        if token_id:
            report_token_status(token_id, "error", error_msg, api_key=proxy_api_key)
        yield format_error_message("Timeout Error", "The request took too long. The server may be overloaded. Please try again.")
        
    except HfHubHTTPError as e:
        # Handle HuggingFace API errors
        error_msg = str(e)
        print(f"πŸ€— Chat HF error: {error_msg}")
        if token_id:
            report_token_status(token_id, "error", error_msg, api_key=proxy_api_key)
        
        # Provide more user-friendly error messages
        if "401" in error_msg:
            yield format_error_message("Authentication Error", "Invalid or expired API token. The proxy will provide a new token on retry.")
        elif "402" in error_msg:
            yield format_error_message("Quota Exceeded", "API quota exceeded. The proxy will try alternative providers.")
        elif "429" in error_msg:
            yield format_error_message("Rate Limited", "Too many requests. Please wait a moment and try again.")
        else:
            yield format_error_message("HuggingFace API Error", error_msg)
        
    except Exception as e:
        # Handle all other errors
        error_msg = str(e)
        print(f"❌ Chat unexpected error: {error_msg}")
        if token_id:
            report_token_status(token_id, "error", error_msg, api_key=proxy_api_key)
        yield format_error_message("Unexpected Error", f"An unexpected error occurred: {error_msg}")

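
# Illustrative usage (assumption, not part of the original app wiring): chat_respond
# is a generator that yields the *cumulative* assistant text, so a caller outside a
# chat UI can simply print each partial result. Requires PROXY_KEY in the environment
# and a reachable HF-Inferoxy server; the model id and prompts below are examples only.
def _demo_chat_respond():
    history = [
        {"role": "user", "content": "Hello!"},
        {"role": "assistant", "content": "Hi there, how can I help?"},
    ]
    for partial in chat_respond(
        message="Summarize our conversation in one sentence.",
        history=history,
        system_message="You are a concise assistant.",
        model_name="meta-llama/Llama-3.1-8B-Instruct",  # example model id (assumption)
        max_tokens=256,
        temperature=0.7,
        top_p=0.95,
    ):
        print(partial)
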

def handle_chat_submit(message, history, system_msg, model_name, max_tokens, temperature, top_p):
    """
    Handle chat submission and manage conversation history with streaming.
    """
    if not message.strip():
        yield history, ""
        return
    
    # Add user message to history
    history = history + [{"role": "user", "content": message}]
    
    # Generate response with streaming
    response_generator = chat_respond(
        message, 
        history[:-1],  # Don't include the current message in history for the function
        system_msg, 
        model_name, 
        max_tokens, 
        temperature, 
        top_p
    )
    
    # Stream the assistant response token by token
    assistant_response = ""
    for partial_response in response_generator:
        assistant_response = partial_response
        # Update history with the current partial response and yield it
        current_history = history + [{"role": "assistant", "content": assistant_response}]
        yield current_history, ""
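

# --- Minimal wiring sketch (assumption) ---
# The original Space presumably builds its own UI elsewhere; this only illustrates how
# handle_chat_submit could be attached to a Gradio Blocks interface. Component labels
# and default values are illustrative, not taken from the original app.
if __name__ == "__main__":
    import gradio as gr

    with gr.Blocks() as demo:
        chatbot = gr.Chatbot(type="messages")  # history as a list of {"role", "content"} dicts
        msg = gr.Textbox(label="Message")
        system_msg = gr.Textbox(value="You are a helpful assistant.", label="System message")
        model_name = gr.Textbox(value="meta-llama/Llama-3.1-8B-Instruct", label="Model")
        max_tokens = gr.Slider(1, 4096, value=512, step=1, label="Max tokens")
        temperature = gr.Slider(0.0, 2.0, value=0.7, label="Temperature")
        top_p = gr.Slider(0.0, 1.0, value=0.95, label="Top-p")

        # handle_chat_submit streams (updated_history, cleared_textbox) pairs
        msg.submit(
            handle_chat_submit,
            inputs=[msg, chatbot, system_msg, model_name, max_tokens, temperature, top_p],
            outputs=[chatbot, msg],
        )

    demo.launch()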