k-mktr committed on
Commit
8b31177
Β·
verified Β·
1 Parent(s): 767be42

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +1181 -0
app.py ADDED
@@ -0,0 +1,1181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
import math
import os
import re
from datetime import datetime, timezone
from io import BytesIO
from pathlib import Path
from typing import Any, Dict, List

import pandas as pd
import streamlit as st
from nc_py_api import Nextcloud, NextcloudException
from openai import OpenAI
13
+
14
# Nextcloud connection settings, read from the environment (None if unset).
NEXTCLOUD_URL = os.getenv("NEXTCLOUD_URL")
NEXTCLOUD_USERNAME = os.getenv("NEXTCLOUD_USERNAME")
NEXTCLOUD_PASSWORD = os.getenv("NEXTCLOUD_PASSWORD")
# Remote path (relative to the Nextcloud root) of the JSON file that stores
# every saved test record.
STATISTICS_FILENAME = "candle_test/candle_test_statistics.json"
18
+
19
def get_utc_timestamp():
    """Return the current UTC time formatted like '2024-01-01 12:00:00 UTC'."""
    now = datetime.now(timezone.utc)
    return now.strftime("%Y-%m-%d %H:%M:%S UTC")
22
+
23
def format_timestamp_for_display(timestamp_str: str) -> str:
    """Return *timestamp_str* ready for display, tagging legacy values.

    Records written before the switch to UTC carry no zone marker; those get
    a "(Local)" suffix so readers know the zone is ambiguous.
    """
    if "UTC" in timestamp_str:
        return timestamp_str
    # Backwards compatibility with old, zone-less data.
    return f"{timestamp_str} (Local)"
29
+
30
def get_nextcloud_client():
    """Build an authenticated Nextcloud client from the module-level settings.

    Returns:
        Nextcloud: client configured with NEXTCLOUD_URL / USERNAME / PASSWORD.
    """
    nc = Nextcloud(
        nextcloud_url=NEXTCLOUD_URL,
        nc_auth_user=NEXTCLOUD_USERNAME,
        nc_auth_pass=NEXTCLOUD_PASSWORD
    )

    # Check if file sharing capabilities are available.
    # NOTE(review): nc_py_api's check_capabilities() appears to return the
    # *missing* capabilities, so a truthy result means sharing is NOT enabled
    # (which matches the warning text) — confirm against the nc_py_api docs.
    if nc.check_capabilities("files_sharing.api_enabled"):
        st.warning("Warning: File sharing API is not enabled on the Nextcloud server")

    return nc
43
+
44
def ensure_directory_exists():
    """Create the remote "candle_test" folder in Nextcloud if it is missing.

    Failures are reported via st.error rather than raised, so callers
    proceed regardless of the outcome (best-effort).
    """
    try:
        nc = get_nextcloud_client()
        # Probe the directory: a 404 from listdir means it does not exist yet.
        try:
            nc.files.listdir("candle_test")
        except NextcloudException as e:
            if "404" in str(e):
                # Create directory if it doesn't exist
                nc.files.mkdir("candle_test")
    except Exception as e:
        st.error(f"Failed to ensure directory exists: {str(e)}")
57
+
58
def save_statistics_to_nextcloud(stats):
    """Upload the full statistics list as pretty-printed JSON to Nextcloud.

    Args:
        stats: JSON-serializable list of test records (the whole file is
            overwritten, not appended to).

    Returns:
        bool: True on success, False on any failure (shown via st.error).
    """
    try:
        nc = get_nextcloud_client()
        # Ensure directory exists
        ensure_directory_exists()

        # Convert statistics to JSON and then to bytes
        json_data = json.dumps(stats, indent=2)
        buf = BytesIO(json_data.encode('utf-8'))
        buf.seek(0)  # Reset buffer pointer to start

        # Upload using stream for better performance
        nc.files.upload_stream(STATISTICS_FILENAME, buf)
        return True
    except NextcloudException as e:
        st.error(f"Nextcloud error while saving statistics: {str(e)}")
        return False
    except Exception as e:
        st.error(f"Failed to save statistics to Nextcloud: {str(e)}")
        return False
79
+
80
def load_statistics_from_nextcloud():
    """Download and parse the statistics JSON file from Nextcloud.

    On first run (file absent, server returns 404) an empty statistics file
    is created so later loads and saves succeed.

    Returns:
        list: the stored records; [] on first run or on any error
        (errors are surfaced via st.error).
    """
    try:
        nc = get_nextcloud_client()
        # Ensure directory exists
        ensure_directory_exists()

        # Create buffer for streaming download
        buf = BytesIO()

        try:
            # Try to download the file using streaming
            nc.files.download2stream(STATISTICS_FILENAME, buf)
            buf.seek(0)  # Reset buffer pointer to start
            return json.loads(buf.getvalue().decode('utf-8'))
        except NextcloudException as e:
            if "404" in str(e):  # File doesn't exist yet
                # Initialize empty statistics file
                empty_stats = []
                save_statistics_to_nextcloud(empty_stats)
                return empty_stats
            raise  # Re-raise if it's a different error

    except NextcloudException as e:
        st.error(f"Nextcloud error while loading statistics: {str(e)}")
        return []
    except Exception as e:
        st.error(f"Failed to load statistics from Nextcloud: {str(e)}")
        return []
109
+
110
def check_statistics_exists():
    """Return True if the statistics file is present in Nextcloud.

    Any failure (auth, network, missing server) is treated as "absent" and
    yields False rather than raising.
    """
    try:
        nc = get_nextcloud_client()
        # Ensure directory exists
        ensure_directory_exists()
        # Use find to check if file exists (search by exact filename)
        result = nc.files.find(["eq", "name", "candle_test_statistics.json"])
        return len(result) > 0
    except Exception:
        return False
121
+
122
def save_results(result):
    """Append one test record to the cloud statistics file, deduplicated.

    A test_id is derived from timestamp + model + temperature; a record is a
    duplicate if either the id matches or the model/temperature/timestamp
    triple matches an existing record.

    Args:
        result: dict with at least "model", "temperature", "mode",
            "responses", "evaluation"; optional "max_tokens" and "notes".

    Returns:
        bool: True when a new record was uploaded; False on duplicate or error.
    """
    # Generate a unique identifier for this test
    model_name = result["model"].replace("/", "_")
    temp = f"{result['temperature']:.1f}"
    timestamp = get_utc_timestamp()
    test_id = f"{timestamp.replace(' ', '_').replace(':', '-')}_{model_name}_temp{temp}"

    try:
        # Load existing data from cloud
        nc = get_nextcloud_client()
        buf = BytesIO()

        try:
            # Try to download existing file
            nc.files.download2stream(STATISTICS_FILENAME, buf)
            buf.seek(0)
            stats = json.loads(buf.getvalue().decode('utf-8'))
        except NextcloudException as e:
            if "404" in str(e):  # File doesn't exist yet
                stats = []
            else:
                raise

        # Check if this test already exists (by id, or by identical
        # model/temperature/timestamp for records saved before test_id existed)
        is_duplicate = any(
            s.get("test_id") == test_id or (
                s["model"] == result["model"] and
                s["temperature"] == result["temperature"] and
                s["timestamp"] == timestamp
            )
            for s in stats
        )

        if not is_duplicate:
            # Store only essential data
            essential_result = {
                "test_id": test_id,
                "timestamp": timestamp,
                "model": result["model"],
                "temperature": result["temperature"],
                "max_tokens": result.get("max_tokens", 1024),  # Include max_tokens if available
                "mode": result["mode"],
                "responses": result["responses"],
                "evaluation": result["evaluation"],
                "notes": result.get("notes", "")
            }

            # Append new result to existing data
            stats.append(essential_result)

            # Convert updated statistics to JSON and then to bytes
            json_data = json.dumps(stats, indent=2)
            upload_buf = BytesIO(json_data.encode('utf-8'))
            upload_buf.seek(0)

            # Ensure directory exists before upload
            ensure_directory_exists()

            # Upload using stream for better performance
            nc.files.upload_stream(STATISTICS_FILENAME, upload_buf)
            st.session_state.last_cloud_sync = datetime.now(timezone.utc)
            return True

        return False

    except Exception as e:
        st.error(f"Failed to save results: {str(e)}")
        return False
191
+
192
def load_results():
    """Return every saved test record (delegates to the cloud loader)."""
    all_records = load_statistics_from_nextcloud()
    return all_records
195
+
196
def load_statistics():
    """Load all test statistics from the cloud file.

    On success the sync time is recorded in st.session_state.last_cloud_sync.

    Returns:
        list: stored records; [] if the file doesn't exist yet or on error.
    """
    try:
        nc = get_nextcloud_client()
        buf = BytesIO()

        try:
            # Try to download the file using streaming
            nc.files.download2stream(STATISTICS_FILENAME, buf)
            buf.seek(0)  # Reset buffer pointer to start
            stats = json.loads(buf.getvalue().decode('utf-8'))
            st.session_state.last_cloud_sync = datetime.now(timezone.utc)
            return stats
        except NextcloudException as e:
            if "404" in str(e):  # File doesn't exist yet
                return []
            raise  # Re-raise if it's a different error

    except Exception as e:
        st.error(f"Failed to load statistics: {str(e)}")
        return []
217
+
218
def ensure_directories():
    """Deprecated no-op, retained so legacy call sites keep working."""
    return None
221
+
222
def get_result_files():
    """Collect local result files per format, each list sorted by path."""
    locations = {
        'json': ('results/json', '*.json'),
        'markdown': ('results/markdown', '*.md'),
    }
    return {
        fmt: sorted(Path(folder).glob(pattern))
        for fmt, (folder, pattern) in locations.items()
    }
228
+
229
def clean_old_results():
    """Deduplicate stored statistics and delete orphaned local result files.

    Records without a test_id get one synthesized from their
    timestamp/model/temperature; only the first record per test_id is kept,
    and local files whose stem matches no kept id are removed.
    """
    stats = load_statistics()

    # Keep track of unique test IDs
    seen_tests = set()
    unique_stats = []

    for stat in stats:
        test_id = stat.get("test_id")
        if not test_id:
            # Generate test_id for old entries (same scheme as save_results)
            timestamp = stat["timestamp"].replace(" ", "_").replace(":", "-")
            model_name = stat["model"].replace("/", "_")
            temp = f"{stat['temperature']:.1f}"
            test_id = f"{timestamp}_{model_name}_temp{temp}"
            stat["test_id"] = test_id

        if test_id not in seen_tests:
            seen_tests.add(test_id)
            unique_stats.append(stat)

    # Save unique stats back
    save_statistics_to_nextcloud(unique_stats)

    # Clean up local files that no longer correspond to a kept test id
    for file_path in Path("results/json").glob("*.json"):
        if file_path.stem not in seen_tests:
            file_path.unlink()

    for file_path in Path("results/markdown").glob("*.md"):
        if file_path.stem not in seen_tests:
            file_path.unlink()
262
+
263
def format_response(response, mode):
    """Format a model response for Streamlit display.

    Args:
        response: raw response text (expected to be JSON when mode == "json").
        mode: "json" to pretty-print inside a fenced json block; any other
            mode returns the text unchanged.

    Returns:
        str: markdown-ready text.
    """
    if mode != "json":
        return response  # Natural-language responses are shown as-is.
    try:
        parsed = json.loads(response)
    except (json.JSONDecodeError, TypeError):
        # Invalid JSON still gets a code fence so the raw output stays readable.
        # (Was a bare `except:`, which also swallowed KeyboardInterrupt etc.)
        return f"```\n{response}\n```"
    return f"```json\n{json.dumps(parsed, indent=2)}\n```"
273
+
274
def create_markdown_output(result):
    """Generate the per-test markdown report from a result dict.

    Args:
        result: dict with "model", "timestamp", "responses" (list of 3),
            "mode", and optional "evaluation".

    Returns:
        str: markdown for st.markdown display.
    """
    model = result["model"]
    timestamp = result["timestamp"]
    responses = result["responses"]
    mode = result["mode"]
    evaluation = result.get("evaluation", "")

    # Template lines are literal markdown output — they stay flush left.
    markdown = f"""# πŸ” Candle Test Results

## Model Information
- **Model**: `{model}`
- **Test Date**: {timestamp}
- **Mode**: {mode}
{f"- **Evaluation**: {evaluation}" if evaluation else ""}

## Test Sequence and Responses

### Question 1: Initial Understanding
> πŸ’­ Are candles getting taller or shorter when they burn?

**Model's Response**:
{format_response(responses[0], mode)}

### Question 2: Confidence Check
> πŸ’­ Are you sure? Will you be able to recognize this fact in different circumstances?

**Model's Response**:
{format_response(responses[1], mode)}

### Question 3: The Riddle
> πŸ’­ Now, consider what you said above and solve the following riddle: I'm tall when I'm young, and I'm taller when I'm old. What am I?

**Model's Response**:
{format_response(responses[2], mode)}
"""
    return markdown
311
+
312
def evaluate_candle_response(response: str) -> Dict[str, Any]:
    """
    Evaluate a response to determine if it correctly states that candles get shorter.

    Args:
        response: the model's free-text answer to the candle question.

    Returns:
        Dict with:
            is_correct: True when shorter-keywords appear and no taller-keywords do.
            has_correct_keywords / has_incorrect_keywords: individual flags.
            found_correct_keywords / found_incorrect_keywords: matched keyword lists.
    """
    # NOTE: annotation fixed from `Dict[str, any]` — `any` is the builtin
    # function, not a type; `typing.Any` is the intended annotation.
    response_lower = response.lower()

    # Keywords indicating correct understanding (candles shrink as they burn)
    shorter_keywords = ['shorter', 'decrease', 'shrink', 'smaller', 'reduce', 'burn down', 'melt away']
    incorrect_keywords = ['taller', 'higher', 'grow', 'increase', 'bigger']

    # Collect the matches once; the booleans are derived from the lists.
    found_correct = [k for k in shorter_keywords if k in response_lower]
    found_incorrect = [k for k in incorrect_keywords if k in response_lower]

    return {
        'is_correct': bool(found_correct) and not found_incorrect,
        'has_correct_keywords': bool(found_correct),
        'has_incorrect_keywords': bool(found_incorrect),
        'found_correct_keywords': found_correct,
        'found_incorrect_keywords': found_incorrect
    }
336
+
337
def evaluate_riddle_response(response: str) -> Dict[str, Any]:
    """
    Evaluate the riddle response: the model fails if it answers "candle"
    despite having just confirmed that candles get shorter.

    Args:
        response: free-text answer to "I'm tall when I'm young..." riddle.

    Returns:
        Dict with:
            is_correct: True when no candle-related term appears.
            has_candle_reference: candle/wax/wick/flame mentioned.
            found_correct_answer: a known-good answer word appears anywhere.
            proposed_answer: answer extracted from "the answer is X"-style
                phrasing (None when not clearly stated).
            matches_known_answer: proposed_answer is in the known-good list.
    """
    # NOTE: annotation fixed from `Dict[str, any]` to `Dict[str, Any]`.
    response_lower = response.lower()

    # Common correct answers
    correct_answers = [
        'shadow', 'tree', 'plant', 'bamboo', 'person', 'human', 'child',
        'building', 'tower', 'skyscraper'
    ]

    # Check for candle-related answers (word-boundary only for "candle(s)")
    candle_patterns = [
        r'\bcandle[s]?\b',
        r'wax',
        r'wick',
        r'flame'
    ]

    has_candle_reference = any(re.search(pattern, response_lower) for pattern in candle_patterns)
    found_correct_answer = any(answer in response_lower for answer in correct_answers)

    # Extract what the model thinks is the answer
    answer_patterns = [
        r"(?:the answer is|it's|is) (?:a |an )?([a-z]+)",
        r"(?:a |an )?([a-z]+) (?:would be|is) the answer"
    ]

    proposed_answer = None
    for pattern in answer_patterns:
        match = re.search(pattern, response_lower)
        if match:
            proposed_answer = match.group(1)
            break

    return {
        'is_correct': not has_candle_reference,
        'has_candle_reference': has_candle_reference,
        'found_correct_answer': found_correct_answer,
        'proposed_answer': proposed_answer,
        'matches_known_answer': proposed_answer in correct_answers if proposed_answer else False
    }
383
+
384
def evaluate_natural_language_test(responses: List[str]) -> Dict[str, Any]:
    """
    Evaluate the complete natural language test sequence.

    Args:
        responses: the three answers in question order; responses[0] is graded
            for the candle fact and responses[2] for the riddle (the
            confidence-check answer at responses[1] is not graded).

    Returns:
        Dict with both sub-evaluations, an overall_score in [0, 1] and a
        passed_test flag.
    """
    # NOTE: annotation fixed from `Dict[str, any]` to `Dict[str, Any]`.
    candle_eval = evaluate_candle_response(responses[0])
    riddle_eval = evaluate_riddle_response(responses[2])

    return {
        'initial_understanding': candle_eval,
        'riddle_response': riddle_eval,
        # Score averages the two graded checks (bools sum as 0/1).
        'overall_score': sum([
            candle_eval['is_correct'],
            not riddle_eval['has_candle_reference']
        ]) / 2.0,
        'passed_test': candle_eval['is_correct'] and not riddle_eval['has_candle_reference']
    }
400
+
401
def evaluate_json_test(responses: List[Dict]) -> Dict[str, Any]:
    """
    Evaluate the complete JSON test sequence.

    Each response must be (or parse as) a dict of the form:
        {"reasoning": "step-by-step reasoning", "answer": "concise answer"}

    Returns:
        Dict with per-question evaluations, reasoning-quality stats, an
        overall_score in [0, 1] and a passed_test flag. On malformed input,
        a dict with 'error', passed_test=False and overall_score=0.0.
    """
    # NOTE: annotation fixed from `Dict[str, any]` to `Dict[str, Any]`.
    try:
        # Parse each response and validate format
        parsed_responses = []
        for resp in responses:
            if isinstance(resp, str):
                resp = json.loads(resp)
            if not isinstance(resp, dict) or 'reasoning' not in resp or 'answer' not in resp:
                raise ValueError(f"Invalid response format: {resp}")
            parsed_responses.append(resp)

        # Evaluate initial understanding (first question)
        candle_eval = evaluate_candle_response(parsed_responses[0]['answer'])

        # Evaluate riddle response (third question)
        riddle_eval = evaluate_riddle_response(parsed_responses[2]['answer'])

        # Evaluate reasoning quality: presence and rough word count.
        reasoning_quality = []
        for resp in parsed_responses:
            reasoning = resp['reasoning'].lower()
            reasoning_quality.append({
                'has_reasoning': bool(reasoning.strip()),
                'reasoning_length': len(reasoning.split()),
                'is_detailed': len(reasoning.split()) > 10
            })

        return {
            'initial_understanding': candle_eval,
            'riddle_response': riddle_eval,
            'reasoning_quality': reasoning_quality,
            # Three checks averaged: fact correct, riddle candle-free,
            # reasoning present in every answer.
            'overall_score': sum([
                candle_eval['is_correct'],
                not riddle_eval['has_candle_reference'],
                all(rq['has_reasoning'] for rq in reasoning_quality)
            ]) / 3.0,
            'passed_test': candle_eval['is_correct'] and not riddle_eval['has_candle_reference']
        }
    except (json.JSONDecodeError, KeyError, ValueError) as e:
        return {
            'error': f"Failed to evaluate response: {str(e)}",
            'passed_test': False,
            'overall_score': 0.0
        }
453
+
454
def create_markdown_report(model_name: str, responses: List[str], json_responses: List[Dict]) -> str:
    """Create a markdown report of the test results with enhanced formatting.

    Args:
        model_name: model identifier shown in the report header.
        responses: three natural-language answers, in question order.
        json_responses: three JSON-mode answers, in question order.

    Returns:
        str: the full markdown report.

    Note:
        Assumes both evaluations succeeded; a failed JSON evaluation (a dict
        containing only 'error') would still raise KeyError below.

    Fix: the reasoning-quality table previously read a nonexistent
    'confidence_level' key (evaluate_json_test produces 'is_detailed'),
    which raised KeyError on every call; the table now reports 'is_detailed'.
    """
    # Evaluate both test versions
    nl_evaluation = evaluate_natural_language_test(responses)
    json_evaluation = evaluate_json_test(json_responses)

    report = f"""# πŸ” Candle Test Results

## Model Information
- **Model**: `{model_name}`
- **Test Date**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

## Test Sequence and Responses

### Natural Language Test
#### Question 1: Initial Understanding
> πŸ’­ Are candles getting taller or shorter when they burn?

**Model's Response**:
```
{responses[0]}
```

**Evaluation**:
- Correct Understanding: {'βœ…' if nl_evaluation['initial_understanding']['is_correct'] else '❌'}
- Found Keywords: {', '.join(nl_evaluation['initial_understanding']['found_correct_keywords'])}
{f"- Incorrect Keywords Found: {', '.join(nl_evaluation['initial_understanding']['found_incorrect_keywords'])}" if nl_evaluation['initial_understanding']['found_incorrect_keywords'] else ''}

#### Question 2: Confidence Check
> πŸ’­ Are you sure? Will you be able to recognize this fact in different circumstances?

**Model's Response**:
```
{responses[1]}
```

#### Question 3: The Riddle
> πŸ’­ Now, consider what you said above and solve the following riddle: I'm tall when I'm young, and I'm taller when I'm old. What am I?

**Model's Response**:
```
{responses[2]}
```

**Evaluation**:
- Avoided Candle Reference: {'βœ…' if not nl_evaluation['riddle_response']['has_candle_reference'] else '❌'}
- Proposed Answer: {nl_evaluation['riddle_response']['proposed_answer'] or 'Not clearly stated'}
- Matches Known Good Answer: {'βœ…' if nl_evaluation['riddle_response']['matches_known_answer'] else '❌'}

### Structured JSON Test
#### Question 1: Initial Understanding
**Model's Response**:
```json
{json_responses[0]}
```

**Evaluation**:
- Correct Understanding: {'βœ…' if json_evaluation['initial_understanding']['is_correct'] else '❌'}
- Found Keywords: {', '.join(json_evaluation['initial_understanding']['found_correct_keywords'])}
{f"- Incorrect Keywords Found: {', '.join(json_evaluation['initial_understanding']['found_incorrect_keywords'])}" if json_evaluation['initial_understanding']['found_incorrect_keywords'] else ''}

#### Question 2: Confidence Check
**Model's Response**:
```json
{json_responses[1]}
```

#### Question 3: The Riddle
**Model's Response**:
```json
{json_responses[2]}
```

**Evaluation**:
- Avoided Candle Reference: {'βœ…' if not json_evaluation['riddle_response']['has_candle_reference'] else '❌'}
- Proposed Answer: {json_evaluation['riddle_response']['proposed_answer'] or 'Not clearly stated'}
- Matches Known Good Answer: {'βœ…' if json_evaluation['riddle_response']['matches_known_answer'] else '❌'}

## Analysis

### Test Scores
| Test Version | Overall Score | Passed Test |
|--------------|--------------|-------------|
| Natural Language | {nl_evaluation['overall_score']:.2f} | {'βœ…' if nl_evaluation['passed_test'] else '❌'} |
| JSON Format | {json_evaluation['overall_score']:.2f} | {'βœ…' if json_evaluation['passed_test'] else '❌'} |

### Reasoning Quality (JSON Format)
| Question | Has Reasoning | Words | Detailed |
|----------|--------------|-------|----------|
| Q1 | {'βœ…' if json_evaluation['reasoning_quality'][0]['has_reasoning'] else '❌'} | {json_evaluation['reasoning_quality'][0]['reasoning_length']} | {'βœ…' if json_evaluation['reasoning_quality'][0]['is_detailed'] else '❌'} |
| Q2 | {'βœ…' if json_evaluation['reasoning_quality'][1]['has_reasoning'] else '❌'} | {json_evaluation['reasoning_quality'][1]['reasoning_length']} | {'βœ…' if json_evaluation['reasoning_quality'][1]['is_detailed'] else '❌'} |
| Q3 | {'βœ…' if json_evaluation['reasoning_quality'][2]['has_reasoning'] else '❌'} | {json_evaluation['reasoning_quality'][2]['reasoning_length']} | {'βœ…' if json_evaluation['reasoning_quality'][2]['is_detailed'] else '❌'} |

---
*This test evaluates the model's ability to maintain context and avoid overfitting in a simple reasoning task.*
"""
    return report
551
+
552
def run_candle_test(client, model, mode="natural", temperature=0.7, max_tokens=1024):
    """Run the three-question candle test against one model.

    Args:
        client: an OpenAI-compatible client instance.
        model: model identifier to query.
        mode: "json" to request structured output via OpenAI's JSON mode;
            anything else runs in natural-language mode.
        temperature: sampling temperature passed through to the API.
        max_tokens: per-completion token cap passed through to the API.

    Returns:
        List[str]: the three raw assistant responses, in question order.

    Fix: removed a dead `result` dict that was built after the loop and then
    discarded (the function returns the responses list, and callers assemble
    their own record); also dropped the redundant
    `response_format if mode == "json" else None` — response_format is
    already None outside JSON mode.
    """
    questions = [
        "Are candles getting taller or shorter when they burn?",
        "Are you sure? Will you be able to recognize this fact in different circumstances?",
        "Now, consider what you said above and solve the following riddle: I'm tall when I'm young, and I'm taller when I'm old. What am I?"
    ]

    if mode == "json":
        messages = [
            {
                "role": "system",
                "content": """You are a helpful assistant that provides answers in a specific JSON format.
Your responses must strictly follow this template:
{
"reasoning": "Your step-by-step reasoning about the answer",
"answer": "Your concise final answer"
}
Always provide both the reasoning and answer fields. Keep the reasoning clear and focused, and the answer concise."""
            }
        ]
        # Use OpenAI's native JSON mode
        response_format = {"type": "json_object"}
    else:
        messages = [
            {
                "role": "system",
                "content": "You are a helpful assistant that answers questions directly and concisely."
            }
        ]
        response_format = None

    responses = []

    # Ask each question in turn, feeding prior answers back as context so the
    # riddle (question 3) is answered with the candle fact in the transcript.
    for question in questions:
        messages.append({"role": "user", "content": question})

        completion = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature,
            max_tokens=max_tokens,
            response_format=response_format
        )

        response = completion.choices[0].message.content
        responses.append(response)
        messages.append({"role": "assistant", "content": response})

    return responses
613
+
614
def evaluate_json_response(responses):
    """Automatically grade a JSON-mode run from its third (riddle) response.

    Returns a human-readable verdict string: PASSED when the riddle answer
    avoids "candle", UNCLEAR when it mixes "candle" with a known-good answer,
    FAILED otherwise (including missing/empty answer or unparseable JSON).
    """
    try:
        final_response = json.loads(responses[2])

        # An absent or blank "answer" field fails immediately.
        answer_field = final_response.get('answer', '')
        if 'answer' not in final_response or not answer_field.strip():
            return "❌ FAILED - Missing or empty answer in JSON response"

        answer_text = answer_field.lower()
        reasoning_text = final_response.get('reasoning', '').lower()

        # Known-good riddle answers; mixing one with "candle" is ambiguous.
        valid_answers = ['human', 'tree', 'shadow', 'plant', 'bamboo', 'person', 'child', 'building', 'tower', 'skyscraper']
        has_candle = 'candle' in answer_text
        has_valid_answer = any(candidate in answer_text for candidate in valid_answers)

        if has_candle:
            if has_valid_answer:
                return "⚠️ UNCLEAR - Mixed response with both candle and valid answer"
            return "❌ FAILED - Mentioned candle in riddle answer"

        return "βœ… PASSED - Avoided mentioning candle in riddle"
    except json.JSONDecodeError:
        return "❌ FAILED - Invalid JSON format"
    except Exception as e:
        return f"❌ FAILED - Error processing response: {str(e)}"
645
+
646
def setup_sidebar():
    """Render all sidebar controls and, on demand, run the test batch.

    Writes the user's choices into st.session_state (api_base, api_key,
    models, temperature, max_tokens, mode). When "Run Test" is pressed it
    queries every queued model, collects per-model success/error records in
    st.session_state.responses, switches test_state to 'testing' and reruns.
    """
    st.sidebar.header("Configuration")

    # API settings with help tooltips
    st.sidebar.subheader("API Settings")
    st.session_state.api_base = st.sidebar.text_input(
        "API Base URL",
        value="https://openrouter.ai/api/v1",
        help="The base URL for your API endpoint. Supports any OpenAI-compatible API.",
        key="api_base_input"
    )
    st.session_state.api_key = st.sidebar.text_input(
        "API Key",
        type="password",
        help="Your API key for authentication. Keep this secure!",
        key="api_key_input"
    )

    # Model settings
    st.sidebar.subheader("Model Settings")
    new_models = st.sidebar.text_area(
        "Models to Test",
        placeholder="Enter models (one per line)",
        help="Enter model identifiers, one per line. Supports any OpenAI-compatible model identifier.",
        key="models_input"
    )

    # Update models list when input changes (one identifier per non-blank line)
    if st.sidebar.button("Update Models List", help="Click to update the list of models to test", key="update_models_btn"):
        if new_models:
            models_list = [model.strip() for model in new_models.split('\n') if model.strip()]
            st.session_state.models = models_list
            st.rerun()

    # Display current models list
    if st.session_state.models:
        with st.sidebar.expander("πŸ“‹ Current Models Queue", expanded=True):
            st.write(f"**{len(st.session_state.models)} models in queue:**")
            for i, model in enumerate(st.session_state.models, 1):
                st.code(f"{i}. {model}", language=None)

        if st.sidebar.button("Clear Queue", help="Remove all models from the queue", key="clear_queue_btn"):
            st.session_state.models = []
            st.rerun()

    # Model generation settings
    st.sidebar.subheader("Generation Settings")

    # Temperature settings
    st.session_state.temperature = st.sidebar.slider(
        "Temperature",
        min_value=0.0,
        max_value=2.0,
        value=0.7,
        step=0.1,
        help="Controls randomness in responses. Lower values are more deterministic, higher values more creative.",
        key="temperature_slider"
    )

    # Max tokens settings
    st.session_state.max_tokens = st.sidebar.slider(
        "Max Tokens",
        min_value=256,
        max_value=4096,
        value=1024,
        step=256,
        help="Maximum number of tokens to generate in the response. Higher values allow longer responses but may take more time.",
        key="max_tokens_slider"
    )

    # Test mode with explanation
    st.session_state.mode = st.sidebar.radio(
        "Response Format",
        ["natural", "json"],
        help=("Choose how the model should respond:\n"
              "- Natural: Free-form text responses\n"
              "- JSON: Structured responses with reasoning and confidence"),
        key="mode_radio"
    )

    # Add separator before the run button
    st.sidebar.markdown("---")

    # Run Test button (only offered while no test is in progress)
    if st.session_state.test_state == 'ready':
        if not st.session_state.models:
            st.sidebar.warning("Add at least one model to test")
        else:
            test_button_label = f"πŸš€ Run Test on {len(st.session_state.models)} Model{'s' if len(st.session_state.models) > 1 else ''}"
            if st.sidebar.button(
                test_button_label,
                use_container_width=True,
                help=f"Start testing {len(st.session_state.models)} selected models",
                key="run_test_btn"
            ):
                if not st.session_state.api_key:
                    st.error("Please enter an API key in the sidebar")
                    return

                try:
                    client = OpenAI(
                        base_url=st.session_state.api_base,
                        api_key=st.session_state.api_key
                    )

                    # Run tests for all selected models
                    all_responses = []
                    total_models = len(st.session_state.models)

                    # Create a progress container
                    progress_container = st.empty()
                    progress_bar = st.progress(0)

                    for i, model in enumerate(st.session_state.models):
                        # Update progress
                        progress = (i + 1) / total_models
                        progress_bar.progress(progress)
                        progress_container.text(f"Testing model {i + 1}/{total_models}: {model}")

                        try:
                            with st.spinner(f"Running test for {model}..."):
                                responses = run_candle_test(
                                    client,
                                    model,
                                    mode=st.session_state.mode,
                                    temperature=st.session_state.temperature,
                                    max_tokens=st.session_state.max_tokens
                                )

                                # Create complete response object with all required fields
                                all_responses.append({
                                    'model': model,
                                    'responses': responses,
                                    'timestamp': get_utc_timestamp(),
                                    'temperature': st.session_state.temperature,
                                    'max_tokens': st.session_state.max_tokens,
                                    'mode': st.session_state.mode,
                                    'status': 'success'
                                })
                        except Exception as model_error:
                            # Handle individual model failures: record the error
                            # and keep testing the remaining queued models.
                            all_responses.append({
                                'model': model,
                                'timestamp': get_utc_timestamp(),
                                'temperature': st.session_state.temperature,
                                'max_tokens': st.session_state.max_tokens,
                                'mode': st.session_state.mode,
                                'status': 'error',
                                'error': str(model_error)
                            })
                            st.warning(f"Failed to test {model}: {str(model_error)}")

                    # Clear progress indicators
                    progress_container.empty()
                    progress_bar.empty()

                    st.session_state.responses = all_responses
                    st.session_state.test_state = 'testing'
                    st.rerun()

                except Exception as e:
                    st.error(f"Error: {str(e)}")
                    return
810
+
811
def test_tab():
    """Content for the Test tab.

    Renders one of three views driven by st.session_state.test_state:
      - 'testing':   per-model results with manual (natural mode) or automatic
                     (JSON mode) evaluation, plus a save-all button that
                     persists every evaluation and moves to 'evaluated'.
      - 'evaluated': confirmation screen with "run again" / "view comparison"
                     actions.
      - otherwise:   the static explanation page for the Candle Test.
    """
    st.title("πŸ•―οΈ The Candle Test")

    if st.session_state.test_state == 'testing':
        # Display results for all tested models; evaluations collected here
        # are only written to storage when the save button below is pressed.
        evaluations = []

        for response in st.session_state.responses:
            with st.expander(f"Results for {response['model']}", expanded=True):
                if response['status'] == 'success':
                    # Create markdown output using the response data
                    markdown = create_markdown_output(response)
                    st.markdown(markdown)

                    # Automatic evaluation for JSON mode
                    if st.session_state.mode == "json":
                        evaluation = evaluate_json_response(response['responses'])
                        st.info("πŸ€– Automatic Evaluation (JSON mode)")
                        st.write(evaluation)
                        notes = "Automatically evaluated in JSON mode"
                    else:
                        # Manual evaluation for natural language mode.
                        # Widget keys embed the model name so each model keeps
                        # its own radio/notes state across Streamlit reruns.
                        st.subheader("πŸ“ Evaluate Results")
                        evaluation = st.radio(
                            f"How did {response['model']} perform?",
                            ["βœ… PASSED - Avoided mentioning candle in riddle",
                             "❌ FAILED - Mentioned candle in riddle",
                             "⚠️ UNCLEAR - Needs discussion"],
                            key=f"eval_{response['model']}"
                        )
                        notes = st.text_area(
                            "Additional Notes (optional)",
                            "",
                            key=f"notes_{response['model']}"
                        )

                    # Collect evaluation data
                    evaluations.append({
                        "timestamp": get_utc_timestamp(),
                        "model": response['model'],
                        "temperature": st.session_state.temperature,
                        "max_tokens": st.session_state.max_tokens,
                        "mode": st.session_state.mode,
                        "responses": response['responses'],
                        "evaluation": evaluation,
                        "notes": notes
                    })
                else:
                    # Failed runs are shown but produce no evaluation entry.
                    st.error(f"Test failed: {response['error']}")

        # Add a "Save All" button at the bottom
        button_text = "βœ… Save Results" if st.session_state.mode == "json" else "βœ… Complete Evaluation"
        if st.button(button_text, use_container_width=True):
            # Save all evaluations at once
            for result in evaluations:
                save_results(result)
            st.session_state.test_state = 'evaluated'
            st.rerun()

    elif st.session_state.test_state == 'evaluated':
        st.success("βœ… Test results have been saved!")

        # Create two equal columns for the buttons
        col1, col2 = st.columns(2)

        # Style the buttons with custom CSS
        st.markdown("""
        <style>
        .stButton>button {
            width: 100%;
            height: 3em;
            font-size: 1.2em;
            border-radius: 10px;
            margin: 0.5em 0;
        }
        </style>
        """, unsafe_allow_html=True)

        with col1:
            if st.button("πŸ”„ Run New Test", use_container_width=True):
                st.session_state.test_state = 'ready'
                st.session_state.responses = None
                st.session_state.current_markdown = None
                st.rerun()

        with col2:
            if st.button("πŸ“Š View Comparison", use_container_width=True):
                # Injected JS clicks the second Streamlit tab in the parent
                # frame.  NOTE(review): this relies on the private
                # data-baseweb="tab" markup and the tab order in main() —
                # fragile across Streamlit versions; confirm after upgrades.
                js = f"""
                <script>
                // Get all tabs
                var tabs = window.parent.document.querySelectorAll('[data-baseweb="tab"]');
                // Click the second tab (index 1) for Results Comparison
                tabs[1].click();
                </script>
                """
                st.components.v1.html(js)

    else:
        # Show explanation and image only when no test is running or completed
        # Display the cover image
        st.image("https://i.redd.it/6phgn27rqfse1.jpeg", caption="The Candle Test")

        st.markdown("""
        ## About The Candle Test

        The Candle Test is a simple yet effective way to evaluate an LLM's ability to maintain context and avoid overfitting.
        It was originally proposed by [u/Everlier on Reddit](https://www.reddit.com/r/LocalLLaMA/comments/1jpr1nk/the_candle_test_most_llms_fail_to_generalise_at/).

        This implementation supports any OpenAI-compatible endpoint, allowing you to test models from various providers including:
        - OpenAI
        - Anthropic
        - OpenRouter
        - Local models (through compatible APIs)
        - And more!

        ### What is it testing?
        The test evaluates whether a language model can:
        1. πŸ€” Understand a basic fact (candles get shorter as they burn)
        2. 🧠 Hold this fact in context
        3. 🎯 Avoid overfitting when presented with a riddle that seems to match the context

        ### Why is it important?
        This test reveals how well models can:
        - Maintain contextual understanding
        - Avoid falling into obvious pattern-matching traps
        - Apply knowledge flexibly in different scenarios

        ### The Test Sequence
        1. First, we ask if candles get taller or shorter when burning
        2. Then, we confirm the model's understanding
        3. Finally, we present a riddle: "I'm tall when I'm young, and I'm taller when I'm old. What am I?"

        A model that mentions "candle" in the riddle's answer demonstrates a failure to generalize and a tendency to overfit to the immediate context.

        ### Credit
        This test was created by [u/Everlier](https://www.reddit.com/user/Everlier/). You can find the original discussion [here](https://www.reddit.com/r/LocalLLaMA/comments/1jpr1nk/the_candle_test_most_llms_fail_to_generalise_at/).
        """)
949
+
950
def main():
    """Application entry point.

    Configures the Streamlit page, seeds every session-state key with its
    default exactly once (so user state survives reruns), renders the shared
    sidebar, and dispatches to the three tab views.
    """
    # Set wide mode
    st.set_page_config(
        page_title="The Candle Test",
        page_icon="πŸ•―οΈ",
        layout="wide"
    )

    # Initialize all session states: a key is only written when absent, so
    # values the user has already changed are never clobbered on rerun.
    session_defaults = {
        'models': [],
        'test_state': 'ready',
        'responses': None,
        'current_markdown': None,
        'api_base': "https://openrouter.ai/api/v1",
        'api_key': None,
        'temperature': 0.7,
        'max_tokens': 1024,
        'mode': "natural",
        'selected_tab': 0,
        'last_cloud_sync': None,
    }
    for state_key, default_value in session_defaults.items():
        if state_key not in st.session_state:
            st.session_state[state_key] = default_value

    # Setup sidebar (consistent across all tabs)
    setup_sidebar()

    # Create tabs and render each tab's content inside its container.
    run_tab, comparison_tab, browser_tab = st.tabs(
        ["πŸ§ͺ Run Test", "πŸ“Š Results Comparison", "πŸ“š Results Browser"]
    )

    with run_tab:
        test_tab()
    with comparison_tab:
        results_tab()
    with browser_tab:
        results_browser_tab()
995
+
996
def results_browser_tab():
    """Content for the Results Browser tab.

    Loads every saved result, offers a bulk JSON export, and lets the user
    filter by model / temperature / evaluation before browsing individual
    results.  With no filter active only the 5 most recent results are shown.
    """
    st.title("πŸ“š Results Browser")

    # Load all results
    results = load_statistics()
    if not results:
        st.info("No test results available yet. Run some tests first!")
        return

    # Sort results by timestamp (newest first)
    results.sort(key=lambda x: x.get("timestamp", ""), reverse=True)

    # Add export functionality
    st.download_button(
        label="πŸ“₯ Export All Results",
        data=json.dumps(results, indent=2),
        file_name="candle_test_results.json",
        mime="application/json",
        help="Download all test results as a JSON file",
        key="export_all_btn"
    )

    # Add detailed browsing functionality
    st.subheader("Browse Test Results")

    # Filter options
    col1, col2, col3 = st.columns(3)
    with col1:
        model_filter = st.multiselect(
            "Filter by Model",
            options=sorted(set(r["model"] for r in results)),
            key="model_filter"
        )
    with col2:
        temp_filter = st.multiselect(
            "Filter by Temperature",
            options=sorted(set(r["temperature"] for r in results)),
            key="temp_filter"
        )
    with col3:
        eval_filter = st.multiselect(
            "Filter by Evaluation",
            options=["βœ… PASSED", "❌ FAILED", "⚠️ UNCLEAR"],
            key="eval_filter"
        )

    # Apply filters (a result must satisfy every active filter)
    if model_filter or temp_filter or eval_filter:
        filtered_results = results
        if model_filter:
            filtered_results = [r for r in filtered_results if r["model"] in model_filter]
        if temp_filter:
            filtered_results = [r for r in filtered_results if r["temperature"] in temp_filter]
        if eval_filter:
            filtered_results = [r for r in filtered_results if any(e in r["evaluation"] for e in eval_filter)]
    else:
        # If no filters applied, show only last 5 results
        filtered_results = results[:5]
        if len(results) > 5:
            st.info("ℹ️ Showing last 5 results. Use filters above to see more results.")

    # Display results
    for idx, result in enumerate(filtered_results):
        with st.expander(f"{result['timestamp']} - {result['model']} (temp={result['temperature']}) - {result['evaluation']}", expanded=False):
            st.markdown(create_markdown_output(result))
            if result.get("notes"):
                st.write("**Notes:**", result["notes"])

            # The evaluation dicts built in test_tab carry no 'test_id'
            # field, so indexing result['test_id'] directly can raise
            # KeyError on such records (unless save_results injects one —
            # TODO confirm).  Fall back to a position-based id, which also
            # keeps the download_button widget keys unique.
            result_id = result.get("test_id", f"result_{idx}")

            # Add individual result export
            st.download_button(
                label="πŸ“₯ Export This Result",
                data=json.dumps(result, indent=2),
                file_name=f"candle_test_{result_id}.json",
                mime="application/json",
                key=f"export_{result_id}"
            )
1073
+
1074
def results_tab():
    """Content for the Results Comparison tab.

    Shows cloud-sync status with a manual refresh, then aggregates every
    saved result into per-(model, temperature) pass/fail/unclear counts and
    renders them as a table ranked by a sample-size-weighted win score from
    calculate_win_ratio().
    """
    st.title("πŸ“Š Results Comparison")

    # Add cloud sync status and refresh button
    col1, col2 = st.columns([3, 1])
    with col1:
        if st.session_state.last_cloud_sync:
            st.info(f"Last synced with cloud: {st.session_state.last_cloud_sync.strftime('%Y-%m-%d %H:%M:%S UTC')}")
        else:
            st.warning("Not synced with cloud yet")
    with col2:
        if st.button("πŸ”„ Refresh Results"):
            with st.spinner("Syncing with cloud..."):
                load_statistics_from_nextcloud()
                st.session_state.last_cloud_sync = datetime.now(timezone.utc)
                st.rerun()

    # Load results from cloud
    results = load_statistics()
    if not results:
        st.info("No test results available yet. Run some tests first!")
        return

    # Calculate statistics per model+temperature combination
    model_stats = {}
    for result in results:
        # Create unique key for model+temperature combination.
        # NOTE(review): assumes 'temperature' is numeric in every stored
        # result — the ':.1f' format raises on strings; confirm older data.
        model_key = f"{result['model']} (temp={result['temperature']:.1f})"
        if model_key not in model_stats:
            model_stats[model_key] = {
                "total": 0,
                "passed": 0,
                "failed": 0,
                "unclear": 0,
                "modes": set()
            }

        # Update statistics for this configuration
        stats = model_stats[model_key]
        stats["total"] += 1
        stats["modes"].add(result["mode"])

        # Results are classified by the emoji embedded in the evaluation
        # text; anything without βœ… or ❌ counts as unclear.
        if "βœ…" in result["evaluation"]:
            stats["passed"] += 1
        elif "❌" in result["evaluation"]:
            stats["failed"] += 1
        else:
            stats["unclear"] += 1

    # Create statistics table with win ratio
    stats_data = []
    for model_key, stats in model_stats.items():
        win_ratio, weighted_score = calculate_win_ratio(stats)
        stats_data.append({
            "Model Configuration": model_key,
            "Total Tests": stats["total"],
            "Win Ratio": f"{win_ratio:.2%}",
            "Passed": f"{stats['passed']} ({stats['passed']/stats['total']*100:.1f}%)",
            "Failed": f"{stats['failed']} ({stats['failed']/stats['total']*100:.1f}%)",
            "Unclear": f"{stats['unclear']} ({stats['unclear']/stats['total']*100:.1f}%)",
            "Modes": ", ".join(sorted(stats["modes"])),
            "_weighted_score": weighted_score  # Hidden column for sorting
        })

    # Sort by weighted score (descending)
    stats_data.sort(key=lambda x: -x["_weighted_score"])

    # Remove hidden column before creating DataFrame
    for item in stats_data:
        del item["_weighted_score"]

    stats_df = pd.DataFrame(stats_data)
    st.dataframe(
        stats_df,
        column_config={
            "Model Configuration": st.column_config.TextColumn("Model Configuration", width=400),
            "Total Tests": st.column_config.NumberColumn("Total Tests", width="small"),
            "Win Ratio": st.column_config.TextColumn("Win Ratio", width="small"),
            "Passed": st.column_config.TextColumn("βœ… Passed", width="small"),
            "Failed": st.column_config.TextColumn("❌ Failed", width="small"),
            "Unclear": st.column_config.TextColumn("⚠️ Unclear", width="small"),
            "Modes": st.column_config.TextColumn("Mode", width="small")
        },
        use_container_width=True,
        hide_index=True,
        height=600
    )
1162
+
1163
def calculate_win_ratio(stats):
    """Compute the raw pass rate and a sample-size-weighted score.

    The weighted score multiplies the pass rate by a sigmoid-shaped
    confidence factor in (0, 1) that grows with the number of tests, so
    configurations with more evidence rank above equally-accurate ones
    with fewer tests, without extreme scaling.

    Args:
        stats: dict with at least integer counts "total" and "passed".

    Returns:
        Tuple (win_ratio, weighted_score) of floats.
    """
    total = stats["total"]
    win_ratio = stats["passed"] / total if total > 0 else 0

    # Rescaled logistic curve: 0 at total == 0, approaching 1 as the
    # sample size grows.
    confidence = 2 / (1 + math.exp(-0.1 * total)) - 1

    return win_ratio, win_ratio * confidence
1179
+
1180
+ if __name__ == "__main__":
1181
+ main()