"""
Model Evaluation Script for Troviku-1.1

Comprehensive evaluation suite for testing the model's performance
on various coding benchmarks and tasks.
"""

import json
import time
from typing import List, Dict, Any, Optional, Tuple
from dataclasses import dataclass, asdict
from collections import defaultdict
import statistics

@dataclass
class EvaluationResult:
    """Result from a single evaluation."""
    task_id: str
    task_type: str
    language: str
    passed: bool
    score: float
    execution_time: float
    error_message: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        return asdict(self)

@dataclass
class BenchmarkResults:
    """Aggregated benchmark results."""
    benchmark_name: str
    total_tasks: int
    passed_tasks: int
    failed_tasks: int
    average_score: float
    pass_rate: float
    average_execution_time: float
    results_by_language: Dict[str, Dict[str, float]]

    def to_dict(self) -> Dict[str, Any]:
        return asdict(self)

class CodeEvaluator:
    """
    Evaluator for Troviku-1.1 model performance.

    Runs various benchmarks and coding tasks to assess model capabilities.
    """

    def __init__(self, api_key: str, model: str = "OpenTrouter/Troviku-1.1"):
        """
        Initialize the evaluator.

        Args:
            api_key: OpenTrouter API key
            model: Model identifier to evaluate
        """
        from troviku_client import TrovikuClient

        self.client = TrovikuClient(api_key=api_key, model=model)
        self.results: List[EvaluationResult] = []
    def evaluate_humaneval(self, problems: List[Dict[str, Any]]) -> BenchmarkResults:
        """
        Evaluate on the HumanEval benchmark.

        Args:
            problems: List of HumanEval problems

        Returns:
            BenchmarkResults with aggregated scores
        """
        print("Evaluating HumanEval benchmark...")
        start_index = len(self.results)

        for problem in problems:
            task_id = problem['task_id']
            prompt = problem['prompt']
            test_cases = problem['test']
            if isinstance(test_cases, str):
                # HumanEval supplies the tests as a single script string; wrap
                # it so _execute_tests runs it as one unit instead of
                # iterating over characters.
                test_cases = [test_cases]

            try:
                start_time = time.time()
                response = self.client.generate(prompt, language="python")
                execution_time = time.time() - start_time

                passed, error = self._execute_tests(response.code, test_cases)

                result = EvaluationResult(
                    task_id=task_id,
                    task_type="code_generation",
                    language="python",
                    passed=passed,
                    score=1.0 if passed else 0.0,
                    execution_time=execution_time,
                    error_message=error
                )

                self.results.append(result)
                print(f"  {task_id}: {'PASS' if passed else 'FAIL'}")

            except Exception as e:
                print(f"  {task_id}: ERROR - {str(e)}")
                result = EvaluationResult(
                    task_id=task_id,
                    task_type="code_generation",
                    language="python",
                    passed=False,
                    score=0.0,
                    execution_time=0.0,
                    error_message=str(e)
                )
                self.results.append(result)

        # Aggregate only the results produced by this run.
        return self._aggregate_results("HumanEval", self.results[start_index:])
    def evaluate_mbpp(self, problems: List[Dict[str, Any]]) -> BenchmarkResults:
        """
        Evaluate on the MBPP (Mostly Basic Python Problems) benchmark.

        Args:
            problems: List of MBPP problems

        Returns:
            BenchmarkResults with aggregated scores
        """
        print("Evaluating MBPP benchmark...")
        start_index = len(self.results)

        for problem in problems:
            task_id = str(problem['task_id'])
            prompt = problem['text']
            test_cases = problem['test_list']

            try:
                start_time = time.time()
                response = self.client.generate(prompt, language="python")
                execution_time = time.time() - start_time

                passed, error = self._execute_tests(response.code, test_cases)

                result = EvaluationResult(
                    task_id=task_id,
                    task_type="code_generation",
                    language="python",
                    passed=passed,
                    score=1.0 if passed else 0.0,
                    execution_time=execution_time,
                    error_message=error
                )

                self.results.append(result)
                print(f"  Task {task_id}: {'PASS' if passed else 'FAIL'}")

            except Exception as e:
                print(f"  Task {task_id}: ERROR - {str(e)}")
                # Record the error as a failed task so it counts against the
                # pass rate, matching the HumanEval path above.
                self.results.append(EvaluationResult(
                    task_id=task_id,
                    task_type="code_generation",
                    language="python",
                    passed=False,
                    score=0.0,
                    execution_time=0.0,
                    error_message=str(e)
                ))

        return self._aggregate_results("MBPP", self.results[start_index:])
    def evaluate_code_translation(
        self,
        test_cases: List[Dict[str, Any]]
    ) -> BenchmarkResults:
        """
        Evaluate code translation between languages.

        Args:
            test_cases: List of translation test cases

        Returns:
            BenchmarkResults with translation accuracy
        """
        print("Evaluating code translation...")
        start_index = len(self.results)

        for test_case in test_cases:
            task_id = test_case['id']
            source_code = test_case['source_code']
            source_lang = test_case['source_language']
            target_lang = test_case['target_language']
            expected_behavior = test_case.get('expected_behavior')

            try:
                start_time = time.time()
                response = self.client.translate(
                    code=source_code,
                    source_language=source_lang,
                    target_language=target_lang
                )
                execution_time = time.time() - start_time

                score = self._validate_translation(
                    response.code,
                    target_lang,
                    expected_behavior
                )

                result = EvaluationResult(
                    task_id=task_id,
                    task_type="code_translation",
                    language=f"{source_lang}_to_{target_lang}",
                    passed=score >= 0.8,
                    score=score,
                    execution_time=execution_time
                )

                self.results.append(result)
                print(f"  {task_id}: Score {score:.2f}")

            except Exception as e:
                print(f"  {task_id}: ERROR - {str(e)}")
                # Record the error as a failed task so it counts in the aggregate.
                self.results.append(EvaluationResult(
                    task_id=task_id,
                    task_type="code_translation",
                    language=f"{source_lang}_to_{target_lang}",
                    passed=False,
                    score=0.0,
                    execution_time=0.0,
                    error_message=str(e)
                ))

        return self._aggregate_results("Code Translation", self.results[start_index:])
    def evaluate_code_explanation(
        self,
        test_cases: List[Dict[str, Any]]
    ) -> BenchmarkResults:
        """
        Evaluate code explanation quality.

        Args:
            test_cases: List of explanation test cases

        Returns:
            BenchmarkResults with explanation scores
        """
        print("Evaluating code explanation...")
        start_index = len(self.results)

        for test_case in test_cases:
            task_id = test_case['id']
            code = test_case['code']
            language = test_case['language']
            key_concepts = test_case.get('key_concepts', [])

            try:
                start_time = time.time()
                explanation = self.client.explain(code, language)
                execution_time = time.time() - start_time

                score = self._score_explanation(explanation, key_concepts)

                result = EvaluationResult(
                    task_id=task_id,
                    task_type="code_explanation",
                    language=language,
                    passed=score >= 0.7,
                    score=score,
                    execution_time=execution_time
                )

                self.results.append(result)
                print(f"  {task_id}: Score {score:.2f}")

            except Exception as e:
                print(f"  {task_id}: ERROR - {str(e)}")
                # Record the error as a failed task so it counts in the aggregate.
                self.results.append(EvaluationResult(
                    task_id=task_id,
                    task_type="code_explanation",
                    language=language,
                    passed=False,
                    score=0.0,
                    execution_time=0.0,
                    error_message=str(e)
                ))

        return self._aggregate_results("Code Explanation", self.results[start_index:])
    def evaluate_bug_detection(
        self,
        test_cases: List[Dict[str, Any]]
    ) -> BenchmarkResults:
        """
        Evaluate bug detection and fixing capabilities.

        Args:
            test_cases: List of buggy code samples

        Returns:
            BenchmarkResults with bug fix success rate
        """
        print("Evaluating bug detection and fixing...")
        start_index = len(self.results)

        for test_case in test_cases:
            task_id = test_case['id']
            buggy_code = test_case['buggy_code']
            error_message = test_case['error_message']
            language = test_case['language']
            tests = test_case.get('tests', [])

            try:
                start_time = time.time()
                response = self.client.debug(buggy_code, error_message, language)
                execution_time = time.time() - start_time

                passed, error = self._execute_tests(response.code, tests)

                result = EvaluationResult(
                    task_id=task_id,
                    task_type="bug_fixing",
                    language=language,
                    passed=passed,
                    score=1.0 if passed else 0.0,
                    execution_time=execution_time,
                    error_message=error
                )

                self.results.append(result)
                print(f"  {task_id}: {'FIXED' if passed else 'FAILED'}")

            except Exception as e:
                print(f"  {task_id}: ERROR - {str(e)}")
                # Record the error as a failed task so it counts in the aggregate.
                self.results.append(EvaluationResult(
                    task_id=task_id,
                    task_type="bug_fixing",
                    language=language,
                    passed=False,
                    score=0.0,
                    execution_time=0.0,
                    error_message=str(e)
                ))

        return self._aggregate_results("Bug Detection", self.results[start_index:])
    def _execute_tests(
        self,
        code: str,
        test_cases: List[str]
    ) -> Tuple[bool, Optional[str]]:
        """
        Execute test cases against generated code.

        Args:
            code: Generated code to test
            test_cases: List of test case strings

        Returns:
            Tuple of (passed, error_message)
        """
        try:
            # Execute the generated code in a fresh namespace.
            # Note: this runs model-generated code in-process; use a proper
            # sandbox for untrusted output in production.
            namespace = {}
            exec(code, namespace)

            # Run each test snippet (e.g. assert statements) in that namespace.
            for test in test_cases:
                exec(test, namespace)

            return True, None

        except Exception as e:
            return False, str(e)
    def _validate_translation(
        self,
        translated_code: str,
        target_language: str,
        expected_behavior: Optional[Dict[str, Any]]
    ) -> float:
        """
        Validate translated code quality using lightweight heuristics.

        Args:
            translated_code: Translated code
            target_language: Target language
            expected_behavior: Expected behavior specification

        Returns:
            Quality score (0.0 to 1.0)
        """
        score = 0.0

        # Non-empty output earns partial credit.
        if len(translated_code.strip()) > 0:
            score += 0.3

        # Rough language check: the target language is often named in the
        # translated snippet's comments or boilerplate.
        if target_language.lower() in translated_code.lower():
            score += 0.2

        # Placeholder: credit for having a behavior specification to check
        # against; a stricter validator would execute the translated code.
        if expected_behavior:
            score += 0.5

        return min(score, 1.0)
    def _score_explanation(
        self,
        explanation: str,
        key_concepts: List[str]
    ) -> float:
        """
        Score explanation quality based on concept coverage.

        Args:
            explanation: Generated explanation
            key_concepts: List of key concepts that should be covered

        Returns:
            Quality score (0.0 to 1.0)
        """
        if not key_concepts:
            # No reference concepts: fall back to a length-based heuristic.
            return 0.8 if len(explanation) > 100 else 0.5

        explanation_lower = explanation.lower()
        covered = sum(1 for concept in key_concepts
                      if concept.lower() in explanation_lower)

        coverage_score = covered / len(key_concepts)
        length_score = min(len(explanation) / 500, 1.0)

        return (coverage_score * 0.7 + length_score * 0.3)
    def _aggregate_results(
        self,
        benchmark_name: str,
        results: Optional[List[EvaluationResult]] = None
    ) -> BenchmarkResults:
        """
        Aggregate evaluation results for a benchmark.

        Args:
            benchmark_name: Name of the benchmark
            results: Results to aggregate; defaults to all stored results.
                Each evaluate_* method passes only the results it produced,
                since task IDs and task types are not unique per benchmark.

        Returns:
            BenchmarkResults with aggregated statistics
        """
        benchmark_results = results if results is not None else self.results

        if not benchmark_results:
            return BenchmarkResults(
                benchmark_name=benchmark_name,
                total_tasks=0,
                passed_tasks=0,
                failed_tasks=0,
                average_score=0.0,
                pass_rate=0.0,
                average_execution_time=0.0,
                results_by_language={}
            )

        total = len(benchmark_results)
        passed = sum(1 for r in benchmark_results if r.passed)
        failed = total - passed
        avg_score = statistics.mean(r.score for r in benchmark_results)
        pass_rate = passed / total if total > 0 else 0.0
        avg_time = statistics.mean(r.execution_time for r in benchmark_results)

        # Per-language breakdown.
        by_language = defaultdict(lambda: {"passed": 0, "total": 0, "score": []})
        for result in benchmark_results:
            lang = result.language
            by_language[lang]["total"] += 1
            if result.passed:
                by_language[lang]["passed"] += 1
            by_language[lang]["score"].append(result.score)

        results_by_language = {
            lang: {
                "pass_rate": stats["passed"] / stats["total"],
                "average_score": statistics.mean(stats["score"])
            }
            for lang, stats in by_language.items()
        }

        return BenchmarkResults(
            benchmark_name=benchmark_name,
            total_tasks=total,
            passed_tasks=passed,
            failed_tasks=failed,
            average_score=avg_score,
            pass_rate=pass_rate,
            average_execution_time=avg_time,
            results_by_language=results_by_language
        )
    def save_results(self, filepath: str):
        """
        Save evaluation results to JSON file.

        Args:
            filepath: Path to save results
        """
        results_data = {
            "individual_results": [r.to_dict() for r in self.results],
            "summary": self.get_summary()
        }

        with open(filepath, 'w') as f:
            json.dump(results_data, f, indent=2)

        print(f"\nResults saved to {filepath}")
    def get_summary(self) -> Dict[str, Any]:
        """
        Get summary of all evaluation results.

        Returns:
            Dictionary with summary statistics
        """
        if not self.results:
            return {"message": "No results available"}

        total = len(self.results)
        passed = sum(1 for r in self.results if r.passed)

        return {
            "total_tasks": total,
            "passed_tasks": passed,
            "failed_tasks": total - passed,
            "overall_pass_rate": passed / total,
            "average_score": statistics.mean(r.score for r in self.results),
            "average_execution_time": statistics.mean(r.execution_time for r in self.results),
            "by_task_type": self._group_by_field("task_type"),
            "by_language": self._group_by_field("language")
        }
    def _group_by_field(self, field: str) -> Dict[str, Dict[str, float]]:
        """Group results by a specific field."""
        grouped = defaultdict(lambda: {"passed": 0, "total": 0, "scores": []})

        for result in self.results:
            value = getattr(result, field)
            grouped[value]["total"] += 1
            if result.passed:
                grouped[value]["passed"] += 1
            grouped[value]["scores"].append(result.score)

        return {
            key: {
                "pass_rate": stats["passed"] / stats["total"],
                "average_score": statistics.mean(stats["scores"])
            }
            for key, stats in grouped.items()
        }
    def print_summary(self):
        """Print evaluation summary to console."""
        summary = self.get_summary()

        if "message" in summary:
            # No results have been recorded yet.
            print(summary["message"])
            return

        print("\n" + "="*60)
        print("EVALUATION SUMMARY")
        print("="*60)
        print(f"Total Tasks: {summary['total_tasks']}")
        print(f"Passed: {summary['passed_tasks']}")
        print(f"Failed: {summary['failed_tasks']}")
        print(f"Overall Pass Rate: {summary['overall_pass_rate']:.2%}")
        print(f"Average Score: {summary['average_score']:.2f}")
        print(f"Average Execution Time: {summary['average_execution_time']:.2f}s")

        print("\nBy Task Type:")
        for task_type, stats in summary['by_task_type'].items():
            print(f"  {task_type}:")
            print(f"    Pass Rate: {stats['pass_rate']:.2%}")
            print(f"    Avg Score: {stats['average_score']:.2f}")

        print("\nBy Language:")
        for language, stats in summary['by_language'].items():
            print(f"  {language}:")
            print(f"    Pass Rate: {stats['pass_rate']:.2%}")
            print(f"    Avg Score: {stats['average_score']:.2f}")

        print("="*60)

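
# --- Optional helper (sketch) -------------------------------------------------
# The evaluate_* methods take plain lists of dicts. If your benchmark files are
# stored as JSON Lines (one problem per line), a loader along these lines could
# be used; the helper name and the JSON Lines assumption are mine, not part of
# the evaluator itself.
def load_problems_jsonl(filepath: str) -> List[Dict[str, Any]]:
    """Load benchmark problems from a JSON Lines file (one JSON object per line)."""
    problems = []
    with open(filepath, 'r') as f:
        for line in f:
            line = line.strip()
            if line:
                problems.append(json.loads(line))
    return problems
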
if __name__ == "__main__":
    # Example usage
    evaluator = CodeEvaluator(api_key="your_api_key_here")

    # A single illustrative problem in the format expected by evaluate_humaneval.
    humaneval_problems = [
        {
            "task_id": "HumanEval/0",
            "prompt": "Write a function that takes a list of numbers and returns True if the list contains a pair of numbers that sum to zero.",
            "test": "assert has_zero_sum([1, -1, 2]) == True\nassert has_zero_sum([1, 2, 3]) == False"
        }
    ]

    results = evaluator.evaluate_humaneval(humaneval_problems)
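
    # The other evaluators follow the same pattern and each return a
    # BenchmarkResults. The dict shapes below mirror the keys each method
    # reads; the concrete values are illustrative placeholders only
    # (sketch, commented out).
    #
    # mbpp_results = evaluator.evaluate_mbpp([
    #     {"task_id": 1, "text": "Write a function to ...", "test_list": ["assert ..."]},
    # ])
    # translation_results = evaluator.evaluate_code_translation([
    #     {"id": "trans_1", "source_code": "...", "source_language": "python",
    #      "target_language": "javascript", "expected_behavior": {...}},
    # ])
    # explanation_results = evaluator.evaluate_code_explanation([
    #     {"id": "explain_1", "code": "...", "language": "python",
    #      "key_concepts": ["recursion"]},
    # ])
    # bugfix_results = evaluator.evaluate_bug_detection([
    #     {"id": "bug_1", "buggy_code": "...", "error_message": "TypeError: ...",
    #      "language": "python", "tests": ["assert ..."]},
    # ])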

    evaluator.print_summary()
    evaluator.save_results("evaluation_results.json")
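
    # To inspect the saved report later (shape defined by save_results/get_summary):
    # with open("evaluation_results.json") as f:
    #     report = json.load(f)
    # print(report["summary"]["overall_pass_rate"])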