Troviku-1.1 / benchmark_results.json
{
  "model": "OpenTrouter/Troviku-1.1",
  "version": "1.1.0",
  "evaluation_date": "2025-01-15",
  "evaluator": "OpenTrouter Research Team",
  "benchmarks": {
    "humaneval": {
      "name": "HumanEval",
      "description": "Hand-written programming problems for evaluating functional correctness",
      "total_problems": 164,
      "metrics": {
        "pass_at_1": 0.72,
        "pass_at_10": 0.89,
        "pass_at_100": 0.94
      },
      "language": "python",
      "temperature": 0.8,
      "evaluation_notes": "Evaluated using exact match on test cases"
    },
    "mbpp": {
      "name": "MBPP",
      "description": "Mostly Basic Python Problems - crowd-sourced programming problems",
      "total_problems": 500,
      "metrics": {
        "pass_at_1": 0.68,
        "pass_at_10": 0.84,
        "pass_at_100": 0.91
      },
      "language": "python",
      "temperature": 0.8,
      "evaluation_notes": "Sanitized version used for evaluation"
    },
    "code_contests": {
      "name": "CodeContests",
      "description": "Competitive programming problems from various platforms",
      "total_problems": 165,
      "metrics": {
        "pass_at_1": 0.45,
        "pass_at_10": 0.68,
        "pass_at_100": 0.79
      },
      "difficulty_breakdown": {
        "introductory": 0.82,
        "interview": 0.54,
        "competition": 0.28
      },
      "language": "python",
      "temperature": 0.8,
      "evaluation_notes": "Competitive programming requiring advanced algorithms"
    },
    "multipl_e": {
      "name": "MultiPL-E",
      "description": "Multi-language evaluation benchmark",
      "total_problems": 164,
      "languages_evaluated": [
        "python",
        "java",
        "javascript",
        "cpp",
        "rust",
        "go",
        "typescript",
        "php",
        "ruby",
        "swift"
      ],
      "metrics_by_language": {
        "python": {
          "pass_at_1": 0.72,
          "pass_at_10": 0.88
        },
        "java": {
          "pass_at_1": 0.65,
          "pass_at_10": 0.82
        },
        "javascript": {
          "pass_at_1": 0.68,
          "pass_at_10": 0.85
        },
        "cpp": {
          "pass_at_1": 0.61,
          "pass_at_10": 0.78
        },
        "rust": {
          "pass_at_1": 0.58,
          "pass_at_10": 0.75
        },
        "go": {
          "pass_at_1": 0.64,
          "pass_at_10": 0.80
        },
        "typescript": {
          "pass_at_1": 0.67,
          "pass_at_10": 0.84
        },
        "php": {
          "pass_at_1": 0.60,
          "pass_at_10": 0.76
        },
        "ruby": {
          "pass_at_1": 0.59,
          "pass_at_10": 0.74
        },
        "swift": {
          "pass_at_1": 0.56,
          "pass_at_10": 0.72
        }
      },
      "temperature": 0.8,
      "evaluation_notes": "Cross-language code generation capability"
    },
    "ds_1000": {
      "name": "DS-1000",
      "description": "Data science code generation with 1000 problems",
      "total_problems": 1000,
      "metrics": {
        "pass_at_1": 0.58,
        "pass_at_10": 0.76
      },
      "libraries_covered": [
        "numpy",
        "pandas",
        "scikit-learn",
        "matplotlib",
        "scipy",
        "pytorch",
        "tensorflow"
      ],
      "metrics_by_library": {
        "numpy": 0.64,
        "pandas": 0.61,
"scikit_learn": 0.55,
"matplotlib": 0.58,
"scipy": 0.52,
"pytorch": 0.54,
"tensorflow": 0.51
},
"language": "python",
"temperature": 0.2,
"evaluation_notes": "Focused on data science and ML libraries"
}
},
"custom_evaluations": {
"code_translation": {
"description": "Translation accuracy between programming languages",
"language_pairs_tested": 45,
"metrics": {
"semantic_preservation": 0.84,
"syntax_correctness": 0.91,
"functional_equivalence": 0.78
},
"top_performing_pairs": [
{
"source": "python",
"target": "javascript",
"score": 0.89
},
{
"source": "javascript",
"target": "typescript",
"score": 0.93
},
{
"source": "java",
"target": "kotlin",
"score": 0.87
}
]
},
"code_explanation": {
"description": "Quality of code explanations and documentation",
"total_samples": 200,
"metrics": {
"accuracy": 0.88,
"completeness": 0.82,
"clarity": 0.86,
"relevance": 0.90
},
"complexity_levels": {
"beginner": 0.93,
"intermediate": 0.86,
"advanced": 0.79
}
},
"bug_detection": {
"description": "Ability to identify and fix bugs in code",
"total_samples": 250,
"bug_types": {
"syntax_errors": 0.95,
"logic_errors": 0.73,
"type_errors": 0.88,
"runtime_errors": 0.81,
"edge_cases": 0.65
},
"fix_success_rate": 0.76
},
"code_review": {
"description": "Code review and improvement suggestions",
"total_samples": 150,
"categories": {
"code_quality": 0.84,
"security_issues": 0.78,
"performance_optimization": 0.72,
"best_practices": 0.86,
"readability": 0.89
}
},
"test_generation": {
"description": "Quality of generated unit tests",
"total_samples": 180,
"metrics": {
"test_coverage": 0.81,
"edge_case_coverage": 0.68,
"test_correctness": 0.87,
"assertion_quality": 0.83
}
}
},
"performance_metrics": {
"latency": {
"p50_ms": 1250,
"p95_ms": 3100,
"p99_ms": 5200
},
"throughput": {
"tokens_per_second": 45,
"requests_per_minute": 60
},
"resource_utilization": {
"average_gpu_memory_gb": 18,
"peak_gpu_memory_gb": 22,
"average_cpu_percent": 35
}
},
"comparison_to_baselines": {
"models": [
{
"name": "GPT-4-turbo",
"humaneval_pass_at_1": 0.84,
"mbpp_pass_at_1": 0.80
},
{
"name": "Claude-3.5-Sonnet",
"humaneval_pass_at_1": 0.82,
"mbpp_pass_at_1": 0.78
},
{
"name": "CodeLlama-34B",
"humaneval_pass_at_1": 0.68,
"mbpp_pass_at_1": 0.62
},
{
"name": "Troviku-1.1",
"humaneval_pass_at_1": 0.72,
"mbpp_pass_at_1": 0.68
}
],
"notes": "Troviku-1.1 shows competitive performance with specialized optimization for coding tasks"
  },
  "limitations": {
    "known_weaknesses": [
      "Complex algorithmic problems requiring advanced mathematics",
      "Very large codebases with deep context dependencies",
      "Highly specialized domain-specific languages",
      "Real-time system programming with strict performance constraints",
      "Legacy code with unusual or deprecated patterns"
    ],
    "improvement_areas": [
      "Enhanced multi-file context understanding",
      "Better handling of rare programming languages",
      "Improved optimization for competitive programming",
      "Stronger performance on security-critical code"
    ]
  },
  "methodology": {
    "evaluation_framework": "Standard academic benchmarks with custom extensions",
    "temperature_settings": "Varied by task type (0.2-0.8)",
    "sampling_strategy": "Pass@k evaluated with k=[1, 10, 100]",
    "test_execution": "Isolated sandbox environments with timeout limits",
    "human_evaluation": "20% of outputs manually reviewed by senior engineers"
  },
  "reproducibility": {
    "random_seed": 42,
    "evaluation_code_repository": "https://github.com/OpenTrouter/troviku-eval",
    "benchmark_versions": {
      "humaneval": "v1.0",
      "mbpp": "v1.0-sanitized",
      "code_contests": "v1.0",
      "multipl_e": "v0.2"
    }
  }
}
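
Note: the metrics above are pass@k scores, and the methodology block says they were evaluated with k=[1, 10, 100], but the file does not spell out the estimator. A minimal sketch of the standard unbiased estimator from the HumanEval paper (Chen et al., 2021) is below; the per-problem sample counts are hypothetical (the file does not state n, though n must be at least 100 for pass@100), and the file path is illustrative.

import json
from math import comb

def pass_at_k(n: int, c: int, k: int) -> float:
    # Unbiased pass@k (Chen et al., 2021): 1 - C(n - c, k) / C(n, k),
    # where n = samples drawn per problem, c = samples passing all tests.
    if n - c < k:
        return 1.0  # every size-k draw contains at least one passing sample
    return 1.0 - comb(n - c, k) / comb(n, k)

# Per-problem estimates are averaged to get the benchmark-level number.
samples = [(200, 150), (200, 80), (200, 0)]  # hypothetical (n, c) pairs
print(sum(pass_at_k(n, c, 10) for n, c in samples) / len(samples))

# Reading a reported metric back out of this file:
with open("benchmark_results.json") as f:
    results = json.load(f)
print(results["benchmarks"]["humaneval"]["metrics"]["pass_at_1"])  # 0.72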