| { | |
| "model": "OpenTrouter/Troviku-1.1", | |
| "version": "1.1.0", | |
| "evaluation_date": "2025-01-15", | |
| "evaluator": "OpenTrouter Research Team", | |
| "benchmarks": { | |
| "humaneval": { | |
| "name": "HumanEval", | |
| "description": "Hand-written programming problems for evaluating functional correctness", | |
| "total_problems": 164, | |
| "metrics": { | |
| "pass_at_1": 0.72, | |
| "pass_at_10": 0.89, | |
| "pass_at_100": 0.94 | |
| }, | |
| "language": "python", | |
| "temperature": 0.8, | |
| "evaluation_notes": "Functional correctness evaluated by running each problem's test cases; a sample counts as passing only if all test cases pass" | |
| }, | |
| "mbpp": { | |
| "name": "MBPP", | |
| "description": "Mostly Basic Python Problems - crowd-sourced programming problems", | |
| "total_problems": 500, | |
| "metrics": { | |
| "pass_at_1": 0.68, | |
| "pass_at_10": 0.84, | |
| "pass_at_100": 0.91 | |
| }, | |
| "language": "python", | |
| "temperature": 0.8, | |
| "evaluation_notes": "Sanitized version used for evaluation" | |
| }, | |
| "code_contests": { | |
| "name": "CodeContests", | |
| "description": "Competitive programming problems from various platforms", | |
| "total_problems": 165, | |
| "metrics": { | |
| "pass_at_1": 0.45, | |
| "pass_at_10": 0.68, | |
| "pass_at_100": 0.79 | |
| }, | |
| "difficulty_breakdown": { | |
| "introductory": 0.82, | |
| "interview": 0.54, | |
| "competition": 0.28 | |
| }, | |
| "language": "python", | |
| "temperature": 0.8, | |
| "evaluation_notes": "Competitive programming requiring advanced algorithms" | |
| }, | |
| "multipl_e": { | |
| "name": "MultiPL-E", | |
| "description": "Multi-language evaluation benchmark", | |
| "total_problems": 164, | |
| "languages_evaluated": [ | |
| "python", | |
| "java", | |
| "javascript", | |
| "cpp", | |
| "rust", | |
| "go", | |
| "typescript", | |
| "php", | |
| "ruby", | |
| "swift" | |
| ], | |
| "metrics_by_language": { | |
| "python": { | |
| "pass_at_1": 0.72, | |
| "pass_at_10": 0.88 | |
| }, | |
| "java": { | |
| "pass_at_1": 0.65, | |
| "pass_at_10": 0.82 | |
| }, | |
| "javascript": { | |
| "pass_at_1": 0.68, | |
| "pass_at_10": 0.85 | |
| }, | |
| "cpp": { | |
| "pass_at_1": 0.61, | |
| "pass_at_10": 0.78 | |
| }, | |
| "rust": { | |
| "pass_at_1": 0.58, | |
| "pass_at_10": 0.75 | |
| }, | |
| "go": { | |
| "pass_at_1": 0.64, | |
| "pass_at_10": 0.80 | |
| }, | |
| "typescript": { | |
| "pass_at_1": 0.67, | |
| "pass_at_10": 0.84 | |
| }, | |
| "php": { | |
| "pass_at_1": 0.60, | |
| "pass_at_10": 0.76 | |
| }, | |
| "ruby": { | |
| "pass_at_1": 0.59, | |
| "pass_at_10": 0.74 | |
| }, | |
| "swift": { | |
| "pass_at_1": 0.56, | |
| "pass_at_10": 0.72 | |
| } | |
| }, | |
| "temperature": 0.8, | |
| "evaluation_notes": "Cross-language code generation capability" | |
| }, | |
| "ds_1000": { | |
| "name": "DS-1000", | |
| "description": "Data science code generation with 1000 problems", | |
| "total_problems": 1000, | |
| "metrics": { | |
| "pass_at_1": 0.58, | |
| "pass_at_10": 0.76 | |
| }, | |
| "libraries_covered": [ | |
| "numpy", | |
| "pandas", | |
| "scikit-learn", | |
| "matplotlib", | |
| "scipy", | |
| "pytorch", | |
| "tensorflow" | |
| ], | |
| "metrics_by_library": { | |
| "numpy": 0.64, | |
| "pandas": 0.61, | |
| "scikit_learn": 0.55, | |
| "matplotlib": 0.58, | |
| "scipy": 0.52, | |
| "pytorch": 0.54, | |
| "tensorflow": 0.51 | |
| }, | |
| "language": "python", | |
| "temperature": 0.2, | |
| "evaluation_notes": "Focused on data science and ML libraries" | |
| } | |
| }, | |
| "custom_evaluations": { | |
| "code_translation": { | |
| "description": "Translation accuracy between programming languages", | |
| "language_pairs_tested": 45, | |
| "metrics": { | |
| "semantic_preservation": 0.84, | |
| "syntax_correctness": 0.91, | |
| "functional_equivalence": 0.78 | |
| }, | |
| "top_performing_pairs": [ | |
| { | |
| "source": "python", | |
| "target": "javascript", | |
| "score": 0.89 | |
| }, | |
| { | |
| "source": "javascript", | |
| "target": "typescript", | |
| "score": 0.93 | |
| }, | |
| { | |
| "source": "java", | |
| "target": "kotlin", | |
| "score": 0.87 | |
| } | |
| ] | |
| }, | |
| "code_explanation": { | |
| "description": "Quality of code explanations and documentation", | |
| "total_samples": 200, | |
| "metrics": { | |
| "accuracy": 0.88, | |
| "completeness": 0.82, | |
| "clarity": 0.86, | |
| "relevance": 0.90 | |
| }, | |
| "complexity_levels": { | |
| "beginner": 0.93, | |
| "intermediate": 0.86, | |
| "advanced": 0.79 | |
| } | |
| }, | |
| "bug_detection": { | |
| "description": "Ability to identify and fix bugs in code", | |
| "total_samples": 250, | |
| "bug_types": { | |
| "syntax_errors": 0.95, | |
| "logic_errors": 0.73, | |
| "type_errors": 0.88, | |
| "runtime_errors": 0.81, | |
| "edge_cases": 0.65 | |
| }, | |
| "fix_success_rate": 0.76 | |
| }, | |
| "code_review": { | |
| "description": "Code review and improvement suggestions", | |
| "total_samples": 150, | |
| "categories": { | |
| "code_quality": 0.84, | |
| "security_issues": 0.78, | |
| "performance_optimization": 0.72, | |
| "best_practices": 0.86, | |
| "readability": 0.89 | |
| } | |
| }, | |
| "test_generation": { | |
| "description": "Quality of generated unit tests", | |
| "total_samples": 180, | |
| "metrics": { | |
| "test_coverage": 0.81, | |
| "edge_case_coverage": 0.68, | |
| "test_correctness": 0.87, | |
| "assertion_quality": 0.83 | |
| } | |
| } | |
| }, | |
| "performance_metrics": { | |
| "latency": { | |
| "p50_ms": 1250, | |
| "p95_ms": 3100, | |
| "p99_ms": 5200 | |
| }, | |
| "throughput": { | |
| "tokens_per_second": 45, | |
| "requests_per_minute": 60 | |
| }, | |
| "resource_utilization": { | |
| "average_gpu_memory_gb": 18, | |
| "peak_gpu_memory_gb": 22, | |
| "average_cpu_percent": 35 | |
| } | |
| }, | |
| "comparison_to_baselines": { | |
| "models": [ | |
| { | |
| "name": "GPT-4-turbo", | |
| "humaneval_pass_at_1": 0.84, | |
| "mbpp_pass_at_1": 0.80 | |
| }, | |
| { | |
| "name": "Claude-3.5-Sonnet", | |
| "humaneval_pass_at_1": 0.82, | |
| "mbpp_pass_at_1": 0.78 | |
| }, | |
| { | |
| "name": "CodeLlama-34B", | |
| "humaneval_pass_at_1": 0.68, | |
| "mbpp_pass_at_1": 0.62 | |
| }, | |
| { | |
| "name": "Troviku-1.1", | |
| "humaneval_pass_at_1": 0.72, | |
| "mbpp_pass_at_1": 0.68 | |
| } | |
| ], | |
| "notes": "Troviku-1.1, with specialized optimization for coding tasks, outperforms CodeLlama-34B on HumanEval and MBPP pass@1 while trailing GPT-4-turbo and Claude-3.5-Sonnet" | |
| }, | |
| "limitations": { | |
| "known_weaknesses": [ | |
| "Complex algorithmic problems requiring advanced mathematics", | |
| "Very large codebases with deep context dependencies", | |
| "Highly specialized domain-specific languages", | |
| "Real-time system programming with strict performance constraints", | |
| "Legacy code with unusual or deprecated patterns" | |
| ], | |
| "improvement_areas": [ | |
| "Enhanced multi-file context understanding", | |
| "Better handling of rare programming languages", | |
| "Improved optimization for competitive programming", | |
| "Stronger performance on security-critical code" | |
| ] | |
| }, | |
| "methodology": { | |
| "evaluation_framework": "Standard academic benchmarks with custom extensions", | |
| "temperature_settings": "Varied by task type (0.2-0.8): 0.2 for DS-1000; 0.8 for HumanEval, MBPP, CodeContests, and MultiPL-E", | |
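| "sampling_strategy": "Pass@k evaluated with k=[1, 10, 100]; pass@100 reported only for HumanEval, MBPP, and CodeContests (MultiPL-E and DS-1000 report k=1 and k=10)", | |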
| "test_execution": "Isolated sandbox environments with timeout limits", | |
| "human_evaluation": "20% of outputs manually reviewed by senior engineers" | |
| }, | |
| "reproducibility": { | |
| "random_seed": 42, | |
| "evaluation_code_repository": "https://github.com/OpenTrouter/troviku-eval", | |
| "benchmark_versions": { | |
| "humaneval": "v1.0", | |
| "mbpp": "v1.0-sanitized", | |
| "code_contests": "v1.0", | |
| "multipl_e": "v0.2" | |
| } | |
| } | |
| } |