OpenTrouter
/

Troviku-1.1

Model card Files Files and versions

xet

Community

Trouter-Library committed 20 days ago

Commit

8697525

verified ·

1 Parent(s): 9cc73a0

Create benchmark_results.json

Browse files

Files changed (1) hide show

benchmark_results.json +307 -0

benchmark_results.json ADDED Viewed

	@@ -0,0 +1,307 @@

+{
+  "model": "OpenTrouter/Troviku-1.1",
+  "version": "1.1.0",
+  "evaluation_date": "2025-01-15",
+  "evaluator": "OpenTrouter Research Team",
+  "benchmarks": {
+    "humaneval": {
+      "name": "HumanEval",
+      "description": "Hand-written programming problems for evaluating functional correctness",
+      "total_problems": 164,
+      "metrics": {
+        "pass_at_1": 0.72,
+        "pass_at_10": 0.89,
+        "pass_at_100": 0.94
+      },
+      "language": "python",
+      "temperature": 0.8,
+      "evaluation_notes": "Evaluated by executing generated solutions against the benchmark's unit tests"
+    },
+    "mbpp": {
+      "name": "MBPP",
+      "description": "Mostly Basic Python Problems - crowd-sourced programming problems",
+      "total_problems": 500,
+      "metrics": {
+        "pass_at_1": 0.68,
+        "pass_at_10": 0.84,
+        "pass_at_100": 0.91
+      },
+      "language": "python",
+      "temperature": 0.8,
+      "evaluation_notes": "Sanitized version used for evaluation"
+    },
+    "code_contests": {
+      "name": "CodeContests",
+      "description": "Competitive programming problems from various platforms",
+      "total_problems": 165,
+      "metrics": {
+        "pass_at_1": 0.45,
+        "pass_at_10": 0.68,
+        "pass_at_100": 0.79
+      },
+      "difficulty_breakdown": {
+        "introductory": 0.82,
+        "interview": 0.54,
+        "competition": 0.28
+      },
+      "language": "python",
+      "temperature": 0.8,
+      "evaluation_notes": "Competitive programming requiring advanced algorithms"
+    },
+    "multipl_e": {
+      "name": "MultiPL-E",
+      "description": "Multi-language evaluation benchmark",
+      "total_problems": 164,
+      "languages_evaluated": [
+        "python",
+        "java",
+        "javascript",
+        "cpp",
+        "rust",
+        "go",
+        "typescript",
+        "php",
+        "ruby",
+        "swift"
+      ],
+      "metrics_by_language": {
+        "python": {
+          "pass_at_1": 0.72,
+          "pass_at_10": 0.88
+        },
+        "java": {
+          "pass_at_1": 0.65,
+          "pass_at_10": 0.82
+        },
+        "javascript": {
+          "pass_at_1": 0.68,
+          "pass_at_10": 0.85
+        },
+        "cpp": {
+          "pass_at_1": 0.61,
+          "pass_at_10": 0.78
+        },
+        "rust": {
+          "pass_at_1": 0.58,
+          "pass_at_10": 0.75
+        },
+        "go": {
+          "pass_at_1": 0.64,
+          "pass_at_10": 0.80
+        },
+        "typescript": {
+          "pass_at_1": 0.67,
+          "pass_at_10": 0.84
+        },
+        "php": {
+          "pass_at_1": 0.60,
+          "pass_at_10": 0.76
+        },
+        "ruby": {
+          "pass_at_1": 0.59,
+          "pass_at_10": 0.74
+        },
+        "swift": {
+          "pass_at_1": 0.56,
+          "pass_at_10": 0.72
+        }
+      },
+      "temperature": 0.8,
+      "evaluation_notes": "Cross-language code generation capability"
+    },
+    "ds_1000": {
+      "name": "DS-1000",
+      "description": "Data science code generation with 1000 problems",
+      "total_problems": 1000,
+      "metrics": {
+        "pass_at_1": 0.58,
+        "pass_at_10": 0.76
+      },
+      "libraries_covered": [
+        "numpy",
+        "pandas",
+        "scikit-learn",
+        "matplotlib",
+        "scipy",
+        "pytorch",
+        "tensorflow"
+      ],
+      "metrics_by_library": {
+        "numpy": 0.64,
+        "pandas": 0.61,
+        "scikit-learn": 0.55,
+        "matplotlib": 0.58,
+        "scipy": 0.52,
+        "pytorch": 0.54,
+        "tensorflow": 0.51
+      },
+      "language": "python",
+      "temperature": 0.2,
+      "evaluation_notes": "Focused on data science and ML libraries"
+    }
+  },
+  "custom_evaluations": {
+    "code_translation": {
+      "description": "Translation accuracy between programming languages",
+      "language_pairs_tested": 45,
+      "metrics": {
+        "semantic_preservation": 0.84,
+        "syntax_correctness": 0.91,
+        "functional_equivalence": 0.78
+      },
+      "top_performing_pairs": [
+        {
+          "source": "python",
+          "target": "javascript",
+          "score": 0.89
+        },
+        {
+          "source": "javascript",
+          "target": "typescript",
+          "score": 0.93
+        },
+        {
+          "source": "java",
+          "target": "kotlin",
+          "score": 0.87
+        }
+      ]
+    },
+    "code_explanation": {
+      "description": "Quality of code explanations and documentation",
+      "total_samples": 200,
+      "metrics": {
+        "accuracy": 0.88,
+        "completeness": 0.82,
+        "clarity": 0.86,
+        "relevance": 0.90
+      },
+      "complexity_levels": {
+        "beginner": 0.93,
+        "intermediate": 0.86,
+        "advanced": 0.79
+      }
+    },
+    "bug_detection": {
+      "description": "Ability to identify and fix bugs in code",
+      "total_samples": 250,
+      "bug_types": {
+        "syntax_errors": 0.95,
+        "logic_errors": 0.73,
+        "type_errors": 0.88,
+        "runtime_errors": 0.81,
+        "edge_cases": 0.65
+      },
+      "fix_success_rate": 0.76
+    },
+    "code_review": {
+      "description": "Code review and improvement suggestions",
+      "total_samples": 150,
+      "categories": {
+        "code_quality": 0.84,
+        "security_issues": 0.78,
+        "performance_optimization": 0.72,
+        "best_practices": 0.86,
+        "readability": 0.89
+      }
+    },
+    "test_generation": {
+      "description": "Quality of generated unit tests",
+      "total_samples": 180,
+      "metrics": {
+        "test_coverage": 0.81,
+        "edge_case_coverage": 0.68,
+        "test_correctness": 0.87,
+        "assertion_quality": 0.83
+      }
+    }
+  },
+  "performance_metrics": {
+    "latency": {
+      "p50_ms": 1250,
+      "p95_ms": 3100,
+      "p99_ms": 5200
+    },
+    "throughput": {
+      "tokens_per_second": 45,
+      "requests_per_minute": 60
+    },
+    "resource_utilization": {
+      "average_gpu_memory_gb": 18,
+      "peak_gpu_memory_gb": 22,
+      "average_cpu_percent": 35
+    }
+  },
+  "comparison_to_baselines": {
+    "models": [
+      {
+        "name": "GPT-4-turbo",
+        "humaneval_pass_at_1": 0.84,
+        "mbpp_pass_at_1": 0.80
+      },
+      {
+        "name": "Claude-3.5-Sonnet",
+        "humaneval_pass_at_1": 0.82,
+        "mbpp_pass_at_1": 0.78
+      },
+      {
+        "name": "CodeLlama-34B",
+        "humaneval_pass_at_1": 0.68,
+        "mbpp_pass_at_1": 0.62
+      },
+      {
+        "name": "Troviku-1.1",
+        "humaneval_pass_at_1": 0.72,
+        "mbpp_pass_at_1": 0.68
+      }
+    ],
+    "notes": "Troviku-1.1 shows competitive performance with specialized optimization for coding tasks"
+  },
+  "limitations": {
+    "known_weaknesses": [
+      "Complex algorithmic problems requiring advanced mathematics",
+      "Very large codebases with deep context dependencies",
+      "Highly specialized domain-specific languages",
+      "Real-time system programming with strict performance constraints",
+      "Legacy code with unusual or deprecated patterns"
+    ],
+    "improvement_areas": [
+      "Enhanced multi-file context understanding",
+      "Better handling of rare programming languages",
+      "Improved optimization for competitive programming",
+      "Stronger performance on security-critical code"
+    ]
+  },
+  "methodology": {
+    "evaluation_framework": "Standard academic benchmarks with custom extensions",
+    "temperature_settings": "Varied by task type (0.2-0.8)",
+    "sampling_strategy": "Pass@k evaluated with k=[1, 10, 100]",
+    "test_execution": "Isolated sandbox environments with timeout limits",
+    "human_evaluation": "20% of outputs manually reviewed by senior engineers"
+  },
+  "reproducibility": {
+    "random_seed": 42,
+    "evaluation_code_repository": "https://github.com/OpenTrouter/troviku-eval",
+    "benchmark_versions": {
+      "humaneval": "v1.0",
+      "mbpp": "v1.0-sanitized",
+      "code_contests": "v1.0",
+      "multipl_e": "v0.2"
+    }
+  }
+}