{ "model": "Kirim-1-Math", "version": "1.0.0", "parameters": "30B", "evaluation_date": "2024-12-13", "temperature": 0.1, "sampling": "greedy", "mathematical_reasoning": { "GSM8K": { "accuracy": 0.942, "total_questions": 1319, "correct": 1242, "comparison": { "gpt4": 0.920, "claude_3_opus": 0.915, "best_open_source": 0.917 } }, "MATH": { "accuracy": 0.785, "total_questions": 5000, "correct": 3925, "breakdown_by_difficulty": { "level_1": 0.96, "level_2": 0.92, "level_3": 0.84, "level_4": 0.71, "level_5": 0.58 }, "breakdown_by_subject": { "algebra": 0.89, "counting_and_probability": 0.82, "geometry": 0.76, "intermediate_algebra": 0.81, "number_theory": 0.78, "prealgebra": 0.94, "precalculus": 0.73 }, "comparison": { "gpt4": 0.764, "claude_3_opus": 0.752, "best_open_source": 0.742 } }, "MMLU_Math": { "accuracy": 0.887, "subjects": { "abstract_algebra": 0.82, "college_mathematics": 0.89, "elementary_mathematics": 0.96, "high_school_mathematics": 0.91, "high_school_statistics": 0.85 } }, "Minerva_Math": { "accuracy": 0.452, "total_questions": 272, "correct": 123, "note": "Complex competition-level problems" }, "AMC10": { "accuracy": 0.723, "average_score": "18.1/25", "comparison": { "human_average": 0.48, "gpt4": 0.695 } }, "AMC12": { "accuracy": 0.723, "average_score": "18.1/25", "comparison": { "human_average": 0.42, "gpt4": 0.695 } }, "AIME": { "accuracy": 0.387, "average_score": "5.8/15", "comparison": { "human_qualifier_average": 0.40, "gpt4": 0.352 } } }, "tool_calling_evaluation": { "tool_selection_accuracy": { "score": 0.968, "description": "Correctly identifies which tool to use" }, "parameter_extraction_accuracy": { "score": 0.942, "description": "Correctly extracts parameters for tool calls" }, "execution_success_rate": { "score": 0.925, "description": "Tool calls execute without errors" }, "result_integration_accuracy": { "score": 0.951, "description": "Correctly uses tool results in final answer" }, "tool_usage_by_type": { "calculator": { "called": 5234, "successful": 4872, "success_rate": 0.931 }, "symbolic_solver": { "called": 3421, "successful": 3189, "success_rate": 0.932 }, "derivative": { "called": 1892, "successful": 1756, "success_rate": 0.928 }, "integrate": { "called": 1654, "successful": 1521, "success_rate": 0.920 }, "code_executor": { "called": 2341, "successful": 2103, "success_rate": 0.898 } } }, "code_generation": { "HumanEval_Math": { "pass_at_1": 0.783, "pass_at_10": 0.921, "language": "Python" }, "MBPP_Math": { "pass_at_1": 0.756, "pass_at_10": 0.894 }, "SymPy_Tasks": { "accuracy": 0.825, "tasks": "symbolic_manipulation" }, "NumPy_Tasks": { "accuracy": 0.756, "tasks": "numerical_computation" } }, "multilingual_math": { "chinese_math_problems": { "accuracy": 0.891, "total": 1000, "correct": 891, "sources": ["Gaokao", "Chinese_Olympiad"] }, "english_math_problems": { "accuracy": 0.887, "total": 1000, "correct": 887 }, "cross_lingual_consistency": { "score": 0.965, "description": "Same problem in different languages yields same answer" } }, "reasoning_quality": { "step_by_step_accuracy": { "score": 0.912, "description": "Each reasoning step is logically sound" }, "proof_validity": { "score": 0.834, "description": "Mathematical proofs are formally valid" }, "notation_correctness": { "score": 0.956, "description": "Mathematical notation is used correctly" }, "latex_formatting": { "score": 0.978, "description": "LaTeX output is properly formatted" } }, "performance_metrics": { "inference_speed": { "tokens_per_second": 45, "hardware": "A100 80GB", "batch_size": 1 }, "memory_usage": { "bf16": "60GB", "int8": "30GB", "int4": "20GB" }, "latency": { "mean_ms": 89, "p50_ms": 82, "p95_ms": 145, "p99_ms": 203 } }, "comparison_with_baselines": { "overall_math_score": { "kirim_1_math": 0.847, "gpt4": 0.826, "claude_3_opus": 0.814, "gemini_1_5_pro": 0.798, "llama_3_70b": 0.742, "mistral_large": 0.735 } }, "limitations": { "observed_failures": [ "Complex multi-variable calculus problems", "Abstract topology proofs", "Very large numerical computations without tools", "Problems requiring visual/geometric intuition", "Extremely novel mathematical concepts" ], "error_rate_by_difficulty": { "elementary": 0.04, "high_school": 0.08, "undergraduate": 0.15, "graduate": 0.28, "research": 0.45 } }, "notes": { "evaluation_methodology": "All benchmarks run with temperature=0.1 for deterministic results", "tool_calling": "Tool calling enabled for all evaluations", "verification": "Results verified by automated test suites and manual review", "reproducibility": "Seeds fixed for reproducible results" } }