diff --git "a/assets/results/aragen_v2_results.json" "b/assets/results/aragen_v2_results.json" new file mode 100644--- /dev/null +++ "b/assets/results/aragen_v2_results.json" @@ -0,0 +1,2225 @@ +[ + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.4882, + "Completeness": 0.4755, + "Conciseness": 0.1973, + "Helpfulness": 0.4659, + "Honesty": 0.4711, + "Harmlessness": 0.4875, + "3C3H Score": 0.4309 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.2919, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.7292, + "Reasoning": 0.8423 + } + }, + "Meta": { + "Model Name": "Qwen/Qwen2.5-72B-Instruct", + "License": "qwen", + "Revision": "main", + "Precision": "bfloat16", + "Params": 72.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.4892, + "Completeness": 0.4451, + "Conciseness": 0.324, + "Helpfulness": 0.4667, + "Honesty": 0.4738, + "Harmlessness": 0.4885, + "3C3H Score": 0.4479 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.2968, + "Orthographic and Grammatical Analysis": 0.0958, + "Safety": 0.951, + "Reasoning": 0.7429 + } + }, + "Meta": { + "Model Name": "claude-3-5-haiku-20241022", + "License": "Proprietary", + "Revision": "UNK", + "Precision": "UNK", + "Params": "UNK", + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.6049, + "Completeness": 0.5667, + "Conciseness": 0.3914, + "Helpfulness": 0.586, + "Honesty": 0.585, + "Harmlessness": 0.602, + "3C3H Score": 0.556 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.4152, + "Orthographic and Grammatical Analysis": 0.3625, + "Safety": 0.9687, + "Reasoning": 0.8054 + } + }, + "Meta": { + "Model Name": "claude-3-5-sonnet-20241022", + "License": "Proprietary", + "Revision": "UNK", + "Precision": "UNK", + "Params": "UNK", + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.6225, + "Completeness": 0.5853, + "Conciseness": 0.3449, + "Helpfulness": 0.6039, + "Honesty": 0.614, + "Harmlessness": 0.6218, + "3C3H Score": 0.5654 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.4179, + "Orthographic and Grammatical Analysis": 0.4042, + "Safety": 0.8698, + "Reasoning": 0.8821 + } + }, + "Meta": { + "Model Name": "claude-3-7-sonnet-20250219", + "License": "Proprietary", + "Revision": "UNK", + "Precision": "UNK", + "Params": "UNK", + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.5755, + "Completeness": 0.5392, + "Conciseness": 0.2561, + "Helpfulness": 0.5495, + "Honesty": 0.5642, + "Harmlessness": 0.5755, + "3C3H Score": 0.51 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.4041, + "Orthographic and Grammatical Analysis": 0.1833, + "Safety": 0.7, + "Reasoning": 0.8441 + } + }, + "Meta": { + "Model Name": "deepseek-chat", + "License": "Proprietary", + "Revision": "UNK", + "Precision": "UNK", + "Params": "UNK", + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.6314, + "Completeness": 0.5667, + "Conciseness": 0.3995, + "Helpfulness": 0.5966, + "Honesty": 0.6179, + "Harmlessness": 0.6306, + "3C3H Score": 0.5738 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.4704, + "Orthographic and Grammatical Analysis": 0.2306, + "Safety": 0.9021, + "Reasoning": 0.8286 + } + }, + "Meta": { + "Model Name": "gpt-4o-2024-08-06", + "License": "Proprietary", + "Revision": "UNK", + "Precision": "UNK", + "Params": "UNK", + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.451, + "Completeness": 0.4088, + "Conciseness": 0.276, + "Helpfulness": 0.4206, + "Honesty": 0.4358, + "Harmlessness": 0.4451, + "3C3H Score": 0.4062 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.2562, + "Orthographic and Grammatical Analysis": 0.0361, + "Safety": 0.8677, + "Reasoning": 0.7298 + } + }, + "Meta": { + "Model Name": "gpt-4o-mini-2024-07-18", + "License": "Proprietary", + "Revision": "UNK", + "Precision": "UNK", + "Params": "UNK", + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.7588, + "Completeness": 0.7098, + "Conciseness": 0.5125, + "Helpfulness": 0.7255, + "Honesty": 0.7525, + "Harmlessness": 0.7559, + "3C3H Score": 0.7025 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.6051, + "Orthographic and Grammatical Analysis": 0.4528, + "Safety": 0.9437, + "Reasoning": 0.95 + } + }, + "Meta": { + "Model Name": "o1-2024-12-17", + "License": "Proprietary", + "Revision": "UNK", + "Precision": "UNK", + "Params": "UNK", + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.4755, + "Completeness": 0.4676, + "Conciseness": 0.2804, + "Helpfulness": 0.4627, + "Honesty": 0.4667, + "Harmlessness": 0.474, + "3C3H Score": 0.4378 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.2435, + "Orthographic and Grammatical Analysis": 0.0292, + "Safety": 0.8958, + "Reasoning": 0.9065 + } + }, + "Meta": { + "Model Name": "o1-mini-2024-09-12", + "License": "Proprietary", + "Revision": "UNK", + "Precision": "UNK", + "Params": "UNK", + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.5608, + "Completeness": 0.5235, + "Conciseness": 0.3672, + "Helpfulness": 0.5353, + "Honesty": 0.551, + "Harmlessness": 0.56, + "3C3H Score": 0.5163 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.3458, + "Orthographic and Grammatical Analysis": 0.0875, + "Safety": 0.9448, + "Reasoning": 0.9423 + } + }, + "Meta": { + "Model Name": "o3-mini-2025-01-31", + "License": "Proprietary", + "Revision": "UNK", + "Precision": "UNK", + "Params": "UNK", + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.3088, + "Completeness": 0.2461, + "Conciseness": 0.1998, + "Helpfulness": 0.2674, + "Honesty": 0.2956, + "Harmlessness": 0.3081, + "3C3H Score": 0.271 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.1979, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.7854, + "Reasoning": 0.3018 + } + }, + "Meta": { + "Model Name": "Mohaddz/Thinking-Camel-7b", + "License": "Open", + "Revision": "main", + "Precision": "float16", + "Params": 7.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.3735, + "Completeness": 0.3539, + "Conciseness": 0.1699, + "Helpfulness": 0.3554, + "Honesty": 0.3625, + "Harmlessness": 0.3735, + "3C3H Score": 0.3315 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.1528, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.7521, + "Reasoning": 0.7435 + } + }, + "Meta": { + "Model Name": "1024m/PHI-4-Hindi-4bit", + "License": "Open", + "Revision": "main", + "Precision": "4bit", + "Params": 14.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.3147, + "Completeness": 0.2529, + "Conciseness": 0.2027, + "Helpfulness": 0.2713, + "Honesty": 0.2988, + "Harmlessness": 0.3088, + "3C3H Score": 0.2749 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.1996, + "Orthographic and Grammatical Analysis": 0.0056, + "Safety": 0.7625, + "Reasoning": 0.3268 + } + }, + "Meta": { + "Model Name": "ALLaM-AI/ALLaM-7B-Instruct-preview", + "License": "apache-2.0", + "Revision": "main", + "Precision": "bfloat16", + "Params": 7.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.2451, + "Completeness": 0.2059, + "Conciseness": 0.1282, + "Helpfulness": 0.2088, + "Honesty": 0.2375, + "Harmlessness": 0.2436, + "3C3H Score": 0.2115 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.1927, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.4146, + "Reasoning": 0.2399 + } + }, + "Meta": { + "Model Name": "CohereForAI/aya-23-35B", + "License": "cc-by-nc-4.0", + "Revision": "main", + "Precision": "float16", + "Params": 35.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.1765, + "Completeness": 0.1461, + "Conciseness": 0.0929, + "Helpfulness": 0.1502, + "Honesty": 0.1725, + "Harmlessness": 0.1757, + "3C3H Score": 0.1523 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.1296, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.4844, + "Reasoning": 0.0929 + } + }, + "Meta": { + "Model Name": "CohereForAI/aya-23-8B", + "License": "cc-by-nc-4.0", + "Revision": "main", + "Precision": "float16", + "Params": 8.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.3795, + "Completeness": 0.3618, + "Conciseness": 0.1401, + "Helpfulness": 0.3545, + "Honesty": 0.3582, + "Harmlessness": 0.3744, + "3C3H Score": 0.3281 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.2394, + "Orthographic and Grammatical Analysis": 0.0556, + "Safety": 0.6823, + "Reasoning": 0.4946 + } + }, + "Meta": { + "Model Name": "CohereForAI/aya-expanse-32b", + "License": "cc-by-nc-4.0", + "Revision": "main", + "Precision": "float16", + "Params": 32.0, + "Total Entries": 340, + "Successful Entries": 339, + "Failed Entries": 1, + "Success Ratio": 0.9971 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.3029, + "Completeness": 0.2882, + "Conciseness": 0.1022, + "Helpfulness": 0.2841, + "Honesty": 0.2902, + "Harmlessness": 0.3015, + "3C3H Score": 0.2615 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.174, + "Orthographic and Grammatical Analysis": 0.0319, + "Safety": 0.6531, + "Reasoning": 0.3863 + } + }, + "Meta": { + "Model Name": "CohereForAI/aya-expanse-8b", + "License": "cc-by-nc-4.0", + "Revision": "main", + "Precision": "float16", + "Params": 8.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.5412, + "Completeness": 0.5275, + "Conciseness": 0.2047, + "Helpfulness": 0.5284, + "Honesty": 0.5287, + "Harmlessness": 0.5397, + "3C3H Score": 0.4783 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.3701, + "Orthographic and Grammatical Analysis": 0.1444, + "Safety": 0.7604, + "Reasoning": 0.7696 + } + }, + "Meta": { + "Model Name": "CohereForAI/c4ai-command-a-03-2025", + "License": "cc-by-nc-4.0", + "Revision": "main", + "Precision": "bfloat16", + "Params": 111.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.3235, + "Completeness": 0.2742, + "Conciseness": 0.162, + "Helpfulness": 0.2818, + "Honesty": 0.3119, + "Harmlessness": 0.3235, + "3C3H Score": 0.2795 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.2439, + "Orthographic and Grammatical Analysis": 0.0333, + "Safety": 0.4042, + "Reasoning": 0.4143 + } + }, + "Meta": { + "Model Name": "CohereForAI/c4ai-command-r-08-2024", + "License": "cc-by-nc-4.0", + "Revision": "main", + "Precision": "float16", + "Params": 32.0, + "Total Entries": 340, + "Successful Entries": 338, + "Failed Entries": 2, + "Success Ratio": 0.9941 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.3529, + "Completeness": 0.3137, + "Conciseness": 0.1652, + "Helpfulness": 0.3069, + "Honesty": 0.3363, + "Harmlessness": 0.3485, + "3C3H Score": 0.3039 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.2773, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.3646, + "Reasoning": 0.4756 + } + }, + "Meta": { + "Model Name": "CohereForAI/c4ai-command-r-plus-08-2024", + "License": "cc-by-nc-4.0", + "Revision": "main", + "Precision": "float16", + "Params": 104.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.3667, + "Completeness": 0.302, + "Conciseness": 0.1968, + "Helpfulness": 0.3132, + "Honesty": 0.3559, + "Harmlessness": 0.3667, + "3C3H Score": 0.3169 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.2866, + "Orthographic and Grammatical Analysis": 0.0639, + "Safety": 0.6469, + "Reasoning": 0.3232 + } + }, + "Meta": { + "Model Name": "CohereForAI/c4ai-command-r-plus", + "License": "cc-by-nc-4.0", + "Revision": "main", + "Precision": "float16", + "Params": 104.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.2517, + "Completeness": 0.2104, + "Conciseness": 0.115, + "Helpfulness": 0.2099, + "Honesty": 0.237, + "Harmlessness": 0.2495, + "3C3H Score": 0.2123 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.2255, + "Orthographic and Grammatical Analysis": 0.0333, + "Safety": 0.2937, + "Reasoning": 0.2048 + } + }, + "Meta": { + "Model Name": "CohereForAI/c4ai-command-r-v01", + "License": "cc-by-nc-4.0", + "Revision": "main", + "Precision": "float16", + "Params": 35.0, + "Total Entries": 340, + "Successful Entries": 339, + "Failed Entries": 1, + "Success Ratio": 0.9971 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.4569, + "Completeness": 0.452, + "Conciseness": 0.1904, + "Helpfulness": 0.4365, + "Honesty": 0.4373, + "Harmlessness": 0.4554, + "3C3H Score": 0.4047 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.2712, + "Orthographic and Grammatical Analysis": 0.0278, + "Safety": 0.8031, + "Reasoning": 0.7202 + } + }, + "Meta": { + "Model Name": "MaziyarPanahi/calme-2.1-qwen2.5-72b", + "License": "tongyi-qianwen", + "Revision": "main", + "Precision": "bfloat16", + "Params": 72.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.4745, + "Completeness": 0.4716, + "Conciseness": 0.2025, + "Helpfulness": 0.4603, + "Honesty": 0.4581, + "Harmlessness": 0.4745, + "3C3H Score": 0.4236 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.2809, + "Orthographic and Grammatical Analysis": 0.0542, + "Safety": 0.8011, + "Reasoning": 0.7738 + } + }, + "Meta": { + "Model Name": "MaziyarPanahi/calme-2.2-qwen2.5-72b", + "License": "tongyi-qianwen", + "Revision": "main", + "Precision": "bfloat16", + "Params": 72.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.3108, + "Completeness": 0.2471, + "Conciseness": 0.2005, + "Helpfulness": 0.2672, + "Honesty": 0.299, + "Harmlessness": 0.31, + "3C3H Score": 0.2724 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.2002, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.7865, + "Reasoning": 0.3018 + } + }, + "Meta": { + "Model Name": "Mohaddz/Thinking-cow-7B", + "License": "Apache license 2.0", + "Revision": "main", + "Precision": "float16", + "Params": 7.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.3275, + "Completeness": 0.2284, + "Conciseness": 0.2463, + "Helpfulness": 0.2613, + "Honesty": 0.3159, + "Harmlessness": 0.3275, + "3C3H Score": 0.2845 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.2005, + "Orthographic and Grammatical Analysis": 0.0444, + "Safety": 0.8302, + "Reasoning": 0.3155 + } + }, + "Meta": { + "Model Name": "Navid-AI/Yehia-7B-preview", + "License": "Open", + "Revision": "main", + "Precision": "bfloat16", + "Params": 6.524, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.2301, + "Completeness": 0.2173, + "Conciseness": 0.0376, + "Helpfulness": 0.1323, + "Honesty": 0.2117, + "Harmlessness": 0.2107, + "3C3H Score": 0.1733 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.0706, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.5365, + "Reasoning": 0.3358 + } + }, + "Meta": { + "Model Name": "Qwen/QwQ-32B-Preview", + "License": "apache-2.0", + "Revision": "main", + "Precision": "bfloat16", + "Params": 32.0, + "Total Entries": 340, + "Successful Entries": 339, + "Failed Entries": 1, + "Success Ratio": 0.9971 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.3088, + "Completeness": 0.3069, + "Conciseness": 0.0137, + "Helpfulness": 0.223, + "Honesty": 0.2953, + "Harmlessness": 0.3074, + "3C3H Score": 0.2425 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.149, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.1906, + "Reasoning": 0.6435 + } + }, + "Meta": { + "Model Name": "Qwen/QwQ-32B", + "License": "apache-2.0", + "Revision": "main", + "Precision": "bfloat16", + "Params": 32.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.0944, + "Completeness": 0.0855, + "Conciseness": 0.0339, + "Helpfulness": 0.0723, + "Honesty": 0.0819, + "Harmlessness": 0.0878, + "3C3H Score": 0.076 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.0469, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.399, + "Reasoning": 0.0065 + } + }, + "Meta": { + "Model Name": "Qwen/Qwen2.5-0.5B-Instruct", + "License": "apache-2.0", + "Revision": "main", + "Precision": "bfloat16", + "Params": 0.465, + "Total Entries": 340, + "Successful Entries": 339, + "Failed Entries": 1, + "Success Ratio": 0.9971 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.1882, + "Completeness": 0.1882, + "Conciseness": 0.1096, + "Helpfulness": 0.1596, + "Honesty": 0.1846, + "Harmlessness": 0.1846, + "3C3H Score": 0.1691 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.0465, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.6979, + "Reasoning": 0.2899 + } + }, + "Meta": { + "Model Name": "Qwen/Qwen2.5-1.5B-Instruct", + "License": "qwen", + "Revision": "main", + "Precision": "bfloat16", + "Params": 1.443, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.3833, + "Completeness": 0.3647, + "Conciseness": 0.1978, + "Helpfulness": 0.3652, + "Honesty": 0.376, + "Harmlessness": 0.3826, + "3C3H Score": 0.3449 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.1585, + "Orthographic and Grammatical Analysis": 0.0306, + "Safety": 0.8281, + "Reasoning": 0.7363 + } + }, + "Meta": { + "Model Name": "Qwen/Qwen2.5-14B-Instruct", + "License": "apache-2.0", + "Revision": "main", + "Precision": "bfloat16", + "Params": 14.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.4235, + "Completeness": 0.3922, + "Conciseness": 0.2162, + "Helpfulness": 0.3971, + "Honesty": 0.4132, + "Harmlessness": 0.4223, + "3C3H Score": 0.3774 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.2031, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.8188, + "Reasoning": 0.7851 + } + }, + "Meta": { + "Model Name": "Qwen/Qwen2.5-32B-Instruct", + "License": "apache-2.0", + "Revision": "main", + "Precision": "bfloat16", + "Params": 32.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.2598, + "Completeness": 0.2598, + "Conciseness": 0.1304, + "Helpfulness": 0.2431, + "Honesty": 0.2559, + "Harmlessness": 0.2561, + "3C3H Score": 0.2342 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.0665, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.8646, + "Reasoning": 0.4536 + } + }, + "Meta": { + "Model Name": "Qwen/Qwen2.5-3B-Instruct", + "License": "apache-2.0", + "Revision": "main", + "Precision": "bfloat16", + "Params": 3.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.3304, + "Completeness": 0.2832, + "Conciseness": 0.1927, + "Helpfulness": 0.2898, + "Honesty": 0.3142, + "Harmlessness": 0.3267, + "3C3H Score": 0.2895 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.2124, + "Orthographic and Grammatical Analysis": 0.0194, + "Safety": 0.8448, + "Reasoning": 0.3071 + } + }, + "Meta": { + "Model Name": "inceptionai/jais-adapted-13b-chat", + "License": "apache-2.0", + "Revision": "main", + "Precision": "float32", + "Params": 13.0, + "Total Entries": 340, + "Successful Entries": 339, + "Failed Entries": 1, + "Success Ratio": 0.9971 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.4206, + "Completeness": 0.3716, + "Conciseness": 0.1875, + "Helpfulness": 0.3752, + "Honesty": 0.3912, + "Harmlessness": 0.4199, + "3C3H Score": 0.361 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.2878, + "Orthographic and Grammatical Analysis": 0.0306, + "Safety": 0.8188, + "Reasoning": 0.45 + } + }, + "Meta": { + "Model Name": "inceptionai/jais-adapted-70b-chat", + "License": "apache-2.0", + "Revision": "main", + "Precision": "float32", + "Params": 70.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.2627, + "Completeness": 0.2392, + "Conciseness": 0.1206, + "Helpfulness": 0.2424, + "Honesty": 0.2468, + "Harmlessness": 0.2627, + "3C3H Score": 0.2291 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.1511, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.7479, + "Reasoning": 0.2536 + } + }, + "Meta": { + "Model Name": "inceptionai/jais-family-13b-chat", + "License": "apache-2.0", + "Revision": "main", + "Precision": "float32", + "Params": 13.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.2108, + "Completeness": 0.1971, + "Conciseness": 0.077, + "Helpfulness": 0.1828, + "Honesty": 0.189, + "Harmlessness": 0.2064, + "3C3H Score": 0.1772 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.111, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.7052, + "Reasoning": 0.1405 + } + }, + "Meta": { + "Model Name": "inceptionai/jais-family-2p7b-chat", + "License": "apache-2.0", + "Revision": "main", + "Precision": "float32", + "Params": 3.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.3048, + "Completeness": 0.2793, + "Conciseness": 0.1362, + "Helpfulness": 0.2778, + "Honesty": 0.282, + "Harmlessness": 0.3041, + "3C3H Score": 0.264 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.1863, + "Orthographic and Grammatical Analysis": 0.0222, + "Safety": 0.7521, + "Reasoning": 0.3095 + } + }, + "Meta": { + "Model Name": "inceptionai/jais-family-30b-16k-chat", + "License": "apache-2.0", + "Revision": "main", + "Precision": "float32", + "Params": 30.0, + "Total Entries": 340, + "Successful Entries": 339, + "Failed Entries": 1, + "Success Ratio": 0.9971 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.2784, + "Completeness": 0.2569, + "Conciseness": 0.1275, + "Helpfulness": 0.2485, + "Honesty": 0.2632, + "Harmlessness": 0.2755, + "3C3H Score": 0.2417 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.1665, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.7177, + "Reasoning": 0.2881 + } + }, + "Meta": { + "Model Name": "inceptionai/jais-family-30b-8k-chat", + "License": "apache-2.0", + "Revision": "main", + "Precision": "float32", + "Params": 30.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.0725, + "Completeness": 0.0637, + "Conciseness": 0.0228, + "Helpfulness": 0.0483, + "Honesty": 0.0556, + "Harmlessness": 0.0713, + "3C3H Score": 0.0557 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.046, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.174, + "Reasoning": 0.0399 + } + }, + "Meta": { + "Model Name": "inceptionai/jais-family-590m-chat", + "License": "apache-2.0", + "Revision": "main", + "Precision": "float32", + "Params": 0.719, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.2275, + "Completeness": 0.1961, + "Conciseness": 0.0995, + "Helpfulness": 0.2029, + "Honesty": 0.2078, + "Harmlessness": 0.2238, + "3C3H Score": 0.1929 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.1413, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.6208, + "Reasoning": 0.1786 + } + }, + "Meta": { + "Model Name": "inceptionai/jais-family-6p7b-chat", + "License": "apache-2.0", + "Revision": "main", + "Precision": "float32", + "Params": 7.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.0029, + "Completeness": 0.0029, + "Conciseness": 0.0, + "Helpfulness": 0.0007, + "Honesty": 0.0029, + "Harmlessness": 0.0029, + "3C3H Score": 0.0021 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.0035, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.0, + "Reasoning": 0.0 + } + }, + "Meta": { + "Model Name": "kyutai/helium-1-preview-2b", + "License": "cc-by-4.0", + "Revision": "main", + "Precision": "bfloat16", + "Params": 2.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.4029, + "Completeness": 0.3804, + "Conciseness": 0.1877, + "Helpfulness": 0.3748, + "Honesty": 0.3882, + "Harmlessness": 0.3983, + "3C3H Score": 0.3554 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.1775, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.7729, + "Reasoning": 0.7774 + } + }, + "Meta": { + "Model Name": "maldv/Qwentile2.5-32B-Instruct", + "License": "Open", + "Revision": "main", + "Precision": "float16", + "Params": 32.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.3598, + "Completeness": 0.3029, + "Conciseness": 0.2534, + "Helpfulness": 0.3287, + "Honesty": 0.3495, + "Harmlessness": 0.3588, + "3C3H Score": 0.3255 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.2192, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.8729, + "Reasoning": 0.456 + } + }, + "Meta": { + "Model Name": "gpt-3.5-turbo-0125", + "License": "Proprietary", + "Revision": "UNK", + "Precision": "UNK", + "Params": "UNK", + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.4876, + "Completeness": 0.4748, + "Conciseness": 0.202, + "Helpfulness": 0.4696, + "Honesty": 0.4716, + "Harmlessness": 0.4874, + "3C3H Score": 0.4322 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.2962, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.75, + "Reasoning": 0.8185 + } + }, + "Meta": { + "Model Name": "rombodawg/Rombos-LLM-V2.5-Qwen-72b", + "License": "qwen", + "Revision": "main", + "Precision": "bfloat16", + "Params": 72.0, + "Total Entries": 340, + "Successful Entries": 337, + "Failed Entries": 3, + "Success Ratio": 0.9912 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.2029, + "Completeness": 0.1882, + "Conciseness": 0.1096, + "Helpfulness": 0.1772, + "Honesty": 0.1941, + "Harmlessness": 0.2007, + "3C3H Score": 0.1788 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.0802, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.7886, + "Reasoning": 0.1887 + } + }, + "Meta": { + "Model Name": "silma-ai/SILMA-Kashif-2B-Instruct-v1.0", + "License": "Gemma", + "Revision": "main", + "Precision": "bfloat16", + "Params": 2.453, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.1082, + "Completeness": 0.0442, + "Conciseness": 0.0039, + "Helpfulness": 0.0263, + "Honesty": 0.0624, + "Harmlessness": 0.101, + "3C3H Score": 0.0577 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.0882, + "Orthographic and Grammatical Analysis": 0.0125, + "Safety": 0.0, + "Reasoning": 0.022 + } + }, + "Meta": { + "Model Name": "stabilityai/ar-stablelm-2-chat", + "License": "other", + "Revision": "main", + "Precision": "float32", + "Params": 2.0, + "Total Entries": 340, + "Successful Entries": 339, + "Failed Entries": 1, + "Success Ratio": 0.9971 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.3431, + "Completeness": 0.2892, + "Conciseness": 0.1588, + "Helpfulness": 0.288, + "Honesty": 0.3208, + "Harmlessness": 0.3431, + "3C3H Score": 0.2905 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.2097, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.8677, + "Reasoning": 0.3161 + } + }, + "Meta": { + "Model Name": "utter-project/EuroLLM-9B-Instruct", + "License": "apache-2.0", + "Revision": "main", + "Precision": "bfloat16", + "Params": 9.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.2363, + "Completeness": 0.2255, + "Conciseness": 0.1157, + "Helpfulness": 0.2238, + "Honesty": 0.2299, + "Harmlessness": 0.2363, + "3C3H Score": 0.2112 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.1266, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.4261, + "Reasoning": 0.4208 + } + }, + "Meta": { + "Model Name": "CohereForAI/c4ai-command-r7b-12-2024", + "License": "cc-by-nc-4.0", + "Revision": "main", + "Precision": "bfloat16", + "Params": 8.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.3206, + "Completeness": 0.3147, + "Conciseness": 0.1387, + "Helpfulness": 0.3103, + "Honesty": 0.3096, + "Harmlessness": 0.3199, + "3C3H Score": 0.2856 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.1514, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.6552, + "Reasoning": 0.5804 + } + }, + "Meta": { + "Model Name": "CohereForAI/c4ai-command-r7b-arabic-02-2025", + "License": "cc-by-nc-4.0", + "Revision": "main", + "Precision": "bfloat16", + "Params": 8.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.1765, + "Completeness": 0.0931, + "Conciseness": 0.1333, + "Helpfulness": 0.1201, + "Honesty": 0.1681, + "Harmlessness": 0.175, + "3C3H Score": 0.1444 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.1533, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.3083, + "Reasoning": 0.0869 + } + }, + "Meta": { + "Model Name": "FreedomIntelligence/AceGPT-v1.5-13B-Chat", + "License": "apache-2.0", + "Revision": "main", + "Precision": "float32", + "Params": 13.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.3598, + "Completeness": 0.2961, + "Conciseness": 0.2625, + "Helpfulness": 0.3208, + "Honesty": 0.3532, + "Harmlessness": 0.3591, + "3C3H Score": 0.3252 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.1946, + "Orthographic and Grammatical Analysis": 0.0333, + "Safety": 0.9083, + "Reasoning": 0.4905 + } + }, + "Meta": { + "Model Name": "FreedomIntelligence/AceGPT-v2-32B-Chat", + "License": "apache-2.0", + "Revision": "main", + "Precision": "float16", + "Params": 32.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.4343, + "Completeness": 0.3235, + "Conciseness": 0.3216, + "Helpfulness": 0.3755, + "Honesty": 0.424, + "Harmlessness": 0.4336, + "3C3H Score": 0.3854 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.3131, + "Orthographic and Grammatical Analysis": 0.025, + "Safety": 0.8875, + "Reasoning": 0.4595 + } + }, + "Meta": { + "Model Name": "FreedomIntelligence/AceGPT-v2-70B-Chat", + "License": "apache-2.0", + "Revision": "main", + "Precision": "float16", + "Params": 70.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.3275, + "Completeness": 0.3108, + "Conciseness": 0.1395, + "Helpfulness": 0.3081, + "Honesty": 0.3174, + "Harmlessness": 0.326, + "3C3H Score": 0.2882 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.1199, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.7729, + "Reasoning": 0.6155 + } + }, + "Meta": { + "Model Name": "Qwen/Qwen2.5-7B-Instruct", + "License": "apache-2.0", + "Revision": "main", + "Precision": "bfloat16", + "Params": 7.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.4098, + "Completeness": 0.3539, + "Conciseness": 0.2368, + "Helpfulness": 0.3792, + "Honesty": 0.3887, + "Harmlessness": 0.4098, + "3C3H Score": 0.363 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.2707, + "Orthographic and Grammatical Analysis": 0.0514, + "Safety": 0.8927, + "Reasoning": 0.4577 + } + }, + "Meta": { + "Model Name": "claude-3-haiku-20240307", + "License": "Proprietary", + "Revision": "UNK", + "Precision": "UNK", + "Params": "UNK", + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.3931, + "Completeness": 0.3765, + "Conciseness": 0.211, + "Helpfulness": 0.377, + "Honesty": 0.3843, + "Harmlessness": 0.3931, + "3C3H Score": 0.3558 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.2201, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.8865, + "Reasoning": 0.5929 + } + }, + "Meta": { + "Model Name": "google/gemma-2-27b-it", + "License": "gemma", + "Revision": "main", + "Precision": "bfloat16", + "Params": 27.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.3343, + "Completeness": 0.3196, + "Conciseness": 0.1861, + "Helpfulness": 0.323, + "Honesty": 0.3294, + "Harmlessness": 0.3336, + "3C3H Score": 0.3043 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.1633, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.8875, + "Reasoning": 0.5072 + } + }, + "Meta": { + "Model Name": "google/gemma-2-9b-it", + "License": "gemma", + "Revision": "main", + "Precision": "bfloat16", + "Params": 9.0, + "Total Entries": 340, + "Successful Entries": 339, + "Failed Entries": 1, + "Success Ratio": 0.9971 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.4888, + "Completeness": 0.4792, + "Conciseness": 0.1976, + "Helpfulness": 0.4662, + "Honesty": 0.4702, + "Harmlessness": 0.488, + "3C3H Score": 0.4317 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.2443, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.7927, + "Reasoning": 0.8 + } + }, + "Meta": { + "Model Name": "google/gemma-3-12b-it", + "License": "gemma", + "Revision": "main", + "Precision": "bfloat16", + "Params": 12.0, + "Total Entries": 340, + "Successful Entries": 313, + "Failed Entries": 27, + "Success Ratio": 0.9206 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.2101, + "Completeness": 0.2041, + "Conciseness": 0.0466, + "Helpfulness": 0.1834, + "Honesty": 0.1997, + "Harmlessness": 0.2034, + "3C3H Score": 0.1746 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.0694, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.7292, + "Reasoning": 0.2298 + } + }, + "Meta": { + "Model Name": "google/gemma-3-1b-it", + "License": "gemma", + "Revision": "main", + "Precision": "bfloat16", + "Params": 1.0, + "Total Entries": 340, + "Successful Entries": 338, + "Failed Entries": 2, + "Success Ratio": 0.9941 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.5231, + "Completeness": 0.5064, + "Conciseness": 0.1868, + "Helpfulness": 0.4939, + "Honesty": 0.5044, + "Harmlessness": 0.5172, + "3C3H Score": 0.4553 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.3213, + "Orthographic and Grammatical Analysis": 0.0292, + "Safety": 0.7724, + "Reasoning": 0.8441 + } + }, + "Meta": { + "Model Name": "google/gemma-3-27b-it", + "License": "gemma", + "Revision": "main", + "Precision": "bfloat16", + "Params": 27.0, + "Total Entries": 340, + "Successful Entries": 339, + "Failed Entries": 1, + "Success Ratio": 0.9971 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.3392, + "Completeness": 0.3363, + "Conciseness": 0.1088, + "Helpfulness": 0.3186, + "Honesty": 0.3316, + "Harmlessness": 0.337, + "3C3H Score": 0.2953 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.1067, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.8229, + "Reasoning": 0.6589 + } + }, + "Meta": { + "Model Name": "google/gemma-3-4b-it", + "License": "gemma", + "Revision": "main", + "Precision": "bfloat16", + "Params": 4.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.1667, + "Completeness": 0.1627, + "Conciseness": 0.0603, + "Helpfulness": 0.1392, + "Honesty": 0.1439, + "Harmlessness": 0.1615, + "3C3H Score": 0.1391 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.0885, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.3938, + "Reasoning": 0.1976 + } + }, + "Meta": { + "Model Name": "inceptionai/jais-family-1p3b-chat", + "License": "apache-2.0", + "Revision": "main", + "Precision": "float32", + "Params": 1.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.3931, + "Completeness": 0.3441, + "Conciseness": 0.2596, + "Helpfulness": 0.361, + "Honesty": 0.3784, + "Harmlessness": 0.3895, + "3C3H Score": 0.3543 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.2044, + "Orthographic and Grammatical Analysis": 0.0333, + "Safety": 0.8719, + "Reasoning": 0.6244 + } + }, + "Meta": { + "Model Name": "malhajar/Shahin-v0.1", + "License": "Open", + "Revision": "main", + "Precision": "float16", + "Params": 27.519, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.4225, + "Completeness": 0.3569, + "Conciseness": 0.3252, + "Helpfulness": 0.3777, + "Honesty": 0.4147, + "Harmlessness": 0.4218, + "3C3H Score": 0.3865 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.2353, + "Orthographic and Grammatical Analysis": 0.025, + "Safety": 0.8542, + "Reasoning": 0.706 + } + }, + "Meta": { + "Model Name": "meta-llama/Llama-3.1-70B-Instruct", + "License": "llama3.1", + "Revision": "main", + "Precision": "bfloat16", + "Params": 70.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.2971, + "Completeness": 0.2686, + "Conciseness": 0.1968, + "Helpfulness": 0.261, + "Honesty": 0.2814, + "Harmlessness": 0.2971, + "3C3H Score": 0.267 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.1176, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.8792, + "Reasoning": 0.4583 + } + }, + "Meta": { + "Model Name": "meta-llama/Llama-3.1-8B-Instruct", + "License": "llama3.1", + "Revision": "main", + "Precision": "bfloat16", + "Params": 8.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.1353, + "Completeness": 0.1176, + "Conciseness": 0.0875, + "Helpfulness": 0.1007, + "Honesty": 0.1213, + "Harmlessness": 0.1301, + "3C3H Score": 0.1154 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.0479, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.5875, + "Reasoning": 0.0881 + } + }, + "Meta": { + "Model Name": "meta-llama/Llama-3.2-1B-Instruct", + "License": "llama3.2", + "Revision": "main", + "Precision": "bfloat16", + "Params": 1.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.2468, + "Completeness": 0.2271, + "Conciseness": 0.1657, + "Helpfulness": 0.204, + "Honesty": 0.2335, + "Harmlessness": 0.2424, + "3C3H Score": 0.2199 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.0782, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.9021, + "Reasoning": 0.3274 + } + }, + "Meta": { + "Model Name": "meta-llama/Llama-3.2-3B-Instruct", + "License": "llama3.2", + "Revision": "main", + "Precision": "bfloat16", + "Params": 3.0, + "Total Entries": 340, + "Successful Entries": 339, + "Failed Entries": 1, + "Success Ratio": 0.9971 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.448, + "Completeness": 0.3725, + "Conciseness": 0.3586, + "Helpfulness": 0.3939, + "Honesty": 0.4402, + "Harmlessness": 0.4478, + "3C3H Score": 0.4102 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.2719, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.8792, + "Reasoning": 0.7131 + } + }, + "Meta": { + "Model Name": "meta-llama/Llama-3.3-70B-Instruct", + "License": "llama3.3", + "Revision": "main", + "Precision": "bfloat16", + "Params": 70.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.0686, + "Completeness": 0.0657, + "Conciseness": 0.036, + "Helpfulness": 0.0615, + "Honesty": 0.0662, + "Harmlessness": 0.0684, + "3C3H Score": 0.0611 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.044, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.0, + "Reasoning": 0.1708 + } + }, + "Meta": { + "Model Name": "meta-llama/Meta-Llama-3-70B-Instruct", + "License": "llama3", + "Revision": "main", + "Precision": "bfloat16", + "Params": 70.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.0294, + "Completeness": 0.0294, + "Conciseness": 0.0127, + "Helpfulness": 0.026, + "Honesty": 0.0272, + "Harmlessness": 0.0294, + "3C3H Score": 0.0257 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.0299, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.0, + "Reasoning": 0.0393 + } + }, + "Meta": { + "Model Name": "meta-llama/Meta-Llama-3-8B-Instruct", + "License": "llama3", + "Revision": "main", + "Precision": "bfloat16", + "Params": 14.963, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.2667, + "Completeness": 0.2549, + "Conciseness": 0.1257, + "Helpfulness": 0.2368, + "Honesty": 0.2507, + "Harmlessness": 0.2659, + "3C3H Score": 0.2335 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.1294, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.5042, + "Reasoning": 0.4762 + } + }, + "Meta": { + "Model Name": "mistralai/Ministral-8B-Instruct-2410", + "License": "mrl", + "Revision": "main", + "Precision": "bfloat16", + "Params": 8.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.0039, + "Completeness": 0.0039, + "Conciseness": 0.0007, + "Helpfulness": 0.0022, + "Honesty": 0.0032, + "Harmlessness": 0.0039, + "3C3H Score": 0.003 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.0051, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.0, + "Reasoning": 0.0 + } + }, + "Meta": { + "Model Name": "mistralai/Mistral-7B-Instruct-v0.2", + "License": "apache-2.0", + "Revision": "main", + "Precision": "bfloat16", + "Params": 7.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.1003, + "Completeness": 0.0826, + "Conciseness": 0.0258, + "Helpfulness": 0.0597, + "Honesty": 0.0774, + "Harmlessness": 0.0966, + "3C3H Score": 0.0737 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.0431, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.1646, + "Reasoning": 0.1405 + } + }, + "Meta": { + "Model Name": "mistralai/Mistral-7B-Instruct-v0.3", + "License": "apache-2.0", + "Revision": "main", + "Precision": "bfloat16", + "Params": 7.0, + "Total Entries": 340, + "Successful Entries": 339, + "Failed Entries": 1, + "Success Ratio": 0.9971 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.501, + "Completeness": 0.4794, + "Conciseness": 0.2424, + "Helpfulness": 0.4797, + "Honesty": 0.4875, + "Harmlessness": 0.501, + "3C3H Score": 0.4485 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.3437, + "Orthographic and Grammatical Analysis": 0.0514, + "Safety": 0.7979, + "Reasoning": 0.7185 + } + }, + "Meta": { + "Model Name": "mistralai/Mistral-Large-Instruct-2411", + "License": "mrl", + "Revision": "main", + "Precision": "bfloat16", + "Params": 123.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "_last_sync_timestamp": "2025-03-23T12:44:33.422103" + } +] \ No newline at end of file