DeepSeek-LLM-67B-Chat-gptq-8bit / llm-eval-DeepSeek-LLM-67B-Chat-gptq-8bit.json
rwmasood's picture
Upload llm-eval-DeepSeek-LLM-67B-Chat-gptq-8bit.json
11c88fe verified
{
"arc_challenge": {
"alias": "arc_challenge",
"acc,none": 0.5810580204778157,
"acc_stderr,none": 0.014418106953639013,
"acc_norm,none": 0.6092150170648464,
"acc_norm_stderr,none": 0.014258563880513782
},
"gpqa_diamond_cot_n_shot": {
"alias": "gpqa_diamond_cot_n_shot",
"exact_match,strict-match": 0.0,
"exact_match_stderr,strict-match": 0.0,
"exact_match,flexible-extract": 0.1414141414141414,
"exact_match_stderr,flexible-extract": 0.024825909793343322
},
"gpqa_diamond_cot_zeroshot": {
"alias": "gpqa_diamond_cot_zeroshot",
"exact_match,strict-match": 0.0,
"exact_match_stderr,strict-match": 0.0,
"exact_match,flexible-extract": 0.18686868686868688,
"exact_match_stderr,flexible-extract": 0.027772533334218984
},
"gpqa_diamond_generative_n_shot": {
"alias": "gpqa_diamond_generative_n_shot",
"exact_match,strict-match": 0.0,
"exact_match_stderr,strict-match": 0.0,
"exact_match,flexible-extract": 0.21717171717171718,
"exact_match_stderr,flexible-extract": 0.029376616484945633
},
"gpqa_diamond_n_shot": {
"alias": "gpqa_diamond_n_shot",
"acc,none": 0.3484848484848485,
"acc_stderr,none": 0.03394853965156402,
"acc_norm,none": 0.3484848484848485,
"acc_norm_stderr,none": 0.03394853965156402
},
"gpqa_diamond_zeroshot": {
"alias": "gpqa_diamond_zeroshot",
"acc,none": 0.32323232323232326,
"acc_stderr,none": 0.03332299921070644,
"acc_norm,none": 0.32323232323232326,
"acc_norm_stderr,none": 0.03332299921070644
},
"gpqa_extended_cot_n_shot": {
"alias": "gpqa_extended_cot_n_shot",
"exact_match,strict-match": 0.0,
"exact_match_stderr,strict-match": 0.0,
"exact_match,flexible-extract": 0.21428571428571427,
"exact_match_stderr,flexible-extract": 0.01757643057660649
},
"gpqa_extended_cot_zeroshot": {
"alias": "gpqa_extended_cot_zeroshot",
"exact_match,strict-match": 0.0018315018315018315,
"exact_match_stderr,strict-match": 0.001831501831501839,
"exact_match,flexible-extract": 0.16666666666666666,
"exact_match_stderr,flexible-extract": 0.01596377142035251
},
"gpqa_extended_generative_n_shot": {
"alias": "gpqa_extended_generative_n_shot",
"exact_match,strict-match": 0.0,
"exact_match_stderr,strict-match": 0.0,
"exact_match,flexible-extract": 0.27472527472527475,
"exact_match_stderr,flexible-extract": 0.019120635768881594
},
"gpqa_extended_n_shot": {
"alias": "gpqa_extended_n_shot",
"acc,none": 0.30952380952380953,
"acc_stderr,none": 0.01980264188017035,
"acc_norm,none": 0.30952380952380953,
"acc_norm_stderr,none": 0.01980264188017035
},
"gpqa_extended_zeroshot": {
"alias": "gpqa_extended_zeroshot",
"acc,none": 0.32051282051282054,
"acc_stderr,none": 0.01999010546069712,
"acc_norm,none": 0.32051282051282054,
"acc_norm_stderr,none": 0.01999010546069712
},
"gpqa_main_cot_n_shot": {
"alias": "gpqa_main_cot_n_shot",
"exact_match,strict-match": 0.0,
"exact_match_stderr,strict-match": 0.0,
"exact_match,flexible-extract": 0.16071428571428573,
"exact_match_stderr,flexible-extract": 0.017371142987257344
},
"gpqa_main_cot_zeroshot": {
"alias": "gpqa_main_cot_zeroshot",
"exact_match,strict-match": 0.0,
"exact_match_stderr,strict-match": 0.0,
"exact_match,flexible-extract": 0.171875,
"exact_match_stderr,flexible-extract": 0.01784434214814947
},
"gpqa_main_generative_n_shot": {
"alias": "gpqa_main_generative_n_shot",
"exact_match,strict-match": 0.0,
"exact_match_stderr,strict-match": 0.0,
"exact_match,flexible-extract": 0.30580357142857145,
"exact_match_stderr,flexible-extract": 0.021792582688756976
},
"gpqa_main_n_shot": {
"alias": "gpqa_main_n_shot",
"acc,none": 0.33705357142857145,
"acc_stderr,none": 0.022358101465776416,
"acc_norm,none": 0.33705357142857145,
"acc_norm_stderr,none": 0.022358101465776416
},
"gpqa_main_zeroshot": {
"alias": "gpqa_main_zeroshot",
"acc,none": 0.34151785714285715,
"acc_stderr,none": 0.022429776589214558,
"acc_norm,none": 0.34151785714285715,
"acc_norm_stderr,none": 0.022429776589214558
},
"ifeval": {
"alias": "ifeval",
"prompt_level_strict_acc,none": 0.36229205175600737,
"prompt_level_strict_acc_stderr,none": 0.020684424314965397,
"inst_level_strict_acc,none": 0.4784172661870504,
"inst_level_strict_acc_stderr,none": "N/A",
"prompt_level_loose_acc,none": 0.3844731977818854,
"prompt_level_loose_acc_stderr,none": 0.020934357634584663,
"inst_level_loose_acc,none": 0.5011990407673861,
"inst_level_loose_acc_stderr,none": "N/A"
}
}