| { | |
| "arc_challenge": { | |
| "alias": "arc_challenge", | |
| "acc,none": 0.5810580204778157, | |
| "acc_stderr,none": 0.014418106953639013, | |
| "acc_norm,none": 0.6092150170648464, | |
| "acc_norm_stderr,none": 0.014258563880513782 | |
| }, | |
| "gpqa_diamond_cot_n_shot": { | |
| "alias": "gpqa_diamond_cot_n_shot", | |
| "exact_match,strict-match": 0.0, | |
| "exact_match_stderr,strict-match": 0.0, | |
| "exact_match,flexible-extract": 0.1414141414141414, | |
| "exact_match_stderr,flexible-extract": 0.024825909793343322 | |
| }, | |
| "gpqa_diamond_cot_zeroshot": { | |
| "alias": "gpqa_diamond_cot_zeroshot", | |
| "exact_match,strict-match": 0.0, | |
| "exact_match_stderr,strict-match": 0.0, | |
| "exact_match,flexible-extract": 0.18686868686868688, | |
| "exact_match_stderr,flexible-extract": 0.027772533334218984 | |
| }, | |
| "gpqa_diamond_generative_n_shot": { | |
| "alias": "gpqa_diamond_generative_n_shot", | |
| "exact_match,strict-match": 0.0, | |
| "exact_match_stderr,strict-match": 0.0, | |
| "exact_match,flexible-extract": 0.21717171717171718, | |
| "exact_match_stderr,flexible-extract": 0.029376616484945633 | |
| }, | |
| "gpqa_diamond_n_shot": { | |
| "alias": "gpqa_diamond_n_shot", | |
| "acc,none": 0.3484848484848485, | |
| "acc_stderr,none": 0.03394853965156402, | |
| "acc_norm,none": 0.3484848484848485, | |
| "acc_norm_stderr,none": 0.03394853965156402 | |
| }, | |
| "gpqa_diamond_zeroshot": { | |
| "alias": "gpqa_diamond_zeroshot", | |
| "acc,none": 0.32323232323232326, | |
| "acc_stderr,none": 0.03332299921070644, | |
| "acc_norm,none": 0.32323232323232326, | |
| "acc_norm_stderr,none": 0.03332299921070644 | |
| }, | |
| "gpqa_extended_cot_n_shot": { | |
| "alias": "gpqa_extended_cot_n_shot", | |
| "exact_match,strict-match": 0.0, | |
| "exact_match_stderr,strict-match": 0.0, | |
| "exact_match,flexible-extract": 0.21428571428571427, | |
| "exact_match_stderr,flexible-extract": 0.01757643057660649 | |
| }, | |
| "gpqa_extended_cot_zeroshot": { | |
| "alias": "gpqa_extended_cot_zeroshot", | |
| "exact_match,strict-match": 0.0018315018315018315, | |
| "exact_match_stderr,strict-match": 0.001831501831501839, | |
| "exact_match,flexible-extract": 0.16666666666666666, | |
| "exact_match_stderr,flexible-extract": 0.01596377142035251 | |
| }, | |
| "gpqa_extended_generative_n_shot": { | |
| "alias": "gpqa_extended_generative_n_shot", | |
| "exact_match,strict-match": 0.0, | |
| "exact_match_stderr,strict-match": 0.0, | |
| "exact_match,flexible-extract": 0.27472527472527475, | |
| "exact_match_stderr,flexible-extract": 0.019120635768881594 | |
| }, | |
| "gpqa_extended_n_shot": { | |
| "alias": "gpqa_extended_n_shot", | |
| "acc,none": 0.30952380952380953, | |
| "acc_stderr,none": 0.01980264188017035, | |
| "acc_norm,none": 0.30952380952380953, | |
| "acc_norm_stderr,none": 0.01980264188017035 | |
| }, | |
| "gpqa_extended_zeroshot": { | |
| "alias": "gpqa_extended_zeroshot", | |
| "acc,none": 0.32051282051282054, | |
| "acc_stderr,none": 0.01999010546069712, | |
| "acc_norm,none": 0.32051282051282054, | |
| "acc_norm_stderr,none": 0.01999010546069712 | |
| }, | |
| "gpqa_main_cot_n_shot": { | |
| "alias": "gpqa_main_cot_n_shot", | |
| "exact_match,strict-match": 0.0, | |
| "exact_match_stderr,strict-match": 0.0, | |
| "exact_match,flexible-extract": 0.16071428571428573, | |
| "exact_match_stderr,flexible-extract": 0.017371142987257344 | |
| }, | |
| "gpqa_main_cot_zeroshot": { | |
| "alias": "gpqa_main_cot_zeroshot", | |
| "exact_match,strict-match": 0.0, | |
| "exact_match_stderr,strict-match": 0.0, | |
| "exact_match,flexible-extract": 0.171875, | |
| "exact_match_stderr,flexible-extract": 0.01784434214814947 | |
| }, | |
| "gpqa_main_generative_n_shot": { | |
| "alias": "gpqa_main_generative_n_shot", | |
| "exact_match,strict-match": 0.0, | |
| "exact_match_stderr,strict-match": 0.0, | |
| "exact_match,flexible-extract": 0.30580357142857145, | |
| "exact_match_stderr,flexible-extract": 0.021792582688756976 | |
| }, | |
| "gpqa_main_n_shot": { | |
| "alias": "gpqa_main_n_shot", | |
| "acc,none": 0.33705357142857145, | |
| "acc_stderr,none": 0.022358101465776416, | |
| "acc_norm,none": 0.33705357142857145, | |
| "acc_norm_stderr,none": 0.022358101465776416 | |
| }, | |
| "gpqa_main_zeroshot": { | |
| "alias": "gpqa_main_zeroshot", | |
| "acc,none": 0.34151785714285715, | |
| "acc_stderr,none": 0.022429776589214558, | |
| "acc_norm,none": 0.34151785714285715, | |
| "acc_norm_stderr,none": 0.022429776589214558 | |
| }, | |
| "ifeval": { | |
| "alias": "ifeval", | |
| "prompt_level_strict_acc,none": 0.36229205175600737, | |
| "prompt_level_strict_acc_stderr,none": 0.020684424314965397, | |
| "inst_level_strict_acc,none": 0.4784172661870504, | |
| "inst_level_strict_acc_stderr,none": "N/A", | |
| "prompt_level_loose_acc,none": 0.3844731977818854, | |
| "prompt_level_loose_acc_stderr,none": 0.020934357634584663, | |
| "inst_level_loose_acc,none": 0.5011990407673861, | |
| "inst_level_loose_acc_stderr,none": "N/A" | |
| } | |
| } |