{ "arc_challenge": { "alias": "arc_challenge", "acc,none": 0.5810580204778157, "acc_stderr,none": 0.014418106953639013, "acc_norm,none": 0.6092150170648464, "acc_norm_stderr,none": 0.014258563880513782 }, "gpqa_diamond_cot_n_shot": { "alias": "gpqa_diamond_cot_n_shot", "exact_match,strict-match": 0.0, "exact_match_stderr,strict-match": 0.0, "exact_match,flexible-extract": 0.1414141414141414, "exact_match_stderr,flexible-extract": 0.024825909793343322 }, "gpqa_diamond_cot_zeroshot": { "alias": "gpqa_diamond_cot_zeroshot", "exact_match,strict-match": 0.0, "exact_match_stderr,strict-match": 0.0, "exact_match,flexible-extract": 0.18686868686868688, "exact_match_stderr,flexible-extract": 0.027772533334218984 }, "gpqa_diamond_generative_n_shot": { "alias": "gpqa_diamond_generative_n_shot", "exact_match,strict-match": 0.0, "exact_match_stderr,strict-match": 0.0, "exact_match,flexible-extract": 0.21717171717171718, "exact_match_stderr,flexible-extract": 0.029376616484945633 }, "gpqa_diamond_n_shot": { "alias": "gpqa_diamond_n_shot", "acc,none": 0.3484848484848485, "acc_stderr,none": 0.03394853965156402, "acc_norm,none": 0.3484848484848485, "acc_norm_stderr,none": 0.03394853965156402 }, "gpqa_diamond_zeroshot": { "alias": "gpqa_diamond_zeroshot", "acc,none": 0.32323232323232326, "acc_stderr,none": 0.03332299921070644, "acc_norm,none": 0.32323232323232326, "acc_norm_stderr,none": 0.03332299921070644 }, "gpqa_extended_cot_n_shot": { "alias": "gpqa_extended_cot_n_shot", "exact_match,strict-match": 0.0, "exact_match_stderr,strict-match": 0.0, "exact_match,flexible-extract": 0.21428571428571427, "exact_match_stderr,flexible-extract": 0.01757643057660649 }, "gpqa_extended_cot_zeroshot": { "alias": "gpqa_extended_cot_zeroshot", "exact_match,strict-match": 0.0018315018315018315, "exact_match_stderr,strict-match": 0.001831501831501839, "exact_match,flexible-extract": 0.16666666666666666, "exact_match_stderr,flexible-extract": 0.01596377142035251 }, "gpqa_extended_generative_n_shot": { "alias": "gpqa_extended_generative_n_shot", "exact_match,strict-match": 0.0, "exact_match_stderr,strict-match": 0.0, "exact_match,flexible-extract": 0.27472527472527475, "exact_match_stderr,flexible-extract": 0.019120635768881594 }, "gpqa_extended_n_shot": { "alias": "gpqa_extended_n_shot", "acc,none": 0.30952380952380953, "acc_stderr,none": 0.01980264188017035, "acc_norm,none": 0.30952380952380953, "acc_norm_stderr,none": 0.01980264188017035 }, "gpqa_extended_zeroshot": { "alias": "gpqa_extended_zeroshot", "acc,none": 0.32051282051282054, "acc_stderr,none": 0.01999010546069712, "acc_norm,none": 0.32051282051282054, "acc_norm_stderr,none": 0.01999010546069712 }, "gpqa_main_cot_n_shot": { "alias": "gpqa_main_cot_n_shot", "exact_match,strict-match": 0.0, "exact_match_stderr,strict-match": 0.0, "exact_match,flexible-extract": 0.16071428571428573, "exact_match_stderr,flexible-extract": 0.017371142987257344 }, "gpqa_main_cot_zeroshot": { "alias": "gpqa_main_cot_zeroshot", "exact_match,strict-match": 0.0, "exact_match_stderr,strict-match": 0.0, "exact_match,flexible-extract": 0.171875, "exact_match_stderr,flexible-extract": 0.01784434214814947 }, "gpqa_main_generative_n_shot": { "alias": "gpqa_main_generative_n_shot", "exact_match,strict-match": 0.0, "exact_match_stderr,strict-match": 0.0, "exact_match,flexible-extract": 0.30580357142857145, "exact_match_stderr,flexible-extract": 0.021792582688756976 }, "gpqa_main_n_shot": { "alias": "gpqa_main_n_shot", "acc,none": 0.33705357142857145, "acc_stderr,none": 0.022358101465776416, "acc_norm,none": 0.33705357142857145, "acc_norm_stderr,none": 0.022358101465776416 }, "gpqa_main_zeroshot": { "alias": "gpqa_main_zeroshot", "acc,none": 0.34151785714285715, "acc_stderr,none": 0.022429776589214558, "acc_norm,none": 0.34151785714285715, "acc_norm_stderr,none": 0.022429776589214558 }, "ifeval": { "alias": "ifeval", "prompt_level_strict_acc,none": 0.36229205175600737, "prompt_level_strict_acc_stderr,none": 0.020684424314965397, "inst_level_strict_acc,none": 0.4784172661870504, "inst_level_strict_acc_stderr,none": "N/A", "prompt_level_loose_acc,none": 0.3844731977818854, "prompt_level_loose_acc_stderr,none": 0.020934357634584663, "inst_level_loose_acc,none": 0.5011990407673861, "inst_level_loose_acc_stderr,none": "N/A" } }