rwmasood commited on
Commit
11c88fe
·
verified ·
1 Parent(s): 777ca78

Upload llm-eval-DeepSeek-LLM-67B-Chat-gptq-8bit.json

Browse files
llm-eval-DeepSeek-LLM-67B-Chat-gptq-8bit.json ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "arc_challenge": {
3
+ "alias": "arc_challenge",
4
+ "acc,none": 0.5810580204778157,
5
+ "acc_stderr,none": 0.014418106953639013,
6
+ "acc_norm,none": 0.6092150170648464,
7
+ "acc_norm_stderr,none": 0.014258563880513782
8
+ },
9
+ "gpqa_diamond_cot_n_shot": {
10
+ "alias": "gpqa_diamond_cot_n_shot",
11
+ "exact_match,strict-match": 0.0,
12
+ "exact_match_stderr,strict-match": 0.0,
13
+ "exact_match,flexible-extract": 0.1414141414141414,
14
+ "exact_match_stderr,flexible-extract": 0.024825909793343322
15
+ },
16
+ "gpqa_diamond_cot_zeroshot": {
17
+ "alias": "gpqa_diamond_cot_zeroshot",
18
+ "exact_match,strict-match": 0.0,
19
+ "exact_match_stderr,strict-match": 0.0,
20
+ "exact_match,flexible-extract": 0.18686868686868688,
21
+ "exact_match_stderr,flexible-extract": 0.027772533334218984
22
+ },
23
+ "gpqa_diamond_generative_n_shot": {
24
+ "alias": "gpqa_diamond_generative_n_shot",
25
+ "exact_match,strict-match": 0.0,
26
+ "exact_match_stderr,strict-match": 0.0,
27
+ "exact_match,flexible-extract": 0.21717171717171718,
28
+ "exact_match_stderr,flexible-extract": 0.029376616484945633
29
+ },
30
+ "gpqa_diamond_n_shot": {
31
+ "alias": "gpqa_diamond_n_shot",
32
+ "acc,none": 0.3484848484848485,
33
+ "acc_stderr,none": 0.03394853965156402,
34
+ "acc_norm,none": 0.3484848484848485,
35
+ "acc_norm_stderr,none": 0.03394853965156402
36
+ },
37
+ "gpqa_diamond_zeroshot": {
38
+ "alias": "gpqa_diamond_zeroshot",
39
+ "acc,none": 0.32323232323232326,
40
+ "acc_stderr,none": 0.03332299921070644,
41
+ "acc_norm,none": 0.32323232323232326,
42
+ "acc_norm_stderr,none": 0.03332299921070644
43
+ },
44
+ "gpqa_extended_cot_n_shot": {
45
+ "alias": "gpqa_extended_cot_n_shot",
46
+ "exact_match,strict-match": 0.0,
47
+ "exact_match_stderr,strict-match": 0.0,
48
+ "exact_match,flexible-extract": 0.21428571428571427,
49
+ "exact_match_stderr,flexible-extract": 0.01757643057660649
50
+ },
51
+ "gpqa_extended_cot_zeroshot": {
52
+ "alias": "gpqa_extended_cot_zeroshot",
53
+ "exact_match,strict-match": 0.0018315018315018315,
54
+ "exact_match_stderr,strict-match": 0.001831501831501839,
55
+ "exact_match,flexible-extract": 0.16666666666666666,
56
+ "exact_match_stderr,flexible-extract": 0.01596377142035251
57
+ },
58
+ "gpqa_extended_generative_n_shot": {
59
+ "alias": "gpqa_extended_generative_n_shot",
60
+ "exact_match,strict-match": 0.0,
61
+ "exact_match_stderr,strict-match": 0.0,
62
+ "exact_match,flexible-extract": 0.27472527472527475,
63
+ "exact_match_stderr,flexible-extract": 0.019120635768881594
64
+ },
65
+ "gpqa_extended_n_shot": {
66
+ "alias": "gpqa_extended_n_shot",
67
+ "acc,none": 0.30952380952380953,
68
+ "acc_stderr,none": 0.01980264188017035,
69
+ "acc_norm,none": 0.30952380952380953,
70
+ "acc_norm_stderr,none": 0.01980264188017035
71
+ },
72
+ "gpqa_extended_zeroshot": {
73
+ "alias": "gpqa_extended_zeroshot",
74
+ "acc,none": 0.32051282051282054,
75
+ "acc_stderr,none": 0.01999010546069712,
76
+ "acc_norm,none": 0.32051282051282054,
77
+ "acc_norm_stderr,none": 0.01999010546069712
78
+ },
79
+ "gpqa_main_cot_n_shot": {
80
+ "alias": "gpqa_main_cot_n_shot",
81
+ "exact_match,strict-match": 0.0,
82
+ "exact_match_stderr,strict-match": 0.0,
83
+ "exact_match,flexible-extract": 0.16071428571428573,
84
+ "exact_match_stderr,flexible-extract": 0.017371142987257344
85
+ },
86
+ "gpqa_main_cot_zeroshot": {
87
+ "alias": "gpqa_main_cot_zeroshot",
88
+ "exact_match,strict-match": 0.0,
89
+ "exact_match_stderr,strict-match": 0.0,
90
+ "exact_match,flexible-extract": 0.171875,
91
+ "exact_match_stderr,flexible-extract": 0.01784434214814947
92
+ },
93
+ "gpqa_main_generative_n_shot": {
94
+ "alias": "gpqa_main_generative_n_shot",
95
+ "exact_match,strict-match": 0.0,
96
+ "exact_match_stderr,strict-match": 0.0,
97
+ "exact_match,flexible-extract": 0.30580357142857145,
98
+ "exact_match_stderr,flexible-extract": 0.021792582688756976
99
+ },
100
+ "gpqa_main_n_shot": {
101
+ "alias": "gpqa_main_n_shot",
102
+ "acc,none": 0.33705357142857145,
103
+ "acc_stderr,none": 0.022358101465776416,
104
+ "acc_norm,none": 0.33705357142857145,
105
+ "acc_norm_stderr,none": 0.022358101465776416
106
+ },
107
+ "gpqa_main_zeroshot": {
108
+ "alias": "gpqa_main_zeroshot",
109
+ "acc,none": 0.34151785714285715,
110
+ "acc_stderr,none": 0.022429776589214558,
111
+ "acc_norm,none": 0.34151785714285715,
112
+ "acc_norm_stderr,none": 0.022429776589214558
113
+ },
114
+ "ifeval": {
115
+ "alias": "ifeval",
116
+ "prompt_level_strict_acc,none": 0.36229205175600737,
117
+ "prompt_level_strict_acc_stderr,none": 0.020684424314965397,
118
+ "inst_level_strict_acc,none": 0.4784172661870504,
119
+ "inst_level_strict_acc_stderr,none": "N/A",
120
+ "prompt_level_loose_acc,none": 0.3844731977818854,
121
+ "prompt_level_loose_acc_stderr,none": 0.020934357634584663,
122
+ "inst_level_loose_acc,none": 0.5011990407673861,
123
+ "inst_level_loose_acc_stderr,none": "N/A"
124
+ }
125
+ }