PeterKruger commited on
Commit
cf5cf07
·
verified ·
1 Parent(s): 838b31f

Upload 8 files

Browse files
runs/run-2025-12-16/avg_latency.csv CHANGED
@@ -8,6 +8,7 @@ Deepseek-v3.2-speciale,454.4097,373.5313,230.8446,213.1212,265.5541,201.2352,651
8
  Gemini-2.5-flash,122.2311,28.9762,68.5994,58.9803,57.4608,53.4325,77.5255,70.8783,47.9233,65.6251,65.61706446
9
  Gemini-2.5-flash-lite,27.6891,7.4738,22.1568,13.5681,17.9023,19.6096,31.563,27.4738,17.0569,16.9643,20.41722765
10
  Gemini-2.5-pro,109.9939,37.5364,99.5693,87.053,65.1453,74.7684,125.0731,98.0517,75.3436,88.0257,86.79510542
 
11
  Gemini-3-pro-preview,111.2193,41.256,68.7167,67.4612,63.8528,59.1828,138.2422,94.61,57.7557,63.3303,76.10772546
12
  GLM-4.5-Air,250.8921,33.182,130.3234,118.605,102.7298,125.2574,275.6033,337.8757,103.0877,122.1762,163.1509648
13
  GLM-4.6,187.9063,59.6837,192.016,172.9087,143.9757,161.7514,273.6536,325.9543,153.79,160.8635,187.4263836
@@ -28,9 +29,11 @@ Minimax-m2,258.3904,80.0205,108.2872,89.6492,87.1081,80.6304,210.7194,236.3157,1
28
  Ministral-8b-2512,28.8863,22.0955,41.2578,28.7331,21.6026,29.6067,54.5365,24.612,32.0725,33.7791,31.40083599
29
  Mistral-large-2512,79.2901,28.5084,126.2813,111.757,80.9586,115.3766,70.4993,54.8645,93.7809,116.9721,89.96343603
30
  Mistral-medium-3.1,37.2227,15.6724,81.6154,70.4164,46.7696,67.9571,43.1978,23.2783,55.3481,69.7366,52.24553215
 
31
  Nemotron-nano-9b-v2,124.962,41.4807,47.0715,36.6855,47.2949,41.3541,127.955,107.3714,40.8651,55.1499,66.77738031
32
  Nova-2-lite-v1,86.8579,56.826,67.0827,61.455,57.903,54.8436,51.9657,53.2645,60.4673,59.8649,61.45748847
33
  Nova-premier-v1,59.2369,19.7099,61.1146,61.1911,38.9898,55.9562,49.0279,50.0873,49.4701,62.2549,51.84074232
 
34
  Qwen3-235b-a22b-2507,133.3347,35.1341,78.9824,71.2207,115.1465,81.8462,163.3372,184.0751,89.4845,78.7041,104.7811018
35
  Qwen3-235B-A22B-Thinking-2507,548.9492,102.3888,246.6013,235.7818,240.6059,257.2741,631.6219,589.9518,236.4708,208.5225,316.8201599
36
  Qwen3-next-80b-a3b-thinking,105.9201,48.549,77.5874,69.3386,85.093,65.7956,98.0509,85.3588,71.0135,68.6137,77.75939135
 
8
  Gemini-2.5-flash,122.2311,28.9762,68.5994,58.9803,57.4608,53.4325,77.5255,70.8783,47.9233,65.6251,65.61706446
9
  Gemini-2.5-flash-lite,27.6891,7.4738,22.1568,13.5681,17.9023,19.6096,31.563,27.4738,17.0569,16.9643,20.41722765
10
  Gemini-2.5-pro,109.9939,37.5364,99.5693,87.053,65.1453,74.7684,125.0731,98.0517,75.3436,88.0257,86.79510542
11
+ Gemini-3-flash-preview,53.7734,32.3758,26.7758,22.85,25.7441,21.9813,151.4909,80.1863,24.0318,25.9412,45.56031499
12
  Gemini-3-pro-preview,111.2193,41.256,68.7167,67.4612,63.8528,59.1828,138.2422,94.61,57.7557,63.3303,76.10772546
13
  GLM-4.5-Air,250.8921,33.182,130.3234,118.605,102.7298,125.2574,275.6033,337.8757,103.0877,122.1762,163.1509648
14
  GLM-4.6,187.9063,59.6837,192.016,172.9087,143.9757,161.7514,273.6536,325.9543,153.79,160.8635,187.4263836
 
29
  Ministral-8b-2512,28.8863,22.0955,41.2578,28.7331,21.6026,29.6067,54.5365,24.612,32.0725,33.7791,31.40083599
30
  Mistral-large-2512,79.2901,28.5084,126.2813,111.757,80.9586,115.3766,70.4993,54.8645,93.7809,116.9721,89.96343603
31
  Mistral-medium-3.1,37.2227,15.6724,81.6154,70.4164,46.7696,67.9571,43.1978,23.2783,55.3481,69.7366,52.24553215
32
+ Nemotron-3-nano-30b-a3b,32.6547,32.7213,24.9018,19.692,22.4484,21.2114,61.7155,40.3934,22.7944,26.9905,30.08154062
33
  Nemotron-nano-9b-v2,124.962,41.4807,47.0715,36.6855,47.2949,41.3541,127.955,107.3714,40.8651,55.1499,66.77738031
34
  Nova-2-lite-v1,86.8579,56.826,67.0827,61.455,57.903,54.8436,51.9657,53.2645,60.4673,59.8649,61.45748847
35
  Nova-premier-v1,59.2369,19.7099,61.1146,61.1911,38.9898,55.9562,49.0279,50.0873,49.4701,62.2549,51.84074232
36
+ Olmo-3.1-32b-think,177.482,130.5138,74.441,78.2192,108.5078,71.7146,215.9375,202.7508,97.1318,80.6868,122.4217364
37
  Qwen3-235b-a22b-2507,133.3347,35.1341,78.9824,71.2207,115.1465,81.8462,163.3372,184.0751,89.4845,78.7041,104.7811018
38
  Qwen3-235B-A22B-Thinking-2507,548.9492,102.3888,246.6013,235.7818,240.6059,257.2741,631.6219,589.9518,236.4708,208.5225,316.8201599
39
  Qwen3-next-80b-a3b-thinking,105.9201,48.549,77.5874,69.3386,85.093,65.7956,98.0509,85.3588,71.0135,68.6137,77.75939135
runs/run-2025-12-16/correlations.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "correlations": {
3
  "LMArena": 69.19,
4
- "Artificial Analysis Intelligence Index": 89.38,
5
- "MMLU": 82.21
6
 
7
  },
8
  "description": "Correlation percentages between AutoBench scores and other benchmark scores"
 
1
  {
2
  "correlations": {
3
  "LMArena": 69.19,
4
+ "Artificial Analysis Intelligence Index": 89.52,
5
+ "MMLU": 82.64
6
 
7
  },
8
  "description": "Correlation percentages between AutoBench scores and other benchmark scores"
runs/run-2025-12-16/cost_data.csv CHANGED
@@ -2,17 +2,18 @@
2
  Claude-haiku-4.5,0.04796761,0.03238883,0.03107844,0.02638083,0.03072529,0.02659291,0.06254624,0.05931721,0.02862727,0.03348466,0.037873971
3
  Claude-opus-4.5,0.29227484,0.14104891,0.12830818,0.13549917,0.11872339,0.13533474,0.25805,0.22583962,0.13629584,0.15803141,0.172599703
4
  Claude-sonnet-4.5,0.14973639,0.07717565,0.10377029,0.0932825,0.07577582,0.08187862,0.1930656,0.17324842,0.08588089,0.10120519,0.113931502
5
- DeepSeek-R1-0528,0.01875911,0.00303728,0.0047644,0.00431912,0.00500901,0.00403824,0.02015593,0.02310714,0.00674013,0.00709617,0.009866481
6
  Deepseek-v3.2,0.00109186,0.00052365,0.00081581,0.00070466,0.00075338,0.00075529,0.00153214,0.00117388,0.00079126,0.00072429,0.00088526
7
  Deepseek-v3.2-speciale,0.00672248,0.00675838,0.00320302,0.00289234,0.00444567,0.00283425,0.01029455,0.00628261,0.00329474,0.00279927,0.004672989
8
- Gemini-2.5-flash,0.03803445,0.0117833,0.01362167,0.01224066,0.01833685,0.0121711,0.03863442,0.03681212,0.01540965,0.0149641,0.021217848
9
  Gemini-2.5-flash-lite,0.00277171,0.00084387,0.00132463,0.00109807,0.00167517,0.00119299,0.00468076,0.00449945,0.00172071,0.00134208,0.002139796
10
- Gemini-2.5-pro,0.08474181,0.03169886,0.04509337,0.04217825,0.04990143,0.04269468,0.13244895,0.11265404,0.05585932,0.04711418,0.064793075
 
11
  Gemini-3-pro-preview,0.09641867,0.04151,0.0406917,0.0404174,0.05291838,0.03837331,0.16871096,0.12023964,0.05092595,0.04228494,0.068499733
12
  GLM-4.5-Air,0.00820787,0.0013094,0.00317276,0.00338355,0.00377618,0.00329219,0.01023654,0.0127487,0.00321646,0.00340004,0.005355855
13
  GLM-4.6,0.01399979,0.00462601,0.00890891,0.01057332,0.01132542,0.01130957,0.02189323,0.02268625,0.01008674,0.00841888,0.01253707
14
  Gpt-5.1,0.13881786,0.07678016,0.09721818,0.10773247,0.09562121,0.09244821,0.16249342,0.11152067,0.08273436,0.12518665,0.108002358
15
- Gpt-5.2,0.13035496,0.04910371,0.05832808,0.03989294,0.06298887,0.0454307,0.14882455,0.1013052,0.0541931,0.05409835,0.073564969
16
  Gpt-5.2-pro,1.0698774,0.59090945,0.693588,0.54748576,0.80446333,0.5967108,1.50086726,1.25455216,0.57316151,0.63746686,0.818815856
17
  Gpt-5-mini,0.01101629,0.0066571,0.00806052,0.00685722,0.0085612,0.00951664,0.01375456,0.00993965,0.00845582,0.00856096,0.009137171
18
  Gpt-5-nano,0.00380816,0.00252091,0.00245592,0.00244178,0.00397283,0.00255242,0.00575112,0.00493667,0.00303911,0.00256656,0.003385019
@@ -24,13 +25,15 @@ Grok-4.1-fast-thinking,0.00413646,0.00118447,0.00132395,0.00122325,0.0020697,0.0
24
  Kimi-k2-0905,0.0040027,0.00200646,0.00369414,0.00327289,0.00220109,0.00368381,0.00514003,0.0031238,0.00287381,0.00342115,0.003344045
25
  Kimi-k2-thinking,0.02400572,0.01381101,0.01216847,0.01033478,0.01514665,0.01051941,0.0377184,0.03795372,0.01288832,0.01324748,0.018558145
26
  Llama-3.3-nemotron-super-49b-v1.5,0.00300765,0.00122499,0.00098242,0.00099594,0.00108555,0.00106881,0.00381754,0.00365907,0.00140375,0.00112887,0.001833066
27
- Minimax-m2,0.01213038,0.00553417,0.00320592,0.00289512,0.00458057,0.00287236,0.01794542,0.01389677,0.005267,0.00354698,0.00711788
28
- Ministral-8b-2512,0.00046259,0.00026429,0.00054361,0.00049856,0.00041828,0.00048709,0.00079872,0.00051314,0.00042434,0.00049399,0.000489568
29
- Mistral-large-2512,0.00471713,0.00195759,0.00594909,0.00554823,0.00492521,0.00553417,0.0054327,0.00507622,0.0053989,0.00568219,0.005120599
30
  Mistral-medium-3.1,0.00336013,0.00167505,0.00473395,0.0041676,0.00330309,0.00418606,0.0048653,0.00326612,0.00349375,0.00414133,0.003751786
 
31
  Nemotron-nano-9b-v2,0.00153706,0.00057663,0.00044255,0.00039797,0.0005314,0.00045132,0.00166813,0.00156872,0.00053541,0.00051641,0.000820591
32
  Nova-2-lite-v1,0.07547446,0.04099998,0.03170261,0.0309826,0.0359062,0.02763011,0.04359764,0.04758953,0.03771458,0.03259245,0.039437834
33
  Nova-premier-v1,0.01677879,0.00763352,0.01237621,0.01206467,0.01235429,0.01140471,0.0148664,0.01551314,0.01240074,0.01254813,0.012950513
 
34
  Qwen3-235b-a22b-2507,0.00184113,0.00043658,0.00158505,0.00120764,0.00209282,0.00146099,0.00359504,0.00329687,0.00165955,0.00172153,0.001917167
35
  Qwen3-235B-A22B-Thinking-2507,0.00549796,0.00113073,0.00201848,0.00202241,0.00223348,0.00210888,0.00760879,0.00690929,0.00217258,0.00206767,0.003170949
36
  Qwen3-next-80b-a3b-thinking,0.0099701,0.00608854,0.00549343,0.00515869,0.00768631,0.00488623,0.01232633,0.01059962,0.00707064,0.00592575,0.007494371
 
2
  Claude-haiku-4.5,0.04796761,0.03238883,0.03107844,0.02638083,0.03072529,0.02659291,0.06254624,0.05931721,0.02862727,0.03348466,0.037873971
3
  Claude-opus-4.5,0.29227484,0.14104891,0.12830818,0.13549917,0.11872339,0.13533474,0.25805,0.22583962,0.13629584,0.15803141,0.172599703
4
  Claude-sonnet-4.5,0.14973639,0.07717565,0.10377029,0.0932825,0.07577582,0.08187862,0.1930656,0.17324842,0.08588089,0.10120519,0.113931502
5
+ DeepSeek-R1-0528,0.01875911,0.00303728,0.0047644,0.00431912,0.00500901,0.00403824,0.02015693,0.02310714,0.00674013,0.00709617,0.009866481
6
  Deepseek-v3.2,0.00109186,0.00052365,0.00081581,0.00070466,0.00075338,0.00075529,0.00153214,0.00117388,0.00079126,0.00072429,0.00088526
7
  Deepseek-v3.2-speciale,0.00672248,0.00675838,0.00320302,0.00289234,0.00444567,0.00283425,0.01029455,0.00628261,0.00329474,0.00279927,0.004672989
8
+ Gemini-2.5-flash,0.03803445,0.0117833,0.01362167,0.01224066,0.01833685,0.0121711,0.03863428,0.03681212,0.01540965,0.0149641,0.021217848
9
  Gemini-2.5-flash-lite,0.00277171,0.00084387,0.00132463,0.00109807,0.00167517,0.00119299,0.00468076,0.00449945,0.00172071,0.00134208,0.002139796
10
+ Gemini-2.5-pro,0.08474266,0.03169886,0.04509337,0.04217825,0.04990143,0.04269468,0.13244895,0.11265404,0.05585932,0.04711418,0.064793075
11
+ Gemini-3-flash-preview,0.02435909,0.0136177,0.00739711,0.00733464,0.00987005,0.0071199,0.06845508,0.04256463,0.00924835,0.00750067,0.01945601
12
  Gemini-3-pro-preview,0.09641867,0.04151,0.0406917,0.0404174,0.05291838,0.03837331,0.16871096,0.12023964,0.05092595,0.04228494,0.068499733
13
  GLM-4.5-Air,0.00820787,0.0013094,0.00317276,0.00338355,0.00377618,0.00329219,0.01023654,0.0127487,0.00321646,0.00340004,0.005355855
14
  GLM-4.6,0.01399979,0.00462601,0.00890891,0.01057332,0.01132542,0.01130957,0.02189323,0.02268625,0.01008674,0.00841888,0.01253707
15
  Gpt-5.1,0.13881786,0.07678016,0.09721818,0.10773247,0.09562121,0.09244821,0.16249342,0.11152067,0.08273436,0.12518665,0.108002358
16
+ Gpt-5.2,0.13035496,0.04910371,0.05832808,0.03989294,0.06298887,0.0454307,0.14882455,0.1013052,0.0541931,0.05409845,0.073564969
17
  Gpt-5.2-pro,1.0698774,0.59090945,0.693588,0.54748576,0.80446333,0.5967108,1.50086726,1.25455216,0.57316151,0.63746686,0.818815856
18
  Gpt-5-mini,0.01101629,0.0066571,0.00806052,0.00685722,0.0085612,0.00951664,0.01375456,0.00993965,0.00845582,0.00856096,0.009137171
19
  Gpt-5-nano,0.00380816,0.00252091,0.00245592,0.00244178,0.00397283,0.00255242,0.00575112,0.00493667,0.00303911,0.00256656,0.003385019
 
25
  Kimi-k2-0905,0.0040027,0.00200646,0.00369414,0.00327289,0.00220109,0.00368381,0.00514003,0.0031238,0.00287381,0.00342115,0.003344045
26
  Kimi-k2-thinking,0.02400572,0.01381101,0.01216847,0.01033478,0.01514665,0.01051941,0.0377184,0.03795372,0.01288832,0.01324748,0.018558145
27
  Llama-3.3-nemotron-super-49b-v1.5,0.00300765,0.00122499,0.00098242,0.00099594,0.00108555,0.00106881,0.00381754,0.00365907,0.00140375,0.00112887,0.001833066
28
+ Minimax-m2,0.01213038,0.00553417,0.00320592,0.00289512,0.00458057,0.00287236,0.01794542,0.01389664,0.005267,0.00354698,0.00711788
29
+ Ministral-8b-2512,0.00046259,0.00026428,0.00054361,0.00049856,0.00041828,0.00048709,0.00079872,0.00051314,0.00042434,0.00049399,0.000489568
30
+ Mistral-large-2512,0.00471713,0.00195759,0.00594909,0.00554823,0.00492521,0.00553417,0.0054327,0.00507622,0.00539897,0.00568219,0.005120599
31
  Mistral-medium-3.1,0.00336013,0.00167505,0.00473395,0.0041676,0.00330309,0.00418606,0.0048653,0.00326612,0.00349375,0.00414133,0.003751786
32
+ Nemotron-3-nano-30b-a3b,0,0,0,0,0,0,0,0,0,0,0
33
  Nemotron-nano-9b-v2,0.00153706,0.00057663,0.00044255,0.00039797,0.0005314,0.00045132,0.00166813,0.00156872,0.00053541,0.00051641,0.000820591
34
  Nova-2-lite-v1,0.07547446,0.04099998,0.03170261,0.0309826,0.0359062,0.02763011,0.04359764,0.04758953,0.03771458,0.03259245,0.039437834
35
  Nova-premier-v1,0.01677879,0.00763352,0.01237621,0.01206467,0.01235429,0.01140471,0.0148664,0.01551314,0.01240074,0.01254813,0.012950513
36
+ Olmo-3.1-32b-think,0,0,0,0,0,0,0,0,0,0,0
37
  Qwen3-235b-a22b-2507,0.00184113,0.00043658,0.00158505,0.00120764,0.00209282,0.00146099,0.00359504,0.00329687,0.00165955,0.00172153,0.001917167
38
  Qwen3-235B-A22B-Thinking-2507,0.00549796,0.00113073,0.00201848,0.00202241,0.00223348,0.00210888,0.00760879,0.00690929,0.00217258,0.00206767,0.003170949
39
  Qwen3-next-80b-a3b-thinking,0.0099701,0.00608854,0.00549343,0.00515869,0.00768631,0.00488623,0.01232633,0.01059962,0.00707064,0.00592575,0.007494371
runs/run-2025-12-16/domain_ranks.csv CHANGED
@@ -8,6 +8,7 @@ Deepseek-v3.2-speciale,4.0964,3.7169,4.1727,4.2891,4.2365,4.3793,3.558,3.9619,4.
8
  Gemini-2.5-flash,4.01,4.046,4.3665,4.215,4.3058,4.2863,3.8277,4.024,4.1208,4.4398,4.171935
9
  Gemini-2.5-flash-lite,3.9154,4.0888,4.2033,4.15,3.7796,4.3708,3.2679,3.3294,4.0502,4.2879,3.94904
10
  Gemini-2.5-pro,4.0197,4.1507,4.3678,4.4328,4.3717,4.4027,4.0175,4.2377,4.4451,4.3803,4.294065
 
11
  Gemini-3-pro-preview,4.2254,4.352,4.5077,4.6974,4.2012,4.5579,4.123,4.2957,4.5101,4.4766,4.405224
12
  GLM-4.5-Air,3.4011,3.6283,4.0442,4.4026,3.6135,4.1639,3.6893,3.466,3.8959,4.2471,3.864646
13
  GLM-4.6,3.9502,4.154,4.1144,4.4669,4.1101,4.2879,3.4484,4.0146,4.2906,4.3261,4.132794
@@ -28,9 +29,11 @@ Minimax-m2,3.4588,3.6467,4.3288,4.2996,4.0075,4.1983,3.3369,3.9375,4.1415,4.2852
28
  Ministral-8b-2512,2.9477,3.5676,4.0966,4.1553,3.2352,3.9678,2.9408,2.7773,3.7756,4.1015,3.570151
29
  Mistral-large-2512,3.5392,3.8875,4.135,4.4486,4.1053,4.3948,3.4126,3.1081,4.0114,4.2951,3.935105
30
  Mistral-medium-3.1,3.025,3.9361,4.2094,4.0717,4.0296,4.4697,3.2035,2.8908,3.9827,4.2469,3.811798
 
31
  Nemotron-nano-9b-v2,3.0723,3.0018,3.9185,4.0366,2.9714,3.9167,2.8622,3.0697,3.7831,4.0139,3.500291
32
  Nova-2-lite-v1,3.7409,3.8832,4.2516,4.2551,4.0813,4.3824,3.0429,3.641,4.2797,4.3358,4.059981
33
  Nova-premier-v1,2.8353,3.5547,3.7279,3.9576,3.3375,3.9765,2.5511,2.8474,3.8154,3.9793,3.473742
 
34
  Qwen3-235b-a22b-2507,3.6516,3.842,4.0547,4.3213,3.9642,4.2631,3.2727,3.7025,4.289,4.2662,3.98095
35
  Qwen3-235B-A22B-Thinking-2507,3.7017,4.2155,4.1925,4.5712,4.2408,4.4447,3.8836,3.7789,4.2573,4.4556,4.196769
36
  Qwen3-next-80b-a3b-thinking,3.7023,3.7423,4.0543,4.4018,4.1316,4.2153,3.7258,3.7883,4.24,4.2008,4.031744
 
8
  Gemini-2.5-flash,4.01,4.046,4.3665,4.215,4.3058,4.2863,3.8277,4.024,4.1208,4.4398,4.171935
9
  Gemini-2.5-flash-lite,3.9154,4.0888,4.2033,4.15,3.7796,4.3708,3.2679,3.3294,4.0502,4.2879,3.94904
10
  Gemini-2.5-pro,4.0197,4.1507,4.3678,4.4328,4.3717,4.4027,4.0175,4.2377,4.4451,4.3803,4.294065
11
+ Gemini-3-flash-preview,3.9148,4.2529,4.4172,4.5766,4.3414,4.5388,4.0164,4.0574,4.4399,4.4483,4.303068
12
  Gemini-3-pro-preview,4.2254,4.352,4.5077,4.6974,4.2012,4.5579,4.123,4.2957,4.5101,4.4766,4.405224
13
  GLM-4.5-Air,3.4011,3.6283,4.0442,4.4026,3.6135,4.1639,3.6893,3.466,3.8959,4.2471,3.864646
14
  GLM-4.6,3.9502,4.154,4.1144,4.4669,4.1101,4.2879,3.4484,4.0146,4.2906,4.3261,4.132794
 
29
  Ministral-8b-2512,2.9477,3.5676,4.0966,4.1553,3.2352,3.9678,2.9408,2.7773,3.7756,4.1015,3.570151
30
  Mistral-large-2512,3.5392,3.8875,4.135,4.4486,4.1053,4.3948,3.4126,3.1081,4.0114,4.2951,3.935105
31
  Mistral-medium-3.1,3.025,3.9361,4.2094,4.0717,4.0296,4.4697,3.2035,2.8908,3.9827,4.2469,3.811798
32
+ Nemotron-3-nano-30b-a3b,3.8756,3.6321,4.1365,4.2908,3.8024,4.1842,3.6777,3.8097,4.1924,4.4894,4.028261
33
  Nemotron-nano-9b-v2,3.0723,3.0018,3.9185,4.0366,2.9714,3.9167,2.8622,3.0697,3.7831,4.0139,3.500291
34
  Nova-2-lite-v1,3.7409,3.8832,4.2516,4.2551,4.0813,4.3824,3.0429,3.641,4.2797,4.3358,4.059981
35
  Nova-premier-v1,2.8353,3.5547,3.7279,3.9576,3.3375,3.9765,2.5511,2.8474,3.8154,3.9793,3.473742
36
+ Olmo-3.1-32b-think,3.3149,3.9322,4.1472,4.2858,3.5331,4.1608,3.2607,3.4826,4.0137,4.2631,3.85021
37
  Qwen3-235b-a22b-2507,3.6516,3.842,4.0547,4.3213,3.9642,4.2631,3.2727,3.7025,4.289,4.2662,3.98095
38
  Qwen3-235B-A22B-Thinking-2507,3.7017,4.2155,4.1925,4.5712,4.2408,4.4447,3.8836,3.7789,4.2573,4.4556,4.196769
39
  Qwen3-next-80b-a3b-thinking,3.7023,3.7423,4.0543,4.4018,4.1316,4.2153,3.7258,3.7883,4.24,4.2008,4.031744
runs/run-2025-12-16/metadata.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "run_id": "run_2025-12-16",
3
  "title": "AutoBench Run 5 - December 2025",
4
- "date": "2025-12-16",
5
- "description": "Latest AutoBench run with models Gpt 5.2, Claude Opus 4.5 and more",
6
  "blog_url": "https://huggingface.co/blog/PeterKruger/autobench-5th-run",
7
- "model_count": 35,
8
  "is_latest": true
9
  }
 
1
  {
2
  "run_id": "run_2025-12-16",
3
  "title": "AutoBench Run 5 - December 2025",
4
+ "date": "2025-12-19",
5
+ "description": "Latest AutoBench run with models Gpt 5.2, Claude Opus 4.5, Gemini 3 Flash and more",
6
  "blog_url": "https://huggingface.co/blog/PeterKruger/autobench-5th-run",
7
+ "model_count": 38,
8
  "is_latest": true
9
  }
runs/run-2025-12-16/models.csv ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ company_id,model_name,model_display_name,model_logo_url,api_type,context,parameters,model_version,release_date,thinking_mode,description
2
+ 2,Claude-haiku-4.5,Claude haiku 4.5,/logos/models/claude.svg,OpenRouter,200000,,,16/10/2025,1,"Claude Haiku 4.5 is Anthropic's fastest efficient model, delivering near-frontier intelligence. It matches Sonnet 4's performance in reasoning and coding, optimized for real-time applications."
3
+ 2,Claude-opus-4.5,Claude opus 4.5,/logos/models/claude.svg,OpenRouter,200000,,,24/11/2025,1,"Claude Opus 4.5 is Anthropic's frontier reasoning model, optimized for complex software engineering and long-horizon tasks. It supports extended thinking and multimodal capabilities."
4
+ 2,Claude-sonnet-4.5,Claude sonnet 4.5,/logos/models/claude.svg,OpenRouter,1000000,,,29/09/2025,1,"Claude Sonnet 4.5 is Anthropic's most advanced model for real-world agents and coding. It features a 1M token context, state-of-the-art coding performance, and enhanced agentic capabilities."
5
+ 30,DeepSeek-R1-0528,DeepSeek R1 0528,/logos/models/deepseek.svg,OpenRouter,163840,6.71E+11,,28/05/2025,1,"DeepSeek R1 0528 is an open-source model with 671B parameters (37B active). It offers performance on par with proprietary reasoning models, featuring fully open reasoning tokens."
6
+ 30,Deepseek-v3.2,Deepseek v3.2,/logos/models/deepseek.svg,OpenRouter,163840,6.85E+11,,01/12/2025,,DeepSeek V3.2 is the latest direct DeepSeek model featuring DeepSeek Sparse Attention (DSA) for high efficiency. It delivers long-context handling up to 163k tokens with reduced inference costs.
7
+ 30,Deepseek-v3.2-speciale,DeepSeek 3.2 Speciale,/logos/models/deepseek.svg,OpenRouter,163840,6.85E+11,,01/12/2025,1,DeepSeek-V3.2-Speciale is a high-compute variant of DeepSeek-V3.2 optimized for maximum reasoning and agentic performance
8
+ 3,Gemini-2.5-flash,Gemini 2.5 flash,/logos/models/gemini.svg,OpenRouter,1048576,,,25/09/2025,1,"Gemini 2.5 Flash is Google's workhorse model for high-frequency tasks. It features a 1M context window, optimized for speed and efficiency in reasoning and multimodal processing."
9
+ 3,Gemini-2.5-flash-lite,Gemini 2.5 flash lite,/logos/models/gemini.svg,OpenRouter,1000000,,,25/09/2025,1,"Gemini 2.5 Flash-Lite is a lightweight reasoning model optimized for ultra-low latency. It offers a 1M context window and is designed for cost-effective, high-throughput applications."
10
+ 3,Gemini-2.5-pro,Gemini 2.5 pro,/logos/models/gemini.svg,OpenRouter,1000000,,,01/09/2025,1,"Gemini 2.5 Pro is Google's best reasoning model, featuring a 1M token context window. It uses a sparse MoE architecture to excel in complex reasoning, coding, and multimodal tasks."
11
+ 3,Gemini-3-pro-preview,Gemini 3 pro preview,/logos/models/gemini.svg,OpenRouter,1048576,,,18/11/2025,1,"Gemini 3 Pro Preview is Google's flagship frontier model. It offers high-precision multimodal reasoning across text, audio, video, and code, with a 1M token context."
12
+ 22,GLM-4.6,GLM 4.6,/logos/models/glm.svg,OpenRouter,202752,3.55E+11,,30/09/2025,1,"GLM-4.6 is an open-weight model with 355B parameters (32B active). It uses MoE architecture to deliver state-of-the-art performance in reasoning, coding, and multimodal tasks."
13
+ 22,GLM-4.5-Air,GLM 4.5 Air,/logos/models/glm.svg,OpenRouter,128000,1.06E+11,,29/07/2025,1,"GLM-4.5 Air is an efficient MoE model with 106B parameters (12B active). It is optimized for agentic applications, tool use, and speed."
14
+ 1,Gpt-5.2-pro,Gpt 5.2 Pro,/logos/models/openai.svg,OpenRouter,400000,,,10/12/2025,1,"GPT-5.2 Pro is OpenAI’s most advanced model, offering major improvements in agentic coding and long context performance over GPT-5 Pro. It is optimized for complex tasks that require step-by-step reasoning, instruction following, and accuracy in high-stakes use cases."
15
+ 1,Gpt-5.2,Gpt 5.2,/logos/models/openai.svg,OpenRouter,400000,,,10/12/2025,1,"GPT-5.2 is the latest frontier-grade model in the GPT-5 series, offering stronger agentic and long context performance compared to GPT-5.1. It uses adaptive reasoning to allocate computation dynamically."
16
+ 1,Gpt-5.1,Gpt 5.1,/logos/models/openai.svg,OpenRouter,400000,,,13/11/2025,1,GPT-5.1 offers stronger general-purpose reasoning and instruction adherence than GPT-5. It features adaptive computation and a natural conversational style.
17
+ 1,Gpt-5-mini,Gpt 5 mini,/logos/models/openai.svg,OpenRouter,400000,,,07/08/2025,1,"GPT-5 Mini is a compact version of GPT-5 for lightweight reasoning. It offers low latency and cost, suitable for high-frequency tasks."
18
+ 1,Gpt-5-nano,Gpt 5 nano,/logos/models/openai.svg,OpenRouter,400000,,,07/08/2025,1,"GPT-5-Nano is the smallest and fastest variant in the GPT-5 system, optimized for developer tools, rapid interactions, and ultra-low latency environments."
19
+ 1,Gpt-oss-120b,Gpt oss 120b,/logos/models/openai.svg,OpenRouter,131072,1.17E+11,,04/08/2025,1,"GPT-OSS-120B is an open-weight MoE model from OpenAI (117B params, 5.1B active). It is optimized for single-GPU deployment and excels in reasoning and agentic tasks."
20
+ 1,Gpt-oss-20b,Gpt oss 20b,/logos/models/openai.svg,OpenRouter,131072,19500000000,,04/08/2025,1,"gpt-oss-20b is an open-weight 21B parameter model released by OpenAI under the Apache 2.0 license. It uses a Mixture-of-Experts (MoE) architecture with 3.6B active parameters per forward pass, optimized for lower-latency inference and deployability on consumer or single-GPU hardware."
21
+ 23,Grok-4,Grok 4,/logos/models/grok.svg,OpenRouter,256000,3.14E+11,,09/07/2025,1,Grok 4 is xAI's general-purpose reasoning model with 314B parameters (MoE). It features real-time data integration and strong performance in general tasks.
22
+ 23,Grok-4.1-fast,Grok 4.1 fast,/logos/models/grok.svg,xAI,2000000,,,19/11/2025,1,"Grok 4.1 Fast is an agentic tool-calling model with a 2M context window. It is optimized for customer support, deep research, and real-world workflows."
23
+ 23,Grok-4.1-fast-thinking,Grok 4.1 fast thinking,/logos/models/grok.svg,xAI,2000000,,,19/11/2025,1,Grok 4.1 Fast Thinking is the reasoning-enabled variant of Grok 4.1 Fast. It provides extended thought processes for complex problem-solving within a 2M context.
24
+ 29,Kimi-K2-Instruct,Kimi K2 Instruct,/logos/models/kimi.svg,OpenRouter,256000,1.00E+12,,01/07/2025,,"Kimi K2 Instruct is a large open-weight model (1T params, 32B active) by Moonshot AI. It offers strong performance in instruction following and general tasks."
25
+ 29,Kimi-k2-thinking,Kimi k2 thinking,/logos/models/kimi.svg,OpenRouter,256000,1.00E+12,,01/07/2025,1,Kimi K2 Thinking is a reasoning variant capable of autonomous long-horizon tasks. It can execute hundreds of sequential tool calls.
26
+ 27,Llama-3.3-nemotron-super-49b-v1.5,Llama 3.3 nemotron super 49b v1.5,/logos/companies/nvidia.svg,OpenRouter,131072,49000000000,,17/03/2025,1,"Llama 3.3 Nemotron Super 49B is a reasoning model derived from Llama 3.3 70B. It is post-trained for agentic workflows, RAG, and tool calling."
27
+ 13,Ministral-8b-2512,Ministral 8b 2512,/logos/models/mistral.svg,OpenRouter,262144,,,13/08/2025,,"A balanced model in the Ministral 3 family, Ministral 3 8B is a powerful, efficient tiny language model with vision capabilities."
28
+ 13,Mistral-medium-3.1,Mistral medium 3.1,/logos/models/mistral.svg,OpenRouter,131072,,,02/12/2024,,"Mistral Medium 3.1 is an updated version of Mistral Medium 3, which is a high-performance enterprise-grade language model designed to deliver frontier-level capabilities at significantly reduced operational cost."
29
+ 31,Minimax-m2,Minimax m2,/logos/models/minimax.svg,OpenRouter,204800,2.30E+11,,26/10/2025,1,"MiniMax M2 is a 230B (10B active) MoE model. It is highly efficient, designed for coding and agentic workflows with low latency."
30
+ 13,Mistral-large-2512,Mistral large 2512,/logos/models/mistral.svg,OpenRouter,262144,6.75E+11,,01/12/2025,1,"Mistral Large 3 2512 is Mistral's flagship MoE model (675B total, 41B active). It offers top-tier performance in reasoning and coding."
31
+ 27,Nemotron-nano-9b-v2,Nemotron nano 9b v2,/logos/companies/nvidia.svg,OpenRouter,131072,9000000000,,05/09/2025,,"Nemotron Nano 9B v2 is a compact 9B model by NVIDIA. It is a unified model for reasoning and non-reasoning tasks, trained from scratch."
32
+ 7,Nova-2-lite-v1,Nova 2 lite v1,/logos/models/nova.svg,OpenRouter,1000000,,,02/12/2024,1,"Nova 2 Lite is a fast, cost-effective reasoning model for everyday workloads that can process text, images, and videos to generate text."
33
+ 7,Nova-premier-v1,Nova Premier v1,/logos/models/nova.svg,OpenRouter,1000000,,,09/10/2024,,Amazon Nova Premier is the most capable of Amazon’s multimodal models for complex reasoning tasks and for use as the best teacher for distilling custom models.
34
+ 18,Qwen3-235B-A22B-Thinking-2507,Qwen3 235B A22B Thinking 2507,/logos/models/qwen.svg,OpenRouter,262144,2.35E+11,,25/07/2025,1,"Qwen3 235B Thinking is a MoE model (235B total, 22B active) optimized for complex reasoning. It generates thinking traces for deep problem solving."
35
+ 18,Qwen3-235b-a22b-2507,Qwen3 235b a22b 2507,/logos/models/qwen.svg,OpenRouter,262144,2.35E+11,,21/07/2025,,"Qwen3-235B-A22B-Instruct-2507 is a multilingual, instruction-tuned mixture-of-experts language model based on the Qwen3-235B architecture, with 22B active parameters per forward pass"
36
+ 18,Qwen3-next-80b-a3b-thinking,Qwen3 next 80b a3b thinking,/logos/models/qwen.svg,OpenRouter,262144,80000000000,,11/09/2025,1,Qwen3 Next 80B Thinking is a reasoning-first MoE model (80B total). It specializes in hard multi-step problems and agentic planning.
37
+ 3,Gemini-3-flash-preview,Gemini 3 flash preview,/logos/models/gemini.svg,OpenRouter,1048576,,,17/12/2025,1,Gemini 3 Flash Preview is a high speed thinking model that delivers near Pro level reasoning and tool use performance with substantially lower latency than larger Gemini variants.
38
+ 27,Nemotron-3-nano-30b-a3b,Nemotron 3 nano 30b a3b,/logos/companies/nvidia.svg,OpenRouter,256000,3.00E+10,,14/12/2025,1,NVIDIA Nemotron 3 Nano 30B A3B is a small language MoE model with highest compute efficiency and accuracy for developers to build specialized agentic AI systems.
39
+ 33,Olmo-3.1-32b-think,Olmo 3.1 32b Think,/logos/companies/olmo.svg,,65536,3.20E+10,,16/12/2025,1,"A large-scale, 32-billion-parameter model designed for deep reasoning, complex multi-step logic, and advanced instruction following."
runs/run-2025-12-16/p99_latency.csv CHANGED
@@ -8,6 +8,7 @@ Deepseek-v3.2-speciale,1311.9582,650.2337,528.4946,503.1869,607.8602,403.1232,17
8
  Gemini-2.5-flash,475.328,58.3303,184.8024,139.9909,178.2687,125.5033,154.3759,178.7707,112.877,131.1387,173.9386
9
  Gemini-2.5-flash-lite,78.696,11.6849,128.7149,38.8007,44.9652,110.2367,86.6394,74.8385,84.655,31.6472,69.0879
10
  Gemini-2.5-pro,306.2334,63.3816,314.2017,209.8994,160.4949,139.1723,328.0977,268.0161,228.2436,207.0017,222.4742
 
11
  Gemini-3-pro-preview,381.7002,98.8641,168.2995,140.8396,157.0393,98.373,342.9425,247.1862,129.185,97.1085,186.1538
12
  GLM-4.5-Air,945.3131,88.5651,379.6257,236.088,270.8741,253.7883,632.8253,900.4997,274.4039,270.8565,425.284
13
  GLM-4.6,726.6297,152.2205,537.891,591.0155,502.2694,618.9771,898.3246,1317.976,546.1143,413.4574,630.4876
@@ -28,9 +29,11 @@ Minimax-m2,991.8198,589.4947,238.879,288.4827,400.5846,201.5793,562.5588,746.353
28
  Ministral-8b-2512,108.0489,233.4817,256.4196,82.3999,54.9958,104.629,423.6525,68.7241,92.101,116.9296,154.1382
29
  Mistral-large-2512,318.5243,67.4724,277.4699,230.3884,182.5314,212.874,132.5931,141.9202,204.2604,213.2644,198.1299
30
  Mistral-medium-3.1,101.3717,25.4349,207.9478,212.7246,182.3555,175.8245,108.1197,63.577,166.3996,225.5168,146.9272
 
31
  Nemotron-nano-9b-v2,338.6557,195.0248,111.1827,64.8506,178.3646,92.1459,473.7478,272.3648,169.0054,221.995,211.7337
32
  Nova-2-lite-v1,173.4527,103.964,157.1997,159.4808,133.8659,108.7504,116.0531,119.6456,135.6125,107.9884,131.6013
33
  Nova-premier-v1,177.9887,35.0321,225.562,188.5122,93.0407,103.8183,83.1841,94.8349,145.6836,197.7514,134.5408
 
34
  Qwen3-235b-a22b-2507,388.6708,256.5265,274.0394,224.9808,386.1726,290.7797,524.4854,507.0452,284.5746,234.8702,337.2145
35
  Qwen3-235B-A22B-Thinking-2507,1366.0363,248.1267,506.3298,540.1404,515.3773,580.0747,1677.411,1660.089,582.4473,432.8225,810.8855
36
  Qwen3-next-80b-a3b-thinking,497.6184,107.4429,187.0656,154.3664,247.5986,120.2682,189.55,343.7824,279.0175,140.0428,226.6753
 
8
  Gemini-2.5-flash,475.328,58.3303,184.8024,139.9909,178.2687,125.5033,154.3759,178.7707,112.877,131.1387,173.9386
9
  Gemini-2.5-flash-lite,78.696,11.6849,128.7149,38.8007,44.9652,110.2367,86.6394,74.8385,84.655,31.6472,69.0879
10
  Gemini-2.5-pro,306.2334,63.3816,314.2017,209.8994,160.4949,139.1723,328.0977,268.0161,228.2436,207.0017,222.4742
11
+ Gemini-3-flash-preview,151.2741,87.0896,51.96,38.3654,70.0249,37.0887,549.6887,257.3478,84.5966,38.2785,136.5714
12
  Gemini-3-pro-preview,381.7002,98.8641,168.2995,140.8396,157.0393,98.373,342.9425,247.1862,129.185,97.1085,186.1538
13
  GLM-4.5-Air,945.3131,88.5651,379.6257,236.088,270.8741,253.7883,632.8253,900.4997,274.4039,270.8565,425.284
14
  GLM-4.6,726.6297,152.2205,537.891,591.0155,502.2694,618.9771,898.3246,1317.976,546.1143,413.4574,630.4876
 
29
  Ministral-8b-2512,108.0489,233.4817,256.4196,82.3999,54.9958,104.629,423.6525,68.7241,92.101,116.9296,154.1382
30
  Mistral-large-2512,318.5243,67.4724,277.4699,230.3884,182.5314,212.874,132.5931,141.9202,204.2604,213.2644,198.1299
31
  Mistral-medium-3.1,101.3717,25.4349,207.9478,212.7246,182.3555,175.8245,108.1197,63.577,166.3996,225.5168,146.9272
32
+ Nemotron-3-nano-30b-a3b,71.8239,159.5903,48.939,33.3004,107.5626,42.9702,177.9534,172.5468,110.8246,53.1962,97.8707
33
  Nemotron-nano-9b-v2,338.6557,195.0248,111.1827,64.8506,178.3646,92.1459,473.7478,272.3648,169.0054,221.995,211.7337
34
  Nova-2-lite-v1,173.4527,103.964,157.1997,159.4808,133.8659,108.7504,116.0531,119.6456,135.6125,107.9884,131.6013
35
  Nova-premier-v1,177.9887,35.0321,225.562,188.5122,93.0407,103.8183,83.1841,94.8349,145.6836,197.7514,134.5408
36
+ Olmo-3.1-32b-think,345.2511,273.287,135.2763,136.9126,222.3814,107.5568,467.3806,507.0392,302.2275,207.0715,270.4384
37
  Qwen3-235b-a22b-2507,388.6708,256.5265,274.0394,224.9808,386.1726,290.7797,524.4854,507.0452,284.5746,234.8702,337.2145
38
  Qwen3-235B-A22B-Thinking-2507,1366.0363,248.1267,506.3298,540.1404,515.3773,580.0747,1677.411,1660.089,582.4473,432.8225,810.8855
39
  Qwen3-next-80b-a3b-thinking,497.6184,107.4429,187.0656,154.3664,247.5986,120.2682,189.55,343.7824,279.0175,140.0428,226.6753
runs/run-2025-12-16/summary_data.csv CHANGED
@@ -1,36 +1,39 @@
1
  Model,Iterations,AutoBench,LMArena,AAI Index,MMLU-Pro,Costs (USD),Avg Answer Duration (sec),P99 Answer Duration (sec),Fail Rate %
2
- Gpt-5.2-pro,303,4.48,,72,87%,0.8188,261,784,3.81%
3
- Gpt-5.2,312,4.43,,,,0.0736,130,434,0.95%
4
- Gemini-3-pro-preview,312,4.41,1492,73,90%,0.0685,76,186,0.95%
5
- Claude-opus-4.5,313,4.39,1470,70,90%,0.1726,144,373,0.63%
6
- Gpt-5.1,310,4.38,1457,70,87%,0.1080,227,627,1.59%
7
- Kimi-k2-thinking,287,4.32,1429,67,85%,0.0186,248,729,8.89%
8
- Claude-sonnet-4.5,307,4.30,1450,63,88%,0.1139,170,477,2.54%
9
- Gemini-2.5-pro,313,4.29,1451,60,86%,0.0648,87,222,0.63%
10
- Gpt-5-mini,312,4.29,1392,64,84%,0.0091,93,258,0.95%
11
- Grok-4.1-fast-thinking,306,4.21,,64,85%,0.0027,69,207,2.86%
12
- Grok-4,293,4.20,1478,65,87%,0.0812,180,562,6.98%
13
- Qwen3-235B-A22B-Thinking-2507,283,4.20,1397,57,84%,0.0032,317,811,10.16%
14
- Gpt-oss-120b,292,4.18,1352,61,81%,0.0011,75,292,7.30%
15
- Gemini-2.5-flash,312,4.17,1408,51,84%,0.0212,66,174,0.95%
16
- Claude-haiku-4.5,312,4.17,1402,55,76%,0.0379,111,317,0.95%
17
- Deepseek-v3.2-speciale,288,4.14,1418,59,86%,0.0047,310,833,8.57%
18
- GLM-4.6,306,4.13,1425,56,83%,0.0125,187,630,2.86%
19
- DeepSeek-R1-0528,308,4.12,1395,52,85%,0.0099,171,477,2.22%
20
- Deepseek-v3.2,311,4.11,1414,52,84%,0.0009,125,410,1.27%
21
- Kimi-k2-0905,312,4.11,1416,50,82%,0.0033,83,329,0.95%
22
- Gpt-5-nano,309,4.06,1339,51,77%,0.0034,100,269,1.90%
23
- Nova-2-lite-v1,277,4.06,1334,47,81%,0.0394,61,132,12.06%
24
- Qwen3-next-80b-a3b-thinking,312,4.03,1367,54,82%,0.0075,78,227,0.95%
25
- Minimax-m2,308,3.99,1345,61,82%,0.0071,137,473,2.22%
26
- Qwen3-235b-a22b-2507,302,3.98,1374,45,83%,0.0019,105,337,1.31%
27
- Gemini-2.5-flash-lite,313,3.95,1378,40,81%,0.0021,20,69,0.63%
28
- Mistral-large-2512,307,3.94,1415,38,81%,0.0051,90,198,2.54%
29
- Grok-4.1-fast,312,3.88,,38,74%,0.0008,24,57,0.95%
30
- GLM-4.5-Air,306,3.86,1370,49,82%,0.0054,163,425,2.86%
31
- Mistral-medium-3.1,306,3.81,1411,35,68%,0.0038,52,147,2.86%
32
- Llama-3.3-nemotron-super-49b-v1.5,311,3.78,1340,45,81%,0.0018,76,240,1.27%
33
- Gpt-oss-20b,310,3.78,1318,52,75%,0.0007,39,183,1.59%
34
- Ministral-8b-2512,306,3.57,,28,64%,0.0005,31,154,2.86%
35
- Nemotron-nano-9b-v2,311,3.50,,37,74%,0.0008,67,212,1.27%
36
- Nova-premier-v1,312,3.47,,32,73%,0.0130,52,135,0.95%
 
 
 
 
1
  Model,Iterations,AutoBench,LMArena,AAI Index,MMLU-Pro,Costs (USD),Avg Answer Duration (sec),P99 Answer Duration (sec),Fail Rate %
2
+ Claude-haiku-4.5,312,4.170821,1402,55,76%,0.037873971,110.9457146,316.9223,0.95%
3
+ Claude-opus-4.5,313,4.39496,1470,70,90%,0.172599703,144.0064821,373.3072,0.63%
4
+ Claude-sonnet-4.5,307,4.30218,1450,63,88%,0.113931502,169.7268622,476.7493,2.54%
5
+ DeepSeek-R1-0528,308,4.118577,1395,52,85%,0.009866481,171.4999365,476.6504,2.22%
6
+ Deepseek-v3.2,311,4.109586,1414,52,84%,0.00088526,124.5749929,410.4636,1.27%
7
+ Deepseek-v3.2-speciale,288,4.141433,1418,59,86%,0.004672989,310.3903417,832.6711,8.57%
8
+ Gemini-2.5-flash,312,4.171935,1408,51,84%,0.021217848,65.61706446,173.9386,0.95%
9
+ Gemini-2.5-flash-lite,313,3.94904,1378,40,81%,0.002139796,20.41722765,69.0879,0.63%
10
+ Gemini-2.5-pro,313,4.294065,1451,60,86%,0.064793075,86.79510542,222.4742,0.63%
11
+ Gemini-3-flash-preview,313,4.303068,,71,89%,0.01945601,45.56031499,136.5714,0.63%
12
+ Gemini-3-pro-preview,312,4.405224,1492,73,90%,0.068499733,76.10772546,186.1538,0.95%
13
+ GLM-4.5-Air,306,3.864646,1370,49,82%,0.005355855,163.1509648,425.284,2.86%
14
+ GLM-4.6,306,4.132794,1425,56,83%,0.01253707,187.4263836,630.4876,2.86%
15
+ Gpt-5.1,310,4.38364,1457,70,87%,0.108002358,227.425965,627.3321,1.59%
16
+ Gpt-5.2,312,4.430061,,,,0.073564969,130.0950453,434.1457,0.95%
17
+ Gpt-5.2-pro,303,4.476206,,73,87%,0.818815856,261.3839264,783.8191,3.81%
18
+ Gpt-5-mini,312,4.287269,1392,64,84%,0.009137171,93.48742858,257.8269,0.95%
19
+ Gpt-5-nano,309,4.060397,1339,51,77%,0.003385019,99.62428955,268.8315,1.90%
20
+ Gpt-oss-120b,292,4.181097,1352,61,81%,0.001149135,75.47582399,291.8432,7.30%
21
+ Gpt-oss-20b,310,3.779105,1318,52,75%,0.000682808,38.76845801,183.0218,1.59%
22
+ Grok-4,293,4.197064,1478,65,87%,0.081239078,180.1110452,562.4568,6.98%
23
+ Grok-4.1-fast,312,3.877365,,38,74%,0.000841039,23.59507062,57.4378,0.95%
24
+ Grok-4.1-fast-thinking,306,4.206201,,64,85%,0.002722312,69.23819567,207.0219,2.86%
25
+ Kimi-k2-0905,312,4.107852,1416,50,82%,0.003344045,82.79755057,329.0056,0.95%
26
+ Kimi-k2-thinking,287,4.315342,1429,67,85%,0.018558145,247.9683578,729.1158,8.89%
27
+ Llama-3.3-nemotron-super-49b-v1.5,311,3.783612,1340,45,81%,0.001833066,76.47567281,240.3163,1.27%
28
+ Minimax-m2,308,3.990557,1345,61,82%,0.00711788,136.9629944,472.8137,2.22%
29
+ Ministral-8b-2512,306,3.570151,,28,64%,0.000489568,31.40083599,154.1382,2.86%
30
+ Mistral-large-2512,307,3.935105,1415,38,81%,0.005120599,89.96343603,198.1299,2.54%
31
+ Mistral-medium-3.1,306,3.811798,1411,35,68%,0.003751786,52.24553215,146.9272,2.86%
32
+ Nemotron-3-nano-30b-a3b,314,4.028261,,52,79%,0,30.08154062,97.8707,0.32%
33
+ Nemotron-nano-9b-v2,311,3.500291,,37,74%,0.000820591,66.77738031,211.7337,1.27%
34
+ Nova-2-lite-v1,277,4.059981,1334,47,81%,0.039437834,61.45748847,131.6013,12.06%
35
+ Nova-premier-v1,312,3.473742,,32,73%,0.012950513,51.84074232,134.5408,0.95%
36
+ Olmo-3.1-32b-think,307,3.85021,,,,0,122.4217364,270.4384,2.54%
37
+ Qwen3-235b-a22b-2507,302,3.98095,1374,45,83%,0.001917167,104.7811018,337.2145,4.13%
38
+ Qwen3-235B-A22B-Thinking-2507,283,4.196769,1397,57,84%,0.003170949,316.8201599,810.8855,10.16%
39
+ Qwen3-next-80b-a3b-thinking,312,4.031744,1367,54,82%,0.007494371,77.75939135,226.6753,0.95%