Update README.md
Browse files
Removed the BigBenchHard (BBH) and DROP columns for now. They will be added back after they are fixed.
README.md
CHANGED
|
@@ -190,8 +190,8 @@ By redesigning a common household item like the plastic bottle, we can create a
|
|
| 190 |
<th style="text-align:center; background-color: #001d6c; color: white;">MMLU</th>
|
| 191 |
<th style="text-align:center; background-color: #001d6c; color: white;">PopQA</th>
|
| 192 |
<th style="text-align:center; background-color: #001d6c; color: white;">TruthfulQA</th>
|
| 193 |
-
<th style="text-align:center; background-color: #001d6c; color: white;">BigBenchHard</th>
|
| 194 |
-
<th style="text-align:center; background-color: #001d6c; color: white;">DROP</th>
|
| 195 |
<th style="text-align:center; background-color: #001d6c; color: white;">GSM8K</th>
|
| 196 |
<th style="text-align:center; background-color: #001d6c; color: white;">HumanEval</th>
|
| 197 |
<th style="text-align:center; background-color: #001d6c; color: white;">HumanEval+</th>
|
|
@@ -206,8 +206,8 @@ By redesigning a common household item like the plastic bottle, we can create a
|
|
| 206 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">57.11</td>
|
| 207 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">20.55</td>
|
| 208 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">59.79</td>
|
| 209 |
-
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">54.46</td>
|
| 210 |
-
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">18.68</td>
|
| 211 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">67.55</td>
|
| 212 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">79.45</td>
|
| 213 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">75.26</td>
|
|
@@ -221,8 +221,8 @@ By redesigning a common household item like the plastic bottle, we can create a
|
|
| 221 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">57.18</td>
|
| 222 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">20.56</td>
|
| 223 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">59.8</td>
|
| 224 |
-
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">52.27</td>
|
| 225 |
-
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">21.12</td>
|
| 226 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">67.02</td>
|
| 227 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">80.13</td>
|
| 228 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">73.39</td>
|
|
@@ -236,8 +236,8 @@ By redesigning a common household item like the plastic bottle, we can create a
|
|
| 236 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 55.88 </td>
|
| 237 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 18.4 </td>
|
| 238 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 58.97 </td>
|
| 239 |
-
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 52.51 </td>
|
| 240 |
-
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 35.98 </td>
|
| 241 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 72.48 </td>
|
| 242 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 80.51 </td>
|
| 243 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 75.68 </td>
|
|
@@ -252,8 +252,8 @@ By redesigning a common household item like the plastic bottle, we can create a
|
|
| 252 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">69.15</td>
|
| 253 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">28.79</td>
|
| 254 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">52.79</td>
|
| 255 |
-
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">72.66</td>
|
| 256 |
-
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">61.48</td>
|
| 257 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">83.24</td>
|
| 258 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">85.32</td>
|
| 259 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">80.15</td>
|
|
@@ -268,8 +268,8 @@ By redesigning a common household item like the plastic bottle, we can create a
|
|
| 268 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">45.80</td>
|
| 269 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">13.25</td>
|
| 270 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">47.43</td>
|
| 271 |
-
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">65.71</td>
|
| 272 |
-
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">44.46</td>
|
| 273 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">72.18</td>
|
| 274 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">67.54</td>
|
| 275 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">62.91</td>
|
|
@@ -284,8 +284,8 @@ By redesigning a common household item like the plastic bottle, we can create a
|
|
| 284 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">74.30</td>
|
| 285 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">18.12</td>
|
| 286 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">63.06</td>
|
| 287 |
-
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">70.40</td>
|
| 288 |
-
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">54.71</td>
|
| 289 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">84.46</td>
|
| 290 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">93.35</td>
|
| 291 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">89.91</td>
|
|
@@ -300,8 +300,8 @@ By redesigning a common household item like the plastic bottle, we can create a
|
|
| 300 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">50.72</td>
|
| 301 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">9.94</td>
|
| 302 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">47.14</td>
|
| 303 |
-
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">65.04</td>
|
| 304 |
-
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">42.76</td>
|
| 305 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">78.47</td>
|
| 306 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">79.89</td>
|
| 307 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">78.43</td>
|
|
@@ -315,8 +315,8 @@ By redesigning a common household item like the plastic bottle, we can create a
|
|
| 315 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">66.77</td>
|
| 316 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">28.7</td>
|
| 317 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">65.84</td>
|
| 318 |
-
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">68.55</td>
|
| 319 |
-
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">50.78</td>
|
| 320 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">79.15</td>
|
| 321 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">89.63</td>
|
| 322 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">85.79</td>
|
|
@@ -331,8 +331,8 @@ By redesigning a common household item like the plastic bottle, we can create a
|
|
| 331 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">66.79</td>
|
| 332 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">28.04</td>
|
| 333 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">66.92</td>
|
| 334 |
-
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">64.77</td>
|
| 335 |
-
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">50.95</td>
|
| 336 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">81.65</td>
|
| 337 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">89.35</td>
|
| 338 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">85.72</td>
|
|
@@ -346,8 +346,8 @@ By redesigning a common household item like the plastic bottle, we can create a
|
|
| 346 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 65.54 </td>
|
| 347 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 26.17 </td>
|
| 348 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 66.86 </td>
|
| 349 |
-
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 59.01 </td>
|
| 350 |
-
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 41.53 </td>
|
| 351 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 80.89 </td>
|
| 352 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 89.73 </td>
|
| 353 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 86.09 </td>
|
|
@@ -355,6 +355,7 @@ By redesigning a common household item like the plastic bottle, we can create a
|
|
| 355 |
<td style="text-align:center; background-color: #DAE8FF; color: black;">88.5</td>
|
| 356 |
</tr>
|
| 357 |
</tbody></table>
|
|
|
|
| 358 |
<table>
|
| 359 |
<caption style="text-align:center"><b>Math Benchmarks</b></caption>
|
| 360 |
<thead>
|
|
|
|
| 190 |
<th style="text-align:center; background-color: #001d6c; color: white;">MMLU</th>
|
| 191 |
<th style="text-align:center; background-color: #001d6c; color: white;">PopQA</th>
|
| 192 |
<th style="text-align:center; background-color: #001d6c; color: white;">TruthfulQA</th>
|
| 193 |
+
<!-- <th style="text-align:center; background-color: #001d6c; color: white;">BigBenchHard</th> -->
|
| 194 |
+
<!-- <th style="text-align:center; background-color: #001d6c; color: white;">DROP</th> -->
|
| 195 |
<th style="text-align:center; background-color: #001d6c; color: white;">GSM8K</th>
|
| 196 |
<th style="text-align:center; background-color: #001d6c; color: white;">HumanEval</th>
|
| 197 |
<th style="text-align:center; background-color: #001d6c; color: white;">HumanEval+</th>
|
|
|
|
| 206 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">57.11</td>
|
| 207 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">20.55</td>
|
| 208 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">59.79</td>
|
| 209 |
+
<!-- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">54.46</td> -->
|
| 210 |
+
<!-- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">18.68</td> -->
|
| 211 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">67.55</td>
|
| 212 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">79.45</td>
|
| 213 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">75.26</td>
|
|
|
|
| 221 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">57.18</td>
|
| 222 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">20.56</td>
|
| 223 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">59.8</td>
|
| 224 |
+
<!-- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">52.27</td> -->
|
| 225 |
+
<!-- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">21.12</td> -->
|
| 226 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">67.02</td>
|
| 227 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">80.13</td>
|
| 228 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">73.39</td>
|
|
|
|
| 236 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 55.88 </td>
|
| 237 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 18.4 </td>
|
| 238 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 58.97 </td>
|
| 239 |
+
<!-- <td style="text-align:center; background-color: #DAE8FF; color: black;"> 52.51 </td> -->
|
| 240 |
+
<!-- <td style="text-align:center; background-color: #DAE8FF; color: black;"> 35.98 </td> -->
|
| 241 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 72.48 </td>
|
| 242 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 80.51 </td>
|
| 243 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 75.68 </td>
|
|
|
|
| 252 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">69.15</td>
|
| 253 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">28.79</td>
|
| 254 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">52.79</td>
|
| 255 |
+
<!-- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">72.66</td> -->
|
| 256 |
+
<!-- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">61.48</td> -->
|
| 257 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">83.24</td>
|
| 258 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">85.32</td>
|
| 259 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">80.15</td>
|
|
|
|
| 268 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">45.80</td>
|
| 269 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">13.25</td>
|
| 270 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">47.43</td>
|
| 271 |
+
<!-- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">65.71</td> -->
|
| 272 |
+
<!-- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">44.46</td> -->
|
| 273 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">72.18</td>
|
| 274 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">67.54</td>
|
| 275 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">62.91</td>
|
|
|
|
| 284 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">74.30</td>
|
| 285 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">18.12</td>
|
| 286 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">63.06</td>
|
| 287 |
+
<!-- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">70.40</td> -->
|
| 288 |
+
<!-- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">54.71</td> -->
|
| 289 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">84.46</td>
|
| 290 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">93.35</td>
|
| 291 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">89.91</td>
|
|
|
|
| 300 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">50.72</td>
|
| 301 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">9.94</td>
|
| 302 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">47.14</td>
|
| 303 |
+
<!-- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">65.04</td> -->
|
| 304 |
+
<!-- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">42.76</td> -->
|
| 305 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">78.47</td>
|
| 306 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">79.89</td>
|
| 307 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">78.43</td>
|
|
|
|
| 315 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">66.77</td>
|
| 316 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">28.7</td>
|
| 317 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">65.84</td>
|
| 318 |
+
<!-- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">68.55</td> -->
|
| 319 |
+
<!-- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">50.78</td> -->
|
| 320 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">79.15</td>
|
| 321 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">89.63</td>
|
| 322 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">85.79</td>
|
|
|
|
| 331 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">66.79</td>
|
| 332 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">28.04</td>
|
| 333 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">66.92</td>
|
| 334 |
+
<!-- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">64.77</td> -->
|
| 335 |
+
<!-- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">50.95</td> -->
|
| 336 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">81.65</td>
|
| 337 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">89.35</td>
|
| 338 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">85.72</td>
|
|
|
|
| 346 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 65.54 </td>
|
| 347 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 26.17 </td>
|
| 348 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 66.86 </td>
|
| 349 |
+
<!-- <td style="text-align:center; background-color: #DAE8FF; color: black;"> 59.01 </td> -->
|
| 350 |
+
<!-- <td style="text-align:center; background-color: #DAE8FF; color: black;"> 41.53 </td> -->
|
| 351 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 80.89 </td>
|
| 352 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 89.73 </td>
|
| 353 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 86.09 </td>
|
|
|
|
| 355 |
<td style="text-align:center; background-color: #DAE8FF; color: black;">88.5</td>
|
| 356 |
</tr>
|
| 357 |
</tbody></table>
|
| 358 |
+
|
| 359 |
<table>
|
| 360 |
<caption style="text-align:center"><b>Math Benchmarks</b></caption>
|
| 361 |
<thead>
|