Update README.md
Browse files
README.md
CHANGED
|
@@ -57,73 +57,73 @@ The website [https://swallow-llm.github.io/](https://swallow-llm.github.io/index
|
|
| 57 |
|
| 58 |
## MT-Bench JA
|
| 59 |
|
| 60 |
-
| Model
|
| 61 |
-
|
|
| 62 |
-
| google/gemma-3-1b-it
|
| 63 |
-
| Qwen/Qwen2.5-1.5B-Instruct
|
| 64 |
-
| google/gemma-2-2b-it
|
| 65 |
-
| rinna/gemma-2-baku-2b-it
|
| 66 |
-
| google/gemma-2-2b-jpn-it
|
| 67 |
-
| tokyotech-llm/Gemma-2-Llama-Swallow-2b-it-v0.1
|
| 68 |
-
| Qwen/Qwen2.5-3B-Instruct
|
| 69 |
-
| google/gemma-3-4b-it
|
| 70 |
-
| Qwen/Qwen2.5-7B-Instruct
|
| 71 |
-
| tokyotech-llm/Llama-3.1-Swallow-8B-Instruct-v0.3
|
| 72 |
-
| google/gemma-2-9b-it
|
| 73 |
-
| tokyotech-llm/Gemma-2-Llama-Swallow-9b-it-v0.1
|
| 74 |
-
| google/gemma-3-12b-it
|
| 75 |
-
| google/gemma-2-27b-it
|
| 76 |
-
| tokyotech-llm/Gemma-2-Llama-Swallow-27b-it-v0.1
|
| 77 |
-
| google/gemma-3-27b-it
|
| 78 |
-
| Qwen/Qwen2.5-32B-Instruct
|
| 79 |
|
| 80 |
### Japanese tasks
|
| 81 |
|
| 82 |
-
| Model
|
| 83 |
-
|
|
| 84 |
-
|
|
| 85 |
-
|
|
| 86 |
-
| google/gemma-3-1b-it
|
| 87 |
-
| Qwen/Qwen2.5-1.5B-Instruct
|
| 88 |
-
| google/gemma-2-2b-it
|
| 89 |
-
| rinna/gemma-2-baku-2b-it
|
| 90 |
-
| google/gemma-2-2b-jpn-it
|
| 91 |
-
| tokyotech-llm/Gemma-2-Llama-Swallow-2b-it-v0.1
|
| 92 |
-
| Qwen/Qwen2.5-3B-Instruct
|
| 93 |
-
| google/gemma-3-4b-it
|
| 94 |
-
| Qwen/Qwen2.5-7B-Instruct
|
| 95 |
-
| tokyotech-llm/Llama-3.1-Swallow-8B-Instruct-v0.3
|
| 96 |
-
| google/gemma-2-9b-it
|
| 97 |
-
| tokyotech-llm/Gemma-2-Llama-Swallow-9b-it-v0.1
|
| 98 |
-
| google/gemma-3-12b-it
|
| 99 |
-
| google/gemma-2-27b-it
|
| 100 |
-
| tokyotech-llm/Gemma-2-Llama-Swallow-27b-it-v0.1
|
| 101 |
-
| google/gemma-3-27b-it
|
| 102 |
-
| Qwen/Qwen2.5-32B-Instruct
|
| 103 |
|
| 104 |
### English tasks
|
| 105 |
|
| 106 |
-
| Model
|
| 107 |
-
|
|
| 108 |
-
|
|
| 109 |
-
|
|
| 110 |
-
| google/gemma-3-1b-it
|
| 111 |
-
| Qwen/Qwen2.5-1.5B-Instruct
|
| 112 |
-
| google/gemma-2-2b-it
|
| 113 |
-
| rinna/gemma-2-baku-2b-it
|
| 114 |
-
| google/gemma-2-2b-jpn-it
|
| 115 |
-
| tokyotech-llm/Gemma-2-Llama-Swallow-2b-it-v0.1
|
| 116 |
-
| Qwen/Qwen2.5-3B-Instruct
|
| 117 |
-
| google/gemma-3-4b-it
|
| 118 |
-
| Qwen/Qwen2.5-7B-Instruct
|
| 119 |
-
| tokyotech-llm/Llama-3.1-Swallow-8B-Instruct-v0.3
|
| 120 |
-
| google/gemma-2-9b-it
|
| 121 |
-
| tokyotech-llm/Gemma-2-Llama-Swallow-9b-it-v0.1
|
| 122 |
-
| google/gemma-3-12b-it
|
| 123 |
-
| google/gemma-2-27b-it
|
| 124 |
-
| tokyotech-llm/Gemma-2-Llama-Swallow-27b-it-v0.1
|
| 125 |
-
| google/gemma-3-27b-it
|
| 126 |
-
| Qwen/Qwen2.5-32B-Instruct
|
| 127 |
|
| 128 |
## Evaluation Benchmarks
|
| 129 |
|
|
|
|
| 57 |
|
| 58 |
## MT-Bench JA
|
| 59 |
|
| 60 |
| Model | coding | extraction | humanities | math | reasoning | roleplay | stem | writing | JMT Avg |
| --------------------------------------------------- | ------ | ---------- | ---------- | ----- | --------- | -------- | ----- | ------- | ------- |
| google/gemma-3-1b-it | 0.379 | 0.497 | 0.680 | 0.385 | 0.322 | 0.628 | 0.540 | 0.651 | 0.510 |
| Qwen/Qwen2.5-1.5B-Instruct | 0.408 | 0.513 | 0.456 | 0.527 | 0.352 | 0.473 | 0.406 | 0.469 | 0.450 |
| google/gemma-2-2b-it | 0.454 | 0.587 | 0.693 | 0.524 | 0.445 | 0.654 | 0.567 | 0.630 | 0.569 |
| rinna/gemma-2-baku-2b-it | 0.470 | 0.625 | 0.810 | 0.414 | 0.382 | 0.713 | 0.609 | 0.697 | 0.590 |
| google/gemma-2-2b-jpn-it | 0.467 | 0.488 | 0.741 | 0.379 | 0.406 | 0.660 | 0.589 | 0.672 | 0.550 |
| **tokyotech-llm/Gemma-2-Llama-Swallow-2b-it-v0.1** | 0.438 | 0.533 | 0.781 | 0.557 | 0.404 | 0.706 | 0.674 | 0.682 | 0.597 |
| Qwen/Qwen2.5-3B-Instruct | 0.567 | 0.647 | 0.597 | 0.665 | 0.457 | 0.649 | 0.526 | 0.637 | 0.593 |
| google/gemma-3-4b-it | 0.603 | 0.724 | 0.798 | 0.767 | 0.498 | 0.803 | 0.775 | 0.822 | 0.724 |
| Qwen/Qwen2.5-7B-Instruct | 0.599 | 0.741 | 0.719 | 0.637 | 0.541 | 0.744 | 0.624 | 0.713 | 0.665 |
| tokyotech-llm/Llama-3.1-Swallow-8B-Instruct-v0.3 | 0.562 | 0.756 | 0.869 | 0.610 | 0.512 | 0.783 | 0.748 | 0.803 | 0.705 |
| google/gemma-2-9b-it | 0.652 | 0.765 | 0.857 | 0.614 | 0.673 | 0.811 | 0.713 | 0.800 | 0.736 |
| **tokyotech-llm/Gemma-2-Llama-Swallow-9b-it-v0.1** | 0.592 | 0.796 | 0.872 | 0.742 | 0.638 | 0.802 | 0.745 | 0.803 | 0.749 |
| google/gemma-3-12b-it | 0.807 | 0.814 | 0.871 | 0.886 | 0.623 | 0.847 | 0.858 | 0.863 | 0.821 |
| google/gemma-2-27b-it | 0.727 | 0.809 | 0.874 | 0.719 | 0.639 | 0.810 | 0.740 | 0.826 | 0.768 |
| **tokyotech-llm/Gemma-2-Llama-Swallow-27b-it-v0.1** | 0.618 | 0.839 | 0.873 | 0.741 | 0.608 | 0.814 | 0.739 | 0.836 | 0.759 |
| google/gemma-3-27b-it | 0.804 | 0.927 | 0.879 | 0.876 | 0.774 | 0.846 | 0.848 | 0.882 | 0.855 |
| Qwen/Qwen2.5-32B-Instruct | 0.724 | 0.885 | 0.816 | 0.918 | 0.726 | 0.834 | 0.763 | 0.808 | 0.809 |
| 79 |
|
| 80 |
### Japanese tasks
|
| 81 |
|
| 82 |
| Model | JCom. | JEMHopQA | NIILC | JSQuAD | XL-Sum | MGSM | WMT20-en-ja | WMT20-ja-en | JMMLU | JHumanEval | Ja Avg |
| --------------------------------------------------- | ------ | -------- | ------- | ------- | ------- | ------ | ----------- | ----------- | ------ | ---------- | ------ |
| | 4-shot | 4-shot | 4-shot | 4-shot | 1-shot | 4-shot | 4-shot | 4-shot | 5-shot | 0-shot | |
| | EM acc | Char-F1 | Char-F1 | Char-F1 | ROUGE-2 | EM acc | BLEU | BLEU | EM acc | pass@1 | |
| google/gemma-3-1b-it | 0.526 | 0.330 | 0.237 | 0.700 | 0.113 | 0.088 | 0.166 | 0.115 | 0.332 | 0.245 | 0.285 |
| Qwen/Qwen2.5-1.5B-Instruct | 0.812 | 0.276 | 0.241 | 0.847 | 0.128 | 0.292 | 0.147 | 0.119 | 0.447 | 0.242 | 0.355 |
| google/gemma-2-2b-it | 0.862 | 0.348 | 0.315 | 0.879 | 0.117 | 0.252 | 0.207 | 0.183 | 0.437 | 0.321 | 0.392 |
| rinna/gemma-2-baku-2b-it | 0.855 | 0.228 | 0.390 | 0.877 | 0.115 | 0.172 | 0.255 | 0.190 | 0.415 | 0.165 | 0.366 |
| google/gemma-2-2b-jpn-it | 0.845 | 0.321 | 0.291 | 0.877 | 0.131 | 0.192 | 0.204 | 0.180 | 0.418 | 0.311 | 0.377 |
| **tokyotech-llm/Gemma-2-Llama-Swallow-2b-it-v0.1** | 0.862 | 0.367 | 0.483 | 0.881 | 0.145 | 0.288 | 0.258 | 0.200 | 0.485 | 0.267 | 0.424 |
| Qwen/Qwen2.5-3B-Instruct | 0.876 | 0.304 | 0.293 | 0.866 | 0.144 | 0.228 | 0.198 | 0.168 | 0.536 | 0.474 | 0.409 |
| google/gemma-3-4b-it | 0.818 | 0.444 | 0.404 | 0.801 | 0.134 | 0.332 | 0.217 | 0.169 | 0.477 | 0.365 | 0.416 |
| Qwen/Qwen2.5-7B-Instruct | 0.915 | 0.429 | 0.391 | 0.891 | 0.168 | 0.632 | 0.211 | 0.192 | 0.623 | 0.532 | 0.498 |
| tokyotech-llm/Llama-3.1-Swallow-8B-Instruct-v0.3 | 0.924 | 0.528 | 0.583 | 0.896 | 0.191 | 0.532 | 0.281 | 0.229 | 0.544 | 0.394 | 0.510 |
| google/gemma-2-9b-it | 0.931 | 0.532 | 0.527 | 0.876 | 0.149 | 0.636 | 0.273 | 0.239 | 0.623 | 0.559 | 0.535 |
| **tokyotech-llm/Gemma-2-Llama-Swallow-9b-it-v0.1** | 0.946 | 0.606 | 0.643 | 0.852 | 0.170 | 0.624 | 0.296 | 0.238 | 0.639 | 0.446 | 0.546 |
| google/gemma-3-12b-it | 0.935 | 0.566 | 0.542 | 0.808 | 0.148 | 0.724 | 0.289 | 0.239 | 0.645 | 0.637 | 0.553 |
| google/gemma-2-27b-it | 0.956 | 0.541 | 0.576 | 0.883 | 0.166 | 0.704 | 0.290 | 0.249 | 0.670 | 0.638 | 0.567 |
| **tokyotech-llm/Gemma-2-Llama-Swallow-27b-it-v0.1** | 0.969 | 0.654 | 0.658 | 0.891 | 0.194 | 0.764 | 0.316 | 0.258 | 0.686 | 0.635 | 0.602 |
| google/gemma-3-27b-it | 0.946 | 0.592 | 0.584 | 0.867 | 0.142 | 0.764 | 0.307 | 0.253 | 0.716 | 0.736 | 0.591 |
| Qwen/Qwen2.5-32B-Instruct | 0.959 | 0.567 | 0.497 | 0.903 | 0.169 | 0.780 | 0.228 | 0.195 | 0.757 | 0.651 | 0.571 |
| 103 |
|
| 104 |
### English tasks
|
| 105 |
|
| 106 |
| Model | OpenBookQA | TriviaQA | HellaSWAG | SQuAD2.0 | XWINO | MMLU | GSM8K | MATH | BBH | HumanEval | En Avg |
| --------------------------------------------------- | ---------- | -------- | --------- | -------- | ------ | ------ | ------ | ---------- | ---------- | --------- | ------ |
| | 4-shot | 4-shot | 4-shot | 4-shot | 4-shot | 5-shot | 4-shot | 4-shot | 3-shot | 0-shot | |
| | Acc | EM acc | Acc | EM acc | Acc | Acc | EM acc | CoT EM Acc | CoT EM Acc | pass@1 | |
| google/gemma-3-1b-it | 0.272 | 0.229 | 0.421 | 0.501 | 0.786 | 0.398 | 0.256 | 0.340 | 0.379 | 0.335 | 0.392 |
| Qwen/Qwen2.5-1.5B-Instruct | 0.334 | 0.378 | 0.503 | 0.501 | 0.844 | 0.604 | 0.257 | 0.272 | 0.272 | 0.277 | 0.424 |
| google/gemma-2-2b-it | 0.354 | 0.502 | 0.520 | 0.548 | 0.878 | 0.569 | 0.440 | 0.230 | 0.464 | 0.382 | 0.489 |
| rinna/gemma-2-baku-2b-it | 0.342 | 0.416 | 0.511 | 0.522 | 0.871 | 0.526 | 0.027 | 0.174 | 0.063 | 0.158 | 0.361 |
| google/gemma-2-2b-jpn-it | 0.370 | 0.503 | 0.532 | 0.539 | 0.879 | 0.557 | 0.351 | 0.132 | 0.451 | 0.392 | 0.471 |
| **tokyotech-llm/Gemma-2-Llama-Swallow-2b-it-v0.1** | 0.332 | 0.417 | 0.529 | 0.506 | 0.856 | 0.530 | 0.284 | 0.150 | 0.405 | 0.301 | 0.431 |
| Qwen/Qwen2.5-3B-Instruct | 0.364 | 0.446 | 0.562 | 0.504 | 0.869 | 0.664 | 0.096 | 0.612 | 0.128 | 0.471 | 0.472 |
| google/gemma-3-4b-it | 0.412 | 0.500 | 0.560 | 0.552 | 0.872 | 0.583 | 0.769 | 0.306 | 0.598 | 0.513 | 0.566 |
| Qwen/Qwen2.5-7B-Instruct | 0.428 | 0.519 | 0.624 | 0.569 | 0.877 | 0.742 | 0.739 | 0.688 | 0.217 | 0.636 | 0.604 |
| tokyotech-llm/Llama-3.1-Swallow-8B-Instruct-v0.3 | 0.396 | 0.629 | 0.593 | 0.570 | 0.884 | 0.629 | 0.622 | 0.266 | 0.626 | 0.445 | 0.566 |
| google/gemma-2-9b-it | 0.432 | 0.658 | 0.605 | 0.659 | 0.904 | 0.723 | 0.779 | 0.394 | 0.719 | 0.613 | 0.649 |
| **tokyotech-llm/Gemma-2-Llama-Swallow-9b-it-v0.1** | 0.404 | 0.640 | 0.609 | 0.623 | 0.900 | 0.680 | 0.710 | 0.392 | 0.663 | 0.491 | 0.611 |
| google/gemma-3-12b-it | 0.422 | 0.665 | 0.639 | 0.649 | 0.901 | 0.721 | 0.867 | 0.796 | 0.802 | 0.712 | 0.717 |
| google/gemma-2-27b-it | 0.458 | 0.766 | 0.655 | 0.669 | 0.909 | 0.762 | 0.851 | 0.466 | 0.790 | 0.707 | 0.703 |
| **tokyotech-llm/Gemma-2-Llama-Swallow-27b-it-v0.1** | 0.424 | 0.747 | 0.663 | 0.664 | 0.911 | 0.749 | 0.821 | 0.442 | 0.772 | 0.682 | 0.687 |
| google/gemma-3-27b-it | 0.418 | 0.744 | 0.661 | 0.687 | 0.906 | 0.774 | 0.916 | 0.852 | 0.793 | 0.829 | 0.758 |
| Qwen/Qwen2.5-32B-Instruct | 0.424 | 0.534 | 0.671 | 0.536 | 0.893 | 0.834 | 0.581 | 0.802 | 0.017 | 0.589 | 0.588 |
| 127 |
|
| 128 |
## Evaluation Benchmarks
|
| 129 |
|