Upload tokenizer.json
Browse files- tokenizer.json +280 -57
tokenizer.json
CHANGED
|
@@ -21,15 +21,6 @@
|
|
| 21 |
"rstrip": false,
|
| 22 |
"normalized": false
|
| 23 |
},
|
| 24 |
-
{
|
| 25 |
-
"id": 2,
|
| 26 |
-
"special": true,
|
| 27 |
-
"content": "[SPACE]",
|
| 28 |
-
"single_word": false,
|
| 29 |
-
"lstrip": false,
|
| 30 |
-
"rstrip": false,
|
| 31 |
-
"normalized": true
|
| 32 |
-
},
|
| 33 |
{
|
| 34 |
"id": 255,
|
| 35 |
"special": true,
|
|
@@ -1064,69 +1055,199 @@
|
|
| 1064 |
"rstrip": false,
|
| 1065 |
"normalized": false,
|
| 1066 |
"special": true
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1067 |
}
|
| 1068 |
],
|
| 1069 |
"normalizer": {
|
| 1070 |
-
"type": "
|
| 1071 |
-
"
|
| 1072 |
-
|
| 1073 |
-
|
| 1074 |
-
|
| 1075 |
-
"content": "[SPACE]"
|
| 1076 |
-
}
|
| 1077 |
-
]
|
| 1078 |
-
},
|
| 1079 |
-
"pre_tokenizer": {
|
| 1080 |
-
"type": "Whitespace"
|
| 1081 |
},
|
|
|
|
| 1082 |
"post_processor": {
|
| 1083 |
"type": "TemplateProcessing",
|
| 1084 |
"single": [
|
| 1085 |
-
{
|
| 1086 |
-
|
| 1087 |
-
|
| 1088 |
-
|
| 1089 |
-
|
| 1090 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1091 |
],
|
| 1092 |
"pair": [
|
| 1093 |
-
{
|
| 1094 |
-
|
| 1095 |
-
|
| 1096 |
-
|
| 1097 |
-
|
| 1098 |
-
|
| 1099 |
-
{
|
| 1100 |
-
|
| 1101 |
-
|
| 1102 |
-
|
| 1103 |
-
|
| 1104 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1105 |
],
|
| 1106 |
"special_tokens": {
|
| 1107 |
-
"
|
| 1108 |
-
"id": "
|
| 1109 |
-
"ids": [
|
| 1110 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1111 |
},
|
| 1112 |
-
"
|
| 1113 |
-
"id": "
|
| 1114 |
-
"ids": [
|
| 1115 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1116 |
},
|
| 1117 |
-
"EXAGGERATION": {
|
| 1118 |
-
"id": "EXAGGERATION",
|
| 1119 |
-
"ids": [
|
| 1120 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1121 |
},
|
| 1122 |
-
"START_SPEECH": {
|
| 1123 |
-
"id": "START_SPEECH",
|
| 1124 |
-
"ids": [
|
| 1125 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1126 |
}
|
| 1127 |
}
|
| 1128 |
},
|
| 1129 |
-
"decoder":
|
|
|
|
|
|
|
| 1130 |
"model": {
|
| 1131 |
"type": "BPE",
|
| 1132 |
"dropout": null,
|
|
@@ -3486,8 +3607,110 @@
|
|
| 3486 |
"tch": 2348,
|
| 3487 |
"sch": 2349,
|
| 3488 |
"🙊": 2350,
|
| 3489 |
-
"🤭": 2351
|
| 3490 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3491 |
"merges": [
|
| 3492 |
"t h",
|
| 3493 |
"i n",
|
|
|
|
| 21 |
"rstrip": false,
|
| 22 |
"normalized": false
|
| 23 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
{
|
| 25 |
"id": 255,
|
| 26 |
"special": true,
|
|
|
|
| 1055 |
"rstrip": false,
|
| 1056 |
"normalized": false,
|
| 1057 |
"special": true
|
| 1058 |
+
},
|
| 1059 |
+
{
|
| 1060 |
+
"id": 6561,
|
| 1061 |
+
"content": "[START_SPEECH]",
|
| 1062 |
+
"single_word": false,
|
| 1063 |
+
"lstrip": false,
|
| 1064 |
+
"rstrip": false,
|
| 1065 |
+
"normalized": false,
|
| 1066 |
+
"special": true
|
| 1067 |
+
},
|
| 1068 |
+
{
|
| 1069 |
+
"id": 6562,
|
| 1070 |
+
"content": "[STOP_SPEECH]",
|
| 1071 |
+
"single_word": false,
|
| 1072 |
+
"lstrip": false,
|
| 1073 |
+
"rstrip": false,
|
| 1074 |
+
"normalized": false,
|
| 1075 |
+
"special": true
|
| 1076 |
+
},
|
| 1077 |
+
{
|
| 1078 |
+
"id": 6563,
|
| 1079 |
+
"content": "[EXAGGERATION]",
|
| 1080 |
+
"single_word": false,
|
| 1081 |
+
"lstrip": false,
|
| 1082 |
+
"rstrip": false,
|
| 1083 |
+
"normalized": false,
|
| 1084 |
+
"special": true
|
| 1085 |
}
|
| 1086 |
],
|
| 1087 |
"normalizer": {
|
| 1088 |
+
"type": "Replace",
|
| 1089 |
+
"pattern": {
|
| 1090 |
+
"Regex": "\\s+"
|
| 1091 |
+
},
|
| 1092 |
+
"content": " "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1093 |
},
|
| 1094 |
+
"pre_tokenizer": null,
|
| 1095 |
"post_processor": {
|
| 1096 |
"type": "TemplateProcessing",
|
| 1097 |
"single": [
|
| 1098 |
+
{
|
| 1099 |
+
"SpecialToken": {
|
| 1100 |
+
"id": "[EXAGGERATION]",
|
| 1101 |
+
"type_id": 0
|
| 1102 |
+
}
|
| 1103 |
+
},
|
| 1104 |
+
{
|
| 1105 |
+
"SpecialToken": {
|
| 1106 |
+
"id": "[START]",
|
| 1107 |
+
"type_id": 0
|
| 1108 |
+
}
|
| 1109 |
+
},
|
| 1110 |
+
{
|
| 1111 |
+
"Sequence": {
|
| 1112 |
+
"id": "A",
|
| 1113 |
+
"type_id": 0
|
| 1114 |
+
}
|
| 1115 |
+
},
|
| 1116 |
+
{
|
| 1117 |
+
"SpecialToken": {
|
| 1118 |
+
"id": "[STOP]",
|
| 1119 |
+
"type_id": 0
|
| 1120 |
+
}
|
| 1121 |
+
},
|
| 1122 |
+
{
|
| 1123 |
+
"SpecialToken": {
|
| 1124 |
+
"id": "[START_SPEECH]",
|
| 1125 |
+
"type_id": 0
|
| 1126 |
+
}
|
| 1127 |
+
},
|
| 1128 |
+
{
|
| 1129 |
+
"SpecialToken": {
|
| 1130 |
+
"id": "[START_SPEECH]",
|
| 1131 |
+
"type_id": 0
|
| 1132 |
+
}
|
| 1133 |
+
}
|
| 1134 |
],
|
| 1135 |
"pair": [
|
| 1136 |
+
{
|
| 1137 |
+
"SpecialToken": {
|
| 1138 |
+
"id": "[EXAGGERATION]",
|
| 1139 |
+
"type_id": 0
|
| 1140 |
+
}
|
| 1141 |
+
},
|
| 1142 |
+
{
|
| 1143 |
+
"SpecialToken": {
|
| 1144 |
+
"id": "[START]",
|
| 1145 |
+
"type_id": 0
|
| 1146 |
+
}
|
| 1147 |
+
},
|
| 1148 |
+
{
|
| 1149 |
+
"Sequence": {
|
| 1150 |
+
"id": "A",
|
| 1151 |
+
"type_id": 0
|
| 1152 |
+
}
|
| 1153 |
+
},
|
| 1154 |
+
{
|
| 1155 |
+
"SpecialToken": {
|
| 1156 |
+
"id": "[STOP]",
|
| 1157 |
+
"type_id": 0
|
| 1158 |
+
}
|
| 1159 |
+
},
|
| 1160 |
+
{
|
| 1161 |
+
"SpecialToken": {
|
| 1162 |
+
"id": "[START_SPEECH]",
|
| 1163 |
+
"type_id": 0
|
| 1164 |
+
}
|
| 1165 |
+
},
|
| 1166 |
+
{
|
| 1167 |
+
"SpecialToken": {
|
| 1168 |
+
"id": "[START_SPEECH]",
|
| 1169 |
+
"type_id": 0
|
| 1170 |
+
}
|
| 1171 |
+
},
|
| 1172 |
+
{
|
| 1173 |
+
"SpecialToken": {
|
| 1174 |
+
"id": "[EXAGGERATION]",
|
| 1175 |
+
"type_id": 1
|
| 1176 |
+
}
|
| 1177 |
+
},
|
| 1178 |
+
{
|
| 1179 |
+
"SpecialToken": {
|
| 1180 |
+
"id": "[START]",
|
| 1181 |
+
"type_id": 1
|
| 1182 |
+
}
|
| 1183 |
+
},
|
| 1184 |
+
{
|
| 1185 |
+
"Sequence": {
|
| 1186 |
+
"id": "B",
|
| 1187 |
+
"type_id": 1
|
| 1188 |
+
}
|
| 1189 |
+
},
|
| 1190 |
+
{
|
| 1191 |
+
"SpecialToken": {
|
| 1192 |
+
"id": "[STOP]",
|
| 1193 |
+
"type_id": 1
|
| 1194 |
+
}
|
| 1195 |
+
},
|
| 1196 |
+
{
|
| 1197 |
+
"SpecialToken": {
|
| 1198 |
+
"id": "[START_SPEECH]",
|
| 1199 |
+
"type_id": 1
|
| 1200 |
+
}
|
| 1201 |
+
},
|
| 1202 |
+
{
|
| 1203 |
+
"SpecialToken": {
|
| 1204 |
+
"id": "[START_SPEECH]",
|
| 1205 |
+
"type_id": 1
|
| 1206 |
+
}
|
| 1207 |
+
}
|
| 1208 |
],
|
| 1209 |
"special_tokens": {
|
| 1210 |
+
"[START]": {
|
| 1211 |
+
"id": "[START]",
|
| 1212 |
+
"ids": [
|
| 1213 |
+
255
|
| 1214 |
+
],
|
| 1215 |
+
"tokens": [
|
| 1216 |
+
"[START]"
|
| 1217 |
+
]
|
| 1218 |
},
|
| 1219 |
+
"[STOP]": {
|
| 1220 |
+
"id": "[STOP]",
|
| 1221 |
+
"ids": [
|
| 1222 |
+
0
|
| 1223 |
+
],
|
| 1224 |
+
"tokens": [
|
| 1225 |
+
"[STOP]"
|
| 1226 |
+
]
|
| 1227 |
},
|
| 1228 |
+
"[EXAGGERATION]": {
|
| 1229 |
+
"id": "[EXAGGERATION]",
|
| 1230 |
+
"ids": [
|
| 1231 |
+
6563
|
| 1232 |
+
],
|
| 1233 |
+
"tokens": [
|
| 1234 |
+
"[EXAGGERATION]"
|
| 1235 |
+
]
|
| 1236 |
},
|
| 1237 |
+
"[START_SPEECH]": {
|
| 1238 |
+
"id": "[START_SPEECH]",
|
| 1239 |
+
"ids": [
|
| 1240 |
+
6561
|
| 1241 |
+
],
|
| 1242 |
+
"tokens": [
|
| 1243 |
+
"[START_SPEECH]"
|
| 1244 |
+
]
|
| 1245 |
}
|
| 1246 |
}
|
| 1247 |
},
|
| 1248 |
+
"decoder": {
|
| 1249 |
+
"type": "Fuse"
|
| 1250 |
+
},
|
| 1251 |
"model": {
|
| 1252 |
"type": "BPE",
|
| 1253 |
"dropout": null,
|
|
|
|
| 3607 |
"tch": 2348,
|
| 3608 |
"sch": 2349,
|
| 3609 |
"🙊": 2350,
|
| 3610 |
+
"🤭": 2351,
|
| 3611 |
+
"€": 2352,
|
| 3612 |
+
"أ": 2353,
|
| 3613 |
+
"إ": 2354,
|
| 3614 |
+
"ئ": 2355,
|
| 3615 |
+
"آ": 2356,
|
| 3616 |
+
"ؤ": 2357,
|
| 3617 |
+
"ﻻ": 2358,
|
| 3618 |
+
"ﺃ": 2359,
|
| 3619 |
+
"ę": 2360,
|
| 3620 |
+
"ą": 2361,
|
| 3621 |
+
"ż": 2362,
|
| 3622 |
+
"ś": 2363,
|
| 3623 |
+
"ć": 2364,
|
| 3624 |
+
"ń": 2365,
|
| 3625 |
+
"ź": 2366,
|
| 3626 |
+
"Ś": 2367,
|
| 3627 |
+
"Ź": 2368,
|
| 3628 |
+
"Ż": 2369,
|
| 3629 |
+
"Ć": 2370,
|
| 3630 |
+
"Š": 2371,
|
| 3631 |
+
"Ő": 2372,
|
| 3632 |
+
"й": 2373,
|
| 3633 |
+
"ё": 2374,
|
| 3634 |
+
"Й": 2375,
|
| 3635 |
+
"Ё": 2376,
|
| 3636 |
+
"が": 2377,
|
| 3637 |
+
"で": 2378,
|
| 3638 |
+
"じ": 2379,
|
| 3639 |
+
"だ": 2380,
|
| 3640 |
+
"ど": 2381,
|
| 3641 |
+
"ば": 2382,
|
| 3642 |
+
"げ": 2383,
|
| 3643 |
+
"ご": 2384,
|
| 3644 |
+
"ぶ": 2385,
|
| 3645 |
+
"ぎ": 2386,
|
| 3646 |
+
",": 2387,
|
| 3647 |
+
"(": 2388,
|
| 3648 |
+
":": 2389,
|
| 3649 |
+
";": 2390,
|
| 3650 |
+
"?": 2391,
|
| 3651 |
+
"!": 2392,
|
| 3652 |
+
"#": 2393,
|
| 3653 |
+
" )": 2394,
|
| 3654 |
+
"ά": 2395,
|
| 3655 |
+
"ό": 2396,
|
| 3656 |
+
"ί": 2397,
|
| 3657 |
+
"έ": 2398,
|
| 3658 |
+
"ή": 2399,
|
| 3659 |
+
"ύ": 2400,
|
| 3660 |
+
"ώ": 2401,
|
| 3661 |
+
"Έ": 2402,
|
| 3662 |
+
"Ό": 2403,
|
| 3663 |
+
"Ή": 2404,
|
| 3664 |
+
"ž": 2405,
|
| 3665 |
+
"š": 2406,
|
| 3666 |
+
"ū": 2407,
|
| 3667 |
+
"ş": 2408,
|
| 3668 |
+
"Ō": 2409,
|
| 3669 |
+
"ī": 2410,
|
| 3670 |
+
"č": 2411,
|
| 3671 |
+
"ř": 2412,
|
| 3672 |
+
"ă": 2413,
|
| 3673 |
+
"이": 2414,
|
| 3674 |
+
"기": 2415,
|
| 3675 |
+
"요": 2416,
|
| 3676 |
+
"에": 2417,
|
| 3677 |
+
"다": 2418,
|
| 3678 |
+
"을": 2419,
|
| 3679 |
+
"은": 2420,
|
| 3680 |
+
"서": 2421,
|
| 3681 |
+
"니": 2422,
|
| 3682 |
+
"어": 2423,
|
| 3683 |
+
"ě": 2424,
|
| 3684 |
+
"ů": 2425,
|
| 3685 |
+
"Č": 2426,
|
| 3686 |
+
"ň": 2427,
|
| 3687 |
+
"ď": 2428,
|
| 3688 |
+
"ť": 2429,
|
| 3689 |
+
"♭": 2430,
|
| 3690 |
+
"ľ": 2431,
|
| 3691 |
+
"ĺ": 2432,
|
| 3692 |
+
"ğ": 2433,
|
| 3693 |
+
"İ": 2434,
|
| 3694 |
+
"Ş": 2435,
|
| 3695 |
+
"ड़": 2436,
|
| 3696 |
+
"ढ़": 2437,
|
| 3697 |
+
"ज़": 2438,
|
| 3698 |
+
"फ़": 2439,
|
| 3699 |
+
"ख़": 2440,
|
| 3700 |
+
"क़": 2441,
|
| 3701 |
+
"ग़": 2442,
|
| 3702 |
+
"Ά": 2443,
|
| 3703 |
+
"ϊ": 2444,
|
| 3704 |
+
"Ί": 2445,
|
| 3705 |
+
"Ύ": 2446,
|
| 3706 |
+
"Ώ": 2447,
|
| 3707 |
+
"ΐ": 2448,
|
| 3708 |
+
"ϋ": 2449,
|
| 3709 |
+
"ũ": 2450,
|
| 3710 |
+
"ụ": 2451,
|
| 3711 |
+
"ọ": 2452,
|
| 3712 |
+
"ạ": 2453
|
| 3713 |
+
},
|
| 3714 |
"merges": [
|
| 3715 |
"t h",
|
| 3716 |
"i n",
|