vladislavbro commited on
Commit
a619a40
·
verified ·
1 Parent(s): 9e27834

Upload tokenizer.json

Browse files
Files changed (1) hide show
  1. tokenizer.json +280 -57
tokenizer.json CHANGED
@@ -21,15 +21,6 @@
21
  "rstrip": false,
22
  "normalized": false
23
  },
24
- {
25
- "id": 2,
26
- "special": true,
27
- "content": "[SPACE]",
28
- "single_word": false,
29
- "lstrip": false,
30
- "rstrip": false,
31
- "normalized": true
32
- },
33
  {
34
  "id": 255,
35
  "special": true,
@@ -1064,69 +1055,199 @@
1064
  "rstrip": false,
1065
  "normalized": false,
1066
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1067
  }
1068
  ],
1069
  "normalizer": {
1070
- "type": "Sequence",
1071
- "normalizers": [
1072
- {
1073
- "type": "Replace",
1074
- "pattern": { "String": " " },
1075
- "content": "[SPACE]"
1076
- }
1077
- ]
1078
- },
1079
- "pre_tokenizer": {
1080
- "type": "Whitespace"
1081
  },
 
1082
  "post_processor": {
1083
  "type": "TemplateProcessing",
1084
  "single": [
1085
- { "SpecialToken": { "id": "EXAGGERATION", "type_id": 0 } },
1086
- { "SpecialToken": { "id": "BOS", "type_id": 0 } },
1087
- { "Sequence": { "id": "A", "type_id": 0 } },
1088
- { "SpecialToken": { "id": "EOS", "type_id": 0 } },
1089
- { "SpecialToken": { "id": "START_SPEECH", "type_id": 0 } },
1090
- { "SpecialToken": { "id": "START_SPEECH", "type_id": 0 } }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1091
  ],
1092
  "pair": [
1093
- { "SpecialToken": { "id": "EXAGGERATION", "type_id": 0 } },
1094
- { "SpecialToken": { "id": "BOS", "type_id": 0 } },
1095
- { "Sequence": { "id": "A", "type_id": 0 } },
1096
- { "SpecialToken": { "id": "EOS", "type_id": 0 } },
1097
- { "SpecialToken": { "id": "START_SPEECH", "type_id": 0 } },
1098
- { "SpecialToken": { "id": "START_SPEECH", "type_id": 0 } },
1099
- { "SpecialToken": { "id": "EXAGGERATION", "type_id": 1 } },
1100
- { "SpecialToken": { "id": "BOS", "type_id": 1 } },
1101
- { "Sequence": { "id": "B", "type_id": 1 } },
1102
- { "SpecialToken": { "id": "EOS", "type_id": 1 } },
1103
- { "SpecialToken": { "id": "START_SPEECH", "type_id": 1 } },
1104
- { "SpecialToken": { "id": "START_SPEECH", "type_id": 1 } }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1105
  ],
1106
  "special_tokens": {
1107
- "BOS": {
1108
- "id": "BOS",
1109
- "ids": [255],
1110
- "tokens": ["<s>"]
 
 
 
 
1111
  },
1112
- "EOS": {
1113
- "id": "EOS",
1114
- "ids": [0],
1115
- "tokens": ["</s>"]
 
 
 
 
1116
  },
1117
- "EXAGGERATION": {
1118
- "id": "EXAGGERATION",
1119
- "ids": [6563],
1120
- "tokens": ["<EXAGGERATION>"]
 
 
 
 
1121
  },
1122
- "START_SPEECH": {
1123
- "id": "START_SPEECH",
1124
- "ids": [6561],
1125
- "tokens": ["<START_SPEECH>"]
 
 
 
 
1126
  }
1127
  }
1128
  },
1129
- "decoder": null,
 
 
1130
  "model": {
1131
  "type": "BPE",
1132
  "dropout": null,
@@ -3486,8 +3607,110 @@
3486
  "tch": 2348,
3487
  "sch": 2349,
3488
  "🙊": 2350,
3489
- "🤭": 2351
3490
- },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3491
  "merges": [
3492
  "t h",
3493
  "i n",
 
21
  "rstrip": false,
22
  "normalized": false
23
  },
 
 
 
 
 
 
 
 
 
24
  {
25
  "id": 255,
26
  "special": true,
 
1055
  "rstrip": false,
1056
  "normalized": false,
1057
  "special": true
1058
+ },
1059
+ {
1060
+ "id": 6561,
1061
+ "content": "[START_SPEECH]",
1062
+ "single_word": false,
1063
+ "lstrip": false,
1064
+ "rstrip": false,
1065
+ "normalized": false,
1066
+ "special": true
1067
+ },
1068
+ {
1069
+ "id": 6562,
1070
+ "content": "[STOP_SPEECH]",
1071
+ "single_word": false,
1072
+ "lstrip": false,
1073
+ "rstrip": false,
1074
+ "normalized": false,
1075
+ "special": true
1076
+ },
1077
+ {
1078
+ "id": 6563,
1079
+ "content": "[EXAGGERATION]",
1080
+ "single_word": false,
1081
+ "lstrip": false,
1082
+ "rstrip": false,
1083
+ "normalized": false,
1084
+ "special": true
1085
  }
1086
  ],
1087
  "normalizer": {
1088
+ "type": "Replace",
1089
+ "pattern": {
1090
+ "Regex": "\\s+"
1091
+ },
1092
+ "content": " "
 
 
 
 
 
 
1093
  },
1094
+ "pre_tokenizer": null,
1095
  "post_processor": {
1096
  "type": "TemplateProcessing",
1097
  "single": [
1098
+ {
1099
+ "SpecialToken": {
1100
+ "id": "[EXAGGERATION]",
1101
+ "type_id": 0
1102
+ }
1103
+ },
1104
+ {
1105
+ "SpecialToken": {
1106
+ "id": "[START]",
1107
+ "type_id": 0
1108
+ }
1109
+ },
1110
+ {
1111
+ "Sequence": {
1112
+ "id": "A",
1113
+ "type_id": 0
1114
+ }
1115
+ },
1116
+ {
1117
+ "SpecialToken": {
1118
+ "id": "[STOP]",
1119
+ "type_id": 0
1120
+ }
1121
+ },
1122
+ {
1123
+ "SpecialToken": {
1124
+ "id": "[START_SPEECH]",
1125
+ "type_id": 0
1126
+ }
1127
+ },
1128
+ {
1129
+ "SpecialToken": {
1130
+ "id": "[START_SPEECH]",
1131
+ "type_id": 0
1132
+ }
1133
+ }
1134
  ],
1135
  "pair": [
1136
+ {
1137
+ "SpecialToken": {
1138
+ "id": "[EXAGGERATION]",
1139
+ "type_id": 0
1140
+ }
1141
+ },
1142
+ {
1143
+ "SpecialToken": {
1144
+ "id": "[START]",
1145
+ "type_id": 0
1146
+ }
1147
+ },
1148
+ {
1149
+ "Sequence": {
1150
+ "id": "A",
1151
+ "type_id": 0
1152
+ }
1153
+ },
1154
+ {
1155
+ "SpecialToken": {
1156
+ "id": "[STOP]",
1157
+ "type_id": 0
1158
+ }
1159
+ },
1160
+ {
1161
+ "SpecialToken": {
1162
+ "id": "[START_SPEECH]",
1163
+ "type_id": 0
1164
+ }
1165
+ },
1166
+ {
1167
+ "SpecialToken": {
1168
+ "id": "[START_SPEECH]",
1169
+ "type_id": 0
1170
+ }
1171
+ },
1172
+ {
1173
+ "SpecialToken": {
1174
+ "id": "[EXAGGERATION]",
1175
+ "type_id": 1
1176
+ }
1177
+ },
1178
+ {
1179
+ "SpecialToken": {
1180
+ "id": "[START]",
1181
+ "type_id": 1
1182
+ }
1183
+ },
1184
+ {
1185
+ "Sequence": {
1186
+ "id": "B",
1187
+ "type_id": 1
1188
+ }
1189
+ },
1190
+ {
1191
+ "SpecialToken": {
1192
+ "id": "[STOP]",
1193
+ "type_id": 1
1194
+ }
1195
+ },
1196
+ {
1197
+ "SpecialToken": {
1198
+ "id": "[START_SPEECH]",
1199
+ "type_id": 1
1200
+ }
1201
+ },
1202
+ {
1203
+ "SpecialToken": {
1204
+ "id": "[START_SPEECH]",
1205
+ "type_id": 1
1206
+ }
1207
+ }
1208
  ],
1209
  "special_tokens": {
1210
+ "[START]": {
1211
+ "id": "[START]",
1212
+ "ids": [
1213
+ 255
1214
+ ],
1215
+ "tokens": [
1216
+ "[START]"
1217
+ ]
1218
  },
1219
+ "[STOP]": {
1220
+ "id": "[STOP]",
1221
+ "ids": [
1222
+ 0
1223
+ ],
1224
+ "tokens": [
1225
+ "[STOP]"
1226
+ ]
1227
  },
1228
+ "[EXAGGERATION]": {
1229
+ "id": "[EXAGGERATION]",
1230
+ "ids": [
1231
+ 6563
1232
+ ],
1233
+ "tokens": [
1234
+ "[EXAGGERATION]"
1235
+ ]
1236
  },
1237
+ "[START_SPEECH]": {
1238
+ "id": "[START_SPEECH]",
1239
+ "ids": [
1240
+ 6561
1241
+ ],
1242
+ "tokens": [
1243
+ "[START_SPEECH]"
1244
+ ]
1245
  }
1246
  }
1247
  },
1248
+ "decoder": {
1249
+ "type": "Fuse"
1250
+ },
1251
  "model": {
1252
  "type": "BPE",
1253
  "dropout": null,
 
3607
  "tch": 2348,
3608
  "sch": 2349,
3609
  "🙊": 2350,
3610
+ "🤭": 2351,
3611
+ "€": 2352,
3612
+ "أ": 2353,
3613
+ "إ": 2354,
3614
+ "ئ": 2355,
3615
+ "آ": 2356,
3616
+ "ؤ": 2357,
3617
+ "ﻻ": 2358,
3618
+ "ﺃ": 2359,
3619
+ "ę": 2360,
3620
+ "ą": 2361,
3621
+ "ż": 2362,
3622
+ "ś": 2363,
3623
+ "ć": 2364,
3624
+ "ń": 2365,
3625
+ "ź": 2366,
3626
+ "Ś": 2367,
3627
+ "Ź": 2368,
3628
+ "Ż": 2369,
3629
+ "Ć": 2370,
3630
+ "Š": 2371,
3631
+ "Ő": 2372,
3632
+ "й": 2373,
3633
+ "ё": 2374,
3634
+ "Й": 2375,
3635
+ "Ё": 2376,
3636
+ "が": 2377,
3637
+ "で": 2378,
3638
+ "じ": 2379,
3639
+ "だ": 2380,
3640
+ "ど": 2381,
3641
+ "ば": 2382,
3642
+ "げ": 2383,
3643
+ "ご": 2384,
3644
+ "ぶ": 2385,
3645
+ "ぎ": 2386,
3646
+ ",": 2387,
3647
+ "(": 2388,
3648
+ ":": 2389,
3649
+ ";": 2390,
3650
+ "?": 2391,
3651
+ "!": 2392,
3652
+ "#": 2393,
3653
+ " )": 2394,
3654
+ "ά": 2395,
3655
+ "ό": 2396,
3656
+ "ί": 2397,
3657
+ "έ": 2398,
3658
+ "ή": 2399,
3659
+ "ύ": 2400,
3660
+ "ώ": 2401,
3661
+ "Έ": 2402,
3662
+ "Ό": 2403,
3663
+ "Ή": 2404,
3664
+ "ž": 2405,
3665
+ "š": 2406,
3666
+ "ū": 2407,
3667
+ "ş": 2408,
3668
+ "Ō": 2409,
3669
+ "ī": 2410,
3670
+ "č": 2411,
3671
+ "ř": 2412,
3672
+ "ă": 2413,
3673
+ "이": 2414,
3674
+ "기": 2415,
3675
+ "요": 2416,
3676
+ "에": 2417,
3677
+ "다": 2418,
3678
+ "을": 2419,
3679
+ "은": 2420,
3680
+ "서": 2421,
3681
+ "니": 2422,
3682
+ "어": 2423,
3683
+ "ě": 2424,
3684
+ "ů": 2425,
3685
+ "Č": 2426,
3686
+ "ň": 2427,
3687
+ "ď": 2428,
3688
+ "ť": 2429,
3689
+ "♭": 2430,
3690
+ "ľ": 2431,
3691
+ "ĺ": 2432,
3692
+ "ğ": 2433,
3693
+ "İ": 2434,
3694
+ "Ş": 2435,
3695
+ "ड़": 2436,
3696
+ "ढ़": 2437,
3697
+ "ज़": 2438,
3698
+ "फ़": 2439,
3699
+ "ख़": 2440,
3700
+ "क़": 2441,
3701
+ "ग़": 2442,
3702
+ "Ά": 2443,
3703
+ "ϊ": 2444,
3704
+ "Ί": 2445,
3705
+ "Ύ": 2446,
3706
+ "Ώ": 2447,
3707
+ "ΐ": 2448,
3708
+ "ϋ": 2449,
3709
+ "ũ": 2450,
3710
+ "ụ": 2451,
3711
+ "ọ": 2452,
3712
+ "ạ": 2453
3713
+ },
3714
  "merges": [
3715
  "t h",
3716
  "i n",