Upload tokenizer

#6
by ArthurZ HF Staff - opened
added_tokens.json ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</s>": 2,
3
+ "<mask>": 256203,
4
+ "<pad>": 1,
5
+ "<s>": 0,
6
+ "<unk>": 3,
7
+ "ace_Arab": 256001,
8
+ "ace_Latn": 256002,
9
+ "acm_Arab": 256003,
10
+ "acq_Arab": 256004,
11
+ "aeb_Arab": 256005,
12
+ "afr_Latn": 256006,
13
+ "ajp_Arab": 256007,
14
+ "aka_Latn": 256008,
15
+ "als_Latn": 256162,
16
+ "amh_Ethi": 256009,
17
+ "apc_Arab": 256010,
18
+ "arb_Arab": 256011,
19
+ "ars_Arab": 256012,
20
+ "ary_Arab": 256013,
21
+ "arz_Arab": 256014,
22
+ "asm_Beng": 256015,
23
+ "ast_Latn": 256016,
24
+ "awa_Deva": 256017,
25
+ "ayr_Latn": 256018,
26
+ "azb_Arab": 256019,
27
+ "azj_Latn": 256020,
28
+ "bak_Cyrl": 256021,
29
+ "bam_Latn": 256022,
30
+ "ban_Latn": 256023,
31
+ "bel_Cyrl": 256024,
32
+ "bem_Latn": 256025,
33
+ "ben_Beng": 256026,
34
+ "bho_Deva": 256027,
35
+ "bjn_Arab": 256028,
36
+ "bjn_Latn": 256029,
37
+ "bod_Tibt": 256030,
38
+ "bos_Latn": 256031,
39
+ "bug_Latn": 256032,
40
+ "bul_Cyrl": 256033,
41
+ "cat_Latn": 256034,
42
+ "ceb_Latn": 256035,
43
+ "ces_Latn": 256036,
44
+ "cjk_Latn": 256037,
45
+ "ckb_Arab": 256038,
46
+ "crh_Latn": 256039,
47
+ "cym_Latn": 256040,
48
+ "dan_Latn": 256041,
49
+ "deu_Latn": 256042,
50
+ "dik_Latn": 256043,
51
+ "dyu_Latn": 256044,
52
+ "dzo_Tibt": 256045,
53
+ "ell_Grek": 256046,
54
+ "eng_Latn": 256047,
55
+ "epo_Latn": 256048,
56
+ "est_Latn": 256049,
57
+ "eus_Latn": 256050,
58
+ "ewe_Latn": 256051,
59
+ "fao_Latn": 256052,
60
+ "fij_Latn": 256054,
61
+ "fin_Latn": 256055,
62
+ "fon_Latn": 256056,
63
+ "fra_Latn": 256057,
64
+ "fur_Latn": 256058,
65
+ "fuv_Latn": 256059,
66
+ "gaz_Latn": 256135,
67
+ "gla_Latn": 256060,
68
+ "gle_Latn": 256061,
69
+ "glg_Latn": 256062,
70
+ "grn_Latn": 256063,
71
+ "guj_Gujr": 256064,
72
+ "hat_Latn": 256065,
73
+ "hau_Latn": 256066,
74
+ "heb_Hebr": 256067,
75
+ "hin_Deva": 256068,
76
+ "hne_Deva": 256069,
77
+ "hrv_Latn": 256070,
78
+ "hun_Latn": 256071,
79
+ "hye_Armn": 256072,
80
+ "ibo_Latn": 256073,
81
+ "ilo_Latn": 256074,
82
+ "ind_Latn": 256075,
83
+ "isl_Latn": 256076,
84
+ "ita_Latn": 256077,
85
+ "jav_Latn": 256078,
86
+ "jpn_Jpan": 256079,
87
+ "kab_Latn": 256080,
88
+ "kac_Latn": 256081,
89
+ "kam_Latn": 256082,
90
+ "kan_Knda": 256083,
91
+ "kas_Arab": 256084,
92
+ "kas_Deva": 256085,
93
+ "kat_Geor": 256086,
94
+ "kaz_Cyrl": 256089,
95
+ "kbp_Latn": 256090,
96
+ "kea_Latn": 256091,
97
+ "khk_Cyrl": 256122,
98
+ "khm_Khmr": 256092,
99
+ "kik_Latn": 256093,
100
+ "kin_Latn": 256094,
101
+ "kir_Cyrl": 256095,
102
+ "kmb_Latn": 256096,
103
+ "kmr_Latn": 256099,
104
+ "knc_Arab": 256087,
105
+ "knc_Latn": 256088,
106
+ "kon_Latn": 256097,
107
+ "kor_Hang": 256098,
108
+ "lao_Laoo": 256100,
109
+ "lij_Latn": 256102,
110
+ "lim_Latn": 256103,
111
+ "lin_Latn": 256104,
112
+ "lit_Latn": 256105,
113
+ "lmo_Latn": 256106,
114
+ "ltg_Latn": 256107,
115
+ "ltz_Latn": 256108,
116
+ "lua_Latn": 256109,
117
+ "lug_Latn": 256110,
118
+ "luo_Latn": 256111,
119
+ "lus_Latn": 256112,
120
+ "lvs_Latn": 256101,
121
+ "mag_Deva": 256113,
122
+ "mai_Deva": 256114,
123
+ "mal_Mlym": 256115,
124
+ "mar_Deva": 256116,
125
+ "min_Latn": 256117,
126
+ "mkd_Cyrl": 256118,
127
+ "mlt_Latn": 256120,
128
+ "mni_Beng": 256121,
129
+ "mos_Latn": 256123,
130
+ "mri_Latn": 256124,
131
+ "mya_Mymr": 256126,
132
+ "nld_Latn": 256127,
133
+ "nno_Latn": 256128,
134
+ "nob_Latn": 256129,
135
+ "npi_Deva": 256130,
136
+ "nso_Latn": 256131,
137
+ "nus_Latn": 256132,
138
+ "nya_Latn": 256133,
139
+ "oci_Latn": 256134,
140
+ "ory_Orya": 256136,
141
+ "pag_Latn": 256137,
142
+ "pan_Guru": 256138,
143
+ "pap_Latn": 256139,
144
+ "pbt_Arab": 256143,
145
+ "pes_Arab": 256053,
146
+ "plt_Latn": 256119,
147
+ "pol_Latn": 256140,
148
+ "por_Latn": 256141,
149
+ "prs_Arab": 256142,
150
+ "quy_Latn": 256144,
151
+ "ron_Latn": 256145,
152
+ "run_Latn": 256146,
153
+ "rus_Cyrl": 256147,
154
+ "sag_Latn": 256148,
155
+ "san_Deva": 256149,
156
+ "sat_Beng": 256150,
157
+ "scn_Latn": 256151,
158
+ "shn_Mymr": 256152,
159
+ "sin_Sinh": 256153,
160
+ "slk_Latn": 256154,
161
+ "slv_Latn": 256155,
162
+ "smo_Latn": 256156,
163
+ "sna_Latn": 256157,
164
+ "snd_Arab": 256158,
165
+ "som_Latn": 256159,
166
+ "sot_Latn": 256160,
167
+ "spa_Latn": 256161,
168
+ "srd_Latn": 256163,
169
+ "srp_Cyrl": 256164,
170
+ "ssw_Latn": 256165,
171
+ "sun_Latn": 256166,
172
+ "swe_Latn": 256167,
173
+ "swh_Latn": 256168,
174
+ "szl_Latn": 256169,
175
+ "tam_Taml": 256170,
176
+ "taq_Latn": 256177,
177
+ "taq_Tfng": 256178,
178
+ "tat_Cyrl": 256171,
179
+ "tel_Telu": 256172,
180
+ "tgk_Cyrl": 256173,
181
+ "tgl_Latn": 256174,
182
+ "tha_Thai": 256175,
183
+ "tir_Ethi": 256176,
184
+ "tpi_Latn": 256179,
185
+ "tsn_Latn": 256180,
186
+ "tso_Latn": 256181,
187
+ "tuk_Latn": 256182,
188
+ "tum_Latn": 256183,
189
+ "tur_Latn": 256184,
190
+ "twi_Latn": 256185,
191
+ "tzm_Tfng": 256186,
192
+ "uig_Arab": 256187,
193
+ "ukr_Cyrl": 256188,
194
+ "umb_Latn": 256189,
195
+ "urd_Arab": 256190,
196
+ "uzn_Latn": 256191,
197
+ "vec_Latn": 256192,
198
+ "vie_Latn": 256193,
199
+ "war_Latn": 256194,
200
+ "wol_Latn": 256195,
201
+ "xho_Latn": 256196,
202
+ "ydd_Hebr": 256197,
203
+ "yor_Latn": 256198,
204
+ "yue_Hant": 256199,
205
+ "zho_Hans": 256200,
206
+ "zho_Hant": 256201,
207
+ "zsm_Latn": 256125,
208
+ "zul_Latn": 256202
209
+ }
special_tokens_map.json CHANGED
@@ -203,53 +203,11 @@
203
  "zho_Hant",
204
  "zul_Latn"
205
  ],
206
- "bos_token": {
207
- "content": "<s>",
208
- "lstrip": false,
209
- "normalized": false,
210
- "rstrip": false,
211
- "single_word": false
212
- },
213
- "cls_token": {
214
- "content": "<s>",
215
- "lstrip": false,
216
- "normalized": false,
217
- "rstrip": false,
218
- "single_word": false
219
- },
220
- "eos_token": {
221
- "content": "</s>",
222
- "lstrip": false,
223
- "normalized": false,
224
- "rstrip": false,
225
- "single_word": false
226
- },
227
- "mask_token": {
228
- "content": "<mask>",
229
- "lstrip": true,
230
- "normalized": true,
231
- "rstrip": false,
232
- "single_word": false
233
- },
234
- "pad_token": {
235
- "content": "<pad>",
236
- "lstrip": false,
237
- "normalized": false,
238
- "rstrip": false,
239
- "single_word": false
240
- },
241
- "sep_token": {
242
- "content": "</s>",
243
- "lstrip": false,
244
- "normalized": false,
245
- "rstrip": false,
246
- "single_word": false
247
- },
248
- "unk_token": {
249
- "content": "<unk>",
250
- "lstrip": false,
251
- "normalized": false,
252
- "rstrip": false,
253
- "single_word": false
254
- }
255
  }
 
203
  "zho_Hant",
204
  "zul_Latn"
205
  ],
206
+ "bos_token": "<s>",
207
+ "cls_token": "<s>",
208
+ "eos_token": "</s>",
209
+ "mask_token": "<mask>",
210
+ "pad_token": "<pad>",
211
+ "sep_token": "</s>",
212
+ "unk_token": "<unk>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
213
  }
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:610501fe8857739dbb451ab69a0a795cb87dadcf8873d7e2227764d165e72e72
3
- size 17331379
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9633f7d33a10432bf06e8865c3c7d4e4798ed432be7bbc7a4a29051a7f5e594
3
+ size 17331380
tokenizer_config.json CHANGED
@@ -1651,7 +1651,7 @@
1651
  "256203": {
1652
  "content": "<mask>",
1653
  "lstrip": true,
1654
- "normalized": true,
1655
  "rstrip": false,
1656
  "single_word": false,
1657
  "special": true
 
1651
  "256203": {
1652
  "content": "<mask>",
1653
  "lstrip": true,
1654
+ "normalized": false,
1655
  "rstrip": false,
1656
  "single_word": false,
1657
  "special": true