hans00 committed
Commit 069b702 · verified · Parent: e8c0b2b

Create tokenizer.json

Files changed (1):
  1. tokenizer.json +170 -0
tokenizer.json ADDED
@@ -0,0 +1,170 @@
+{
+  "version": "1.0",
+  "truncation": null,
+  "padding": null,
+  "added_tokens": [
+    {
+      "id": 0,
+      "content": "_",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 111,
+      "content": "UNK",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  ],
+  "normalizer": {
+    "type": "Sequence",
+    "normalizers": [
+      {
+        "type": "Replace",
+        "pattern": {
+          "Regex": "(?:\\[SEP\\]|^|$)"
+        },
+        "content": "_"
+      },
+      {
+        "type": "Replace",
+        "pattern": {
+          "Regex": "(?:_)+"
+        },
+        "content": "_"
+      }
+    ]
+  },
+  "pre_tokenizer": {
+    "type": "Split",
+    "pattern": {
+      "Regex": "_"
+    },
+    "behavior": "Isolated",
+    "invert": false
+  },
+  "post_processor": null,
+  "decoder": null,
+  "model": {
+    "vocab": {
+      "!": 103,
+      "'": 108,
+      ",": 106,
+      "-": 109,
+      ".": 107,
+      "?": 104,
+      "AA": 1,
+      "E": 2,
+      "EE": 3,
+      "En": 4,
+      "N": 5,
+      "OO": 6,
+      "SP": 110,
+      "UNK": 111,
+      "V": 7,
+      "_": 0,
+      "a": 8,
+      "a:": 9,
+      "aa": 10,
+      "ae": 11,
+      "ah": 12,
+      "ai": 13,
+      "an": 14,
+      "ang": 15,
+      "ao": 16,
+      "aw": 17,
+      "ay": 18,
+      "b": 19,
+      "by": 20,
+      "c": 21,
+      "ch": 22,
+      "d": 23,
+      "dh": 24,
+      "dy": 25,
+      "e": 26,
+      "e:": 27,
+      "eh": 28,
+      "ei": 29,
+      "en": 30,
+      "eng": 31,
+      "er": 32,
+      "ey": 33,
+      "f": 34,
+      "g": 35,
+      "gy": 36,
+      "h": 37,
+      "hh": 38,
+      "hy": 39,
+      "i": 40,
+      "i0": 41,
+      "i:": 42,
+      "ia": 43,
+      "ian": 44,
+      "iang": 45,
+      "iao": 46,
+      "ie": 47,
+      "ih": 48,
+      "in": 49,
+      "ing": 50,
+      "iong": 51,
+      "ir": 52,
+      "iu": 53,
+      "iy": 54,
+      "j": 55,
+      "jh": 56,
+      "k": 57,
+      "ky": 58,
+      "l": 59,
+      "m": 60,
+      "my": 61,
+      "n": 62,
+      "ng": 63,
+      "ny": 64,
+      "o": 65,
+      "o:": 66,
+      "ong": 67,
+      "ou": 68,
+      "ow": 69,
+      "oy": 70,
+      "p": 71,
+      "py": 72,
+      "q": 73,
+      "r": 74,
+      "ry": 75,
+      "s": 76,
+      "sh": 77,
+      "t": 78,
+      "th": 79,
+      "ts": 80,
+      "ty": 81,
+      "u": 82,
+      "u:": 83,
+      "ua": 84,
+      "uai": 85,
+      "uan": 86,
+      "uang": 87,
+      "uh": 88,
+      "ui": 89,
+      "un": 90,
+      "uo": 91,
+      "uw": 92,
+      "v": 93,
+      "van": 94,
+      "ve": 95,
+      "vn": 96,
+      "w": 97,
+      "x": 98,
+      "y": 99,
+      "z": 100,
+      "zh": 101,
+      "zy": 102,
+      "…": 105
+    }
+  }
+}
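
For reference, a minimal sketch of how this file might be loaded with the Hugging Face tokenizers library. The file path, the underscore-delimited phoneme string, and the expected outputs are illustrative assumptions, not taken from this commit; the sketch also assumes the JSON above deserializes as-is.

    from tokenizers import Tokenizer

    # Load the tokenizer created by this commit (path is illustrative).
    tok = Tokenizer.from_file("tokenizer.json")

    # The normalizer replaces "[SEP]" and the start/end of the input with "_"
    # and collapses runs of "_"; the pre-tokenizer then splits on "_" with
    # Isolated behavior, so each phoneme and each separator becomes its own
    # token. "n_i:_h_ao" is a hypothetical input built from vocab entries.
    encoding = tok.encode("n_i:_h_ao")
    print(encoding.tokens)  # expected: ['_', 'n', '_', 'i:', '_', 'h', '_', 'ao', '_']
    print(encoding.ids)     # expected: [0, 62, 0, 42, 0, 37, 0, 16, 0]

Because the Split pre-tokenizer uses "Isolated" behavior, the "_" separator (id 0) is kept as a token between phonemes rather than discarded, which is a common way to encode pause/boundary markers for speech models.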