duzx16 committed
Commit 53f0197 · Parent(s): eb55ff0

Fix bug

tokenization_chatglm.py CHANGED (+8 -0)
@@ -176,6 +176,8 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
             mask_token='[MASK]',
             gmask_token='[gMASK]',
             padding_side="left",
+            pad_token="<pad>",
+            unk_token="<unk>",
             num_image_tokens=20000,
             **kwargs
     ) -> None:
@@ -188,6 +190,8 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
             end_token=end_token,
             mask_token=mask_token,
             gmask_token=gmask_token,
+            pad_token=pad_token,
+            unk_token=unk_token,
             num_image_tokens=num_image_tokens,
             **kwargs
         )
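The first two hunks forward pad_token and unk_token to super().__init__(), so the PreTrainedTokenizer base class is actually told about "<pad>" and "<unk>" and padding-related properties such as pad_token_id can resolve. A minimal usage sketch of what this enables is below; the repo id THUDM/chatglm-6b and the sample sentences are assumptions for illustration, not part of the commit.

    # Minimal sketch (assumes transformers is installed; "THUDM/chatglm-6b" is used
    # here as a stand-in repo id for wherever this tokenizer file is hosted).
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)

    # With pad_token/unk_token forwarded to super().__init__(), the base class
    # knows about both special tokens.
    print(tokenizer.pad_token, tokenizer.pad_token_id)
    print(tokenizer.unk_token, tokenizer.unk_token_id)

    # Batch padding relies on the pad token being registered on the tokenizer.
    batch = tokenizer(["Hello", "Hello there, nice to meet you"], padding=True)
    print(len(batch["input_ids"][0]) == len(batch["input_ids"][1]))  # True after padding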
@@ -402,6 +406,10 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
             encoded_inputs["attention_mask"] = attention_mask
 
         if "position_ids" not in encoded_inputs:
+            if bos_token_id in required_input:
+                context_length = required_input.index(bos_token_id)
+            else:
+                context_length = seq_length
             position_ids = np.arange(seq_length, dtype=np.int64)
             mask_token = mask_token_id if mask_token_id in required_input else gmask_token_id
             if mask_token in required_input:
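The third hunk makes sure context_length is defined inside _pad before the position ids are built: it is taken as the index of the bos token when one is present in the padded input, and falls back to the full sequence length otherwise. Below is a small standalone sketch of that logic; the token ids and the helper name build_position_ids are placeholders, and the mask-position clamping at the end is an assumption about the lines that follow this hunk in the file rather than lines shown in the diff itself.

    import numpy as np

    # Placeholder special-token ids, not ChatGLM's real vocabulary ids.
    BOS_ID, MASK_ID, GMASK_ID = 900, 901, 902

    def build_position_ids(required_input: list) -> np.ndarray:
        """Sketch of the padding-time position-id logic from the hunk above."""
        seq_length = len(required_input)
        # The fix: derive context_length from the bos token if present,
        # otherwise fall back to the full sequence length.
        if BOS_ID in required_input:
            context_length = required_input.index(BOS_ID)
        else:
            context_length = seq_length
        position_ids = np.arange(seq_length, dtype=np.int64)
        mask_token = MASK_ID if MASK_ID in required_input else GMASK_ID
        if mask_token in required_input:
            # Assumption about the surrounding code (not shown in the hunk):
            # positions from context_length onward are clamped to the mask slot.
            mask_position = required_input.index(mask_token)
            position_ids[context_length:] = mask_position
        return position_ids

    # Toy prompt [tok, tok, gMASK, bos, tok]: positions after the bos token
    # are clamped to the gMASK slot (index 2), giving [0 1 2 2 2].
    print(build_position_ids([11, 12, GMASK_ID, BOS_ID, 13]))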