Support streaming
Browse files - tokenization_qwen.py +9 -1
tokenization_qwen.py
CHANGED
|
@@ -153,6 +153,10 @@ class QWenTokenizer(PreTrainedTokenizer):
|
|
| 153 |
self.box_end_id = self.special_tokens[self.box_end_tag]
|
| 154 |
self.quad_start_id = self.special_tokens[self.quad_start_tag]
|
| 155 |
self.quad_end_id = self.special_tokens[self.quad_end_tag]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
|
| 157 |
enc = tiktoken.Encoding(
|
| 158 |
"Qwen",
|
|
@@ -354,7 +358,11 @@ class QWenTokenizer(PreTrainedTokenizer):
|
|
| 354 |
token_ids = _replace_closed_tag(token_ids, self.img_start_id, self.img_end_id, _decode_imgurl)
|
| 355 |
|
| 356 |
if skip_special_tokens:
|
| 357 |
-
token_ids = [i for i in token_ids if i < self.eod_id]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 358 |
return self.tokenizer.decode(token_ids, errors=errors or self.errors)
|
| 359 |
|
| 360 |
def to_list_format(self, text: str):
|
|
|
|
| 153 |
self.box_end_id = self.special_tokens[self.box_end_tag]
|
| 154 |
self.quad_start_id = self.special_tokens[self.quad_start_tag]
|
| 155 |
self.quad_end_id = self.special_tokens[self.quad_end_tag]
|
| 156 |
+
self.image_special_tokens = set([
|
| 157 |
+
self.ref_start_id, self.ref_end_id, self.box_start_id, self.box_end_id,
|
| 158 |
+
self.quad_start_id, self.quad_end_id,
|
| 159 |
+
])
|
| 160 |
|
| 161 |
enc = tiktoken.Encoding(
|
| 162 |
"Qwen",
|
|
|
|
| 358 |
token_ids = _replace_closed_tag(token_ids, self.img_start_id, self.img_end_id, _decode_imgurl)
|
| 359 |
|
| 360 |
if skip_special_tokens:
|
| 361 |
+
if kwargs.get('keep_image_special', False):
|
| 362 |
+
token_ids = [i for i in token_ids if i < self.eod_id
|
| 363 |
+
or i in self.image_special_tokens]
|
| 364 |
+
else:
|
| 365 |
+
token_ids = [i for i in token_ids if i < self.eod_id]
|
| 366 |
return self.tokenizer.decode(token_ids, errors=errors or self.errors)
|
| 367 |
|
| 368 |
def to_list_format(self, text: str):
|