Update gptx_tokenizer.py
gptx_tokenizer.py  CHANGED  +4 -4
@@ -245,10 +245,9 @@ class HFGPTXTokenizer(PreTrainedTokenizer):
         """
         output = self.tok.decode(input=token_ids, num_threads=num_threads)
         if skip_special_tokens:
-
-
-
-
+            for substring in self.additional_special_tokens:
+                output = output.replace(substring, "")
+
         if clean_up_tokenization_spaces:
             warnings.warn(
                 "when cleaning up tokenization spaces, this will not behave "
@@ -259,6 +258,7 @@ class HFGPTXTokenizer(PreTrainedTokenizer):
 
         return output
 
+
     def _convert_id_to_token(self, index: int) -> str:
         """
         Convert a token ID to its corresponding token string.