Commit · faaf572
1 Parent(s): 2855749
Changes for fast tokenizer (#11)
Browse files
- Update tokens (b33367645f5904439db3926099068645c19d93dd)
- Add tokenizer.json (bbb0af831bf099d47d1744ab20fbb68bf4e37a30)
Co-authored-by: Jonatan Kłosko <[email protected]>
- added_tokens.json +0 -1
- special_tokens_map.json +1 -1
- tokenizer.json +0 -0
- tokenizer_config.json +1 -1
- vocab.json +1 -0
    	
        added_tokens.json
    CHANGED
    
    | @@ -17,7 +17,6 @@ | |
| 17 | 
             
              "<|da|>": 50285,
         | 
| 18 | 
             
              "<|de|>": 50261,
         | 
| 19 | 
             
              "<|el|>": 50281,
         | 
| 20 | 
            -
              "<|endoftext|>": 50257,
         | 
| 21 | 
             
              "<|en|>": 50259,
         | 
| 22 | 
             
              "<|es|>": 50262,
         | 
| 23 | 
             
              "<|et|>": 50307,
         | 
|  | |
| 17 | 
             
              "<|da|>": 50285,
         | 
| 18 | 
             
              "<|de|>": 50261,
         | 
| 19 | 
             
              "<|el|>": 50281,
         | 
|  | |
| 20 | 
             
              "<|en|>": 50259,
         | 
| 21 | 
             
              "<|es|>": 50262,
         | 
| 22 | 
             
              "<|et|>": 50307,
         | 
    	
        special_tokens_map.json
    CHANGED
    
    | @@ -124,7 +124,7 @@ | |
| 124 | 
             
              },
         | 
| 125 | 
             
              "pad_token": "<|endoftext|>",
         | 
| 126 | 
             
              "unk_token": {
         | 
| 127 | 
            -
                "content": "",
         | 
| 128 | 
             
                "lstrip": false,
         | 
| 129 | 
             
                "normalized": true,
         | 
| 130 | 
             
                "rstrip": false,
         | 
|  | |
| 124 | 
             
              },
         | 
| 125 | 
             
              "pad_token": "<|endoftext|>",
         | 
| 126 | 
             
              "unk_token": {
         | 
| 127 | 
            +
                "content": "<|endoftext|>",
         | 
| 128 | 
             
                "lstrip": false,
         | 
| 129 | 
             
                "normalized": true,
         | 
| 130 | 
             
                "rstrip": false,
         | 
    	
        tokenizer.json
    ADDED
    
    | The diff for this file is too large to render. See raw diff | 
|  | 
    	
        tokenizer_config.json
    CHANGED
    
    | @@ -27,7 +27,7 @@ | |
| 27 | 
             
              "tokenizer_class": "WhisperTokenizer",
         | 
| 28 | 
             
              "unk_token": {
         | 
| 29 | 
             
                "__type": "AddedToken",
         | 
| 30 | 
            -
                "content": "",
         | 
| 31 | 
             
                "lstrip": false,
         | 
| 32 | 
             
                "normalized": true,
         | 
| 33 | 
             
                "rstrip": false,
         | 
|  | |
| 27 | 
             
              "tokenizer_class": "WhisperTokenizer",
         | 
| 28 | 
             
              "unk_token": {
         | 
| 29 | 
             
                "__type": "AddedToken",
         | 
| 30 | 
            +
                "content": "<|endoftext|>",
         | 
| 31 | 
             
                "lstrip": false,
         | 
| 32 | 
             
                "normalized": true,
         | 
| 33 | 
             
                "rstrip": false,
         | 
    	
        vocab.json
    CHANGED
    
    | @@ -1,5 +1,6 @@ | |
| 1 | 
             
            {
         | 
| 2 | 
             
              "": 50256,
         | 
|  | |
| 3 | 
             
              "!": 0,
         | 
| 4 | 
             
              "!!": 1432,
         | 
| 5 | 
             
              "!!!": 4589,
         | 
|  | |
| 1 | 
             
            {
         | 
| 2 | 
             
              "": 50256,
         | 
| 3 | 
            +
              "<|endoftext|>": 50257,
         | 
| 4 | 
             
              "!": 0,
         | 
| 5 | 
             
              "!!": 1432,
         | 
| 6 | 
             
              "!!!": 4589,
         | 

 
		