Upload utils.py with huggingface_hub
utils.py
ADDED
import logging

from transformers import AutoTokenizer, GemmaTokenizer

# Configure logging
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')


def get_tokenizer(model_id):
    """Load a tokenizer by Hub id, using GemmaTokenizer for Gemma checkpoints."""
    logging.debug(f"Loading tokenizer: {model_id}")
    try:
        if "gemma" in model_id.lower():
            tokenizer = GemmaTokenizer.from_pretrained(model_id)
        else:
            tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        logging.debug(f"Tokenizer loaded: {tokenizer}")
        return tokenizer
    except Exception as e:
        logging.error(f"Error loading tokenizer {model_id}: {e}")
        raise  # Re-raise with the original traceback intact


def get_tokenization(tokenizer, text):
    """Return the string tokens the tokenizer produces for the given text."""
    logging.debug(f"Tokenizing text: {text}")
    ids = tokenizer.encode(text)
    string_tokens = tokenizer.convert_ids_to_tokens(ids)
    logging.debug(f"Tokens: {string_tokens}")
    return string_tokens


def get_vocab_size(tokenizer):
    """Return the size of the tokenizer's vocabulary."""
    logging.debug(f"Getting vocabulary size for tokenizer: {tokenizer}")
    vocab_size = len(tokenizer.get_vocab())
    logging.debug(f"Vocabulary size: {vocab_size}")
    return vocab_size


def check_latin_support(tokenizer):
    """Check whether the tokenizer handles Latin text without producing unknown tokens."""
    logging.debug(f"Checking Latin support for tokenizer: {tokenizer}")
    try:
        test_text = "This is a test with latin characters 1234567890."
        tokens = tokenizer.tokenize(test_text)
        # If the tokenizer can tokenize the Latin text without returning any
        # unknown tokens, we consider it to support the Latin script.
        if all(token != tokenizer.unk_token for token in tokens):
            logging.debug("Latin support: ✅")
            return "✅"
        else:
            logging.debug("Latin support: ❌")
            return "❌"
    except Exception as e:
        logging.error(f"Error checking latin support: {e}")
        return "❌"
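For reference, a minimal usage sketch of these helpers. The checkpoint id below is purely illustrative; any Hugging Face Hub tokenizer id would work the same way.

from utils import get_tokenizer, get_tokenization, get_vocab_size, check_latin_support

# Illustrative model id; substitute any Hub checkpoint of interest.
tokenizer = get_tokenizer("google/gemma-2b")

print(get_tokenization(tokenizer, "Hello, world!"))  # list of string tokens
print(get_vocab_size(tokenizer))                     # vocabulary size as an int
print(check_latin_support(tokenizer))                # "✅" or "❌"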