Add more percentiles (90th, 95th, 99th) and BAAI/bge preset tokenizers (en, zh)
Browse files
app.py
CHANGED
|
@@ -12,6 +12,8 @@ from transformers import AutoTokenizer
|
|
| 12 |
|
| 13 |
tokenizers = {
|
| 14 |
"bert": "google-bert/bert-base-uncased",
|
|
|
|
|
|
|
| 15 |
"blenderbot": "facebook/blenderbot-3B",
|
| 16 |
"bloom": "bigscience/bloom-560m",
|
| 17 |
"bloomz": "bigscience/bloomz-7b1",
|
|
@@ -65,12 +67,12 @@ def count(model_id, dataset_id, config, split, column, add_special_tokens=True):
|
|
| 65 |
# not 100% accurate but good enough
|
| 66 |
words = re.findall(pattern, item[column])
|
| 67 |
wordcounter.append(len(words))
|
| 68 |
-
|
| 69 |
-
df = pd.DataFrame(tokencounter).describe().T
|
| 70 |
df.insert(0, "type", "tokens")
|
| 71 |
-
dfc = pd.DataFrame(charcounter).describe().T
|
| 72 |
dfc.insert(0, "type", "chars")
|
| 73 |
-
dfw = pd.DataFrame(wordcounter).describe().T
|
| 74 |
dfw.insert(0, "type", "words")
|
| 75 |
df.loc[-1] = dfw.values[0]
|
| 76 |
df.index = df.index + 1 # shifting index
|
|
@@ -105,6 +107,7 @@ demo = gr.Interface(
|
|
| 105 |
["tiiuae/falcon-7b", "imdb", "", "test", "text"],
|
| 106 |
["tiiuae/falcon-7b", "wikitext", "wikitext-2-v1", "validation", "text"],
|
| 107 |
["tiiuae/falcon-7b", "zeroshot/twitter-financial-news-sentiment", "", "validation", "text"],
|
|
|
|
| 108 |
],
|
| 109 |
cache_examples=True,
|
| 110 |
)
|
|
|
|
| 12 |
|
| 13 |
tokenizers = {
|
| 14 |
"bert": "google-bert/bert-base-uncased",
|
| 15 |
+
"bge-en": "BAAI/bge-base-en-v1.5",
|
| 16 |
+
"bge-zh": "BAAI/bge-base-zh-v1.5",
|
| 17 |
"blenderbot": "facebook/blenderbot-3B",
|
| 18 |
"bloom": "bigscience/bloom-560m",
|
| 19 |
"bloomz": "bigscience/bloomz-7b1",
|
|
|
|
| 67 |
# not 100% accurate but good enough
|
| 68 |
words = re.findall(pattern, item[column])
|
| 69 |
wordcounter.append(len(words))
|
| 70 |
+
percentiles = [0.25, 0.5, 0.75, 0.9, 0.95, 0.99]
|
| 71 |
+
df = pd.DataFrame(tokencounter).describe(percentiles=percentiles).T
|
| 72 |
df.insert(0, "type", "tokens")
|
| 73 |
+
dfc = pd.DataFrame(charcounter).describe(percentiles=percentiles).T
|
| 74 |
dfc.insert(0, "type", "chars")
|
| 75 |
+
dfw = pd.DataFrame(wordcounter).describe(percentiles=percentiles).T
|
| 76 |
dfw.insert(0, "type", "words")
|
| 77 |
df.loc[-1] = dfw.values[0]
|
| 78 |
df.index = df.index + 1 # shifting index
|
|
|
|
| 107 |
["tiiuae/falcon-7b", "imdb", "", "test", "text"],
|
| 108 |
["tiiuae/falcon-7b", "wikitext", "wikitext-2-v1", "validation", "text"],
|
| 109 |
["tiiuae/falcon-7b", "zeroshot/twitter-financial-news-sentiment", "", "validation", "text"],
|
| 110 |
+
["BAAI/bge-base-en-v1.5", "PolyAI/banking77", "", "test", "text"],
|
| 111 |
],
|
| 112 |
cache_examples=True,
|
| 113 |
)
|