Update README.md
README.md
```
# Requires transformers>=4.36.0

import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer

input_texts = [
    "what is the capital of China?",
    "how to implement quick sort in python?",
    "北京",
    "快排算法介绍"
]

model_name_or_path = 'Alibaba-NLP/gte-multilingual-base'
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model = AutoModel.from_pretrained(model_name_or_path, trust_remote_code=True)

# Tokenize the input texts
batch_dict = tokenizer(input_texts, max_length=8192, padding=True, truncation=True, return_tensors='pt')

outputs = model(**batch_dict)

dimension = 768  # the output embedding dimension, should be in [128, 768]
embeddings = outputs.last_hidden_state[:, 0][:, :dimension]
embeddings = F.normalize(embeddings, p=2, dim=1)
scores = (embeddings[:1] @ embeddings[1:].T) * 100
print(scores.tolist())

# [[0.3016996383666992, 0.7503870129585266, 0.3203084468841553]]
```
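The `dimension` variable controls how much of the 768-dimensional CLS embedding is kept. As an illustration that is not part of the original snippet, the sketch below assumes the `outputs` tensor from the block above is still in scope and recomputes the scores with the embedding truncated to 384 dimensions; the value 384 is an arbitrary example within the supported range.

```
# Hypothetical follow-up: truncate the CLS embedding to fewer dimensions, then renormalize
reduced_dim = 384  # example value, assumed to lie in [128, 768]
reduced = outputs.last_hidden_state[:, 0][:, :reduced_dim]
reduced = F.normalize(reduced, p=2, dim=1)
reduced_scores = (reduced[:1] @ reduced[1:].T) * 100
print(reduced_scores.tolist())
```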
### Use with sentence-transformers
```
# Requires sentence-transformers>=3.0.0

from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
import numpy as np

input_texts = [
    "what is the capital of China?",
    "how to implement quick sort in python?",
    "北京",
    "快排算法介绍"
]

model_name_or_path = "Alibaba-NLP/gte-multilingual-base"
model = SentenceTransformer(model_name_or_path, trust_remote_code=True)
embeddings = model.encode(input_texts)  # embeddings.shape (4, 768)

# normalize the embeddings
norms = np.linalg.norm(embeddings, ord=2, axis=1, keepdims=True)
norms[norms == 0] = 1
embeddings = embeddings / norms

# similarity scores
scores = (embeddings[:1] @ embeddings[1:].T)

print(scores.tolist())
# [[0.301699697971344, 0.7503870129585266, 0.32030850648880005]]
```
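The `cos_sim` helper imported above is not actually used in the snippet. As a sketch under the same assumptions (same `input_texts`, same model name), and not part of the original example, the manual numpy normalization can be replaced by sentence-transformers itself: `normalize_embeddings=True` makes `encode` return unit-length vectors, and `cos_sim` computes the cosine scores.

```
# Alternative sketch: let sentence-transformers normalize and score the embeddings
model = SentenceTransformer("Alibaba-NLP/gte-multilingual-base", trust_remote_code=True)
embeddings = model.encode(input_texts, normalize_embeddings=True)

# cosine similarity between the first text and the remaining three
scores = cos_sim(embeddings[:1], embeddings[1:])
print(scores.tolist())
```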
### Use with custom code to get dense embeddings and sparse token weights
```
# You can find the script gte_embedding.py in https://huggingface.co/Alibaba-NLP/gte-multilingual-base/blob/main/scripts/gte_embedding.py

from gte_embedding import GTEEmbeddidng

model_name_or_path = 'Alibaba-NLP/gte-multilingual-base'
model = GTEEmbeddidng(model_name_or_path)
query = "中国的首都在哪儿"

docs = [
    ...  # four candidate documents to score against the query
]

pairs = [(query, doc) for doc in docs]
dense_scores = model.compute_scores(pairs, dense_weight=1.0, sparse_weight=0.0)
sparse_scores = model.compute_scores(pairs, dense_weight=0.0, sparse_weight=1.0)
hybrid_scores = model.compute_scores(pairs, dense_weight=1.0, sparse_weight=0.3)

print('dense_scores', dense_scores)
print('sparse_scores', sparse_scores)
print('hybrid_scores', hybrid_scores)

# dense_scores [0.85302734375, 0.257568359375, 0.76953125, 0.325439453125]
# sparse_scores [0.0, 0.0, 4.600879669189453, 1.570279598236084]
# hybrid_scores [0.85302734375, 0.257568359375, 2.1497951507568356, 0.7965233325958252]
```
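Judging from the printed outputs, `compute_scores` combines the two signals as a weighted sum, `dense_weight * dense + sparse_weight * sparse`. The short check below is not part of the card; it only reproduces the hybrid numbers from the dense and sparse scores listed above.

```
# Sanity check: hybrid score = dense_weight * dense + sparse_weight * sparse
dense_scores = [0.85302734375, 0.257568359375, 0.76953125, 0.325439453125]
sparse_scores = [0.0, 0.0, 4.600879669189453, 1.570279598236084]

dense_weight, sparse_weight = 1.0, 0.3
hybrid = [dense_weight * d + sparse_weight * s for d, s in zip(dense_scores, sparse_scores)]
print(hybrid)
# [0.85302734375, 0.257568359375, 2.1497951507568356, 0.7965233325958252]
```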
## Evaluation

We validated the performance of the **gte-multilingual-base** model on multiple downstream tasks, including multilingual retrieval, cross-lingual retrieval, long text retrieval, and general text representation evaluation on the [MTEB Leaderboard](https://huggingface.co/spaces/mteb/leaderboard), among others.

### Retrieval Task



## Citation
```
@misc{zhang2024mgte,