Commit e9755d9 by im: add embeddings explanation and dimensionality reduction explanation
Parent(s): 4bb4754
Files changed:
- app.py (+282 -23)
- requirements.txt (+2 -1)
app.py
CHANGED
@@ -1,4 +1,5 @@
 import streamlit as st
+import numpy as np
 
 # TODO: move to 'utils'
 mystyle = '''
@@ -15,6 +16,11 @@ def divider():
     _, c, _ = st.columns(3)
     c.divider()
 
+@st.cache_data
+def get_embeddings(text):
+    return np.array(openai.Embedding.create(input=text, model=EMBEDDING_MODEL)["data"][0]["embedding"])
+
+
 st.title("Transformers: Tokenisers and Embeddings")
 
 preface_image, preface_text, = st.columns(2)
@@ -288,7 +294,7 @@ elif tokeniser_name == 'Unigram':
 according to their probabilities.
 """)
 
-    st.subheader("Try Yourself:")
+    st.subheader(":green[Try Yourself:]")
     st.write(f"""\
 *Using text area field below try to find or create a comprehensive vocabulary (training dataset) for Tokenisation, which can enhance the
 efficiency of the process. This approach helps to eliminate unknown tokens, thereby making the token sequence
@@ -358,7 +364,7 @@ elif tokeniser_name == 'WordPiece':
 it.
 """)
 
-    st.subheader("Try Yourself:")
+    st.subheader(":green[Try Yourself:]")
     st.write(f"""\
 *Using text area field below try to find or create a comprehensive vocabulary (training dataset) for Tokenisation, which can enhance the
 efficiency of the process. This approach helps to eliminate unknown tokens, thereby making the token sequence
@@ -472,11 +478,17 @@ st.write("""\
 characteristics using numbers, not words.
 """)
 
-
+st.write("""\
+Let's explore embeddings in more detail. We can take an experimental approach by encoding two specific
+words and examining the corresponding embedding vectors they generate. To make our exploration more accessible,
+we'll visualise a portion of these vectors, thereby unveiling the underlying structure of embeddings. Pay attention
+to common patterns and peaks, try to find two words that yield differing embeddings.
+""")
+col1, col2, col3 = st.columns(3)
+token_king = col1.text_input("Choose a word:", value="king")
+token_queen = col2.text_input("Choose a word:", value="queen")
+token_dots = col3.number_input("Number of dots:", value=50, min_value=0, max_value=1536)
 
-col1, col2 = st.columns(2)
-token_king = col1.text_input("Choose a word to compare embeddings:", value="king")
-token_queen = col2.text_input("Choose a word to compare embeddings:", value="queen")
 
 from torch import nn
 from transformers import AutoConfig
@@ -502,28 +514,61 @@ openai.api_key = st.secrets["OPENAI_API_KEY"]
 EMBEDDING_MODEL = 'text-embedding-ada-002'
 EMBEDDING_CTX_LENGTH = 8191
 EMBEDDING_ENCODING = 'cl100k_base'
-king =
-queen =
+king = get_embeddings(token_king)
+queen = get_embeddings(token_queen)
 
-
-
+
+df = pd.DataFrame({f'"{token_king}" embeddings': king_emb_np, f'"{token_queen}" embeddings': queen_emb_np})
+fig = px.line(df[:token_dots], title=f"Google's 'bert-base-uncased' model embeddings, embedding vector size: {len(queen_emb_np)}")
 fig.update_layout(legend=dict(orientation="h"))
 st.plotly_chart(fig, use_container_width=True)
 
+with st.expander("Python Code:"):
+    st.code(f"""\
+from torch import nn
+from transformers import AutoConfig
+
+model_ckpt = 'bert-base-uncased'
+tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
+king_id = tokenizer("{token_king}", return_tensors="pt", add_special_tokens=False)
+queen_id = tokenizer("{token_queen}", return_tensors="pt", add_special_tokens=False)
+
+config = AutoConfig.from_pretrained(model_ckpt)
+token_emb = nn.Embedding(config.vocab_size, config.hidden_size)
+king_embeddings = token_emb(king_id.input_ids)
+queen_embeddings = token_emb(queen_id.input_ids)
+""")
 
-df = pd.DataFrame({f'"{token_king}" embeddings': king
-fig = px.line(df, title="OpenAI's 'text-embedding-ada-002' model embeddings")
+df = pd.DataFrame({f'"{token_king}" embeddings': king, f'"{token_queen}" embeddings': queen})
+fig = px.line(df[:token_dots], title=f"OpenAI's 'text-embedding-ada-002' model embeddings, embedding vector size: {len(queen)}")
 fig.update_layout(legend=dict(orientation="h"))
 st.plotly_chart(fig, use_container_width=True)
 
 
-
+with st.expander("Python Code:"):
+    st.code(f"""\
+import openai
+
+EMBEDDING_MODEL = 'text-embedding-ada-002'
 
-
-
+king_embeddings = np.array(openai.Embedding.create(input="{token_king}", model=EMBEDDING_MODEL)["data"][0]["embedding"])
+queen_embeddings = np.array(openai.Embedding.create(input="{token_queen}", model=EMBEDDING_MODEL)["data"][0]["embedding"])
+""")
 
-
-
+st.write("""\
+The similarity can be represented as a similarity score. Identical words naturally have the highest
+score (black colours), while unrelated terms have lower scores (white colours). To compute this score,
+we construct a matrix infused with our embedding vectors. Each row in this matrix corresponds to a unique word in the
+sentence, while each column aligns with another word. The value at the intersection of row i and column j represents
+the score between word i and word j. For a clearer understanding, let's visualise this matrix using a heatmap. Each
+cell in the grid corresponds to a pair of words, and the colour of the cell indicates the similarity (correlation)
+score between those two words. The intensity of the colour directly corresponds to the magnitude of the score - the
+darker the hue, the higher the score.
+""")
+
+st.write("""Here is a heatmap of the score matrix for the sentence:""")
+sentence = st.text_input(label="*words to explore embeddings*", value="a the king queen space sit eat from on")
+sentence = sentence.split()
 
 input = {word: get_embeddings(word) for word in sentence}
 
@@ -534,24 +579,238 @@ for i, word_i in enumerate(sentence):
 
 fig = px.imshow(scores_matrix, x=sentence, y=sentence, color_continuous_scale="hot_r")
 fig.update_layout(coloraxis_showscale=False)
-fig.update_layout(width=6000
+fig.update_layout(width=6000)
 st.plotly_chart(fig, use_container_width=True)
 
+st.subheader(":green[Try Yourself:]")
+
 from langchain.embeddings.openai import OpenAIEmbeddings
 from langchain.vectorstores import FAISS
 from langchain.schema.document import Document
-db = FAISS.from_documents([Document(page_content="king"), Document(page_content="queen")], OpenAIEmbeddings(model=EMBEDDING_MODEL))
 
+@st.cache_resource
+def create_vector_database():
+    return FAISS.from_documents([Document(page_content="king"), Document(page_content="queen")], OpenAIEmbeddings(model=EMBEDDING_MODEL))
+db = create_vector_database()
+
+@st.cache_data
+def search_vector_database(term):
+    embedding_vector = OpenAIEmbeddings(model=EMBEDDING_MODEL).embed_query(term)
+    docs = db.similarity_search_by_vector(embedding_vector)
+    return docs
+
+st.write("""\
+*There is a vector database containing two words: 'king' and 'queen'. Your task is to pinpoint search
+terms that would yield either of these words. To facilitate this, use the previously presented similarity matrix to
+seek out words that give a higher correlation with the word in question. For instance, you might want to explore
+terms such as 'king', 'queen', 'dog', 'prince', 'man', 'minister', 'boy'.*
+""")
 embeddings_query = st.text_input(label="search term")
 if embeddings_query is not None and embeddings_query != '':
-
-    docs
-    st.write(docs[0].page_content)
+    docs = search_vector_database(embeddings_query)
+    st.warning(docs[0].page_content)
 
-st.
+with st.expander("Python Code:"):
+    st.code(f"""\
+from langchain.embeddings.openai import OpenAIEmbeddings
+from langchain.vectorstores import FAISS
+from langchain.schema.document import Document
+
+
+db = FAISS.from_documents([Document(page_content="king"), Document(page_content="queen")], OpenAIEmbeddings(model=EMBEDDING_MODEL))
+embedding_vector = OpenAIEmbeddings(model=EMBEDDING_MODEL).embed_query("{embeddings_query}")
+docs = db.similarity_search_by_vector(embedding_vector)
+""")
+
+divider()
+st.caption("Conclusion")
+st.write("""\
+As embedding algorithms are trained on a vast corpus of data, they inherently encapsulate a rich
+tapestry of information about our language and even the world at large. Therefore, they can be used for:
+
+- Search (where results are ranked by relevance to a query string)
+- Clustering (where text strings are grouped by similarity)
+- Recommendations (where items with related text strings are recommended)
+- Anomaly detection (where outliers with little relatedness are identified)
+- Diversity measurement (where similarity distributions are analyzed)
+- Classification (where text strings are classified by their most similar label)
+""")
 
 with st.expander("References:"):
     st.write("""\
 - https://huggingface.co/blog/getting-started-with-embeddings
 - https://huggingface.co/blog/1b-sentence-embeddings
+- https://platform.openai.com/docs/guides/embeddings/use-cases
+""")
+
+divider()
+st.header("Dimensionality Reduction (optional)")
+
+
+st.write("""\
+As was mentioned above, embedding vectors are learned in such a way that words with similar meanings
+are located close to each other in the space. However, this is an abstract concept that might be difficult to
+explore, understand and visualise in a 2D space because word embeddings typically have hundreds of dimensions. To
+solve this, we can use techniques like Principal Component Analysis (PCA) or t-SNE to reduce the dimensionality of
+the vectors and plot them.
+""")
+st.write("""But first, let's talk about the meaning of dimensionality reduction using simplified use-case:""")
+
+dimensionality_name = st.selectbox(label="Choose your example", options=["Simplified", "PCA", 't-SNE'])
+if dimensionality_name == 'Simplified':
+    _, col2, _ = st.columns(3)
+    col2.image("assets/img.png")
+    st.write("""\
+**Step 1: The context**\n
+We have a 3D object (your hand) and a light source that's casting a 2D shadow of your hand onto a
+wall. The shadow is a simpler, lower-dimensional representation of your hand.
+
+**Step 2: Identifying the dimensions**\n
+In this case, the dimensions are the different aspects of your hand that can be
+observed: the length of your fingers, the width of your palm, the height (or depth) of your hand, the scars,
+the colour of the skin, etc. However, we have a problem: we can't easily visualise or understand all these dimensions
+at once. Just as it's hard to imagine a 6-dimensional space.
+
+**Step 3: Deciding on important dimensions**\n
+Let's say you want to compare the number of fingers of different hands. In
+this case, you don't need to know about the depth of the hand, the width of the palm, or other details like freckles,
+scars, or skin colour. You just need a shadow that clearly shows the fingers. So, you decide to focus on the length
+of the fingers, which can be easily shown in the shadow.
+
+**Step 4: Reducing dimensions**\n
+This is where you actually perform dimensionality reduction. You orient your hand in such
+a way (giving the wall a high-five) that the shadow clearly shows the fingers. You've effectively reduced the
+dimensions from 3D to 2D. Your hand is still a 3D object, but its shadow — the simplified representation you're using
+for your comparison — is 2D.
+
+**Step 5: Interpretation**\n
+This hand and shadow example shows how dimensionality reduction simplifies a complex object (
+the 3D hand) into a lower-dimensional representation (the 2D shadow) that retains the most important information (the
+number of fingers) while discarding the less important details (like the depth of the hand, skin colour, etc.). It's
+a process of prioritisation and simplification that makes it easier for us to understand and analyse the data (or the
+hands, in this case).
+""")
+elif dimensionality_name == 'PCA':
+    st.write("""\
+**Step 1: Understanding PCA**\n
+PCA is a popular method for dimensionality reduction. It identifies the
+axes in the feature space along which the original data varies the most. These axes are known as the principal
+components, and they are orthogonal (perpendicular) to each other.
+
+**Step 2: Projecting the Data**\n
+Imagine that instead of just casting a shadow on the wall, you can cast your hand's
+shadow onto a number of walls arranged at different angles around your hand. Each shadow is a different projection of
+your hand. In PCA, these different walls represent different principal components, and the shadow on each wall is a
+projection of your hand onto that principal component.
+
+**Step 3: Choosing the Best Projection**\n
+Now, consider the shadow that most accurately portrays the number of fingers on
+your hand. This shadow corresponds to the principal component that captures the most variance in the data. In PCA,
+this would be the first principal component.
+
+**Step 4: Secondary Features**\n
+Next, consider the shadow that, while not as accurate as the first, still gives a
+reasonable representation of your hand, such as showing the width of your palm. This shadow represents the second
+principal component, which captures the second highest amount of variance in the data.
+
+**Step 5: Reduction of Dimensions**\n
+In the process of reducing dimensions, we select the top few principal components (
+shadows) that capture the most variance. The other dimensions (shadows) are discarded. So, instead of having to
+consider the complex 3D structure of your hand, you can simply look at one or two shadows that give you the most
+information about the hand.
+
+**Step 6: Transformation**\n
+Finally, we transform the original data into the reduced dimensional space defined by the
+selected principal components. This is analogous to replacing each hand with the selected shadows for further analysis.
+By using PCA, we can reduce the complexity of the data (from a 3D hand to a 2D or even 1D shadow), while still
+retaining the most important information (like the number of fingers or the width of the palm). This makes the data
+easier to visualize, understand, and work with.
+""")
+    embedding_dim = 1536
+    embeddings = st.text_input("words to explore:",
+                               value="king queen man woman prince prince princess counselor minister teacher")
+    embeddings = embeddings.split()
+    embeddings = {word: get_embeddings(word) for word in embeddings}
+
+    from sklearn.decomposition import PCA
+
+    pca = PCA(n_components=2)
+    embedding_matrix = np.array(list(embeddings.values()))
+    reduced_embeddings = pca.fit_transform(embedding_matrix)
+
+    df = pd.DataFrame(reduced_embeddings, columns=["X", "Y"])
+    df["Word"] = list(embeddings.keys())
+    fig = px.scatter(df, x="X", y="Y", text="Word", title="Word Embeddings", width=800, height=800)
+    st.plotly_chart(fig, use_container_width=True)
+
+    st.code(f"""\
+from sklearn.decomposition import PCA
+
+pca = PCA(n_components=2)
+embedding_matrix = np.array(list(embeddings.values()))
+reduced_embeddings = pca.fit_transform(embedding_matrix)
+""", language='python')
+
+elif dimensionality_name == 't-SNE':
+    st.write("""\
+**Step 1: Understanding t-SNE**\n
+t-SNE is a technique for dimensionality reduction that is particularly
+well-suited for the visualization of high-dimensional datasets. Unlike PCA, which is a linear technique,
+t-SNE is a non-linear technique, making it better at capturing complex polynomial relationships between variables.
+
+**Step 2: Measuring Similarities**\n
+Imagine that instead of just one hand, you have many hands casting shadows. Each hand
+is different - some hands might have longer fingers, some might have a wider palm, and so on. Each hand has its own
+"neighborhood" of similar hands. In t-SNE, these neighborhoods are represented mathematically by a probability
+distribution. Hands that are very similar to each other have a high probability of being "neighbors", while hands
+that are very different have a low probability.
+
+**Step 3: Creating a Map**\n
+t-SNE creates a map (or a projection) where hands that were close in the high-dimensional
+space (similar hands) are still close in the low-dimensional space (in their shadows), and hands that were far apart
+in the high-dimensional space (different hands) are still far apart in the low-dimensional space. This map is created
+in such a way that it minimizes the difference between the distances in the high-dimensional space and the distances
+in the low-dimensional space.
+
+**Step 4: Reducing Dimensions**\n
+The process of reducing dimensions in t-SNE involves optimizing the locations of each
+hand's shadow in the low-dimensional space such that the overall configuration of shadows best represents the
+similarities between the hands in the high-dimensional space.
+
+**Step 5: Interpretation**\n
+The result of t-SNE is a map where similar hands are located close together and dissimilar
+hands are located far apart. This makes it easier to visualize clusters or groups of similar hands.
+t-SNE, therefore, helps us to project high-dimensional data into a lower-dimensional space in a way that preserves
+the structure of the data as much as possible, making it easier to visualize and understand the relationships in the
+data.
+""")
+    embedding_dim = 1536
+    embeddings = st.text_input("words to explore:",
+                               value="king queen man woman prince prince princess counselor minister teacher")
+    embeddings = embeddings.split()
+    embeddings = {word: get_embeddings(word) for word in embeddings}
+
+    from sklearn.manifold import TSNE
+
+    tsne = TSNE(n_components=2, perplexity=2, random_state=0)
+    embedding_matrix = np.array(list(embeddings.values()))
+    reduced_embeddings = tsne.fit_transform(embedding_matrix)
+
+    df = pd.DataFrame(reduced_embeddings, columns=["X", "Y"])
+    df["Word"] = list(embeddings.keys())
+    fig = px.scatter(df, x="X", y="Y", text="Word", title="Word Embeddings", width=800, height=800)
+    st.plotly_chart(fig, use_container_width=True)
+
+    st.code(f"""\
+from sklearn.manifold import TSNE
+
+tsne = TSNE(n_components=2, perplexity=2, random_state=0)
+embedding_matrix = np.array(list(embeddings.values()))
+reduced_embeddings = tsne.fit_transform(embedding_matrix)
+""", language='python')
+
+with st.expander("References:"):
+    st.write("""\
+- https://hex.tech/blog/dimensionality-reduction/
+- https://github.com/openai/openai-cookbook/blob/main/examples/Visualizing_embeddings_in_2D.ipynb
 """)
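
Editor's note on the heatmap hunk (@@ -534,24 +579,238 @@): the plotted scores_matrix is filled in unchanged context lines that this diff does not show, so the following is only a sketch of one plausible way to build such a pairwise matrix. It assumes the cached get_embeddings helper added at the top of app.py, which returns a 1-D NumPy vector per word; the function name pairwise_similarity is ours, not the app's.

import numpy as np

def pairwise_similarity(words, get_embeddings):
    # Stack one embedding vector per word into an (n_words, dim) matrix.
    vectors = np.array([get_embeddings(w) for w in words])
    # Normalise each row; a single matrix product then gives all cosine
    # similarities, so entry (i, j) is the score between word i and word j.
    unit = vectors / np.linalg.norm(vectors, axis=1, keepdims=True)
    return unit @ unit.T

Passing such a matrix to px.imshow(scores_matrix, x=sentence, y=sentence, color_continuous_scale="hot_r"), as the hunk does, puts identical words on the dark diagonal, which matches the "darker the hue, the higher the score" description in the added prose.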
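
Design note on the caching added in this commit: get_embeddings is wrapped in @st.cache_data (its returned NumPy array is memoised per input), while the FAISS factory uses @st.cache_resource (the object itself is built once and reused across Streamlit reruns). A minimal illustration of that split, with stand-in names that are not from app.py:

import numpy as np
import streamlit as st

@st.cache_data          # memoises the returned data, keyed by the argument
def fake_embedding(word: str) -> np.ndarray:
    # Deterministic stand-in for the OpenAI embeddings call in app.py.
    seed = int.from_bytes(word.encode("utf-8"), "little") % (2**32)
    return np.random.default_rng(seed).normal(size=1536)

@st.cache_resource      # keeps one live object per process, not per value
def tiny_index() -> dict:
    # Built once and reused on every rerun, like the FAISS store in the commit.
    return {w: fake_embedding(w) for w in ("king", "queen")}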
requirements.txt
CHANGED
@@ -6,4 +6,5 @@ openai~=0.27.8
 plotly~=5.15.0
 langchain~=0.0.242
 faiss-cpu~=1.7.4
-tiktoken~=0.4.0
+tiktoken~=0.4.0
+scikit-learn~=1.3.0
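
The new scikit-learn~=1.3.0 pin backs the PCA and TSNE imports introduced in app.py above; a quick local sanity check under that assumption:

# Verifies that the two estimators used by the new dimensionality-reduction
# branches import and construct with the parameters the commit uses.
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

print(PCA(n_components=2))
print(TSNE(n_components=2, perplexity=2, random_state=0))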