Spaces:

adenshulga
/

arxiv-paper-classifier

Sleeping

+from dataclasses import dataclass, field
+from hydra.core.config_store import ConfigStore
+@dataclass
+class InferenceConfig:
+    """Configuration for inference: default values consumed when the classification pipeline is built."""
+    # Passed as the `tokenizer` argument of the pipeline by setup_pipeline.
+    model_name: str = "allenai/scibert_scivocab_uncased"
+    # Passed as the `model` argument of the pipeline by setup_pipeline.
+    checkpoint_path: str = "data/checkpoints/checkpoint-10"
+    # NOTE(review): presumably the cumulative-score cutoff for label selection - confirm against the caller.
+    top_percent: float = 0.95

config/pipeline_config.py CHANGED Viewed

@@ -16,13 +16,13 @@ class DatasetConfig:
 class CustomTrainingArguments:
     output_dir: str = "data/checkpoints"
     overwrite_output_dir: bool = True
-    num_train_epochs: float = 10
     learning_rate: float = 5e-5
     lr_scheduler_type: str = "cosine"
     # lr_scheduler_kwargs={},
     warmup_ratio: float = 0.03125
-    warmup_steps: int = 10
-    # per_device_train_batch_size: int = 32
     gradient_accumulation_steps: int = 1
     log_level: str = "error"
     # logging_dir="output_dir/runs/CURRENT_DATETIME_HOSTNAME"  # логи для tensorboard (default)
@@ -46,13 +46,21 @@ class CustomTrainingArguments:
     # resume_from_checkpoint: str = "last-checkpoint"
     auto_find_batch_size: bool = True
     report_to: str = "comet_ml"
 @dataclass
 class ModelConfig:
     """Configuration for model architecture and parameters"""
-    model_name: str = "bert-base-uncased"
 @dataclass

 class CustomTrainingArguments:
     output_dir: str = "data/checkpoints"
     overwrite_output_dir: bool = True
+    num_train_epochs: float = 3
     learning_rate: float = 5e-5
     lr_scheduler_type: str = "cosine"
     # lr_scheduler_kwargs={},
     warmup_ratio: float = 0.03125
+    warmup_steps: int = 1
+    # per_device_train_batch_size: int = 64
     gradient_accumulation_steps: int = 1
     log_level: str = "error"
     # logging_dir="output_dir/runs/CURRENT_DATETIME_HOSTNAME"  # логи для tensorboard (default)
     # resume_from_checkpoint: str = "last-checkpoint"
     auto_find_batch_size: bool = True
     report_to: str = "comet_ml"
+    metric_for_best_model: str = "f1"
+    greater_is_better: bool = True
 @dataclass
 class ModelConfig:
     """Configuration for model architecture and parameters"""
+    model_name: str = "allenai/scibert_scivocab_uncased"
+    # model_name: tp.Literal[
+    #     "FacebookAI/roberta-base",
+    #     "distilbert-base-uncased",
+    #     "allenai/scibert_scivocab_uncased",
+    # ] = "allenai/scibert_scivocab_uncased"
 @dataclass

container_setup/Dockerfile CHANGED Viewed

@@ -42,12 +42,10 @@ WORKDIR /app
 COPY --chown=appuser:appgroup . /app
-# 1) Create a dedicated venv in .venv
 RUN uv venv .venv
-# 2) Install / sync packages into that .venv
 RUN uv sync
-EXPOSE 8000
-CMD ["uv", "run", "python", "entrypoints/app.py"]

 COPY --chown=appuser:appgroup . /app
 RUN uv venv .venv
 RUN uv sync
+EXPOSE 9000
+# CMD ["uv", "run", "python", "entrypoints/app.py"]

container_setup/build.sh CHANGED Viewed

@@ -2,7 +2,7 @@
 source container_setup/credentials
-docker build --no-cache -f container_setup/Dockerfile -t ${DOCKER_NAME} . \
         --build-arg DOCKER_NAME=${DOCKER_NAME} \
         --build-arg USER_ID=${DOCKER_USER_ID} \
         --build-arg GROUP_ID=${DOCKER_GROUP_ID}

 source container_setup/credentials
+docker build -f container_setup/Dockerfile -t ${DOCKER_NAME} . \
         --build-arg DOCKER_NAME=${DOCKER_NAME} \
         --build-arg USER_ID=${DOCKER_USER_ID} \
         --build-arg GROUP_ID=${DOCKER_GROUP_ID}

container_setup/credentials CHANGED Viewed

@@ -5,5 +5,5 @@ CONTAINER_NAME=$USER"-arxiv-papers-classification"
 SRC="." # folder to mount into the docker container
 DOCKER_USER_ID=$(id -u) # to see these values, type "id" in a shell terminal
 DOCKER_GROUP_ID=$(id -g)
-CONTAINER_PORT=8001 # used in launch_container file
-INNER_PORT=8001

 SRC="." # folder to mount into the docker container
 DOCKER_USER_ID=$(id -u) # to see these values, type "id" in a shell terminal
 DOCKER_GROUP_ID=$(id -g)
+CONTAINER_PORT=9001 # used in launch_container file
+INNER_PORT=9001

container_setup/launch_container.sh CHANGED Viewed

@@ -11,6 +11,7 @@ docker run \
     --rm \
     -it \
     --init \
     -v ${SRC}:/app \
     -p ${INNER_PORT}:${CONTAINER_PORT} \
     ${DOCKER_NAME} \

     --rm \
     -it \
     --init \
+	--gpus '"device=0,1,2"' \
     -v ${SRC}:/app \
     -p ${INNER_PORT}:${CONTAINER_PORT} \
     ${DOCKER_NAME} \

entrypoints/app.py CHANGED Viewed

@@ -1,60 +1,27 @@
-# import streamlit as st
-# st.title("This is a title")
-# st.header("This is a header")
-# st.subheader("This is a subheader")
-# st.text("This is a text")
-# st.markdown("# This is a markdown header 1")
-# st.markdown("## This is a markdown header 2")
-# st.markdown("### This is a markdown header 3")
-# st.markdown("This is a markdown: *bold* **italic** `inline code` ~strikethrough~")
-# st.markdown("""This is a code block with syntax highlighting
-# ```python
-# print("Hello world!")
-# ```
-# """)
-# st.html(
-#     "image from url example with html: "
-#     "<img src='https://www.wallpaperflare.com/static/450/825/286/kitten-cute-animals-grass-5k-wallpaper.jpg' width=400px>",
-# )
-# st.write("Text with write")
-# st.write(range(10))
-# st.success("Success")
-# st.info("Information")
-# st.warning("Warning")
-# st.error("Error")
-# exp = ZeroDivisionError("Trying to divide by Zero")
-# st.exception(exp)
-# # инициализируем переменные
-# st.session_state.key1 = "value1"  # Attribute API
-# st.session_state["key2"] = "value2"  # Dictionary like API
-# # посмотреть что в st.session_state
-# st.write(st.session_state)
-# # magic
-# st.session_state
-# # ошибка если неправильный ключ
-# # st.write(st.session_state["missing_key"])
-# import streamlit as st
-# from transformers import pipeline
-# @st.cache_resource  # кэширование
-# def load_model():
-#     return pipeline("sentiment-analysis")  # скачивание модели
-# model = load_model()
-# query = st.text_input("Your query", value="I love Streamlit! 🎈")
-# if query:
-#     result = model(query)[0]  # классифицируем
-#     st.write(query)
-#     st.write(result)

+import streamlit as st
+from src.app.setup_model import setup_pipeline, get_top_label_names, LabelScore
+from src.app.tags_mapping import tags2full_name
+from src.app.visualization import visualize_predicted_categories
+from config.inference_config import InferenceConfig
+from src.app.data_validation import validate_data
+st.title("arXiv Paper Classifier")
+st.markdown("Enter paper details to predict arXiv categories")
+st.text_input("Enter paper name", key="paper_name")
+st.text_area("Enter paper abstract", key="paper_abstract", height=250)
+if st.button("Predict Categories", type="primary"):
+    validate_data(st.session_state["paper_name"], st.session_state["paper_abstract"])
+    with st.spinner("Analyzing paper..."):
+        pipeline = setup_pipeline(InferenceConfig())
+        scores: list[LabelScore] = pipeline(
+            st.session_state["paper_name"] + " " + st.session_state["paper_abstract"],
+            output_scores=True,
+        )  # type: ignore
+        top_labels = get_top_label_names(scores, tags2full_name, 0.95)
+    visualize_predicted_categories(top_labels, scores, tags2full_name)
+else:
+    st.info("Enter paper details and click 'Predict Categories' to get predictions.")

pyproject.toml CHANGED Viewed

@@ -17,6 +17,7 @@ dependencies = [
     "python-dotenv>=1.1.0",
     "scikit-learn>=1.6.1",
     "streamlit>=1.44.1",
     "torch>=2.6.0",
     "transformers>=4.50.3",
 ]

     "python-dotenv>=1.1.0",
     "scikit-learn>=1.6.1",
     "streamlit>=1.44.1",
+    "tiktoken>=0.9.0",
     "torch>=2.6.0",
     "transformers>=4.50.3",
 ]

scripts/pipeline.sh CHANGED Viewed

@@ -4,4 +4,4 @@ export PYTHONPATH='.'
 source .venv/bin/activate
-python entrypoints/pipeline.py


4
5	source .venv/bin/activate
6
7	+ CUDA_VISIBLE_DEVICES=0 uv run entrypoints/pipeline.py

src/app/data_validation.py ADDED Viewed

	@@ -0,0 +1,12 @@

+import streamlit as st
+def validate_data(paper_name: str, paper_abstract: str) -> None:
+    if paper_name == "" or paper_abstract == "":
+        st.error("Paper name or abstract are required")
+        return
+    if paper_abstract == "":
+        st.warning(
+            "Without abstract, the performance of the model will be significantly worse"
+        )
+        return

src/app/setup_model.py ADDED Viewed

	@@ -0,0 +1,49 @@

+from transformers import pipeline, Pipeline
+import typing as tp
+from config.inference_config import InferenceConfig
+import streamlit as st
+from src.app.tags_mapping import tags2full_name
+class LabelScore(tp.TypedDict):
+    """A single prediction entry: a label string paired with its score."""
+    label: str
+    score: float
+@st.cache_resource
+def setup_pipeline(cfg: InferenceConfig) -> Pipeline:
+    model = pipeline(
+        "text-classification", model=cfg.checkpoint_path, tokenizer=cfg.model_name
+    )
+    return model
+def get_top_labels(scores: list[LabelScore], top_percent: float) -> list[LabelScore]:
+    top_scores = sorted(scores, key=lambda x: x["score"], reverse=True)
+    cumulative_score = 0
+    selected_labels: list[LabelScore] = []
+    for score in top_scores:
+        cumulative_score += score["score"]
+        selected_labels.append(score)
+        if cumulative_score >= top_percent:
+            break
+    return selected_labels
+def get_full_names(
+    labels: list[LabelScore], label2name: dict[str, str]
+) -> list[LabelScore]:
+    return [
+        LabelScore(label=label2name[label["label"]], score=label["score"])
+        if label["label"] in label2name
+        else LabelScore(label=label["label"], score=label["score"])
+        for label in labels
+    ]
+def get_top_label_names(
+    scores: list[LabelScore], label2name: dict[str, str], top_percent: float
+) -> list[LabelScore]:
+    top_labels = get_top_labels(scores, top_percent)
+    return get_full_names(top_labels, label2name)

src/app/tags_mapping.py ADDED Viewed

	@@ -0,0 +1,157 @@

+# Maps a category tag (e.g. "cs.AI") to its full display name.
+# Several tags share one name (e.g. "cs.LG" and "stat.ML" are both
+# "Machine Learning"), so the mapping cannot be inverted unambiguously.
+tags2full_name = {
+    "cs.AI": "Artificial Intelligence",
+    "cs.AR": "Hardware Architecture",
+    "cs.CC": "Computational Complexity",
+    "cs.CE": "Computational Engineering, Finance, and Science",
+    "cs.CG": "Computational Geometry",
+    "cs.CL": "Computation and Language",
+    "cs.CR": "Cryptography and Security",
+    "cs.CV": "Computer Vision and Pattern Recognition",
+    "cs.CY": "Computers and Society",
+    "cs.DB": "Databases",
+    "cs.DC": "Distributed, Parallel, and Cluster Computing",
+    "cs.DL": "Digital Libraries",
+    "cs.DM": "Discrete Mathematics",
+    "cs.DS": "Data Structures and Algorithms",
+    "cs.ET": "Emerging Technologies",
+    "cs.FL": "Formal Languages and Automata Theory",
+    "cs.GL": "General Literature",
+    "cs.GR": "Graphics",
+    "cs.GT": "Computer Science and Game Theory",
+    "cs.HC": "Human-Computer Interaction",
+    "cs.IR": "Information Retrieval",
+    "cs.IT": "Information Theory",
+    "cs.LG": "Machine Learning",
+    "cs.LO": "Logic in Computer Science",
+    "cs.MA": "Multiagent Systems",
+    "cs.MM": "Multimedia",
+    "cs.MS": "Mathematical Software",
+    "cs.NA": "Numerical Analysis",
+    "cs.NE": "Neural and Evolutionary Computing",
+    "cs.NI": "Networking and Internet Architecture",
+    "cs.OH": "Other Computer Science",
+    "cs.OS": "Operating Systems",
+    "cs.PF": "Performance",
+    "cs.PL": "Programming Languages",
+    "cs.RO": "Robotics",
+    "cs.SC": "Symbolic Computation",
+    "cs.SD": "Sound",
+    "cs.SE": "Software Engineering",
+    "cs.SI": "Social and Information Networks",
+    "cs.SY": "Systems and Control",
+    "econ.EM": "Econometrics",
+    "econ.GN": "General Economics",
+    "econ.TH": "Theoretical Economics",
+    "eess.AS": "Audio and Speech Processing",
+    "eess.IV": "Image and Video Processing",
+    "eess.SP": "Signal Processing",
+    "eess.SY": "Systems and Control",
+    "math.AC": "Commutative Algebra",
+    "math.AG": "Algebraic Geometry",
+    "math.AP": "Analysis of PDEs",
+    "math.AT": "Algebraic Topology",
+    "math.CA": "Classical Analysis and ODEs",
+    "math.CO": "Combinatorics",
+    "math.CT": "Category Theory",
+    "math.CV": "Complex Variables",
+    "math.DG": "Differential Geometry",
+    "math.DS": "Dynamical Systems",
+    "math.FA": "Functional Analysis",
+    "math.GM": "General Mathematics",
+    "math.GN": "General Topology",
+    "math.GR": "Group Theory",
+    "math.GT": "Geometric Topology",
+    "math.HO": "History and Overview",
+    "math.IT": "Information Theory",
+    "math.KT": "K-Theory and Homology",
+    "math.LO": "Logic",
+    "math.MG": "Metric Geometry",
+    "math.MP": "Mathematical Physics",
+    "math.NA": "Numerical Analysis",
+    "math.NT": "Number Theory",
+    "math.OA": "Operator Algebras",
+    "math.OC": "Optimization and Control",
+    "math.PR": "Probability",
+    "math.QA": "Quantum Algebra",
+    "math.RA": "Rings and Algebras",
+    "math.RT": "Representation Theory",
+    "math.SG": "Symplectic Geometry",
+    "math.SP": "Spectral Theory",
+    "math.ST": "Statistics Theory",
+    "astro-ph.CO": "Cosmology and Nongalactic Astrophysics",
+    "astro-ph.EP": "Earth and Planetary Astrophysics",
+    "astro-ph.GA": "Astrophysics of Galaxies",
+    "astro-ph.HE": "High Energy Astrophysical Phenomena",
+    "astro-ph.IM": "Instrumentation and Methods for Astrophysics",
+    "astro-ph.SR": "Solar and Stellar Astrophysics",
+    "cond-mat.dis-nn": "Disordered Systems and Neural Networks",
+    "cond-mat.mes-hall": "Mesoscale and Nanoscale Physics",
+    "cond-mat.mtrl-sci": "Materials Science",
+    "cond-mat.other": "Other Condensed Matter",
+    "cond-mat.quant-gas": "Quantum Gases",
+    "cond-mat.soft": "Soft Condensed Matter",
+    "cond-mat.stat-mech": "Statistical Mechanics",
+    "cond-mat.str-el": "Strongly Correlated Electrons",
+    "cond-mat.supr-con": "Superconductivity",
+    "gr-qc": "General Relativity and Quantum Cosmology",
+    "hep-ex": "High Energy Physics - Experiment",
+    "hep-lat": "High Energy Physics - Lattice",
+    "hep-ph": "High Energy Physics - Phenomenology",
+    "hep-th": "High Energy Physics - Theory",
+    "math-ph": "Mathematical Physics",
+    "nlin.AO": "Adaptation and Self-Organizing Systems",
+    "nlin.CD": "Chaotic Dynamics",
+    "nlin.CG": "Cellular Automata and Lattice Gases",
+    "nlin.PS": "Pattern Formation and Solitons",
+    "nlin.SI": "Exactly Solvable and Integrable Systems",
+    "nucl-ex": "Nuclear Experiment",
+    "nucl-th": "Nuclear Theory",
+    "physics.acc-ph": "Accelerator Physics",
+    "physics.ao-ph": "Atmospheric and Oceanic Physics",
+    "physics.app-ph": "Applied Physics",
+    "physics.atm-clus": "Atomic and Molecular Clusters",
+    "physics.atom-ph": "Atomic Physics",
+    "physics.bio-ph": "Biological Physics",
+    "physics.chem-ph": "Chemical Physics",
+    "physics.class-ph": "Classical Physics",
+    "physics.comp-ph": "Computational Physics",
+    "physics.data-an": "Data Analysis, Statistics and Probability",
+    "physics.ed-ph": "Physics Education",
+    "physics.flu-dyn": "Fluid Dynamics",
+    "physics.gen-ph": "General Physics",
+    "physics.geo-ph": "Geophysics",
+    "physics.hist-ph": "History and Philosophy of Physics",
+    "physics.ins-det": "Instrumentation and Detectors",
+    "physics.med-ph": "Medical Physics",
+    "physics.optics": "Optics",
+    "physics.plasm-ph": "Plasma Physics",
+    "physics.pop-ph": "Popular Physics",
+    "physics.soc-ph": "Physics and Society",
+    "physics.space-ph": "Space Physics",
+    "quant-ph": "Quantum Physics",
+    "q-bio.BM": "Biomolecules",
+    "q-bio.CB": "Cell Behavior",
+    "q-bio.GN": "Genomics",
+    "q-bio.MN": "Molecular Networks",
+    "q-bio.NC": "Neurons and Cognition",
+    "q-bio.OT": "Other Quantitative Biology",
+    "q-bio.PE": "Populations and Evolution",
+    "q-bio.QM": "Quantitative Methods",
+    "q-bio.SC": "Subcellular Processes",
+    "q-bio.TO": "Tissues and Organs",
+    "q-fin.CP": "Computational Finance",
+    "q-fin.EC": "Economics",
+    "q-fin.GN": "General Finance",
+    "q-fin.MF": "Mathematical Finance",
+    "q-fin.PM": "Portfolio Management",
+    "q-fin.PR": "Pricing of Securities",
+    "q-fin.RM": "Risk Management",
+    "q-fin.ST": "Statistical Finance",
+    "q-fin.TR": "Trading and Market Microstructure",
+    "stat.AP": "Applications",
+    "stat.CO": "Computation",
+    "stat.ME": "Methodology",
+    "stat.ML": "Machine Learning",
+    "stat.OT": "Other Statistics",
+    "stat.TH": "Statistics Theory",
+}

src/app/visualization.py ADDED Viewed

	@@ -0,0 +1,57 @@

+import streamlit as st
+from src.app.setup_model import LabelScore
+from typing import Dict
+def visualize_predicted_categories(
+    top_labels: list[LabelScore],
+    scores: list[LabelScore],
+    label_to_name_mapping: Dict[str, str],
+):
+    """
+    Visualize the predicted categories in a streamlit app
+    Args:
+        top_labels: List of top labels to display
+        scores: All scores from the model
+        label_to_name_mapping: Mapping from label codes to full names
+    """
+    st.subheader("Predicted Categories")
+    for i, label in enumerate(top_labels):
+        score = next((s["score"] for s in scores if s["label"] == label["label"]), 0)
+        # Color gradient based on confidence
+        color_intensity = min(int(score * 255), 255)
+        with st.container(border=True):
+            cols = st.columns([3, 1])
+            with cols[0]:
+                # Access full_name from the mapping if available
+                full_name = label_to_name_mapping.get(label["label"], label["label"])
+                st.markdown(f"**{full_name}**")
+                st.caption(f"Tag: {label['label']}")
+            with cols[1]:
+                st.markdown(
+                    f"<h3 style='text-align: right; color: rgb(0, {color_intensity}, {255 - color_intensity});'>{score:.2f}</h3>",
+                    unsafe_allow_html=True,
+                )
+    display_all_scores(scores, label_to_name_mapping)
+def display_all_scores(scores: list[LabelScore], label_to_name_mapping: Dict[str, str]):
+    """
+    Display all scores in an expandable section
+    Args:
+        scores: All scores from the model
+        label_to_name_mapping: Mapping from label codes to full names
+    """
+    with st.expander("View all category scores"):
+        sorted_scores = sorted(scores, key=lambda x: x["score"], reverse=True)
+        for score_item in sorted_scores[:20]:  # Show top 20
+            label_name = label_to_name_mapping.get(
+                score_item["label"], score_item["label"]
+            )
+            st.text(f"{label_name} ({score_item['label']}): {score_item['score']:.4f}")

src/pipeline/arxiv_dataset.py CHANGED Viewed

@@ -22,7 +22,7 @@ class ArxivPaper(tp.TypedDict):
 def load_arxiv_dataset() -> Dataset:
-    df = pd.read_json("data/arxivData.json").head(100)
     dataset = Dataset.from_pandas(df[["summary", "tag", "title"]])
     return dataset
@@ -51,6 +51,7 @@ def generate_preprocessing_function(
             text,
             truncation=True,
             padding="max_length",
         )
         tags_list: list[ArxivTag] = ast.literal_eval(row["tag"])

 def load_arxiv_dataset() -> Dataset:
+    df = pd.read_json("data/arxivData.json")
     dataset = Dataset.from_pandas(df[["summary", "tag", "title"]])
     return dataset
             text,
             truncation=True,
             padding="max_length",
+            max_length=512,
         )
         tags_list: list[ArxivTag] = ast.literal_eval(row["tag"])

src/pipeline/env_setup.py CHANGED Viewed

@@ -1,5 +1,6 @@
 from dotenv import load_dotenv
 import os
 REQUIRED_ENV_VARS = ["COMET_API_KEY", "COMET_MODE"]

 from dotenv import load_dotenv
 import os
+import torch
 REQUIRED_ENV_VARS = ["COMET_API_KEY", "COMET_MODE"]

src/pipeline/metrics.py CHANGED Viewed

@@ -1,7 +1,11 @@
-import evaluate
 import numpy as np
-clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])
 def sigmoid(x):
@@ -10,8 +14,27 @@ def sigmoid(x):
 def compute_metrics(eval_pred):
     predictions, labels = eval_pred
-    predictions = sigmoid(predictions)
-    predictions = (predictions > 0.5).astype(int).reshape(-1)
-    return clf_metrics.compute(
-        predictions=predictions, references=labels.astype(int).reshape(-1)
     )

 import numpy as np
+from sklearn.metrics import (
+    accuracy_score,
+    f1_score,
+    precision_score,
+    recall_score,
+    roc_auc_score,
+)
 def sigmoid(x):
 def compute_metrics(eval_pred):
     predictions, labels = eval_pred
+    # Handle T5 model output which can be a tuple
+    if isinstance(predictions, tuple):
+        predictions = predictions[0]
+    prediction_scores = sigmoid(predictions)
+    predictions = (prediction_scores > 0.5).astype(int)
+    # Multi-label metrics
+    accuracy = accuracy_score(labels, predictions)
+    roc_auc = roc_auc_score(labels, prediction_scores)
+    f1 = f1_score(labels, predictions, average="weighted", zero_division=0)
+    precision = precision_score(
+        labels, predictions, average="weighted", zero_division=0
     )
+    recall = recall_score(labels, predictions, average="weighted", zero_division=0)
+    return {
+        "accuracy": accuracy,
+        "f1": f1,
+        "precision": precision,
+        "recall": recall,
+        "roc_auc": roc_auc,
+    }

uv.lock CHANGED Viewed

@@ -122,6 +122,7 @@ dependencies = [
     { name = "python-dotenv" },
     { name = "scikit-learn" },
     { name = "streamlit" },
     { name = "torch" },
     { name = "transformers" },
 ]
@@ -140,6 +141,7 @@ requires-dist = [
     { name = "python-dotenv", specifier = ">=1.1.0" },
     { name = "scikit-learn", specifier = ">=1.6.1" },
     { name = "streamlit", specifier = ">=1.44.1" },
     { name = "torch", specifier = ">=2.6.0" },
     { name = "transformers", specifier = ">=4.50.3" },
 ]
@@ -1717,6 +1719,24 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/32/d5/f9a850d79b0851d1d4ef6456097579a9005b31fea68726a4ae5f2d82ddd9/threadpoolctl-3.6.0-py3-none-any.whl", hash = "sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb", size = 18638 },
 ]
 [[package]]
 name = "tokenizers"
 version = "0.21.1"

     { name = "python-dotenv" },
     { name = "scikit-learn" },
     { name = "streamlit" },
+    { name = "tiktoken" },
     { name = "torch" },
     { name = "transformers" },
 ]
     { name = "python-dotenv", specifier = ">=1.1.0" },
     { name = "scikit-learn", specifier = ">=1.6.1" },
     { name = "streamlit", specifier = ">=1.44.1" },
+    { name = "tiktoken", specifier = ">=0.9.0" },
     { name = "torch", specifier = ">=2.6.0" },
     { name = "transformers", specifier = ">=4.50.3" },
 ]
     { url = "https://files.pythonhosted.org/packages/32/d5/f9a850d79b0851d1d4ef6456097579a9005b31fea68726a4ae5f2d82ddd9/threadpoolctl-3.6.0-py3-none-any.whl", hash = "sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb", size = 18638 },
 ]
+[[package]]
+name = "tiktoken"
+version = "0.9.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "regex" },
+    { name = "requests" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/ea/cf/756fedf6981e82897f2d570dd25fa597eb3f4459068ae0572d7e888cfd6f/tiktoken-0.9.0.tar.gz", hash = "sha256:d02a5ca6a938e0490e1ff957bc48c8b078c88cb83977be1625b1fd8aac792c5d", size = 35991 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/7a/11/09d936d37f49f4f494ffe660af44acd2d99eb2429d60a57c71318af214e0/tiktoken-0.9.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:2b0e8e05a26eda1249e824156d537015480af7ae222ccb798e5234ae0285dbdb", size = 1064919 },
+    { url = "https://files.pythonhosted.org/packages/80/0e/f38ba35713edb8d4197ae602e80837d574244ced7fb1b6070b31c29816e0/tiktoken-0.9.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:27d457f096f87685195eea0165a1807fae87b97b2161fe8c9b1df5bd74ca6f63", size = 1007877 },
+    { url = "https://files.pythonhosted.org/packages/fe/82/9197f77421e2a01373e27a79dd36efdd99e6b4115746ecc553318ecafbf0/tiktoken-0.9.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2cf8ded49cddf825390e36dd1ad35cd49589e8161fdcb52aa25f0583e90a3e01", size = 1140095 },
+    { url = "https://files.pythonhosted.org/packages/f2/bb/4513da71cac187383541facd0291c4572b03ec23c561de5811781bbd988f/tiktoken-0.9.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc156cb314119a8bb9748257a2eaebd5cc0753b6cb491d26694ed42fc7cb3139", size = 1195649 },
+    { url = "https://files.pythonhosted.org/packages/fa/5c/74e4c137530dd8504e97e3a41729b1103a4ac29036cbfd3250b11fd29451/tiktoken-0.9.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:cd69372e8c9dd761f0ab873112aba55a0e3e506332dd9f7522ca466e817b1b7a", size = 1258465 },
+    { url = "https://files.pythonhosted.org/packages/de/a8/8f499c179ec900783ffe133e9aab10044481679bb9aad78436d239eee716/tiktoken-0.9.0-cp313-cp313-win_amd64.whl", hash = "sha256:5ea0edb6f83dc56d794723286215918c1cde03712cbbafa0348b33448faf5b95", size = 894669 },
+]
 [[package]]
 name = "tokenizers"
 version = "0.21.1"