Spaces:

adenshulga
/

arxiv-paper-classifier

Sleeping

App Files Files Community

adenshulga committed on Apr 7

Commit

2073e38

1 Parent(s): 92d5847

index on master: 92d5847 working training code

Browse files

Files changed (21) hide show

.dockerignore +4 -0
.gitattributes +1 -0
.streamlit/config.toml +2 -0
Dockerfile +43 -0
README.md +65 -7
config/inference_config.py +6 -4
container_setup/Dockerfile +8 -6
container_setup/credentials +2 -2
data/checkpoints/checkpoint-12300/config.json +351 -0
data/checkpoints/checkpoint-12300/model.safetensors +3 -0
data/checkpoints/checkpoint-12300/special_tokens_map.json +7 -0
data/checkpoints/checkpoint-12300/tokenizer.json +0 -0
data/checkpoints/checkpoint-12300/tokenizer_config.json +58 -0
data/checkpoints/checkpoint-12300/trainer_state.json +0 -0
data/checkpoints/checkpoint-12300/training_args.bin +0 -0
data/checkpoints/checkpoint-12300/vocab.txt +0 -0
entrypoints/app.py +13 -9
scripts/launch_app.sh +7 -0
src/app/data_validation.py +5 -4
src/app/setup_model.py +3 -3
src/app/visualization.py +4 -1

.dockerignore ADDED Viewed

	@@ -0,0 +1,4 @@

+.venv
+data/arxivData.json
+outputs
+__pycache__/*

.gitattributes ADDED Viewed

	@@ -0,0 +1 @@


1	+ data/checkpoints/checkpoint-12300/model.safetensors filter=lfs diff=lfs merge=lfs -text

.streamlit/config.toml ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ [server]
2	+ fileWatcherType = "none"

Dockerfile ADDED Viewed

	@@ -0,0 +1,43 @@

+FROM python:3.13-slim
+# THIS IS THE DEPLOYMENT DOCKERFILE (Streamlit app); the development one is container_setup/Dockerfile
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    build-essential \
+    python3-dev && \
+    rm -rf /var/lib/apt/lists/*
+ARG USER_ID=1000
+ARG GROUP_ID=1000
+# Create a group and user with the specified UID and GID
+RUN addgroup --gid $GROUP_ID appgroup && \
+    adduser --uid $USER_ID --gid $GROUP_ID --shell /bin/bash --disabled-password --gecos "" appuser
+# Install sudo and grant privileges
+RUN apt-get update && apt-get install -y sudo && \
+    echo "appuser ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers
+# Create /app directory with proper ownership
+RUN mkdir -p /app && chown -R appuser:appgroup /app
+COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
+# Switch to the new user
+USER appuser
+WORKDIR /app
+COPY --chown=appuser:appgroup . /app
+RUN uv venv .venv
+RUN uv sync
+EXPOSE 7860
+CMD scripts/launch_app.sh

README.md CHANGED Viewed

@@ -1,22 +1,80 @@
-uv venv
-chmod +x container_setup/build container_setup/launch_container
-in folder data unzip dataset
-add .env with COMET_API_KEY, COMET_MODE=GET
-chmod +x on scripts
-add data to folder data
-specify cuda device in pipeline script

+# arXiv Paper Classification
+A machine learning application that predicts arXiv categories for academic papers based on their title and abstract. This tool uses a fine-tuned SciBERT model to classify papers into arXiv subject categories. This project was completed as homework for the YSDA ML 2 course.
+I personally hate Jupyter notebooks, so as proof that I conducted the experiments, I made the Comet ML logger project public.
+The latest training logs, configs and other details can be found here: https://www.comet.com/adenshulga/arxiv-papers-classification/ef1256f1d4eb4b588da881366eb27578?compareXAxis=step&experiment-tab=panels&showOutliers=true&smoothing=0&xAxis=step
+## Installation
+There are two closely related Dockerfile configurations. The container_setup folder contains the scripts and the Dockerfile for setting up an interactive development environment. The Dockerfile in the repository root is for deploying the Streamlit app.
+### Streamlit App Setup
+1. Clone the repository:
+   ```bash
+   git clone https://github.com/adenshulga/arxiv-paper-classification.git
+   cd arxiv-paper-classification
+   ```
+2. Give permissions for executable scripts:
+    ```
+    chmod +x scripts/pipeline.sh scripts/launch_app.sh
+    ```
+3. Build and launch docker container:
+    ```
+    docker build -t arxiv-paper-clf .
+    docker run -p 9001:9001 arxiv-paper-clf
+    ```
+### Configuration
+You can modify the inference settings in `config/inference_config.py`:
+- `model_name`: Base model name from Hugging Face
+- `checkpoint_path`: Path to fine-tuned model checkpoint
+- `top_percent`: Cumulative score threshold for showing predictions
+- `minimal_score`: Minimum confidence score to display
+## Development and model Training
+To enter the development environment:
+1. Fill container_setup/credentials file
+2. Give executable permissions to build and launch scripts:
+    ```
+    chmod +x container_setup/build.sh container_setup/launch_container.sh
+    ```
+3. Specify resource constraints in ./container_setup/launch_container.sh
+4. Build and launch docker container
+    ```
+    ./container_setup/build.sh
+    ./container_setup/launch_container.sh
+    ```
+5. Attach to running container
+    ```
+    docker attach <container-id>
+    ```
+6. Install the dependencies
+    ```
+    uv venv
+    uv sync
+    ```
+To train the model:
+1. Download and unzip the arXiv dataset into the `data` folder (https://www.kaggle.com/datasets/neelshah18/arxivdataset)
+2. Configure the process in config/pipeline_config.py
+3. Run the training script:
+   ```
+   scripts/pipeline.sh
+   ```

config/inference_config.py CHANGED Viewed

@@ -1,6 +1,4 @@
-from dataclasses import dataclass, field
-from hydra.core.config_store import ConfigStore
 @dataclass
@@ -8,5 +6,9 @@ class InferenceConfig:
     """Configuration for inference"""
     model_name: str = "allenai/scibert_scivocab_uncased"
-    checkpoint_path: str = "data/checkpoints/checkpoint-10"
     top_percent: float = 0.95

+from dataclasses import dataclass
 @dataclass
     """Configuration for inference"""
     model_name: str = "allenai/scibert_scivocab_uncased"
+    checkpoint_path: str = "data/checkpoints/checkpoint-12300"
     top_percent: float = 0.95
+    minimal_score: float = 0.01
+cfg = InferenceConfig()

container_setup/Dockerfile CHANGED Viewed

@@ -1,5 +1,7 @@
 FROM python:3.13-slim
 ENV PYTHONDONTWRITEBYTECODE=1
 ENV PYTHONUNBUFFERED=1
@@ -36,16 +38,16 @@ COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
 # Switch to the new user
 USER appuser
-SHELL ["/usr/bin/fish", "-c"]
 WORKDIR /app
-COPY --chown=appuser:appgroup . /app
-RUN uv venv .venv
-RUN uv sync
-EXPOSE 9000
-# CMD ["uv", "run", "python", "entrypoints/app.py"]

 FROM python:3.13-slim
+# THIS IS DEVELOPMENT DOCKERFILE
 ENV PYTHONDONTWRITEBYTECODE=1
 ENV PYTHONUNBUFFERED=1
 # Switch to the new user
 USER appuser
+# SHELL ["/usr/bin/fish", "-c"]
 WORKDIR /app
+# COPY --chown=appuser:appgroup . /app
+# RUN uv venv .venv
+# RUN uv sync
+# EXPOSE 7860
+# CMD scripts/launch_app.sh

container_setup/credentials CHANGED Viewed

@@ -5,5 +5,5 @@ CONTAINER_NAME=$USER"-arxiv-papers-classification"
 SRC="." # folder to propulse in docker container
 DOCKER_USER_ID=$(id -u) # to get these values type "id" in a shell terminal
 DOCKER_GROUP_ID=$(id -g)
-CONTAINER_PORT=9001 # used in launch_container file
-INNER_PORT=9001

 SRC="." # folder to propulse in docker container
 DOCKER_USER_ID=$(id -u) # to get these values type "id" in a shell terminal
 DOCKER_GROUP_ID=$(id -g)
+CONTAINER_PORT=7860 # used in launch_container file
+INNER_PORT=7860

data/checkpoints/checkpoint-12300/config.json ADDED Viewed

	@@ -0,0 +1,351 @@

+{
+  "architectures": [
+    "BertForSequenceClassification"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "classifier_dropout": null,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "id2label": {
+    "0": "A.m",
+    "1": "Artificial intelligence and nonmonotonic reasoning and belief\n  revision",
+    "2": "Comptuational science",
+    "3": "Computer Science",
+    "4": "H.m",
+    "5": "IEEE",
+    "6": "MIMO, relay, queue-aware, distributive resource control",
+    "7": "Mathematical logic and foundations",
+    "8": "aaai.org",
+    "9": "adap-org",
+    "10": "artificial intelligence, approximate reasoning",
+    "11": "astro-ph",
+    "12": "astro-ph.CO",
+    "13": "astro-ph.EP",
+    "14": "astro-ph.GA",
+    "15": "astro-ph.HE",
+    "16": "astro-ph.IM",
+    "17": "astro-ph.SR",
+    "18": "cmp-lg",
+    "19": "cond-mat",
+    "20": "cond-mat.dis-nn",
+    "21": "cond-mat.mes-hall",
+    "22": "cond-mat.mtrl-sci",
+    "23": "cond-mat.other",
+    "24": "cond-mat.quant-gas",
+    "25": "cond-mat.soft",
+    "26": "cond-mat.stat-mech",
+    "27": "cond-mat.str-el",
+    "28": "cond-mat.supr-con",
+    "29": "cs.AI",
+    "30": "cs.AR",
+    "31": "cs.CC",
+    "32": "cs.CE",
+    "33": "cs.CG",
+    "34": "cs.CL",
+    "35": "cs.CL, cs.AI, math.CT",
+    "36": "cs.CR",
+    "37": "cs.CV",
+    "38": "cs.CY",
+    "39": "cs.DB",
+    "40": "cs.DC",
+    "41": "cs.DL",
+    "42": "cs.DM",
+    "43": "cs.DS",
+    "44": "cs.ET",
+    "45": "cs.FL",
+    "46": "cs.GL",
+    "47": "cs.GR",
+    "48": "cs.GT",
+    "49": "cs.HC",
+    "50": "cs.IR",
+    "51": "cs.IT",
+    "52": "cs.LG",
+    "53": "cs.LO",
+    "54": "cs.MA",
+    "55": "cs.MM",
+    "56": "cs.MS",
+    "57": "cs.NA",
+    "58": "cs.NE",
+    "59": "cs.NI",
+    "60": "cs.OH",
+    "61": "cs.OS",
+    "62": "cs.PF",
+    "63": "cs.PL",
+    "64": "cs.RO",
+    "65": "cs.SC",
+    "66": "cs.SD",
+    "67": "cs.SE",
+    "68": "cs.SI",
+    "69": "cs.SY",
+    "70": "econ.EM",
+    "71": "eess.AS",
+    "72": "eess.IV",
+    "73": "eess.SP",
+    "74": "gr-qc",
+    "75": "hep-ex",
+    "76": "hep-lat",
+    "77": "hep-ph",
+    "78": "hep-th",
+    "79": "math-ph",
+    "80": "math.AC",
+    "81": "math.AG",
+    "82": "math.AP",
+    "83": "math.AT",
+    "84": "math.CA",
+    "85": "math.CO",
+    "86": "math.CT",
+    "87": "math.CV",
+    "88": "math.DG",
+    "89": "math.DS",
+    "90": "math.FA",
+    "91": "math.GM",
+    "92": "math.GN",
+    "93": "math.GR",
+    "94": "math.GT",
+    "95": "math.HO",
+    "96": "math.IT",
+    "97": "math.LO",
+    "98": "math.MG",
+    "99": "math.MP",
+    "100": "math.NA",
+    "101": "math.NT",
+    "102": "math.OA",
+    "103": "math.OC",
+    "104": "math.PR",
+    "105": "math.QA",
+    "106": "math.RA",
+    "107": "math.RT",
+    "108": "math.SP",
+    "109": "math.ST",
+    "110": "nlin.AO",
+    "111": "nlin.AO, nlin.CD, q-bio.NC, physics.bio-ph, cond-mat.dis-nn",
+    "112": "nlin.CD",
+    "113": "nlin.CG",
+    "114": "nlin.PS",
+    "115": "nucl-ex",
+    "116": "nucl-th",
+    "117": "physics.ao-ph",
+    "118": "physics.app-ph",
+    "119": "physics.bio-ph",
+    "120": "physics.chem-ph",
+    "121": "physics.class-ph",
+    "122": "physics.comp-ph",
+    "123": "physics.data-an",
+    "124": "physics.flu-dyn",
+    "125": "physics.gen-ph",
+    "126": "physics.geo-ph",
+    "127": "physics.hist-ph",
+    "128": "physics.ins-det",
+    "129": "physics.med-ph",
+    "130": "physics.optics",
+    "131": "physics.pop-ph",
+    "132": "physics.soc-ph",
+    "133": "physics.space-ph",
+    "134": "q-bio",
+    "135": "q-bio.BM",
+    "136": "q-bio.BM, q-bio.MN, q-bio.NC, nlin.AO, nlin.CD",
+    "137": "q-bio.CB",
+    "138": "q-bio.GN",
+    "139": "q-bio.MN",
+    "140": "q-bio.NC",
+    "141": "q-bio.OT",
+    "142": "q-bio.PE",
+    "143": "q-bio.QM",
+    "144": "q-bio.SC",
+    "145": "q-bio.TO",
+    "146": "q-fin.CP",
+    "147": "q-fin.EC",
+    "148": "q-fin.GN",
+    "149": "q-fin.PM",
+    "150": "q-fin.PR",
+    "151": "q-fin.RM",
+    "152": "q-fin.ST",
+    "153": "q-fin.TR",
+    "154": "quant-ph",
+    "155": "stat.AP",
+    "156": "stat.CO",
+    "157": "stat.ME",
+    "158": "stat.ML",
+    "159": "stat.OT",
+    "160": "stat.TH"
+  },
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "label2id": {
+    "A.m": 0,
+    "Artificial intelligence and nonmonotonic reasoning and belief\n  revision": 1,
+    "Comptuational science": 2,
+    "Computer Science": 3,
+    "H.m": 4,
+    "IEEE": 5,
+    "MIMO, relay, queue-aware, distributive resource control": 6,
+    "Mathematical logic and foundations": 7,
+    "aaai.org": 8,
+    "adap-org": 9,
+    "artificial intelligence, approximate reasoning": 10,
+    "astro-ph": 11,
+    "astro-ph.CO": 12,
+    "astro-ph.EP": 13,
+    "astro-ph.GA": 14,
+    "astro-ph.HE": 15,
+    "astro-ph.IM": 16,
+    "astro-ph.SR": 17,
+    "cmp-lg": 18,
+    "cond-mat": 19,
+    "cond-mat.dis-nn": 20,
+    "cond-mat.mes-hall": 21,
+    "cond-mat.mtrl-sci": 22,
+    "cond-mat.other": 23,
+    "cond-mat.quant-gas": 24,
+    "cond-mat.soft": 25,
+    "cond-mat.stat-mech": 26,
+    "cond-mat.str-el": 27,
+    "cond-mat.supr-con": 28,
+    "cs.AI": 29,
+    "cs.AR": 30,
+    "cs.CC": 31,
+    "cs.CE": 32,
+    "cs.CG": 33,
+    "cs.CL": 34,
+    "cs.CL, cs.AI, math.CT": 35,
+    "cs.CR": 36,
+    "cs.CV": 37,
+    "cs.CY": 38,
+    "cs.DB": 39,
+    "cs.DC": 40,
+    "cs.DL": 41,
+    "cs.DM": 42,
+    "cs.DS": 43,
+    "cs.ET": 44,
+    "cs.FL": 45,
+    "cs.GL": 46,
+    "cs.GR": 47,
+    "cs.GT": 48,
+    "cs.HC": 49,
+    "cs.IR": 50,
+    "cs.IT": 51,
+    "cs.LG": 52,
+    "cs.LO": 53,
+    "cs.MA": 54,
+    "cs.MM": 55,
+    "cs.MS": 56,
+    "cs.NA": 57,
+    "cs.NE": 58,
+    "cs.NI": 59,
+    "cs.OH": 60,
+    "cs.OS": 61,
+    "cs.PF": 62,
+    "cs.PL": 63,
+    "cs.RO": 64,
+    "cs.SC": 65,
+    "cs.SD": 66,
+    "cs.SE": 67,
+    "cs.SI": 68,
+    "cs.SY": 69,
+    "econ.EM": 70,
+    "eess.AS": 71,
+    "eess.IV": 72,
+    "eess.SP": 73,
+    "gr-qc": 74,
+    "hep-ex": 75,
+    "hep-lat": 76,
+    "hep-ph": 77,
+    "hep-th": 78,
+    "math-ph": 79,
+    "math.AC": 80,
+    "math.AG": 81,
+    "math.AP": 82,
+    "math.AT": 83,
+    "math.CA": 84,
+    "math.CO": 85,
+    "math.CT": 86,
+    "math.CV": 87,
+    "math.DG": 88,
+    "math.DS": 89,
+    "math.FA": 90,
+    "math.GM": 91,
+    "math.GN": 92,
+    "math.GR": 93,
+    "math.GT": 94,
+    "math.HO": 95,
+    "math.IT": 96,
+    "math.LO": 97,
+    "math.MG": 98,
+    "math.MP": 99,
+    "math.NA": 100,
+    "math.NT": 101,
+    "math.OA": 102,
+    "math.OC": 103,
+    "math.PR": 104,
+    "math.QA": 105,
+    "math.RA": 106,
+    "math.RT": 107,
+    "math.SP": 108,
+    "math.ST": 109,
+    "nlin.AO": 110,
+    "nlin.AO, nlin.CD, q-bio.NC, physics.bio-ph, cond-mat.dis-nn": 111,
+    "nlin.CD": 112,
+    "nlin.CG": 113,
+    "nlin.PS": 114,
+    "nucl-ex": 115,
+    "nucl-th": 116,
+    "physics.ao-ph": 117,
+    "physics.app-ph": 118,
+    "physics.bio-ph": 119,
+    "physics.chem-ph": 120,
+    "physics.class-ph": 121,
+    "physics.comp-ph": 122,
+    "physics.data-an": 123,
+    "physics.flu-dyn": 124,
+    "physics.gen-ph": 125,
+    "physics.geo-ph": 126,
+    "physics.hist-ph": 127,
+    "physics.ins-det": 128,
+    "physics.med-ph": 129,
+    "physics.optics": 130,
+    "physics.pop-ph": 131,
+    "physics.soc-ph": 132,
+    "physics.space-ph": 133,
+    "q-bio": 134,
+    "q-bio.BM": 135,
+    "q-bio.BM, q-bio.MN, q-bio.NC, nlin.AO, nlin.CD": 136,
+    "q-bio.CB": 137,
+    "q-bio.GN": 138,
+    "q-bio.MN": 139,
+    "q-bio.NC": 140,
+    "q-bio.OT": 141,
+    "q-bio.PE": 142,
+    "q-bio.QM": 143,
+    "q-bio.SC": 144,
+    "q-bio.TO": 145,
+    "q-fin.CP": 146,
+    "q-fin.EC": 147,
+    "q-fin.GN": 148,
+    "q-fin.PM": 149,
+    "q-fin.PR": 150,
+    "q-fin.RM": 151,
+    "q-fin.ST": 152,
+    "q-fin.TR": 153,
+    "quant-ph": 154,
+    "stat.AP": 155,
+    "stat.CO": 156,
+    "stat.ME": 157,
+    "stat.ML": 158,
+    "stat.OT": 159,
+    "stat.TH": 160
+  },
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
+  "problem_type": "multi_label_classification",
+  "torch_dtype": "float32",
+  "transformers_version": "4.50.3",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 31090
+}

data/checkpoints/checkpoint-12300/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fe26497cd2f66e6db8739c13ac359fe50c8023ff6bb897c80df6ddae58a77cd3
+size 440192628

data/checkpoints/checkpoint-12300/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}

data/checkpoints/checkpoint-12300/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

data/checkpoints/checkpoint-12300/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,58 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "101": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "102": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "103": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "104": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": true,
+  "extra_special_tokens": {},
+  "mask_token": "[MASK]",
+  "model_max_length": 1000000000000000019884624838656,
+  "never_split": null,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}

data/checkpoints/checkpoint-12300/trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

data/checkpoints/checkpoint-12300/training_args.bin ADDED Viewed

Binary file (5.37 kB). View file

data/checkpoints/checkpoint-12300/vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

entrypoints/app.py CHANGED Viewed

@@ -1,9 +1,10 @@
 import streamlit as st
-from src.app.setup_model import setup_pipeline, get_top_label_names, LabelScore
 from src.app.tags_mapping import tags2full_name
 from src.app.visualization import visualize_predicted_categories
-from config.inference_config import InferenceConfig
-from src.app.data_validation import validate_data
 st.title("arXiv Paper Classifier")
 st.markdown("Enter paper details to predict arXiv categories")
@@ -11,17 +12,20 @@ st.markdown("Enter paper details to predict arXiv categories")
 st.text_input("Enter paper name", key="paper_name")
 st.text_area("Enter paper abstract", key="paper_abstract", height=250)
-if st.button("Predict Categories", type="primary"):
-    validate_data(st.session_state["paper_name"], st.session_state["paper_abstract"])
     with st.spinner("Analyzing paper..."):
-        pipeline = setup_pipeline(InferenceConfig())
         scores: list[LabelScore] = pipeline(
             st.session_state["paper_name"] + " " + st.session_state["paper_abstract"],
-            output_scores=True,
         )  # type: ignore
-        top_labels = get_top_label_names(scores, tags2full_name, 0.95)
-    visualize_predicted_categories(top_labels, scores, tags2full_name)
 else:
     st.info("Enter paper details and click 'Predict Categories' to get predictions.")

 import streamlit as st
+from config.inference_config import cfg
+from src.app.data_validation import validate_data
+from src.app.setup_model import LabelScore, get_top_label_names, setup_pipeline
 from src.app.tags_mapping import tags2full_name
 from src.app.visualization import visualize_predicted_categories
 st.title("arXiv Paper Classifier")
 st.markdown("Enter paper details to predict arXiv categories")
 st.text_input("Enter paper name", key="paper_name")
 st.text_area("Enter paper abstract", key="paper_abstract", height=250)
+if st.button("Predict Categories", type="primary") and validate_data(
+    st.session_state["paper_name"], st.session_state["paper_abstract"]
+):
     with st.spinner("Analyzing paper..."):
+        pipeline = setup_pipeline(cfg)
         scores: list[LabelScore] = pipeline(
             st.session_state["paper_name"] + " " + st.session_state["paper_abstract"],
+            top_k=None,
         )  # type: ignore
+        top_labels = get_top_label_names(scores, tags2full_name, cfg.top_percent)
+    visualize_predicted_categories(
+        top_labels, scores, tags2full_name, minimal_score=cfg.minimal_score
+    )
 else:
     st.info("Enter paper details and click 'Predict Categories' to get predictions.")

scripts/launch_app.sh CHANGED Viewed

	@@ -0,0 +1,7 @@

+#!/bin/bash
+export PYTHONPATH='.'
+# source .venv/bin/activate
+CUDA_VISIBLE_DEVICES="" uv run -m streamlit run entrypoints/app.py --server.address=0.0.0.0 --server.port=9001

src/app/data_validation.py CHANGED Viewed

@@ -1,12 +1,13 @@
 import streamlit as st
-def validate_data(paper_name: str, paper_abstract: str) -> None:
-    if paper_name == "" or paper_abstract == "":
         st.error("Paper name or abstract are required")
-        return
     if paper_abstract == "":
         st.warning(
             "Without abstract, the performance of the model will be significantly worse"
         )
-        return

 import streamlit as st
+def validate_data(paper_name: str, paper_abstract: str) -> bool:
+    if paper_name == "" and paper_abstract == "":
         st.error("Paper name or abstract are required")
+        return False
     if paper_abstract == "":
         st.warning(
             "Without abstract, the performance of the model will be significantly worse"
         )
+        return True
+    return True

src/app/setup_model.py CHANGED Viewed

@@ -3,8 +3,6 @@ import typing as tp
 from config.inference_config import InferenceConfig
 import streamlit as st
-from src.app.tags_mapping import tags2full_name
 class LabelScore(tp.TypedDict):
     label: str
@@ -14,7 +12,9 @@ class LabelScore(tp.TypedDict):
 @st.cache_resource
 def setup_pipeline(cfg: InferenceConfig) -> Pipeline:
     model = pipeline(
-        "text-classification", model=cfg.checkpoint_path, tokenizer=cfg.model_name
     )
     return model

 from config.inference_config import InferenceConfig
 import streamlit as st
 class LabelScore(tp.TypedDict):
     label: str
 @st.cache_resource
 def setup_pipeline(cfg: InferenceConfig) -> Pipeline:
     model = pipeline(
+        "text-classification",
+        model=cfg.checkpoint_path,
+        tokenizer=cfg.model_name,
     )
     return model

src/app/visualization.py CHANGED Viewed

@@ -7,6 +7,7 @@ def visualize_predicted_categories(
     top_labels: list[LabelScore],
     scores: list[LabelScore],
     label_to_name_mapping: Dict[str, str],
 ):
     """
     Visualize the predicted categories in a streamlit app
@@ -19,7 +20,9 @@ def visualize_predicted_categories(
     st.subheader("Predicted Categories")
     for i, label in enumerate(top_labels):
-        score = next((s["score"] for s in scores if s["label"] == label["label"]), 0)
         # Color gradient based on confidence
         color_intensity = min(int(score * 255), 255)

     top_labels: list[LabelScore],
     scores: list[LabelScore],
     label_to_name_mapping: Dict[str, str],
+    minimal_score: float = 0.01,
 ):
     """
     Visualize the predicted categories in a streamlit app
     st.subheader("Predicted Categories")
     for i, label in enumerate(top_labels):
+        score = label["score"]
+        if score < minimal_score:
+            continue
         # Color gradient based on confidence
         color_intensity = min(int(score * 255), 255)