arxiv-rag-demo / ingest.py
aakash-malhan's picture
Update ingest.py
d364c27 verified
# ingest.py
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import weaviate
from tqdm import tqdm
import time
import os
# SECRETS
# Both settings are mandatory: a missing environment variable raises KeyError
# here, before any connection attempt is made.
WEAVIATE_URL = os.environ["WEAVIATE_URL"]
WEAVIATE_KEY = os.environ["WEAVIATE_KEY"]

print("Connecting to Weaviate:", WEAVIATE_URL)

# API-key authentication, using the v3 client syntax.
_api_key_auth = weaviate.AuthApiKey(WEAVIATE_KEY)
client = weaviate.Client(url=WEAVIATE_URL, auth_client_secret=_api_key_auth)
# LOAD DATASET
print("Loading 10K ArXiv papers from CShorten/ML-ArXiv-Papers...")
# Open the train split in streaming mode and keep only the first 10,000
# records, materialised as an in-memory list.
dataset = list(
    load_dataset(
        "CShorten/ML-ArXiv-Papers",
        split="train",
        streaming=True,
    ).take(10000)
)

# EMBEDDER
# Sentence-embedding model used to turn each paper's text into a vector.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
# Reset
# Drop any "Paper" class left over from an earlier run so the class below is
# always created from scratch.
try:
    client.schema.delete_class("Paper")
except weaviate.exceptions.UnexpectedStatusCodeException as exc:
    # Best-effort cleanup: the server refused the delete (for example because
    # the class does not exist yet). Report it instead of hiding it; any other
    # failure (connection problems, Ctrl-C, ...) is allowed to propagate.
    print("No existing 'Paper' class deleted:", exc)
else:
    print("Deleted old 'Paper' class")

# "vectorizer": "none" because embeddings are computed by this script and
# sent along with each object rather than generated by Weaviate.
client.schema.create_class(
    {
        "class": "Paper",
        "vectorizer": "none",
        "properties": [
            {"name": "title", "dataType": ["text"]},
            {"name": "abstract", "dataType": ["text"]},
            {"name": "category", "dataType": ["text"]},
        ],
    }
)
# INGEST
batch_size = 50


def _count_batch_errors(results):
    """Return how many entries of a batch response report an error."""
    failed = 0
    for entry in results or []:
        if (entry.get("result") or {}).get("errors"):
            failed += 1
    return failed


def _upload(records):
    """Embed and upload one batch; return the number of rejected objects.

    *records* is a non-empty list of ``(text, properties)`` pairs. All texts
    are embedded in a single ``encode`` call, each object is queued on the
    client's batch together with its vector, and the batch is then sent.
    """
    embeddings = embedder.encode([text for text, _ in records])
    for (_, properties), embedding in zip(records, embeddings):
        client.batch.add_data_object(
            properties, "Paper", vector=embedding.tolist()
        )
    return _count_batch_errors(client.batch.create_objects())


print("Embedding & uploading 10K papers...")

# Manual batching: nothing is sent until create_objects() is called, so the
# loop below decides when a batch goes out (and can pause between batches).
client.batch.configure(batch_size=None)

pending = []
failed = 0
for item in tqdm(dataset, desc="Indexing"):
    # Treat missing and None fields alike.
    title = item.get("title") or ""
    abstract = item.get("abstract") or ""
    categories = (item.get("categories") or "").split()
    category = categories[0] if categories else "unknown"

    # Text that gets embedded: title plus abstract, capped at 1000 characters.
    text = (title + " " + abstract)[:1000]
    properties = {
        "title": title[:500],
        "abstract": abstract[:1000],
        "category": category,
    }
    pending.append((text, properties))

    if len(pending) >= batch_size:
        failed += _upload(pending)
        pending = []
        time.sleep(0.1)  # brief pause between batches

# Send whatever is left over after the last full batch.
if pending:
    failed += _upload(pending)

if failed:
    print(f"WARNING: {failed} objects were rejected by Weaviate")
else:
    print("SUCCESS: 10K papers indexed!")