# ingest.py
"""Embed ArXiv paper records and upload them to a Weaviate ``Paper`` class.

The script reads the Weaviate endpoint and API key from the environment,
drops and recreates the ``Paper`` class, embeds ``title + abstract`` for
each record with a SentenceTransformer model, and uploads the objects with
their vectors in batches.
"""
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import weaviate
from tqdm import tqdm
import time
import os

# SECRETS
# Indexing os.environ directly makes a missing setting fail fast (KeyError).
WEAVIATE_URL = os.environ["WEAVIATE_URL"]
WEAVIATE_KEY = os.environ["WEAVIATE_KEY"]

CLASS_NAME = "Paper"
NUM_PAPERS = 10000
BATCH_SIZE = 50
MAX_TITLE_CHARS = 500
MAX_TEXT_CHARS = 1000  # cap for both the stored abstract and the embedded text
BATCH_PAUSE_SECONDS = 0.1


def _paper_properties(item):
    """Return ``(properties, text)`` for one dataset record.

    ``properties`` is the dict stored in Weaviate (title, abstract, category)
    and ``text`` is the truncated ``title + " " + abstract`` string to embed.
    Missing or ``None`` fields are treated as empty strings.
    """
    title = item.get("title") or ""
    abstract = item.get("abstract") or ""
    # `or ""` also covers a key that is present but None, which the plain
    # .get(key, "") default does not.
    # NOTE(review): confirm the dataset actually exposes a "categories"
    # column; if it does not, every record is stored as "unknown".
    categories = (item.get("categories") or "").split()
    category = categories[0] if categories else "unknown"
    properties = {
        "title": title[:MAX_TITLE_CHARS],
        "abstract": abstract[:MAX_TEXT_CHARS],
        "category": category,
    }
    text = (title + " " + abstract)[:MAX_TEXT_CHARS]
    return properties, text


print("Connecting to Weaviate:", WEAVIATE_URL)
client = weaviate.Client(
    url=WEAVIATE_URL,
    auth_client_secret=weaviate.AuthApiKey(WEAVIATE_KEY),  # v3 client syntax
)

# LOAD DATASET
print("Loading 10K ArXiv papers from CShorten/ML-ArXiv-Papers...")
dataset = load_dataset("CShorten/ML-ArXiv-Papers", split="train", streaming=True)
dataset = list(dataset.take(NUM_PAPERS))

# EMBEDDER
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Reset
# Best-effort: the class may not exist on a first run, so a failed delete is
# reported and the script carries on instead of aborting.
try:
    client.schema.delete_class(CLASS_NAME)
    print("Deleted old 'Paper' class")
except Exception as exc:
    print("Could not delete old 'Paper' class (continuing):", exc)

client.schema.create_class({
    "class": CLASS_NAME,
    "vectorizer": "none",  # vectors are supplied by this script
    "properties": [
        {"name": "title", "dataType": ["text"]},
        {"name": "abstract", "dataType": ["text"]},
        {"name": "category", "dataType": ["text"]},
    ],
})

# INGEST
print("Embedding & uploading 10K papers...")
# With batch_size set, the v3 batch sends itself automatically each time
# BATCH_SIZE objects have been queued, and flushes the remainder when the
# `with` block exits.
client.batch.configure(batch_size=BATCH_SIZE)
with client.batch as batch, tqdm(total=len(dataset), desc="Indexing") as progress:
    for start in range(0, len(dataset), BATCH_SIZE):
        chunk = dataset[start:start + BATCH_SIZE]
        prepared = [_paper_properties(item) for item in chunk]
        # One encode call per chunk instead of one model call per paper.
        vectors = embedder.encode(
            [text for _, text in prepared],
            show_progress_bar=False,
        )
        for (properties, _), vector in zip(prepared, vectors):
            batch.add_data_object(
                data_object=properties,
                class_name=CLASS_NAME,
                vector=vector.tolist(),
            )
        progress.update(len(chunk))
        # Short pause after each batch to avoid hammering the server.
        time.sleep(BATCH_PAUSE_SECONDS)

print("SUCCESS: 10K papers indexed!")