"""Index the first 10,000 ArXiv papers into a Weaviate ``Paper`` class.

Each paper's title and abstract are embedded locally with the
``all-MiniLM-L6-v2`` sentence-transformer, and the resulting vector is
uploaded together with the paper's title, abstract and primary category.

Configuration is read from the environment:

* ``WEAVIATE_URL``     - cluster URL (defaults to the original cluster).
* ``WEAVIATE_API_KEY`` - API key for the cluster (required).
"""

import os
import time
from itertools import islice

import weaviate
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

WEAVIATE_URL = os.environ.get(
    "WEAVIATE_URL",
    "https://trkmgy3rradnlik1l8jlw.c0.us-west3.gcp.weaviate.cloud",
)
# SECURITY: the API key used to be hard-coded in this file. It is now read
# from the environment; the previously committed key should be treated as
# leaked and rotated.
WEAVIATE_KEY = os.environ.get("WEAVIATE_API_KEY")

CLASS_NAME = "Paper"
BATCH_SIZE = 50
MAX_EMBED_CHARS = 1000
MAX_TITLE_CHARS = 500
MAX_ABSTRACT_CHARS = 1000
# Short pause between batches, kept from the original script as a simple
# client-side throttle.
PAUSE_BETWEEN_BATCHES_S = 0.1


def _batched(iterable, size):
    """Yield successive lists of at most ``size`` items from ``iterable``."""
    iterator = iter(iterable)
    while chunk := list(islice(iterator, size)):
        yield chunk


def _build_record(item):
    """Return ``(text_to_embed, properties)`` for one dataset row.

    Missing (``None``) title, abstract or categories are treated as empty
    strings so a single incomplete row cannot abort the whole ingestion.
    """
    title = item["title"] or ""
    abstract = item["abstract"] or ""
    # `categories` is a space-separated list; the first entry is used as the
    # paper's category. An empty or whitespace-only value maps to "unknown".
    categories = (item["categories"] or "").split()

    text = (title + " " + abstract)[:MAX_EMBED_CHARS]
    properties = {
        "title": title[:MAX_TITLE_CHARS],
        "abstract": abstract[:MAX_ABSTRACT_CHARS],
        "category": categories[0] if categories else "unknown",
    }
    return text, properties


def _reset_schema(client):
    """Drop any existing ``Paper`` class and recreate it without a vectorizer."""
    client.schema.delete_class(CLASS_NAME)
    client.schema.create_class({
        "class": CLASS_NAME,
        # Vectors are computed locally and supplied with each object.
        "vectorizer": "none",
        "properties": [
            {"name": "title", "dataType": ["text"]},
            {"name": "abstract", "dataType": ["text"]},
            {"name": "category", "dataType": ["text"]},
        ],
    })


def _upload_batch(client, embedder, items):
    """Embed ``items`` and upload them as one batch.

    Returns the number of objects for which Weaviate reported an error.
    """
    records = [_build_record(item) for item in items]
    # Encode the whole batch in one call instead of one call per paper.
    vectors = embedder.encode([text for text, _ in records])

    for (_, properties), vector in zip(records, vectors):
        client.batch.add_data_object(
            properties, CLASS_NAME, vector=vector.tolist()
        )

    # `create_objects()` takes no objects itself: it sends everything queued
    # by `add_data_object` above and returns one result entry per object.
    results = client.batch.create_objects() or []
    # NOTE(review): per-object failures are expected under
    # result["result"]["errors"] in the v3 client - confirm against the
    # installed client version.
    return sum(
        1
        for result in results
        if isinstance(result, dict)
        and (result.get("result") or {}).get("errors")
    )


def main():
    """Load the dataset, reset the schema and ingest all papers."""
    if not WEAVIATE_KEY:
        raise SystemExit(
            "WEAVIATE_API_KEY is not set; export it before running."
        )

    # Load data
    print("Loading 10K ArXiv papers...")
    dataset = load_dataset("arxiv_dataset", split="train[:10000]")

    # Embedder
    embedder = SentenceTransformer("all-MiniLM-L6-v2")

    # Connect
    client = weaviate.Client(
        url=WEAVIATE_URL,
        auth_client_secret=weaviate.AuthApiKey(WEAVIATE_KEY),
    )

    # Reset + create schema
    _reset_schema(client)

    # Ingest. Auto-flushing is disabled so each batch is sent explicitly and
    # its results can be inspected.
    client.batch.configure(batch_size=None)
    failed = 0
    print("Embedding & uploading...")
    for items in _batched(tqdm(dataset), BATCH_SIZE):
        failed += _upload_batch(client, embedder, items)
        time.sleep(PAUSE_BETWEEN_BATCHES_S)

    if failed:
        print(f"WARNING: {failed} objects were rejected by Weaviate.")
    else:
        print("SUCCESS: 10K papers indexed!")


if __name__ == "__main__":
    main()