arxiv-rag-demo / ingest.py
aakash-malhan's picture
Create ingest.py
89245fd verified
raw
history blame
1.7 kB
import os
import time

from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import weaviate
# Connection settings are read from the environment so that the API key is
# never stored in source control.
#   WEAVIATE_URL      cluster endpoint; defaults to the demo cluster below.
#   WEAVIATE_API_KEY  API key for that cluster; no default on purpose.
# NOTE(review): the key that used to be hard-coded on this line has been
# published with the file and should be revoked/rotated.
WEAVIATE_URL = os.environ.get(
    "WEAVIATE_URL",
    "https://trkmgy3rradnlik1l8jlw.c0.us-west3.gcp.weaviate.cloud",
)
# Falls back to an empty string when unset; an empty key is not expected to
# authenticate, so export WEAVIATE_API_KEY before running the script.
WEAVIATE_KEY = os.environ.get("WEAVIATE_API_KEY", "")
# Load data: only the first 10,000 rows of the "train" split are requested.
print("Loading 10K ArXiv papers...")
arxiv_split = "train[:10000]"
dataset = load_dataset(path="arxiv_dataset", split=arxiv_split)

# Embedder: sentence-transformers model used to turn each paper's text into
# the vector that is stored alongside it.
embedding_model_name = "all-MiniLM-L6-v2"
embedder = SentenceTransformer(model_name_or_path=embedding_model_name)
# Connect
# Fail fast with a readable message when either setting is empty, rather than
# letting the client surface an opaque connection/authentication error later.
if not WEAVIATE_URL or not WEAVIATE_KEY:
    raise SystemExit(
        "WEAVIATE_URL and WEAVIATE_KEY must both be non-empty to connect to Weaviate."
    )

# API-key authenticated client; every schema and batch call below goes
# through this object.
client = weaviate.Client(
    url=WEAVIATE_URL,
    auth_client_secret=weaviate.AuthApiKey(WEAVIATE_KEY),
)
# Reset + create schema
# WARNING: this drops the existing "Paper" class (and whatever is stored in
# it) before recreating it, so every run starts from an empty collection.
# NOTE(review): behavior of delete_class when the class does not exist yet
# (first run) is not visible from here — confirm it does not raise.
paper_class_name = "Paper"
paper_text_fields = ("title", "abstract", "category")

paper_class_definition = {
    "class": paper_class_name,
    # Vectors are computed by this script and uploaded with each object.
    "vectorizer": "none",
    "properties": [
        {"name": field_name, "dataType": ["text"]}
        for field_name in paper_text_fields
    ],
}

client.schema.delete_class(paper_class_name)
client.schema.create_class(paper_class_definition)
# Ingest
batch_size = 50


def _paper_fields(item):
    """Return ``(embedding_text, properties)`` for one dataset row.

    ``None`` values for title, abstract or categories are treated as empty
    strings so that a single incomplete row cannot abort the whole run.
    The embedding text is title + abstract capped at 1000 characters; the
    stored title is capped at 500 characters and the abstract at 1000.
    """
    title = item['title'] or ""
    abstract = item['abstract'] or ""
    category_tokens = (item['categories'] or "").split()
    properties = {
        "title": title[:500],
        "abstract": abstract[:1000],
        # Keep only the first listed category; empty or whitespace-only
        # values fall back to "unknown" instead of raising IndexError.
        "category": category_tokens[0] if category_tokens else "unknown",
    }
    return (title + " " + abstract)[:1000], properties


def _queue_papers(batch, texts, papers):
    """Embed ``texts`` with one model call and queue each paper with its vector."""
    vectors = embedder.encode(texts)
    for properties, vector in zip(papers, vectors):
        batch.add_data_object(properties, "Paper", vector=vector.tolist())


print("Embedding & uploading...")
# Objects are handed to the batch one at a time with add_data_object(); with
# batch_size configured, the client sends a request whenever that many objects
# are queued and once more when the `with` block exits.
client.batch.configure(batch_size=batch_size)
submitted = 0
texts, papers = [], []
with client.batch as batch:
    for item in tqdm(dataset):
        text, properties = _paper_fields(item)
        texts.append(text)
        papers.append(properties)
        if len(papers) >= batch_size:
            _queue_papers(batch, texts, papers)
            submitted += len(papers)
            texts, papers = [], []
            # Short pause between batches, kept from the original script
            # (presumably to avoid overloading the cluster).
            time.sleep(0.1)
    if papers:
        # Final partial batch.
        _queue_papers(batch, texts, papers)
        submitted += len(papers)
# Report the number of papers actually submitted rather than a fixed "10K".
print(f"SUCCESS: {submitted} papers indexed!")