File size: 1,920 Bytes
9d86a53
89245fd
 
 
 
 
9d86a53
89245fd
9d86a53
 
 
89245fd
9d86a53
89245fd
d364c27
 
 
 
 
 
836f07d
 
 
9d86a53
 
89245fd
 
9d86a53
 
 
 
 
 
 
89245fd
 
 
 
 
 
 
 
 
 
9d86a53
89245fd
 
 
 
9d86a53
 
 
 
 
 
 
 
89245fd
 
 
9d86a53
 
 
89245fd
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# ingest.py
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import weaviate
from tqdm import tqdm
import time
import os

# Connection settings are read from the environment. A missing variable
# raises KeyError here, before any connection is attempted.
WEAVIATE_URL = os.environ["WEAVIATE_URL"]
WEAVIATE_KEY = os.environ["WEAVIATE_KEY"]

print("Connecting to Weaviate:", WEAVIATE_URL)

# API-key authentication, v3 client syntax.
api_key_credentials = weaviate.AuthApiKey(WEAVIATE_KEY)
client = weaviate.Client(url=WEAVIATE_URL, auth_client_secret=api_key_credentials)

# Dataset: open the train split in streaming mode and materialise only the
# first 10,000 rows as a plain list, so it can be iterated later.
print("Loading 10K ArXiv papers from CShorten/ML-ArXiv-Papers...")
dataset = list(
    load_dataset("CShorten/ML-ArXiv-Papers", split="train", streaming=True).take(10000)
)

# Embedding model used to turn each paper's text into a vector.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Reset: drop any existing "Paper" class so the run starts from an empty
# collection, then recreate it.
try:
    client.schema.delete_class("Paper")
except weaviate.exceptions.UnexpectedStatusCodeException as exc:
    # Best-effort: the server rejected the delete (for example because the
    # class is not there yet). Report it and carry on. Anything else, such as
    # a connection failure or Ctrl-C, is allowed to propagate instead of
    # being silently swallowed.
    print("Could not delete 'Paper' class (continuing):", exc)
else:
    print("Deleted old 'Paper' class")

# "vectorizer": "none" because vectors are supplied by this script at import
# time rather than computed by Weaviate.
client.schema.create_class({
    "class": "Paper",
    "vectorizer": "none",
    "properties": [
        {"name": "title", "dataType": ["text"]},
        {"name": "abstract", "dataType": ["text"]},
        {"name": "category", "dataType": ["text"]}
    ]
})

# INGEST
batch_size = 50

# v3 client batching: objects are queued one at a time with
# add_data_object(); with batch_size configured the client sends the queue
# every `batch_size` objects and once more when the `with` block exits.
# (create_objects() takes no object list or vector arguments in this client.)
client.batch.configure(batch_size=batch_size)

print("Embedding & uploading 10K papers...")
with client.batch as batch, tqdm(total=len(dataset), desc="Indexing") as progress:
    for start in range(0, len(dataset), batch_size):
        chunk = dataset[start:start + batch_size]

        records = []
        texts = []
        for item in chunk:
            # `or ""` covers fields that are missing as well as fields that
            # are present but None.
            title = item.get("title") or ""
            abstract = item.get("abstract") or ""
            categories = (item.get("categories") or "").split()

            records.append({
                "title": title[:500],
                "abstract": abstract[:1000],
                "category": categories[0] if categories else "unknown",
            })
            # Text that gets embedded: title + abstract, capped at 1000 chars.
            texts.append((title + " " + abstract)[:1000])

        # One encode() call per chunk instead of one call per paper.
        vectors = embedder.encode(texts, show_progress_bar=False).tolist()

        for record, vector in zip(records, vectors):
            batch.add_data_object(
                data_object=record,
                class_name="Paper",
                vector=vector,
            )

        progress.update(len(chunk))
        # Short pause between batches, kept from the original loop
        # (presumably to throttle requests to the server).
        time.sleep(0.1)

print("SUCCESS: 10K papers indexed!")