aakash-malhan committed on
Commit
89245fd
·
verified ·
1 Parent(s): b720eb4

Create ingest.py

Browse files
Files changed (1) hide show
  1. ingest.py +60 -0
ingest.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Embed a slice of the ArXiv dataset and upload it to a Weaviate "Paper" class.

The script loads the first 10,000 rows of the dataset, embeds title plus
abstract with a SentenceTransformer model, recreates the "Paper" class with
client-supplied vectors, and uploads the objects in batches.

Configuration comes from the environment:
    WEAVIATE_URL      cluster URL (optional; defaults to the project cluster)
    WEAVIATE_API_KEY  API key for the cluster (required)
"""
import os
import time

from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import weaviate

WEAVIATE_URL = os.environ.get(
    "WEAVIATE_URL",
    "https://trkmgy3rradnlik1l8jlw.c0.us-west3.gcp.weaviate.cloud",
)
# SECURITY: the API key must never be committed to source control. A key was
# previously hard-coded in this file; it should be treated as leaked and
# rotated in the Weaviate console.
WEAVIATE_KEY = os.environ.get("WEAVIATE_API_KEY")

CLASS_NAME = "Paper"
BATCH_SIZE = 50
MAX_TITLE_CHARS = 500
MAX_TEXT_CHARS = 1000


def _prepare_item(item):
    """Return ``(properties, text_to_embed)`` for one dataset row.

    Missing (``None``) title, abstract, or categories are treated as empty
    strings so a single incomplete row cannot abort the whole ingest.
    """
    title = item["title"] or ""
    abstract = item["abstract"] or ""
    # Only the first listed category is stored; an empty or whitespace-only
    # categories field falls back to "unknown".
    category_tokens = (item["categories"] or "").split()
    category = category_tokens[0] if category_tokens else "unknown"

    properties = {
        "title": title[:MAX_TITLE_CHARS],
        "abstract": abstract[:MAX_TEXT_CHARS],
        "category": category,
    }
    text = (title + " " + abstract)[:MAX_TEXT_CHARS]
    return properties, text


if not WEAVIATE_KEY:
    raise SystemExit(
        "WEAVIATE_API_KEY is not set; export it before running this script."
    )

# Load data
print("Loading 10K ArXiv papers...")
dataset = load_dataset("arxiv_dataset", split="train[:10000]")

# Embedder
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Connect
client = weaviate.Client(
    url=WEAVIATE_URL,
    auth_client_secret=weaviate.AuthApiKey(WEAVIATE_KEY),
)

# Reset + create schema. The delete is guarded so a first run against a
# cluster that does not have the class yet does not fail.
if client.schema.exists(CLASS_NAME):
    client.schema.delete_class(CLASS_NAME)
client.schema.create_class({
    "class": CLASS_NAME,
    "vectorizer": "none",  # vectors are supplied by this script
    "properties": [
        {"name": "title", "dataType": ["text"]},
        {"name": "abstract", "dataType": ["text"]},
        {"name": "category", "dataType": ["text"]},
    ],
})

# Ingest. The client flushes automatically every BATCH_SIZE objects and once
# more when the ``with`` block exits, so no manual bookkeeping of pending
# objects/vectors is needed.
client.batch.configure(batch_size=BATCH_SIZE)

print("Embedding & uploading...")
with client.batch as batch:
    for count, item in enumerate(tqdm(dataset), start=1):
        properties, text = _prepare_item(item)
        vector = embedder.encode(text).tolist()
        batch.add_data_object(properties, CLASS_NAME, vector=vector)

        # Brief pause after each full batch to avoid hammering the cluster.
        if count % BATCH_SIZE == 0:
            time.sleep(0.1)

print("SUCCESS: 10K papers indexed!")