Spaces:
Sleeping
Sleeping
| # ingest.py | |
| from datasets import load_dataset | |
| from sentence_transformers import SentenceTransformer | |
| import weaviate | |
| from tqdm import tqdm | |
| import time | |
| import os | |
# SECRETS
# Both settings are read straight from the environment; a missing variable
# raises KeyError before any connection is attempted.
WEAVIATE_URL = os.environ["WEAVIATE_URL"]
WEAVIATE_KEY = os.environ["WEAVIATE_KEY"]

print("Connecting to Weaviate:", WEAVIATE_URL)

# API-key authentication, passed to the v3-style client constructor.
_api_key_auth = weaviate.AuthApiKey(WEAVIATE_KEY)
client = weaviate.Client(
    url=WEAVIATE_URL,
    auth_client_secret=_api_key_auth,
)
# LOAD DATASET
print("Loading 10K ArXiv papers from CShorten/ML-ArXiv-Papers...")
# Open the train split in streaming mode and materialise only the first
# 10,000 records into a list.
dataset = list(
    load_dataset(
        "CShorten/ML-ArXiv-Papers",
        split="train",
        streaming=True,
    ).take(10000)
)

# EMBEDDER
# Sentence-embedding model used to vectorise each paper's text.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
# Reset
# Best-effort removal of an existing "Paper" class before it is re-created
# below.  A failed delete must not stop the script, but it is reported rather
# than silently discarded, and KeyboardInterrupt / SystemExit are no longer
# swallowed (the original used a bare ``except``).
try:
    client.schema.delete_class("Paper")
except Exception as exc:
    print("Could not delete old 'Paper' class:", exc)
else:
    print("Deleted old 'Paper' class")

# "vectorizer": "none" -- vectors are supplied by this script at upload time
# instead of being computed by Weaviate.
client.schema.create_class({
    "class": "Paper",
    "vectorizer": "none",
    "properties": [
        {"name": "title", "dataType": ["text"]},
        {"name": "abstract", "dataType": ["text"]},
        {"name": "category", "dataType": ["text"]},
    ],
})
# INGEST
# Walk the loaded papers, build one "Paper" object per record, and upload the
# objects together with their embeddings in fixed-size batches.
batch_size = 50  # papers embedded and uploaded per request


def _upload_batch(objects, texts):
    """Embed ``texts`` and upload ``objects`` with those vectors as one batch.

    ``objects[i]`` is stored in the "Paper" class with the embedding of
    ``texts[i]``.

    Returns the number of objects for which the response returned by
    ``create_objects()`` carried an ``errors`` entry.
    """
    # One encode() call for the whole batch instead of one call per paper.
    vectors = embedder.encode(texts).tolist()
    for properties, vector in zip(objects, vectors):
        client.batch.add_data_object(properties, "Paper", vector=vector)
    # In the v3 client create_objects() takes no arguments: it sends whatever
    # add_data_object() has queued and returns one result entry per object.
    results = client.batch.create_objects()
    return sum(
        1
        for result in results or []
        if isinstance(result, dict)
        and (result.get("result") or {}).get("errors")
    )


objects = []
texts = []
failed = 0
print("Embedding & uploading 10K papers...")
for item in tqdm(dataset, desc="Indexing"):
    # Any of these fields may be missing or None; normalise to "".
    title = item.get("title") or ""
    abstract = item.get("abstract") or ""
    categories = (item.get("categories") or "").split()
    # Only the first whitespace-separated category is stored.
    category = categories[0] if categories else "unknown"
    objects.append({
        "title": title[:500],
        "abstract": abstract[:1000],
        "category": category,
    })
    # Text that gets embedded: title + abstract, capped at 1000 characters.
    texts.append((title + " " + abstract)[:1000])
    if len(objects) >= batch_size:
        failed += _upload_batch(objects, texts)
        objects, texts = [], []
        time.sleep(0.1)  # short pause between batch requests

# Upload whatever is left over after the last full batch.
if objects:
    failed += _upload_batch(objects, texts)

if failed:
    print("WARNING:", failed, "papers reported errors during upload")
else:
    print("SUCCESS: 10K papers indexed!")