sakthi07 committed on
Commit
fbd8cfb
·
verified ·
1 Parent(s): 6935535

Upload ingest.py

Browse files
Files changed (1) hide show
  1. ingest.py +35 -0
ingest.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Build a FAISS vector index from a PDF using OpenAI embeddings.

Pipeline: load ``data/diabetes.pdf``, split it into overlapping text
chunks, embed the chunks with OpenAI, and save the resulting FAISS index
under ``storage/faiss_index``.

The script exits early with a readable message when a precondition is not
met (missing API key, missing PDF, no extractable text) instead of failing
later inside a library call.
"""
import os
from dotenv import load_dotenv

# Correct imports
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS  # use community version

# Load environment variables (a local .env file, if present, is merged in).
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    # Fail before doing any PDF work: the embedding step cannot succeed
    # without credentials, and the library error is far less obvious.
    raise SystemExit(
        "OPENAI_API_KEY is not set; add it to the environment or a .env file."
    )

# Paths
pdf_path = os.path.join("data", "diabetes.pdf")
out_dir = os.path.join("storage", "faiss_index")

if not os.path.isfile(pdf_path):
    raise SystemExit(f"Input PDF not found: {pdf_path}")

print("📄 Loading PDF...")
loader = PyPDFLoader(pdf_path)
docs = loader.load()

print("✂️ Splitting into chunks...")
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.split_documents(docs)
if not chunks:
    # E.g. a scanned PDF with no text layer: there is nothing to embed, and
    # building an index from an empty list fails with an obscure error.
    raise SystemExit(f"No text could be extracted from {pdf_path}")

print("🧠 Creating embeddings with OpenAI...")
embeddings = OpenAIEmbeddings()

print("📦 Building FAISS index...")
vectorstore = FAISS.from_documents(chunks, embeddings)

# Create the output directory only once there is an index to store, so a
# failed run does not leave an empty index folder behind.
os.makedirs(out_dir, exist_ok=True)
vectorstore.save_local(out_dir)
print(f"✅ FAISS index created at {out_dir}")