# qandapdfbot / ingest.py
# sakthi07's picture
# Upload ingest.py
# fbd8cfb verified
# raw
# history blame
# 1.08 kB
import os
from dotenv import load_dotenv
# Correct imports
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS # use community version
# Ingestion script: load one PDF, split it into overlapping text chunks,
# embed the chunks with OpenAI and persist the resulting FAISS index to disk.

# Load environment variables (e.g. from a local .env file).
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# Fail fast with a readable message rather than erroring out later, after the
# PDF has already been loaded and split.
# NOTE(review): OpenAIEmbeddings() below is expected to pick the key up from
# the environment populated by load_dotenv() - confirm for the installed
# langchain version.
if not OPENAI_API_KEY:
    raise SystemExit(
        "OPENAI_API_KEY is not set; add it to your environment or .env file."
    )

# Paths
pdf_path = os.path.join("data", "diabetes.pdf")
out_dir = os.path.join("storage", "faiss_index")

# Check the input up front so a missing file produces a clear error.
if not os.path.isfile(pdf_path):
    raise SystemExit(f"PDF not found: {pdf_path}")

print("📄 Loading PDF...")
loader = PyPDFLoader(pdf_path)
docs = loader.load()

print("✂️ Splitting into chunks...")
# 200 characters of overlap keep text that straddles a chunk boundary
# present in both neighbouring chunks.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.split_documents(docs)
# A PDF without extractable text (e.g. scanned images) produces no chunks;
# there is nothing to embed or index in that case.
if not chunks:
    raise SystemExit(f"No text could be extracted from {pdf_path}")

print("🧠 Creating embeddings with OpenAI...")
embeddings = OpenAIEmbeddings()

print("📦 Building FAISS index...")
vectorstore = FAISS.from_documents(chunks, embeddings)
# Create the output directory only once there is an index to store, so a
# failed run does not leave an empty index directory behind.
os.makedirs(out_dir, exist_ok=True)
vectorstore.save_local(out_dir)
print(f"✅ FAISS index created at {out_dir}")