import PyPDF2
import os
from typing import List, Optional
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
import pickle


class PDFProcessor:
    """Process PDF files and create a searchable vector database.

    Pipeline: extract raw text from a PDF, split it into overlapping
    chunks, embed the chunks with OpenAI embeddings, and index them in a
    FAISS vector store for similarity search.
    """

    def __init__(self, pdf_path: str = "Health Tech Hub Copenhagen.pdf"):
        """Initialize the processor.

        Args:
            pdf_path: Path to the PDF file to process.
        """
        self.pdf_path = pdf_path
        self.vector_store: Optional[FAISS] = None
        self.text_chunks: List[str] = []

    def extract_text_from_pdf(self) -> str:
        """Extract text content from the PDF file.

        Returns:
            All page texts concatenated, one newline between pages.

        Raises:
            FileNotFoundError: If ``self.pdf_path`` does not exist.
            Exception: Re-raised from PyPDF2 on unreadable/corrupt PDFs.
        """
        if not os.path.exists(self.pdf_path):
            raise FileNotFoundError(f"PDF file not found: {self.pdf_path}")
        text = ""
        try:
            with open(self.pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                for page in pdf_reader.pages:
                    # extract_text() can return None for image-only pages;
                    # guard so concatenation never raises TypeError.
                    text += (page.extract_text() or "") + "\n"
            print(f"✅ Successfully extracted text from {self.pdf_path}")
            return text
        except Exception as e:
            print(f"❌ Error extracting text from PDF: {e}")
            raise

    def split_text_into_chunks(self, text: str, chunk_size: int = 1000,
                               chunk_overlap: int = 200) -> List[str]:
        """Split text into smaller chunks for better processing.

        Args:
            text: Full document text to split.
            chunk_size: Target maximum characters per chunk.
            chunk_overlap: Characters shared between adjacent chunks, so
                context spanning a boundary is not lost.

        Returns:
            The list of text chunks (also stored on ``self.text_chunks``).
        """
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
        )
        chunks = text_splitter.split_text(text)
        self.text_chunks = chunks
        print(f"✅ Split text into {len(chunks)} chunks")
        return chunks

    def create_vector_store(self, chunks: List[str]) -> FAISS:
        """Create a FAISS vector store from text chunks.

        Requires OpenAI credentials in the environment (used by
        ``OpenAIEmbeddings``).

        Args:
            chunks: Text chunks to embed and index.

        Returns:
            The FAISS store (also stored on ``self.vector_store``).

        Raises:
            Exception: Re-raised on embedding/indexing failure.
        """
        try:
            embeddings = OpenAIEmbeddings()
            vector_store = FAISS.from_texts(chunks, embeddings)
            self.vector_store = vector_store
            print("✅ Vector store created successfully")
            return vector_store
        except Exception as e:
            print(f"❌ Error creating vector store: {e}")
            raise

    def search_similar_content(self, query: str, k: int = 3) -> List[str]:
        """Search for similar content in the PDF.

        Args:
            query: Natural-language query to embed and search with.
            k: Number of most-similar chunks to return.

        Returns:
            Up to ``k`` matching chunk texts; empty list on search error.

        Raises:
            ValueError: If the vector store has not been initialized.
        """
        # Explicit None check: FAISS objects should not be tested for
        # truthiness (an "empty" store must not look uninitialized).
        if self.vector_store is None:
            raise ValueError(
                "Vector store not initialized. Call process_pdf() first."
            )
        try:
            results = self.vector_store.similarity_search(query, k=k)
            return [doc.page_content for doc in results]
        except Exception as e:
            print(f"❌ Error searching content: {e}")
            return []

    def process_pdf(self) -> bool:
        """Complete PDF processing pipeline.

        Extract text, split into chunks, and build the vector store.

        Returns:
            True on success, False if any step failed.
        """
        try:
            print(f"🔄 Processing PDF: {self.pdf_path}")
            # Extract text
            text = self.extract_text_from_pdf()
            # Split into chunks
            chunks = self.split_text_into_chunks(text)
            # Create vector store
            self.create_vector_store(chunks)
            print("✅ PDF processing completed successfully")
            return True
        except Exception as e:
            print(f"❌ PDF processing failed: {e}")
            return False

    def save_vector_store(self, filepath: str = "vector_store.pkl"):
        """Save the vector store to a file (no-op if not initialized).

        NOTE(review): pickling a FAISS store is fragile across library
        versions; FAISS's own save_local/load_local is the recommended
        persistence mechanism — consider migrating.

        Args:
            filepath: Destination path for the pickled store.
        """
        if self.vector_store:
            try:
                with open(filepath, 'wb') as f:
                    pickle.dump(self.vector_store, f)
                print(f"✅ Vector store saved to {filepath}")
            except Exception as e:
                print(f"❌ Error saving vector store: {e}")

    def load_vector_store(self, filepath: str = "vector_store.pkl") -> bool:
        """Load a vector store from a file.

        SECURITY: pickle.load executes arbitrary code from the file —
        only load files this application itself produced; never load
        untrusted input.

        Args:
            filepath: Path to a previously saved pickle file.

        Returns:
            True if loaded successfully, False otherwise.
        """
        try:
            if os.path.exists(filepath):
                with open(filepath, 'rb') as f:
                    self.vector_store = pickle.load(f)
                print(f"✅ Vector store loaded from {filepath}")
                return True
            else:
                print(f"⚠️ Vector store file not found: {filepath}")
                return False
        except Exception as e:
            print(f"❌ Error loading vector store: {e}")
            return False