Spaces:

organicoder
/

chatbot-sverige

Sleeping

App Files Files Community

organicoder committed on Jul 13

Commit

3f6db5c

verified ·

1 Parent(s): e539367

Upload pdf_processor.py

Browse files

Files changed (1) hide show

pdf_processor.py +120 -0

pdf_processor.py ADDED Viewed

	@@ -0,0 +1,120 @@

+import PyPDF2
+import os
+from typing import List, Optional
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_openai import OpenAIEmbeddings
+from langchain.vectorstores import FAISS
+import pickle
+class PDFProcessor:
+    """Process PDF files and create searchable vector database"""
+    def __init__(self, pdf_path: str = "Health Tech Hub Copenhagen.pdf"):
+        self.pdf_path = pdf_path
+        self.vector_store = None
+        self.text_chunks = []
+    def extract_text_from_pdf(self) -> str:
+        """Extract text content from PDF file"""
+        if not os.path.exists(self.pdf_path):
+            raise FileNotFoundError(f"PDF file not found: {self.pdf_path}")
+        text = ""
+        try:
+            with open(self.pdf_path, 'rb') as file:
+                pdf_reader = PyPDF2.PdfReader(file)
+                for page_num in range(len(pdf_reader.pages)):
+                    page = pdf_reader.pages[page_num]
+                    text += page.extract_text() + "\n"
+            print(f"✅ Successfully extracted text from {self.pdf_path}")
+            return text
+        except Exception as e:
+            print(f"❌ Error extracting text from PDF: {e}")
+            raise
+    def split_text_into_chunks(self, text: str, chunk_size: int = 1000, chunk_overlap: int = 200) -> List[str]:
+        """Split text into smaller chunks for better processing"""
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=chunk_size,
+            chunk_overlap=chunk_overlap,
+            length_function=len,
+        )
+        chunks = text_splitter.split_text(text)
+        self.text_chunks = chunks
+        print(f"✅ Split text into {len(chunks)} chunks")
+        return chunks
+    def create_vector_store(self, chunks: List[str]) -> FAISS:
+        """Create a vector store from text chunks"""
+        try:
+            embeddings = OpenAIEmbeddings()
+            vector_store = FAISS.from_texts(chunks, embeddings)
+            self.vector_store = vector_store
+            print("✅ Vector store created successfully")
+            return vector_store
+        except Exception as e:
+            print(f"❌ Error creating vector store: {e}")
+            raise
+    def search_similar_content(self, query: str, k: int = 3) -> List[str]:
+        """Search for similar content in the PDF"""
+        if not self.vector_store:
+            raise ValueError("Vector store not initialized. Call process_pdf() first.")
+        try:
+            results = self.vector_store.similarity_search(query, k=k)
+            return [doc.page_content for doc in results]
+        except Exception as e:
+            print(f"❌ Error searching content: {e}")
+            return []
+    def process_pdf(self) -> bool:
+        """Complete PDF processing pipeline"""
+        try:
+            print(f"🔄 Processing PDF: {self.pdf_path}")
+            # Extract text
+            text = self.extract_text_from_pdf()
+            # Split into chunks
+            chunks = self.split_text_into_chunks(text)
+            # Create vector store
+            self.create_vector_store(chunks)
+            print("✅ PDF processing completed successfully")
+            return True
+        except Exception as e:
+            print(f"❌ PDF processing failed: {e}")
+            return False
+    def save_vector_store(self, filepath: str = "vector_store.pkl"):
+        """Save vector store to file"""
+        if self.vector_store:
+            try:
+                with open(filepath, 'wb') as f:
+                    pickle.dump(self.vector_store, f)
+                print(f"✅ Vector store saved to {filepath}")
+            except Exception as e:
+                print(f"❌ Error saving vector store: {e}")
+    def load_vector_store(self, filepath: str = "vector_store.pkl") -> bool:
+        """Load vector store from file"""
+        try:
+            if os.path.exists(filepath):
+                with open(filepath, 'rb') as f:
+                    self.vector_store = pickle.load(f)
+                print(f"✅ Vector store loaded from {filepath}")
+                return True
+            else:
+                print(f"⚠️ Vector store file not found: {filepath}")
+                return False
+        except Exception as e:
+            print(f"❌ Error loading vector store: {e}")
+            return False