organicoder committed on
Commit
3f6db5c
Β·
verified Β·
1 Parent(s): e539367

Upload pdf_processor.py

Browse files
Files changed (1) hide show
  1. pdf_processor.py +120 -0
pdf_processor.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import PyPDF2
2
+ import os
3
+ from typing import List, Optional
4
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
5
+ from langchain_openai import OpenAIEmbeddings
6
+ from langchain.vectorstores import FAISS
7
+ import pickle
8
+
9
class PDFProcessor:
    """Process a PDF file into a searchable FAISS vector database.

    Pipeline: extract raw page text -> split into overlapping chunks ->
    embed the chunks with OpenAI -> index them in FAISS for similarity
    search. Progress and errors are reported via print statements.
    """

    def __init__(self, pdf_path: str = "Health Tech Hub Copenhagen.pdf"):
        # Path to the source PDF; default matches the bundled document.
        self.pdf_path = pdf_path
        # FAISS index; set by process_pdf() / create_vector_store() /
        # load_vector_store(). None until one of those succeeds.
        self.vector_store: Optional[FAISS] = None
        # Chunks from the most recent split, kept for inspection.
        self.text_chunks: List[str] = []

    def extract_text_from_pdf(self) -> str:
        """Extract the text of every page of the PDF.

        Returns:
            The concatenated page texts, each followed by a newline.

        Raises:
            FileNotFoundError: if ``self.pdf_path`` does not exist.
            Exception: re-raised from PyPDF2 on read/parse failures.
        """
        if not os.path.exists(self.pdf_path):
            raise FileNotFoundError(f"PDF file not found: {self.pdf_path}")

        try:
            with open(self.pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                # extract_text() can return None/empty for image-only or
                # malformed pages; ``or ""`` guards the concatenation so it
                # never raises TypeError. Each page is followed by "\n",
                # matching the original one-page-per-line layout.
                text = "".join(
                    (page.extract_text() or "") + "\n"
                    for page in pdf_reader.pages
                )

            print(f"✅ Successfully extracted text from {self.pdf_path}")
            return text

        except Exception as e:
            print(f"❌ Error extracting text from PDF: {e}")
            raise

    def split_text_into_chunks(self, text: str, chunk_size: int = 1000, chunk_overlap: int = 200) -> List[str]:
        """Split ``text`` into overlapping chunks for embedding.

        Args:
            text: Full document text to split.
            chunk_size: Maximum characters per chunk.
            chunk_overlap: Characters shared between adjacent chunks, to
                preserve context across chunk boundaries.

        Returns:
            The list of chunk strings (also stored on ``self.text_chunks``).
        """
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
        )

        chunks = text_splitter.split_text(text)
        self.text_chunks = chunks
        print(f"✅ Split text into {len(chunks)} chunks")
        return chunks

    def create_vector_store(self, chunks: List[str]) -> FAISS:
        """Embed ``chunks`` with OpenAI and build a FAISS index.

        Requires OpenAI credentials in the environment (read by
        OpenAIEmbeddings). Re-raises any embedding/indexing error after
        printing it.
        """
        try:
            embeddings = OpenAIEmbeddings()
            vector_store = FAISS.from_texts(chunks, embeddings)
            self.vector_store = vector_store
            print("✅ Vector store created successfully")
            return vector_store

        except Exception as e:
            print(f"❌ Error creating vector store: {e}")
            raise

    def search_similar_content(self, query: str, k: int = 3) -> List[str]:
        """Return the ``k`` chunks most similar to ``query``.

        Raises:
            ValueError: if no vector store has been built/loaded yet.

        Returns:
            Matching chunk texts; an empty list if the search itself fails
            (best-effort: the error is printed, not raised).
        """
        if not self.vector_store:
            raise ValueError("Vector store not initialized. Call process_pdf() first.")

        try:
            results = self.vector_store.similarity_search(query, k=k)
            return [doc.page_content for doc in results]

        except Exception as e:
            print(f"❌ Error searching content: {e}")
            return []

    def process_pdf(self) -> bool:
        """Run the full pipeline: extract -> chunk -> index.

        Returns:
            True on success, False on any failure (error is printed).
        """
        try:
            print(f"🔄 Processing PDF: {self.pdf_path}")

            text = self.extract_text_from_pdf()
            chunks = self.split_text_into_chunks(text)
            self.create_vector_store(chunks)

            print("✅ PDF processing completed successfully")
            return True

        except Exception as e:
            print(f"❌ PDF processing failed: {e}")
            return False

    def save_vector_store(self, filepath: str = "vector_store.pkl"):
        """Pickle the current vector store to ``filepath``.

        Warns (instead of silently doing nothing) when no vector store has
        been built yet; save errors are printed, not raised.
        """
        if not self.vector_store:
            # Was a silent no-op; warn so callers notice the missing store.
            print("⚠️ No vector store to save. Call process_pdf() first.")
            return
        try:
            with open(filepath, 'wb') as f:
                pickle.dump(self.vector_store, f)
            print(f"✅ Vector store saved to {filepath}")
        except Exception as e:
            print(f"❌ Error saving vector store: {e}")

    def load_vector_store(self, filepath: str = "vector_store.pkl") -> bool:
        """Unpickle a previously saved vector store from ``filepath``.

        SECURITY NOTE: ``pickle.load`` can execute arbitrary code embedded
        in the file — only load files this application wrote itself, never
        untrusted input.

        Returns:
            True if the store was loaded, False if the file is missing or
            loading failed (error/warning is printed).
        """
        try:
            if os.path.exists(filepath):
                with open(filepath, 'rb') as f:
                    self.vector_store = pickle.load(f)
                print(f"✅ Vector store loaded from {filepath}")
                return True
            print(f"⚠️ Vector store file not found: {filepath}")
            return False
        except Exception as e:
            print(f"❌ Error loading vector store: {e}")
            return False