fmegahed committed on
Commit
982f6ef
·
verified ·
1 Parent(s): c7a9a90

Adding the actual preprocess.py file

Browse files
Files changed (1) hide show
  1. preprocess.py +192 -295
preprocess.py CHANGED
@@ -1,295 +1,192 @@
1
- # SCRIPT 1: PREPROCESS.PY
2
- # Run this script one time to build and save your knowledge graph.
3
-
4
- import os
5
- import requests
6
- import json
7
- from bs4 import BeautifulSoup
8
- from urllib.parse import urljoin
9
- import io
10
-
11
- from llama_index.core import Document, Settings, StorageContext, KnowledgeGraphIndex, load_index_from_storage
12
- from llama_index.core.graph_stores import SimpleGraphStore
13
- from llama_index.llms.openai_multi_modal import OpenAIMultiModal
14
- from llama_index.embeddings.openai import OpenAIEmbedding
15
- from llama_index.readers.file import PyMuPDFReader, ImageReader
16
- from PIL import Image
17
-
18
- # 1. Configuration
19
- # ---
20
- if not os.getenv("OPENAI_API_KEY"):
21
- raise ValueError("OpenAI API key not found. Please set the OPENAI_API_KEY environment variable.")
22
-
23
- # Directories for storing data and the final index
24
- IMAGE_DIR = "image_data"
25
- STORAGE_DIR = "storage"
26
- os.makedirs(IMAGE_DIR, exist_ok=True)
27
- os.makedirs(STORAGE_DIR, exist_ok=True)
28
-
29
- # The JSON data you provided (ensure you have the full list here)
30
- source_data = [
31
- {
32
- "title": "29 CFR § 1910.211 - Definitions",
33
- "url": "https://www.ecfr.gov/current/title-29/part-1910/subpart-O/section-1910.211",
34
- "source": "OSHA",
35
- "year": 2024,
36
- "category": "Regulation",
37
- "summary": "Provides key legal definitions for terms used throughout Subpart O, such as 'point of operation,' 'guard,' and 'power press,' which are foundational for understanding and applying machine safeguarding rules.",
38
- "format": "HTML"
39
- },
40
- {
41
- "title": "29 CFR § 1910.212 - General requirements for all machines",
42
- "url": "https://www.ecfr.gov/current/title-29/part-1910/subpart-O/section-1910.212",
43
- "source": "OSHA",
44
- "year": 2024,
45
- "category": "Regulation",
46
- "summary": "This is the core machine guarding standard, mandating that one or more safeguarding methods be used to protect operators from point-of-operation hazards, ingoing nip points, and other machinery dangers.",
47
- "format": "HTML"
48
- },
49
- {
50
- "title": "29 CFR § 1910.213 - Woodworking machinery requirements",
51
- "url": "https://www.ecfr.gov/current/title-29/part-1910/subpart-O/section-1910.213",
52
- "source": "OSHA",
53
- "year": 2024,
54
- "category": "Regulation",
55
- "summary": "Details specific guarding requirements for various woodworking machines, including circular saws, band saws, and jointers, to prevent lacerations and amputations from blade contact.",
56
- "format": "HTML"
57
- },
58
- {
59
- "title": "29 CFR § 1910.215 - Abrasive wheel machinery",
60
- "url": "https://www.ecfr.gov/current/title-29/part-1910/subpart-O/section-1910.215",
61
- "source": "OSHA",
62
- "year": 2024,
63
- "category": "Regulation",
64
- "summary": "Specifies safety requirements for abrasive wheel grinders, including guards, flanges, and work rests, to protect workers from wheel breakage, projectiles, and contact with the wheel.",
65
- "format": "HTML"
66
- },
67
- {
68
- "title": "29 CFR § 1910.217 - Mechanical power presses",
69
- "url": "https://www.ecfr.gov/current/title-29/part-1910/subpart-O/section-1910.217",
70
- "source": "OSHA",
71
- "year": 2024,
72
- "category": "Regulation",
73
- "summary": "Outlines extensive requirements for mechanical power presses to prevent injuries to hands and fingers, covering guards, devices, clutch/brake mechanisms, and inspection procedures.",
74
- "format": "HTML"
75
- },
76
- {
77
- "title": "29 CFR § 1910.219 - Mechanical power-transmission apparatus",
78
- "url": "https://www.ecfr.gov/current/title-29/part-1910/subpart-O/section-1910.219",
79
- "source": "OSHA",
80
- "year": 2024,
81
- "category": "Regulation",
82
- "summary": "Mandates the guarding of mechanical power-transmission components like belts, pulleys, gears, and shafts to prevent caught-in/between injuries from entanglement.",
83
- "format": "HTML"
84
- },
85
- {
86
- "title": "Safeguarding Equipment and Protecting Workers from Amputations (OSHA 3170)",
87
- "url": "https://www.osha.gov/sites/default/files/publications/osha3170.pdf",
88
- "source": "OSHA",
89
- "year": 2007,
90
- "category": "Technical Guide",
91
- "summary": "This guide helps identify and manage amputation hazards from various machines by explaining hazard analysis, machine safeguarding methods, and the importance of hazardous energy control.",
92
- "format": "PDF"
93
- },
94
- {
95
- "title": "The Control of Hazardous Energy (Lockout/Tagout) (OSHA 3120)",
96
- "url": "https://www.osha.gov/sites/default/files/publications/osha3120.pdf",
97
- "source": "OSHA",
98
- "year": 2002,
99
- "category": "Technical Guide",
100
- "summary": "Provides a detailed explanation of the Lockout/Tagout standard (1910.147), offering guidance on energy control procedures, training, and periodic inspections to prevent unexpected machine startup.",
101
- "format": "PDF"
102
- },
103
- {
104
- "title": "29 CFR § 1910.147 - The control of hazardous energy (lockout/tagout)",
105
- "url": "https://www.ecfr.gov/current/title-29/section-1910.147",
106
- "source": "OSHA",
107
- "year": 2024,
108
- "category": "Regulation",
109
- "summary": "This regulation establishes the employer's responsibility to protect workers from hazardous energy sources during machine servicing and maintenance by requiring energy isolation and lockout/tagout procedures.",
110
- "format": "HTML"
111
- },
112
- {
113
- "title": "29 CFR § 1910.178 - Powered Industrial Trucks",
114
- "url": "https://www.ecfr.gov/current/title-29/section-1910.178",
115
- "source": "OSHA",
116
- "year": 2024,
117
- "category": "Regulation",
118
- "summary": "Covers safety requirements for forklifts and other powered industrial trucks, addressing design, maintenance, and operation to prevent struck-by, caught-in, and crushing incidents.",
119
- "format": "HTML"
120
- },
121
- {
122
- "title": "NIOSH Workplace Solutions: Preventing Worker Injuries from Industrial Machines",
123
- "url": "https://www.cdc.gov/niosh/docs/wp-solutions/2012-116/",
124
- "source": "NIOSH",
125
- "year": 2012,
126
- "category": "Technical Guide",
127
- "summary": "Describes how to prevent machine-related injuries using a combination of engineering controls, administrative controls like LOTO, and personal protective equipment, focusing on the hierarchy of controls.",
128
- "format": "HTML",
129
- "status": "link broken"
130
- },
131
- {
132
- "title": "Engineering Control Guidelines for Safety in Manufacturing",
133
- "url": "https://www.cdc.gov/niosh/docs/2001-123/",
134
- "source": "NIOSH",
135
- "year": 2001,
136
- "category": "Technical Guide",
137
- "summary": "Provides guidance on applying engineering controls—the most effective way to reduce workplace hazards—to manufacturing processes, directly supporting the prevention of machine-related incidents.",
138
- "format": "HTML"
139
- },
140
- {
141
- "title": "OSHA eTool: Machine Guarding",
142
- "url": "https://www.osha.gov/etools/machine-guarding",
143
- "source": "OSHA",
144
- "year": 2023,
145
- "category": "eTool",
146
- "summary": "This interactive web tool illustrates the hazards and safeguarding methods for a wide range of machinery, providing visual examples and explanations relevant to preventing struck-by and caught-in injuries.",
147
- "format": "HTML"
148
- },
149
- {
150
- "title": "OSHA eTool: Powered Industrial Trucks (Forklifts)",
151
- "url": "https://www.osha.gov/etools/powered-industrial-trucks",
152
- "source": "OSHA",
153
- "year": 2023,
154
- "category": "eTool",
155
- "summary": "An interactive resource detailing forklift hazards and controls, covering topics like operating procedures, workplace conditions, and stability, which are critical for preventing struck-by incidents.",
156
- "format": "HTML"
157
- },
158
- {
159
- "title": "Fact Sheet: Lockout/Tagout",
160
- "url": "https://www.osha.gov/sites/default/files/publications/factsheet_lockout-tagout.pdf",
161
- "source": "OSHA",
162
- "year": 2022,
163
- "category": "Fact Sheet",
164
- "summary": "A concise summary of the Lockout/Tagout standard, highlighting the purpose and key components of energy control programs to prevent injuries during machine servicing.",
165
- "format": "PDF"
166
- },
167
- {
168
- "title": "OSHA Technical Manual (OTM) Section IV: Chapter 5 - Industrial Robots and Robot System Safety",
169
- "url": "https://www.osha.gov/otm/section-4/chapter-5",
170
- "source": "OSHA",
171
- "year": 2017,
172
- "category": "Technical Guide",
173
- "summary": "Details hazards associated with industrial robots, such as struck-by and caught-between incidents, and outlines safeguarding requirements including guards, presence-sensing devices, and proper work procedures.",
174
- "format": "HTML"
175
- },
176
- {
177
- "title": "Safety and Health Topics: Robotics",
178
- "url": "https://www.osha.gov/robotics",
179
- "source": "OSHA",
180
- "year": 2024,
181
- "category": "Technical Guide",
182
- "summary": "This page provides a comprehensive overview of robotic system hazards and control methods, referencing key consensus standards (like ANSI/RIA R15.06) for preventing injuries.",
183
- "format": "HTML"
184
- },
185
- {
186
- "title": "Directive: National Emphasis Program on Amputations in Manufacturing Industries",
187
- "url": "https://www.osha.gov/enforcement/directives/cpl-03-00-022",
188
- "source": "OSHA",
189
- "year": 2019,
190
- "category": "Directive",
191
- "summary": "Establishes an enforcement program for inspecting workplaces with machinery that poses amputation hazards, focusing on compliance with standards for machine guarding and Lockout/Tagout.",
192
- "format": "HTML"
193
- },
194
- {
195
- "title": "NIOSH Topic: Human-Robot Collaboration",
196
- "url": "https://www.cdc.gov/niosh/topics/robot/hrc.html",
197
- "source": "NIOSH",
198
- "year": 2024,
199
- "category": "Technical Guide",
200
- "summary": "Addresses the unique safety challenges of collaborative robots (cobots), focusing on research to prevent struck-by injuries through better sensor technology, risk assessments, and safety standards.",
201
- "format": "HTML"
202
- },
203
- {
204
- "title": "ANSI B11.0-2020: Safety of Machinery",
205
- "url": "https://www.assp.org/standards/standards-descriptions/ansi-b11.0-2020-safety-of-machinery",
206
- "source": "ANSI",
207
- "year": 2020,
208
- "category": "Technical Guide",
209
- "summary": "This foundational US standard provides the framework for assessing risk and applying safeguarding measures to machinery to achieve an acceptable level of risk, heavily influencing OSHA's approach.",
210
- "format": "HTML"
211
- },
212
- {
213
- "title": "ANSI B11.19-2019: Performance Requirements for Safeguarding",
214
- "url": "https://webstore.ansi.org/Standards/B11/ANSIB112019",
215
- "source": "ANSI",
216
- "year": 2019,
217
- "category": "Technical Guide",
218
- "summary": "Specifies performance requirements for the design, construction, installation, and operation of machine safeguarding methods, including guards, interlocking devices, and safety circuits.",
219
- "format": "HTML"
220
- }
221
- ]
222
-
223
-
224
- # 2. Helper Functions (Scraping and Processing)
225
- # ---
226
- def scrape_and_process_url(item):
227
- """Scrapes a URL, extracts text, and downloads images."""
228
- url = item.get("url")
229
- if not url or item.get("status") == "link broken":
230
- print(f"⚠️ Skipping broken or missing URL for: {item['title']}")
231
- return [Document(text=item["summary"], metadata=item)]
232
-
233
- print(f"⚙️ Processing URL: {url}")
234
- try:
235
- headers = {'User-Agent': 'Mozilla/5.0'}
236
- response = requests.get(url, headers=headers, timeout=20)
237
- response.raise_for_status()
238
- content_type = response.headers.get("content-type", "").lower()
239
- documents = []
240
- metadata = {"source_url": url, "title": item["title"], "category": item["category"]}
241
-
242
- if "pdf" in content_type:
243
- pdf_docs = PyMuPDFReader().load(file_path=io.BytesIO(response.content), metadata=metadata)
244
- documents.extend(pdf_docs)
245
- else:
246
- soup = BeautifulSoup(response.content, 'html.parser')
247
- text_content = soup.get_text(separator='\n', strip=True)
248
- documents.append(Document(text=text_content, metadata=metadata))
249
- for img_tag in soup.find_all('img'):
250
- img_src = img_tag.get('src')
251
- if not img_src: continue
252
- img_url = urljoin(url, img_src)
253
- try:
254
- img_response = requests.get(img_url, headers=headers, timeout=10)
255
- img_response.raise_for_status()
256
- img_filename = os.path.join(IMAGE_DIR, f"{item['source']}_{len(os.listdir(IMAGE_DIR))}.png")
257
- Image.open(io.BytesIO(img_response.content)).convert("RGB").save(img_filename)
258
- except Exception as e: print(f" ❌ Could not download image {img_url}: {e}")
259
- return documents
260
- except requests.RequestException as e:
261
- print(f"❌ Failed to scrape {url}: {e}")
262
- return [Document(text=item["summary"], metadata=item)]
263
-
264
- # 3. Execution: Scrape, Build, and Save
265
- # ---
266
- print("--- Starting Data Pre-processing ---")
267
- # Configure LlamaIndex Settings
268
- Settings.llm = OpenAIMultiModal(model="gpt-4o", max_new_tokens=1000)
269
- Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")
270
-
271
- # Scrape all text
272
- all_documents = []
273
- for item in source_data:
274
- all_documents.extend(scrape_and_process_url(item))
275
-
276
- # Process all images and generate descriptions
277
- image_documents = ImageReader(text_parser=Settings.llm).load_data(IMAGE_DIR)
278
- all_documents.extend(image_documents)
279
- print(f"\n✅ Total documents loaded: {len(all_documents)}")
280
-
281
- # Build the Knowledge Graph Index
282
- print("\n--- Building Knowledge Graph ---")
283
- graph_store = SimpleGraphStore()
284
- storage_context = StorageContext.from_defaults(graph_store=graph_store)
285
- index = KnowledgeGraphIndex.from_documents(
286
- documents=all_documents,
287
- storage_context=storage_context,
288
- max_triplets_per_chunk=5,
289
- include_embeddings=True,
290
- show_progress=True,
291
- )
292
-
293
- # Persist the index to disk
294
- index.storage_context.persist(persist_dir=STORAGE_DIR)
295
- print(f"\n✅✅✅ Graph built and saved to disk at '{STORAGE_DIR}'! You can now run query.py. ✅✅✅")
 
1
+ import os
2
+ import re
3
+ import glob
4
+ from dotenv import load_dotenv
5
+ import requests
6
+ from bs4 import BeautifulSoup
7
+ import pandas as pd
8
+ import pymupdf4llm
9
+ import networkx as nx
10
+ from openai import OpenAI
11
+
12
+ # Load environment and initialize OpenAI client
13
+ load_dotenv(override=True)
14
+ client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
15
+
16
# Helper functions to split PDF markdown on third-level ("### ") headers
def _word_chunks(words, size):
    """Return consecutive groups of at most `size` words, each joined by spaces."""
    return [' '.join(words[i:i + size]) for i in range(0, len(words), size)]


def split_by_header(md_text, max_words=7500, chunk_words=5000):
    """
    Split markdown text into fragments.

    1) Split in front of every third-level header ("### "). Each fragment
       keeps its own header line; text that precedes the first header is
       returned unchanged as its own fragment (it is NOT given a header).
    2) A fragment with more than `max_words` words is split on explicit
       page markers (lines beginning "Page 1", "Page 2", ...). The marker
       text itself is dropped.
    3) If there are no page markers, or a page piece is still longer than
       `max_words`, the words are regrouped into chunks of `chunk_words`
       words joined by single spaces (original line breaks are lost).

    Sizes are counted in whitespace-separated words, which is not the same
    as model tokens; lower `max_words` if fragments must fit a token limit.

    Args:
        md_text: Markdown text to split.
        max_words: Largest fragment size (in words) kept without splitting.
        chunk_words: Chunk size (in words) used by the fallback split.

    Returns:
        A list of non-blank markdown fragments, in document order.

    Raises:
        ValueError: If `chunk_words` is smaller than 1.
    """
    if chunk_words < 1:
        raise ValueError("chunk_words must be a positive integer")

    # Zero-width split: the lookahead leaves "### " attached to the section
    # it introduces, so nothing has to be re-added afterwards.
    sections = [p for p in re.split(r'(?m)^(?=### )', md_text) if p.strip()]

    final_sections = []
    for sec in sections:
        words = sec.split()
        if len(words) <= max_words:
            final_sections.append(sec)
            continue

        # 2) Try explicit "Page X" markers
        pages = re.split(r'(?m)^Page \d+', sec)
        if len(pages) > 1:
            for pg in pages:
                if not pg.strip():
                    continue
                pg_words = pg.split()
                if len(pg_words) > max_words:
                    # A single page can still be too large; chunk it too.
                    final_sections.extend(_word_chunks(pg_words, chunk_words))
                else:
                    final_sections.append(pg)
            continue

        # 3) Fallback: fixed-size word chunks
        final_sections.extend(_word_chunks(words, chunk_words))

    return final_sections
48
+
49
+
50
+ # Initialize graph database
51
+ G = nx.Graph()
52
+
53
+
54
# Process local PDFs in gov_pdfs/
# Sorted so nodes are added in the same order on every run; glob.glob()
# itself returns paths in arbitrary order.
for pdf_path in sorted(glob.glob("gov_pdfs/*.pdf")):
    filename = os.path.basename(pdf_path)
    title = os.path.splitext(filename)[0]
    # Convert PDF to Markdown
    md_text = pymupdf4llm.to_markdown(pdf_path)
    # Split into sections
    sections = split_by_header(md_text)
    if not sections:
        # No section text came back for this PDF, so it would otherwise
        # vanish from the graph without any trace.
        print(f"Warning: no text extracted from {filename}; no nodes added")
        continue
    # Embed each section and store it as one graph node
    for idx, sec in enumerate(sections):
        resp = client.embeddings.create(model="text-embedding-3-large", input=sec)
        vector = resp.data[0].embedding
        node_id = f"PDF::{title}::section{idx}"
        G.add_node(node_id, text=sec, embedding=vector, source=title)
68
+
69
+ # HTML Document List
70
+ html_data = [
71
+ {
72
+ "title": "29 CFR 1910.211 – Definitions (Machinery and Machine Guarding)",
73
+ "url": "https://www.ecfr.gov/current/title-29/section-1910.211",
74
+ "source": "OSHA",
75
+ "year": 2025,
76
+ "category": "Regulation",
77
+ "summary": "Provides definitions for terms used in OSHA’s machine guarding standards (Subpart O), laying the groundwork for understanding and applying the specific safeguarding requirements for machinery to prevent operator injury.",
78
+ "format": "HTML"
79
+ },
80
+ {
81
+ "title": "29 CFR 1910.212 – General Requirements for All Machines",
82
+ "url": "https://www.ecfr.gov/current/title-29/section-1910.212",
83
+ "source": "OSHA",
84
+ "year": 2025,
85
+ "category": "Regulation",
86
+ "summary": "Establishes broad, fundamental machine safeguarding requirements (e.g. guards at points of operation, secure attachment of guards, protection from flying debris) to protect workers from hazards like nip points, rotating parts, flying chips, and sparks.",
87
+ "format": "HTML"
88
+ },
89
+ {
90
+ "title": "29 CFR 1910.213 – Woodworking Machinery Requirements",
91
+ "url": "https://www.ecfr.gov/current/title-29/section-1910.213",
92
+ "source": "OSHA",
93
+ "year": 2025,
94
+ "category": "Regulation",
95
+ "summary": "Sets specific safety requirements for woodworking equipment (such as saws, jointers, planers, and sanders), including guarding of blades, hoods, push sticks, and other protective measures to prevent cuts, amputations, and kickback injuries in woodworking operations.",
96
+ "format": "HTML"
97
+ },
98
+ {
99
+ "title": "29 CFR 1910.215 – Abrasive Wheel Machinery",
100
+ "url": "https://www.ecfr.gov/current/title-29/section-1910.215",
101
+ "source": "OSHA",
102
+ "year": 2025,
103
+ "category": "Regulation",
104
+ "summary": "Covers guarding and safety precautions for machines with abrasive wheels (grinders and cut-off machines), requiring wheel enclosures, work rests, tongue guards, and inspections to prevent wheel shattering, sparks, and operator contact with moving abrasive parts.",
105
+ "format": "HTML"
106
+ },
107
+ {
108
+ "title": "29 CFR 1910.216 – Mills and Calenders in the Rubber and Plastics Industries",
109
+ "url": "https://www.ecfr.gov/current/title-29/section-1910.216",
110
+ "source": "OSHA",
111
+ "year": 2025,
112
+ "category": "Regulation",
113
+ "summary": "Specifies safeguarding for two-roll mills, calenders, and similar processing machines in rubber/plastics manufacturing – including required safety trip controls, emergency stopping devices, and barrier guards – to protect workers from being caught in rollers or nip points.",
114
+ "format": "HTML"
115
+ },
116
+ {
117
+ "title": "29 CFR 1910.217 – Mechanical Power Presses",
118
+ "url": "https://www.ecfr.gov/current/title-29/section-1910.217",
119
+ "source": "OSHA",
120
+ "year": 2025,
121
+ "category": "Regulation",
122
+ "summary": "Detailed standard for mechanical power presses (e.g. stamping presses) mandating guarding of points of operation, use of devices like two-hand controls or presence-sensing systems, inspection and maintenance requirements, and training – aimed at preventing severe crushing, amputation, or die-punch injuries.",
123
+ "format": "HTML"
124
+ },
125
+ {
126
+ "title": "29 CFR 1910.218 – Forging Machines",
127
+ "url": "https://www.ecfr.gov/current/title-29/section-1910.218",
128
+ "source": "OSHA",
129
+ "year": 2025,
130
+ "category": "Regulation",
131
+ "summary": "Covers safety requirements for forging machinery (such as hammers, presses, upsetters, and boltheaders), including provisions for guarding dies and rams, handling hot metal safely, and use of tongs or mechanical loaders – all intended to prevent struck-by, caught-in, and burn injuries in forge operations.",
132
+ "format": "HTML"
133
+ },
134
+ {
135
+ "title": "29 CFR 1910.219 – Mechanical Power-Transmission Apparatus",
136
+ "url": "https://www.ecfr.gov/current/title-29/section-1910.219",
137
+ "source": "OSHA",
138
+ "year": 2025,
139
+ "category": "Regulation",
140
+ "summary": "Requires guards for all exposed belts, pulleys, chains, gears, flywheels, couplings, and other power-transmission parts on machinery. This standard ensures that rotating or moving drivetrain components are enclosed to prevent employees from getting caught in or struck by these parts.",
141
+ "format": "HTML"
142
+ },
143
+ {
144
+ "title": "29 CFR 1910.147 – The Control of Hazardous Energy (Lockout/Tagout)",
145
+ "url": "https://www.ecfr.gov/current/title-29/section-1910.147",
146
+ "source": "OSHA",
147
+ "year": 2025,
148
+ "category": "Regulation",
149
+ "summary": "OSHA’s Lockout/Tagout standard, which mandates that dangerous machinery must be de-energized and locked out (or tagged out) during maintenance or servicing. It details the required energy control procedures, employee training, and periodic inspections to ensure that workers are protected from the release of stored energy or accidental machine start-up (a major cause of caught-in/between and amputation incidents).",
150
+ "format": "HTML"
151
+ },
152
+ {
153
+ "title": "29 CFR 1910.178 – Powered Industrial Trucks (Forklifts)",
154
+ "url": "https://www.ecfr.gov/current/title-29/section-1910.178",
155
+ "source": "OSHA",
156
+ "year": 2025,
157
+ "category": "Regulation",
158
+ "summary": "The OSHA standard governing the design, maintenance, and safe operation of forklifts and other powered industrial trucks. It covers operator training and certification requirements, inspection and maintenance of equipment, safe fueling/charging, and operating rules (like speed limits, handling loads, and avoiding hazards) – all aimed at preventing tip-overs, collisions, and struck-by or crushed-by accidents involving these vehicles.",
159
+ "format": "HTML"
160
+ },
161
+ {
162
+ "title": "NIOSH Robotics in the Workplace – Safety Overview (Human-Robot Collaboration)",
163
+ "url": "https://www.cdc.gov/niosh/robotics/about/",
164
+ "source": "NIOSH",
165
+ "year": 2024,
166
+ "category": "Technical Guide",
167
+ "summary": "A NIOSH overview of emerging safety challenges as robots increasingly collaborate with human workers. Updated in 2024, this page discusses how robots can improve safety by taking over dangerous tasks but also introduces new struck-by and caught-between hazards. It emphasizes the need for updated safety standards, risk assessments, and research on human-robot interaction, and it outlines NIOSH’s efforts (through its Center for Occupational Robotics Research) to develop best practices and guidance for safe integration of robotics in industry.",
168
+ "format": "HTML"
169
+ }
170
+ ]
171
+
172
+ # Process HTML sources
173
+ def process_html(item):
174
+ resp = requests.get(item['url'])
175
+ resp.raise_for_status()
176
+ soup = BeautifulSoup(resp.text, 'html.parser')
177
+ texts = [p.get_text() for p in soup.find_all('p')]
178
+ tables = [pd.read_html(str(t))[0].to_markdown() for t in soup.find_all('table')]
179
+ # Join paragraphs and tables with blank lines
180
+ full = "\n\n".join(texts + tables)
181
+ resp_emb = client.embeddings.create(model="text-embedding-3-large", input=full)
182
+ vec = resp_emb.data[0].embedding
183
+ node_id = f"HTML::{item['title']}"
184
+ G.add_node(node_id, text=full, embedding=vec, source=item['title'])
185
+
186
# Run HTML processing
# A page that cannot be fetched is reported and skipped instead of aborting
# the script: the graph is only written at the very end, so an uncaught
# network error here would throw away every embedding computed so far.
failed_titles = []
for item in html_data:
    try:
        process_html(item)
    except requests.RequestException as exc:
        failed_titles.append(item['title'])
        print(f"Skipping '{item['title']}': {exc}")

# Save graph
nx.write_gml(G, "graph.gml")
print("Graph RAG database created: graph.gml")
if failed_titles:
    print(f"{len(failed_titles)} HTML source(s) could not be fetched: {failed_titles}")