Spaces:
Paused
Paused
Adding the actual preprocess.py file
Browse files- preprocess.py +192 -295
preprocess.py
CHANGED
|
@@ -1,295 +1,192 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
import
|
| 5 |
-
import requests
|
| 6 |
-
import
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
import
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
"
|
| 73 |
-
"
|
| 74 |
-
"
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
"
|
| 78 |
-
"
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
"
|
| 82 |
-
"
|
| 83 |
-
"
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
"
|
| 87 |
-
"
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
"
|
| 91 |
-
"
|
| 92 |
-
"
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
"
|
| 96 |
-
"
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
"
|
| 100 |
-
"
|
| 101 |
-
"
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
"
|
| 105 |
-
"
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
"
|
| 109 |
-
"
|
| 110 |
-
"
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
"
|
| 114 |
-
"
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
"
|
| 118 |
-
"
|
| 119 |
-
"
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
"
|
| 123 |
-
"
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
"
|
| 127 |
-
"
|
| 128 |
-
"
|
| 129 |
-
"
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
"
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
"
|
| 136 |
-
"
|
| 137 |
-
"
|
| 138 |
-
"
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
"
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
"
|
| 145 |
-
"
|
| 146 |
-
"
|
| 147 |
-
"
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
"
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
"
|
| 154 |
-
"
|
| 155 |
-
"
|
| 156 |
-
"
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
"
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
"
|
| 163 |
-
"
|
| 164 |
-
"
|
| 165 |
-
"
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
"
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
"
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
},
|
| 194 |
-
{
|
| 195 |
-
"title": "NIOSH Topic: Human-Robot Collaboration",
|
| 196 |
-
"url": "https://www.cdc.gov/niosh/topics/robot/hrc.html",
|
| 197 |
-
"source": "NIOSH",
|
| 198 |
-
"year": 2024,
|
| 199 |
-
"category": "Technical Guide",
|
| 200 |
-
"summary": "Addresses the unique safety challenges of collaborative robots (cobots), focusing on research to prevent struck-by injuries through better sensor technology, risk assessments, and safety standards.",
|
| 201 |
-
"format": "HTML"
|
| 202 |
-
},
|
| 203 |
-
{
|
| 204 |
-
"title": "ANSI B11.0-2020: Safety of Machinery",
|
| 205 |
-
"url": "https://www.assp.org/standards/standards-descriptions/ansi-b11.0-2020-safety-of-machinery",
|
| 206 |
-
"source": "ANSI",
|
| 207 |
-
"year": 2020,
|
| 208 |
-
"category": "Technical Guide",
|
| 209 |
-
"summary": "This foundational US standard provides the framework for assessing risk and applying safeguarding measures to machinery to achieve an acceptable level of risk, heavily influencing OSHA's approach.",
|
| 210 |
-
"format": "HTML"
|
| 211 |
-
},
|
| 212 |
-
{
|
| 213 |
-
"title": "ANSI B11.19-2019: Performance Requirements for Safeguarding",
|
| 214 |
-
"url": "https://webstore.ansi.org/Standards/B11/ANSIB112019",
|
| 215 |
-
"source": "ANSI",
|
| 216 |
-
"year": 2019,
|
| 217 |
-
"category": "Technical Guide",
|
| 218 |
-
"summary": "Specifies performance requirements for the design, construction, installation, and operation of machine safeguarding methods, including guards, interlocking devices, and safety circuits.",
|
| 219 |
-
"format": "HTML"
|
| 220 |
-
}
|
| 221 |
-
]
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
# 2. Helper Functions (Scraping and Processing)
|
| 225 |
-
# ---
|
| 226 |
-
def scrape_and_process_url(item):
|
| 227 |
-
"""Scrapes a URL, extracts text, and downloads images."""
|
| 228 |
-
url = item.get("url")
|
| 229 |
-
if not url or item.get("status") == "link broken":
|
| 230 |
-
print(f"⚠️ Skipping broken or missing URL for: {item['title']}")
|
| 231 |
-
return [Document(text=item["summary"], metadata=item)]
|
| 232 |
-
|
| 233 |
-
print(f"⚙️ Processing URL: {url}")
|
| 234 |
-
try:
|
| 235 |
-
headers = {'User-Agent': 'Mozilla/5.0'}
|
| 236 |
-
response = requests.get(url, headers=headers, timeout=20)
|
| 237 |
-
response.raise_for_status()
|
| 238 |
-
content_type = response.headers.get("content-type", "").lower()
|
| 239 |
-
documents = []
|
| 240 |
-
metadata = {"source_url": url, "title": item["title"], "category": item["category"]}
|
| 241 |
-
|
| 242 |
-
if "pdf" in content_type:
|
| 243 |
-
pdf_docs = PyMuPDFReader().load(file_path=io.BytesIO(response.content), metadata=metadata)
|
| 244 |
-
documents.extend(pdf_docs)
|
| 245 |
-
else:
|
| 246 |
-
soup = BeautifulSoup(response.content, 'html.parser')
|
| 247 |
-
text_content = soup.get_text(separator='\n', strip=True)
|
| 248 |
-
documents.append(Document(text=text_content, metadata=metadata))
|
| 249 |
-
for img_tag in soup.find_all('img'):
|
| 250 |
-
img_src = img_tag.get('src')
|
| 251 |
-
if not img_src: continue
|
| 252 |
-
img_url = urljoin(url, img_src)
|
| 253 |
-
try:
|
| 254 |
-
img_response = requests.get(img_url, headers=headers, timeout=10)
|
| 255 |
-
img_response.raise_for_status()
|
| 256 |
-
img_filename = os.path.join(IMAGE_DIR, f"{item['source']}_{len(os.listdir(IMAGE_DIR))}.png")
|
| 257 |
-
Image.open(io.BytesIO(img_response.content)).convert("RGB").save(img_filename)
|
| 258 |
-
except Exception as e: print(f" ❌ Could not download image {img_url}: {e}")
|
| 259 |
-
return documents
|
| 260 |
-
except requests.RequestException as e:
|
| 261 |
-
print(f"❌ Failed to scrape {url}: {e}")
|
| 262 |
-
return [Document(text=item["summary"], metadata=item)]
|
| 263 |
-
|
| 264 |
-
# 3. Execution: Scrape, Build, and Save
|
| 265 |
-
# ---
|
| 266 |
-
print("--- Starting Data Pre-processing ---")
|
| 267 |
-
# Configure LlamaIndex Settings
|
| 268 |
-
Settings.llm = OpenAIMultiModal(model="gpt-4o", max_new_tokens=1000)
|
| 269 |
-
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")
|
| 270 |
-
|
| 271 |
-
# Scrape all text
|
| 272 |
-
all_documents = []
|
| 273 |
-
for item in source_data:
|
| 274 |
-
all_documents.extend(scrape_and_process_url(item))
|
| 275 |
-
|
| 276 |
-
# Process all images and generate descriptions
|
| 277 |
-
image_documents = ImageReader(text_parser=Settings.llm).load_data(IMAGE_DIR)
|
| 278 |
-
all_documents.extend(image_documents)
|
| 279 |
-
print(f"\n✅ Total documents loaded: {len(all_documents)}")
|
| 280 |
-
|
| 281 |
-
# Build the Knowledge Graph Index
|
| 282 |
-
print("\n--- Building Knowledge Graph ---")
|
| 283 |
-
graph_store = SimpleGraphStore()
|
| 284 |
-
storage_context = StorageContext.from_defaults(graph_store=graph_store)
|
| 285 |
-
index = KnowledgeGraphIndex.from_documents(
|
| 286 |
-
documents=all_documents,
|
| 287 |
-
storage_context=storage_context,
|
| 288 |
-
max_triplets_per_chunk=5,
|
| 289 |
-
include_embeddings=True,
|
| 290 |
-
show_progress=True,
|
| 291 |
-
)
|
| 292 |
-
|
| 293 |
-
# Persist the index to disk
|
| 294 |
-
index.storage_context.persist(persist_dir=STORAGE_DIR)
|
| 295 |
-
print(f"\n✅✅✅ Graph built and saved to disk at '{STORAGE_DIR}'! You can now run query.py. ✅✅✅")
|
|
|
|
import glob
import io
import os
import re

import networkx as nx
import pandas as pd
import pymupdf4llm
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from openai import OpenAI
|
| 11 |
+
|
| 12 |
+
# Load environment and initialize OpenAI client.
# override=True lets values in .env take precedence over any variables
# already present in the process environment.
load_dotenv(override=True)
# Module-level client shared by the PDF loop and process_html below.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
|
| 15 |
+
|
| 16 |
+
# Helper function to split PDF markdown into embeddable sections
def split_by_header(md_text):
    """
    Split markdown text into sections small enough to embed.

    1) Split on third-level headers ("### ").
    2) For each part >7,500 tokens, try splitting on explicit page
       markers ("Page 1", "Page 2", ...).
    3) If no page markers are found, break into ~5,000-token chunks.

    "Tokens" here are whitespace-delimited words — a rough proxy for
    model tokens, not an exact tokenizer count.

    Returns a list of markdown fragments.
    """
    # 1) Initial split on '### ' headers. re.split consumes the delimiter,
    # so restore it on every fragment that followed a header. The first
    # element is the preamble (text before any header, if any) and must
    # NOT be given a fake '### ' prefix — the previous version prefixed
    # every fragment, turning the preamble into a bogus header.
    parts = re.split(r'(?m)^### ', md_text)
    sections = [p if i == 0 else '### ' + p
                for i, p in enumerate(parts) if p.strip()]

    final_sections = []
    for sec in sections:
        tokens = sec.split()
        if len(tokens) > 7500:
            # 2) Try explicit "Page X" markers. Note the markers
            # themselves are consumed by the split.
            pages = re.split(r'(?m)^Page \d+', sec)
            if len(pages) > 1:
                for pg in pages:
                    if pg.strip():
                        final_sections.append(pg)
                continue

            # 3) Fallback: fixed-size 5,000-token chunks. Re-joining on
            # single spaces normalizes the original whitespace/newlines.
            for i in range(0, len(tokens), 5000):
                final_sections.append(' '.join(tokens[i : i + 5000]))
        else:
            final_sections.append(sec)

    return final_sections
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
# Initialize graph database: an in-memory undirected networkx graph.
# Each document section becomes one node carrying its raw text and
# embedding vector as node attributes; it is persisted to GML at the end.
G = nx.Graph()
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
# Process local PDFs in gov_pdfs/: each PDF is converted to markdown,
# split into sections, and every section is embedded and stored as one
# graph node keyed "PDF::<title>::section<idx>".
for path in glob.glob("gov_pdfs/*.pdf"):
    doc_title = os.path.splitext(os.path.basename(path))[0]
    # Convert to markdown, then cut into embeddable pieces.
    for section_idx, section_text in enumerate(
        split_by_header(pymupdf4llm.to_markdown(path))
    ):
        embedding_resp = client.embeddings.create(
            model="text-embedding-3-large",
            input=section_text,
        )
        G.add_node(
            f"PDF::{doc_title}::section{section_idx}",
            text=section_text,
            embedding=embedding_resp.data[0].embedding,
            source=doc_title,
        )
|
| 68 |
+
|
| 69 |
+
# HTML Document List.
# Each entry describes one web source to ingest. process_html() reads only
# 'url' (fetched and parsed) and 'title' (becomes the graph node id); the
# remaining fields are human-readable provenance metadata.
html_data = [
    {
        "title": "29 CFR 1910.211 – Definitions (Machinery and Machine Guarding)",
        "url": "https://www.ecfr.gov/current/title-29/section-1910.211",
        "source": "OSHA",
        "year": 2025,
        "category": "Regulation",
        "summary": "Provides definitions for terms used in OSHA’s machine guarding standards (Subpart O), laying the groundwork for understanding and applying the specific safeguarding requirements for machinery to prevent operator injury.",
        "format": "HTML"
    },
    {
        "title": "29 CFR 1910.212 – General Requirements for All Machines",
        "url": "https://www.ecfr.gov/current/title-29/section-1910.212",
        "source": "OSHA",
        "year": 2025,
        "category": "Regulation",
        "summary": "Establishes broad, fundamental machine safeguarding requirements (e.g. guards at points of operation, secure attachment of guards, protection from flying debris) to protect workers from hazards like nip points, rotating parts, flying chips, and sparks.",
        "format": "HTML"
    },
    {
        "title": "29 CFR 1910.213 – Woodworking Machinery Requirements",
        "url": "https://www.ecfr.gov/current/title-29/section-1910.213",
        "source": "OSHA",
        "year": 2025,
        "category": "Regulation",
        "summary": "Sets specific safety requirements for woodworking equipment (such as saws, jointers, planers, and sanders), including guarding of blades, hoods, push sticks, and other protective measures to prevent cuts, amputations, and kickback injuries in woodworking operations.",
        "format": "HTML"
    },
    {
        "title": "29 CFR 1910.215 – Abrasive Wheel Machinery",
        "url": "https://www.ecfr.gov/current/title-29/section-1910.215",
        "source": "OSHA",
        "year": 2025,
        "category": "Regulation",
        "summary": "Covers guarding and safety precautions for machines with abrasive wheels (grinders and cut-off machines), requiring wheel enclosures, work rests, tongue guards, and inspections to prevent wheel shattering, sparks, and operator contact with moving abrasive parts.",
        "format": "HTML"
    },
    {
        "title": "29 CFR 1910.216 – Mills and Calenders in the Rubber and Plastics Industries",
        "url": "https://www.ecfr.gov/current/title-29/section-1910.216",
        "source": "OSHA",
        "year": 2025,
        "category": "Regulation",
        "summary": "Specifies safeguarding for two-roll mills, calenders, and similar processing machines in rubber/plastics manufacturing – including required safety trip controls, emergency stopping devices, and barrier guards – to protect workers from being caught in rollers or nip points.",
        "format": "HTML"
    },
    {
        "title": "29 CFR 1910.217 – Mechanical Power Presses",
        "url": "https://www.ecfr.gov/current/title-29/section-1910.217",
        "source": "OSHA",
        "year": 2025,
        "category": "Regulation",
        "summary": "Detailed standard for mechanical power presses (e.g. stamping presses) mandating guarding of points of operation, use of devices like two-hand controls or presence-sensing systems, inspection and maintenance requirements, and training – aimed at preventing severe crushing, amputation, or die-punch injuries.",
        "format": "HTML"
    },
    {
        "title": "29 CFR 1910.218 – Forging Machines",
        "url": "https://www.ecfr.gov/current/title-29/section-1910.218",
        "source": "OSHA",
        "year": 2025,
        "category": "Regulation",
        "summary": "Covers safety requirements for forging machinery (such as hammers, presses, upsetters, and boltheaders), including provisions for guarding dies and rams, handling hot metal safely, and use of tongs or mechanical loaders – all intended to prevent struck-by, caught-in, and burn injuries in forge operations.",
        "format": "HTML"
    },
    {
        "title": "29 CFR 1910.219 – Mechanical Power-Transmission Apparatus",
        "url": "https://www.ecfr.gov/current/title-29/section-1910.219",
        "source": "OSHA",
        "year": 2025,
        "category": "Regulation",
        "summary": "Requires guards for all exposed belts, pulleys, chains, gears, flywheels, couplings, and other power-transmission parts on machinery. This standard ensures that rotating or moving drivetrain components are enclosed to prevent employees from getting caught in or struck by these parts.",
        "format": "HTML"
    },
    {
        "title": "29 CFR 1910.147 – The Control of Hazardous Energy (Lockout/Tagout)",
        "url": "https://www.ecfr.gov/current/title-29/section-1910.147",
        "source": "OSHA",
        "year": 2025,
        "category": "Regulation",
        "summary": "OSHA’s Lockout/Tagout standard, which mandates that dangerous machinery must be de-energized and locked out (or tagged out) during maintenance or servicing. It details the required energy control procedures, employee training, and periodic inspections to ensure that workers are protected from the release of stored energy or accidental machine start-up (a major cause of caught-in/between and amputation incidents).",
        "format": "HTML"
    },
    {
        "title": "29 CFR 1910.178 – Powered Industrial Trucks (Forklifts)",
        "url": "https://www.ecfr.gov/current/title-29/section-1910.178",
        "source": "OSHA",
        "year": 2025,
        "category": "Regulation",
        "summary": "The OSHA standard governing the design, maintenance, and safe operation of forklifts and other powered industrial trucks. It covers operator training and certification requirements, inspection and maintenance of equipment, safe fueling/charging, and operating rules (like speed limits, handling loads, and avoiding hazards) – all aimed at preventing tip-overs, collisions, and struck-by or crushed-by accidents involving these vehicles.",
        "format": "HTML"
    },
    {
        "title": "NIOSH Robotics in the Workplace – Safety Overview (Human-Robot Collaboration)",
        "url": "https://www.cdc.gov/niosh/robotics/about/",
        "source": "NIOSH",
        "year": 2024,
        "category": "Technical Guide",
        "summary": "A NIOSH overview of emerging safety challenges as robots increasingly collaborate with human workers. Updated in 2024, this page discusses how robots can improve safety by taking over dangerous tasks but also introduces new struck-by and caught-between hazards. It emphasizes the need for updated safety standards, risk assessments, and research on human-robot interaction, and it outlines NIOSH’s efforts (through its Center for Occupational Robotics Research) to develop best practices and guidance for safe integration of robotics in industry.",
        "format": "HTML"
    }
]
|
| 171 |
+
|
| 172 |
+
# Process HTML sources
def process_html(item):
    """Fetch one HTML source, embed its text, and add it to the graph.

    Extracts all <p> paragraphs and all <table> elements (rendered as
    markdown), joins them into a single text blob, embeds the blob with
    text-embedding-3-large, and stores it as one node keyed
    "HTML::<title>".

    Parameters
    ----------
    item : dict
        One entry from ``html_data``; only 'url' and 'title' are used.

    Raises
    ------
    requests.RequestException
        If the page cannot be fetched (bad status, timeout, etc.).
    """
    # Bound the request so one slow server cannot hang the whole run,
    # and send a browser-like User-Agent (some .gov hosts reject the
    # default python-requests identifier).
    resp = requests.get(
        item['url'],
        headers={'User-Agent': 'Mozilla/5.0'},
        timeout=20,
    )
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')
    texts = [p.get_text() for p in soup.find_all('p')]
    # pd.read_html on a literal string is deprecated; wrap each table's
    # HTML in StringIO. [0] takes the first (only) parsed table.
    tables = [
        pd.read_html(io.StringIO(str(t)))[0].to_markdown()
        for t in soup.find_all('table')
    ]
    # Join paragraphs and tables with blank lines
    full = "\n\n".join(texts + tables)
    resp_emb = client.embeddings.create(model="text-embedding-3-large", input=full)
    vec = resp_emb.data[0].embedding
    node_id = f"HTML::{item['title']}"
    G.add_node(node_id, text=full, embedding=vec, source=item['title'])
|
| 185 |
+
|
| 186 |
+
# Run HTML processing: ingest every configured web source into the graph.
# Sequential and network-bound; an unreachable URL raises and stops the run.
for item in html_data:
    process_html(item)
|
| 189 |
+
|
| 190 |
+
# Save graph to GML in the working directory for the downstream query step.
# NOTE(review): node attributes include Python lists (embeddings) — confirm
# that write_gml serializes list attributes under the installed networkx
# version before relying on the persisted file.
nx.write_gml(G, "graph.gml")
print("Graph RAG database created: graph.gml")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|