# SCRIPT 1: PREPROCESS.PY
# Run this script one time to build and save your knowledge graph.
import hashlib
import io
import json
import os
import tempfile
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from llama_index.core import Document, Settings, StorageContext, KnowledgeGraphIndex, load_index_from_storage
from llama_index.core.graph_stores import SimpleGraphStore
# NOTE(review): recent llama-index releases appear to expose OpenAIMultiModal from
# `llama_index.multi_modal_llms.openai` -- confirm this import path against the
# installed package version.
from llama_index.llms.openai_multi_modal import OpenAIMultiModal
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.readers.file import PyMuPDFReader, ImageReader
from PIL import Image

# 1. Configuration
# ---
if not os.getenv("OPENAI_API_KEY"):
    raise ValueError("OpenAI API key not found. Please set the OPENAI_API_KEY environment variable.")

# Directories for storing data and the final index
IMAGE_DIR = "image_data"
STORAGE_DIR = "storage"
os.makedirs(IMAGE_DIR, exist_ok=True)
os.makedirs(STORAGE_DIR, exist_ok=True)

# The JSON data you provided (ensure you have the full list here)
source_data = [
    {
        "title": "29 CFR § 1910.211 - Definitions",
        "url": "https://www.ecfr.gov/current/title-29/part-1910/subpart-O/section-1910.211",
        "source": "OSHA",
        "year": 2024,
        "category": "Regulation",
        "summary": "Provides key legal definitions for terms used throughout Subpart O, such as 'point of operation,' 'guard,' and 'power press,' which are foundational for understanding and applying machine safeguarding rules.",
        "format": "HTML",
    },
    {
        "title": "29 CFR § 1910.212 - General requirements for all machines",
        "url": "https://www.ecfr.gov/current/title-29/part-1910/subpart-O/section-1910.212",
        "source": "OSHA",
        "year": 2024,
        "category": "Regulation",
        "summary": "This is the core machine guarding standard, mandating that one or more safeguarding methods be used to protect operators from point-of-operation hazards, ingoing nip points, and other machinery dangers.",
        "format": "HTML",
    },
    {
        "title": "29 CFR § 1910.213 - Woodworking machinery requirements",
        "url": "https://www.ecfr.gov/current/title-29/part-1910/subpart-O/section-1910.213",
        "source": "OSHA",
        "year": 2024,
        "category": "Regulation",
        "summary": "Details specific guarding requirements for various woodworking machines, including circular saws, band saws, and jointers, to prevent lacerations and amputations from blade contact.",
        "format": "HTML",
    },
    {
        "title": "29 CFR § 1910.215 - Abrasive wheel machinery",
        "url": "https://www.ecfr.gov/current/title-29/part-1910/subpart-O/section-1910.215",
        "source": "OSHA",
        "year": 2024,
        "category": "Regulation",
        "summary": "Specifies safety requirements for abrasive wheel grinders, including guards, flanges, and work rests, to protect workers from wheel breakage, projectiles, and contact with the wheel.",
        "format": "HTML",
    },
    {
        "title": "29 CFR § 1910.217 - Mechanical power presses",
        "url": "https://www.ecfr.gov/current/title-29/part-1910/subpart-O/section-1910.217",
        "source": "OSHA",
        "year": 2024,
        "category": "Regulation",
        "summary": "Outlines extensive requirements for mechanical power presses to prevent injuries to hands and fingers, covering guards, devices, clutch/brake mechanisms, and inspection procedures.",
        "format": "HTML",
    },
    {
        "title": "29 CFR § 1910.219 - Mechanical power-transmission apparatus",
        "url": "https://www.ecfr.gov/current/title-29/part-1910/subpart-O/section-1910.219",
        "source": "OSHA",
        "year": 2024,
        "category": "Regulation",
        "summary": "Mandates the guarding of mechanical power-transmission components like belts, pulleys, gears, and shafts to prevent caught-in/between injuries from entanglement.",
        "format": "HTML",
    },
    {
        "title": "Safeguarding Equipment and Protecting Workers from Amputations (OSHA 3170)",
        "url": "https://www.osha.gov/sites/default/files/publications/osha3170.pdf",
        "source": "OSHA",
        "year": 2007,
        "category": "Technical Guide",
        "summary": "This guide helps identify and manage amputation hazards from various machines by explaining hazard analysis, machine safeguarding methods, and the importance of hazardous energy control.",
        "format": "PDF",
    },
    {
        "title": "The Control of Hazardous Energy (Lockout/Tagout) (OSHA 3120)",
        "url": "https://www.osha.gov/sites/default/files/publications/osha3120.pdf",
        "source": "OSHA",
        "year": 2002,
        "category": "Technical Guide",
        "summary": "Provides a detailed explanation of the Lockout/Tagout standard (1910.147), offering guidance on energy control procedures, training, and periodic inspections to prevent unexpected machine startup.",
        "format": "PDF",
    },
    {
        "title": "29 CFR § 1910.147 - The control of hazardous energy (lockout/tagout)",
        "url": "https://www.ecfr.gov/current/title-29/section-1910.147",
        "source": "OSHA",
        "year": 2024,
        "category": "Regulation",
        "summary": "This regulation establishes the employer's responsibility to protect workers from hazardous energy sources during machine servicing and maintenance by requiring energy isolation and lockout/tagout procedures.",
        "format": "HTML",
    },
    {
        "title": "29 CFR § 1910.178 - Powered Industrial Trucks",
        "url": "https://www.ecfr.gov/current/title-29/section-1910.178",
        "source": "OSHA",
        "year": 2024,
        "category": "Regulation",
        "summary": "Covers safety requirements for forklifts and other powered industrial trucks, addressing design, maintenance, and operation to prevent struck-by, caught-in, and crushing incidents.",
        "format": "HTML",
    },
    {
        "title": "NIOSH Workplace Solutions: Preventing Worker Injuries from Industrial Machines",
        "url": "https://www.cdc.gov/niosh/docs/wp-solutions/2012-116/",
        "source": "NIOSH",
        "year": 2012,
        "category": "Technical Guide",
        "summary": "Describes how to prevent machine-related injuries using a combination of engineering controls, administrative controls like LOTO, and personal protective equipment, focusing on the hierarchy of controls.",
        "format": "HTML",
        "status": "link broken",
    },
    {
        "title": "Engineering Control Guidelines for Safety in Manufacturing",
        "url": "https://www.cdc.gov/niosh/docs/2001-123/",
        "source": "NIOSH",
        "year": 2001,
        "category": "Technical Guide",
        "summary": "Provides guidance on applying engineering controls—the most effective way to reduce workplace hazards—to manufacturing processes, directly supporting the prevention of machine-related incidents.",
        "format": "HTML",
    },
    {
        "title": "OSHA eTool: Machine Guarding",
        "url": "https://www.osha.gov/etools/machine-guarding",
        "source": "OSHA",
        "year": 2023,
        "category": "eTool",
        "summary": "This interactive web tool illustrates the hazards and safeguarding methods for a wide range of machinery, providing visual examples and explanations relevant to preventing struck-by and caught-in injuries.",
        "format": "HTML",
    },
    {
        "title": "OSHA eTool: Powered Industrial Trucks (Forklifts)",
        "url": "https://www.osha.gov/etools/powered-industrial-trucks",
        "source": "OSHA",
        "year": 2023,
        "category": "eTool",
        "summary": "An interactive resource detailing forklift hazards and controls, covering topics like operating procedures, workplace conditions, and stability, which are critical for preventing struck-by incidents.",
        "format": "HTML",
    },
    {
        "title": "Fact Sheet: Lockout/Tagout",
        "url": "https://www.osha.gov/sites/default/files/publications/factsheet_lockout-tagout.pdf",
        "source": "OSHA",
        "year": 2022,
        "category": "Fact Sheet",
        "summary": "A concise summary of the Lockout/Tagout standard, highlighting the purpose and key components of energy control programs to prevent injuries during machine servicing.",
        "format": "PDF",
    },
    {
        "title": "OSHA Technical Manual (OTM) Section IV: Chapter 5 - Industrial Robots and Robot System Safety",
        "url": "https://www.osha.gov/otm/section-4/chapter-5",
        "source": "OSHA",
        "year": 2017,
        "category": "Technical Guide",
        "summary": "Details hazards associated with industrial robots, such as struck-by and caught-between incidents, and outlines safeguarding requirements including guards, presence-sensing devices, and proper work procedures.",
        "format": "HTML",
    },
    {
        "title": "Safety and Health Topics: Robotics",
        "url": "https://www.osha.gov/robotics",
        "source": "OSHA",
        "year": 2024,
        "category": "Technical Guide",
        "summary": "This page provides a comprehensive overview of robotic system hazards and control methods, referencing key consensus standards (like ANSI/RIA R15.06) for preventing injuries.",
        "format": "HTML",
    },
    {
        "title": "Directive: National Emphasis Program on Amputations in Manufacturing Industries",
        "url": "https://www.osha.gov/enforcement/directives/cpl-03-00-022",
        "source": "OSHA",
        "year": 2019,
        "category": "Directive",
        "summary": "Establishes an enforcement program for inspecting workplaces with machinery that poses amputation hazards, focusing on compliance with standards for machine guarding and Lockout/Tagout.",
        "format": "HTML",
    },
    {
        "title": "NIOSH Topic: Human-Robot Collaboration",
        "url": "https://www.cdc.gov/niosh/topics/robot/hrc.html",
        "source": "NIOSH",
        "year": 2024,
        "category": "Technical Guide",
        "summary": "Addresses the unique safety challenges of collaborative robots (cobots), focusing on research to prevent struck-by injuries through better sensor technology, risk assessments, and safety standards.",
        "format": "HTML",
    },
    {
        "title": "ANSI B11.0-2020: Safety of Machinery",
        "url": "https://www.assp.org/standards/standards-descriptions/ansi-b11.0-2020-safety-of-machinery",
        "source": "ANSI",
        "year": 2020,
        "category": "Technical Guide",
        "summary": "This foundational US standard provides the framework for assessing risk and applying safeguarding measures to machinery to achieve an acceptable level of risk, heavily influencing OSHA's approach.",
        "format": "HTML",
    },
    {
        "title": "ANSI B11.19-2019: Performance Requirements for Safeguarding",
        "url": "https://webstore.ansi.org/Standards/B11/ANSIB112019",
        "source": "ANSI",
        "year": 2019,
        "category": "Technical Guide",
        "summary": "Specifies performance requirements for the design, construction, installation, and operation of machine safeguarding methods, including guards, interlocking devices, and safety circuits.",
        "format": "HTML",
    },
]


# 2. Helper Functions (Scraping and Processing)
# ---
def _summary_fallback(item):
    """Return a one-element list holding a Document built from the item's summary.

    Used whenever the real page cannot be fetched or parsed, so the source is
    still represented in the graph by its catalogue summary.
    """
    return [Document(text=item["summary"], metadata=item)]


def _load_pdf_documents(pdf_bytes, metadata):
    """Parse downloaded PDF bytes into Documents via PyMuPDFReader.

    The bytes are written to a temporary ``.pdf`` file and the reader is given
    that path; the per-document metadata is passed as ``extra_info``.  A copy
    of ``metadata`` is handed over so the caller's dict is never modified.
    The temporary file is removed afterwards, whether or not parsing succeeds.
    """
    handle = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
    try:
        with handle:
            handle.write(pdf_bytes)
        return PyMuPDFReader().load_data(
            file_path=handle.name,
            metadata=True,
            extra_info=dict(metadata),
        )
    finally:
        try:
            os.remove(handle.name)
        except OSError as e:
            # Best-effort cleanup: a leftover temp file must not fail the run.
            print(f"⚠️ Could not remove temporary file {handle.name}: {e}")


def _save_page_images(soup, page_url, source, headers):
    """Download every ``<img>`` referenced by the page into IMAGE_DIR as PNG.

    File names are derived from a hash of the absolute image URL, so an image
    that appears on several pages (or on a re-run) is stored only once.
    Failures are reported and skipped; they never abort the page.
    """
    for img_tag in soup.find_all("img"):
        img_src = img_tag.get("src")
        if not img_src:
            continue
        img_url = urljoin(page_url, img_src)
        digest = hashlib.sha256(img_url.encode("utf-8")).hexdigest()[:16]
        img_filename = os.path.join(IMAGE_DIR, f"{source}_{digest}.png")
        if os.path.exists(img_filename):
            # Same URL already saved earlier; skip the duplicate download.
            continue
        try:
            img_response = requests.get(img_url, headers=headers, timeout=10)
            img_response.raise_for_status()
            Image.open(io.BytesIO(img_response.content)).convert("RGB").save(img_filename)
        except Exception as e:
            # Deliberately broad: covers network errors and undecodable images alike.
            print(f" ❌ Could not download image {img_url}: {e}")


def scrape_and_process_url(item):
    """Scrapes a URL, extracts text, and downloads images.

    Args:
        item: One entry of ``source_data``.  Reads the keys ``url``, ``title``,
            ``category``, ``source`` and ``summary``, plus the optional
            ``status``.

    Returns:
        A list of Documents.  For a PDF response these are the Documents
        produced by PyMuPDFReader; for any other response a single Document
        with the page text.  If the URL is missing, marked ``"link broken"``,
        cannot be fetched, or the PDF cannot be parsed, a single Document
        containing the item's summary is returned instead.

    Side effects:
        Images referenced by an HTML page are saved as PNG files in IMAGE_DIR.
    """
    url = item.get("url")
    if not url or item.get("status") == "link broken":
        print(f"⚠️ Skipping broken or missing URL for: {item['title']}")
        return _summary_fallback(item)

    print(f"⚙️ Processing URL: {url}")
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        response = requests.get(url, headers=headers, timeout=20)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"❌ Failed to scrape {url}: {e}")
        return _summary_fallback(item)

    content_type = response.headers.get("content-type", "").lower()
    metadata = {"source_url": url, "title": item["title"], "category": item["category"]}

    if "pdf" in content_type:
        try:
            return _load_pdf_documents(response.content, metadata)
        except Exception as e:
            # A corrupt or unreadable PDF should not abort the whole run.
            print(f"❌ Failed to parse PDF {url}: {e}")
            return _summary_fallback(item)

    soup = BeautifulSoup(response.content, "html.parser")
    text_content = soup.get_text(separator="\n", strip=True)
    documents = [Document(text=text_content, metadata=metadata)]
    _save_page_images(soup, url, item["source"], headers)
    return documents


# 3.
# Execution: Scrape, Build, and Save
# ---
IMAGE_DESCRIPTION_PROMPT = (
    "Describe this image in detail, focusing on any machinery, hazards, "
    "safeguards, labels, or safety information it shows."
)


def _describe_images(image_dir, vision_llm):
    """Turn every image file in *image_dir* into a text Document.

    Each file is loaded on its own with ImageReader and handed to
    ``vision_llm.complete`` together with IMAGE_DESCRIPTION_PROMPT; the
    returned text becomes the Document body.  Files that cannot be read or
    described are reported and skipped, and empty descriptions are dropped.

    Args:
        image_dir: Directory containing the downloaded image files.
        vision_llm: Multimodal LLM exposing ``complete(prompt, image_documents)``.

    Returns:
        A list of Documents, one per successfully described image, in sorted
        file-name order.
    """
    from pathlib import Path

    reader = ImageReader()
    described = []
    for name in sorted(os.listdir(image_dir)):
        path = Path(image_dir) / name
        if not path.is_file():
            continue
        try:
            image_docs = reader.load_data(file=path)
            response = vision_llm.complete(
                prompt=IMAGE_DESCRIPTION_PROMPT,
                image_documents=image_docs,
            )
        except Exception as e:
            # Deliberately broad: one unreadable image or failed API call
            # must not discard the rest of the pre-processing run.
            print(f" ❌ Could not describe image {path}: {e}")
            continue
        description = (response.text or "").strip()
        if description:
            described.append(
                Document(
                    text=description,
                    metadata={"image_path": str(path), "category": "Image"},
                )
            )
    return described


print("--- Starting Data Pre-processing ---")

# Configure LlamaIndex Settings
# NOTE(review): KnowledgeGraphIndex extracts triplets through Settings.llm; whether
# OpenAIMultiModal is accepted as a text LLM depends on the installed llama-index
# version -- confirm, and switch to a text LLM here if it is rejected.
vision_llm = OpenAIMultiModal(model="gpt-4o", max_new_tokens=1000)
Settings.llm = vision_llm
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")

# Scrape all text
all_documents = []
for item in source_data:
    all_documents.extend(scrape_and_process_url(item))

# Process all images and generate descriptions
image_documents = _describe_images(IMAGE_DIR, vision_llm)
all_documents.extend(image_documents)
print(f"\n✅ Total documents loaded: {len(all_documents)}")

# Build the Knowledge Graph Index
print("\n--- Building Knowledge Graph ---")
graph_store = SimpleGraphStore()
storage_context = StorageContext.from_defaults(graph_store=graph_store)
index = KnowledgeGraphIndex.from_documents(
    documents=all_documents,
    storage_context=storage_context,
    max_triplets_per_chunk=5,
    include_embeddings=True,
    show_progress=True,
)

# Persist the index to disk
index.storage_context.persist(persist_dir=STORAGE_DIR)
print(f"\n✅✅✅ Graph built and saved to disk at '{STORAGE_DIR}'! You can now run query.py. ✅✅✅")