Spaces:
Paused
Paused
main app files
Browse files- app.py +74 -0
- graph.gml +0 -0
- preprocess.py +295 -0
- query_graph.py +55 -0
app.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
from query_graph import query_graph
|
| 3 |
+
|
| 4 |
+
# Page configuration
# st.set_page_config() has to be the first Streamlit command executed by the
# script, so it runs before any sidebar element is created.
st.set_page_config(page_title="Miami University's SIGHT Chatbot")

# Sidebar configuration
st.sidebar.title("About")
st.sidebar.markdown("**Authors:** [The SIGHT Project Team](https://sites.miamioh.edu/sight/)")
st.sidebar.markdown("**Version:** V. 0.0.1")
st.sidebar.markdown("**Date:** July 24, 2025")
# NOTE(review): the sidebar advertises "gpt4o" while query_graph.py requests
# "gpt-4o-mini" for chat completions -- confirm which one is intended.
st.sidebar.markdown("**Model:** gpt4o")

st.sidebar.markdown("---")
st.sidebar.markdown(
    "**Funding:** SIGHT is funded by [OHBWC WSIC](https://info.bwc.ohio.gov/for-employers/safety-services/workplace-safety-innovation-center/wsic-overview)"
)

# References toggle in sidebar
st.sidebar.markdown("---")
show_refs = st.sidebar.checkbox("Show references")


# Main interface
st.title("Chat with SIGHT")
st.write("Ask questions about machine safeguarding, LOTO, and hazard prevention based on OSHA/CFR's corpus.")
|
| 26 |
+
|
| 27 |
+
# Example questions toggled in main window
with st.expander("Example Questions", expanded=False):
    st.markdown(
        "- What are general machine guarding requirements? \n"
        "- How do I perform lockout/tagout? \n"
        "- Summarize the definition of machine guarding from 29 CFR 1910.211"
    )

# Initialize chat history (st.session_state survives Streamlit reruns)
if 'history' not in st.session_state:
    st.session_state.history = []

# User input
query = st.text_input("Your question:")
send_clicked = st.button("Send")
# Ignore submissions that contain nothing but whitespace so that no
# embedding/chat request is made for an empty question.
question = query.strip()
if send_clicked and question:
    try:
        answer, sources = query_graph(question)
    except Exception as exc:  # UI boundary: report the failure, keep the page usable
        st.error(f"Sorry, the question could not be answered: {exc}")
    else:
        st.session_state.history.append({
            'query': question,
            'answer': answer,
            'sources': sources,
        })
|
| 48 |
+
|
| 49 |
+
# Display chat history: one question/answer pair per stored turn
for turn in st.session_state.history:
    st.markdown(f"**You:** {turn['query']}")
    st.markdown(f"**Assistant:** {turn['answer']}")
    # Collapsed list of the sources returned together with this answer
    with st.expander("Sources used", expanded=False):
        for source_name in turn['sources']:
            st.markdown(f"- {source_name}")

# Optionally show references list (driven by the sidebar checkbox)
if show_refs:
    reference_names = [
        "29 CFR 1910.211",
        "29 CFR 1910.212",
        "29 CFR 1910.213",
        "29 CFR 1910.215",
        "OSHA 3170",
        "OSHA 3120",
        "NIOSH WP Solutions 2011-156",
        "NIOSH Robotics (2024)",
    ]
    for reference_name in reference_names:
        st.sidebar.markdown(f"- {reference_name}")

# Footer
st.markdown("---")
disclaimer_text = (
    "**Disclaimer:** *Powered by a Graph RAG to reduce hallucinations; please verify as it can still make mistakes.*"
)
st.markdown(disclaimer_text)
funding_text = (
    "**Funding:** *We are thankful for [Ohio BWC/WSIC](https://info.bwc.ohio.gov/for-employers/safety-services/workplace-safety-innovation-center/wsic-overview)'s funding that made this chat bot possible.*"
)
st.markdown(funding_text)
|
graph.gml
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
preprocess.py
ADDED
|
@@ -0,0 +1,295 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SCRIPT 1: PREPROCESS.PY
|
| 2 |
+
# Run this script one time to build and save your knowledge graph.
|
| 3 |
+
|
| 4 |
+
import os
|
| 5 |
+
import requests
|
| 6 |
+
import json
|
| 7 |
+
from bs4 import BeautifulSoup
|
| 8 |
+
from urllib.parse import urljoin
|
| 9 |
+
import io
|
| 10 |
+
|
| 11 |
+
from llama_index.core import Document, Settings, StorageContext, KnowledgeGraphIndex, load_index_from_storage
|
| 12 |
+
from llama_index.core.graph_stores import SimpleGraphStore
|
| 13 |
+
from llama_index.llms.openai_multi_modal import OpenAIMultiModal
|
| 14 |
+
from llama_index.embeddings.openai import OpenAIEmbedding
|
| 15 |
+
from llama_index.readers.file import PyMuPDFReader, ImageReader
|
| 16 |
+
from PIL import Image
|
| 17 |
+
|
| 18 |
+
# 1. Configuration
# ---
# Fail fast when the API key is unset or empty.
if os.environ.get("OPENAI_API_KEY") in (None, ""):
    raise ValueError("OpenAI API key not found. Please set the OPENAI_API_KEY environment variable.")

# Directories for storing data and the final index
IMAGE_DIR = "image_data"
STORAGE_DIR = "storage"
for _directory in (IMAGE_DIR, STORAGE_DIR):
    # exist_ok=True makes re-running the script safe.
    os.makedirs(_directory, exist_ok=True)
|
| 28 |
+
|
| 29 |
+
# Source catalogue: one dict per reference document to ingest.
# Keys: title, url, source, year, category, summary, format, and an optional
# "status" ("link broken" makes the scraper fall back to the summary text).
source_data = [
    {
        "title": "29 CFR § 1910.211 - Definitions",
        "url": "https://www.ecfr.gov/current/title-29/part-1910/subpart-O/section-1910.211",
        "source": "OSHA",
        "year": 2024,
        "category": "Regulation",
        "summary": "Provides key legal definitions for terms used throughout Subpart O, such as 'point of operation,' 'guard,' and 'power press,' which are foundational for understanding and applying machine safeguarding rules.",
        "format": "HTML"
    },
    {
        "title": "29 CFR § 1910.212 - General requirements for all machines",
        "url": "https://www.ecfr.gov/current/title-29/part-1910/subpart-O/section-1910.212",
        "source": "OSHA",
        "year": 2024,
        "category": "Regulation",
        "summary": "This is the core machine guarding standard, mandating that one or more safeguarding methods be used to protect operators from point-of-operation hazards, ingoing nip points, and other machinery dangers.",
        "format": "HTML"
    },
    {
        "title": "29 CFR § 1910.213 - Woodworking machinery requirements",
        "url": "https://www.ecfr.gov/current/title-29/part-1910/subpart-O/section-1910.213",
        "source": "OSHA",
        "year": 2024,
        "category": "Regulation",
        "summary": "Details specific guarding requirements for various woodworking machines, including circular saws, band saws, and jointers, to prevent lacerations and amputations from blade contact.",
        "format": "HTML"
    },
    {
        "title": "29 CFR § 1910.215 - Abrasive wheel machinery",
        "url": "https://www.ecfr.gov/current/title-29/part-1910/subpart-O/section-1910.215",
        "source": "OSHA",
        "year": 2024,
        "category": "Regulation",
        "summary": "Specifies safety requirements for abrasive wheel grinders, including guards, flanges, and work rests, to protect workers from wheel breakage, projectiles, and contact with the wheel.",
        "format": "HTML"
    },
    {
        "title": "29 CFR § 1910.217 - Mechanical power presses",
        "url": "https://www.ecfr.gov/current/title-29/part-1910/subpart-O/section-1910.217",
        "source": "OSHA",
        "year": 2024,
        "category": "Regulation",
        "summary": "Outlines extensive requirements for mechanical power presses to prevent injuries to hands and fingers, covering guards, devices, clutch/brake mechanisms, and inspection procedures.",
        "format": "HTML"
    },
    {
        "title": "29 CFR § 1910.219 - Mechanical power-transmission apparatus",
        "url": "https://www.ecfr.gov/current/title-29/part-1910/subpart-O/section-1910.219",
        "source": "OSHA",
        "year": 2024,
        "category": "Regulation",
        "summary": "Mandates the guarding of mechanical power-transmission components like belts, pulleys, gears, and shafts to prevent caught-in/between injuries from entanglement.",
        "format": "HTML"
    },
    {
        "title": "Safeguarding Equipment and Protecting Workers from Amputations (OSHA 3170)",
        "url": "https://www.osha.gov/sites/default/files/publications/osha3170.pdf",
        "source": "OSHA",
        "year": 2007,
        "category": "Technical Guide",
        "summary": "This guide helps identify and manage amputation hazards from various machines by explaining hazard analysis, machine safeguarding methods, and the importance of hazardous energy control.",
        "format": "PDF"
    },
    {
        "title": "The Control of Hazardous Energy (Lockout/Tagout) (OSHA 3120)",
        "url": "https://www.osha.gov/sites/default/files/publications/osha3120.pdf",
        "source": "OSHA",
        "year": 2002,
        "category": "Technical Guide",
        "summary": "Provides a detailed explanation of the Lockout/Tagout standard (1910.147), offering guidance on energy control procedures, training, and periodic inspections to prevent unexpected machine startup.",
        "format": "PDF"
    },
    {
        "title": "29 CFR § 1910.147 - The control of hazardous energy (lockout/tagout)",
        "url": "https://www.ecfr.gov/current/title-29/section-1910.147",
        "source": "OSHA",
        "year": 2024,
        "category": "Regulation",
        "summary": "This regulation establishes the employer's responsibility to protect workers from hazardous energy sources during machine servicing and maintenance by requiring energy isolation and lockout/tagout procedures.",
        "format": "HTML"
    },
    {
        "title": "29 CFR § 1910.178 - Powered Industrial Trucks",
        "url": "https://www.ecfr.gov/current/title-29/section-1910.178",
        "source": "OSHA",
        "year": 2024,
        "category": "Regulation",
        "summary": "Covers safety requirements for forklifts and other powered industrial trucks, addressing design, maintenance, and operation to prevent struck-by, caught-in, and crushing incidents.",
        "format": "HTML"
    },
    {
        "title": "NIOSH Workplace Solutions: Preventing Worker Injuries from Industrial Machines",
        "url": "https://www.cdc.gov/niosh/docs/wp-solutions/2012-116/",
        "source": "NIOSH",
        "year": 2012,
        "category": "Technical Guide",
        "summary": "Describes how to prevent machine-related injuries using a combination of engineering controls, administrative controls like LOTO, and personal protective equipment, focusing on the hierarchy of controls.",
        "format": "HTML",
        "status": "link broken"
    },
    {
        "title": "Engineering Control Guidelines for Safety in Manufacturing",
        "url": "https://www.cdc.gov/niosh/docs/2001-123/",
        "source": "NIOSH",
        "year": 2001,
        "category": "Technical Guide",
        "summary": "Provides guidance on applying engineering controls—the most effective way to reduce workplace hazards—to manufacturing processes, directly supporting the prevention of machine-related incidents.",
        "format": "HTML"
    },
    {
        "title": "OSHA eTool: Machine Guarding",
        "url": "https://www.osha.gov/etools/machine-guarding",
        "source": "OSHA",
        "year": 2023,
        "category": "eTool",
        "summary": "This interactive web tool illustrates the hazards and safeguarding methods for a wide range of machinery, providing visual examples and explanations relevant to preventing struck-by and caught-in injuries.",
        "format": "HTML"
    },
    {
        "title": "OSHA eTool: Powered Industrial Trucks (Forklifts)",
        "url": "https://www.osha.gov/etools/powered-industrial-trucks",
        "source": "OSHA",
        "year": 2023,
        "category": "eTool",
        "summary": "An interactive resource detailing forklift hazards and controls, covering topics like operating procedures, workplace conditions, and stability, which are critical for preventing struck-by incidents.",
        "format": "HTML"
    },
    {
        "title": "Fact Sheet: Lockout/Tagout",
        "url": "https://www.osha.gov/sites/default/files/publications/factsheet_lockout-tagout.pdf",
        "source": "OSHA",
        "year": 2022,
        "category": "Fact Sheet",
        "summary": "A concise summary of the Lockout/Tagout standard, highlighting the purpose and key components of energy control programs to prevent injuries during machine servicing.",
        "format": "PDF"
    },
    {
        "title": "OSHA Technical Manual (OTM) Section IV: Chapter 5 - Industrial Robots and Robot System Safety",
        "url": "https://www.osha.gov/otm/section-4/chapter-5",
        "source": "OSHA",
        "year": 2017,
        "category": "Technical Guide",
        "summary": "Details hazards associated with industrial robots, such as struck-by and caught-between incidents, and outlines safeguarding requirements including guards, presence-sensing devices, and proper work procedures.",
        "format": "HTML"
    },
    {
        "title": "Safety and Health Topics: Robotics",
        "url": "https://www.osha.gov/robotics",
        "source": "OSHA",
        "year": 2024,
        "category": "Technical Guide",
        "summary": "This page provides a comprehensive overview of robotic system hazards and control methods, referencing key consensus standards (like ANSI/RIA R15.06) for preventing injuries.",
        "format": "HTML"
    },
    {
        "title": "Directive: National Emphasis Program on Amputations in Manufacturing Industries",
        "url": "https://www.osha.gov/enforcement/directives/cpl-03-00-022",
        "source": "OSHA",
        "year": 2019,
        "category": "Directive",
        "summary": "Establishes an enforcement program for inspecting workplaces with machinery that poses amputation hazards, focusing on compliance with standards for machine guarding and Lockout/Tagout.",
        "format": "HTML"
    },
    {
        "title": "NIOSH Topic: Human-Robot Collaboration",
        "url": "https://www.cdc.gov/niosh/topics/robot/hrc.html",
        "source": "NIOSH",
        "year": 2024,
        "category": "Technical Guide",
        "summary": "Addresses the unique safety challenges of collaborative robots (cobots), focusing on research to prevent struck-by injuries through better sensor technology, risk assessments, and safety standards.",
        "format": "HTML"
    },
    {
        "title": "ANSI B11.0-2020: Safety of Machinery",
        "url": "https://www.assp.org/standards/standards-descriptions/ansi-b11.0-2020-safety-of-machinery",
        "source": "ANSI",
        "year": 2020,
        "category": "Technical Guide",
        "summary": "This foundational US standard provides the framework for assessing risk and applying safeguarding measures to machinery to achieve an acceptable level of risk, heavily influencing OSHA's approach.",
        "format": "HTML"
    },
    {
        "title": "ANSI B11.19-2019: Performance Requirements for Safeguarding",
        "url": "https://webstore.ansi.org/Standards/B11/ANSIB112019",
        "source": "ANSI",
        "year": 2019,
        "category": "Technical Guide",
        "summary": "Specifies performance requirements for the design, construction, installation, and operation of machine safeguarding methods, including guards, interlocking devices, and safety circuits.",
        "format": "HTML"
    }
]
|
| 222 |
+
|
| 223 |
+
|
| 224 |
+
# 2. Helper Functions (Scraping and Processing)
|
| 225 |
+
# ---
|
| 226 |
+
def scrape_and_process_url(item):
    """Fetch one catalogue entry and convert it into a list of Documents.

    Args:
        item: A dict from ``source_data``; the keys read here are "url",
            "status", "title", "summary", "category" and "source".

    Returns:
        A list of ``Document`` objects. If the URL is missing, flagged as
        "link broken", or the HTTP request fails, the list holds a single
        Document built from the entry's summary. PDF responses are parsed
        with ``PyMuPDFReader``; any other response is parsed as HTML and
        yields one Document with the page text.

    Side effects:
        For HTML pages every ``<img>`` is downloaded (best effort) and saved
        as a PNG in ``IMAGE_DIR``. The images are not attached to the
        returned documents.
    """
    import tempfile  # local import: only needed to hand the PDF bytes to PyMuPDF

    url = item.get("url")
    if not url or item.get("status") == "link broken":
        print(f"⚠️ Skipping broken or missing URL for: {item['title']}")
        return [Document(text=item["summary"], metadata=item)]

    print(f"⚙️ Processing URL: {url}")
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        response = requests.get(url, headers=headers, timeout=20)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"❌ Failed to scrape {url}: {e}")
        return [Document(text=item["summary"], metadata=item)]

    content_type = response.headers.get("content-type", "").lower()
    documents = []
    metadata = {"source_url": url, "title": item["title"], "category": item["category"]}

    if "pdf" in content_type:
        # PyMuPDFReader only accepts a str/Path, so spill the downloaded bytes
        # to a temporary file instead of passing an in-memory buffer.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf:
            tmp_pdf.write(response.content)
            tmp_path = tmp_pdf.name
        try:
            # `metadata` is a boolean switch on the reader; our own metadata
            # dict goes in `extra_info`. A copy is passed so the reader cannot
            # mutate the local dict.
            pdf_docs = PyMuPDFReader().load_data(
                file_path=tmp_path,
                metadata=True,
                extra_info=dict(metadata),
            )
        finally:
            os.remove(tmp_path)
        documents.extend(pdf_docs)
        return documents

    soup = BeautifulSoup(response.content, 'html.parser')
    text_content = soup.get_text(separator='\n', strip=True)
    documents.append(Document(text=text_content, metadata=metadata))

    for img_tag in soup.find_all('img'):
        img_src = img_tag.get('src')
        if not img_src:
            continue
        img_url = urljoin(url, img_src)
        try:
            img_response = requests.get(img_url, headers=headers, timeout=10)
            img_response.raise_for_status()
            # The running file count of IMAGE_DIR is used as the file index.
            img_filename = os.path.join(IMAGE_DIR, f"{item['source']}_{len(os.listdir(IMAGE_DIR))}.png")
            Image.open(io.BytesIO(img_response.content)).convert("RGB").save(img_filename)
        except Exception as e:
            # Best effort: an unreadable or missing image must not abort the page.
            print(f" ❌ Could not download image {img_url}: {e}")
    return documents
|
| 263 |
+
|
| 264 |
+
# 3. Execution: Scrape, Build, and Save
# ---
print("--- Starting Data Pre-processing ---")
# Configure LlamaIndex Settings
Settings.llm = OpenAIMultiModal(model="gpt-4o", max_new_tokens=1000)
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")

# Scrape all text
all_documents = []
for item in source_data:
    all_documents.extend(scrape_and_process_url(item))

# Process all images and generate descriptions
# NOTE(review): verify this call against the installed llama-index-readers-file
# version -- confirm that ImageReader accepts a `text_parser` keyword and that
# load_data() may be given a directory rather than a single image file.
image_documents = ImageReader(text_parser=Settings.llm).load_data(IMAGE_DIR)
all_documents.extend(image_documents)
print(f"\n✅ Total documents loaded: {len(all_documents)}")

# Build the Knowledge Graph Index
print("\n--- Building Knowledge Graph ---")
graph_store = SimpleGraphStore()
storage_context = StorageContext.from_defaults(graph_store=graph_store)
index = KnowledgeGraphIndex.from_documents(
    documents=all_documents,
    storage_context=storage_context,
    max_triplets_per_chunk=5,
    include_embeddings=True,
    show_progress=True,
)

# Persist the index to disk
index.storage_context.persist(persist_dir=STORAGE_DIR)
# NOTE(review): the message below refers to "query.py" -- confirm the name of
# the script that is meant to consume this storage directory.
print(f"\n✅✅✅ Graph built and saved to disk at '{STORAGE_DIR}'! You can now run query.py. ✅✅✅")
|
query_graph.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import numpy as np
|
| 3 |
+
from dotenv import load_dotenv
|
| 4 |
+
from openai import OpenAI
|
| 5 |
+
import networkx as nx
|
| 6 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 7 |
+
|
| 8 |
+
# Initialize OpenAI client
# override=True: values from a .env file win over already-set environment variables.
load_dotenv(override=True)
_api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=_api_key)

# Load graph from GML (path is relative to the current working directory)
G = nx.read_gml("graph.gml")
# Freeze the node order once so row i of `embeddings` belongs to enodes[i].
enodes = list(G.nodes)
_node_vectors = [G.nodes[node_id]['embedding'] for node_id in enodes]
embeddings = np.array(_node_vectors)
|
| 16 |
+
|
| 17 |
+
def query_graph(question, top_k=5):
    """Answer a question from the most similar graph nodes.

    The question is embedded, compared by cosine similarity against the
    node embeddings loaded at module level, and the text of the ``top_k``
    best-matching nodes is given to the chat model as context.

    Args:
        question: The user's question as a string.
        top_k: Maximum number of graph nodes to use as context.

    Returns:
        A tuple ``(answer, sources)``: ``answer`` is the chat model's reply
        text and ``sources`` is the de-duplicated list of the selected nodes'
        ``'source'`` attributes, ordered from most to least similar.
    """
    # Embed question
    # NOTE(review): the embedding model must be the one used to create the
    # vectors stored in graph.gml, otherwise the dimensions will not match --
    # confirm against the script that produced the graph.
    emb_resp = client.embeddings.create(
        model="text-embedding-3-large",
        input=question,
    )
    q_vec = emb_resp.data[0].embedding
    sims = cosine_similarity([q_vec], embeddings)[0]
    # argsort is ascending, so reverse it to get the best matches first.
    idxs = sims.argsort()[::-1][:top_k]

    # Gather context and sources
    hits = [G.nodes[enodes[i]] for i in idxs]
    context = [node['text'] for node in hits]
    # dict.fromkeys removes duplicates but keeps first-seen order, so the
    # result is stable across runs (a set would order sources arbitrarily).
    sources = list(dict.fromkeys(node['source'] for node in hits))

    # Generate answer
    prompt = (
        "Use the following context to answer the question:\n\n"
        + "\n\n---\n\n".join(context)
        + f"\n\nQuestion: {question}\nAnswer:")
    chat_resp = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant for XR safety training."},
            {"role": "user", "content": prompt},
        ],
    )
    answer = chat_resp.choices[0].message.content
    return answer, sources
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
# Test queries
test_questions = [
    "What are general machine guarding requirements?",
    "Explain the key steps in lockout/tagout procedures.",
]

# Run the smoke test only when this file is executed directly. Without the
# guard, `from query_graph import query_graph` would issue these embedding and
# chat requests as an import side effect.
if __name__ == "__main__":
    for q in test_questions:
        ans, srcs = query_graph(q)
        print(f"Q: {q}\nA: {ans}\nSources: {srcs}\n")
|