fmegahed committed on
Commit
cd68afd
·
verified ·
1 Parent(s): 5564026

main app files

Browse files
Files changed (4) hide show
  1. app.py +74 -0
  2. graph.gml +0 -0
  3. preprocess.py +295 -0
  4. query_graph.py +55 -0
app.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from query_graph import query_graph
3
+
4
# Page configuration
# Streamlit documents st.set_page_config as a command that has to run before
# any other Streamlit command on the page (older releases raise an exception
# otherwise), so it is issued ahead of the sidebar widgets.
st.set_page_config(page_title="Miami University's SIGHT Chatbot")

# Sidebar configuration
st.sidebar.title("About")
st.sidebar.markdown("**Authors:** [The SIGHT Project Team](https://sites.miamioh.edu/sight/)")
st.sidebar.markdown("**Version:** V. 0.0.1")
st.sidebar.markdown("**Date:** July 24, 2025")
# NOTE(review): query_graph.py requests the "gpt-4o-mini" chat model; confirm
# which model name this label is meant to advertise.
st.sidebar.markdown("**Model:** gpt4o")

st.sidebar.markdown("---")
st.sidebar.markdown(
    "**Funding:** SIGHT is funded by [OHBWC WSIC](https://info.bwc.ohio.gov/for-employers/safety-services/workplace-safety-innovation-center/wsic-overview)"
)

# References toggle in sidebar
st.sidebar.markdown("---")
show_refs = st.sidebar.checkbox("Show references")

# Main interface
st.title("Chat with SIGHT")
st.write("Ask questions about machine safeguarding, LOTO, and hazard prevention based on OSHA/CFR's corpus.")

# Example questions toggled in main window
with st.expander("Example Questions", expanded=False):
    st.markdown(
        "- What are general machine guarding requirements? \n"
        "- How do I perform lockout/tagout? \n"
        "- Summarize the definition of machine guarding from 29 CFR 1910.211"
    )

# Initialize chat history (session_state persists across Streamlit reruns)
if 'history' not in st.session_state:
    st.session_state.history = []

# User input: a new exchange is recorded only when "Send" is clicked with a
# non-empty question.
query = st.text_input("Your question:")
if st.button("Send") and query:
    answer, sources = query_graph(query)
    st.session_state.history.append({
        'query': query,
        'answer': answer,
        'sources': sources
    })

# Display chat history, oldest exchange first
for entry in st.session_state.history:
    st.markdown(f"**You:** {entry['query']}")
    st.markdown(f"**Assistant:** {entry['answer']}")
    # Sources backing this particular answer
    with st.expander("Sources used", expanded=False):
        for src in entry['sources']:
            st.markdown(f"- {src}")

# Optionally show the static references list in the sidebar
if show_refs:
    refs = [
        "29 CFR 1910.211", "29 CFR 1910.212", "29 CFR 1910.213", "29 CFR 1910.215",
        "OSHA 3170", "OSHA 3120", "NIOSH WP Solutions 2011-156", "NIOSH Robotics (2024)"
    ]
    for r in refs:
        st.sidebar.markdown(f"- {r}")

# Footer
st.markdown("---")
st.markdown(
    "**Disclaimer:** *Powered by a Graph RAG to reduce hallucinations; please verify as it can still make mistakes.*"
)
st.markdown(
    "**Funding:** *We are thankful for [Ohio BWC/WSIC](https://info.bwc.ohio.gov/for-employers/safety-services/workplace-safety-innovation-center/wsic-overview)'s funding that made this chat bot possible.*"
)
graph.gml ADDED
The diff for this file is too large to render. See raw diff
 
preprocess.py ADDED
@@ -0,0 +1,295 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SCRIPT 1: PREPROCESS.PY
2
+ # Run this script one time to build and save your knowledge graph.
3
+
4
+ import os
5
+ import requests
6
+ import json
7
+ from bs4 import BeautifulSoup
8
+ from urllib.parse import urljoin
9
+ import io
10
+
11
+ from llama_index.core import Document, Settings, StorageContext, KnowledgeGraphIndex, load_index_from_storage
12
+ from llama_index.core.graph_stores import SimpleGraphStore
13
+ from llama_index.llms.openai_multi_modal import OpenAIMultiModal
14
+ from llama_index.embeddings.openai import OpenAIEmbedding
15
+ from llama_index.readers.file import PyMuPDFReader, ImageReader
16
+ from PIL import Image
17
+
18
# 1. Configuration
# ---
# Abort immediately when the OpenAI credential is absent or empty, before any
# scraping or directory creation happens.
if os.environ.get("OPENAI_API_KEY", "") == "":
    raise ValueError("OpenAI API key not found. Please set the OPENAI_API_KEY environment variable.")

# Output locations: downloaded images and the persisted index.
IMAGE_DIR, STORAGE_DIR = "image_data", "storage"
for _output_dir in (IMAGE_DIR, STORAGE_DIR):
    # exist_ok=True keeps repeated runs from failing on existing directories.
    os.makedirs(_output_dir, exist_ok=True)
28
+
29
+ # The JSON data you provided (ensure you have the full list here)
30
+ source_data = [
31
+ {
32
+ "title": "29 CFR § 1910.211 - Definitions",
33
+ "url": "https://www.ecfr.gov/current/title-29/part-1910/subpart-O/section-1910.211",
34
+ "source": "OSHA",
35
+ "year": 2024,
36
+ "category": "Regulation",
37
+ "summary": "Provides key legal definitions for terms used throughout Subpart O, such as 'point of operation,' 'guard,' and 'power press,' which are foundational for understanding and applying machine safeguarding rules.",
38
+ "format": "HTML"
39
+ },
40
+ {
41
+ "title": "29 CFR § 1910.212 - General requirements for all machines",
42
+ "url": "https://www.ecfr.gov/current/title-29/part-1910/subpart-O/section-1910.212",
43
+ "source": "OSHA",
44
+ "year": 2024,
45
+ "category": "Regulation",
46
+ "summary": "This is the core machine guarding standard, mandating that one or more safeguarding methods be used to protect operators from point-of-operation hazards, ingoing nip points, and other machinery dangers.",
47
+ "format": "HTML"
48
+ },
49
+ {
50
+ "title": "29 CFR § 1910.213 - Woodworking machinery requirements",
51
+ "url": "https://www.ecfr.gov/current/title-29/part-1910/subpart-O/section-1910.213",
52
+ "source": "OSHA",
53
+ "year": 2024,
54
+ "category": "Regulation",
55
+ "summary": "Details specific guarding requirements for various woodworking machines, including circular saws, band saws, and jointers, to prevent lacerations and amputations from blade contact.",
56
+ "format": "HTML"
57
+ },
58
+ {
59
+ "title": "29 CFR § 1910.215 - Abrasive wheel machinery",
60
+ "url": "https://www.ecfr.gov/current/title-29/part-1910/subpart-O/section-1910.215",
61
+ "source": "OSHA",
62
+ "year": 2024,
63
+ "category": "Regulation",
64
+ "summary": "Specifies safety requirements for abrasive wheel grinders, including guards, flanges, and work rests, to protect workers from wheel breakage, projectiles, and contact with the wheel.",
65
+ "format": "HTML"
66
+ },
67
+ {
68
+ "title": "29 CFR § 1910.217 - Mechanical power presses",
69
+ "url": "https://www.ecfr.gov/current/title-29/part-1910/subpart-O/section-1910.217",
70
+ "source": "OSHA",
71
+ "year": 2024,
72
+ "category": "Regulation",
73
+ "summary": "Outlines extensive requirements for mechanical power presses to prevent injuries to hands and fingers, covering guards, devices, clutch/brake mechanisms, and inspection procedures.",
74
+ "format": "HTML"
75
+ },
76
+ {
77
+ "title": "29 CFR § 1910.219 - Mechanical power-transmission apparatus",
78
+ "url": "https://www.ecfr.gov/current/title-29/part-1910/subpart-O/section-1910.219",
79
+ "source": "OSHA",
80
+ "year": 2024,
81
+ "category": "Regulation",
82
+ "summary": "Mandates the guarding of mechanical power-transmission components like belts, pulleys, gears, and shafts to prevent caught-in/between injuries from entanglement.",
83
+ "format": "HTML"
84
+ },
85
+ {
86
+ "title": "Safeguarding Equipment and Protecting Workers from Amputations (OSHA 3170)",
87
+ "url": "https://www.osha.gov/sites/default/files/publications/osha3170.pdf",
88
+ "source": "OSHA",
89
+ "year": 2007,
90
+ "category": "Technical Guide",
91
+ "summary": "This guide helps identify and manage amputation hazards from various machines by explaining hazard analysis, machine safeguarding methods, and the importance of hazardous energy control.",
92
+ "format": "PDF"
93
+ },
94
+ {
95
+ "title": "The Control of Hazardous Energy (Lockout/Tagout) (OSHA 3120)",
96
+ "url": "https://www.osha.gov/sites/default/files/publications/osha3120.pdf",
97
+ "source": "OSHA",
98
+ "year": 2002,
99
+ "category": "Technical Guide",
100
+ "summary": "Provides a detailed explanation of the Lockout/Tagout standard (1910.147), offering guidance on energy control procedures, training, and periodic inspections to prevent unexpected machine startup.",
101
+ "format": "PDF"
102
+ },
103
+ {
104
+ "title": "29 CFR § 1910.147 - The control of hazardous energy (lockout/tagout)",
105
+ "url": "https://www.ecfr.gov/current/title-29/section-1910.147",
106
+ "source": "OSHA",
107
+ "year": 2024,
108
+ "category": "Regulation",
109
+ "summary": "This regulation establishes the employer's responsibility to protect workers from hazardous energy sources during machine servicing and maintenance by requiring energy isolation and lockout/tagout procedures.",
110
+ "format": "HTML"
111
+ },
112
+ {
113
+ "title": "29 CFR § 1910.178 - Powered Industrial Trucks",
114
+ "url": "https://www.ecfr.gov/current/title-29/section-1910.178",
115
+ "source": "OSHA",
116
+ "year": 2024,
117
+ "category": "Regulation",
118
+ "summary": "Covers safety requirements for forklifts and other powered industrial trucks, addressing design, maintenance, and operation to prevent struck-by, caught-in, and crushing incidents.",
119
+ "format": "HTML"
120
+ },
121
+ {
122
+ "title": "NIOSH Workplace Solutions: Preventing Worker Injuries from Industrial Machines",
123
+ "url": "https://www.cdc.gov/niosh/docs/wp-solutions/2012-116/",
124
+ "source": "NIOSH",
125
+ "year": 2012,
126
+ "category": "Technical Guide",
127
+ "summary": "Describes how to prevent machine-related injuries using a combination of engineering controls, administrative controls like LOTO, and personal protective equipment, focusing on the hierarchy of controls.",
128
+ "format": "HTML",
129
+ "status": "link broken"
130
+ },
131
+ {
132
+ "title": "Engineering Control Guidelines for Safety in Manufacturing",
133
+ "url": "https://www.cdc.gov/niosh/docs/2001-123/",
134
+ "source": "NIOSH",
135
+ "year": 2001,
136
+ "category": "Technical Guide",
137
+ "summary": "Provides guidance on applying engineering controls—the most effective way to reduce workplace hazards—to manufacturing processes, directly supporting the prevention of machine-related incidents.",
138
+ "format": "HTML"
139
+ },
140
+ {
141
+ "title": "OSHA eTool: Machine Guarding",
142
+ "url": "https://www.osha.gov/etools/machine-guarding",
143
+ "source": "OSHA",
144
+ "year": 2023,
145
+ "category": "eTool",
146
+ "summary": "This interactive web tool illustrates the hazards and safeguarding methods for a wide range of machinery, providing visual examples and explanations relevant to preventing struck-by and caught-in injuries.",
147
+ "format": "HTML"
148
+ },
149
+ {
150
+ "title": "OSHA eTool: Powered Industrial Trucks (Forklifts)",
151
+ "url": "https://www.osha.gov/etools/powered-industrial-trucks",
152
+ "source": "OSHA",
153
+ "year": 2023,
154
+ "category": "eTool",
155
+ "summary": "An interactive resource detailing forklift hazards and controls, covering topics like operating procedures, workplace conditions, and stability, which are critical for preventing struck-by incidents.",
156
+ "format": "HTML"
157
+ },
158
+ {
159
+ "title": "Fact Sheet: Lockout/Tagout",
160
+ "url": "https://www.osha.gov/sites/default/files/publications/factsheet_lockout-tagout.pdf",
161
+ "source": "OSHA",
162
+ "year": 2022,
163
+ "category": "Fact Sheet",
164
+ "summary": "A concise summary of the Lockout/Tagout standard, highlighting the purpose and key components of energy control programs to prevent injuries during machine servicing.",
165
+ "format": "PDF"
166
+ },
167
+ {
168
+ "title": "OSHA Technical Manual (OTM) Section IV: Chapter 5 - Industrial Robots and Robot System Safety",
169
+ "url": "https://www.osha.gov/otm/section-4/chapter-5",
170
+ "source": "OSHA",
171
+ "year": 2017,
172
+ "category": "Technical Guide",
173
+ "summary": "Details hazards associated with industrial robots, such as struck-by and caught-between incidents, and outlines safeguarding requirements including guards, presence-sensing devices, and proper work procedures.",
174
+ "format": "HTML"
175
+ },
176
+ {
177
+ "title": "Safety and Health Topics: Robotics",
178
+ "url": "https://www.osha.gov/robotics",
179
+ "source": "OSHA",
180
+ "year": 2024,
181
+ "category": "Technical Guide",
182
+ "summary": "This page provides a comprehensive overview of robotic system hazards and control methods, referencing key consensus standards (like ANSI/RIA R15.06) for preventing injuries.",
183
+ "format": "HTML"
184
+ },
185
+ {
186
+ "title": "Directive: National Emphasis Program on Amputations in Manufacturing Industries",
187
+ "url": "https://www.osha.gov/enforcement/directives/cpl-03-00-022",
188
+ "source": "OSHA",
189
+ "year": 2019,
190
+ "category": "Directive",
191
+ "summary": "Establishes an enforcement program for inspecting workplaces with machinery that poses amputation hazards, focusing on compliance with standards for machine guarding and Lockout/Tagout.",
192
+ "format": "HTML"
193
+ },
194
+ {
195
+ "title": "NIOSH Topic: Human-Robot Collaboration",
196
+ "url": "https://www.cdc.gov/niosh/topics/robot/hrc.html",
197
+ "source": "NIOSH",
198
+ "year": 2024,
199
+ "category": "Technical Guide",
200
+ "summary": "Addresses the unique safety challenges of collaborative robots (cobots), focusing on research to prevent struck-by injuries through better sensor technology, risk assessments, and safety standards.",
201
+ "format": "HTML"
202
+ },
203
+ {
204
+ "title": "ANSI B11.0-2020: Safety of Machinery",
205
+ "url": "https://www.assp.org/standards/standards-descriptions/ansi-b11.0-2020-safety-of-machinery",
206
+ "source": "ANSI",
207
+ "year": 2020,
208
+ "category": "Technical Guide",
209
+ "summary": "This foundational US standard provides the framework for assessing risk and applying safeguarding measures to machinery to achieve an acceptable level of risk, heavily influencing OSHA's approach.",
210
+ "format": "HTML"
211
+ },
212
+ {
213
+ "title": "ANSI B11.19-2019: Performance Requirements for Safeguarding",
214
+ "url": "https://webstore.ansi.org/Standards/B11/ANSIB112019",
215
+ "source": "ANSI",
216
+ "year": 2019,
217
+ "category": "Technical Guide",
218
+ "summary": "Specifies performance requirements for the design, construction, installation, and operation of machine safeguarding methods, including guards, interlocking devices, and safety circuits.",
219
+ "format": "HTML"
220
+ }
221
+ ]
222
+
223
+
224
+ # 2. Helper Functions (Scraping and Processing)
225
+ # ---
226
def _summary_fallback(item):
    """Return a one-element list with a Document built from the item's summary."""
    return [Document(text=item["summary"], metadata=item)]


def _load_pdf_documents(pdf_bytes, metadata):
    """Parse in-memory PDF bytes into Documents by way of a temporary file.

    PyMuPDFReader takes a filesystem path (str/Path), a boolean ``metadata``
    flag, and caller-supplied metadata through ``extra_info``; it cannot read
    from a BytesIO object, so the bytes are spooled to disk first.
    """
    import tempfile  # local import: only the PDF path needs it

    tmp = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
    try:
        # Close the handle before parsing so the reader can reopen the path.
        with tmp:
            tmp.write(pdf_bytes)
        # Pass a copy: the reader may add its own keys to extra_info.
        return PyMuPDFReader().load_data(
            file_path=tmp.name, metadata=True, extra_info=dict(metadata)
        )
    finally:
        try:
            os.unlink(tmp.name)
        except OSError as e:
            print(f"Could not remove temporary file {tmp.name}: {e}")


def _download_images(soup, page_url, source_name, headers):
    """Best-effort download of every <img> on the page into IMAGE_DIR as PNG."""
    for img_tag in soup.find_all('img'):
        img_src = img_tag.get('src')
        if not img_src:
            continue
        img_url = urljoin(page_url, img_src)
        try:
            img_response = requests.get(img_url, headers=headers, timeout=10)
            img_response.raise_for_status()
            # The current file count in IMAGE_DIR serves as a running index.
            img_filename = os.path.join(IMAGE_DIR, f"{source_name}_{len(os.listdir(IMAGE_DIR))}.png")
            Image.open(io.BytesIO(img_response.content)).convert("RGB").save(img_filename)
        except Exception as e:
            # Deliberately broad: a single bad image (network error, unsupported
            # format, ...) must not abort processing of the page.
            print(f" ❌ Could not download image {img_url}: {e}")


def scrape_and_process_url(item):
    """Scrapes a URL, extracts text, and downloads images.

    Args:
        item: A source record (dict) with at least "title", "summary",
            "category" and "source"; "url" and "status" are optional.

    Returns:
        A list of Document objects. PDF responses are parsed by
        PyMuPDFReader; any other response is treated as HTML, yielding one
        Document with the page text, and its images are saved to IMAGE_DIR
        as a side effect. If the URL is missing, flagged "link broken", or
        the request fails, the list holds a single Document built from the
        item's summary.
    """
    url = item.get("url")
    if not url or item.get("status") == "link broken":
        print(f"⚠️ Skipping broken or missing URL for: {item['title']}")
        return _summary_fallback(item)

    print(f"⚙️ Processing URL: {url}")
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        response = requests.get(url, headers=headers, timeout=20)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"❌ Failed to scrape {url}: {e}")
        return _summary_fallback(item)

    content_type = response.headers.get("content-type", "").lower()
    metadata = {"source_url": url, "title": item["title"], "category": item["category"]}

    if "pdf" in content_type:
        return list(_load_pdf_documents(response.content, metadata))

    soup = BeautifulSoup(response.content, 'html.parser')
    text_content = soup.get_text(separator='\n', strip=True)
    documents = [Document(text=text_content, metadata=metadata)]
    _download_images(soup, url, item['source'], headers)
    return documents
263
+
264
# 3. Execution: Scrape, Build, and Save
# ---
# Top-level pipeline: configure the models, scrape every entry of source_data,
# add documents derived from the downloaded images, build a KnowledgeGraphIndex
# and persist it under STORAGE_DIR.
print("--- Starting Data Pre-processing ---")
# Configure LlamaIndex Settings
# NOTE(review): Settings.llm is assigned a multi-modal model object; confirm
# that KnowledgeGraphIndex accepts it where a text LLM is expected.
Settings.llm = OpenAIMultiModal(model="gpt-4o", max_new_tokens=1000)
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")

# Scrape all text
# scrape_and_process_url returns a list per source, hence extend().
all_documents = []
for item in source_data:
    all_documents.extend(scrape_and_process_url(item))

# Process all images and generate descriptions
# NOTE(review): confirm that the installed ImageReader accepts a `text_parser`
# keyword and that load_data() can be given a directory rather than a single
# image file — TODO verify against the llama-index version in use.
image_documents = ImageReader(text_parser=Settings.llm).load_data(IMAGE_DIR)
all_documents.extend(image_documents)
print(f"\nβœ… Total documents loaded: {len(all_documents)}")

# Build the Knowledge Graph Index
print("\n--- Building Knowledge Graph ---")
graph_store = SimpleGraphStore()
storage_context = StorageContext.from_defaults(graph_store=graph_store)
index = KnowledgeGraphIndex.from_documents(
    documents=all_documents,
    storage_context=storage_context,
    max_triplets_per_chunk=5,
    include_embeddings=True,
    show_progress=True,
)

# Persist the index to disk
# NOTE(review): the message below points to "query.py"; the query script in
# this commit is named query_graph.py and reads graph.gml — confirm how the
# persisted storage directory is meant to be consumed.
index.storage_context.persist(persist_dir=STORAGE_DIR)
print(f"\nβœ…βœ…βœ… Graph built and saved to disk at '{STORAGE_DIR}'! You can now run query.py. βœ…βœ…βœ…")
query_graph.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import numpy as np
3
+ from dotenv import load_dotenv
4
+ from openai import OpenAI
5
+ import networkx as nx
6
+ from sklearn.metrics.pairwise import cosine_similarity
7
+
8
# Initialize OpenAI client
# Environment values are loaded via python-dotenv (called with override=True)
# before the API key is read.
load_dotenv(override=True)
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Load graph from GML
# enodes fixes the node order; row i of `embeddings` belongs to enodes[i].
G = nx.read_gml("graph.gml")
enodes = list(G)
embeddings = np.array([attrs['embedding'] for _, attrs in G.nodes(data=True)])
16
+
17
def query_graph(question, top_k=5):
    """Answer a question from the most similar graph nodes.

    The question is embedded with OpenAI, compared by cosine similarity
    against the module-level ``embeddings`` matrix, and the text of the
    ``top_k`` best-matching nodes is passed to the chat model as context.

    Args:
        question: The user's question as a string.
        top_k: Maximum number of nodes to use as context (default 5).

    Returns:
        A ``(answer, sources)`` tuple: the chat model's reply text and the
        de-duplicated 'source' attributes of the selected nodes, ordered
        from most to least similar.
    """
    # Embed question
    # NOTE(review): assumes the node embeddings stored in the graph were
    # produced with this same embedding model — TODO confirm.
    emb_resp = client.embeddings.create(
        model="text-embedding-3-large",
        input=question
    )
    q_vec = emb_resp.data[0].embedding
    sims = cosine_similarity([q_vec], embeddings)[0]
    # argsort is ascending; reverse it so the best matches come first.
    idxs = sims.argsort()[::-1][:top_k]

    # Gather context and sources
    top_nodes = [G.nodes[enodes[i]] for i in idxs]
    context = [node['text'] for node in top_nodes]
    # dict.fromkeys removes duplicates while keeping relevance order; a plain
    # set would return the sources in arbitrary order.
    sources = list(dict.fromkeys(node['source'] for node in top_nodes))

    # Generate answer
    prompt = (
        "Use the following context to answer the question:\n\n"
        + "\n\n---\n\n".join(context)
        + f"\n\nQuestion: {question}\nAnswer:")
    chat_resp = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant for XR safety training."},
            {"role": "user", "content": prompt}
        ]
    )
    answer = chat_resp.choices[0].message.content
    return answer, sources
45
+
46
+
47
# Test queries
# Guarded so the smoke test runs only when this file is executed directly.
# Without the guard, importing query_graph (as app.py does) would issue these
# embedding and chat requests on every import.
if __name__ == "__main__":
    test_questions = [
        "What are general machine guarding requirements?",
        "Explain the key steps in lockout/tagout procedures."
    ]

    for q in test_questions:
        ans, srcs = query_graph(q)
        print(f"Q: {q}\nA: {ans}\nSources: {srcs}\n")