Spaces:
Paused
Paused
sight chat app v0.0.2
Browse files
- Improved the graph RAG (converted all the CFR codes to PDF, since we were being blocked when fetching them as HTML)
- Improved the display of sources
- Reversed the chat history display so that the most recent message appears on top (and added a line break to separate the different messages)
- app.py +29 -21
- graph.gml +0 -0
- preprocess.py +85 -192
- query_graph.py +102 -17
app.py
CHANGED
|
@@ -1,10 +1,33 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
from query_graph import query_graph
|
| 3 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
# Sidebar configuration
|
| 5 |
st.sidebar.title("About")
|
| 6 |
st.sidebar.markdown("**Authors:** [The SIGHT Project Team](https://sites.miamioh.edu/sight/)")
|
| 7 |
-
st.sidebar.markdown("**Version:** V. 0.0.
|
| 8 |
st.sidebar.markdown("**Date:** July 24, 2025")
|
| 9 |
st.sidebar.markdown("**Model:** gpt4o")
|
| 10 |
|
|
@@ -13,11 +36,6 @@ st.sidebar.markdown(
|
|
| 13 |
"**Funding:** SIGHT is funded by [OHBWC WSIC](https://info.bwc.ohio.gov/for-employers/safety-services/workplace-safety-innovation-center/wsic-overview)"
|
| 14 |
)
|
| 15 |
|
| 16 |
-
# References toggle in sidebar
|
| 17 |
-
st.sidebar.markdown("---")
|
| 18 |
-
show_refs = st.sidebar.checkbox("Show references")
|
| 19 |
-
|
| 20 |
-
|
| 21 |
|
| 22 |
# Main interface
|
| 23 |
st.set_page_config(page_title="Miami University's SIGHT Chatbot")
|
|
@@ -39,30 +57,20 @@ if 'history' not in st.session_state:
|
|
| 39 |
# User input
|
| 40 |
query = st.text_input("Your question:")
|
| 41 |
if st.button("Send") and query:
|
| 42 |
-
answer, sources = query_graph(query)
|
| 43 |
st.session_state.history.append({
|
| 44 |
'query': query,
|
| 45 |
'answer': answer,
|
| 46 |
-
'sources': sources
|
|
|
|
| 47 |
})
|
| 48 |
|
| 49 |
# Display chat history
|
| 50 |
-
for
|
| 51 |
st.markdown(f"**You:** {entry['query']}")
|
| 52 |
st.markdown(f"**Assistant:** {entry['answer']}")
|
| 53 |
-
|
| 54 |
-
with st.expander("Sources used", expanded=False):
|
| 55 |
-
for src in entry['sources']:
|
| 56 |
-
st.markdown(f"- {src}")
|
| 57 |
|
| 58 |
-
# Optionally show references list
|
| 59 |
-
if show_refs:
|
| 60 |
-
refs = [
|
| 61 |
-
"29 CFR 1910.211", "29 CFR 1910.212", "29 CFR 1910.213", "29 CFR 1910.215",
|
| 62 |
-
"OSHA 3170", "OSHA 3120", "NIOSH WP Solutions 2011-156", "NIOSH Robotics (2024)"
|
| 63 |
-
]
|
| 64 |
-
for r in refs:
|
| 65 |
-
st.sidebar.markdown(f"- {r}")
|
| 66 |
|
| 67 |
# Footer
|
| 68 |
st.markdown("---")
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
from query_graph import query_graph
|
| 3 |
|
| 4 |
+
# Helper for <details>
|
| 5 |
+
def format_citations_html(chunks):
|
| 6 |
+
html = []
|
| 7 |
+
for idx, (hdr, sc, txt, citation) in enumerate(chunks, start=1):
|
| 8 |
+
preamble = (
|
| 9 |
+
f"<p style='font-size:0.9em;'><strong>Preamble:</strong> "
|
| 10 |
+
f"The text in the following detail is reproduced from [{citation}]. "
|
| 11 |
+
f"It had a cosine similarity of {sc:.2f} with the user question, "
|
| 12 |
+
f"and it ranked {idx} among the text chunks in our graph database.</p>"
|
| 13 |
+
)
|
| 14 |
+
body = txt.replace("\n", "<br>")
|
| 15 |
+
html.append(
|
| 16 |
+
f"<details>"
|
| 17 |
+
f"<summary>{hdr} (cosine similarity: {sc:.2f})</summary>"
|
| 18 |
+
f"<div style='font-size:0.9em; margin-top:0.5em;'>"
|
| 19 |
+
f"<strong>Preamble:</strong> The text below is reproduced from {citation}. "
|
| 20 |
+
f"</div>"
|
| 21 |
+
f"<div style='font-size:0.7em; margin-left:1em; margin-top:0.5em;'>{body}</div>"
|
| 22 |
+
f"</details><br><br>"
|
| 23 |
+
)
|
| 24 |
+
return "<br>".join(html)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
# Sidebar configuration
|
| 28 |
st.sidebar.title("About")
|
| 29 |
st.sidebar.markdown("**Authors:** [The SIGHT Project Team](https://sites.miamioh.edu/sight/)")
|
| 30 |
+
st.sidebar.markdown("**Version:** V. 0.0.2")
|
| 31 |
st.sidebar.markdown("**Date:** July 24, 2025")
|
| 32 |
st.sidebar.markdown("**Model:** gpt4o")
|
| 33 |
|
|
|
|
| 36 |
"**Funding:** SIGHT is funded by [OHBWC WSIC](https://info.bwc.ohio.gov/for-employers/safety-services/workplace-safety-innovation-center/wsic-overview)"
|
| 37 |
)
|
| 38 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
# Main interface
|
| 41 |
st.set_page_config(page_title="Miami University's SIGHT Chatbot")
|
|
|
|
| 57 |
# User input
|
| 58 |
query = st.text_input("Your question:")
|
| 59 |
if st.button("Send") and query:
|
| 60 |
+
answer, sources, chunks = query_graph(query)
|
| 61 |
st.session_state.history.append({
|
| 62 |
'query': query,
|
| 63 |
'answer': answer,
|
| 64 |
+
'sources': sources,
|
| 65 |
+
'chunks': chunks
|
| 66 |
})
|
| 67 |
|
| 68 |
# Display chat history
|
| 69 |
+
for entry in st.session_state.history[::-1]:
|
| 70 |
st.markdown(f"**You:** {entry['query']}")
|
| 71 |
st.markdown(f"**Assistant:** {entry['answer']}")
|
| 72 |
+
st.markdown(format_citations_html(entry['chunks']), unsafe_allow_html=True)
|
|
|
|
|
|
|
|
|
|
| 73 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
|
| 75 |
# Footer
|
| 76 |
st.markdown("---")
|
graph.gml
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
preprocess.py
CHANGED
|
@@ -1,192 +1,85 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import re
|
| 3 |
-
import glob
|
| 4 |
-
from dotenv import load_dotenv
|
| 5 |
-
import requests
|
| 6 |
-
from bs4 import BeautifulSoup
|
| 7 |
-
import pandas as pd
|
| 8 |
-
import pymupdf4llm
|
| 9 |
-
import networkx as nx
|
| 10 |
-
from openai import OpenAI
|
| 11 |
-
|
| 12 |
-
# Load environment and initialize OpenAI client
|
| 13 |
-
load_dotenv(override=True)
|
| 14 |
-
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
|
| 15 |
-
|
| 16 |
-
# Helper
|
| 17 |
-
def split_by_header(md_text):
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
"summary": "Establishes broad, fundamental machine safeguarding requirements (e.g. guards at points of operation, secure attachment of guards, protection from flying debris) to protect workers from hazards like nip points, rotating parts, flying chips, and sparks.",
|
| 87 |
-
"format": "HTML"
|
| 88 |
-
},
|
| 89 |
-
{
|
| 90 |
-
"title": "29 CFR 1910.213 – Woodworking Machinery Requirements",
|
| 91 |
-
"url": "https://www.ecfr.gov/current/title-29/section-1910.213",
|
| 92 |
-
"source": "OSHA",
|
| 93 |
-
"year": 2025,
|
| 94 |
-
"category": "Regulation",
|
| 95 |
-
"summary": "Sets specific safety requirements for woodworking equipment (such as saws, jointers, planers, and sanders), including guarding of blades, hoods, push sticks, and other protective measures to prevent cuts, amputations, and kickback injuries in woodworking operations.",
|
| 96 |
-
"format": "HTML"
|
| 97 |
-
},
|
| 98 |
-
{
|
| 99 |
-
"title": "29 CFR 1910.215 – Abrasive Wheel Machinery",
|
| 100 |
-
"url": "https://www.ecfr.gov/current/title-29/section-1910.215",
|
| 101 |
-
"source": "OSHA",
|
| 102 |
-
"year": 2025,
|
| 103 |
-
"category": "Regulation",
|
| 104 |
-
"summary": "Covers guarding and safety precautions for machines with abrasive wheels (grinders and cut-off machines), requiring wheel enclosures, work rests, tongue guards, and inspections to prevent wheel shattering, sparks, and operator contact with moving abrasive parts.",
|
| 105 |
-
"format": "HTML"
|
| 106 |
-
},
|
| 107 |
-
{
|
| 108 |
-
"title": "29 CFR 1910.216 – Mills and Calenders in the Rubber and Plastics Industries",
|
| 109 |
-
"url": "https://www.ecfr.gov/current/title-29/section-1910.216",
|
| 110 |
-
"source": "OSHA",
|
| 111 |
-
"year": 2025,
|
| 112 |
-
"category": "Regulation",
|
| 113 |
-
"summary": "Specifies safeguarding for two-roll mills, calenders, and similar processing machines in rubber/plastics manufacturing – including required safety trip controls, emergency stopping devices, and barrier guards – to protect workers from being caught in rollers or nip points.",
|
| 114 |
-
"format": "HTML"
|
| 115 |
-
},
|
| 116 |
-
{
|
| 117 |
-
"title": "29 CFR 1910.217 – Mechanical Power Presses",
|
| 118 |
-
"url": "https://www.ecfr.gov/current/title-29/section-1910.217",
|
| 119 |
-
"source": "OSHA",
|
| 120 |
-
"year": 2025,
|
| 121 |
-
"category": "Regulation",
|
| 122 |
-
"summary": "Detailed standard for mechanical power presses (e.g. stamping presses) mandating guarding of points of operation, use of devices like two-hand controls or presence-sensing systems, inspection and maintenance requirements, and training – aimed at preventing severe crushing, amputation, or die-punch injuries.",
|
| 123 |
-
"format": "HTML"
|
| 124 |
-
},
|
| 125 |
-
{
|
| 126 |
-
"title": "29 CFR 1910.218 – Forging Machines",
|
| 127 |
-
"url": "https://www.ecfr.gov/current/title-29/section-1910.218",
|
| 128 |
-
"source": "OSHA",
|
| 129 |
-
"year": 2025,
|
| 130 |
-
"category": "Regulation",
|
| 131 |
-
"summary": "Covers safety requirements for forging machinery (such as hammers, presses, upsetters, and boltheaders), including provisions for guarding dies and rams, handling hot metal safely, and use of tongs or mechanical loaders – all intended to prevent struck-by, caught-in, and burn injuries in forge operations.",
|
| 132 |
-
"format": "HTML"
|
| 133 |
-
},
|
| 134 |
-
{
|
| 135 |
-
"title": "29 CFR 1910.219 – Mechanical Power-Transmission Apparatus",
|
| 136 |
-
"url": "https://www.ecfr.gov/current/title-29/section-1910.219",
|
| 137 |
-
"source": "OSHA",
|
| 138 |
-
"year": 2025,
|
| 139 |
-
"category": "Regulation",
|
| 140 |
-
"summary": "Requires guards for all exposed belts, pulleys, chains, gears, flywheels, couplings, and other power-transmission parts on machinery. This standard ensures that rotating or moving drivetrain components are enclosed to prevent employees from getting caught in or struck by these parts.",
|
| 141 |
-
"format": "HTML"
|
| 142 |
-
},
|
| 143 |
-
{
|
| 144 |
-
"title": "29 CFR 1910.147 – The Control of Hazardous Energy (Lockout/Tagout)",
|
| 145 |
-
"url": "https://www.ecfr.gov/current/title-29/section-1910.147",
|
| 146 |
-
"source": "OSHA",
|
| 147 |
-
"year": 2025,
|
| 148 |
-
"category": "Regulation",
|
| 149 |
-
"summary": "OSHA’s Lockout/Tagout standard, which mandates that dangerous machinery must be de-energized and locked out (or tagged out) during maintenance or servicing. It details the required energy control procedures, employee training, and periodic inspections to ensure that workers are protected from the release of stored energy or accidental machine start-up (a major cause of caught-in/between and amputation incidents).",
|
| 150 |
-
"format": "HTML"
|
| 151 |
-
},
|
| 152 |
-
{
|
| 153 |
-
"title": "29 CFR 1910.178 – Powered Industrial Trucks (Forklifts)",
|
| 154 |
-
"url": "https://www.ecfr.gov/current/title-29/section-1910.178",
|
| 155 |
-
"source": "OSHA",
|
| 156 |
-
"year": 2025,
|
| 157 |
-
"category": "Regulation",
|
| 158 |
-
"summary": "The OSHA standard governing the design, maintenance, and safe operation of forklifts and other powered industrial trucks. It covers operator training and certification requirements, inspection and maintenance of equipment, safe fueling/charging, and operating rules (like speed limits, handling loads, and avoiding hazards) – all aimed at preventing tip-overs, collisions, and struck-by or crushed-by accidents involving these vehicles.",
|
| 159 |
-
"format": "HTML"
|
| 160 |
-
},
|
| 161 |
-
{
|
| 162 |
-
"title": "NIOSH Robotics in the Workplace – Safety Overview (Human-Robot Collaboration)",
|
| 163 |
-
"url": "https://www.cdc.gov/niosh/robotics/about/",
|
| 164 |
-
"source": "NIOSH",
|
| 165 |
-
"year": 2024,
|
| 166 |
-
"category": "Technical Guide",
|
| 167 |
-
"summary": "A NIOSH overview of emerging safety challenges as robots increasingly collaborate with human workers. Updated in 2024, this page discusses how robots can improve safety by taking over dangerous tasks but also introduces new struck-by and caught-between hazards. It emphasizes the need for updated safety standards, risk assessments, and research on human-robot interaction, and it outlines NIOSH’s efforts (through its Center for Occupational Robotics Research) to develop best practices and guidance for safe integration of robotics in industry.",
|
| 168 |
-
"format": "HTML"
|
| 169 |
-
}
|
| 170 |
-
]
|
| 171 |
-
|
| 172 |
-
# Process HTML sources
|
| 173 |
-
def process_html(item):
|
| 174 |
-
resp = requests.get(item['url'])
|
| 175 |
-
resp.raise_for_status()
|
| 176 |
-
soup = BeautifulSoup(resp.text, 'html.parser')
|
| 177 |
-
texts = [p.get_text() for p in soup.find_all('p')]
|
| 178 |
-
tables = [pd.read_html(str(t))[0].to_markdown() for t in soup.find_all('table')]
|
| 179 |
-
# Join paragraphs and tables with blank lines
|
| 180 |
-
full = "\n\n".join(texts + tables)
|
| 181 |
-
resp_emb = client.embeddings.create(model="text-embedding-3-large", input=full)
|
| 182 |
-
vec = resp_emb.data[0].embedding
|
| 183 |
-
node_id = f"HTML::{item['title']}"
|
| 184 |
-
G.add_node(node_id, text=full, embedding=vec, source=item['title'])
|
| 185 |
-
|
| 186 |
-
# Run HTML processing
|
| 187 |
-
for item in html_data:
|
| 188 |
-
process_html(item)
|
| 189 |
-
|
| 190 |
-
# Save graph
|
| 191 |
-
nx.write_gml(G, "graph.gml")
|
| 192 |
-
print("Graph RAG database created: graph.gml")
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import re
|
| 3 |
+
import glob
|
| 4 |
+
from dotenv import load_dotenv
|
| 5 |
+
import requests
|
| 6 |
+
from bs4 import BeautifulSoup
|
| 7 |
+
import pandas as pd
|
| 8 |
+
import pymupdf4llm
|
| 9 |
+
import networkx as nx
|
| 10 |
+
from openai import OpenAI
|
| 11 |
+
|
| 12 |
+
# Load environment and initialize OpenAI client
|
| 13 |
+
load_dotenv(override=True)
|
| 14 |
+
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
|
| 15 |
+
|
| 16 |
+
# Helper: split Markdown text by third-level headers
|
| 17 |
+
def split_by_header(md_text):
|
| 18 |
+
parts = re.split(r'(?m)^### ', md_text)
|
| 19 |
+
return [('### ' + p) if not p.startswith('### ') else p for p in parts if p.strip()]
|
| 20 |
+
|
| 21 |
+
# Initialize graph database
|
| 22 |
+
G = nx.Graph()
|
| 23 |
+
|
| 24 |
+
# Process local PDFs
|
| 25 |
+
for pdf_path in glob.glob("scrapped_data/*.pdf"):
|
| 26 |
+
filename = os.path.basename(pdf_path)
|
| 27 |
+
title = os.path.splitext(filename)[0]
|
| 28 |
+
# Convert PDF to Markdown
|
| 29 |
+
md_text = pymupdf4llm.to_markdown(pdf_path)
|
| 30 |
+
# Split into sections
|
| 31 |
+
sections = split_by_header(md_text)
|
| 32 |
+
for idx, sec in enumerate(sections):
|
| 33 |
+
resp = client.embeddings.create(model="text-embedding-3-large", input=sec)
|
| 34 |
+
vector = resp.data[0].embedding
|
| 35 |
+
node_id = f"PDF::{title}::section{idx}"
|
| 36 |
+
# Store the local file path for citation
|
| 37 |
+
G.add_node(node_id,
|
| 38 |
+
text=sec,
|
| 39 |
+
embedding=vector,
|
| 40 |
+
source=title,
|
| 41 |
+
path=pdf_path)
|
| 42 |
+
|
| 43 |
+
# HTML Document List
|
| 44 |
+
html_data = [
|
| 45 |
+
{
|
| 46 |
+
"title": "NIOSH Robotics in the Workplace – Safety Overview (Human-Robot Collaboration)",
|
| 47 |
+
"url": "https://www.cdc.gov/niosh/robotics/about/",
|
| 48 |
+
"source": "NIOSH",
|
| 49 |
+
"year": 2024,
|
| 50 |
+
"category": "Technical Guide",
|
| 51 |
+
"summary": "A NIOSH overview of emerging safety challenges as robots increasingly collaborate with human workers. Updated in 2024, this page discusses how robots can improve safety by taking over dangerous tasks but also introduces new struck-by and caught-between hazards. It emphasizes the need for updated safety standards, risk assessments, and research on human-robot interaction, and it outlines NIOSH’s efforts (through its Center for Occupational Robotics Research) to develop best practices and guidance for safe integration of robotics in industry.",
|
| 52 |
+
"format": "HTML"
|
| 53 |
+
}
|
| 54 |
+
]
|
| 55 |
+
|
| 56 |
+
# Process HTML sources
|
| 57 |
+
def process_html(item):
|
| 58 |
+
resp = requests.get(item['url'])
|
| 59 |
+
resp.raise_for_status()
|
| 60 |
+
soup = BeautifulSoup(resp.text, 'html.parser')
|
| 61 |
+
# Extract paragraph texts
|
| 62 |
+
texts = [p.get_text() for p in soup.find_all('p')]
|
| 63 |
+
# Extract tables as markdown
|
| 64 |
+
tables = []
|
| 65 |
+
for t in soup.find_all('table'):
|
| 66 |
+
df = pd.read_html(str(t))[0]
|
| 67 |
+
tables.append(df.to_markdown())
|
| 68 |
+
# Join paragraphs and tables with double newlines
|
| 69 |
+
full = "\n\n".join(texts + tables)
|
| 70 |
+
# Embed the combined text
|
| 71 |
+
resp_emb = client.embeddings.create(model="text-embedding-3-large", input=full)
|
| 72 |
+
vec = resp_emb.data[0].embedding
|
| 73 |
+
node_id = f"HTML::{item['title']}"
|
| 74 |
+
# Add node with URL citation
|
| 75 |
+
G.add_node(
|
| 76 |
+
node_id, text=full, embedding=vec, source=item['title'], url=item['url']
|
| 77 |
+
)
|
| 78 |
+
|
| 79 |
+
# Run HTML processing
|
| 80 |
+
for item in html_data:
|
| 81 |
+
process_html(item)
|
| 82 |
+
|
| 83 |
+
# Save graph
|
| 84 |
+
nx.write_gml(G, "graph.gml")
|
| 85 |
+
print("Graph RAG database created: graph.gml")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
query_graph.py
CHANGED
|
@@ -15,41 +15,126 @@ enodes = list(G.nodes)
|
|
| 15 |
embeddings = np.array([G.nodes[n]['embedding'] for n in enodes])
|
| 16 |
|
| 17 |
def query_graph(question, top_k=5):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
# Embed question
|
| 19 |
emb_resp = client.embeddings.create(
|
| 20 |
model="text-embedding-3-large",
|
| 21 |
input=question
|
| 22 |
)
|
| 23 |
q_vec = emb_resp.data[0].embedding
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
sims = cosine_similarity([q_vec], embeddings)[0]
|
| 25 |
idxs = sims.argsort()[::-1][:top_k]
|
| 26 |
|
| 27 |
-
# Gather
|
| 28 |
-
|
| 29 |
-
sources =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
-
#
|
|
|
|
| 32 |
prompt = (
|
| 33 |
"Use the following context to answer the question:\n\n"
|
| 34 |
-
+
|
| 35 |
-
+ f"\n\nQuestion: {question}\nAnswer:"
|
|
|
|
|
|
|
|
|
|
| 36 |
chat_resp = client.chat.completions.create(
|
| 37 |
model="gpt-4o-mini",
|
| 38 |
messages=[
|
| 39 |
-
{"role": "system", "content": "You are a helpful assistant for
|
| 40 |
-
{"role": "user",
|
| 41 |
]
|
| 42 |
)
|
| 43 |
answer = chat_resp.choices[0].message.content
|
| 44 |
-
|
|
|
|
| 45 |
|
| 46 |
|
| 47 |
# Test queries
|
| 48 |
-
test_questions = [
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
]
|
| 52 |
-
|
| 53 |
-
for q in test_questions:
|
| 54 |
-
|
| 55 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
embeddings = np.array([G.nodes[n]['embedding'] for n in enodes])
|
| 16 |
|
| 17 |
def query_graph(question, top_k=5):
|
| 18 |
+
"""
|
| 19 |
+
Embed the question, retrieve the top_k relevant chunks,
|
| 20 |
+
and return: (answer, sources, chunks)
|
| 21 |
+
- answer: generated response string
|
| 22 |
+
- sources: list of unique source names
|
| 23 |
+
- chunks: list of tuples (header, score, full_text, source_url_or_path)
|
| 24 |
+
"""
|
| 25 |
# Embed question
|
| 26 |
emb_resp = client.embeddings.create(
|
| 27 |
model="text-embedding-3-large",
|
| 28 |
input=question
|
| 29 |
)
|
| 30 |
q_vec = emb_resp.data[0].embedding
|
| 31 |
+
|
| 32 |
+
# Compute cosine similarities
|
| 33 |
+
sims = cosine_similarity([q_vec], embeddings)[0]
|
| 34 |
+
idxs = sims.argsort()[::-1][:top_k]
|
| 35 |
+
|
| 36 |
+
# Collect chunk-level info
|
| 37 |
+
chunks = []
|
| 38 |
+
sources = []
|
| 39 |
+
for rank, i in enumerate(idxs, start=1):
|
| 40 |
+
node = enodes[i]
|
| 41 |
+
text = G.nodes[node]['text']
|
| 42 |
+
header = text.split('\n', 1)[0].lstrip('# ').strip()
|
| 43 |
+
score = sims[i]
|
| 44 |
+
# Determine citation (URL for HTML, path for PDF)
|
| 45 |
+
citation = G.nodes[node].get('url') or G.nodes[node].get('path') or G.nodes[node]['source']
|
| 46 |
+
chunks.append((header, score, text, citation))
|
| 47 |
+
sources.append(G.nodes[node]['source'])
|
| 48 |
+
# Deduplicate sources
|
| 49 |
+
sources = list(dict.fromkeys(sources))
|
| 50 |
+
|
| 51 |
+
# Assemble prompt
|
| 52 |
+
context = "\n\n---\n\n".join([c[2] for c in chunks])
|
| 53 |
+
prompt = (
|
| 54 |
+
"Use the following context to answer the question:\n\n" +
|
| 55 |
+
context +
|
| 56 |
+
f"\n\nQuestion: {question}\nAnswer:"
|
| 57 |
+
)
|
| 58 |
+
|
| 59 |
+
# Query chat model
|
| 60 |
+
chat_resp = client.chat.completions.create(
|
| 61 |
+
model="gpt-4o-mini",
|
| 62 |
+
messages=[
|
| 63 |
+
{"role": "system", "content": "You are a helpful assistant for manufacturing equipment safety."},
|
| 64 |
+
{"role": "user", "content": prompt}
|
| 65 |
+
]
|
| 66 |
+
)
|
| 67 |
+
answer = chat_resp.choices[0].message.content
|
| 68 |
+
|
| 69 |
+
return answer, sources, chunks
|
| 70 |
+
|
| 71 |
+
"""
|
| 72 |
+
Embed the user question, retrieve the top_k relevant chunks from the graph,
|
| 73 |
+
assemble a prompt with those chunks, call the chat model, and return:
|
| 74 |
+
- answer: the generated response
|
| 75 |
+
- sources: unique list of source documents
|
| 76 |
+
- chunks: list of (header, score, full_text) for the top_k passages
|
| 77 |
+
"""
|
| 78 |
+
# Embed the question
|
| 79 |
+
emb_resp = client.embeddings.create(
|
| 80 |
+
model="text-embedding-3-large",
|
| 81 |
+
input=question
|
| 82 |
+
)
|
| 83 |
+
q_vec = emb_resp.data[0].embedding
|
| 84 |
+
|
| 85 |
+
# Compute similarities against all stored embeddings
|
| 86 |
sims = cosine_similarity([q_vec], embeddings)[0]
|
| 87 |
idxs = sims.argsort()[::-1][:top_k]
|
| 88 |
|
| 89 |
+
# Gather chunk‑level info and sources
|
| 90 |
+
chunks = []
|
| 91 |
+
sources = []
|
| 92 |
+
for i in idxs:
|
| 93 |
+
node = enodes[i]
|
| 94 |
+
text = G.nodes[node]['text']
|
| 95 |
+
# Use the first line as the header
|
| 96 |
+
header = text.split('\n', 1)[0].lstrip('# ').strip()
|
| 97 |
+
score = sims[i]
|
| 98 |
+
chunks.append((header, score, text))
|
| 99 |
+
sources.append(G.nodes[node]['source'])
|
| 100 |
+
# Deduplicate sources while preserving order
|
| 101 |
+
sources = list(dict.fromkeys(sources))
|
| 102 |
|
| 103 |
+
# Assemble the prompt from the chunk texts
|
| 104 |
+
context_text = "\n\n---\n\n".join([chunk[2] for chunk in chunks])
|
| 105 |
prompt = (
|
| 106 |
"Use the following context to answer the question:\n\n"
|
| 107 |
+
+ context_text
|
| 108 |
+
+ f"\n\nQuestion: {question}\nAnswer:"
|
| 109 |
+
)
|
| 110 |
+
|
| 111 |
+
# Call the chat model
|
| 112 |
chat_resp = client.chat.completions.create(
|
| 113 |
model="gpt-4o-mini",
|
| 114 |
messages=[
|
| 115 |
+
{"role": "system", "content": "You are a helpful assistant for manufacturing equipment safety."},
|
| 116 |
+
{"role": "user", "content": prompt}
|
| 117 |
]
|
| 118 |
)
|
| 119 |
answer = chat_resp.choices[0].message.content
|
| 120 |
+
|
| 121 |
+
return answer, sources, chunks
|
| 122 |
|
| 123 |
|
| 124 |
# Test queries
|
| 125 |
+
# test_questions = [
|
| 126 |
+
# "What are general machine guarding requirements?",
|
| 127 |
+
# "Explain the key steps in lockout/tagout procedures."
|
| 128 |
+
# ]
|
| 129 |
+
|
| 130 |
+
# for q in test_questions:
|
| 131 |
+
# answer, sources, chunks = query_graph(q)
|
| 132 |
+
# print(f"Q: {q}")
|
| 133 |
+
# print(f"Answer: {answer}\n")
|
| 134 |
+
# print("Sources:")
|
| 135 |
+
# for src in sources:
|
| 136 |
+
# print(f"- {src}")
|
| 137 |
+
# print("\nTop Chunks:")
|
| 138 |
+
# for header, score, _, citation in chunks:
|
| 139 |
+
# print(f" * {header} (score: {score:.2f}) from {citation}")
|
| 140 |
+
# print("\n", "#"*40, "\n")
|