fmegahed commited on
Commit
01c0ebb
·
verified ·
1 Parent(s): 982f6ef

sight chat app v0.0.2

Browse files

- Improved the graph RAG (converted all the CFR code sources to PDFs, since our HTML requests to the eCFR site were being blocked)
- Improved the display of sources
- Reversed Chat History display to ensure that the most recent is on top (and added a line break to separate the different messages)

Files changed (4) hide show
  1. app.py +29 -21
  2. graph.gml +0 -0
  3. preprocess.py +85 -192
  4. query_graph.py +102 -17
app.py CHANGED
@@ -1,10 +1,33 @@
1
  import streamlit as st
2
  from query_graph import query_graph
3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  # Sidebar configuration
5
  st.sidebar.title("About")
6
  st.sidebar.markdown("**Authors:** [The SIGHT Project Team](https://sites.miamioh.edu/sight/)")
7
- st.sidebar.markdown("**Version:** V. 0.0.1")
8
  st.sidebar.markdown("**Date:** July 24, 2025")
9
  st.sidebar.markdown("**Model:** gpt4o")
10
 
@@ -13,11 +36,6 @@ st.sidebar.markdown(
13
  "**Funding:** SIGHT is funded by [OHBWC WSIC](https://info.bwc.ohio.gov/for-employers/safety-services/workplace-safety-innovation-center/wsic-overview)"
14
  )
15
 
16
- # References toggle in sidebar
17
- st.sidebar.markdown("---")
18
- show_refs = st.sidebar.checkbox("Show references")
19
-
20
-
21
 
22
  # Main interface
23
  st.set_page_config(page_title="Miami University's SIGHT Chatbot")
@@ -39,30 +57,20 @@ if 'history' not in st.session_state:
39
  # User input
40
  query = st.text_input("Your question:")
41
  if st.button("Send") and query:
42
- answer, sources = query_graph(query)
43
  st.session_state.history.append({
44
  'query': query,
45
  'answer': answer,
46
- 'sources': sources
 
47
  })
48
 
49
  # Display chat history
50
- for i, entry in enumerate(st.session_state.history):
51
  st.markdown(f"**You:** {entry['query']}")
52
  st.markdown(f"**Assistant:** {entry['answer']}")
53
- # Explanations expander
54
- with st.expander("Sources used", expanded=False):
55
- for src in entry['sources']:
56
- st.markdown(f"- {src}")
57
 
58
- # Optionally show references list
59
- if show_refs:
60
- refs = [
61
- "29 CFR 1910.211", "29 CFR 1910.212", "29 CFR 1910.213", "29 CFR 1910.215",
62
- "OSHA 3170", "OSHA 3120", "NIOSH WP Solutions 2011-156", "NIOSH Robotics (2024)"
63
- ]
64
- for r in refs:
65
- st.sidebar.markdown(f"- {r}")
66
 
67
  # Footer
68
  st.markdown("---")
 
1
  import streamlit as st
2
  from query_graph import query_graph
3
 
4
# Helper: render retrieved chunks as collapsible HTML <details> blocks
def format_citations_html(chunks):
    """Render retrieved text chunks as an HTML string of <details> elements.

    Parameters
    ----------
    chunks : iterable of (header, score, text, citation) tuples
        header   -- chunk title shown in the <summary>
        score    -- cosine similarity with the user question (float)
        text     -- full chunk text (newlines are converted to <br>)
        citation -- URL or file path the chunk was reproduced from

    Returns
    -------
    str
        Concatenated <details> blocks joined with <br>; "" for no chunks.
    """
    html = []
    for hdr, sc, txt, citation in chunks:
        # Raw newlines do not render inside HTML, so convert them to <br>.
        body = txt.replace("\n", "<br>")
        html.append(
            f"<details>"
            f"<summary>{hdr} (cosine similarity: {sc:.2f})</summary>"
            f"<div style='font-size:0.9em; margin-top:0.5em;'>"
            f"<strong>Preamble:</strong> The text below is reproduced from {citation}. "
            f"</div>"
            f"<div style='font-size:0.7em; margin-left:1em; margin-top:0.5em;'>{body}</div>"
            f"</details><br><br>"
        )
    return "<br>".join(html)
25
+
26
+
27
  # Sidebar configuration
28
  st.sidebar.title("About")
29
  st.sidebar.markdown("**Authors:** [The SIGHT Project Team](https://sites.miamioh.edu/sight/)")
30
+ st.sidebar.markdown("**Version:** V. 0.0.2")
31
  st.sidebar.markdown("**Date:** July 24, 2025")
32
  st.sidebar.markdown("**Model:** gpt4o")
33
 
 
36
  "**Funding:** SIGHT is funded by [OHBWC WSIC](https://info.bwc.ohio.gov/for-employers/safety-services/workplace-safety-innovation-center/wsic-overview)"
37
  )
38
 
 
 
 
 
 
39
 
40
  # Main interface
41
  st.set_page_config(page_title="Miami University's SIGHT Chatbot")
 
57
  # User input
58
  query = st.text_input("Your question:")
59
  if st.button("Send") and query:
60
+ answer, sources, chunks = query_graph(query)
61
  st.session_state.history.append({
62
  'query': query,
63
  'answer': answer,
64
+ 'sources': sources,
65
+ 'chunks': chunks
66
  })
67
 
68
  # Display chat history
69
+ for entry in st.session_state.history[::-1]:
70
  st.markdown(f"**You:** {entry['query']}")
71
  st.markdown(f"**Assistant:** {entry['answer']}")
72
+ st.markdown(format_citations_html(entry['chunks']), unsafe_allow_html=True)
 
 
 
73
 
 
 
 
 
 
 
 
 
74
 
75
  # Footer
76
  st.markdown("---")
graph.gml CHANGED
The diff for this file is too large to render. See raw diff
 
preprocess.py CHANGED
@@ -1,192 +1,85 @@
1
- import os
2
- import re
3
- import glob
4
- from dotenv import load_dotenv
5
- import requests
6
- from bs4 import BeautifulSoup
7
- import pandas as pd
8
- import pymupdf4llm
9
- import networkx as nx
10
- from openai import OpenAI
11
-
12
- # Load environment and initialize OpenAI client
13
- load_dotenv(override=True)
14
- client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
15
-
16
- # Helper function to split PDF by second-level header
17
- def split_by_header(md_text):
18
- """
19
- 1) Split on third‑level headers (“### ).
20
- 2) For each part >7 500 tokens, try splitting on explicit page markers (“Page 1”, “Page 2”, …).
21
- 3) If no page markers are found, break into ~5 000‑token chunks.
22
- Returns a list of markdown fragments.
23
- """
24
- # 1) Initial split on '### ' headers
25
- parts = re.split(r'(?m)^### ', md_text)
26
- sections = [('### ' + p) if not p.startswith('### ') else p for p in parts if p.strip()]
27
-
28
- final_sections = []
29
- for sec in sections:
30
- tokens = sec.split()
31
- if len(tokens) > 7500:
32
- # 2) Try explicit “Page X” markers
33
- pages = re.split(r'(?m)^Page \d+', sec)
34
- if len(pages) > 1:
35
- for pg in pages:
36
- if pg.strip():
37
- final_sections.append(pg)
38
- continue
39
-
40
- # 3) Fallback: split into 5 000‑token chunks
41
- for i in range(0, len(tokens), 5000):
42
- chunk = ' '.join(tokens[i : i + 5000])
43
- final_sections.append(chunk)
44
- else:
45
- final_sections.append(sec)
46
-
47
- return final_sections
48
-
49
-
50
- # Initialize graph database
51
- G = nx.Graph()
52
-
53
-
54
- # Process local PDFs in gov_pdfs/
55
- for pdf_path in glob.glob("gov_pdfs/*.pdf"):
56
- filename = os.path.basename(pdf_path)
57
- title = os.path.splitext(filename)[0]
58
- # Convert PDF to Markdown
59
- md_text = pymupdf4llm.to_markdown(pdf_path)
60
- # Split into sections
61
- sections = split_by_header(md_text)
62
- # Embed and add nodes
63
- for idx, sec in enumerate(sections):
64
- resp = client.embeddings.create(model="text-embedding-3-large", input=sec)
65
- vector = resp.data[0].embedding
66
- node_id = f"PDF::{title}::section{idx}"
67
- G.add_node(node_id, text=sec, embedding=vector, source=title)
68
-
69
- # HTML Document List
70
- html_data = [
71
- {
72
- "title": "29 CFR 1910.211 – Definitions (Machinery and Machine Guarding)",
73
- "url": "https://www.ecfr.gov/current/title-29/section-1910.211",
74
- "source": "OSHA",
75
- "year": 2025,
76
- "category": "Regulation",
77
- "summary": "Provides definitions for terms used in OSHA’s machine guarding standards (Subpart O), laying the groundwork for understanding and applying the specific safeguarding requirements for machinery to prevent operator injury.",
78
- "format": "HTML"
79
- },
80
- {
81
- "title": "29 CFR 1910.212 – General Requirements for All Machines",
82
- "url": "https://www.ecfr.gov/current/title-29/section-1910.212",
83
- "source": "OSHA",
84
- "year": 2025,
85
- "category": "Regulation",
86
- "summary": "Establishes broad, fundamental machine safeguarding requirements (e.g. guards at points of operation, secure attachment of guards, protection from flying debris) to protect workers from hazards like nip points, rotating parts, flying chips, and sparks.",
87
- "format": "HTML"
88
- },
89
- {
90
- "title": "29 CFR 1910.213 – Woodworking Machinery Requirements",
91
- "url": "https://www.ecfr.gov/current/title-29/section-1910.213",
92
- "source": "OSHA",
93
- "year": 2025,
94
- "category": "Regulation",
95
- "summary": "Sets specific safety requirements for woodworking equipment (such as saws, jointers, planers, and sanders), including guarding of blades, hoods, push sticks, and other protective measures to prevent cuts, amputations, and kickback injuries in woodworking operations.",
96
- "format": "HTML"
97
- },
98
- {
99
- "title": "29 CFR 1910.215 – Abrasive Wheel Machinery",
100
- "url": "https://www.ecfr.gov/current/title-29/section-1910.215",
101
- "source": "OSHA",
102
- "year": 2025,
103
- "category": "Regulation",
104
- "summary": "Covers guarding and safety precautions for machines with abrasive wheels (grinders and cut-off machines), requiring wheel enclosures, work rests, tongue guards, and inspections to prevent wheel shattering, sparks, and operator contact with moving abrasive parts.",
105
- "format": "HTML"
106
- },
107
- {
108
- "title": "29 CFR 1910.216 – Mills and Calenders in the Rubber and Plastics Industries",
109
- "url": "https://www.ecfr.gov/current/title-29/section-1910.216",
110
- "source": "OSHA",
111
- "year": 2025,
112
- "category": "Regulation",
113
- "summary": "Specifies safeguarding for two-roll mills, calenders, and similar processing machines in rubber/plastics manufacturing – including required safety trip controls, emergency stopping devices, and barrier guards – to protect workers from being caught in rollers or nip points.",
114
- "format": "HTML"
115
- },
116
- {
117
- "title": "29 CFR 1910.217 – Mechanical Power Presses",
118
- "url": "https://www.ecfr.gov/current/title-29/section-1910.217",
119
- "source": "OSHA",
120
- "year": 2025,
121
- "category": "Regulation",
122
- "summary": "Detailed standard for mechanical power presses (e.g. stamping presses) mandating guarding of points of operation, use of devices like two-hand controls or presence-sensing systems, inspection and maintenance requirements, and training – aimed at preventing severe crushing, amputation, or die-punch injuries.",
123
- "format": "HTML"
124
- },
125
- {
126
- "title": "29 CFR 1910.218 – Forging Machines",
127
- "url": "https://www.ecfr.gov/current/title-29/section-1910.218",
128
- "source": "OSHA",
129
- "year": 2025,
130
- "category": "Regulation",
131
- "summary": "Covers safety requirements for forging machinery (such as hammers, presses, upsetters, and boltheaders), including provisions for guarding dies and rams, handling hot metal safely, and use of tongs or mechanical loaders – all intended to prevent struck-by, caught-in, and burn injuries in forge operations.",
132
- "format": "HTML"
133
- },
134
- {
135
- "title": "29 CFR 1910.219 – Mechanical Power-Transmission Apparatus",
136
- "url": "https://www.ecfr.gov/current/title-29/section-1910.219",
137
- "source": "OSHA",
138
- "year": 2025,
139
- "category": "Regulation",
140
- "summary": "Requires guards for all exposed belts, pulleys, chains, gears, flywheels, couplings, and other power-transmission parts on machinery. This standard ensures that rotating or moving drivetrain components are enclosed to prevent employees from getting caught in or struck by these parts.",
141
- "format": "HTML"
142
- },
143
- {
144
- "title": "29 CFR 1910.147 – The Control of Hazardous Energy (Lockout/Tagout)",
145
- "url": "https://www.ecfr.gov/current/title-29/section-1910.147",
146
- "source": "OSHA",
147
- "year": 2025,
148
- "category": "Regulation",
149
- "summary": "OSHA’s Lockout/Tagout standard, which mandates that dangerous machinery must be de-energized and locked out (or tagged out) during maintenance or servicing. It details the required energy control procedures, employee training, and periodic inspections to ensure that workers are protected from the release of stored energy or accidental machine start-up (a major cause of caught-in/between and amputation incidents).",
150
- "format": "HTML"
151
- },
152
- {
153
- "title": "29 CFR 1910.178 – Powered Industrial Trucks (Forklifts)",
154
- "url": "https://www.ecfr.gov/current/title-29/section-1910.178",
155
- "source": "OSHA",
156
- "year": 2025,
157
- "category": "Regulation",
158
- "summary": "The OSHA standard governing the design, maintenance, and safe operation of forklifts and other powered industrial trucks. It covers operator training and certification requirements, inspection and maintenance of equipment, safe fueling/charging, and operating rules (like speed limits, handling loads, and avoiding hazards) – all aimed at preventing tip-overs, collisions, and struck-by or crushed-by accidents involving these vehicles.",
159
- "format": "HTML"
160
- },
161
- {
162
- "title": "NIOSH Robotics in the Workplace – Safety Overview (Human-Robot Collaboration)",
163
- "url": "https://www.cdc.gov/niosh/robotics/about/",
164
- "source": "NIOSH",
165
- "year": 2024,
166
- "category": "Technical Guide",
167
- "summary": "A NIOSH overview of emerging safety challenges as robots increasingly collaborate with human workers. Updated in 2024, this page discusses how robots can improve safety by taking over dangerous tasks but also introduces new struck-by and caught-between hazards. It emphasizes the need for updated safety standards, risk assessments, and research on human-robot interaction, and it outlines NIOSH’s efforts (through its Center for Occupational Robotics Research) to develop best practices and guidance for safe integration of robotics in industry.",
168
- "format": "HTML"
169
- }
170
- ]
171
-
172
- # Process HTML sources
173
- def process_html(item):
174
- resp = requests.get(item['url'])
175
- resp.raise_for_status()
176
- soup = BeautifulSoup(resp.text, 'html.parser')
177
- texts = [p.get_text() for p in soup.find_all('p')]
178
- tables = [pd.read_html(str(t))[0].to_markdown() for t in soup.find_all('table')]
179
- # Join paragraphs and tables with blank lines
180
- full = "\n\n".join(texts + tables)
181
- resp_emb = client.embeddings.create(model="text-embedding-3-large", input=full)
182
- vec = resp_emb.data[0].embedding
183
- node_id = f"HTML::{item['title']}"
184
- G.add_node(node_id, text=full, embedding=vec, source=item['title'])
185
-
186
- # Run HTML processing
187
- for item in html_data:
188
- process_html(item)
189
-
190
- # Save graph
191
- nx.write_gml(G, "graph.gml")
192
- print("Graph RAG database created: graph.gml")
 
1
+ import os
2
+ import re
3
+ import glob
4
+ from dotenv import load_dotenv
5
+ import requests
6
+ from bs4 import BeautifulSoup
7
+ import pandas as pd
8
+ import pymupdf4llm
9
+ import networkx as nx
10
+ from openai import OpenAI
11
+
12
+ # Load environment and initialize OpenAI client
13
+ load_dotenv(override=True)
14
+ client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
15
+
16
# Helper: split Markdown text by third-level headers
def split_by_header(md_text):
    """Split *md_text* on '### ' headers and return non-empty fragments.

    Each fragment is re-prefixed with '### ' (the delimiter that
    ``re.split`` strips) so it reads as a standalone markdown section.
    """
    fragments = []
    for piece in re.split(r'(?m)^### ', md_text):
        if not piece.strip():
            continue  # drop empty / whitespace-only fragments
        if piece.startswith('### '):
            fragments.append(piece)
        else:
            fragments.append('### ' + piece)
    return fragments
20
+
21
# Initialize graph database
G = nx.Graph()

# Process local PDFs: one graph node per markdown section of each file
for pdf_path in glob.glob("scrapped_data/*.pdf"):
    doc_title = os.path.splitext(os.path.basename(pdf_path))[0]
    # Convert the PDF to Markdown, then split it into '### ' sections
    markdown = pymupdf4llm.to_markdown(pdf_path)
    for section_idx, section_text in enumerate(split_by_header(markdown)):
        embedding_resp = client.embeddings.create(
            model="text-embedding-3-large", input=section_text
        )
        G.add_node(
            f"PDF::{doc_title}::section{section_idx}",
            text=section_text,
            embedding=embedding_resp.data[0].embedding,
            source=doc_title,
            path=pdf_path,  # local file path kept for citation display
        )
42
+
43
+ # HTML Document List
44
+ html_data = [
45
+ {
46
+ "title": "NIOSH Robotics in the Workplace – Safety Overview (Human-Robot Collaboration)",
47
+ "url": "https://www.cdc.gov/niosh/robotics/about/",
48
+ "source": "NIOSH",
49
+ "year": 2024,
50
+ "category": "Technical Guide",
51
+ "summary": "A NIOSH overview of emerging safety challenges as robots increasingly collaborate with human workers. Updated in 2024, this page discusses how robots can improve safety by taking over dangerous tasks but also introduces new struck-by and caught-between hazards. It emphasizes the need for updated safety standards, risk assessments, and research on human-robot interaction, and it outlines NIOSH’s efforts (through its Center for Occupational Robotics Research) to develop best practices and guidance for safe integration of robotics in industry.",
52
+ "format": "HTML"
53
+ }
54
+ ]
55
+
56
# Process HTML sources
def process_html(item):
    """Fetch one HTML document, embed its text, and add it to the graph.

    Parameters
    ----------
    item : dict
        Document metadata; must contain at least 'url' and 'title'.

    Side effects
    ------------
    Adds one node to the module-level graph ``G`` holding the page text,
    its embedding vector, the source title, and the URL for citation.

    Raises
    ------
    requests.HTTPError
        If the GET request returns an error status.
    """
    import io  # local stdlib import, used only for table parsing below

    resp = requests.get(item['url'])
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')
    # Extract paragraph texts
    texts = [p.get_text() for p in soup.find_all('p')]
    # Extract tables as markdown. Wrap the HTML in StringIO: passing a
    # literal string to pandas.read_html is deprecated since pandas 2.1.
    tables = []
    for t in soup.find_all('table'):
        df = pd.read_html(io.StringIO(str(t)))[0]
        tables.append(df.to_markdown())
    # Join paragraphs and tables with double newlines
    full = "\n\n".join(texts + tables)
    # Embed the combined text
    resp_emb = client.embeddings.create(model="text-embedding-3-large", input=full)
    vec = resp_emb.data[0].embedding
    node_id = f"HTML::{item['title']}"
    # Add node with URL citation
    G.add_node(
        node_id, text=full, embedding=vec, source=item['title'], url=item['url']
    )
78
+
79
+ # Run HTML processing
80
+ for item in html_data:
81
+ process_html(item)
82
+
83
+ # Save graph
84
+ nx.write_gml(G, "graph.gml")
85
+ print("Graph RAG database created: graph.gml")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
query_graph.py CHANGED
@@ -15,41 +15,126 @@ enodes = list(G.nodes)
15
  embeddings = np.array([G.nodes[n]['embedding'] for n in enodes])
16
 
17
  def query_graph(question, top_k=5):
 
 
 
 
 
 
 
18
  # Embed question
19
  emb_resp = client.embeddings.create(
20
  model="text-embedding-3-large",
21
  input=question
22
  )
23
  q_vec = emb_resp.data[0].embedding
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  sims = cosine_similarity([q_vec], embeddings)[0]
25
  idxs = sims.argsort()[::-1][:top_k]
26
 
27
- # Gather context and sources
28
- context = [G.nodes[enodes[i]]['text'] for i in idxs]
29
- sources = list({G.nodes[enodes[i]]['source'] for i in idxs})
 
 
 
 
 
 
 
 
 
 
30
 
31
- # Generate answer
 
32
  prompt = (
33
  "Use the following context to answer the question:\n\n"
34
- + "\n\n---\n\n".join(context)
35
- + f"\n\nQuestion: {question}\nAnswer:")
 
 
 
36
  chat_resp = client.chat.completions.create(
37
  model="gpt-4o-mini",
38
  messages=[
39
- {"role": "system", "content": "You are a helpful assistant for XR safety training."},
40
- {"role": "user", "content": prompt}
41
  ]
42
  )
43
  answer = chat_resp.choices[0].message.content
44
- return answer, sources
 
45
 
46
 
47
  # Test queries
48
- test_questions = [
49
- "What are general machine guarding requirements?",
50
- "Explain the key steps in lockout/tagout procedures."
51
- ]
52
-
53
- for q in test_questions:
54
- ans, srcs = query_graph(q)
55
- print(f"Q: {q}\nA: {ans}\nSources: {srcs}\n")
 
 
 
 
 
 
 
 
 
15
  embeddings = np.array([G.nodes[n]['embedding'] for n in enodes])
16
 
17
def query_graph(question, top_k=5):
    """
    Embed the question, retrieve the top_k most similar chunks from the
    graph, and generate an answer with the chat model.

    Parameters
    ----------
    question : str
        The user's question.
    top_k : int, optional
        Number of chunks to retrieve (default 5).

    Returns
    -------
    tuple
        (answer, sources, chunks) where
        - answer  : generated response string
        - sources : list of unique source names, insertion order preserved
        - chunks  : list of (header, score, full_text, citation) tuples;
                    citation is a URL (HTML nodes) or file path (PDF nodes).
    """
    # Embed question
    emb_resp = client.embeddings.create(
        model="text-embedding-3-large",
        input=question
    )
    q_vec = emb_resp.data[0].embedding

    # Compute cosine similarities against all stored node embeddings
    sims = cosine_similarity([q_vec], embeddings)[0]
    idxs = sims.argsort()[::-1][:top_k]

    # Collect chunk-level info
    chunks = []
    sources = []
    for i in idxs:
        node = enodes[i]
        text = G.nodes[node]['text']
        # The first line of the chunk serves as its display header
        header = text.split('\n', 1)[0].lstrip('# ').strip()
        score = sims[i]
        # Citation preference: URL (HTML) > path (PDF) > bare source name
        citation = G.nodes[node].get('url') or G.nodes[node].get('path') or G.nodes[node]['source']
        chunks.append((header, score, text, citation))
        sources.append(G.nodes[node]['source'])
    # Deduplicate sources while preserving order
    sources = list(dict.fromkeys(sources))

    # Assemble prompt from the retrieved chunk texts
    context = "\n\n---\n\n".join(c[2] for c in chunks)
    prompt = (
        "Use the following context to answer the question:\n\n" +
        context +
        f"\n\nQuestion: {question}\nAnswer:"
    )

    # Query chat model
    chat_resp = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant for manufacturing equipment safety."},
            {"role": "user", "content": prompt}
        ]
    )
    answer = chat_resp.choices[0].message.content

    return answer, sources, chunks
122
 
123
 
124
  # Test queries
125
+ # test_questions = [
126
+ # "What are general machine guarding requirements?",
127
+ # "Explain the key steps in lockout/tagout procedures."
128
+ # ]
129
+
130
+ # for q in test_questions:
131
+ # answer, sources, chunks = query_graph(q)
132
+ # print(f"Q: {q}")
133
+ # print(f"Answer: {answer}\n")
134
+ # print("Sources:")
135
+ # for src in sources:
136
+ # print(f"- {src}")
137
+ # print("\nTop Chunks:")
138
+ # for header, score, _, citation in chunks:
139
+ # print(f" * {header} (score: {score:.2f}) from {citation}")
140
+ # print("\n", "#"*40, "\n")