Spaces:
Paused
Paused
sight chat app v0.0.2
Browse files
- Improved the graph RAG (converted all the CFR codes to PDF, since we were being blocked when fetching them as HTML)
- Improved the display of sources
- Reversed the chat history display so that the most recent message appears on top (and added a line break to separate the different messages)
- app.py +29 -21
- graph.gml +0 -0
- preprocess.py +85 -192
- query_graph.py +102 -17
app.py
CHANGED
|
@@ -1,10 +1,33 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
from query_graph import query_graph
|
| 3 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
# Sidebar configuration
|
| 5 |
st.sidebar.title("About")
|
| 6 |
st.sidebar.markdown("**Authors:** [The SIGHT Project Team](https://sites.miamioh.edu/sight/)")
|
| 7 |
-
st.sidebar.markdown("**Version:** V. 0.0.
|
| 8 |
st.sidebar.markdown("**Date:** July 24, 2025")
|
| 9 |
st.sidebar.markdown("**Model:** gpt4o")
|
| 10 |
|
|
@@ -13,11 +36,6 @@ st.sidebar.markdown(
|
|
| 13 |
"**Funding:** SIGHT is funded by [OHBWC WSIC](https://info.bwc.ohio.gov/for-employers/safety-services/workplace-safety-innovation-center/wsic-overview)"
|
| 14 |
)
|
| 15 |
|
| 16 |
-
# References toggle in sidebar
|
| 17 |
-
st.sidebar.markdown("---")
|
| 18 |
-
show_refs = st.sidebar.checkbox("Show references")
|
| 19 |
-
|
| 20 |
-
|
| 21 |
|
| 22 |
# Main interface
|
| 23 |
st.set_page_config(page_title="Miami University's SIGHT Chatbot")
|
|
@@ -39,30 +57,20 @@ if 'history' not in st.session_state:
|
|
| 39 |
# User input
|
| 40 |
query = st.text_input("Your question:")
|
| 41 |
if st.button("Send") and query:
|
| 42 |
-
answer, sources = query_graph(query)
|
| 43 |
st.session_state.history.append({
|
| 44 |
'query': query,
|
| 45 |
'answer': answer,
|
| 46 |
-
'sources': sources
|
|
|
|
| 47 |
})
|
| 48 |
|
| 49 |
# Display chat history
|
| 50 |
-
for
|
| 51 |
st.markdown(f"**You:** {entry['query']}")
|
| 52 |
st.markdown(f"**Assistant:** {entry['answer']}")
|
| 53 |
-
|
| 54 |
-
with st.expander("Sources used", expanded=False):
|
| 55 |
-
for src in entry['sources']:
|
| 56 |
-
st.markdown(f"- {src}")
|
| 57 |
|
| 58 |
-
# Optionally show references list
|
| 59 |
-
if show_refs:
|
| 60 |
-
refs = [
|
| 61 |
-
"29 CFR 1910.211", "29 CFR 1910.212", "29 CFR 1910.213", "29 CFR 1910.215",
|
| 62 |
-
"OSHA 3170", "OSHA 3120", "NIOSH WP Solutions 2011-156", "NIOSH Robotics (2024)"
|
| 63 |
-
]
|
| 64 |
-
for r in refs:
|
| 65 |
-
st.sidebar.markdown(f"- {r}")
|
| 66 |
|
| 67 |
# Footer
|
| 68 |
st.markdown("---")
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
from query_graph import query_graph
|
| 3 |
|
| 4 |
+
# Helper for <details>
|
| 5 |
+
def format_citations_html(chunks):
|
| 6 |
+
html = []
|
| 7 |
+
for idx, (hdr, sc, txt, citation) in enumerate(chunks, start=1):
|
| 8 |
+
preamble = (
|
| 9 |
+
f"<p style='font-size:0.9em;'><strong>Preamble:</strong> "
|
| 10 |
+
f"The text in the following detail is reproduced from [{citation}]. "
|
| 11 |
+
f"It had a cosine similarity of {sc:.2f} with the user question, "
|
| 12 |
+
f"and it ranked {idx} among the text chunks in our graph database.</p>"
|
| 13 |
+
)
|
| 14 |
+
body = txt.replace("\n", "<br>")
|
| 15 |
+
html.append(
|
| 16 |
+
f"<details>"
|
| 17 |
+
f"<summary>{hdr} (cosine similarity: {sc:.2f})</summary>"
|
| 18 |
+
f"<div style='font-size:0.9em; margin-top:0.5em;'>"
|
| 19 |
+
f"<strong>Preamble:</strong> The text below is reproduced from {citation}. "
|
| 20 |
+
f"</div>"
|
| 21 |
+
f"<div style='font-size:0.7em; margin-left:1em; margin-top:0.5em;'>{body}</div>"
|
| 22 |
+
f"</details><br><br>"
|
| 23 |
+
)
|
| 24 |
+
return "<br>".join(html)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
# Sidebar configuration
|
| 28 |
st.sidebar.title("About")
|
| 29 |
st.sidebar.markdown("**Authors:** [The SIGHT Project Team](https://sites.miamioh.edu/sight/)")
|
| 30 |
+
st.sidebar.markdown("**Version:** V. 0.0.2")
|
| 31 |
st.sidebar.markdown("**Date:** July 24, 2025")
|
| 32 |
st.sidebar.markdown("**Model:** gpt4o")
|
| 33 |
|
|
|
|
| 36 |
"**Funding:** SIGHT is funded by [OHBWC WSIC](https://info.bwc.ohio.gov/for-employers/safety-services/workplace-safety-innovation-center/wsic-overview)"
|
| 37 |
)
|
| 38 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
# Main interface
|
| 41 |
st.set_page_config(page_title="Miami University's SIGHT Chatbot")
|
|
|
|
| 57 |
# User input
|
| 58 |
query = st.text_input("Your question:")
|
| 59 |
if st.button("Send") and query:
|
| 60 |
+
answer, sources, chunks = query_graph(query)
|
| 61 |
st.session_state.history.append({
|
| 62 |
'query': query,
|
| 63 |
'answer': answer,
|
| 64 |
+
'sources': sources,
|
| 65 |
+
'chunks': chunks
|
| 66 |
})
|
| 67 |
|
| 68 |
# Display chat history
|
| 69 |
+
for entry in st.session_state.history[::-1]:
|
| 70 |
st.markdown(f"**You:** {entry['query']}")
|
| 71 |
st.markdown(f"**Assistant:** {entry['answer']}")
|
| 72 |
+
st.markdown(format_citations_html(entry['chunks']), unsafe_allow_html=True)
|
|
|
|
|
|
|
|
|
|
| 73 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
|
| 75 |
# Footer
|
| 76 |
st.markdown("---")
|
graph.gml
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
preprocess.py
CHANGED
|
@@ -1,192 +1,85 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import re
|
| 3 |
-
import glob
|
| 4 |
-
from dotenv import load_dotenv
|
| 5 |
-
import requests
|
| 6 |
-
from bs4 import BeautifulSoup
|
| 7 |
-
import pandas as pd
|
| 8 |
-
import pymupdf4llm
|
| 9 |
-
import networkx as nx
|
| 10 |
-
from openai import OpenAI
|
| 11 |
-
|
| 12 |
-
# Load environment and initialize OpenAI client
|
| 13 |
-
load_dotenv(override=True)
|
| 14 |
-
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
|
| 15 |
-
|
| 16 |
-
# Helper
|
| 17 |
-
def split_by_header(md_text):
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
"summary": "Establishes broad, fundamental machine safeguarding requirements (e.g. guards at points of operation, secure attachment of guards, protection from flying debris) to protect workers from hazards like nip points, rotating parts, flying chips, and sparks.",
|
| 87 |
-
"format": "HTML"
|
| 88 |
-
},
|
| 89 |
-
{
|
| 90 |
-
"title": "29 CFR 1910.213 – Woodworking Machinery Requirements",
|
| 91 |
-
"url": "https://www.ecfr.gov/current/title-29/section-1910.213",
|
| 92 |
-
"source": "OSHA",
|
| 93 |
-
"year": 2025,
|
| 94 |
-
"category": "Regulation",
|
| 95 |
-
"summary": "Sets specific safety requirements for woodworking equipment (such as saws, jointers, planers, and sanders), including guarding of blades, hoods, push sticks, and other protective measures to prevent cuts, amputations, and kickback injuries in woodworking operations.",
|
| 96 |
-
"format": "HTML"
|
| 97 |
-
},
|
| 98 |
-
{
|
| 99 |
-
"title": "29 CFR 1910.215 – Abrasive Wheel Machinery",
|
| 100 |
-
"url": "https://www.ecfr.gov/current/title-29/section-1910.215",
|
| 101 |
-
"source": "OSHA",
|
| 102 |
-
"year": 2025,
|
| 103 |
-
"category": "Regulation",
|
| 104 |
-
"summary": "Covers guarding and safety precautions for machines with abrasive wheels (grinders and cut-off machines), requiring wheel enclosures, work rests, tongue guards, and inspections to prevent wheel shattering, sparks, and operator contact with moving abrasive parts.",
|
| 105 |
-
"format": "HTML"
|
| 106 |
-
},
|
| 107 |
-
{
|
| 108 |
-
"title": "29 CFR 1910.216 – Mills and Calenders in the Rubber and Plastics Industries",
|
| 109 |
-
"url": "https://www.ecfr.gov/current/title-29/section-1910.216",
|
| 110 |
-
"source": "OSHA",
|
| 111 |
-
"year": 2025,
|
| 112 |
-
"category": "Regulation",
|
| 113 |
-
"summary": "Specifies safeguarding for two-roll mills, calenders, and similar processing machines in rubber/plastics manufacturing – including required safety trip controls, emergency stopping devices, and barrier guards – to protect workers from being caught in rollers or nip points.",
|
| 114 |
-
"format": "HTML"
|
| 115 |
-
},
|
| 116 |
-
{
|
| 117 |
-
"title": "29 CFR 1910.217 – Mechanical Power Presses",
|
| 118 |
-
"url": "https://www.ecfr.gov/current/title-29/section-1910.217",
|
| 119 |
-
"source": "OSHA",
|
| 120 |
-
"year": 2025,
|
| 121 |
-
"category": "Regulation",
|
| 122 |
-
"summary": "Detailed standard for mechanical power presses (e.g. stamping presses) mandating guarding of points of operation, use of devices like two-hand controls or presence-sensing systems, inspection and maintenance requirements, and training – aimed at preventing severe crushing, amputation, or die-punch injuries.",
|
| 123 |
-
"format": "HTML"
|
| 124 |
-
},
|
| 125 |
-
{
|
| 126 |
-
"title": "29 CFR 1910.218 – Forging Machines",
|
| 127 |
-
"url": "https://www.ecfr.gov/current/title-29/section-1910.218",
|
| 128 |
-
"source": "OSHA",
|
| 129 |
-
"year": 2025,
|
| 130 |
-
"category": "Regulation",
|
| 131 |
-
"summary": "Covers safety requirements for forging machinery (such as hammers, presses, upsetters, and boltheaders), including provisions for guarding dies and rams, handling hot metal safely, and use of tongs or mechanical loaders – all intended to prevent struck-by, caught-in, and burn injuries in forge operations.",
|
| 132 |
-
"format": "HTML"
|
| 133 |
-
},
|
| 134 |
-
{
|
| 135 |
-
"title": "29 CFR 1910.219 – Mechanical Power-Transmission Apparatus",
|
| 136 |
-
"url": "https://www.ecfr.gov/current/title-29/section-1910.219",
|
| 137 |
-
"source": "OSHA",
|
| 138 |
-
"year": 2025,
|
| 139 |
-
"category": "Regulation",
|
| 140 |
-
"summary": "Requires guards for all exposed belts, pulleys, chains, gears, flywheels, couplings, and other power-transmission parts on machinery. This standard ensures that rotating or moving drivetrain components are enclosed to prevent employees from getting caught in or struck by these parts.",
|
| 141 |
-
"format": "HTML"
|
| 142 |
-
},
|
| 143 |
-
{
|
| 144 |
-
"title": "29 CFR 1910.147 – The Control of Hazardous Energy (Lockout/Tagout)",
|
| 145 |
-
"url": "https://www.ecfr.gov/current/title-29/section-1910.147",
|
| 146 |
-
"source": "OSHA",
|
| 147 |
-
"year": 2025,
|
| 148 |
-
"category": "Regulation",
|
| 149 |
-
"summary": "OSHA’s Lockout/Tagout standard, which mandates that dangerous machinery must be de-energized and locked out (or tagged out) during maintenance or servicing. It details the required energy control procedures, employee training, and periodic inspections to ensure that workers are protected from the release of stored energy or accidental machine start-up (a major cause of caught-in/between and amputation incidents).",
|
| 150 |
-
"format": "HTML"
|
| 151 |
-
},
|
| 152 |
-
{
|
| 153 |
-
"title": "29 CFR 1910.178 – Powered Industrial Trucks (Forklifts)",
|
| 154 |
-
"url": "https://www.ecfr.gov/current/title-29/section-1910.178",
|
| 155 |
-
"source": "OSHA",
|
| 156 |
-
"year": 2025,
|
| 157 |
-
"category": "Regulation",
|
| 158 |
-
"summary": "The OSHA standard governing the design, maintenance, and safe operation of forklifts and other powered industrial trucks. It covers operator training and certification requirements, inspection and maintenance of equipment, safe fueling/charging, and operating rules (like speed limits, handling loads, and avoiding hazards) – all aimed at preventing tip-overs, collisions, and struck-by or crushed-by accidents involving these vehicles.",
|
| 159 |
-
"format": "HTML"
|
| 160 |
-
},
|
| 161 |
-
{
|
| 162 |
-
"title": "NIOSH Robotics in the Workplace – Safety Overview (Human-Robot Collaboration)",
|
| 163 |
-
"url": "https://www.cdc.gov/niosh/robotics/about/",
|
| 164 |
-
"source": "NIOSH",
|
| 165 |
-
"year": 2024,
|
| 166 |
-
"category": "Technical Guide",
|
| 167 |
-
"summary": "A NIOSH overview of emerging safety challenges as robots increasingly collaborate with human workers. Updated in 2024, this page discusses how robots can improve safety by taking over dangerous tasks but also introduces new struck-by and caught-between hazards. It emphasizes the need for updated safety standards, risk assessments, and research on human-robot interaction, and it outlines NIOSH’s efforts (through its Center for Occupational Robotics Research) to develop best practices and guidance for safe integration of robotics in industry.",
|
| 168 |
-
"format": "HTML"
|
| 169 |
-
}
|
| 170 |
-
]
|
| 171 |
-
|
| 172 |
-
# Process HTML sources
|
| 173 |
-
def process_html(item):
|
| 174 |
-
resp = requests.get(item['url'])
|
| 175 |
-
resp.raise_for_status()
|
| 176 |
-
soup = BeautifulSoup(resp.text, 'html.parser')
|
| 177 |
-
texts = [p.get_text() for p in soup.find_all('p')]
|
| 178 |
-
tables = [pd.read_html(str(t))[0].to_markdown() for t in soup.find_all('table')]
|
| 179 |
-
# Join paragraphs and tables with blank lines
|
| 180 |
-
full = "\n\n".join(texts + tables)
|
| 181 |
-
resp_emb = client.embeddings.create(model="text-embedding-3-large", input=full)
|
| 182 |
-
vec = resp_emb.data[0].embedding
|
| 183 |
-
node_id = f"HTML::{item['title']}"
|
| 184 |
-
G.add_node(node_id, text=full, embedding=vec, source=item['title'])
|
| 185 |
-
|
| 186 |
-
# Run HTML processing
|
| 187 |
-
for item in html_data:
|
| 188 |
-
process_html(item)
|
| 189 |
-
|
| 190 |
-
# Save graph
|
| 191 |
-
nx.write_gml(G, "graph.gml")
|
| 192 |
-
print("Graph RAG database created: graph.gml")
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import re
|
| 3 |
+
import glob
|
| 4 |
+
from dotenv import load_dotenv
|
| 5 |
+
import requests
|
| 6 |
+
from bs4 import BeautifulSoup
|
| 7 |
+
import pandas as pd
|
| 8 |
+
import pymupdf4llm
|
| 9 |
+
import networkx as nx
|
| 10 |
+
from openai import OpenAI
|
| 11 |
+
|
| 12 |
+
# Load environment and initialize OpenAI client
|
| 13 |
+
load_dotenv(override=True)
|
| 14 |
+
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
|
| 15 |
+
|
| 16 |
+
# Helper: split Markdown text by third-level headers
|
| 17 |
+
def split_by_header(md_text):
|
| 18 |
+
parts = re.split(r'(?m)^### ', md_text)
|
| 19 |
+
return [('### ' + p) if not p.startswith('### ') else p for p in parts if p.strip()]
|
| 20 |
+
|
| 21 |
+
# Initialize graph database
|
| 22 |
+
G = nx.Graph()
|
| 23 |
+
|
| 24 |
+
# Process local PDFs
|
| 25 |
+
for pdf_path in glob.glob("scrapped_data/*.pdf"):
|
| 26 |
+
filename = os.path.basename(pdf_path)
|
| 27 |
+
title = os.path.splitext(filename)[0]
|
| 28 |
+
# Convert PDF to Markdown
|
| 29 |
+
md_text = pymupdf4llm.to_markdown(pdf_path)
|
| 30 |
+
# Split into sections
|
| 31 |
+
sections = split_by_header(md_text)
|
| 32 |
+
for idx, sec in enumerate(sections):
|
| 33 |
+
resp = client.embeddings.create(model="text-embedding-3-large", input=sec)
|
| 34 |
+
vector = resp.data[0].embedding
|
| 35 |
+
node_id = f"PDF::{title}::section{idx}"
|
| 36 |
+
# Store the local file path for citation
|
| 37 |
+
G.add_node(node_id,
|
| 38 |
+
text=sec,
|
| 39 |
+
embedding=vector,
|
| 40 |
+
source=title,
|
| 41 |
+
path=pdf_path)
|
| 42 |
+
|
| 43 |
+
# HTML Document List
|
| 44 |
+
html_data = [
|
| 45 |
+
{
|
| 46 |
+
"title": "NIOSH Robotics in the Workplace – Safety Overview (Human-Robot Collaboration)",
|
| 47 |
+
"url": "https://www.cdc.gov/niosh/robotics/about/",
|
| 48 |
+
"source": "NIOSH",
|
| 49 |
+
"year": 2024,
|
| 50 |
+
"category": "Technical Guide",
|
| 51 |
+
"summary": "A NIOSH overview of emerging safety challenges as robots increasingly collaborate with human workers. Updated in 2024, this page discusses how robots can improve safety by taking over dangerous tasks but also introduces new struck-by and caught-between hazards. It emphasizes the need for updated safety standards, risk assessments, and research on human-robot interaction, and it outlines NIOSH’s efforts (through its Center for Occupational Robotics Research) to develop best practices and guidance for safe integration of robotics in industry.",
|
| 52 |
+
"format": "HTML"
|
| 53 |
+
}
|
| 54 |
+
]
|
| 55 |
+
|
| 56 |
+
# Process HTML sources
|
| 57 |
+
def process_html(item):
|
| 58 |
+
resp = requests.get(item['url'])
|
| 59 |
+
resp.raise_for_status()
|
| 60 |
+
soup = BeautifulSoup(resp.text, 'html.parser')
|
| 61 |
+
# Extract paragraph texts
|
| 62 |
+
texts = [p.get_text() for p in soup.find_all('p')]
|
| 63 |
+
# Extract tables as markdown
|
| 64 |
+
tables = []
|
| 65 |
+
for t in soup.find_all('table'):
|
| 66 |
+
df = pd.read_html(str(t))[0]
|
| 67 |
+
tables.append(df.to_markdown())
|
| 68 |
+
# Join paragraphs and tables with double newlines
|
| 69 |
+
full = "\n\n".join(texts + tables)
|
| 70 |
+
# Embed the combined text
|
| 71 |
+
resp_emb = client.embeddings.create(model="text-embedding-3-large", input=full)
|
| 72 |
+
vec = resp_emb.data[0].embedding
|
| 73 |
+
node_id = f"HTML::{item['title']}"
|
| 74 |
+
# Add node with URL citation
|
| 75 |
+
G.add_node(
|
| 76 |
+
node_id, text=full, embedding=vec, source=item['title'], url=item['url']
|
| 77 |
+
)
|
| 78 |
+
|
| 79 |
+
# Run HTML processing
|
| 80 |
+
for item in html_data:
|
| 81 |
+
process_html(item)
|
| 82 |
+
|
| 83 |
+
# Save graph
|
| 84 |
+
nx.write_gml(G, "graph.gml")
|
| 85 |
+
print("Graph RAG database created: graph.gml")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
query_graph.py
CHANGED
|
@@ -15,41 +15,126 @@ enodes = list(G.nodes)
|
|
| 15 |
embeddings = np.array([G.nodes[n]['embedding'] for n in enodes])
|
| 16 |
|
| 17 |
def query_graph(question, top_k=5):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
# Embed question
|
| 19 |
emb_resp = client.embeddings.create(
|
| 20 |
model="text-embedding-3-large",
|
| 21 |
input=question
|
| 22 |
)
|
| 23 |
q_vec = emb_resp.data[0].embedding
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
sims = cosine_similarity([q_vec], embeddings)[0]
|
| 25 |
idxs = sims.argsort()[::-1][:top_k]
|
| 26 |
|
| 27 |
-
# Gather
|
| 28 |
-
|
| 29 |
-
sources =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
-
#
|
|
|
|
| 32 |
prompt = (
|
| 33 |
"Use the following context to answer the question:\n\n"
|
| 34 |
-
+
|
| 35 |
-
+ f"\n\nQuestion: {question}\nAnswer:"
|
|
|
|
|
|
|
|
|
|
| 36 |
chat_resp = client.chat.completions.create(
|
| 37 |
model="gpt-4o-mini",
|
| 38 |
messages=[
|
| 39 |
-
{"role": "system", "content": "You are a helpful assistant for
|
| 40 |
-
{"role": "user",
|
| 41 |
]
|
| 42 |
)
|
| 43 |
answer = chat_resp.choices[0].message.content
|
| 44 |
-
|
|
|
|
| 45 |
|
| 46 |
|
| 47 |
# Test queries
|
| 48 |
-
test_questions = [
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
]
|
| 52 |
-
|
| 53 |
-
for q in test_questions:
|
| 54 |
-
|
| 55 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
embeddings = np.array([G.nodes[n]['embedding'] for n in enodes])
|
| 16 |
|
| 17 |
def query_graph(question, top_k=5):
|
| 18 |
+
"""
|
| 19 |
+
Embed the question, retrieve the top_k relevant chunks,
|
| 20 |
+
and return: (answer, sources, chunks)
|
| 21 |
+
- answer: generated response string
|
| 22 |
+
- sources: list of unique source names
|
| 23 |
+
- chunks: list of tuples (header, score, full_text, source_url_or_path)
|
| 24 |
+
"""
|
| 25 |
# Embed question
|
| 26 |
emb_resp = client.embeddings.create(
|
| 27 |
model="text-embedding-3-large",
|
| 28 |
input=question
|
| 29 |
)
|
| 30 |
q_vec = emb_resp.data[0].embedding
|
| 31 |
+
|
| 32 |
+
# Compute cosine similarities
|
| 33 |
+
sims = cosine_similarity([q_vec], embeddings)[0]
|
| 34 |
+
idxs = sims.argsort()[::-1][:top_k]
|
| 35 |
+
|
| 36 |
+
# Collect chunk-level info
|
| 37 |
+
chunks = []
|
| 38 |
+
sources = []
|
| 39 |
+
for rank, i in enumerate(idxs, start=1):
|
| 40 |
+
node = enodes[i]
|
| 41 |
+
text = G.nodes[node]['text']
|
| 42 |
+
header = text.split('\n', 1)[0].lstrip('# ').strip()
|
| 43 |
+
score = sims[i]
|
| 44 |
+
# Determine citation (URL for HTML, path for PDF)
|
| 45 |
+
citation = G.nodes[node].get('url') or G.nodes[node].get('path') or G.nodes[node]['source']
|
| 46 |
+
chunks.append((header, score, text, citation))
|
| 47 |
+
sources.append(G.nodes[node]['source'])
|
| 48 |
+
# Deduplicate sources
|
| 49 |
+
sources = list(dict.fromkeys(sources))
|
| 50 |
+
|
| 51 |
+
# Assemble prompt
|
| 52 |
+
context = "\n\n---\n\n".join([c[2] for c in chunks])
|
| 53 |
+
prompt = (
|
| 54 |
+
"Use the following context to answer the question:\n\n" +
|
| 55 |
+
context +
|
| 56 |
+
f"\n\nQuestion: {question}\nAnswer:"
|
| 57 |
+
)
|
| 58 |
+
|
| 59 |
+
# Query chat model
|
| 60 |
+
chat_resp = client.chat.completions.create(
|
| 61 |
+
model="gpt-4o-mini",
|
| 62 |
+
messages=[
|
| 63 |
+
{"role": "system", "content": "You are a helpful assistant for manufacturing equipment safety."},
|
| 64 |
+
{"role": "user", "content": prompt}
|
| 65 |
+
]
|
| 66 |
+
)
|
| 67 |
+
answer = chat_resp.choices[0].message.content
|
| 68 |
+
|
| 69 |
+
return answer, sources, chunks
|
| 70 |
+
|
| 71 |
+
"""
|
| 72 |
+
Embed the user question, retrieve the top_k relevant chunks from the graph,
|
| 73 |
+
assemble a prompt with those chunks, call the chat model, and return:
|
| 74 |
+
- answer: the generated response
|
| 75 |
+
- sources: unique list of source documents
|
| 76 |
+
- chunks: list of (header, score, full_text) for the top_k passages
|
| 77 |
+
"""
|
| 78 |
+
# Embed the question
|
| 79 |
+
emb_resp = client.embeddings.create(
|
| 80 |
+
model="text-embedding-3-large",
|
| 81 |
+
input=question
|
| 82 |
+
)
|
| 83 |
+
q_vec = emb_resp.data[0].embedding
|
| 84 |
+
|
| 85 |
+
# Compute similarities against all stored embeddings
|
| 86 |
sims = cosine_similarity([q_vec], embeddings)[0]
|
| 87 |
idxs = sims.argsort()[::-1][:top_k]
|
| 88 |
|
| 89 |
+
# Gather chunk‑level info and sources
|
| 90 |
+
chunks = []
|
| 91 |
+
sources = []
|
| 92 |
+
for i in idxs:
|
| 93 |
+
node = enodes[i]
|
| 94 |
+
text = G.nodes[node]['text']
|
| 95 |
+
# Use the first line as the header
|
| 96 |
+
header = text.split('\n', 1)[0].lstrip('# ').strip()
|
| 97 |
+
score = sims[i]
|
| 98 |
+
chunks.append((header, score, text))
|
| 99 |
+
sources.append(G.nodes[node]['source'])
|
| 100 |
+
# Deduplicate sources while preserving order
|
| 101 |
+
sources = list(dict.fromkeys(sources))
|
| 102 |
|
| 103 |
+
# Assemble the prompt from the chunk texts
|
| 104 |
+
context_text = "\n\n---\n\n".join([chunk[2] for chunk in chunks])
|
| 105 |
prompt = (
|
| 106 |
"Use the following context to answer the question:\n\n"
|
| 107 |
+
+ context_text
|
| 108 |
+
+ f"\n\nQuestion: {question}\nAnswer:"
|
| 109 |
+
)
|
| 110 |
+
|
| 111 |
+
# Call the chat model
|
| 112 |
chat_resp = client.chat.completions.create(
|
| 113 |
model="gpt-4o-mini",
|
| 114 |
messages=[
|
| 115 |
+
{"role": "system", "content": "You are a helpful assistant for manufacturing equipment safety."},
|
| 116 |
+
{"role": "user", "content": prompt}
|
| 117 |
]
|
| 118 |
)
|
| 119 |
answer = chat_resp.choices[0].message.content
|
| 120 |
+
|
| 121 |
+
return answer, sources, chunks
|
| 122 |
|
| 123 |
|
| 124 |
# Test queries
|
| 125 |
+
# test_questions = [
|
| 126 |
+
# "What are general machine guarding requirements?",
|
| 127 |
+
# "Explain the key steps in lockout/tagout procedures."
|
| 128 |
+
# ]
|
| 129 |
+
|
| 130 |
+
# for q in test_questions:
|
| 131 |
+
# answer, sources, chunks = query_graph(q)
|
| 132 |
+
# print(f"Q: {q}")
|
| 133 |
+
# print(f"Answer: {answer}\n")
|
| 134 |
+
# print("Sources:")
|
| 135 |
+
# for src in sources:
|
| 136 |
+
# print(f"- {src}")
|
| 137 |
+
# print("\nTop Chunks:")
|
| 138 |
+
# for header, score, _, citation in chunks:
|
| 139 |
+
# print(f" * {header} (score: {score:.2f}) from {citation}")
|
| 140 |
+
# print("\n", "#"*40, "\n")
|