import os
import re
import glob
from io import StringIO

from dotenv import load_dotenv
import requests
from bs4 import BeautifulSoup
import pandas as pd
import pymupdf4llm
import networkx as nx
from openai import OpenAI

# Load environment and initialize OpenAI client
load_dotenv(override=True)
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


# Helper: split Markdown text by third-level headers, re-attaching the
# "### " prefix that re.split consumes so each section keeps its heading
def split_by_header(md_text):
    parts = re.split(r'(?m)^### ', md_text)
    return [('### ' + p) if not p.startswith('### ') else p for p in parts if p.strip()]


# Initialize graph database
G = nx.Graph()

# Process local PDFs
for pdf_path in glob.glob("scrapped_data/*.pdf"):
    filename = os.path.basename(pdf_path)
    title = os.path.splitext(filename)[0]

    # Convert PDF to Markdown
    md_text = pymupdf4llm.to_markdown(pdf_path)

    # Split into sections
    sections = split_by_header(md_text)

    for idx, sec in enumerate(sections):
        resp = client.embeddings.create(model="text-embedding-3-large", input=sec)
        vector = resp.data[0].embedding
        node_id = f"PDF::{title}::section{idx}"
        # Store the local file path for citation
        G.add_node(node_id, text=sec, embedding=vector, source=title, path=pdf_path)

# HTML Document List
html_data = [
    {
        "title": "NIOSH Robotics in the Workplace – Safety Overview (Human-Robot Collaboration)",
        "url": "https://www.cdc.gov/niosh/robotics/about/",
        "source": "NIOSH",
        "year": 2024,
        "category": "Technical Guide",
        "summary": "A NIOSH overview of emerging safety challenges as robots increasingly collaborate with human workers. Updated in 2024, this page discusses how robots can improve safety by taking over dangerous tasks but also introduces new struck-by and caught-between hazards. It emphasizes the need for updated safety standards, risk assessments, and research on human-robot interaction, and it outlines NIOSH’s efforts (through its Center for Occupational Robotics Research) to develop best practices and guidance for safe integration of robotics in industry.",
        "format": "HTML",
    }
]


# Process HTML sources
def process_html(item):
    resp = requests.get(item['url'])
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')

    # Extract paragraph texts
    texts = [p.get_text() for p in soup.find_all('p')]

    # Extract tables as Markdown (wrap in StringIO, since pandas deprecates
    # passing literal HTML strings directly to read_html)
    tables = []
    for t in soup.find_all('table'):
        df = pd.read_html(StringIO(str(t)))[0]
        tables.append(df.to_markdown())

    # Join paragraphs and tables with double newlines
    full = "\n\n".join(texts + tables)

    # Embed the combined text
    resp_emb = client.embeddings.create(model="text-embedding-3-large", input=full)
    vec = resp_emb.data[0].embedding

    node_id = f"HTML::{item['title']}"
    # Add node with URL citation
    G.add_node(
        node_id,
        text=full,
        embedding=vec,
        source=item['title'],
        url=item['url'],
    )


# Run HTML processing
for item in html_data:
    process_html(item)

# Save graph
nx.write_gml(G, "graph.gml")
print("Graph RAG database created: graph.gml")
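
# --- Optional usage sketch (not part of the original pipeline) ---
# A minimal example of how the stored embeddings could be queried once the
# graph is built: embed a question with the same model and rank nodes by
# cosine similarity. The helper name `top_k_nodes`, the sample query, and the
# k value are illustrative assumptions, not part of the source script; it
# reuses the `client` and `G` objects defined above.
import numpy as np

def top_k_nodes(graph, query_text, k=3):
    # Embed the query with the same embedding model used for the nodes
    q = client.embeddings.create(model="text-embedding-3-large", input=query_text)
    q_vec = np.array(q.data[0].embedding)
    scores = []
    for node_id, attrs in graph.nodes(data=True):
        v = np.array(attrs["embedding"])
        # Cosine similarity between query and node embedding
        sim = float(np.dot(q_vec, v) / (np.linalg.norm(q_vec) * np.linalg.norm(v)))
        scores.append((sim, node_id))
    return sorted(scores, reverse=True)[:k]

# Example (hypothetical query):
# for score, node in top_k_nodes(G, "What hazards arise from human-robot collaboration?"):
#     print(f"{score:.3f}  {node}")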