import os
import re
import glob
from dotenv import load_dotenv
import requests
from bs4 import BeautifulSoup
import pandas as pd
import pymupdf4llm
import networkx as nx
from openai import OpenAI

# Load environment and initialize OpenAI client
load_dotenv(override=True)
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# Helper: split Markdown text by third-level headers
def split_by_header(md_text):
    parts = re.split(r'(?m)^### ', md_text)
    return [('### ' + p) if not p.startswith('### ') else p for p in parts if p.strip()]
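# Illustrative example (not in the original): splitting "### Intro\ntext\n### Scope\nmore"
# yields ["### Intro\ntext\n", "### Scope\nmore"], i.e. one chunk per third-level header
# with the header text kept at the start of each chunk.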
# Initialize graph database
G = nx.Graph()

# Process local PDFs
for pdf_path in glob.glob("scrapped_data/*.pdf"):
    filename = os.path.basename(pdf_path)
    title = os.path.splitext(filename)[0]
    # Convert PDF to Markdown
    md_text = pymupdf4llm.to_markdown(pdf_path)
    # Split into sections
    sections = split_by_header(md_text)
    for idx, sec in enumerate(sections):
        resp = client.embeddings.create(model="text-embedding-3-large", input=sec)
        vector = resp.data[0].embedding
        node_id = f"PDF::{title}::section{idx}"
        # Store the local file path for citation
        G.add_node(node_id,
                   text=sec,
                   embedding=vector,
                   source=title,
                   path=pdf_path)
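# Optional sketch (not in the original script): one possible way to add edges, linking
# consecutive sections of the same PDF with "next_section" relations so graph traversal
# can pull in neighboring context around a retrieved chunk.
for pdf_path in glob.glob("scrapped_data/*.pdf"):
    title = os.path.splitext(os.path.basename(pdf_path))[0]
    idx = 1
    while G.has_node(f"PDF::{title}::section{idx}"):
        G.add_edge(f"PDF::{title}::section{idx - 1}",
                   f"PDF::{title}::section{idx}",
                   relation="next_section")
        idx += 1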
| # HTML Document List | |
| html_data = [ | |
| { | |
| "title": "NIOSH Robotics in the Workplace – Safety Overview (Human-Robot Collaboration)", | |
| "url": "https://www.cdc.gov/niosh/robotics/about/", | |
| "source": "NIOSH", | |
| "year": 2024, | |
| "category": "Technical Guide", | |
| "summary": "A NIOSH overview of emerging safety challenges as robots increasingly collaborate with human workers. Updated in 2024, this page discusses how robots can improve safety by taking over dangerous tasks but also introduces new struck-by and caught-between hazards. It emphasizes the need for updated safety standards, risk assessments, and research on human-robot interaction, and it outlines NIOSH’s efforts (through its Center for Occupational Robotics Research) to develop best practices and guidance for safe integration of robotics in industry.", | |
| "format": "HTML" | |
| } | |
| ] | |
# Process HTML sources
def process_html(item):
    resp = requests.get(item['url'])
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')
    # Extract paragraph texts
    texts = [p.get_text() for p in soup.find_all('p')]
    # Extract tables as markdown
    tables = []
    for t in soup.find_all('table'):
        df = pd.read_html(str(t))[0]
        tables.append(df.to_markdown())
    # Join paragraphs and tables with double newlines
    full = "\n\n".join(texts + tables)
    # Embed the combined text
    resp_emb = client.embeddings.create(model="text-embedding-3-large", input=full)
    vec = resp_emb.data[0].embedding
    node_id = f"HTML::{item['title']}"
    # Add node with URL citation
    G.add_node(
        node_id, text=full, embedding=vec, source=item['title'], url=item['url']
    )

# Run HTML processing
for item in html_data:
    process_html(item)
# Save graph
nx.write_gml(G, "graph.gml")
print("Graph RAG database created: graph.gml")
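# Optional sanity check (not in the original script): query the in-memory graph by
# embedding a question with the same model and ranking nodes by cosine similarity.
# The question text and the retrieve() helper are illustrative assumptions, not the
# author's retrieval pipeline.
import numpy as np

def retrieve(query, k=3):
    # Embed the query with the same model used for the nodes
    q = np.array(client.embeddings.create(
        model="text-embedding-3-large", input=query).data[0].embedding)
    scored = []
    for node, data in G.nodes(data=True):
        v = np.array(data["embedding"])
        # Cosine similarity between the query vector and each node's embedding
        scored.append((float(q @ v / (np.linalg.norm(q) * np.linalg.norm(v))), node))
    scored.sort(reverse=True)
    return scored[:k]

for sim, node in retrieve("What struck-by hazards do collaborative robots introduce?"):
    data = G.nodes[node]
    # Each hit carries a citation: the source URL for HTML nodes or the local PDF path
    print(f"{sim:.3f}  {node}  ->  {data.get('url') or data.get('path')}")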