import os
import re
import glob
from io import StringIO
from dotenv import load_dotenv
import requests
from bs4 import BeautifulSoup
import pandas as pd
import pymupdf4llm
import networkx as nx
from openai import OpenAI
# Load environment and initialize OpenAI client
load_dotenv(override=True)
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# Helper: split Markdown text by third-level headers
def split_by_header(md_text):
    # Split on third-level headers; the lookahead keeps each '### ' marker
    # attached to its own section instead of letting re.split consume it,
    # and the preamble before the first header is no longer mislabeled
    # with a spurious '### ' prefix.
    parts = re.split(r'(?m)^(?=### )', md_text)
    return [p for p in parts if p.strip()]
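# Example (illustrative): "intro\n### A\nbody\n### B\nbody" splits into
# ["intro\n", "### A\nbody\n", "### B\nbody"].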
# Initialize the in-memory graph that will serve as the RAG store
G = nx.Graph()
# Process local PDFs
for pdf_path in glob.glob("scrapped_data/*.pdf"):
    filename = os.path.basename(pdf_path)
    title = os.path.splitext(filename)[0]
    # Convert PDF to Markdown
    md_text = pymupdf4llm.to_markdown(pdf_path)
    # Split into sections
    sections = split_by_header(md_text)
    for idx, sec in enumerate(sections):
        resp = client.embeddings.create(model="text-embedding-3-large", input=sec)
        vector = resp.data[0].embedding
        node_id = f"PDF::{title}::section{idx}"
        # Store the local file path for citation
        G.add_node(node_id,
                   text=sec,
                   embedding=vector,
                   source=title,
                   path=pdf_path)
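        # Hedged sketch (not in the original script, which stores nodes only):
        # link each section to its predecessor so neighboring context can be
        # pulled in when traversing the graph at retrieval time.
        if idx > 0:
            G.add_edge(f"PDF::{title}::section{idx - 1}", node_id)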
# HTML Document List
html_data = [
    {
        "title": "NIOSH Robotics in the Workplace – Safety Overview (Human-Robot Collaboration)",
        "url": "https://www.cdc.gov/niosh/robotics/about/",
        "source": "NIOSH",
        "year": 2024,
        "category": "Technical Guide",
        "summary": "A NIOSH overview of emerging safety challenges as robots increasingly collaborate with human workers. Updated in 2024, this page discusses how robots can improve safety by taking over dangerous tasks but also introduces new struck-by and caught-between hazards. It emphasizes the need for updated safety standards, risk assessments, and research on human-robot interaction, and it outlines NIOSH’s efforts (through its Center for Occupational Robotics Research) to develop best practices and guidance for safe integration of robotics in industry.",
        "format": "HTML",
    }
]
# Process HTML sources
def process_html(item):
    resp = requests.get(item['url'], timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')
    # Extract paragraph texts
    texts = [p.get_text() for p in soup.find_all('p')]
    # Extract tables as Markdown
    tables = []
    for t in soup.find_all('table'):
        # Recent pandas expects file-like input here, hence the StringIO wrapper
        df = pd.read_html(StringIO(str(t)))[0]
        tables.append(df.to_markdown())
    # Join paragraphs and tables with double newlines
    full = "\n\n".join(texts + tables)
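    # Hedged guard (an assumption, not in the original): text-embedding-3-large
    # accepts at most 8,191 tokens per input, so a very long page could fail;
    # this crude character cap stands in for proper token-aware chunking.
    full = full[:24000]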
    # Embed the combined text
    resp_emb = client.embeddings.create(model="text-embedding-3-large", input=full)
    vec = resp_emb.data[0].embedding
    node_id = f"HTML::{item['title']}"
    # Add node with URL citation
    G.add_node(
        node_id, text=full, embedding=vec, source=item['title'], url=item['url']
    )
# Run HTML processing
for item in html_data:
    process_html(item)
# Save graph
nx.write_gml(G, "graph.gml")
print("Graph RAG database created: graph.gml")