import os
import re
import glob
from io import StringIO
from dotenv import load_dotenv
import requests
from bs4 import BeautifulSoup
import pandas as pd
import pymupdf4llm
import networkx as nx
from openai import OpenAI
# Load environment and initialize OpenAI client
load_dotenv(override=True)
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# Helper: split Markdown text by third-level headers
def split_by_header(md_text):
    # Split on third-level headers; the lookahead keeps each '### ' marker
    # attached to its own section instead of letting re.split consume it,
    # and the preamble before the first header is no longer mislabeled
    # with a spurious '### ' prefix.
    parts = re.split(r'(?m)^(?=### )', md_text)
    return [p for p in parts if p.strip()]
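# Example (illustrative): "intro\n### A\nbody\n### B\nbody" splits into
# ["intro\n", "### A\nbody\n", "### B\nbody"].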
# Initialize the in-memory graph that will serve as the RAG store
G = nx.Graph()
# Process local PDFs
for pdf_path in glob.glob("scrapped_data/*.pdf"):
    filename = os.path.basename(pdf_path)
    title = os.path.splitext(filename)[0]
    # Convert PDF to Markdown
    md_text = pymupdf4llm.to_markdown(pdf_path)
    # Split into sections
    sections = split_by_header(md_text)
    for idx, sec in enumerate(sections):
        resp = client.embeddings.create(model="text-embedding-3-large", input=sec)
        vector = resp.data[0].embedding
        node_id = f"PDF::{title}::section{idx}"
        # Store the local file path for citation
        G.add_node(node_id,
                   text=sec,
                   embedding=vector,
                   source=title,
                   path=pdf_path)
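        # Hedged sketch (not in the original script, which stores nodes only):
        # link each section to its predecessor so neighboring context can be
        # pulled in when traversing the graph at retrieval time.
        if idx > 0:
            G.add_edge(f"PDF::{title}::section{idx - 1}", node_id)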
# HTML Document List
html_data = [
    {
        "title": "NIOSH Robotics in the Workplace – Safety Overview (Human-Robot Collaboration)",
        "url": "https://www.cdc.gov/niosh/robotics/about/",
        "source": "NIOSH",
        "year": 2024,
        "category": "Technical Guide",
        "summary": "A NIOSH overview of emerging safety challenges as robots increasingly collaborate with human workers. Updated in 2024, this page discusses how robots can improve safety by taking over dangerous tasks but also introduces new struck-by and caught-between hazards. It emphasizes the need for updated safety standards, risk assessments, and research on human-robot interaction, and it outlines NIOSH’s efforts (through its Center for Occupational Robotics Research) to develop best practices and guidance for safe integration of robotics in industry.",
        "format": "HTML",
    }
]
# Process HTML sources
def process_html(item):
    resp = requests.get(item['url'], timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')
    # Extract paragraph texts
    texts = [p.get_text() for p in soup.find_all('p')]
    # Extract tables as Markdown
    tables = []
    for t in soup.find_all('table'):
        # Recent pandas expects file-like input here, hence the StringIO wrapper
        df = pd.read_html(StringIO(str(t)))[0]
        tables.append(df.to_markdown())
    # Join paragraphs and tables with double newlines
    full = "\n\n".join(texts + tables)
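    # Hedged guard (an assumption, not in the original): text-embedding-3-large
    # accepts at most 8,191 tokens per input, so a very long page could fail;
    # this crude character cap stands in for proper token-aware chunking.
    full = full[:24000]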
    # Embed the combined text
    resp_emb = client.embeddings.create(model="text-embedding-3-large", input=full)
    vec = resp_emb.data[0].embedding
    node_id = f"HTML::{item['title']}"
    # Add node with URL citation
    G.add_node(
        node_id, text=full, embedding=vec, source=item['title'], url=item['url']
    )
# Run HTML processing
for item in html_data:
    process_html(item)
# Save graph
nx.write_gml(G, "graph.gml")
print("Graph RAG database created: graph.gml")