import os
import re
import glob
from dotenv import load_dotenv
import requests
from bs4 import BeautifulSoup
import pandas as pd
import pymupdf4llm
import networkx as nx
from openai import OpenAI

# Load environment and initialize OpenAI client
load_dotenv(override=True)
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# Helper: split Markdown text by third-level headers
def split_by_header(md_text):
    parts = re.split(r'(?m)^### ', md_text)
    return [('### ' + p) if not p.startswith('### ') else p for p in parts if p.strip()]
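# Illustrative example (not in the original): splitting "### Intro\ntext\n### Scope\nmore"
# yields ["### Intro\ntext\n", "### Scope\nmore"], i.e. one chunk per third-level header
# with the header text kept at the start of each chunk.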
# Initialize graph database
G = nx.Graph()

# Process local PDFs
for pdf_path in glob.glob("scrapped_data/*.pdf"):
    filename = os.path.basename(pdf_path)
    title = os.path.splitext(filename)[0]
    # Convert PDF to Markdown
    md_text = pymupdf4llm.to_markdown(pdf_path)
    # Split into sections
    sections = split_by_header(md_text)
    for idx, sec in enumerate(sections):
        resp = client.embeddings.create(model="text-embedding-3-large", input=sec)
        vector = resp.data[0].embedding
        node_id = f"PDF::{title}::section{idx}"
        # Store the local file path for citation
        G.add_node(node_id,
                   text=sec,
                   embedding=vector,
                   source=title,
                   path=pdf_path)
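# Optional sketch (not in the original script): one possible way to add edges, linking
# consecutive sections of the same PDF with "next_section" relations so graph traversal
# can pull in neighboring context around a retrieved chunk.
for pdf_path in glob.glob("scrapped_data/*.pdf"):
    title = os.path.splitext(os.path.basename(pdf_path))[0]
    idx = 1
    while G.has_node(f"PDF::{title}::section{idx}"):
        G.add_edge(f"PDF::{title}::section{idx - 1}",
                   f"PDF::{title}::section{idx}",
                   relation="next_section")
        idx += 1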
| # HTML Document List | |
| html_data = [ | |
| { | |
| "title": "NIOSH Robotics in the Workplace – Safety Overview (Human-Robot Collaboration)", | |
| "url": "https://www.cdc.gov/niosh/robotics/about/", | |
| "source": "NIOSH", | |
| "year": 2024, | |
| "category": "Technical Guide", | |
| "summary": "A NIOSH overview of emerging safety challenges as robots increasingly collaborate with human workers. Updated in 2024, this page discusses how robots can improve safety by taking over dangerous tasks but also introduces new struck-by and caught-between hazards. It emphasizes the need for updated safety standards, risk assessments, and research on human-robot interaction, and it outlines NIOSH’s efforts (through its Center for Occupational Robotics Research) to develop best practices and guidance for safe integration of robotics in industry.", | |
| "format": "HTML" | |
| } | |
| ] | |
# Process HTML sources
def process_html(item):
    resp = requests.get(item['url'])
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')
    # Extract paragraph texts
    texts = [p.get_text() for p in soup.find_all('p')]
    # Extract tables as markdown
    tables = []
    for t in soup.find_all('table'):
        df = pd.read_html(str(t))[0]
        tables.append(df.to_markdown())
    # Join paragraphs and tables with double newlines
    full = "\n\n".join(texts + tables)
    # Embed the combined text
    resp_emb = client.embeddings.create(model="text-embedding-3-large", input=full)
    vec = resp_emb.data[0].embedding
    node_id = f"HTML::{item['title']}"
    # Add node with URL citation
    G.add_node(
        node_id, text=full, embedding=vec, source=item['title'], url=item['url']
    )

# Run HTML processing
for item in html_data:
    process_html(item)
# Save graph
nx.write_gml(G, "graph.gml")
print("Graph RAG database created: graph.gml")
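# Optional sanity check (not in the original script): query the in-memory graph by
# embedding a question with the same model and ranking nodes by cosine similarity.
# The question text and the retrieve() helper are illustrative assumptions, not the
# author's retrieval pipeline.
import numpy as np

def retrieve(query, k=3):
    # Embed the query with the same model used for the nodes
    q = np.array(client.embeddings.create(
        model="text-embedding-3-large", input=query).data[0].embedding)
    scored = []
    for node, data in G.nodes(data=True):
        v = np.array(data["embedding"])
        # Cosine similarity between the query vector and each node's embedding
        scored.append((float(q @ v / (np.linalg.norm(q) * np.linalg.norm(v))), node))
    scored.sort(reverse=True)
    return scored[:k]

for sim, node in retrieve("What struck-by hazards do collaborative robots introduce?"):
    data = G.nodes[node]
    # Each hit carries a citation: the source URL for HTML nodes or the local PDF path
    print(f"{sim:.3f}  {node}  ->  {data.get('url') or data.get('path')}")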