import os
import numpy as np
from dotenv import load_dotenv
from openai import OpenAI
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
# Initialize OpenAI client
load_dotenv(override=True)
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
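# Requires OPENAI_API_KEY to be set in the environment (loaded from a
# local .env file by load_dotenv above).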
# Load graph from GML
G = nx.read_gml("graph.gml")
enodes = list(G.nodes)
embeddings = np.array([G.nodes[n]['embedding'] for n in enodes])
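# Each node is expected to carry the attributes used below: 'embedding'
# (the chunk's vector), 'text' (the chunk contents), 'source' (a document
# name), and optionally 'url' or 'path' for citations.
assert embeddings.ndim == 2, "every node needs an 'embedding' of equal length"
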
def query_graph(question, top_k=5):
    """
    Embed the question, retrieve the top_k relevant chunks,
    and return: (answer, sources, chunks)
    - answer: generated response string
    - sources: list of unique source names
    - chunks: list of tuples (header, score, full_text, source_url_or_path)
    """
    # Embed question
    emb_resp = client.embeddings.create(
        model="text-embedding-3-large",
        input=question
    )
    q_vec = emb_resp.data[0].embedding
    # Compute cosine similarities against all stored embeddings
    sims = cosine_similarity([q_vec], embeddings)[0]
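    # argsort ranks ascending, so reverse with [::-1] to put the
    # highest-similarity chunks first, then keep the top_k indices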
    idxs = sims.argsort()[::-1][:top_k]
    # Collect chunk-level info
    chunks = []
    sources = []
    for i in idxs:
        node = enodes[i]
        text = G.nodes[node]['text']
        # Use the first line of the chunk as its header
        header = text.split('\n', 1)[0].lstrip('# ').strip()
        score = sims[i]
        # Determine citation (URL for HTML, path for PDF)
        citation = G.nodes[node].get('url') or G.nodes[node].get('path') or G.nodes[node]['source']
        chunks.append((header, score, text, citation))
        sources.append(G.nodes[node]['source'])
    # Deduplicate sources while preserving order
    sources = list(dict.fromkeys(sources))
    # Assemble the prompt from the chunk texts
    context = "\n\n---\n\n".join([c[2] for c in chunks])
    prompt = (
        "Use the following context to answer the question:\n\n" +
        context +
        f"\n\nQuestion: {question}\nAnswer:"
    )
    # Query chat model
    chat_resp = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant for manufacturing equipment safety."},
            {"role": "user", "content": prompt}
        ]
    )
    answer = chat_resp.choices[0].message.content
    return answer, sources, chunks
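
# For reference: a minimal, illustrative sketch of how a compatible
# graph.gml might be produced. The ingestion pipeline lives elsewhere;
# `chunked_documents` below is a hypothetical list of (text, source)
# pairs, not something defined in this file.
#
# G = nx.Graph()
# for i, (text, source) in enumerate(chunked_documents):
#     emb = client.embeddings.create(
#         model="text-embedding-3-large",
#         input=text
#     ).data[0].embedding
#     G.add_node(f"chunk_{i}", text=text, embedding=emb, source=source)
# nx.write_gml(G, "graph.gml")
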
"""
Embed the user question, retrieve the top_k relevant chunks from the graph,
assemble a prompt with those chunks, call the chat model, and return:
- answer: the generated response
- sources: unique list of source documents
- chunks: list of (header, score, full_text) for the top_k passages
"""
# Embed the question
emb_resp = client.embeddings.create(
model="text-embedding-3-large",
input=question
)
q_vec = emb_resp.data[0].embedding
# Compute similarities against all stored embeddings
sims = cosine_similarity([q_vec], embeddings)[0]
idxs = sims.argsort()[::-1][:top_k]
# Gather chunk‑level info and sources
chunks = []
sources = []
for i in idxs:
node = enodes[i]
text = G.nodes[node]['text']
# Use the first line as the header
header = text.split('\n', 1)[0].lstrip('# ').strip()
score = sims[i]
chunks.append((header, score, text))
sources.append(G.nodes[node]['source'])
# Deduplicate sources while preserving order
sources = list(dict.fromkeys(sources))
# Assemble the prompt from the chunk texts
context_text = "\n\n---\n\n".join([chunk[2] for chunk in chunks])
prompt = (
"Use the following context to answer the question:\n\n"
+ context_text
+ f"\n\nQuestion: {question}\nAnswer:"
)
# Call the chat model
chat_resp = client.chat.completions.create(
model="gpt-4o-mini",
messages=[
{"role": "system", "content": "You are a helpful assistant for manufacturing equipment safety."},
{"role": "user", "content": prompt}
]
)
answer = chat_resp.choices[0].message.content
return answer, sources, chunks
# Test queries
# test_questions = [
# "What are general machine guarding requirements?",
# "Explain the key steps in lockout/tagout procedures."
# ]
# for q in test_questions:
# answer, sources, chunks = query_graph(q)
# print(f"Q: {q}")
# print(f"Answer: {answer}\n")
# print("Sources:")
# for src in sources:
# print(f"- {src}")
# print("\nTop Chunks:")
# for header, score, _, citation in chunks:
# print(f" * {header} (score: {score:.2f}) from {citation}")
# print("\n", "#"*40, "\n")