# RAG pipeline demo (originally hosted as a Hugging Face Space).
from pathlib import Path

from pdf_loader import load_pdf
from optimal_chunker import chunk_documents
from embedder_light import get_embedder, embed_text
from vector_store import get_chroma_client, create_collection
class RAGPipeline:
    """Minimal RAG pipeline: index PDF documents into ChromaDB and retrieve
    the most relevant chunks for a question.

    Wires together the project helpers: `pdf_loader` (PDF -> documents),
    `optimal_chunker` (documents -> chunks), `embedder_light` (text ->
    vectors) and `vector_store` (ChromaDB client / collection).
    """

    def __init__(self):
        # Tokenizer/model pair used for both indexing and querying.
        self.tokenizer, self.model = get_embedder()
        self.db_client = get_chroma_client()
        self.collection = create_collection(self.db_client)

    def index_document(self, pdf_path):
        """Load, chunk, embed and store a single PDF.

        Args:
            pdf_path: Path to the PDF file to index.
        """
        print(f"📄 Loading: {pdf_path}")
        docs = load_pdf(pdf_path)
        print("✂️ Chunking...")
        chunks = chunk_documents(docs)
        print("🔢 Creating embeddings...")
        texts = [chunk.page_content for chunk in chunks]
        vectors = embed_text(texts, self.tokenizer, self.model)
        print("🧠 Adding to ChromaDB...")
        # BUG FIX: ids were "doc_{i}" regardless of the source file, so
        # indexing a second PDF silently overwrote the first document's
        # entries (Chroma upserts on duplicate ids). Prefix ids with the
        # PDF's filename stem to keep them unique per document.
        doc_key = Path(pdf_path).stem
        ids = [f"{doc_key}_{i}" for i in range(len(texts))]
        self.collection.add(documents=texts, embeddings=vectors, ids=ids)
        print(f"✅ Indexed {len(texts)} chunks.")

    def query(self, question, n_results=3):
        """Embed `question`, retrieve the top matching chunks and print them.

        Args:
            question: Natural-language question to search for.
            n_results: How many chunks to retrieve (default 3, as before).

        Returns:
            A newline-joined string of the top chunks (truncated to 500
            chars each), suitable for display in the HF Spaces output.
        """
        print(f"❓ Question: {question}")
        question_vec = embed_text([question], self.tokenizer, self.model)[0]
        results = self.collection.query(
            query_embeddings=[question_vec],
            n_results=n_results
        )
        print("\n🔍 Top Documents:")
        for i, doc in enumerate(results["documents"][0]):
            print(f"{i+1}. {doc[:200]}...\n")
        # Return the answer text for the HF Spaces output widget.
        return "\n\n".join([f"{i+1}. {doc[:500]}" for i, doc in enumerate(results["documents"][0])])
| if __name__ == "__main__": | |
| rag = RAGPipeline() | |
| rag.index_document("sample.pdf") | |
| rag.query("What is this document about?") | |