# RAG pipeline demo (originally hosted as a Hugging Face Space).
from pathlib import Path

from pdf_loader import load_pdf
from optimal_chunker import chunk_documents
from embedder_light import get_embedder, embed_text
from vector_store import get_chroma_client, create_collection
class RAGPipeline:
    """Minimal RAG pipeline: index PDF documents into ChromaDB and retrieve
    the most relevant chunks for a question.

    Wires together the project helpers: `pdf_loader` (PDF -> documents),
    `optimal_chunker` (documents -> chunks), `embedder_light` (text ->
    vectors) and `vector_store` (ChromaDB client / collection).
    """

    def __init__(self):
        # Tokenizer/model pair used for both indexing and querying.
        self.tokenizer, self.model = get_embedder()
        self.db_client = get_chroma_client()
        self.collection = create_collection(self.db_client)

    def index_document(self, pdf_path):
        """Load, chunk, embed and store a single PDF.

        Args:
            pdf_path: Path to the PDF file to index.
        """
        print(f"📄 Loading: {pdf_path}")
        docs = load_pdf(pdf_path)
        print("✂️ Chunking...")
        chunks = chunk_documents(docs)
        print("🔢 Creating embeddings...")
        texts = [chunk.page_content for chunk in chunks]
        vectors = embed_text(texts, self.tokenizer, self.model)
        print("🧠 Adding to ChromaDB...")
        # BUG FIX: ids were "doc_{i}" regardless of the source file, so
        # indexing a second PDF silently overwrote the first document's
        # entries (Chroma upserts on duplicate ids). Prefix ids with the
        # PDF's filename stem to keep them unique per document.
        doc_key = Path(pdf_path).stem
        ids = [f"{doc_key}_{i}" for i in range(len(texts))]
        self.collection.add(documents=texts, embeddings=vectors, ids=ids)
        print(f"✅ Indexed {len(texts)} chunks.")

    def query(self, question, n_results=3):
        """Embed `question`, retrieve the top matching chunks and print them.

        Args:
            question: Natural-language question to search for.
            n_results: How many chunks to retrieve (default 3, as before).

        Returns:
            A newline-joined string of the top chunks (truncated to 500
            chars each), suitable for display in the HF Spaces output.
        """
        print(f"❓ Question: {question}")
        question_vec = embed_text([question], self.tokenizer, self.model)[0]
        results = self.collection.query(
            query_embeddings=[question_vec],
            n_results=n_results
        )
        print("\n🔍 Top Documents:")
        for i, doc in enumerate(results["documents"][0]):
            print(f"{i+1}. {doc[:200]}...\n")
        # Return the answer text for the HF Spaces output widget.
        return "\n\n".join([f"{i+1}. {doc[:500]}" for i, doc in enumerate(results["documents"][0])])
| if __name__ == "__main__": | |
| rag = RAGPipeline() | |
| rag.index_document("sample.pdf") | |
| rag.query("What is this document about?") | |