Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pandas as pd | |
| import json | |
| import re | |
| from langchain_core.documents import Document | |
| from langchain_chroma import Chroma | |
| from langchain_community.embeddings import HuggingFaceEmbeddings | |
| from langchain_community.retrievers import BM25Retriever | |
| from langchain.retrievers import EnsembleRetriever | |
| from langchain_core.runnables import ( | |
| RunnableLambda | |
| ) | |
| from langchain_core.documents import Document | |
def load_data(path="data/test/constitution.json"):
    """Load the constitution JSON document from disk.

    Args:
        path: Location of the JSON file. Defaults to the file the app
            has always read, so existing zero-argument calls are unchanged.

    Returns:
        The parsed JSON content (whatever ``json.load`` produces).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Decode explicitly as UTF-8: the document contains Polish diacritics and
    # the platform default encoding is not UTF-8 everywhere.
    with open(path, encoding="utf-8") as f:
        return json.load(f)
def format_for_search(chapter, subchapter, article_number, article_content):
    """Build the searchable text for a single article.

    The result is the article content, a triple-newline separator, and then
    a metadata trailer (chapter line, optional subchapter heading, optional
    article line). The result formatter in this file splits on that
    triple newline to recover the bare article content.

    Args:
        chapter: Mapping with ``"number"`` and ``"title"`` keys.
        subchapter: Mapping with a ``"heading"`` key.
        article_number: Article key; ``"-1"`` means no article line is added.
        article_content: The article text.

    Returns:
        The combined text as a single string.
    """
    heading = subchapter["heading"]
    heading_block = heading if heading == "" else f"\n{heading}\n"

    article_block = "" if article_number == "-1" else f"\nArtykuł {article_number}\n"

    # Chapter number 0 carries only a title, with no "Rozdział N" prefix.
    if chapter["number"] == 0:
        chapter_line = chapter["title"]
    else:
        chapter_line = f"Rozdział {chapter['number']} {chapter['title']}"
    chapter_line = f"{chapter_line}\n"

    return f"{article_content}\n\n\n{chapter_line}{heading_block}{article_block}"
# Parsed constitution, loaded once at import time and reused by the UI below.
constitution = load_data()
# NOTE(review): this dumps the whole document to stdout on every script run;
# it looks like leftover debugging output - consider removing it.
print(constitution)


def _build_documents(source):
    """Create one search ``Document`` per article found in *source*.

    Walks chapters -> subchapters -> articles in their stored order, so the
    resulting list has the same order as the nested data.
    """
    built = []
    for chapter in source["chapters"]:
        for subchapter_number, subchapter in chapter["subchapters"].items():
            for article_number, article_content in subchapter["articles"].items():
                page_content = format_for_search(
                    chapter, subchapter, article_number, article_content
                )
                metadata = {
                    "chapter_number": chapter["number"],
                    "chapter_title": chapter["title"],
                    "subchapter_title": subchapter["heading"],
                    "subchapter_number": subchapter_number,
                    "article_number": article_number,
                }
                built.append(Document(page_content=page_content, metadata=metadata))
    return built


# Flat list of per-article documents fed to the vector store.
documents = _build_documents(constitution)
def get_full_content(constitution):
    """Render the whole constitution as a single Markdown string.

    Args:
        constitution: Mapping with a ``"chapters"`` list. Each chapter has
            ``"number"``, ``"title"`` and a ``"subchapters"`` mapping; each
            subchapter has a ``"heading"`` and an ``"articles"`` mapping of
            article number -> article text.

    Returns:
        Markdown text: a fixed document header followed by every chapter,
        subchapter heading and article in stored order.
    """
    # Collect fragments and join once instead of growing a string with ``+=``
    # inside three nested loops.
    parts = ["# KONSTYTUCJA\n# RZECZYPOSPOLITEJ POLSKIEJ\n\n## z dnia 2 kwietnia 1997 r.\n"]

    for chapter in constitution["chapters"]:
        # Chapter number 0 gets no "Rozdział N" heading line.
        chapter_prefix = f"## **Rozdział {chapter['number']}**\n\n" if chapter["number"] != 0 else ""
        chapter_title = f"**{chapter['title']}**" if chapter["title"].strip() else ""
        parts.append(f"\n\n{chapter_prefix} {chapter_title}")

        # Only the subchapter values are needed here, not their keys.
        for subchapter in chapter["subchapters"].values():
            if subchapter["heading"].strip():
                parts.append(f"\n\n### **{subchapter['heading']}**")

            for article_number, article_content in subchapter["articles"].items():
                # Article key "-1" is rendered without an "Art. N." label.
                article_prefix = "" if article_number == "-1" else f"**Art. {article_number}.**\n\n"
                parts.append(f"\n\n{article_prefix}{article_content}")

    return "".join(parts)
@st.cache_resource
def load_models():
    """Build the embedding model, the vector store and the query retriever.

    Streamlit re-executes the whole script on every user interaction, so the
    function is cached with ``st.cache_resource``: the model is loaded and the
    module-level ``documents`` are indexed once, then reused across reruns.

    Returns:
        A tuple ``(embeddings, vector_store, retriever)`` where ``retriever``
        prefixes the incoming query before passing it to the vector-store
        retriever.
    """
    embeddings = HuggingFaceEmbeddings(model_name="ipipan/silver-retriever-base-v1")
    vector_store = Chroma.from_documents(documents=documents, embedding=embeddings)
    embeddings_retriever = vector_store.as_retriever()

    # Queries are sent as "Pytanie: <query>" - presumably the query format the
    # embedding model expects; TODO confirm against the model card.
    formatter = RunnableLambda(lambda x: f"Pytanie: {x}")
    formatting_retriever = formatter | embeddings_retriever

    # NOTE: a BM25 + embeddings ensemble retriever was previously sketched
    # here but never enabled; only the embedding retriever is returned.
    return embeddings, vector_store, formatting_retriever
def format_single(doc):
    """Format one retrieved document as Markdown.

    Drops the metadata trailer (everything after the first triple newline in
    ``page_content``) and prepends a bold label taken from the document's
    ``article_number`` metadata; ``"-1"`` is labelled as the preamble.
    """
    meta_removed = doc.page_content.split("\n\n\n")[0]
    if doc.metadata["article_number"] == "-1":
        prefix = "**Preambuła**"
    else:
        prefix = f"**Art. {doc.metadata['article_number']}.**"
    return f"{prefix}\n\n{meta_removed}"


def format_docs(docs):
    """Join the formatted documents into one Markdown string."""
    return "\n\n".join(format_single(doc) for doc in docs)


embeddings, vector_store, retriever = load_models()

st.title('Konstytucja RP')
tab1, tab2 = st.tabs(["Wyszukiwarka", "Treść"])

# Search tab: run the query and show the matching articles.
with tab1:
    question = st.text_input('Zadaj pytanie:', 'Czy zgodnie z konstytucją wszyscy są równi wobec prawa?')
    retrieved_docs = retriever.invoke(question)
    # Format once and render that result (previously formatted twice).
    result = format_docs(retrieved_docs)
    st.markdown(result)

# Content tab: the full text of the constitution.
with tab2:
    st.markdown(get_full_content(constitution))