Spaces:

seanpedrickcase
/

Light-PDF-Web-QA-Chatbot

Running

App Files Files Community

seanpedrickcase committed 11 days ago

Commit

48d391e

1 Parent(s): d2296bf

Updated user guide example. App should now be able to accept .md files as input

Browse files

Files changed (3) hide show

faiss_embedding/faiss_embedding.zip +2 -2
ingest_borough_plan.py +1 -1
tools/ingest.py +20 -8

faiss_embedding/faiss_embedding.zip CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ca011e3baf4b92201d378f35ab5f2fe0b6d16ac2eaac4f0705b8c4e84e24a6ae
-size 243109

 version https://git-lfs.github.com/spec/v1
+oid sha256:20d1c95c817837b8538657ded2fbc840677ccb28fa92becf8d678d51f5662199
+size 286111

ingest_borough_plan.py CHANGED Viewed

@@ -3,7 +3,7 @@ from tools.ingest import parse_file, text_to_docs, load_embeddings_model, embed_
 borough_plan_text, file_names = parse_file([open("Lambeth_2030-Our_Future_Our_Lambeth.pdf")])
 print("Borough plan text created")
-#print(borough_plan_text)
 borough_plan_docs = text_to_docs(borough_plan_text)
 print("Borough plan docs created")

 borough_plan_text, file_names = parse_file([open("Lambeth_2030-Our_Future_Our_Lambeth.pdf")])
 print("Borough plan text created")
+print(borough_plan_text)
 borough_plan_docs = text_to_docs(borough_plan_text)
 print("Borough plan docs created")

tools/ingest.py CHANGED Viewed

@@ -48,7 +48,7 @@ def determine_file_type(file_path):
             file_path (str): Path to the file.
         Returns:
-            str: File extension (e.g., '.pdf', '.docx', '.txt', '.html').
         """
         return os.path.splitext(file_path)[1].lower()
@@ -74,6 +74,7 @@ def parse_file(file_paths, text_column='text'):
         '.pdf': parse_pdf,
         '.docx': parse_docx,
         '.txt': parse_txt,
         '.html': parse_html,
         '.htm': parse_html,  # Considering both .html and .htm for HTML files
         '.csv': lambda file_path: parse_csv_or_excel(file_path, text_column),
@@ -217,10 +218,10 @@ def parse_docx(file_path):
 def parse_txt(file_path):
     """
-    Read text from a TXT or HTML file.
     Parameters:
-        file_path (str): Path to the TXT or HTML file.
     Returns:
         str: Text content of the file.
@@ -232,6 +233,17 @@ def parse_txt(file_path):
         return file_contents
 def parse_html(page_url, div_filter="p"):
     """
     Determine if the source is a web URL or a local HTML file, extract the content based on the div of choice. Also tries to extract dates (WIP)
@@ -363,11 +375,11 @@ def text_to_docs(text_dict: dict, chunk_size: int = chunk_size) -> List[Document
         # Depending on the file extension, handle the content
         if ext == '.pdf':
-            docs, page_docs = pdf_text_to_docs(content, chunk_size)
-        elif ext in ['.html', '.htm', '.txt', '.docx']:
-            docs = html_text_to_docs(content, chunk_size)
         elif ext in ['.csv', '.xlsx']:
-            docs, page_docs = csv_excel_text_to_docs(content, chunk_size)
         else:
             print(f"Unsupported file type {ext} for {file_path}. Skipping.")
             continue
@@ -436,7 +448,7 @@ def pdf_text_to_docs(text, chunk_size: int = chunk_size) -> List[Document]:
     return doc_sections, page_docs#, parent_doc
-def html_text_to_docs(texts, metadatas, chunk_size:int = chunk_size):
     text_splitter = RecursiveCharacterTextSplitter(
         separators=split_strat,#["\n\n", "\n", ".", "!", "?", ",", " ", ""],

             file_path (str): Path to the file.
         Returns:
+            str: File extension (e.g., '.pdf', '.docx', '.txt', '.html', '.md').
         """
         return os.path.splitext(file_path)[1].lower()
         '.pdf': parse_pdf,
         '.docx': parse_docx,
         '.txt': parse_txt,
+        '.md': parse_markdown,
         '.html': parse_html,
         '.htm': parse_html,  # Considering both .html and .htm for HTML files
         '.csv': lambda file_path: parse_csv_or_excel(file_path, text_column),
 def parse_txt(file_path):
     """
+    Read text from a TXT, HTML, or MD file.
     Parameters:
+        file_path (str): Path to the TXT, HTML, or MD file.
     Returns:
         str: Text content of the file.
         return file_contents
+def parse_markdown(file_path):
+    """
+    Read text from a MD file.
+    """
+    with open(file_path, 'r', encoding="utf-8") as file:
+        file_contents = file.read().replace("  ", " ").strip()
+        file_contents = text_regex_clean(file_contents)
+        return file_contents
 def parse_html(page_url, div_filter="p"):
     """
     Determine if the source is a web URL or a local HTML file, extract the content based on the div of choice. Also tries to extract dates (WIP)
         # Depending on the file extension, handle the content
         if ext == '.pdf':
+            docs, page_docs = pdf_text_to_docs(content, chunk_size=chunk_size)
+        elif ext in ['.html', '.htm', '.txt', '.docx', '.md']:
+            docs = html_text_to_docs(content, chunk_size=chunk_size)
         elif ext in ['.csv', '.xlsx']:
+            docs, page_docs = csv_excel_text_to_docs(content, chunk_size=chunk_size)
         else:
             print(f"Unsupported file type {ext} for {file_path}. Skipping.")
             continue
     return doc_sections, page_docs#, parent_doc
+def html_text_to_docs(texts:list[str], metadatas:dict={}, chunk_size:int = chunk_size):
     text_splitter = RecursiveCharacterTextSplitter(
         separators=split_strat,#["\n\n", "\n", ".", "!", "?", ",", " ", ""],