Commit
·
48d391e
1
Parent(s):
d2296bf
Updated user guide example. App should now be able to accept .md files as input
Browse files
- faiss_embedding/faiss_embedding.zip +2 -2
- ingest_borough_plan.py +1 -1
- tools/ingest.py +20 -8
faiss_embedding/faiss_embedding.zip
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:20d1c95c817837b8538657ded2fbc840677ccb28fa92becf8d678d51f5662199
|
| 3 |
+
size 286111
|
ingest_borough_plan.py
CHANGED
|
@@ -3,7 +3,7 @@ from tools.ingest import parse_file, text_to_docs, load_embeddings_model, embed_
|
|
| 3 |
borough_plan_text, file_names = parse_file([open("Lambeth_2030-Our_Future_Our_Lambeth.pdf")])
|
| 4 |
print("Borough plan text created")
|
| 5 |
|
| 6 |
-
|
| 7 |
|
| 8 |
borough_plan_docs = text_to_docs(borough_plan_text)
|
| 9 |
print("Borough plan docs created")
|
|
|
|
| 3 |
borough_plan_text, file_names = parse_file([open("Lambeth_2030-Our_Future_Our_Lambeth.pdf")])
|
| 4 |
print("Borough plan text created")
|
| 5 |
|
| 6 |
+
print(borough_plan_text)
|
| 7 |
|
| 8 |
borough_plan_docs = text_to_docs(borough_plan_text)
|
| 9 |
print("Borough plan docs created")
|
tools/ingest.py
CHANGED
|
@@ -48,7 +48,7 @@ def determine_file_type(file_path):
|
|
| 48 |
file_path (str): Path to the file.
|
| 49 |
|
| 50 |
Returns:
|
| 51 |
-
str: File extension (e.g., '.pdf', '.docx', '.txt', '.html').
|
| 52 |
"""
|
| 53 |
return os.path.splitext(file_path)[1].lower()
|
| 54 |
|
|
@@ -74,6 +74,7 @@ def parse_file(file_paths, text_column='text'):
|
|
| 74 |
'.pdf': parse_pdf,
|
| 75 |
'.docx': parse_docx,
|
| 76 |
'.txt': parse_txt,
|
|
|
|
| 77 |
'.html': parse_html,
|
| 78 |
'.htm': parse_html, # Considering both .html and .htm for HTML files
|
| 79 |
'.csv': lambda file_path: parse_csv_or_excel(file_path, text_column),
|
|
@@ -217,10 +218,10 @@ def parse_docx(file_path):
|
|
| 217 |
|
| 218 |
def parse_txt(file_path):
|
| 219 |
"""
|
| 220 |
-
Read text from a TXT or
|
| 221 |
|
| 222 |
Parameters:
|
| 223 |
-
file_path (str): Path to the TXT or
|
| 224 |
|
| 225 |
Returns:
|
| 226 |
str: Text content of the file.
|
|
@@ -232,6 +233,17 @@ def parse_txt(file_path):
|
|
| 232 |
|
| 233 |
return file_contents
|
| 234 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 235 |
def parse_html(page_url, div_filter="p"):
|
| 236 |
"""
|
| 237 |
Determine if the source is a web URL or a local HTML file, extract the content based on the div of choice. Also tries to extract dates (WIP)
|
|
@@ -363,11 +375,11 @@ def text_to_docs(text_dict: dict, chunk_size: int = chunk_size) -> List[Document
|
|
| 363 |
|
| 364 |
# Depending on the file extension, handle the content
|
| 365 |
if ext == '.pdf':
|
| 366 |
-
docs, page_docs = pdf_text_to_docs(content, chunk_size)
|
| 367 |
-
elif ext in ['.html', '.htm', '.txt', '.docx']:
|
| 368 |
-
docs = html_text_to_docs(content, chunk_size)
|
| 369 |
elif ext in ['.csv', '.xlsx']:
|
| 370 |
-
docs, page_docs = csv_excel_text_to_docs(content, chunk_size)
|
| 371 |
else:
|
| 372 |
print(f"Unsupported file type {ext} for {file_path}. Skipping.")
|
| 373 |
continue
|
|
@@ -436,7 +448,7 @@ def pdf_text_to_docs(text, chunk_size: int = chunk_size) -> List[Document]:
|
|
| 436 |
|
| 437 |
return doc_sections, page_docs#, parent_doc
|
| 438 |
|
| 439 |
-
def html_text_to_docs(texts, metadatas, chunk_size:int = chunk_size):
|
| 440 |
|
| 441 |
text_splitter = RecursiveCharacterTextSplitter(
|
| 442 |
separators=split_strat,#["\n\n", "\n", ".", "!", "?", ",", " ", ""],
|
|
|
|
| 48 |
file_path (str): Path to the file.
|
| 49 |
|
| 50 |
Returns:
|
| 51 |
+
str: File extension (e.g., '.pdf', '.docx', '.txt', '.html', '.md').
|
| 52 |
"""
|
| 53 |
return os.path.splitext(file_path)[1].lower()
|
| 54 |
|
|
|
|
| 74 |
'.pdf': parse_pdf,
|
| 75 |
'.docx': parse_docx,
|
| 76 |
'.txt': parse_txt,
|
| 77 |
+
'.md': parse_markdown,
|
| 78 |
'.html': parse_html,
|
| 79 |
'.htm': parse_html, # Considering both .html and .htm for HTML files
|
| 80 |
'.csv': lambda file_path: parse_csv_or_excel(file_path, text_column),
|
|
|
|
| 218 |
|
| 219 |
def parse_txt(file_path):
|
| 220 |
"""
|
| 221 |
+
Read text from a TXT, HTML, or MD file.
|
| 222 |
|
| 223 |
Parameters:
|
| 224 |
+
file_path (str): Path to the TXT, HTML, or MD file.
|
| 225 |
|
| 226 |
Returns:
|
| 227 |
str: Text content of the file.
|
|
|
|
| 233 |
|
| 234 |
return file_contents
|
| 235 |
|
| 236 |
+
def parse_markdown(file_path):
|
| 237 |
+
"""
|
| 238 |
+
Read text from a MD file.
|
| 239 |
+
"""
|
| 240 |
+
with open(file_path, 'r', encoding="utf-8") as file:
|
| 241 |
+
file_contents = file.read().replace(" ", " ").strip()
|
| 242 |
+
|
| 243 |
+
file_contents = text_regex_clean(file_contents)
|
| 244 |
+
|
| 245 |
+
return file_contents
|
| 246 |
+
|
| 247 |
def parse_html(page_url, div_filter="p"):
|
| 248 |
"""
|
| 249 |
Determine if the source is a web URL or a local HTML file, extract the content based on the div of choice. Also tries to extract dates (WIP)
|
|
|
|
| 375 |
|
| 376 |
# Depending on the file extension, handle the content
|
| 377 |
if ext == '.pdf':
|
| 378 |
+
docs, page_docs = pdf_text_to_docs(content, chunk_size=chunk_size)
|
| 379 |
+
elif ext in ['.html', '.htm', '.txt', '.docx', '.md']:
|
| 380 |
+
docs = html_text_to_docs(content, chunk_size=chunk_size)
|
| 381 |
elif ext in ['.csv', '.xlsx']:
|
| 382 |
+
docs, page_docs = csv_excel_text_to_docs(content, chunk_size=chunk_size)
|
| 383 |
else:
|
| 384 |
print(f"Unsupported file type {ext} for {file_path}. Skipping.")
|
| 385 |
continue
|
|
|
|
| 448 |
|
| 449 |
return doc_sections, page_docs#, parent_doc
|
| 450 |
|
| 451 |
+
def html_text_to_docs(texts:list[str], metadatas:dict={}, chunk_size:int = chunk_size):
|
| 452 |
|
| 453 |
text_splitter = RecursiveCharacterTextSplitter(
|
| 454 |
separators=split_strat,#["\n\n", "\n", ".", "!", "?", ",", " ", ""],
|