seanpedrickcase committed on
Commit 48d391e · 1 Parent(s): d2296bf

Updated user guide example. App should now be able to accept .md files as input

faiss_embedding/faiss_embedding.zip CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ca011e3baf4b92201d378f35ab5f2fe0b6d16ac2eaac4f0705b8c4e84e24a6ae
-size 243109
+oid sha256:20d1c95c817837b8538657ded2fbc840677ccb28fa92becf8d678d51f5662199
+size 286111
ingest_borough_plan.py CHANGED
@@ -3,7 +3,7 @@ from tools.ingest import parse_file, text_to_docs, load_embeddings_model, embed_
 borough_plan_text, file_names = parse_file([open("Lambeth_2030-Our_Future_Our_Lambeth.pdf")])
 print("Borough plan text created")
 
-#print(borough_plan_text)
+print(borough_plan_text)
 
 borough_plan_docs = text_to_docs(borough_plan_text)
 print("Borough plan docs created")
tools/ingest.py CHANGED
@@ -48,7 +48,7 @@ def determine_file_type(file_path):
     file_path (str): Path to the file.
 
     Returns:
-    str: File extension (e.g., '.pdf', '.docx', '.txt', '.html').
+    str: File extension (e.g., '.pdf', '.docx', '.txt', '.html', '.md').
     """
     return os.path.splitext(file_path)[1].lower()
 
@@ -74,6 +74,7 @@ def parse_file(file_paths, text_column='text'):
         '.pdf': parse_pdf,
         '.docx': parse_docx,
         '.txt': parse_txt,
+        '.md': parse_markdown,
         '.html': parse_html,
         '.htm': parse_html, # Considering both .html and .htm for HTML files
         '.csv': lambda file_path: parse_csv_or_excel(file_path, text_column),
@@ -217,10 +218,10 @@ def parse_docx(file_path):
 
 def parse_txt(file_path):
     """
-    Read text from a TXT or HTML file.
+    Read text from a TXT, HTML, or MD file.
 
     Parameters:
-    file_path (str): Path to the TXT or HTML file.
+    file_path (str): Path to the TXT, HTML, or MD file.
 
     Returns:
     str: Text content of the file.
@@ -232,6 +233,17 @@ def parse_txt(file_path):
 
     return file_contents
 
+def parse_markdown(file_path):
+    """
+    Read text from a MD file.
+    """
+    with open(file_path, 'r', encoding="utf-8") as file:
+        file_contents = file.read().replace("  ", " ").strip()
+
+    file_contents = text_regex_clean(file_contents)
+
+    return file_contents
+
 def parse_html(page_url, div_filter="p"):
     """
     Determine if the source is a web URL or a local HTML file, extract the content based on the div of choice. Also tries to extract dates (WIP)
@@ -363,11 +375,11 @@ def text_to_docs(text_dict: dict, chunk_size: int = chunk_size) -> List[Document]:
 
         # Depending on the file extension, handle the content
         if ext == '.pdf':
-            docs, page_docs = pdf_text_to_docs(content, chunk_size)
-        elif ext in ['.html', '.htm', '.txt', '.docx']:
-            docs = html_text_to_docs(content, chunk_size)
+            docs, page_docs = pdf_text_to_docs(content, chunk_size=chunk_size)
+        elif ext in ['.html', '.htm', '.txt', '.docx', '.md']:
+            docs = html_text_to_docs(content, chunk_size=chunk_size)
         elif ext in ['.csv', '.xlsx']:
-            docs, page_docs = csv_excel_text_to_docs(content, chunk_size)
+            docs, page_docs = csv_excel_text_to_docs(content, chunk_size=chunk_size)
         else:
             print(f"Unsupported file type {ext} for {file_path}. Skipping.")
             continue
@@ -436,7 +448,7 @@ def pdf_text_to_docs(text, chunk_size: int = chunk_size) -> List[Document]:
 
     return doc_sections, page_docs #, parent_doc
 
-def html_text_to_docs(texts, metadatas, chunk_size:int = chunk_size):
+def html_text_to_docs(texts: list[str], metadatas: dict = {}, chunk_size: int = chunk_size):
 
     text_splitter = RecursiveCharacterTextSplitter(
         separators=split_strat, #["\n\n", "\n", ".", "!", "?", ",", " ", ""],
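The routing change itself is small: determine_file_type extracts the lowercased extension, and parse_file looks it up in an extension-to-handler dict that now contains an '.md' entry pointing at the new parse_markdown. A standalone sketch of that dispatch pattern, with simplified placeholder handlers rather than the repository's implementations:

```python
import os

def read_plain_text(file_path: str) -> str:
    # Simplified stand-in for parse_txt / parse_markdown: read and lightly clean the file.
    with open(file_path, "r", encoding="utf-8") as f:
        return f.read().strip()

# Extension-to-handler table, mirroring the dict in parse_file; this commit's
# change is the addition of a '.md' route alongside the existing ones.
file_type_handlers = {
    ".txt": read_plain_text,
    ".md": read_plain_text,
}

def parse_one(file_path: str) -> str:
    ext = os.path.splitext(file_path)[1].lower()  # same idea as determine_file_type
    handler = file_type_handlers.get(ext)
    if handler is None:
        raise ValueError(f"Unsupported file type {ext} for {file_path}")
    return handler(file_path)
```

The switch to keyword arguments in text_to_docs also matters for html_text_to_docs, whose second positional parameter is metadatas: passing chunk_size by keyword (and giving metadatas a default) keeps the chunk size from binding to the wrong parameter.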