import os.path
import json

from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter


class DocProcessor:
    """Loads JSON files, splits them into chunks, and caches the chunks as JSONL."""

    def __init__(self, LIST_DIR, PATH_SAVE):
        # LIST_DIR: list of paths to the input JSON files.
        # PATH_SAVE: path of the JSONL file used to cache the chunks.
        self.LIST_DIR = LIST_DIR
        self.PATH_SAVE = PATH_SAVE

    def process_data(self):
        # Reload the cached chunks if they already exist; otherwise build and cache them.
        if os.path.exists(self.PATH_SAVE):
            self.load_docs_from_jsonl()
        else:
            self.create_chunks()
            self.save_docs_to_jsonl()
        self.docs_size = len(self.chunks)

    def create_chunks(self, nb_char=1000, chunk_overlap=100):
        # Wrap each JSON file in a Document, keeping its path as the source metadata.
        data = []
        for path in self.LIST_DIR:
            if path.endswith(".json"):
                with open(path, "r") as f:
                    data.append(Document(page_content=str(json.load(f)), metadata={"source": path}))

        # Split on JSON-friendly separators first ("}", "]") before falling back
        # to paragraph, sentence, word, and character boundaries.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=nb_char,
            chunk_overlap=chunk_overlap,
            add_start_index=True,
            strip_whitespace=True,
            separators=["}", "]", "\n\n", "\n", ".", " ", ""],
        )
        self.chunks = text_splitter.split_documents(data)
        print("Chunks created")

    def save_docs_to_jsonl(self):
        # Serialize each chunk as one JSON object per line.
        with open(self.PATH_SAVE, "w") as jsonl_file:
            for doc in self.chunks:
                jsonl_file.write(doc.json() + "\n")
        print("Data saved")

    def load_docs_from_jsonl(self):
        # Rebuild the Document objects from the JSONL cache.
        self.chunks = []
        with open(self.PATH_SAVE, "r") as jsonl_file:
            for line in jsonl_file:
                self.chunks.append(Document(**json.loads(line)))
        print("Data loaded")
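
# A minimal usage sketch, not part of the original module: the input path and
# the cache location below are hypothetical placeholders. On the first run the
# chunks are built and written to the JSONL cache; later runs reload them.
if __name__ == "__main__":
    processor = DocProcessor(
        LIST_DIR=["data/example.json"],  # hypothetical input JSON file
        PATH_SAVE="data/chunks.jsonl",   # hypothetical JSONL cache path
    )
    processor.process_data()
    print(f"{processor.docs_size} chunks ready")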