import os
import json

from langchain.docstore.document import Document
import ollama
from tqdm import tqdm

from doc_processor import DocProcessor


class ContextualDocProcessor(DocProcessor):
    """DocProcessor that also builds a short LLM-generated summary ("context") per source document."""

    def __init__(self, DATA_DIR, PATH_SAVE, PATH_SAVE_CONTEXT):
        DocProcessor.__init__(self, DATA_DIR, PATH_SAVE)
        self.PATH_SAVE_CONTEXT = PATH_SAVE_CONTEXT

    def process_data(self):
        # Build the regular chunks, then reuse the cached summaries if they
        # exist; otherwise generate and save them.
        DocProcessor.process_data(self)
        if os.path.exists(self.PATH_SAVE_CONTEXT):
            self.load_context_from_jsonl()
        else:
            self.create_context()
            self.save_context_to_jsonl()

    def create_context(self):
        # Summarize every JSON document in DATA_DIR with a local Ollama model.
        self.context = []
        for filename in tqdm(os.listdir(self.DATA_DIR)):
            if filename.endswith(".json"):
                path = os.path.join(self.DATA_DIR, filename)
                with open(path, 'r', encoding='utf-8') as f:
                    doc = str(json.load(f))
                # Prompt (in French): "Summarize the document below in at most 500 characters".
                prompt = f"Résume le document ci-dessous en 500 caractères maximum:\n\nDocument:\n{doc}"
                response = ollama.chat(
                    model="llama3",
                    messages=[
                        {
                            "role": "user",
                            "content": prompt,
                        },
                    ],
                )
                self.context.append(
                    Document(
                        page_content=response["message"]["content"],
                        metadata={"source": path},
                        id=path,
                    )
                )
        print("Context created")

    def save_context_to_jsonl(self):
        # Persist the summaries as one JSON document per line.
        with open(self.PATH_SAVE_CONTEXT, 'w', encoding='utf-8') as jsonl_file:
            for doc in self.context:
                jsonl_file.write(doc.json() + '\n')
        print("Context saved")

    def load_context_from_jsonl(self):
        # Reload previously saved summaries from the JSONL cache.
        self.context = []
        with open(self.PATH_SAVE_CONTEXT, 'r', encoding='utf-8') as jsonl_file:
            for line in jsonl_file:
                data = json.loads(line)
                self.context.append(Document(**data))
        print("Context loaded")
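

# Minimal usage sketch: the directory and file paths below are hypothetical
# placeholders, and it assumes a local Ollama server with the "llama3" model
# pulled and a DATA_DIR containing the *.json source documents.
if __name__ == "__main__":
    processor = ContextualDocProcessor(
        DATA_DIR="data/json_docs",                 # hypothetical input folder
        PATH_SAVE="output/chunks.jsonl",           # hypothetical chunk cache used by DocProcessor
        PATH_SAVE_CONTEXT="output/context.jsonl",  # hypothetical summary cache
    )
    processor.process_data()
    print(f"{len(processor.context)} contextual summaries available")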