|
|
import os.path |
|
|
import json |
|
|
from langchain.docstore.document import Document |
|
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
|
import ollama |
|
|
from tqdm import tqdm |
|
|
|
|
|
from doc_processor import DocProcessor |
|
|
|
|
|
class ContextualDocProcessor(DocProcessor):
    """Document processor that also keeps one LLM-written summary per JSON file.

    On top of whatever ``DocProcessor`` does, this class builds ``self.context``:
    a list of ``Document`` objects, one per ``.json`` file found in
    ``self.DATA_DIR``, whose ``page_content`` is a summary returned by an
    Ollama chat model. The list is cached on disk as a JSON Lines file at
    ``PATH_SAVE_CONTEXT`` so the model is only queried when no cache exists.
    """

    # Ollama model used to write the summaries; override in a subclass to
    # switch models without touching the summarisation logic.
    SUMMARY_MODEL = "llama3"

    def __init__(self, DATA_DIR, PATH_SAVE, PATH_SAVE_CONTEXT):
        """Initialise the base processor and remember the context cache path.

        Args:
            DATA_DIR: Forwarded unchanged to ``DocProcessor.__init__``; later
                read back as ``self.DATA_DIR`` (presumably stored by the base
                class -- confirm in ``doc_processor``).
            PATH_SAVE: Forwarded unchanged to ``DocProcessor.__init__``.
            PATH_SAVE_CONTEXT: Path of the JSON Lines file used to cache the
                generated summaries.
        """
        super().__init__(DATA_DIR, PATH_SAVE)
        self.PATH_SAVE_CONTEXT = PATH_SAVE_CONTEXT

    def process_data(self):
        """Run the base processing, then load or build the summary context.

        If the cache file at ``self.PATH_SAVE_CONTEXT`` exists it is loaded;
        otherwise the summaries are generated and written to that file.
        """
        super().process_data()

        if os.path.exists(self.PATH_SAVE_CONTEXT):
            self.load_context_from_jsonl()
        else:
            self.create_context()
            self.save_context_to_jsonl()

    def _summarize_file(self, path):
        """Return a ``Document`` holding the model's summary of one JSON file."""
        # JSON is UTF-8 by specification; relying on the locale default would
        # corrupt accented characters on platforms whose default is not UTF-8.
        with open(path, 'r', encoding="utf-8") as f:
            doc = str(json.load(f))

        prompt = f"Résume le document ci-dessous en 500 caractères maximum:\n\nDocument:\n{doc}"
        response = ollama.chat(
            model=self.SUMMARY_MODEL,
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                },
            ],
        )
        return Document(
            page_content=response["message"]["content"],
            metadata={"source": path},
            id=path,
        )

    def create_context(self):
        """Summarise every ``.json`` file in ``self.DATA_DIR`` into ``self.context``.

        Files are processed in sorted name order so the resulting list is
        reproducible across runs (``os.listdir`` returns entries in arbitrary
        order). Non-JSON entries are filtered out before the progress bar is
        created so its total reflects the real amount of work.
        """
        json_files = sorted(
            name for name in os.listdir(self.DATA_DIR) if name.endswith(".json")
        )

        self.context = []
        for filename in tqdm(json_files):
            path = os.path.join(self.DATA_DIR, filename)
            self.context.append(self._summarize_file(path))

        print("Chunks created")

    def save_context_to_jsonl(self):
        """Write ``self.context`` to ``self.PATH_SAVE_CONTEXT``, one JSON object per line."""
        # Explicit UTF-8 keeps the cache portable between machines with
        # different locale encodings.
        with open(self.PATH_SAVE_CONTEXT, 'w', encoding="utf-8") as jsonl_file:
            for doc in self.context:
                jsonl_file.write(doc.json() + '\n')
        print("Context saved")

    def load_context_from_jsonl(self):
        """Rebuild ``self.context`` from the JSON Lines cache file.

        Each non-blank line is parsed as JSON and passed as keyword arguments
        to ``Document``. Blank lines (e.g. a trailing newline left by manual
        editing) are skipped instead of raising ``json.JSONDecodeError``.
        """
        self.context = []
        with open(self.PATH_SAVE_CONTEXT, 'r', encoding="utf-8") as jsonl_file:
            for line in jsonl_file:
                if not line.strip():
                    continue
                data = json.loads(line)
                self.context.append(Document(**data))
        print("Context loaded")
|
|
|
|
|
|
|
|
|
|
|
|