# Agent_UB/contextual_doc_processor_old.py
import os
import json

from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
import ollama
from tqdm import tqdm

from doc_processor import DocProcessor


class ContextualDocProcessor(DocProcessor):
    """Extends DocProcessor with an LLM-generated summary ("context") per source document."""

    def __init__(self, DATA_DIR, PATH_SAVE, PATH_SAVE_CONTEXT):
        DocProcessor.__init__(self, DATA_DIR, PATH_SAVE)
        self.PATH_SAVE_CONTEXT = PATH_SAVE_CONTEXT

    def process_data(self):
        # Run the base pipeline first, then reuse the cached context if it is
        # already on disk; otherwise generate it and persist it.
        DocProcessor.process_data(self)
        if os.path.exists(self.PATH_SAVE_CONTEXT):
            self.load_context_from_jsonl()
        else:
            self.create_context()
            self.save_context_to_jsonl()

    def create_context(self):
        # Summarize every JSON document in DATA_DIR with a local llama3 model
        # served by Ollama, and keep each summary as a LangChain Document.
        self.context = []
        for filename in tqdm(os.listdir(self.DATA_DIR)):
            if filename.endswith(".json"):
                path = os.path.join(self.DATA_DIR, filename)
                with open(path, 'r') as f:
                    doc = str(json.load(f))
                # French prompt: "Summarize the document below in at most 500 characters."
                prompt = f"Résume le document ci-dessous en 500 caractères maximum:\n\nDocument:\n{doc}"
                response = ollama.chat(
                    model="llama3",
                    messages=[
                        {
                            "role": "user",
                            "content": prompt,
                        },
                    ],
                )
                self.context.append(
                    Document(
                        page_content=response["message"]["content"],
                        metadata={"source": path},
                        id=path,
                    )
                )
        print("Context created")

    def save_context_to_jsonl(self):
        # Write one serialized Document per line (JSON Lines format).
        with open(self.PATH_SAVE_CONTEXT, 'w') as jsonl_file:
            for doc in self.context:
                jsonl_file.write(doc.json() + '\n')
        print("Context saved")

    def load_context_from_jsonl(self):
        # Rebuild the Document objects from the JSONL cache.
        self.context = []
        with open(self.PATH_SAVE_CONTEXT, 'r') as jsonl_file:
            for line in jsonl_file:
                data = json.loads(line)
                obj = Document(**data)
                self.context.append(obj)
        print("Context loaded")
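

# A minimal usage sketch, assuming DATA_DIR holds the raw .json documents and
# the two save paths point at writable JSONL caches. The directory and file
# names below are hypothetical, not taken from the repository.
if __name__ == "__main__":
    processor = ContextualDocProcessor(
        DATA_DIR="data/json_docs",               # hypothetical input folder of .json files
        PATH_SAVE="data/chunks.jsonl",           # hypothetical cache written by DocProcessor
        PATH_SAVE_CONTEXT="data/context.jsonl",  # hypothetical cache of per-document summaries
    )
    # Builds (or reloads) both the base output and the contextual summaries.
    processor.process_data()
    # Inspect the first few summaries.
    for doc in processor.context[:3]:
        print(doc.metadata["source"], "->", doc.page_content[:80])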