"""Generate French question/answer pairs from JSON documents with an LLM.

Every ``.json`` document found one directory level below a data directory is
sent to the ``llama3`` model through ``ollama``. The ``Q:``/``A:`` pairs found
in each reply are collected, together with the document's URL, and written to
a CSV file.
"""

import copy
import json
import os

import ollama
import pandas as pd
from tqdm import tqdm

JSON_SUFFIX = ".json"
# File names (not paths) that are never sent to the model.
EXCLUDED_FILENAMES = (".json", "Accueil.json")
# Substrings that exclude a file when found anywhere in its path.
EXCLUDED_PATH_PARTS = ("actualite", "!trash")


def _parse_qa_pairs(response_text: str) -> list:
    """Extract ``(question, answer)`` tuples from a ``Q: ... A: ...`` reply.

    The text is cut at every ``"Q: "``. Inside each chunk only the FIRST
    answer marker separates question from answer, so a later ``"A: "`` that
    happens to appear inside the answer stays part of the answer instead of
    producing extra fields. Chunks without an answer marker are ignored.
    Surrounding whitespace is stripped from both parts.
    """
    pairs = []
    for chunk in response_text.split("Q: "):
        # Prefer the newline-anchored marker; fall back to the bare one for
        # replies that keep question and answer on the same line.
        question, marker, answer = chunk.partition("\nA: ")
        if not marker:
            question, marker, answer = chunk.partition("A: ")
        if marker:
            pairs.append((question.strip(), answer.strip()))
    return pairs


def ask_a_question(
    path_document: str, df_QA: pd.DataFrame, nb_question: int = 3
) -> pd.DataFrame:
    """Ask the model for Q/A pairs about one document and add them to ``df_QA``.

    Args:
        path_document: Path of a JSON file; it must contain a ``"url"`` key.
        df_QA: Accumulated results. It must have exactly three columns, which
            receive the question, the answer and the document URL.
        nb_question: Number of pairs requested in the prompt (the model's
            reply may contain a different number).

    Returns:
        A data frame with the new rows placed before the rows of ``df_QA``,
        the most recently parsed pair first. ``df_QA`` itself is returned
        when the reply contains no parsable pair.

    Raises:
        KeyError: If the document has no ``"url"`` key.
    """
    # Decode explicitly as UTF-8 rather than with the platform default, which
    # would mangle accented characters on some systems.
    with open(path_document, encoding="utf-8") as json_file:
        document = json.load(json_file)

    prompt = (
        f"Task: Generate {nb_question} question and answer pairs in french "
        "based on the content of the following document. "
        "Constraint: Each answer must be a direct excerpt "
        "(verbatim or nearly verbatim) from the document. "
        "Do not summarize or paraphrase. "
        "Format: Q: [Your question] A: [Exact excerpt from the document] "
        f"Document: {document} "
    )
    url = document["url"]

    response = ollama.chat(
        model="llama3",
        messages=[{"role": "user", "content": prompt}],
    )
    pairs = _parse_qa_pairs(response["message"]["content"])
    if not pairs:
        return df_QA

    # Rows are prepended newest-first; reversing here yields that order with
    # a single concat per document.
    new_rows = pd.DataFrame(
        [[question, answer, url] for question, answer in reversed(pairs)],
        columns=df_QA.columns,
    )
    return pd.concat([new_rows, df_QA], ignore_index=True)


def remove_menu_files(paths: list) -> list:
    """Drop every path whose stem is contained in another path's stem.

    The stem is the path without its last five characters (the ``.json``
    suffix). A path is dropped when its stem is a substring of a different
    stem in ``paths``. The input list is left untouched and the order of the
    remaining paths is preserved.
    """
    suffix_len = len(JSON_SUFFIX)
    stems = [path[:-suffix_len] for path in paths]
    return [
        path
        for path, stem in zip(paths, stems)
        if not any(stem != other and stem in other for other in stems)
    ]


def get_list_dir(DATA_DIR: str) -> list:
    """Return the names of the immediate sub-directories of ``DATA_DIR``."""
    return [
        entry
        for entry in os.listdir(DATA_DIR)
        if os.path.isdir(os.path.join(DATA_DIR, entry))
    ]


def get_data_paths(DATA_DIR: str) -> list:
    """Return the paths of all ``.json`` files one level below ``DATA_DIR``.

    Only files located directly inside a sub-directory of ``DATA_DIR`` are
    listed; deeper levels are not searched.
    """
    list_files = []
    for sub_dir in get_list_dir(DATA_DIR):
        start_path = os.path.join(DATA_DIR, sub_dir)
        list_files += [
            os.path.join(start_path, filename)
            for filename in os.listdir(start_path)
            if filename.endswith(JSON_SUFFIX)
        ]
    return list_files


def _is_candidate_document(path: str) -> bool:
    """Tell whether ``path`` should be sent to the model."""
    if not path.endswith(JSON_SUFFIX):
        return False
    if any(part in path for part in EXCLUDED_PATH_PARTS):
        return False
    # Compare the file NAME: ``path`` is a joined path, so comparing it
    # directly to a bare file name could never match.
    return os.path.basename(path) not in EXCLUDED_FILENAMES


def ask_question_on_multiple_doc(
    data_path: str, filename_save: str, nb_question: int = 3
) -> None:
    """Generate Q/A pairs for every eligible document and save them as CSV.

    Args:
        data_path: Directory whose sub-directories contain the JSON documents.
        filename_save: Path of the CSV file to write (without index column).
        nb_question: Number of pairs requested per document.
    """
    df_QA = pd.DataFrame(columns=["Question", "Réponse", "Lien du site"])

    path_files = [
        path for path in get_data_paths(data_path) if _is_candidate_document(path)
    ]
    path_files = remove_menu_files(path_files)

    for path_file in tqdm(path_files):
        df_QA = ask_a_question(path_file, df_QA, nb_question)

    df_QA.to_csv(filename_save, index=False)


if __name__ == "__main__":
    path_document = "data_websites/"
    QA_filename = "QA_generated.csv"
    NB_QA = 1
    ask_question_on_multiple_doc(path_document, QA_filename, NB_QA)