# NOTE: page-status residue from the hosting page ("Spaces: Runtime error"); not part of the script.
"""Build a TREC-style qrels file from the CAsT22 evaluation JSON.

The input JSON is read as a list of conversations. Each conversation has a
``number`` and a list of turns under ``turn``; each turn has a ``number`` and
may carry a ``provenance`` list of passage ids. For every (turn, passage id)
pair whose passage id contains one of ``PASSAGE_ID_VALID_PREFIX``, one line
``"{q_id} {QUERY_TYPE} {passage_id} {RELEVANCE_SCORE}"`` is written, where
``q_id`` is ``"{conversation number}_{turn number}"``. Duplicate pairs are
written only once.
"""
import json

from tqdm import tqdm

QUERY_TYPE = "Q0"  # TREC format legacy
RELEVANCE_SCORE = 1
PASSAGE_ID_VALID_PREFIX = ["MARCO", "KILT"]  # disable WAPO

### read: cast 22 eval json
eval_json_path = "/root/Corpus/CAsT22_eval_queries/cqr_inferred_results.json"

# read data (explicit encoding so the result does not depend on the platform default)
with open(eval_json_path, 'r', encoding="utf-8") as fr:
    data = json.load(fr)

# write: qrels.txt (one line per pair: "{qid} {QUERY_TYPE} {passage_id} {RELEVANCE_SCORE}")
eval_qrels_path = "/root/Corpus/CAsT22_eval_queries/cqr_qrels.txt"

# "qid&pid" keys already written; a set keeps the duplicate check O(1) per pair.
seen_qid_pid_pairs = set()

with open(eval_qrels_path, 'w', encoding="utf-8") as fw:
    for sample in tqdm(data):
        conv_id = sample['number']
        for turn in sample['turn']:
            turn_id = turn['number']
            q_id = f"{conv_id}_{turn_id}"

            if "provenance" not in turn:
                print('no provenance for turn')
                print(turn)
                continue

            for passage_id in turn["provenance"]:
                # NOTE(review): this is a substring test, not a strict prefix
                # test, so ids with leading whitespace are still accepted and
                # cleaned up below. Confirm before tightening to startswith().
                if not any(valid_prefix in passage_id for valid_prefix in PASSAGE_ID_VALID_PREFIX):
                    print(f"exclude passage id: {passage_id}")
                    continue

                if ' ' in passage_id:
                    print(f"delete whitespace in passage_id: {passage_id}")
                    passage_id = passage_id.replace(' ', '')

                # filter out duplicate pairs that exist in the evaluation file
                qid_pid_pair = f"{q_id}&{passage_id}"
                if qid_pid_pair in seen_qid_pid_pairs:
                    print(f"skip appending duplicate qid&pid pair: qid = {q_id}, p_id = {passage_id}")
                    continue

                seen_qid_pid_pairs.add(qid_pid_pair)
                fw.write(f"{q_id} {QUERY_TYPE} {passage_id} {RELEVANCE_SCORE}\n")