Spaces:
Runtime error
Runtime error
| # dedup script shared by Sean MacAvaney on TREC Slack workspace, 2022/10/28 | |
| # usage: python dedup.py msmarco-v2-passage-neardupes.txt.gz myrun1 myrun2 ... | |
| # outputs myrun1.dedup, myrun2.dedup, etc. | |
| from collections import Counter | |
| import argparse | |
| import gzip | |
def _load_equiv_map(dupefile):
    """Return a ``doc_id -> class_id`` mapping read from a gzipped text file.

    Each non-blank line is split on whitespace; the first column is used as
    the equivalence-class id and the second as the document id.  Documents
    whose id equals their class id are not stored, so a lookup miss means
    "the id stands for itself".

    Raises:
        IndexError: if a non-blank line has fewer than two columns.
    """
    equiv_map = {}
    # Use a context manager so the gzip handle is closed deterministically
    # instead of being left to the garbage collector.
    with gzip.open(dupefile, 'rt') as fin:
        for line in fin:
            cols = line.split()
            if not cols:
                # Blank line (e.g. a stray trailing newline): nothing to map.
                continue
            class_id, doc_id = cols[0], cols[1]
            if class_id != doc_id:
                equiv_map[doc_id] = class_id
    return equiv_map


def _dedup_run(run, equiv_map):
    """Write ``run + '.dedup'`` with one entry per (query, class) pair.

    Lines are processed in file order.  Each document id is replaced by its
    class id from ``equiv_map`` (or kept as-is when absent); only the first
    line seen for a given query/class pair is written.  Surviving lines are
    re-ranked per query with consecutive integers starting at 0, while the
    score and run tag columns are copied through unchanged.

    Raises:
        ValueError: if a non-blank line does not have exactly six
            whitespace-separated columns.
    """
    seen = set()
    qid_count = Counter()
    with open(run, 'rt') as fin, open(run + '.dedup', 'wt') as fout:
        for line in fin:
            if not line.strip():
                # Skip blank lines rather than failing on the unpack below.
                continue
            qid, q0, did, _rank, score, runid = line.split()
            class_id = equiv_map.get(did, did)
            key = (qid, class_id)
            if key in seen:
                continue
            seen.add(key)
            new_rank = str(qid_count[qid])
            fout.write(' '.join([qid, q0, class_id, new_rank, score, runid]) + '\n')
            qid_count[qid] += 1


def main(argv=None):
    """Command-line entry point: ``dedup.py DUPEFILE RUN [RUN ...]``.

    Loads the equivalence map once from DUPEFILE, then writes a
    ``<run>.dedup`` file next to every RUN given.  ``argv`` defaults to
    ``sys.argv[1:]`` (argparse's behaviour when passed ``None``).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('dupefile')
    parser.add_argument('runs', nargs='+')
    args = parser.parse_args(argv)

    equiv_map = _load_equiv_map(args.dupefile)
    for run in args.runs:
        _dedup_run(run, equiv_map)


if __name__ == '__main__':
    main()