import json

def remove_duplicates_by_question_id(file_a: str, file_b: str, output_file: str) -> None:
    """Write the records of ``file_b`` whose ``question_id`` is not in ``file_a``.

    Both inputs are read as JSON Lines: one JSON object per line, with
    whitespace-only lines ignored. Every kept line of ``file_b`` is copied to
    ``output_file`` verbatim (it is not re-serialized), in its original order.
    Skipped blank lines are not copied.

    All files are opened as UTF-8 explicitly so the result does not depend on
    the platform's default locale encoding.

    Args:
        file_a: Path of the JSONL file whose ``question_id`` values are
            to be excluded.
        file_b: Path of the JSONL file to filter.
        output_file: Path of the JSONL file to create. It is opened in
            ``'w'`` mode (truncated) before ``file_b`` is iterated, so it
            must not be the same file as ``file_b``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a JSON object has no ``'question_id'`` key.
        OSError: If any of the files cannot be opened.
    """
    # Collect every question_id present in file A; a set gives O(1) lookups.
    with open(file_a, 'r', encoding='utf-8') as f:
        ids_a = {json.loads(line)['question_id'] for line in f if line.strip()}

    # Stream file B and keep only the records whose id was not seen in A.
    with open(file_b, 'r', encoding='utf-8') as f_in, \
            open(output_file, 'w', encoding='utf-8') as f_out:
        for line in f_in:
            if not line.strip():
                continue
            if json.loads(line)['question_id'] not in ids_a:
                # Write the raw line so the record's formatting is preserved.
                f_out.write(line)

    print(f"处理完成！结果保存到: {output_file}")

# Usage: write to new_tichu_32_mme.jsonl the records of llava_mme.jsonl whose
# question_id does not appear in rank128_sample32.jsonl.
remove_duplicates_by_question_id(
    file_a='/home/bingxing2/ailab/scx6mh7/jkl/LLaVA_8_8_null_space/capture_know/sample_data/MME/rank128_sample32.jsonl',
    file_b='/home/bingxing2/ailab/group/ai4bio/renyuchen/jkl/MME/llava_mme.jsonl',
    output_file='/home/bingxing2/ailab/scx6mh7/jkl/LLaVA_8_8_null_space/capture_know/tichu_sample_data/new_tichu_32_mme.jsonl',
)