import json
import random
from pathlib import Path

def _load_json_records(path):
    """Load a list of records from a JSON array file or a JSON Lines file.

    The file is first parsed as one JSON document. If that fails, it is
    re-parsed as JSON Lines (one JSON value per non-blank line).

    Args:
        path: Path of the file to read (UTF-8 encoded).

    Returns:
        list: The records contained in the file.

    Raises:
        json.JSONDecodeError: If the content is neither a valid JSON document
            nor valid JSON Lines. For a bad JSON Lines record, the error's
            line/column refer to the position in the whole file.
        TypeError: If the file holds a single JSON document that is not an
            array and the file name does not end in ``.jsonl``/``.ndjson``.
    """
    with open(path, 'r', encoding='utf-8') as f:
        text = f.read()

    try:
        data = json.loads(text)
    except json.JSONDecodeError as whole_err:
        # Not a single JSON document: try one JSON value per line instead.
        records = []
        offset = 0  # index in `text` where the current line starts
        # Split on '\n' only: str.splitlines() would also split on characters
        # such as U+2028 that may appear unescaped inside JSON strings.
        for line in text.split('\n'):
            if line.strip():
                try:
                    records.append(json.loads(line))
                except json.JSONDecodeError as line_err:
                    if not records:
                        # Even the first line is not JSON, so this is not
                        # JSON Lines; the whole-document error is the
                        # informative one.
                        raise whole_err from None
                    # Re-anchor the error so lineno/colno refer to the file.
                    raise json.JSONDecodeError(
                        line_err.msg, text, offset + line_err.pos
                    ) from None
            offset += len(line) + 1
        if not records:
            # Empty or whitespace-only file.
            raise whole_err from None
        return records

    if isinstance(data, list):
        return data
    if str(path).lower().endswith(('.jsonl', '.ndjson')):
        # A JSON Lines file with exactly one record parses as one document.
        return [data]
    raise TypeError(
        f"文件 {path} 的顶层结构必须是JSON数组，实际为 {type(data).__name__}"
    )


def merge_and_shuffle_json_files(file1_path, file2_path, output_path, seed=42):
    """
    Merge the records of two JSON files and write them out in shuffled order.

    Each input may be either a JSON array file or a JSON Lines file. The
    output is always a single JSON array, written with ``indent=4`` and
    without ASCII-escaping of non-ASCII characters. Progress is printed to
    stdout.

    Args:
        file1_path (str): Path of the first input file.
        file2_path (str): Path of the second input file.
        output_path (str): Path of the output file (overwritten if present).
        seed (int): Seed for the shuffle; defaults to 42.

    Raises:
        json.JSONDecodeError: If an input file is not valid JSON / JSON Lines.
        TypeError: If an input is a single JSON document that is not an array.

    Note:
        This seeds the module-level ``random`` generator as a side effect.
    """

    # Seed the shared generator so the shuffle below is reproducible.
    random.seed(seed)

    # Read the first input file.
    print(f"正在读取第一个文件: {file1_path}")
    data1 = _load_json_records(file1_path)
    print(f"第一个文件包含 {len(data1)} 条数据")

    # Read the second input file.
    print(f"正在读取第二个文件: {file2_path}")
    data2 = _load_json_records(file2_path)
    print(f"第二个文件包含 {len(data2)} 条数据")

    # Concatenate into a new list; the loaded lists are left untouched.
    merged_data = data1 + data2
    print(f"合并后总共 {len(merged_data)} 条数据")

    # Shuffle in place using the generator seeded above.
    print(f"使用种子 {seed} 随机打乱数据...")
    random.shuffle(merged_data)

    # Write the shuffled records as one JSON array.
    print(f"正在保存到: {output_path}")
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(merged_data, f, ensure_ascii=False, indent=4)

    print(f"合并完成！输出文件: {output_path}")
    print(f"总共处理了 {len(merged_data)} 条数据")

# Input and output file paths (hard-coded for this run).
file1_path = "/home/bingxing2/ailab/scx6mh7/jkl/LLaVA_8_8_null_space/train_data/step4_build_self_qa.jsonl"
file2_path = "/home/bingxing2/ailab/scx6mh7/jkl/LLaVA_8_8_null_space/train_data/step11_build_instruction_tasks_data.jsonl"
output_path = "/home/bingxing2/ailab/scx6mh7/jkl/LLaVA_8_8_null_space/train_data/random_train_data_seed42.json"

# Run the merge-and-shuffle when executed as a script.
if __name__ == "__main__":
    try:
        merge_and_shuffle_json_files(file1_path, file2_path, output_path, seed=42)
    except FileNotFoundError as e:
        print(f"错误：找不到文件 - {e}")
        # Exit non-zero so shells and job schedulers can detect the failure.
        raise SystemExit(1)
    except json.JSONDecodeError as e:
        print(f"错误：JSON格式错误 - {e}")
        raise SystemExit(1)
    except Exception as e:
        # Top-level boundary: report any other failure, then exit non-zero.
        print(f"发生错误：{e}")
        raise SystemExit(1)