Delta-Vector committed on
Commit
d88e8c3
·
verified ·
1 Parent(s): 5a14f5a

Upload parquet.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. parquet.py +48 -0
parquet.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import json
3
+ import argparse
4
+ import os
5
+ import numpy as np
6
+
7
class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy values into plain Python equivalents.

    Arrays are written as (possibly nested) lists; NumPy integer, floating
    and boolean scalars are written as ``int``, ``float`` and ``bool``.
    Every other object is passed on to ``json.JSONEncoder.default``.
    """

    def default(self, obj):
        """Return a JSON-serializable stand-in for *obj*."""
        if isinstance(obj, np.ndarray):
            return obj.tolist()

        # Scalar families paired with the builtin used to convert them.
        scalar_casts = (
            (np.integer, int),
            (np.floating, float),
            (np.bool_, bool),
        )
        for numpy_type, to_builtin in scalar_casts:
            if isinstance(obj, numpy_type):
                return to_builtin(obj)

        # Not a NumPy value handled here: defer to the base implementation.
        return super().default(obj)
18
+
19
def parquet_to_jsonl(input_path: str, output_path: str):
    """Convert a Parquet file into a JSON Lines file, one object per row.

    Each row is written as a JSON object keyed by column label, followed by
    a newline. Values are serialized with ``NumpyEncoder`` and non-ASCII
    characters are written as-is (``ensure_ascii=False``).

    Args:
        input_path: Path of the Parquet file to read.
        output_path: Path of the JSONL file to write. Its parent directory
            is created if it does not exist yet.
    """
    print(f"Reading parquet file: {input_path}")
    df = pd.read_parquet(input_path)

    # dirname is empty when output_path is a bare file name; nothing to create.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    print(f"Converting to JSONL: {output_path}")
    columns = list(df.columns)
    with open(output_path, 'w', encoding='utf-8') as f:
        # itertuples reads values column by column, so every value keeps its
        # column's dtype. iterrows builds one Series per row, which upcasts
        # mixed int/float rows and would write integers as e.g. 1.0.
        for values in df.itertuples(index=False, name=None):
            record = dict(zip(columns, values))
            f.write(json.dumps(record, ensure_ascii=False, cls=NumpyEncoder) + '\n')

    print(f"Converted {len(df)} rows to JSONL")
35
+
36
if __name__ == "__main__":
    # Command-line entry point: <input parquet file> <output directory>.
    parser = argparse.ArgumentParser(description="Convert Parquet to JSONL")
    for arg_name, arg_help in (
        ("input", "Input parquet file"),
        ("output_dir", "Directory to save the output JSONL file"),
    ):
        parser.add_argument(arg_name, help=arg_help)
    args = parser.parse_args()

    # Make sure the destination directory exists before converting.
    os.makedirs(args.output_dir, exist_ok=True)

    # The output keeps the input's base name, with the extension swapped
    # for ".jsonl".
    input_name = os.path.basename(args.input)
    input_stem = os.path.splitext(input_name)[0]
    output_file = os.path.join(args.output_dir, f"{input_stem}.jsonl")

    parquet_to_jsonl(args.input, output_file)