MonaHamid committed on
Commit
a158a6a
·
verified ·
1 Parent(s): 866767d

Create batch.py

Browse files
Files changed (1) hide show
  1. batch.py +31 -0
batch.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pickle
2
+ import pandas as pd
3
+
4
# Columns that read_data() converts to strings and run() passes to the vectorizer.
+ categorical = ['PULocationID', 'DOLocationID']
5
+
6
def read_data(filename):
    """Load a trip-data parquet file and prepare it for scoring.

    Adds a ``duration`` column holding the trip length in minutes
    (drop-off time minus pick-up time), keeps only trips lasting between
    1 and 60 minutes inclusive, and converts the columns listed in the
    module-level ``categorical`` to strings, with missing values
    replaced by ``-1`` first.

    Args:
        filename: Path or URL handed straight to ``pd.read_parquet``.

    Returns:
        A filtered copy of the loaded frame with the extra ``duration``
        column and string-typed categorical columns.
    """
    trips = pd.read_parquet(filename)

    # Trip length as a timedelta, stored as minutes.
    elapsed = trips.tpep_dropoff_datetime - trips.tpep_pickup_datetime
    trips['duration'] = elapsed.dt.total_seconds() / 60

    # Keep trips between one minute and one hour, bounds included.
    within_bounds = (trips.duration >= 1) & (trips.duration <= 60)
    trips = trips[within_bounds].copy()

    # Missing IDs become -1 so the integer cast succeeds before stringifying.
    cleaned_ids = trips[categorical].fillna(-1)
    trips[categorical] = cleaned_ids.astype('int').astype('str')
    return trips
13
+
14
def run(year: int, month: int):
    """Score one month of yellow-taxi trips and write the predictions to parquet.

    Loads a ``(dv, model)`` pair from ``model.bin`` in the working
    directory, downloads the month's trip data, predicts a duration for
    each remaining trip, and saves ``ride_id`` / ``predicted_duration``
    pairs to ``output_<year>_<month>.parquet``.

    Args:
        year: Four-digit year of the trip-data file to score.
        month: Month number of the trip-data file to score.

    Returns:
        A tuple ``(output_file, mean_prediction)``: the name of the
        parquet file written and the mean of the predicted values.
    """
    # NOTE(review): unpickling runs arbitrary code; model.bin must be trusted.
    with open('model.bin', 'rb') as model_file:
        # presumably a fitted vectorizer and regressor; confirm against training code
        vectorizer, regressor = pickle.load(model_file)

    source_url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year:04d}-{month:02d}.parquet"
    trips = read_data(source_url)

    # One dict per trip, holding only the categorical columns.
    features = vectorizer.transform(trips[categorical].to_dict(orient='records'))
    predictions = regressor.predict(features)

    # Ride ids are built from the index values that survive read_data()'s filter.
    trips['ride_id'] = f'{year:04d}/{month:02d}_' + trips.index.astype('str')
    scored = pd.DataFrame({
        'ride_id': trips['ride_id'],
        'predicted_duration': predictions,
    })

    output_file = f'output_{year:04d}_{month:02d}.parquet'
    scored.to_parquet(output_file, engine='pyarrow', index=False)

    return output_file, predictions.mean()