import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from .pipeline_components import (
    DataCleaner,
    FeatureEngineer,
    CategoricalEncoder,
    KDEKNNFeatureCreator,
    CoordinateGetter,
    ColumnCleaner,
)


def create_preprocessing_pipeline():
    """Create the (unfitted) preprocessing pipeline for real estate data.

    Returns:
        Pipeline: A scikit-learn pipeline chaining, in order, DataCleaner,
        FeatureEngineer, CoordinateGetter, CategoricalEncoder,
        KDEKNNFeatureCreator and ColumnCleaner.
    """
    return Pipeline([
        ('data_cleaner', DataCleaner()),
        ('feature_engineer', FeatureEngineer()),
        ('coordinate_getter', CoordinateGetter()),
        ('categorical_encoder', CategoricalEncoder()),
        ('kde_knn_creator', KDEKNNFeatureCreator()),
        ('column_cleaner', ColumnCleaner()),
        # ('standard_scaler', scaler)
    ])


def preprocess_data(df_train, df_test=None, pipeline=None):
    """Preprocess training and/or test (single prediction) data.

    When ``pipeline`` is None, a new pipeline is created and fitted on
    ``df_train``. When a pipeline is supplied it is only used to transform
    ``df_test``; ``df_train`` is not transformed in that case and both
    ``X_train`` and ``y_train`` come back as None.

    Args:
        df_train (pd.DataFrame | None): Training data. Required when
            ``pipeline`` is None. Rows with missing 'price' or
            'habitableSurface' are dropped.
        df_test (pd.DataFrame, optional): Test data or a single row for
            prediction. Rows with missing 'habitableSurface' (and missing
            'price', when that column exists) are dropped.
        pipeline (Pipeline, optional): Already-fitted pipeline to reuse.

    Returns:
        tuple: ``(X_train, y_train, X_test, y_test, pipeline)``. The test
        entries are None when ``df_test`` is None; ``y_test`` is also None
        when the transformed test data has no 'price' column.

    Raises:
        ValueError: If neither ``pipeline`` nor ``df_train`` is provided,
            since there is then nothing to fit a new pipeline on.
    """
    # Drop rows with NaN values in critical columns.
    if df_train is not None:
        df_train = df_train.dropna(subset=['price', 'habitableSurface'])

    if df_test is not None:
        # Prediction inputs may legitimately lack the target column.
        if 'price' in df_test.columns:
            df_test = df_test.dropna(subset=['price'])
        df_test = df_test.dropna(subset=['habitableSurface'])

    if pipeline is None:
        if df_train is None:
            raise ValueError(
                "df_train is required when no fitted pipeline is provided"
            )
        pipeline = create_preprocessing_pipeline()
        # Assumes the pipeline output is a DataFrame that still carries the
        # 'price' column -- TODO confirm against ColumnCleaner.
        X_train = pipeline.fit_transform(df_train)
        y_train = X_train['price']
        X_train = X_train.drop(columns=['price'])
    else:
        X_train = None
        y_train = None

    # Without test data there is nothing to transform; previously this path
    # crashed on ``None.columns``.
    if df_test is None:
        return X_train, y_train, None, None, pipeline

    X_test = pipeline.transform(df_test)
    if 'price' in X_test.columns:
        y_test = X_test['price']
        X_test = X_test.drop(columns=['price'])
    else:
        y_test = None

    return X_train, y_train, X_test, y_test, pipeline


def main():
    """Load the raw CSV, filter it, preprocess it and save the features.

    Reads ``./data/Kangaroo.csv`` and writes the processed feature matrices
    to ``./data/train_processed.csv`` and ``./data/test_processed.csv``.
    """
    # Load data
    df = pd.read_csv("./data/Kangaroo.csv")
    df = df.drop_duplicates(subset=["id"], keep="first")
    df = df[df['price'] < 1500000]
    df = df.dropna(subset=['price'])

    # Filter EPC scores. ``isin`` also excludes missing values, so no
    # imputation is needed afterwards; ``.copy()`` makes the later column
    # assignment operate on an independent frame rather than a slice.
    epc_order = ['A++', 'A+', 'A', 'B', 'C', 'D', 'E', 'F', 'G']
    df = df[df['epcScore'].isin(epc_order)].copy()

    # Convert price to float
    df['price'] = df['price'].astype(float)

    # Split data
    df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

    # Preprocess data. preprocess_data returns five values; only the feature
    # matrices are persisted here.
    X_train, _y_train, X_test, _y_test, _pipeline = preprocess_data(
        df_train, df_test
    )

    # Save processed data
    pd.DataFrame(X_train).to_csv("./data/train_processed.csv", index=False)
    pd.DataFrame(X_test).to_csv("./data/test_processed.csv", index=False)


if __name__ == "__main__":
    main()