Rhodham96 commited on
Commit
5089ff4
·
1 Parent(s): 1e47a48
.gitignore ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Ignore Python bytecode
2
+ __pycache__/
3
+ *.pyc
4
+ *.pyo
5
+ *.pyd
6
+
7
+ # Ignore virtualenv folders (adapt to your environment)
8
+ env/
9
+ venv/
10
+ .machine_learning/
11
+
Dockerfile ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Use a Python base image
2
+ FROM python:3.10-slim-buster
3
+
4
+ # Set the working directory inside the container
5
+ WORKDIR /app
6
+
7
+ # Copy the requirements file
8
+ COPY requirements.txt .
9
+ # Install the dependencies
10
+ RUN pip install --no-cache-dir -r requirements.txt
11
+
12
+ # Copy the application code
13
+ COPY . /app
14
+
15
+ # Expose the port Streamlit runs on
16
+ EXPOSE 7860
17
+
18
+ # Define the command to run to launch the application
19
+ CMD ["streamlit", "run", "app.py", "--server.port=7860", "--server.enableCORS=false"]
README.md CHANGED
@@ -1,10 +1,11 @@
1
  ---
2
  title: RealEstate PricePrediction Belgium
3
- emoji: 🏆
4
- colorFrom: indigo
5
- colorTo: indigo
6
  sdk: docker
7
  pinned: false
 
8
  ---
9
 
10
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
  title: RealEstate PricePrediction Belgium
3
+ emoji: 🌍
4
+ colorFrom: gray
5
+ colorTo: red
6
  sdk: docker
7
  pinned: false
8
+ license: mit
9
  ---
10
 
11
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import joblib
4
+
5
@st.cache_resource
def _load_artifacts():
    """Load the fitted preprocessing pipeline, model and expected column list.

    Cached with st.cache_resource so the pickles are deserialized once per
    server process instead of being re-read from disk on every click of the
    "Predict" button (the original reloaded all three files per prediction).
    """
    pipeline = joblib.load('saved/pipeline.pkl')
    model = joblib.load('saved/model.pkl')
    expected_columns = joblib.load('saved/columns.pkl')
    return pipeline, model, expected_columns


def create_dataframe_from_user_input():
    """
    Collects user input for house features using Streamlit and, when the
    "Predict" button is pressed, builds a one-row Pandas DataFrame, runs it
    through the saved preprocessing pipeline + model and displays the
    predicted price.

    Returns:
        pd.DataFrame: the one-row input frame when Predict was clicked;
        otherwise the function falls through and implicitly returns None.
    """

    # Define the lists of possible values for dropdown selections
    type_list = ['HOUSE', 'APARTMENT']
    subtype_list = ['HOUSE', 'APARTMENT', 'VILLA', 'APARTMENT_BLOCK', 'APARTMENT_GROUP',
                    'MIXED_USE_BUILDING', 'GROUND_FLOOR', 'DUPLEX', 'HOUSE_GROUP',
                    'FLAT_STUDIO', 'PENTHOUSE', 'EXCEPTIONAL_PROPERTY', 'MANSION',
                    'TOWN_HOUSE', 'SERVICE_FLAT', 'BUNGALOW', 'KOT', 'COUNTRY_COTTAGE',
                    'FARMHOUSE', 'LOFT', 'CHALET', 'TRIPLEX', 'CASTLE', 'OTHER_PROPERTY',
                    'MANOR_HOUSE', 'PAVILION']
    province_list = ['West Flanders', 'Antwerp', 'East Flanders', 'Brussels', 'Hainaut',
                     'Liège', 'Flemish Brabant', 'Limburg', 'Walloon Brabant', 'Namur',
                     'Luxembourg']
    building_condition_list = ['GOOD', 'AS_NEW', 'TO_RENOVATE', 'TO_BE_DONE_UP',
                               'JUST_RENOVATED', 'TO_RESTORE']
    flood_zone_type_list = ['NON_FLOOD_ZONE', 'POSSIBLE_FLOOD_ZONE', 'RECOGNIZED_FLOOD_ZONE',
                            'RECOGNIZED_N_CIRCUMSCRIBED_FLOOD_ZONE', 'CIRCUMSCRIBED_WATERSIDE_ZONE',
                            'CIRCUMSCRIBED_FLOOD_ZONE', 'POSSIBLE_N_CIRCUMSCRIBED_FLOOD_ZONE',
                            'POSSIBLE_N_CIRCUMSCRIBED_WATERSIDE_ZONE', 'RECOGNIZED_N_CIRCUMSCRIBED_WATERSIDE_FLOOD_ZONE']
    heating_type_list = ['GAS', 'FUELOIL', 'ELECTRIC', 'PELLET', 'WOOD', 'SOLAR', 'CARBON']
    kitchen_type_list = ['INSTALLED', 'HYPER_EQUIPPED', 'SEMI_EQUIPPED', 'NOT_INSTALLED',
                         'USA_HYPER_EQUIPPED', 'USA_INSTALLED', 'USA_SEMI_EQUIPPED',
                         'USA_UNINSTALLED']
    garden_orientation_list = ['SOUTH', 'SOUTH_WEST', 'SOUTH_EAST', 'WEST', 'EAST',
                               'NORTH_WEST', 'NORTH_EAST', 'NORTH']
    terrace_orientation_list = ['SOUTH', 'SOUTH_WEST', 'SOUTH_EAST', 'WEST', 'EAST',
                                'NORTH_WEST', 'NORTH_EAST', 'NORTH']
    epc_score_list = ['B', 'C', 'D', 'A', 'F', 'E', 'G', 'A+', 'A++']

    # Create Streamlit input fields
    st.header("Enter House Information")

    col1, col2 = st.columns(2)  # Divide the layout in two columns

    with col1:
        property_type = st.selectbox("Property Type", type_list, key='type')
        property_subtype = st.selectbox("Subtype", subtype_list, key='subtype')
        bedroom_count = st.number_input("Bedroom Count", min_value=0, step=1, key='bedroomCount')
        bathroom_count = st.number_input("Bathroom Count", min_value=0, step=1, key='bathroomCount')
        province = st.selectbox("Province", province_list, key='province')
        locality = st.text_input("Locality", key='locality')
        post_code = st.number_input("Post Code", min_value=1000, max_value=9999, step=1, key='postCode')
        habitable_surface = st.number_input("Habitable Surface (sqm)", min_value=0.0, key='habitableSurface')
        room_count = st.number_input("Room Count", min_value=0, step=1, key='roomCount')
        monthly_cost = st.number_input("Monthly Cost (€)", min_value=0.0, key='monthlyCost')
        has_attic = st.selectbox("Has Attic", ['Yes', 'No'], key='hasAttic')
        has_basement = st.selectbox("Has Basement", ['Yes', 'No'], key='hasBasement')
        has_dressing_room = st.selectbox("Has Dressing Room", ['Yes', 'No'], key='hasDressingRoom')
        dining_room_surface = st.number_input("Dining Room Surface (sqm)", min_value=0.0, key='diningRoomSurface')
        has_dining_room = st.selectbox("Has Dining Room", ['Yes', 'No'], key='hasDiningRoom')
        building_condition = st.selectbox("Building Condition", building_condition_list, key='buildingCondition')
        building_construction_year = st.number_input("Building Construction Year", min_value=1000, max_value=2024, step=1, key='buildingConstructionYear')
        facade_count = st.number_input("Facade Count", min_value=0, step=1, key='facadeCount')
        floor_count = st.number_input("Floor Count", min_value=0, step=1, key='floorCount')
        street_facade_width = st.number_input("Street Facade Width (m)", min_value=0.0, key='streetFacadeWidth')
        has_lift = st.selectbox("Has Lift", ['Yes', 'No'], key='hasLift')
        flood_zone_type = st.selectbox("Flood Zone Type", flood_zone_type_list, key='floodZoneType')
        heating_type = st.selectbox("Heating Type", heating_type_list, key='heatingType')
        has_heat_pump = st.selectbox("Has Heat Pump", ['Yes', 'No'], key='hasHeatPump')

    with col2:
        has_photovoltaic_panels = st.selectbox("Has Photovoltaic Panels", ['Yes', 'No'], key='hasPhotovoltaicPanels')
        has_thermic_panels = st.selectbox("Has Thermic Panels", ['Yes', 'No'], key='hasThermicPanels')
        kitchen_surface = st.number_input("Kitchen Surface (sqm)", min_value=0.0, key='kitchenSurface')
        kitchen_type = st.selectbox("Kitchen Type", kitchen_type_list, key='kitchenType')
        land_surface = st.number_input("Land Surface (sqm)", min_value=0.0, key='landSurface')
        has_living_room = st.selectbox("Has Living Room", ['Yes', 'No'], key='hasLivingRoom')
        living_room_surface = st.number_input("Living Room Surface (sqm)", min_value=0.0, key='livingRoomSurface')
        has_balcony = st.selectbox("Has Balcony", ['Yes', 'No'], key='hasBalcony')
        has_garden = st.selectbox("Has Garden", ['Yes', 'No'], key='hasGarden')
        garden_surface = st.number_input("Garden Surface (sqm)", min_value=0.0, key='gardenSurface')
        garden_orientation = st.selectbox("Garden Orientation", garden_orientation_list, key='gardenOrientation')
        parking_count_indoor = st.number_input("Indoor Parking Count", min_value=0, step=1, key='parkingCountIndoor')
        parking_count_outdoor = st.number_input("Outdoor Parking Count", min_value=0, step=1, key='parkingCountOutdoor')
        has_air_conditioning = st.selectbox("Has Air Conditioning", ['Yes', 'No'], key='hasAirConditioning')
        has_armored_door = st.selectbox("Has Armored Door", ['Yes', 'No'], key='hasArmoredDoor')
        has_visiophone = st.selectbox("Has Visiophone", ['Yes', 'No'], key='hasVisiophone')
        has_office = st.selectbox("Has Office", ['Yes', 'No'], key='hasOffice')
        toilet_count = st.number_input("Toilet Count", min_value=0, step=1, key='toiletCount')
        has_swimming_pool = st.selectbox("Has Swimming Pool", ['Yes', 'No'], key='hasSwimmingPool')
        has_fireplace = st.selectbox("Has Fireplace", ['Yes', 'No'], key='hasFireplace')
        has_terrace = st.selectbox("Has Terrace", ['Yes', 'No'], key='hasTerrace')
        terrace_surface = st.number_input("Terrace Surface (sqm)", min_value=0.0, key='terraceSurface')
        terrace_orientation = st.selectbox("Terrace Orientation", terrace_orientation_list, key='terraceOrientation')
        accessible_disabled_people = st.selectbox("Accessible Disabled People", ['True', 'False'], key='accessibleDisabledPeople')
        epc_score = st.selectbox("EPC Score", epc_score_list, key='epcScore')

    # Create a button to trigger DataFrame creation + prediction
    if st.button("Predict"):
        # Create the DataFrame; keys match the training data's column names.
        data = {
            'type': property_type,
            'subtype': property_subtype,
            'bedroomCount': bedroom_count,
            'bathroomCount': bathroom_count,
            'province': province,
            'locality': locality,
            'postCode': post_code,
            'habitableSurface': habitable_surface,
            'roomCount': room_count,
            'monthlyCost': monthly_cost,
            'hasAttic': has_attic == 'Yes',
            'hasBasement': has_basement == 'Yes',
            'hasDressingRoom': has_dressing_room == 'Yes',
            'diningRoomSurface': dining_room_surface,
            'hasDiningRoom': has_dining_room == 'Yes',
            'buildingCondition': building_condition,
            'buildingConstructionYear': building_construction_year,
            'facadeCount': facade_count,
            'floorCount': floor_count,
            'streetFacadeWidth': street_facade_width,
            'hasLift': has_lift == 'Yes',
            'floodZoneType': flood_zone_type,
            'heatingType': heating_type,
            'hasHeatPump': has_heat_pump == 'Yes',
            'hasPhotovoltaicPanels': has_photovoltaic_panels == 'Yes',
            'hasThermicPanels': has_thermic_panels == 'Yes',
            'kitchenSurface': kitchen_surface,
            'kitchenType': kitchen_type,
            'landSurface': land_surface,
            'hasLivingRoom': has_living_room == 'Yes',
            'livingRoomSurface': living_room_surface,
            'hasBalcony': has_balcony == 'Yes',
            'hasGarden': has_garden == 'Yes',
            'gardenSurface': garden_surface,
            'gardenOrientation': garden_orientation,
            'parkingCountIndoor': parking_count_indoor,
            'parkingCountOutdoor': parking_count_outdoor,
            'hasAirConditioning': has_air_conditioning == 'Yes',
            'hasArmoredDoor': has_armored_door == 'Yes',
            'hasVisiophone': has_visiophone == 'Yes',
            'hasOffice': has_office == 'Yes',
            'toiletCount': toilet_count,
            'hasSwimmingPool': has_swimming_pool == 'Yes',
            'hasFireplace': has_fireplace == 'Yes',
            'hasTerrace': has_terrace == 'Yes',
            'terraceSurface': terrace_surface,
            'terraceOrientation': terrace_orientation,
            'accessibleDisabledPeople': accessible_disabled_people == 'True',
            'epcScore': epc_score
        }
        df = pd.DataFrame(data, index=[0])  # Important : wrap data in a list.
        # Load the saved artifacts once (cached) instead of re-reading the
        # three pickles from disk on every prediction click.
        pipeline, model, expected_columns = _load_artifacts()
        df_test = pipeline.transform(df)
        # Align the transformed frame with the training columns: add any
        # missing one-hot columns as zeros, then enforce the training order.
        for col in expected_columns:
            if col not in df_test.columns:
                df_test[col] = 0

        df_test = df_test[expected_columns]
        preds = model.predict(df_test)
        st.subheader("Price prediction")
        st.write("House price : ", preds)
        return df  # Return the dataframe.


if __name__ == "__main__":
    create_dataframe_from_user_input()
preprocessing/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+
2
+ from .pipeline import create_preprocessing_pipeline, preprocess_data
3
+
4
+ __all__ = ['create_preprocessing_pipeline', 'preprocess_data']
preprocessing/pipeline.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sklearn.pipeline import Pipeline
3
+ from sklearn.preprocessing import StandardScaler
4
+ from .pipeline_components import (
5
+ DataCleaner, FeatureEngineer, CategoricalEncoder,
6
+ KDEKNNFeatureCreator, CoordinateGetter, ColumnCleaner
7
+ )
8
+
9
def create_preprocessing_pipeline():
    """Build the (unfitted) preprocessing Pipeline for real-estate data.

    Steps run in order: raw cleaning, feature engineering, coordinate
    lookup, categorical one-hot encoding, KDE/KNN neighbourhood features,
    and a final column clean-up.
    """
    steps = [
        ('data_cleaner', DataCleaner()),
        ('feature_engineer', FeatureEngineer()),
        ('coordinate_getter', CoordinateGetter()),
        ('categorical_encoder', CategoricalEncoder()),
        ('kde_knn_creator', KDEKNNFeatureCreator()),
        ('column_cleaner', ColumnCleaner()),
        # ('standard_scaler', scaler)  # scaling step currently disabled
    ]
    return Pipeline(steps)
22
def preprocess_data(df_train, df_test=None, pipeline=None):
    """
    Preprocesses the training and/or test (single prediction) data using the pipeline.

    Args:
        df_train (pd.DataFrame or None): Training data. When None, no fitting
            is performed and `pipeline` must already be a fitted pipeline.
        df_test (pd.DataFrame, optional): Test data or single row for prediction.
        pipeline (Pipeline, optional): If provided, reuse this fitted pipeline
            instead of fitting a new one.

    Returns:
        tuple: (X_train, y_train, X_test, y_test, pipeline) — unavailable
        parts are None (e.g. X_test/y_test when df_test is None).
    """
    # Drop rows with NaN values in the critical columns.
    if df_train is not None:
        df_train = df_train.dropna(subset=['price', 'habitableSurface'])
    if df_test is not None:
        # A single prediction row has no 'price' column.
        if 'price' in df_test.columns:
            df_test = df_test.dropna(subset=['price'])

        df_test = df_test.dropna(subset=['habitableSurface'])

    if pipeline is None:
        # Fit a fresh pipeline on the training data and split off the target.
        pipeline = create_preprocessing_pipeline()
        X_train = pipeline.fit_transform(df_train)
        y_train = X_train['price']
        X_train = X_train.drop(columns=['price'])
    else:
        # Reusing an existing fitted pipeline: nothing to (re)train on.
        X_train = None
        y_train = None

    X_test = pipeline.transform(df_test) if df_test is not None else None
    y_test = None
    # Bug fix: the original unconditionally evaluated `'price' in X_test.columns`,
    # which raised AttributeError whenever df_test was None.
    if X_test is not None and 'price' in X_test.columns:
        y_test = X_test['price']
        X_test = X_test.drop(columns=['price'])
    return X_train, y_train, X_test, y_test, pipeline
57
+
58
def main():
    """Load the raw dataset, preprocess it and persist the processed splits."""
    # Load data
    df = pd.read_csv("./data/Kangaroo.csv")
    df = df.drop_duplicates(subset=["id"], keep="first")
    df = df[df['price'] < 1500000]
    df = df.dropna(subset=['price'])

    # Keep only rows with a recognised EPC score.
    epc_order = ['A++', 'A+', 'A', 'B', 'C', 'D', 'E', 'F', 'G']
    df = df[df['epcScore'].isin(epc_order)]
    # NOTE: after the isin() filter no NaN epcScore remains, so this fill is
    # effectively a no-op; kept for safety.
    df['epcScore'] = df['epcScore'].fillna(df['epcScore'].mode()[0])

    # Convert price to float
    df['price'] = df['price'].astype(float)

    # Split data
    from sklearn.model_selection import train_test_split
    df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

    # Preprocess data.
    # Bug fix: preprocess_data returns a 5-tuple; the original
    # `X_train, X_test = preprocess_data(...)` raised ValueError at runtime.
    X_train, y_train, X_test, y_test, pipeline = preprocess_data(df_train, df_test)

    # Save processed data
    pd.DataFrame(X_train).to_csv("./data/train_processed.csv", index=False)
    pd.DataFrame(X_test).to_csv("./data/test_processed.csv", index=False)


if __name__ == "__main__":
    main()
preprocessing/pipeline_components.py ADDED
@@ -0,0 +1,346 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from sklearn.base import BaseEstimator, TransformerMixin
4
+ from scipy.stats import gaussian_kde
5
+ from sklearn.preprocessing import StandardScaler
6
+ from sklearn.neighbors import NearestNeighbors
7
+ import pgeocode
8
+ from datetime import datetime
9
+ import pgeocode
10
+
11
class DataCleaner(BaseEstimator, TransformerMixin):
    """First pipeline step: drop unused columns, normalise boolean-ish
    columns to 0/1 ints, fill missing values with sentinels or simple
    statistics, derive `buildingAge`, and coerce column dtypes.
    """

    def __init__(self):
        # Target dtype per column (applied only when the column is present).
        self.col_types = {
            'id': 'int', 'type': 'str', 'subtype': 'str', 'bedroomCount': 'int',
            'bathroomCount': 'int', 'province': 'str', 'locality': 'str',
            'postCode': 'int', 'habitableSurface': 'float', 'hasBasement': 'int',
            'buildingCondition': 'str', 'buildingConstructionYear': 'int',
            'hasLift': 'int', 'floodZoneType': 'str', 'heatingType': 'str',
            'hasHeatPump': 'int', 'hasPhotovoltaicPanels': 'int',
            'hasThermicPanels': 'int', 'kitchenType': 'str', 'landSurface': 'float',
            'hasLivingRoom': 'int', 'livingRoomSurface': 'float', 'hasGarden': 'int',
            'gardenSurface': 'float', 'parkingCountIndoor': 'int',
            'parkingCountOutdoor': 'int', 'hasAirConditioning': 'int',
            'hasArmoredDoor': 'int', 'hasVisiophone': 'int', 'hasOffice': 'int',
            'toiletCount': 'int', 'hasSwimmingPool': 'int', 'hasFireplace': 'int',
            'hasTerrace': 'int', 'terraceSurface': 'float', 'terraceOrientation': 'str',
            'epcScore': 'str', 'facadeCount': 'int'
        }

    def fit(self, X, y=None):
        """Stateless transformer: nothing to learn."""
        return self

    def transform(self, X):
        """Return a cleaned copy of X (see class docstring for the steps)."""
        df = X.copy()

        # Drop unnecessary columns
        df = df.drop(columns=[col for col in ["Unnamed: 0", "url"] if col in df.columns])
        df = df.drop(columns=[col for col in ['monthlyCost', 'hasBalcony', 'accessibleDisabledPeople',
                                              'roomCount', 'diningRoomSurface', 'streetFacadeWidth',
                                              'gardenOrientation', 'kitchenSurface', 'floorCount',
                                              'hasDiningRoom', 'hasDressingRoom'] if col in df.columns])

        # Handle binary columns: map truthy/falsy spellings to 1/0, default 0.
        binary_cols = [
            'hasBasement', 'hasLift', 'hasHeatPump', 'hasPhotovoltaicPanels',
            'hasAirConditioning', 'hasArmoredDoor', 'hasVisiophone', 'hasOffice',
            'hasSwimmingPool', 'hasFireplace', 'parkingCountIndoor', 'parkingCountOutdoor',
            'hasAttic', 'hasThermicPanels'
        ]

        for col in binary_cols:
            df[col] = df[col].map({True: 1, False: 0, 'True': 1, 'False': 0, 'YES': 1, 'NO': 0}).fillna(0).astype(int)

        # Handle dependent columns: when the flag is missing, infer it from
        # whether the associated surface is present.
        df['hasLivingRoom'] = df['hasLivingRoom'].map({True: 1, False: 0, 'True': 1, 'False': 0, 'YES': 1, 'NO': 0})
        df.loc[df['hasLivingRoom'].isna(), 'hasLivingRoom'] = df['livingRoomSurface'].notnull().astype(int)

        df['hasGarden'] = df['hasGarden'].map({True: 1, False: 0, 'True': 1, 'False': 0, 'YES': 1, 'NO': 0})
        df.loc[df['hasGarden'].isna(), 'hasGarden'] = df['gardenSurface'].notnull().astype(int)

        df['hasTerrace'] = df['hasTerrace'].map({True: 1, False: 0, 'True': 1, 'False': 0, 'YES': 1, 'NO': 0})
        df.loc[df['hasTerrace'].isna(), 'hasTerrace'] = df['terraceSurface'].notnull().astype(int)

        # Set surfaces to 0 when feature is not present
        df.loc[df['hasLivingRoom'] == 0, 'livingRoomSurface'] = 0
        df.loc[df['hasGarden'] == 0, 'gardenSurface'] = 0
        df.loc[df['hasTerrace'] == 0, 'terraceSurface'] = 0
        df.loc[df['hasTerrace'] == 0, 'terraceOrientation'] = 0

        # Handle facade count (-1 = unknown sentinel)
        df['facadeCount'] = df['facadeCount'].fillna(-1)

        # Fill missing counts with the -1 "unknown" sentinel
        df['bedroomCount'] = df['bedroomCount'].fillna(-1).astype(float)
        df['bathroomCount'] = df['bathroomCount'].fillna(-1).astype(float)
        df['toiletCount'] = df['toiletCount'].fillna(-1).astype(float)

        # Drop habitable surface na
        df = df.dropna(subset=['habitableSurface'])

        # Fill other missing values
        df['buildingCondition'] = df['buildingCondition'].fillna('NOT_MENTIONED')
        df['floodZoneType'] = df['floodZoneType'].fillna('NON_FLOOD_ZONE')
        df['heatingType'] = df['heatingType'].fillna(df['heatingType'].mode()[0])
        df['hasThermicPanels'] = df['hasThermicPanels'].fillna(0.0)
        df['kitchenType'] = df['kitchenType'].fillna(df['kitchenType'].mode()[0])
        df['landSurface'] = df['landSurface'].fillna(df['landSurface'].median())
        df['livingRoomSurface'] = df['livingRoomSurface'].fillna(df['livingRoomSurface'].median())

        # Transform building construction year into age and fillna(-1)
        current_year = datetime.now().year
        df['buildingAge'] = current_year - df['buildingConstructionYear']
        df['buildingAge'] = df['buildingAge'].fillna(-1)

        # Handle terrace surface and orientation.
        # Bug fix: the original also computed an unused `median_terrace` and an
        # unused `mode_terrace`; the latter (`.mode()[0]` on the terrace rows)
        # raised IndexError for any input with no hasTerrace==1 rows — e.g. a
        # single prediction row without a terrace. Both lines are removed; the
        # fills below are unchanged.
        df.loc[(df['hasTerrace'] == 1) & (df['terraceSurface'].isna()), 'terraceSurface'] = -1
        df.loc[(df['hasTerrace'] != 1) & (df['terraceSurface'].isna()), 'terraceSurface'] = 0

        df.loc[(df['hasTerrace'] == 1) & (df['terraceOrientation'].isna()), 'terraceOrientation'] = 'NOT_MENTIONED'
        df.loc[(df['hasTerrace'] != 1) & (df['terraceOrientation'].isna()), 'terraceOrientation'] = 'NO_TERRACE'

        # Convert data types (ints get a 0-fill first so astype cannot fail on NaN)
        for col, dtype in self.col_types.items():
            if col in df.columns:
                if pd.api.types.is_integer_dtype(dtype):
                    df[col] = df[col].fillna(0).astype(dtype)
                else:
                    df[col] = df[col].astype(dtype)

        return df
114
+
115
class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Derive model features from the cleaned listing data.

    Adds `isHouse`, a Belgian `region` derived from the postal code, a
    `pricePerM2` helper (only when a `price` column is present, i.e. training
    data) and converts `epcScore`, `buildingCondition` and `floodZoneType`
    to numeric codes.
    """

    def __init__(self):
        # Region-specific thresholds used to turn an EPC letter into a numeric
        # score; unknown (region, letter) pairs are sentinel-filled with -1 later.
        self.epc_mapping = {
            'Flanders': {
                'A++': 0, 'A+': 0, 'A': 100, 'B': 200, 'C': 300,
                'D': 400, 'E': 500, 'F': 600, 'G': 700
            },
            'Wallonia': {
                'A++': 0, 'A+': 50, 'A': 90, 'B': 170, 'C': 250,
                'D': 330, 'E': 420, 'F': 510, 'G': 600
            },
            'Bruxelles': {
                'A++': 0, 'A+': 0, 'A': 45, 'B': 95, 'C': 145,
                'D': 210, 'E': 275, 'F': 345, 'G': 450
            }
        }

    def fit(self, X, y=None):
        """Stateless transformer: nothing to learn."""
        return self

    def transform(self, X):
        """Return a copy of X with engineered features added.

        Price-dependent steps run only when a `price` column exists, so the
        same transformer works for training frames and for single prediction
        rows that carry no price.
        """
        df = X.copy()
        if 'price' in df.columns:
            # Filter out extremely high prices (> 1.5M €); count currently unused.
            high_price_count = (df['price'] > 1500000).sum()
            df = df[df['price'] <= 1500000]
            # Check for problematic (non-positive) values
            zero_price = (df['price'] <= 0).sum()
            zero_surface = (df['habitableSurface'] <= 0).sum()

            # Handle problematic values by treating them as missing
            if zero_price > 0:
                df.loc[df['price'] <= 0, 'price'] = np.nan

            if zero_surface > 0:
                df.loc[df['habitableSurface'] <= 0, 'habitableSurface'] = np.nan

        # Add isHouse feature: 1 for HOUSE, 0 for anything else (APARTMENT, ...)
        df['isHouse'] = (df['type'] == 'HOUSE').astype(int)

        # Add region information first (EPC mapping below depends on it)
        def get_region(zip_code):
            # NOTE(review): any code outside the Brussels/Wallonia ranges falls
            # through to "Flanders" — confirm this is intended for all inputs.
            if 1000 <= zip_code <= 1299:
                return "Bruxelles"
            elif 1300 <= zip_code <= 1499 or 4000 <= zip_code <= 7999:
                return "Wallonia"
            else:
                return "Flanders"

        df['region'] = df['postCode'].apply(get_region)
        if 'price' in df.columns:
            # Now add price per m2 (training helper; dropped again in ColumnCleaner)
            df['pricePerM2'] = df['price'] / df['habitableSurface']
            # Handle inf values (division by a zero surface)
            df['pricePerM2'] = df['pricePerM2'].replace([np.inf, -np.inf], np.nan)
            # Sentinel-fill missing ratios with -1 (NOT a per-region median,
            # despite what the original comment claimed)
            df['pricePerM2'] = df['pricePerM2'].fillna(-1)

        # Convert EPC letter to a numeric score via the region table; unknown -> -1
        df['epcScore'] = df.apply(lambda row: self.epc_mapping.get(row['region'], {}).get(row['epcScore'], None), axis=1)
        df['epcScore'] = df['epcScore'].fillna(-1)

        # Convert building condition to an ordinal rating; unmapped -> -1.
        # NOTE(review): upstream values look like 'AS_NEW'/'TO_RENOVATE'; after
        # .lower() they become 'as_new'/'to_renovate' and do NOT match these
        # space-separated keys, so they map to -1. Confirm whether a
        # '_' -> ' ' replacement was intended — do not change it here, since the
        # saved model was fitted with this exact encoding.
        condition_rating = {
            'to restore': 0, 'to renovate': 1, 'to be done up': 2,
            'good': 3, 'just renovated': 4, 'as new': 5
        }
        df['buildingCondition'] = (df['buildingCondition'].astype(str).str.strip().str.lower()
                                   .map(condition_rating).fillna(-1).astype(int))

        # Collapse flood-zone categories into a binary "in any flood zone" flag
        df['floodZoneType'] = (df['floodZoneType'] != 'NON_FLOOD_ZONE').astype(int)

        return df
189
+
190
class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode the remaining string-valued feature columns."""

    def __init__(self):
        # Columns that get dummy-encoded when present in the frame.
        self.categorical_columns = ['province', 'heatingType', 'kitchenType', 'subtype', 'terraceOrientation']

    def fit(self, X, y=None):
        """Stateless transformer: nothing to learn."""
        return self

    def transform(self, X):
        """Return a copy of X with each known categorical column one-hot encoded.

        Columns absent from X are silently skipped, so the encoder works on
        both full training frames and single prediction rows.
        """
        encoded = X.copy()
        for column in self.categorical_columns:
            if column not in encoded.columns:
                continue
            encoded = pd.get_dummies(encoded, columns=[column], prefix=column, dtype=int)
        return encoded
206
+
207
class CoordinateGetter(BaseEstimator, TransformerMixin):
    """Attach latitude/longitude to each row.

    Frames that carry an `id` column (training data) are joined against the
    Giraffe dataset on property id; id-less frames (single predictions from
    the app) are geocoded from their postal code via pgeocode.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Stateless transformer: nothing to learn."""
        return self

    def transform(self, X):
        """Return a copy of X with `latitude`/`longitude` columns added.

        NOTE(review): the training branch reads 'data/Giraffe.csv' relative to
        the current working directory and DROPS rows without coordinates —
        confirm the file is available wherever the pipeline is fitted.
        """
        df = X.copy()
        if 'id' in df.columns:
            # Training path: merge coordinates by property id.
            df_giraffe = pd.read_csv('data/Giraffe.csv')
            df_giraffe = df_giraffe[['propertyId', 'latitude', 'longitude']]

            # Duplicate propertyId as 'id' and move it to the front,
            # then drop the original propertyId column.
            df_giraffe['id'] = df_giraffe['propertyId']
            cols = df_giraffe.columns.tolist()
            cols.remove('id')
            new_order = ['id'] + cols
            df_giraffe = df_giraffe[new_order]

            df_giraffe = df_giraffe.drop(columns='propertyId')

            df = df.merge(df_giraffe, on='id', how='left')
            df = df.dropna(subset=['latitude', 'longitude'])

        else :
            # Prediction path: approximate coordinates from the postal code
            # using pgeocode's Belgian database (may yield NaN for unknown codes).
            nomi = pgeocode.Nominatim('be')

            df['postCode'] = df['postCode'].astype(str)
            unique_postcodes = df["postCode"].astype(str).unique()

            geo_df = nomi.query_postal_code(list(unique_postcodes))
            geo_df = geo_df[['postal_code', 'latitude', 'longitude']]
            geo_df = geo_df.rename(columns={'postal_code': 'postCode'})
            geo_df['postCode'] = geo_df['postCode'].astype(str)
            df = df.merge(geo_df, on='postCode', how='left')

        return df
244
+
245
class KDEKNNFeatureCreator(BaseEstimator, TransformerMixin):
    """Create a local price-density feature from the k nearest training rows.

    For each row, finds the `k` geographically nearest training listings
    (on standard-scaled lat/long), fits a 1-D Gaussian KDE over their
    price-per-m² values and evaluates it at the neighbours' mean — a proxy
    for how concentrated the local price level is.
    """

    def __init__(self, k=20):
        self.k = k                                   # number of neighbours
        self.scaler = StandardScaler()               # scales lat/long before KNN
        self.knn = NearestNeighbors(n_neighbors=k)   # fitted on training coords
        self.train_prices = None                     # pricePerM2 of training rows

    def fit(self, X, y=None):
        """Fit the scaler/KNN on training coordinates and cache training prices.

        Requires `latitude`, `longitude` and `pricePerM2` columns. When the
        coordinates are missing, fitting is skipped with a warning — a later
        transform would then fail or emit NaN, see transform().
        """
        if 'latitude' not in X.columns or 'longitude' not in X.columns:
            print("Warning: Missing latitude/longitude columns")
            return self

        coords_scaled = self.scaler.fit_transform(X[['latitude', 'longitude']])
        self.knn.fit(coords_scaled)

        # Store training prices so transform() can resolve neighbour indices
        # (which refer to training rows) even for unseen test/prediction rows.
        self.train_prices = X['pricePerM2'].values

        return self

    def transform(self, X):
        """Return a copy of X with a `kde_price_per_m2_knn` column added and
        the `latitude`/`longitude` helper columns dropped.

        Rows whose KDE cannot be computed (fewer than 2 valid neighbour
        prices, or a non-finite KDE value) get the -1 sentinel.
        """
        df = X.copy()

        if 'latitude' not in df.columns or 'longitude' not in df.columns:
            print("Warning: Missing latitude/longitude columns")
            df['kde_price_per_m2_knn'] = np.nan
            return df

        # Scale with the training scaler and look up k nearest training rows.
        coords_scaled = self.scaler.transform(df[['latitude', 'longitude']])
        distances, indices = self.knn.kneighbors(coords_scaled)

        kde_scores = []

        invalid_kde_count = 0

        for i in range(len(df)):
            neighbor_idxs = indices[i]
            # Use stored training prices for neighbors; drop NaN prices.
            neighbor_prices = self.train_prices[neighbor_idxs]
            neighbor_prices = neighbor_prices[~np.isnan(neighbor_prices)]

            # gaussian_kde needs at least 2 points for a non-degenerate fit.
            if len(neighbor_prices) < 2:
                kde_scores.append(np.nan)
                invalid_kde_count += 1
                continue

            try:
                kde = gaussian_kde(neighbor_prices)
                # Evaluate the density at the neighbours' mean price.
                value_to_evaluate = neighbor_prices.mean()
                kde_score = kde(value_to_evaluate)[0]

                if np.isfinite(kde_score):
                    kde_scores.append(kde_score)
                else:
                    kde_scores.append(np.nan)
                    invalid_kde_count += 1
            except Exception as e:
                # gaussian_kde can raise e.g. on singular covariance
                # (all neighbour prices identical); fall back to NaN.
                print(f"Error in KDE calculation for row {i}: {str(e)}")
                kde_scores.append(np.nan)
                invalid_kde_count += 1

        df['kde_price_per_m2_knn'] = kde_scores

        # Sentinel-fill failed rows with -1 (NOT a per-region median, despite
        # what the original comment claimed).
        df['kde_price_per_m2_knn'] = df['kde_price_per_m2_knn'].fillna(-1)

        return df.drop(columns=['latitude', 'longitude'], errors='ignore')
312
+
313
class ColumnCleaner(BaseEstimator, TransformerMixin):
    """Final pipeline step: drop helper columns, force remaining columns to
    numeric codes and move `price` (when present) to the last position so the
    frame splits cleanly into X and y downstream.
    """

    def __init__(self):
        # Columns no longer needed after feature engineering.
        # Bug fix: 'buildingConstructionYear' was listed twice in the original.
        self.columns_to_drop = [
            'id', 'postCode', 'buildingConstructionYear', 'type', 'locality',
            'region', 'latitude', 'longitude'
        ]

    def fit(self, X, y=None):
        """Stateless transformer: nothing to learn."""
        return self

    def transform(self, X):
        """Return a copy of X with helper columns dropped, leftover
        object/category columns integer-coded, and `price` last.
        """
        df = X.copy()

        # Drop columns that are no longer needed (only those present).
        columns_to_drop = [col for col in self.columns_to_drop if col in df.columns]
        df = df.drop(columns=columns_to_drop)
        if 'pricePerM2' in df.columns:
            df = df.drop(columns=['pricePerM2'])

        # Convert any remaining categorical columns to integer codes
        # (the target column 'price' is never encoded).
        non_numeric_cols = df.select_dtypes(include=['object', 'category']).columns
        for col in non_numeric_cols:
            if col != 'price':
                df[col] = pd.Categorical(df[col]).codes

        # Reorganize columns to put price at the end
        cols = df.columns.tolist()
        if 'price' in cols:
            cols.remove('price')
            cols.append('price')
            df = df[cols]

        return df
requirements.txt ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ altair==5.5.0
2
+ attrs==25.3.0
3
+ blinker==1.9.0
4
+ cachetools==5.5.2
5
+ certifi==2025.4.26
6
+ charset-normalizer==3.4.2
7
+ click==8.2.0
8
+ gitdb==4.0.12
9
+ GitPython==3.1.44
10
+ idna==3.10
11
+ Jinja2==3.1.6
12
+ joblib==1.5.0
13
+ jsonschema==4.23.0
14
+ jsonschema-specifications==2025.4.1
15
+ MarkupSafe==3.0.2
16
+ narwhals==1.39.0
17
+ numpy==2.2.5
18
+ packaging==24.2
19
+ pandas==2.2.3
20
+ pgeocode==0.5.0
21
+ pillow==11.2.1
22
+ protobuf==6.31.0
23
+ pyarrow==20.0.0
24
+ pydeck==0.9.1
25
+ python-dateutil==2.9.0.post0
26
+ pytz==2025.2
27
+ referencing==0.36.2
28
+ requests==2.32.3
29
+ rpds-py==0.24.0
30
+ scikit-learn==1.6.1
31
+ scipy==1.15.3
32
+ six==1.17.0
33
+ smmap==5.0.2
34
+ streamlit==1.45.1
35
+ tenacity==9.1.2
36
+ threadpoolctl==3.6.0
37
+ toml==0.10.2
38
+ tornado==6.4.2
39
+ typing_extensions==4.13.2
40
+ tzdata==2025.2
41
+ urllib3==2.4.0
42
+ xgboost==3.0.1
saved/columns.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:236986f05a9f615d73af85b33abfb85fa82e36fc50391c62cf8959fbe310b47e
3
+ size 1911
saved/model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:294261411aa75310e5e9ea5427c35cb4d97f23e0927a325d09490ab12532acd9
3
+ size 4703400
saved/pipeline.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8794879223249a6241dbd1ae60c0c11564ce200f595c77a41a91bae53217f2b
3
+ size 2256890