Spaces:

Rhodham96
/

RealEstate_pricePrediction_Belgium

Sleeping

App Files Files Community

Rhodham96 committed on May 15

Commit

5089ff4

1 Parent(s): 1e47a48

first

Browse files

Files changed (11) hide show

.gitignore +11 -0
Dockerfile +19 -0
README.md +4 -3
app.py +168 -0
preprocessing/__init__.py +4 -0
preprocessing/pipeline.py +85 -0
preprocessing/pipeline_components.py +346 -0
requirements.txt +42 -0
saved/columns.pkl +3 -0
saved/model.pkl +3 -0
saved/pipeline.pkl +3 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,11 @@

+# Ignore Python bytecode
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+# Ignore virtualenv folder (à adapter selon ton env)
+env/
+venv/
+.machine_learning/

Dockerfile ADDED Viewed

	@@ -0,0 +1,19 @@

+# Utiliser une image de base Python
+FROM python:3.10-slim-buster
+# Définir le répertoire de travail dans le conteneur
+WORKDIR /app
+# Copier les fichiers de requirements
+COPY requirements.txt .
+# Installer les dépendances
+RUN pip install --no-cache-dir -r requirements.txt
+# Copier le code de l'application
+COPY . /app
+# Exposer le port sur lequel Streamlit s'exécute
+EXPOSE 7860
+# Définir la commande à exécuter pour lancer l'application
+CMD ["streamlit", "run", "app.py", "--server.port=7860", "--server.enableCORS=false"]

README.md CHANGED Viewed

@@ -1,10 +1,11 @@
 ---
 title: RealEstate PricePrediction Belgium
-emoji: 🏆
-colorFrom: indigo
-colorTo: indigo
 sdk: docker
 pinned: false
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 title: RealEstate PricePrediction Belgium
+emoji: 🌍
+colorFrom: gray
+colorTo: red
 sdk: docker
 pinned: false
+license: mit
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,168 @@

+import streamlit as st
+import pandas as pd
+import joblib
@st.cache_resource
def _load_artifacts():
    """Load the fitted pipeline, the model and the expected column list.

    Cached with ``st.cache_resource`` so the pickles are read from disk once
    per server process instead of on every click of the "Predict" button.

    Returns:
        tuple: (pipeline, model, expected_columns)
    """
    # NOTE: joblib files are pickles and can execute code when loaded; only
    # ship artifacts that were produced by this project.
    pipeline = joblib.load('saved/pipeline.pkl')
    model = joblib.load('saved/model.pkl')
    expected_columns = joblib.load('saved/columns.pkl')
    return pipeline, model, expected_columns


def create_dataframe_from_user_input():
    """
    Render the property form and, when "Predict" is clicked, show a price.

    Every widget key doubles as the column name of the one-row DataFrame that
    is passed to the saved preprocessing pipeline, so keys must stay in sync
    with the columns the pipeline expects.

    Returns:
        pd.DataFrame | None: the raw one-row input frame when the button was
        clicked during this run, otherwise None.
    """
    # Local import: only the construction-year upper bound needs it.
    from datetime import datetime

    # Allowed values for the dropdowns
    type_list = ['HOUSE', 'APARTMENT']
    subtype_list = ['HOUSE', 'APARTMENT', 'VILLA', 'APARTMENT_BLOCK', 'APARTMENT_GROUP',
                    'MIXED_USE_BUILDING', 'GROUND_FLOOR', 'DUPLEX', 'HOUSE_GROUP',
                    'FLAT_STUDIO', 'PENTHOUSE', 'EXCEPTIONAL_PROPERTY', 'MANSION',
                    'TOWN_HOUSE', 'SERVICE_FLAT', 'BUNGALOW', 'KOT', 'COUNTRY_COTTAGE',
                    'FARMHOUSE', 'LOFT', 'CHALET', 'TRIPLEX', 'CASTLE', 'OTHER_PROPERTY',
                    'MANOR_HOUSE', 'PAVILION']
    province_list = ['West Flanders', 'Antwerp', 'East Flanders', 'Brussels', 'Hainaut',
                     'Liège', 'Flemish Brabant', 'Limburg', 'Walloon Brabant', 'Namur',
                     'Luxembourg']
    building_condition_list = ['GOOD', 'AS_NEW', 'TO_RENOVATE', 'TO_BE_DONE_UP',
                               'JUST_RENOVATED', 'TO_RESTORE']
    flood_zone_type_list = ['NON_FLOOD_ZONE', 'POSSIBLE_FLOOD_ZONE', 'RECOGNIZED_FLOOD_ZONE',
                            'RECOGNIZED_N_CIRCUMSCRIBED_FLOOD_ZONE', 'CIRCUMSCRIBED_WATERSIDE_ZONE',
                            'CIRCUMSCRIBED_FLOOD_ZONE', 'POSSIBLE_N_CIRCUMSCRIBED_FLOOD_ZONE',
                            'POSSIBLE_N_CIRCUMSCRIBED_WATERSIDE_ZONE', 'RECOGNIZED_N_CIRCUMSCRIBED_WATERSIDE_FLOOD_ZONE']
    heating_type_list = ['GAS', 'FUELOIL', 'ELECTRIC', 'PELLET', 'WOOD', 'SOLAR', 'CARBON']
    kitchen_type_list = ['INSTALLED', 'HYPER_EQUIPPED', 'SEMI_EQUIPPED', 'NOT_INSTALLED',
                         'USA_HYPER_EQUIPPED', 'USA_INSTALLED', 'USA_SEMI_EQUIPPED',
                         'USA_UNINSTALLED']
    orientation_list = ['SOUTH', 'SOUTH_WEST', 'SOUTH_EAST', 'WEST', 'EAST',
                        'NORTH_WEST', 'NORTH_EAST', 'NORTH']
    epc_score_list = ['B', 'C', 'D', 'A', 'F', 'E', 'G', 'A+', 'A++']

    # Filled in widget order; dict insertion order becomes the column order.
    data = {}

    def select(label, options, key):
        data[key] = st.selectbox(label, options, key=key)

    def yes_no(label, key):
        # Stored as a real boolean, which is what the pipeline maps to 1/0.
        data[key] = st.selectbox(label, ['Yes', 'No'], key=key) == 'Yes'

    def count(label, key):
        data[key] = st.number_input(label, min_value=0, step=1, key=key)

    def amount(label, key):
        data[key] = st.number_input(label, min_value=0.0, key=key)

    st.header("Enter House Information")
    col1, col2 = st.columns(2)  # two-column layout
    with col1:
        select("Property Type", type_list, 'type')
        select("Subtype", subtype_list, 'subtype')
        count("Bedroom Count", 'bedroomCount')
        count("Bathroom Count", 'bathroomCount')
        select("Province", province_list, 'province')
        data['locality'] = st.text_input("Locality", key='locality')
        data['postCode'] = st.number_input("Post Code", min_value=1000, max_value=9999,
                                           step=1, key='postCode')
        amount("Habitable Surface (sqm)", 'habitableSurface')
        count("Room Count", 'roomCount')
        amount("Monthly Cost (€)", 'monthlyCost')
        yes_no("Has Attic", 'hasAttic')
        yes_no("Has Basement", 'hasBasement')
        yes_no("Has Dressing Room", 'hasDressingRoom')
        amount("Dining Room Surface (sqm)", 'diningRoomSurface')
        yes_no("Has Dining Room", 'hasDiningRoom')
        select("Building Condition", building_condition_list, 'buildingCondition')
        # Upper bound follows the calendar instead of a hard-coded year.
        data['buildingConstructionYear'] = st.number_input(
            "Building Construction Year", min_value=1000,
            max_value=datetime.now().year, step=1, key='buildingConstructionYear')
        count("Facade Count", 'facadeCount')
        count("Floor Count", 'floorCount')
        amount("Street Facade Width (m)", 'streetFacadeWidth')
        yes_no("Has Lift", 'hasLift')
        select("Flood Zone Type", flood_zone_type_list, 'floodZoneType')
        select("Heating Type", heating_type_list, 'heatingType')
        yes_no("Has Heat Pump", 'hasHeatPump')
    with col2:
        yes_no("Has Photovoltaic Panels", 'hasPhotovoltaicPanels')
        yes_no("Has Thermic Panels", 'hasThermicPanels')
        amount("Kitchen Surface (sqm)", 'kitchenSurface')
        select("Kitchen Type", kitchen_type_list, 'kitchenType')
        amount("Land Surface (sqm)", 'landSurface')
        yes_no("Has Living Room", 'hasLivingRoom')
        amount("Living Room Surface (sqm)", 'livingRoomSurface')
        yes_no("Has Balcony", 'hasBalcony')
        yes_no("Has Garden", 'hasGarden')
        amount("Garden Surface (sqm)", 'gardenSurface')
        select("Garden Orientation", orientation_list, 'gardenOrientation')
        count("Indoor Parking Count", 'parkingCountIndoor')
        count("Outdoor Parking Count", 'parkingCountOutdoor')
        yes_no("Has Air Conditioning", 'hasAirConditioning')
        yes_no("Has Armored Door", 'hasArmoredDoor')
        yes_no("Has Visiophone", 'hasVisiophone')
        yes_no("Has Office", 'hasOffice')
        count("Toilet Count", 'toiletCount')
        yes_no("Has Swimming Pool", 'hasSwimmingPool')
        yes_no("Has Fireplace", 'hasFireplace')
        yes_no("Has Terrace", 'hasTerrace')
        amount("Terrace Surface (sqm)", 'terraceSurface')
        select("Terrace Orientation", orientation_list, 'terraceOrientation')
        data['accessibleDisabledPeople'] = st.selectbox(
            "Accessible Disabled People", ['True', 'False'],
            key='accessibleDisabledPeople') == 'True'
        select("EPC Score", epc_score_list, 'epcScore')

    # Nothing to predict until the user asks for it.
    if not st.button("Predict"):
        return None

    # Scalars need an explicit index to form a one-row frame.
    df = pd.DataFrame(data, index=[0])

    pipeline, model, expected_columns = _load_artifacts()
    features = pipeline.transform(df)
    # Align with the training schema in one step: columns absent for this row
    # are added as 0, unexpected ones are dropped, and the order is fixed.
    features = features.reindex(columns=expected_columns, fill_value=0)
    preds = model.predict(features)

    st.subheader("Price prediction")
    # Show the single predicted value rather than the raw prediction array.
    st.write("House price : ", f"{float(preds[0]):,.0f} €")
    return df


if __name__ == "__main__":
    create_dataframe_from_user_input()

preprocessing/__init__.py ADDED Viewed

	@@ -0,0 +1,4 @@


1	+
2	+ from .pipeline import create_preprocessing_pipeline, preprocess_data
3	+
4	+ __all__ = ['create_preprocessing_pipeline', 'preprocess_data']

preprocessing/pipeline.py ADDED Viewed

	@@ -0,0 +1,85 @@

+import pandas as pd
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import StandardScaler
+from .pipeline_components import (
+    DataCleaner, FeatureEngineer, CategoricalEncoder,
+    KDEKNNFeatureCreator, CoordinateGetter, ColumnCleaner
+)
def create_preprocessing_pipeline():
    """
    Build the unfitted preprocessing pipeline for real estate data.

    The steps run in the order listed below; no scaling step is included.
    """
    steps = [
        ('data_cleaner', DataCleaner()),
        ('feature_engineer', FeatureEngineer()),
        ('coordinate_getter', CoordinateGetter()),
        ('categorical_encoder', CategoricalEncoder()),
        ('kde_knn_creator', KDEKNNFeatureCreator()),
        ('column_cleaner', ColumnCleaner()),
    ]
    return Pipeline(steps)
def preprocess_data(df_train, df_test=None, pipeline=None):
    """
    Preprocess training and/or test (single prediction) data with the pipeline.

    Args:
        df_train (pd.DataFrame | None): Training data. Required when
            ``pipeline`` is None, because a new pipeline is fitted on it.
        df_test (pd.DataFrame, optional): Test data or a single row to predict.
        pipeline (Pipeline, optional): Already fitted pipeline. When given,
            nothing is fitted and the training outputs are None.

    Returns:
        tuple: (X_train, y_train, X_test, y_test, pipeline). The train entries
        are None when a fitted pipeline was supplied; the test entries are
        None when ``df_test`` is None; ``y_test`` is also None when the
        transformed test data has no 'price' column.

    Raises:
        ValueError: if neither a pipeline nor training data is provided.
    """
    if pipeline is None and df_train is None:
        raise ValueError("df_train is required when no fitted pipeline is provided")

    # Drop rows with NaN values in critical columns
    if df_train is not None:
        df_train = df_train.dropna(subset=['price', 'habitableSurface'])
    if df_test is not None:
        required = ['habitableSurface']
        # Prediction inputs have no price column, so only require it if present.
        if 'price' in df_test.columns:
            required.insert(0, 'price')
        df_test = df_test.dropna(subset=required)

    X_train = y_train = None
    if pipeline is None:
        pipeline = create_preprocessing_pipeline()
        X_train = pipeline.fit_transform(df_train)
        y_train = X_train['price']
        X_train = X_train.drop(columns=['price'])

    X_test = y_test = None
    if df_test is not None:
        X_test = pipeline.transform(df_test)
        if 'price' in X_test.columns:
            y_test = X_test['price']
            X_test = X_test.drop(columns=['price'])

    return X_train, y_train, X_test, y_test, pipeline
def main():
    """Clean the raw listings, split them, preprocess and save the features.

    Reads ``./data/Kangaroo.csv`` and writes the transformed feature matrices
    to ``./data/train_processed.csv`` and ``./data/test_processed.csv``.
    The price targets returned by ``preprocess_data`` are not written.
    """
    from sklearn.model_selection import train_test_split

    # Load data
    df = pd.read_csv("./data/Kangaroo.csv")
    df = df.drop_duplicates(subset=["id"], keep="first")
    # The comparison is False for NaN, so rows without a price go too.
    df = df[df['price'] < 1500000]

    # Keep only rows with a recognised EPC score; isin() is False for NaN, so
    # no missing scores remain afterwards. Copy before assigning below.
    epc_order = ['A++', 'A+', 'A', 'B', 'C', 'D', 'E', 'F', 'G']
    df = df[df['epcScore'].isin(epc_order)].copy()

    # Convert price to float
    df['price'] = df['price'].astype(float)

    # Split data
    df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

    # preprocess_data returns five values; only the feature matrices are saved.
    X_train, _y_train, X_test, _y_test, _pipeline = preprocess_data(df_train, df_test)

    # Save processed data
    pd.DataFrame(X_train).to_csv("./data/train_processed.csv", index=False)
    pd.DataFrame(X_test).to_csv("./data/test_processed.csv", index=False)


if __name__ == "__main__":
    main()

preprocessing/pipeline_components.py ADDED Viewed

	@@ -0,0 +1,346 @@

+import pandas as pd
+import numpy as np
+from sklearn.base import BaseEstimator, TransformerMixin
+from scipy.stats import gaussian_kde
+from sklearn.preprocessing import StandardScaler
+from sklearn.neighbors import NearestNeighbors
+import pgeocode
+from datetime import datetime
+import pgeocode
class DataCleaner(BaseEstimator, TransformerMixin):
    """Stateless cleaning step: drops unused columns, normalises flags,
    fills missing values, derives ``buildingAge`` and casts column types.

    ``fit`` learns nothing; every fill value (mode, median) is computed from
    the frame passed to ``transform``.
    """

    # Raw encodings of yes/no flags mapped to 1/0. Anything else becomes NaN.
    _FLAG_MAP = {True: 1, False: 0, 'True': 1, 'False': 0, 'YES': 1, 'NO': 0}

    def __init__(self):
        # Target dtype per column, applied at the end of transform().
        self.col_types = {
            'id': 'int', 'type': 'str', 'subtype': 'str', 'bedroomCount': 'int',
            'bathroomCount': 'int', 'province': 'str', 'locality': 'str',
            'postCode': 'int', 'habitableSurface': 'float', 'hasBasement': 'int',
            'buildingCondition': 'str', 'buildingConstructionYear': 'int',
            'hasLift': 'int', 'floodZoneType': 'str', 'heatingType': 'str',
            'hasHeatPump': 'int', 'hasPhotovoltaicPanels': 'int',
            'hasThermicPanels': 'int', 'kitchenType': 'str', 'landSurface': 'float',
            'hasLivingRoom': 'int', 'livingRoomSurface': 'float', 'hasGarden': 'int',
            'gardenSurface': 'float', 'parkingCountIndoor': 'int',
            'parkingCountOutdoor': 'int', 'hasAirConditioning': 'int',
            'hasArmoredDoor': 'int', 'hasVisiophone': 'int', 'hasOffice': 'int',
            'toiletCount': 'int', 'hasSwimmingPool': 'int', 'hasFireplace': 'int',
            'hasTerrace': 'int', 'terraceSurface': 'float', 'terraceOrientation': 'str',
            'epcScore': 'str', 'facadeCount': 'int'
        }

    def fit(self, X, y=None):
        """No-op; returns self."""
        return self

    @staticmethod
    def _fill_with_mode(series):
        """Fill NaN with the most frequent value; unchanged if there is none."""
        modes = series.mode()
        if modes.empty:
            # All values missing: nothing sensible to fill with.
            return series
        return series.fillna(modes.iloc[0])

    def transform(self, X):
        """Return a cleaned copy of ``X``.

        Rows without ``habitableSurface`` are dropped, so the result can have
        fewer rows than the input.
        """
        df = X.copy()

        # Drop unnecessary columns
        unused = ["Unnamed: 0", "url", 'monthlyCost', 'hasBalcony', 'accessibleDisabledPeople',
                  'roomCount', 'diningRoomSurface', 'streetFacadeWidth',
                  'gardenOrientation', 'kitchenSurface', 'floorCount',
                  'hasDiningRoom', 'hasDressingRoom']
        df = df.drop(columns=[col for col in unused if col in df.columns])

        # Handle binary columns
        # NOTE(review): the two parking counts are in this list, so a count of
        # 2 or more is not in the map and ends up as 0. Kept as-is because any
        # already-fitted model saw this encoding; confirm it is intended.
        binary_cols = [
            'hasBasement', 'hasLift', 'hasHeatPump', 'hasPhotovoltaicPanels',
            'hasAirConditioning', 'hasArmoredDoor', 'hasVisiophone', 'hasOffice',
            'hasSwimmingPool', 'hasFireplace', 'parkingCountIndoor', 'parkingCountOutdoor',
            'hasAttic', 'hasThermicPanels'
        ]
        for col in binary_cols:
            df[col] = df[col].map(self._FLAG_MAP).fillna(0).astype(int)

        # Flags that have a matching surface column: a missing flag is inferred
        # from whether a surface was given, and the surface is zeroed when the
        # feature is absent.
        for flag, surface in (('hasLivingRoom', 'livingRoomSurface'),
                              ('hasGarden', 'gardenSurface'),
                              ('hasTerrace', 'terraceSurface')):
            df[flag] = df[flag].map(self._FLAG_MAP)
            df.loc[df[flag].isna(), flag] = df[surface].notnull().astype(int)
            df.loc[df[flag] == 0, surface] = 0
        # Listings without a terrace get orientation 0, which the str cast
        # below turns into the category '0'.
        df.loc[df['hasTerrace'] == 0, 'terraceOrientation'] = 0

        # Counts: -1 marks "not provided"
        df['facadeCount'] = df['facadeCount'].fillna(-1)
        for col in ('bedroomCount', 'bathroomCount', 'toiletCount'):
            df[col] = df[col].fillna(-1).astype(float)

        # Drop rows without a habitable surface
        df = df.dropna(subset=['habitableSurface'])

        # Fill other missing values
        df['buildingCondition'] = df['buildingCondition'].fillna('NOT_MENTIONED')
        df['floodZoneType'] = df['floodZoneType'].fillna('NON_FLOOD_ZONE')
        df['heatingType'] = self._fill_with_mode(df['heatingType'])
        df['hasThermicPanels'] = df['hasThermicPanels'].fillna(0.0)
        df['kitchenType'] = self._fill_with_mode(df['kitchenType'])
        df['landSurface'] = df['landSurface'].fillna(df['landSurface'].median())
        df['livingRoomSurface'] = df['livingRoomSurface'].fillna(df['livingRoomSurface'].median())

        # Building age relative to the year transform() runs in; -1 if unknown.
        current_year = datetime.now().year
        df['buildingAge'] = current_year - df['buildingConstructionYear']
        df['buildingAge'] = df['buildingAge'].fillna(-1)

        # Terrace surface and orientation placeholders.
        # The previously computed median/mode of terrace values were never
        # used, and taking the mode raised when no row had a terrace (e.g. a
        # single listing without one), so they are gone.
        has_terrace = df['hasTerrace'] == 1
        df.loc[has_terrace & df['terraceSurface'].isna(), 'terraceSurface'] = -1
        df.loc[~has_terrace & df['terraceSurface'].isna(), 'terraceSurface'] = 0
        df.loc[has_terrace & df['terraceOrientation'].isna(), 'terraceOrientation'] = 'NOT_MENTIONED'
        df.loc[~has_terrace & df['terraceOrientation'].isna(), 'terraceOrientation'] = 'NO_TERRACE'

        # Convert data types
        for col, dtype in self.col_types.items():
            if col in df.columns:
                if pd.api.types.is_integer_dtype(dtype):
                    # Integer columns cannot hold NaN, so fill before casting.
                    df[col] = df[col].fillna(0).astype(dtype)
                else:
                    df[col] = df[col].astype(dtype)
        return df
class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Derive model features: ``isHouse``, ``region``, ``pricePerM2`` (when a
    price is present), a numeric EPC score, an ordinal building condition and
    a binary flood-zone flag.

    Args:
        normalize_condition_labels (bool): When True, underscores in
            ``buildingCondition`` are replaced by spaces before the ordinal
            lookup, so labels such as ``AS_NEW`` or ``TO_RENOVATE`` get their
            rating. With the default (False) only labels that already match a
            key after lower-casing (e.g. ``GOOD``) are rated and everything
            else becomes -1. The default keeps the encoding unchanged for
            models fitted with it; enable it only when refitting.
    """

    # Class-level fallback so instances unpickled from before this option
    # existed still resolve the attribute.
    normalize_condition_labels = False

    def __init__(self, normalize_condition_labels=False):
        self.normalize_condition_labels = normalize_condition_labels
        # EPC letter -> numeric value, per region.
        self.epc_mapping = {
            'Flanders': {
                'A++': 0, 'A+': 0, 'A': 100, 'B': 200, 'C': 300,
                'D': 400, 'E': 500, 'F': 600, 'G': 700
            },
            'Wallonia': {
                'A++': 0, 'A+': 50, 'A': 90, 'B': 170, 'C': 250,
                'D': 330, 'E': 420, 'F': 510, 'G': 600
            },
            'Bruxelles': {
                'A++': 0, 'A+': 0, 'A': 45, 'B': 95, 'C': 145,
                'D': 210, 'E': 275, 'F': 345, 'G': 450
            }
        }

    def fit(self, X, y=None):
        """No-op; returns self."""
        return self

    @staticmethod
    def _region_for(zip_code):
        """Map a numeric post code to 'Bruxelles', 'Wallonia' or 'Flanders'."""
        if 1000 <= zip_code <= 1299:
            return "Bruxelles"
        if 1300 <= zip_code <= 1499 or 4000 <= zip_code <= 7999:
            return "Wallonia"
        return "Flanders"

    def transform(self, X):
        """Return a copy of ``X`` with the engineered features added.

        When a ``price`` column is present, rows priced above 1,500,000 (and
        rows whose price is NaN) are removed, so fewer rows may come back.
        """
        df = X.copy()
        has_price = 'price' in df.columns

        if has_price:
            # Filter out extremely high prices. Copy so the assignments below
            # write to an independent frame rather than a filtered view.
            df = df[df['price'] <= 1500000].copy()
            # Non-positive prices or surfaces cannot yield a price per m2.
            bad_price = df['price'] <= 0
            if bad_price.any():
                df.loc[bad_price, 'price'] = np.nan
            bad_surface = df['habitableSurface'] <= 0
            if bad_surface.any():
                df.loc[bad_surface, 'habitableSurface'] = np.nan

        # Add isHouse feature
        df['isHouse'] = (df['type'] == 'HOUSE').astype(int)

        # Region is needed before the EPC score can be converted.
        df['region'] = df['postCode'].apply(self._region_for)

        if has_price:
            df['pricePerM2'] = df['price'] / df['habitableSurface']
            # Division by zero gives inf; treat it like a missing value.
            df['pricePerM2'] = df['pricePerM2'].replace([np.inf, -np.inf], np.nan)
            # Missing values are marked with the sentinel -1.
            df['pricePerM2'] = df['pricePerM2'].fillna(-1)

        # Convert EPC score using the table of the row's region; -1 if unknown.
        df['epcScore'] = df.apply(
            lambda row: self.epc_mapping.get(row['region'], {}).get(row['epcScore'], None),
            axis=1)
        df['epcScore'] = df['epcScore'].fillna(-1)

        # Convert building condition to an ordinal rating; -1 if not matched.
        condition_rating = {
            'to restore': 0, 'to renovate': 1, 'to be done up': 2,
            'good': 3, 'just renovated': 4, 'as new': 5
        }
        labels = df['buildingCondition'].astype(str).str.strip().str.lower()
        if self.normalize_condition_labels:
            # 'to_renovate' -> 'to renovate', so it matches the keys above.
            labels = labels.str.replace('_', ' ', regex=False)
        df['buildingCondition'] = labels.map(condition_rating).fillna(-1).astype(int)

        # Flood zone: 1 for anything other than NON_FLOOD_ZONE.
        df['floodZoneType'] = (df['floodZoneType'] != 'NON_FLOOD_ZONE').astype(int)
        return df
class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode the configured categorical columns.

    ``fit`` records the categories seen in each column, and ``transform`` then
    always emits one dummy column per recorded category. A single row at
    prediction time therefore gets the same columns as the training data, and
    values not seen during ``fit`` encode as all zeros.

    An instance that was never fitted, or was unpickled from before categories
    were recorded, encodes only the values present in the data it is given.
    """

    def __init__(self):
        self.categorical_columns = ['province', 'heatingType', 'kitchenType', 'subtype', 'terraceOrientation']

    def fit(self, X, y=None):
        """Record the categories of each configured column present in ``X``."""
        self.categories_ = {
            col: pd.Categorical(X[col]).categories
            for col in self.categorical_columns
            if col in X.columns
        }
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the categorical columns replaced by
        integer dummy columns named ``<column>_<value>``."""
        df = X.copy()
        present = [col for col in self.categorical_columns if col in df.columns]
        # getattr: instances without recorded categories keep working.
        known = getattr(self, 'categories_', None) or {}
        for col in present:
            if col in known:
                # Fixing the category set makes get_dummies emit a column for
                # every training category, observed in this frame or not.
                df[col] = pd.Categorical(df[col], categories=known[col])
        if present:
            df = pd.get_dummies(df, columns=present, prefix=present, dtype=int)
        return df
class CoordinateGetter(BaseEstimator, TransformerMixin):
    """Attach ``latitude`` and ``longitude`` columns to each row.

    Frames with an ``id`` column are joined with ``data/Giraffe.csv`` on the
    property id, and rows without coordinates are dropped. Frames without an
    ``id`` column get coordinates looked up from the post code via pgeocode.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """No-op; returns self."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with coordinate columns merged in."""
        frame = X.copy()
        if 'id' in frame.columns:
            return self._from_property_id(frame)
        return self._from_post_code(frame)

    @staticmethod
    def _from_property_id(frame):
        """Left-join coordinates by property id and drop unmatched rows."""
        coords = pd.read_csv('data/Giraffe.csv')
        coords = coords[['propertyId', 'latitude', 'longitude']]
        coords = coords.rename(columns={'propertyId': 'id'})
        merged = frame.merge(coords, on='id', how='left')
        return merged.dropna(subset=['latitude', 'longitude'])

    @staticmethod
    def _from_post_code(frame):
        """Left-join coordinates looked up once per distinct post code."""
        geocoder = pgeocode.Nominatim('be')
        # Post codes become strings so they match the lookup result's key.
        frame['postCode'] = frame['postCode'].astype(str)
        distinct_codes = list(frame['postCode'].unique())
        lookup = geocoder.query_postal_code(distinct_codes)
        lookup = lookup[['postal_code', 'latitude', 'longitude']]
        lookup = lookup.rename(columns={'postal_code': 'postCode'})
        lookup['postCode'] = lookup['postCode'].astype(str)
        return frame.merge(lookup, on='postCode', how='left')
class KDEKNNFeatureCreator(BaseEstimator, TransformerMixin):
    """Add ``kde_price_per_m2_knn``: for each row, the Gaussian KDE density of
    the k nearest training listings' price per m2, evaluated at their mean.

    Neighbours are found on standardised latitude/longitude. Rows for which
    the value cannot be computed get the sentinel -1.

    Args:
        k (int): number of nearest training listings to use.
    """

    def __init__(self, k=20):
        self.k = k
        self.scaler = StandardScaler()
        self.knn = NearestNeighbors(n_neighbors=k)
        self.train_prices = None

    def fit(self, X, y=None):
        """Fit the coordinate scaler and neighbour index, and keep the
        training ``pricePerM2`` values for use in ``transform``."""
        if 'latitude' not in X.columns or 'longitude' not in X.columns:
            print("Warning: Missing latitude/longitude columns")
            return self
        coords_scaled = self.scaler.fit_transform(X[['latitude', 'longitude']])
        self.knn.fit(coords_scaled)
        # Positions in this array line up with the neighbour indices
        # returned by kneighbors().
        self.train_prices = X['pricePerM2'].values
        return self

    @staticmethod
    def _kde_at_mean(prices, row_number):
        """Density of ``prices`` at their own mean, or NaN if not computable."""
        prices = prices[~np.isnan(prices)]
        if len(prices) < 2:
            # A KDE needs at least two observations.
            return np.nan
        try:
            score = gaussian_kde(prices)(prices.mean())[0]
        except (np.linalg.LinAlgError, ValueError) as e:
            # Raised e.g. when all neighbour prices are identical, which
            # makes the covariance singular.
            print(f"Error in KDE calculation for row {row_number}: {str(e)}")
            return np.nan
        return score if np.isfinite(score) else np.nan

    def transform(self, X):
        """Return a copy of ``X`` with ``kde_price_per_m2_knn`` added and the
        coordinate columns removed."""
        df = X.copy()
        if 'latitude' not in df.columns or 'longitude' not in df.columns:
            print("Warning: Missing latitude/longitude columns")
            df['kde_price_per_m2_knn'] = np.nan
            return df

        coords = df[['latitude', 'longitude']]
        # A post code that could not be geocoded leaves NaN coordinates, and
        # the neighbour search rejects NaN. Such rows are skipped and receive
        # the sentinel below instead of failing the whole transform.
        has_coords = coords.notna().all(axis=1).to_numpy()
        kde_scores = np.full(len(df), np.nan)

        if has_coords.any():
            coords_scaled = self.scaler.transform(coords[has_coords])
            _, indices = self.knn.kneighbors(coords_scaled)
            for position, neighbor_idxs in zip(np.flatnonzero(has_coords), indices):
                kde_scores[position] = self._kde_at_mean(
                    self.train_prices[neighbor_idxs], position)

        df['kde_price_per_m2_knn'] = kde_scores
        # Rows without a usable density are marked with the sentinel -1.
        df['kde_price_per_m2_knn'] = df['kde_price_per_m2_knn'].fillna(-1)
        return df.drop(columns=['latitude', 'longitude'], errors='ignore')
class ColumnCleaner(BaseEstimator, TransformerMixin):
    """Final step: remove helper columns, make every feature numeric and move
    ``price`` (when present) to the last position."""

    def __init__(self):
        self.columns_to_drop = [
            'id', 'postCode', 'buildingConstructionYear', 'type', 'locality', 'region',
            'latitude', 'longitude', 'buildingConstructionYear'
        ]

    def fit(self, X, y=None):
        """No-op; returns self."""
        return self

    def transform(self, X):
        """Return a cleaned copy of ``X``."""
        out = X.copy()

        # Remove identifier and intermediate columns, skipping absent ones.
        out = out.drop(columns=self.columns_to_drop, errors='ignore')
        out = out.drop(columns=['pricePerM2'], errors='ignore')

        # Any text or categorical column still left is replaced by integer
        # category codes; the target is never encoded.
        leftovers = out.select_dtypes(include=['object', 'category']).columns
        for name in leftovers:
            if name == 'price':
                continue
            out[name] = pd.Categorical(out[name]).codes

        # Put the target last.
        if 'price' in out.columns:
            ordered = [name for name in out.columns if name != 'price']
            out = out[ordered + ['price']]
        return out

requirements.txt ADDED Viewed

	@@ -0,0 +1,42 @@

+altair==5.5.0
+attrs==25.3.0
+blinker==1.9.0
+cachetools==5.5.2
+certifi==2025.4.26
+charset-normalizer==3.4.2
+click==8.2.0
+gitdb==4.0.12
+GitPython==3.1.44
+idna==3.10
+Jinja2==3.1.6
+joblib==1.5.0
+jsonschema==4.23.0
+jsonschema-specifications==2025.4.1
+MarkupSafe==3.0.2
+narwhals==1.39.0
+numpy==2.2.5
+packaging==24.2
+pandas==2.2.3
+pgeocode==0.5.0
+pillow==11.2.1
+protobuf==6.31.0
+pyarrow==20.0.0
+pydeck==0.9.1
+python-dateutil==2.9.0.post0
+pytz==2025.2
+referencing==0.36.2
+requests==2.32.3
+rpds-py==0.24.0
+scikit-learn==1.6.1
+scipy==1.15.3
+six==1.17.0
+smmap==5.0.2
+streamlit==1.45.1
+tenacity==9.1.2
+threadpoolctl==3.6.0
+toml==0.10.2
+tornado==6.4.2
+typing_extensions==4.13.2
+tzdata==2025.2
+urllib3==2.4.0
+xgboost==3.0.1

saved/columns.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:236986f05a9f615d73af85b33abfb85fa82e36fc50391c62cf8959fbe310b47e
+size 1911

saved/model.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:294261411aa75310e5e9ea5427c35cb4d97f23e0927a325d09490ab12532acd9
+size 4703400

saved/pipeline.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c8794879223249a6241dbd1ae60c0c11564ce200f595c77a41a91bae53217f2b
+size 2256890