Spaces:

Rhodham96
/

RealEstate_pricePrediction_Belgium

Sleeping

App Files Files Community

RealEstate_pricePrediction_Belgium / preprocessing /pipeline_components.py

Rhodham96

Update preprocessing/pipeline_components.py

933cc13 verified 7 months ago

raw

history blame contribute delete

16.1 kB

	import os
	os.environ["PGEOCODE_CACHE_DIR"] = "/tmp/pgeocode"

	import pandas as pd
	import numpy as np
	from sklearn.base import BaseEstimator, TransformerMixin
	from scipy.stats import gaussian_kde
	from sklearn.preprocessing import StandardScaler
	from sklearn.neighbors import NearestNeighbors
	from datetime import datetime
	import pgeocode
	# Point pgeocode at a cache directory under /tmp right after importing it.
	# This has to happen before the first pgeocode.Nominatim() call, because
	# that is when pgeocode reads STORAGE_DIR.
	try:
	    pgeocode.STORAGE_DIR = "/tmp/pgeocode"
	    # pgeocode may not create the directory itself when STORAGE_DIR is
	    # overridden this way, so create it up front.
	    os.makedirs(pgeocode.STORAGE_DIR, exist_ok=True)
	    print(f"DEBUG: Successfully forced pgeocode.STORAGE_DIR to '{pgeocode.STORAGE_DIR}' and ensured directory exists.")
	except OSError as e:
	    # Best effort: report the failure but keep the module importable.
	    print(f"ERROR: Failed to force pgeocode.STORAGE_DIR or create directory: {e}")

	class DataCleaner(BaseEstimator, TransformerMixin):
	    """Clean a raw listings DataFrame before feature engineering.

	    ``transform`` drops unused columns, converts boolean-like flags to 0/1,
	    fills missing values, derives ``buildingAge`` and casts the columns
	    listed in ``col_types``. Rows without ``habitableSurface`` are dropped,
	    so the output can have fewer rows than the input.
	    """

	    # Spellings of boolean flags that are mapped to 1/0. Numeric 1/0 also
	    # match the True/False keys; anything else maps to NaN.
	    _FLAG_MAP = {True: 1, False: 0, 'True': 1, 'False': 0, 'YES': 1, 'NO': 0}

	    # Fill value used when neither the batch nor the fitted data has a mode.
	    _UNKNOWN_CATEGORY = 'NOT_MENTIONED'

	    def __init__(self):
	        # Target dtype per column; applied at the end of transform().
	        self.col_types = {
	            'id': 'int', 'type': 'str', 'subtype': 'str', 'bedroomCount': 'int',
	            'bathroomCount': 'int', 'province': 'str', 'locality': 'str',
	            'postCode': 'int', 'habitableSurface': 'float', 'hasBasement': 'int',
	            'buildingCondition': 'str', 'buildingConstructionYear': 'int',
	            'hasLift': 'int', 'floodZoneType': 'str', 'heatingType': 'str',
	            'hasHeatPump': 'int', 'hasPhotovoltaicPanels': 'int',
	            'hasThermicPanels': 'int', 'kitchenType': 'str', 'landSurface': 'float',
	            'hasLivingRoom': 'int', 'livingRoomSurface': 'float', 'hasGarden': 'int',
	            'gardenSurface': 'float', 'parkingCountIndoor': 'int',
	            'parkingCountOutdoor': 'int', 'hasAirConditioning': 'int',
	            'hasArmoredDoor': 'int', 'hasVisiophone': 'int', 'hasOffice': 'int',
	            'toiletCount': 'int', 'hasSwimmingPool': 'int', 'hasFireplace': 'int',
	            'hasTerrace': 'int', 'terraceSurface': 'float', 'terraceOrientation': 'str',
	            'epcScore': 'str', 'facadeCount': 'int',
	        }
	        # Most frequent values seen in fit(); used only as fallbacks.
	        self.kitchenType_mode = None
	        self.heatingType_mode = None

	    @staticmethod
	    def _first_mode(series):
	        """Return the first mode of *series*, or None when it has no non-null value."""
	        modes = series.mode()
	        return modes.iloc[0] if len(modes) else None

	    def fit(self, X, y=None):
	        """Remember the most frequent kitchen and heating type as fallbacks.

	        Columns that are absent from *X* leave the corresponding fallback
	        at None. Returns ``self``.
	        """
	        columns = getattr(X, 'columns', ())
	        self.kitchenType_mode = (
	            self._first_mode(X['kitchenType']) if 'kitchenType' in columns else None
	        )
	        self.heatingType_mode = (
	            self._first_mode(X['heatingType']) if 'heatingType' in columns else None
	        )
	        return self

	    def _fill_with_mode(self, df, col, fitted_attr):
	        """Fill NaNs in df[col] with the batch mode, else the fitted mode, else a constant."""
	        fill = self._first_mode(df[col])
	        if fill is None:
	            # getattr: instances created before this attribute existed lack it.
	            fill = getattr(self, fitted_attr, None)
	        if fill is None:
	            fill = self._UNKNOWN_CATEGORY
	        df[col] = df[col].fillna(fill)

	    def transform(self, X):
	        """Return a cleaned copy of *X*; the input frame is not modified."""
	        df = X.copy()

	        # Drop unnecessary columns (only those actually present).
	        unused = [
	            "Unnamed: 0", "url",
	            'monthlyCost', 'hasBalcony', 'accessibleDisabledPeople',
	            'roomCount', 'diningRoomSurface', 'streetFacadeWidth',
	            'gardenOrientation', 'kitchenSurface', 'floorCount',
	            'hasDiningRoom', 'hasDressingRoom',
	        ]
	        df = df.drop(columns=[col for col in unused if col in df.columns])

	        # Handle binary columns. Missing or unrecognised values become 0.
	        # NOTE: the parking counts are in this list too, so any count other
	        # than 0/1 also ends up as 0.
	        binary_cols = [
	            'hasBasement', 'hasLift', 'hasHeatPump', 'hasPhotovoltaicPanels',
	            'hasAirConditioning', 'hasArmoredDoor', 'hasVisiophone', 'hasOffice',
	            'hasSwimmingPool', 'hasFireplace', 'parkingCountIndoor', 'parkingCountOutdoor',
	            'hasAttic', 'hasThermicPanels',
	        ]
	        for col in binary_cols:
	            if col in df.columns:
	                df[col] = df[col].map(self._FLAG_MAP).fillna(0).astype(int)
	            else:
	                # An absent flag is treated like a missing value.
	                df[col] = 0

	        # Handle dependent columns: when the flag is missing, infer it from
	        # whether a surface was given; when the feature is absent, the
	        # surface is 0.
	        dependent = (
	            ('hasLivingRoom', 'livingRoomSurface'),
	            ('hasGarden', 'gardenSurface'),
	            ('hasTerrace', 'terraceSurface'),
	        )
	        for flag, surface in dependent:
	            df[flag] = df[flag].map(self._FLAG_MAP)
	            df.loc[df[flag].isna(), flag] = df[surface].notnull().astype(int)
	            df.loc[df[flag] == 0, surface] = 0
	        # NOTE: rows without a terrace get the integer 0 here, which becomes
	        # the string '0' in the final cast. Kept as is so the resulting
	        # category name does not change.
	        df.loc[df['hasTerrace'] == 0, 'terraceOrientation'] = 0

	        # Counts: -1 marks "unknown".
	        df['facadeCount'] = df['facadeCount'].fillna(-1)
	        for col in ('bedroomCount', 'bathroomCount', 'toiletCount'):
	            df[col] = df[col].fillna(-1).astype(float)

	        # Rows without a habitable surface are dropped.
	        df = df.dropna(subset=['habitableSurface'])

	        # Fill other missing values.
	        df['buildingCondition'] = df['buildingCondition'].fillna('NOT_MENTIONED')
	        df['floodZoneType'] = df['floodZoneType'].fillna('NON_FLOOD_ZONE')
	        self._fill_with_mode(df, 'heatingType', 'heatingType_mode')
	        self._fill_with_mode(df, 'kitchenType', 'kitchenType_mode')
	        df['landSurface'] = df['landSurface'].fillna(df['landSurface'].median())
	        df['livingRoomSurface'] = df['livingRoomSurface'].fillna(df['livingRoomSurface'].median())

	        # Transform building construction year into age; -1 marks "unknown".
	        # The age is relative to the year in which transform() runs.
	        current_year = datetime.now().year
	        df['buildingAge'] = current_year - df['buildingConstructionYear']
	        df['buildingAge'] = df['buildingAge'].fillna(-1)

	        # Handle terrace surface and orientation that are still missing.
	        has_terrace = df['hasTerrace'] == 1
	        df.loc[has_terrace & df['terraceSurface'].isna(), 'terraceSurface'] = -1
	        df.loc[~has_terrace & df['terraceSurface'].isna(), 'terraceSurface'] = 0
	        df.loc[has_terrace & df['terraceOrientation'].isna(), 'terraceOrientation'] = 'NOT_MENTIONED'
	        df.loc[~has_terrace & df['terraceOrientation'].isna(), 'terraceOrientation'] = 'NO_TERRACE'

	        # Convert data types; integer targets cannot hold NaN, so fill with 0 first.
	        for col, dtype in self.col_types.items():
	            if col in df.columns:
	                if pd.api.types.is_integer_dtype(dtype):
	                    df[col] = df[col].fillna(0).astype(dtype)
	                else:
	                    df[col] = df[col].astype(dtype)

	        return df

	class FeatureEngineer(BaseEstimator, TransformerMixin):
	    """Derive model features from a cleaned listings DataFrame.

	    Adds ``isHouse``, ``region`` and (when ``price`` is present)
	    ``pricePerM2``; converts ``epcScore``, ``buildingCondition`` and
	    ``floodZoneType`` to numbers. When ``price`` is present, rows priced
	    above 1,500,000 (and rows whose price is NaN) are removed.
	    """

	    def __init__(self):
	        # Numeric value assigned to each EPC label, per region.
	        self.epc_mapping = {
	            'Flanders': {
	                'A++': 0, 'A+': 0, 'A': 100, 'B': 200, 'C': 300,
	                'D': 400, 'E': 500, 'F': 600, 'G': 700
	            },
	            'Wallonia': {
	                'A++': 0, 'A+': 50, 'A': 90, 'B': 170, 'C': 250,
	                'D': 330, 'E': 420, 'F': 510, 'G': 600
	            },
	            'Bruxelles': {
	                'A++': 0, 'A+': 0, 'A': 45, 'B': 95, 'C': 145,
	                'D': 210, 'E': 275, 'F': 345, 'G': 450
	            }
	        }

	    def fit(self, X, y=None):
	        """Nothing is learned; returns ``self``."""
	        return self

	    @staticmethod
	    def _region_for(zip_code):
	        """Map a numeric postcode to 'Bruxelles', 'Wallonia' or 'Flanders' (the default)."""
	        if 1000 <= zip_code <= 1299:
	            return "Bruxelles"
	        if 1300 <= zip_code <= 1499 or 4000 <= zip_code <= 7999:
	            return "Wallonia"
	        return "Flanders"

	    def transform(self, X):
	        """Return a copy of *X* with the engineered features added."""
	        df = X.copy()
	        has_price = 'price' in df.columns

	        if has_price:
	            # Filter out extremely high prices. The comparison is False for
	            # NaN, so rows without a price are removed as well. Copy so the
	            # assignments below write to an independent frame.
	            df = df[df['price'] <= 1500000].copy()

	            # Non-positive prices and surfaces are invalid: mark as missing.
	            bad_price = df['price'] <= 0
	            if bad_price.any():
	                df.loc[bad_price, 'price'] = np.nan
	            bad_surface = df['habitableSurface'] <= 0
	            if bad_surface.any():
	                df.loc[bad_surface, 'habitableSurface'] = np.nan

	        # Add isHouse feature
	        df['isHouse'] = (df['type'] == 'HOUSE').astype(int)

	        # Region is needed below to pick the right EPC scale.
	        df['region'] = df['postCode'].apply(self._region_for)

	        if has_price:
	            # Price per m2; infinities and missing values become -1.
	            df['pricePerM2'] = df['price'] / df['habitableSurface']
	            df['pricePerM2'] = df['pricePerM2'].replace([np.inf, -np.inf], np.nan)
	            df['pricePerM2'] = df['pricePerM2'].fillna(-1)

	        # Convert EPC score using the region-specific table; labels that are
	        # not in the table become -1. Built as a float Series so the column
	        # is numeric even when no label matches or the frame is empty.
	        epc_values = [
	            self.epc_mapping.get(region, {}).get(label)
	            for region, label in zip(df['region'], df['epcScore'])
	        ]
	        df['epcScore'] = (
	            pd.Series(epc_values, index=df.index, dtype='float64').fillna(-1).to_numpy()
	        )

	        # Convert building condition to an ordinal rating; unknown -> -1.
	        # Underscores are treated as spaces so that 'TO_RENOVATE' and
	        # 'to renovate' are rated alike.
	        condition_rating = {
	            'to restore': 0, 'to renovate': 1, 'to be done up': 2,
	            'good': 3, 'just renovated': 4, 'as new': 5
	        }
	        df['buildingCondition'] = (
	            df['buildingCondition'].astype(str).str.strip().str.lower()
	            .str.replace('_', ' ', regex=False)
	            .map(condition_rating).fillna(-1).astype(int)
	        )

	        # Convert flood zone type to a 0/1 flag (1 = any flood zone).
	        df['floodZoneType'] = (df['floodZoneType'] != 'NON_FLOOD_ZONE').astype(int)

	        return df

	class CategoricalEncoder(BaseEstimator, TransformerMixin):
	    """One-hot encode the categorical columns with ``pd.get_dummies``.

	    ``fit`` records the categories of each column so that ``transform``
	    produces the same dummy columns for every batch. Values not seen
	    during ``fit`` get all-zero dummies. An instance that was never fitted
	    encodes whatever values the batch contains.
	    """

	    def __init__(self):
	        self.categorical_columns = ['province', 'heatingType', 'kitchenType', 'subtype', 'terraceOrientation']

	    def fit(self, X, y=None):
	        """Record the categories present in each categorical column of *X*."""
	        columns = getattr(X, 'columns', ())
	        self.categories_ = {
	            col: pd.Categorical(X[col]).categories
	            for col in self.categorical_columns
	            if col in columns
	        }
	        return self

	    def transform(self, X):
	        """Return a copy of *X* with each categorical column replaced by dummies."""
	        df = X.copy()
	        # getattr: instances that were never fitted have no categories_.
	        learned = getattr(self, 'categories_', None) or {}

	        # One-hot encode categorical columns
	        for col in self.categorical_columns:
	            if col not in df.columns:
	                continue
	            if col in learned:
	                # A categorical dtype makes get_dummies emit one column per
	                # learned category, whether or not it occurs in this batch.
	                df[col] = pd.Categorical(df[col], categories=learned[col])
	            df = pd.get_dummies(df, columns=[col], prefix=col, dtype=int)

	        return df

	class CoordinateGetter(BaseEstimator, TransformerMixin):
	    """Attach ``latitude`` and ``longitude`` columns to the listings.

	    When the frame has an ``id`` column, coordinates are looked up by id in
	    a CSV file and rows without a match are dropped. Otherwise they are
	    looked up by postcode with pgeocode, and ``postCode`` is converted to
	    string in the result.

	    Parameters
	    ----------
	    coordinates_path : str, default 'data/Giraffe.csv'
	        CSV file with ``propertyId``, ``latitude`` and ``longitude`` columns.
	    """

	    _DEFAULT_COORDINATES_PATH = 'data/Giraffe.csv'

	    def __init__(self, coordinates_path='data/Giraffe.csv'):
	        self.coordinates_path = coordinates_path

	    def fit(self, X, y=None):
	        """Nothing is learned; returns ``self``."""
	        return self

	    def transform(self, X):
	        """Return a copy of *X* merged with coordinates.

	        Both branches merge, so the result has a fresh integer index.
	        """
	        df = X.copy()
	        if 'id' in df.columns:
	            # getattr: instances created before the parameter existed lack it.
	            path = getattr(self, 'coordinates_path', self._DEFAULT_COORDINATES_PATH)
	            lookup = pd.read_csv(path, usecols=['propertyId', 'latitude', 'longitude'])
	            lookup = lookup.rename(columns={'propertyId': 'id'})
	            lookup = lookup[['id', 'latitude', 'longitude']]

	            df = df.merge(lookup, on='id', how='left')
	            # Listings without known coordinates are removed.
	            df = df.dropna(subset=['latitude', 'longitude'])

	        else:
	            print(f"DEBUG: PGEOCODE_CACHE_DIR from os.environ: '{os.environ.get('PGEOCODE_CACHE_DIR')}'")
	            print(f"DEBUG: XDG_CACHE_HOME from os.environ: '{os.environ.get('XDG_CACHE_HOME')}'")
	            print(f"DEBUG: os.path.expanduser('~'): '{os.path.expanduser('~')}'")
	            print(f"DEBUG: Current Working Directory: '{os.getcwd()}'")

	            # Try to create the /tmp/pgeocode directory to check permissions there
	            temp_cache_dir = "/tmp/pgeocode"
	            try:
	                os.makedirs(temp_cache_dir, exist_ok=True)
	                print(f"DEBUG: Successfully created/ensured existence of '{temp_cache_dir}'.")
	            except OSError as e:
	                print(f"DEBUG: Failed to create '{temp_cache_dir}': {e}")

	            nomi = pgeocode.Nominatim('be')

	            # Both sides of the merge use string postcodes as the key.
	            df['postCode'] = df['postCode'].astype(str)
	            unique_postcodes = df['postCode'].unique()

	            geo_df = nomi.query_postal_code(list(unique_postcodes))
	            geo_df = geo_df[['postal_code', 'latitude', 'longitude']]
	            geo_df = geo_df.rename(columns={'postal_code': 'postCode'})
	            geo_df['postCode'] = geo_df['postCode'].astype(str)
	            # Left merge: unmatched postcodes keep their row, with NaN coordinates.
	            df = df.merge(geo_df, on='postCode', how='left')

	        return df

	class KDEKNNFeatureCreator(BaseEstimator, TransformerMixin):
	    """Add ``kde_price_per_m2_knn``, a neighbourhood price-density feature.

	    ``fit`` indexes the training coordinates and stores their
	    ``pricePerM2``. For each row, ``transform`` finds the ``k`` nearest
	    training points, fits a Gaussian KDE to their prices and stores the
	    density evaluated at the mean of those prices. Rows for which this is
	    not possible get -1.

	    Parameters
	    ----------
	    k : int, default 20
	        Number of nearest training points to use.
	    """

	    def __init__(self, k=20):
	        self.k = k
	        self.scaler = StandardScaler()
	        self.knn = NearestNeighbors(n_neighbors=k)
	        self.train_prices = None

	    def fit(self, X, y=None):
	        """Index the training coordinates and remember their ``pricePerM2``.

	        Rows with a missing coordinate are left out. If *X* has no
	        coordinate columns, or no usable row, nothing is fitted. Raises
	        KeyError when ``pricePerM2`` is absent.
	        """
	        if 'latitude' not in X.columns or 'longitude' not in X.columns:
	            print("Warning: Missing latitude/longitude columns")
	            return self

	        coords = X[['latitude', 'longitude']]
	        # NearestNeighbors rejects NaN, so fit on complete coordinates only.
	        usable = coords.notna().all(axis=1).to_numpy()
	        if not usable.any():
	            print("Warning: Missing latitude/longitude columns")
	            self.train_prices = None
	            return self

	        coords_scaled = self.scaler.fit_transform(coords[usable])
	        self.knn.fit(coords_scaled)

	        # Store training prices, aligned with the rows given to the index.
	        self.train_prices = X['pricePerM2'].to_numpy(dtype=float)[usable]

	        return self

	    @staticmethod
	    def _kde_score(neighbor_prices, row_number):
	        """Return the KDE density at the mean of *neighbor_prices*, or NaN if unavailable."""
	        prices = np.asarray(neighbor_prices, dtype=float)
	        # Keep real prices only: drop NaN/inf and non-positive values,
	        # which includes a -1 "missing" marker.
	        prices = prices[np.isfinite(prices) & (prices > 0)]
	        if len(prices) < 2:
	            return np.nan

	        try:
	            score = gaussian_kde(prices)(prices.mean())[0]
	        except (np.linalg.LinAlgError, ValueError) as e:
	            # Raised e.g. when all neighbour prices are identical.
	            print(f"Error in KDE calculation for row {row_number}: {str(e)}")
	            return np.nan

	        return score if np.isfinite(score) else np.nan

	    def transform(self, X):
	        """Return a copy of *X* with the feature added and the coordinates dropped.

	        If *X* has no coordinate columns, the feature is NaN and *X* is
	        otherwise returned unchanged.
	        """
	        df = X.copy()

	        if 'latitude' not in df.columns or 'longitude' not in df.columns:
	            print("Warning: Missing latitude/longitude columns")
	            df['kde_price_per_m2_knn'] = np.nan
	            return df

	        kde_scores = np.full(len(df), np.nan)
	        coords = df[['latitude', 'longitude']]
	        has_coords = coords.notna().all(axis=1).to_numpy()

	        # train_prices is None when fit() had nothing to index.
	        if self.train_prices is not None and has_coords.any():
	            coords_scaled = self.scaler.transform(coords[has_coords])
	            # kneighbors raises if asked for more neighbours than were fitted.
	            n_neighbors = min(self.knn.n_neighbors, self.knn.n_samples_fit_)
	            _, indices = self.knn.kneighbors(coords_scaled, n_neighbors=n_neighbors)

	            for row_number, neighbor_idxs in zip(np.flatnonzero(has_coords), indices):
	                # Use stored training prices for neighbors
	                kde_scores[row_number] = self._kde_score(
	                    self.train_prices[neighbor_idxs], row_number
	                )

	        df['kde_price_per_m2_knn'] = kde_scores

	        # Rows without a usable score get the -1 marker.
	        df['kde_price_per_m2_knn'] = df['kde_price_per_m2_knn'].fillna(-1)

	        return df.drop(columns=['latitude', 'longitude'], errors='ignore')

	class ColumnCleaner(BaseEstimator, TransformerMixin):
	    """Final clean-up: drop helper columns, make everything numeric, put ``price`` last."""

	    def __init__(self):
	        # Identifier and intermediate columns that are removed.
	        self.columns_to_drop = [
	            'id', 'postCode', 'buildingConstructionYear', 'type', 'locality', 'region',
	            'latitude', 'longitude'
	        ]

	    def fit(self, X, y=None):
	        """Nothing is learned; returns ``self``."""
	        return self

	    def transform(self, X):
	        """Return a cleaned copy of *X*."""
	        df = X.copy()

	        # Drop columns that are no longer needed. pricePerM2 is derived from
	        # the target, so it goes too. dict.fromkeys removes repeated names.
	        unwanted = dict.fromkeys([*self.columns_to_drop, 'pricePerM2'])
	        df = df.drop(columns=[col for col in unwanted if col in df.columns])

	        # Ensure all remaining columns are numeric: leftover object/category
	        # columns are replaced by integer category codes (missing -> -1).
	        # The codes are computed from this batch only.
	        non_numeric_cols = df.select_dtypes(include=['object', 'category']).columns
	        for col in non_numeric_cols:
	            if col != 'price':  # Don't encode the target variable
	                df[col] = pd.Categorical(df[col]).codes

	        # Reorganize columns to put price at the end
	        cols = df.columns.tolist()
	        if 'price' in cols:
	            cols.remove('price')
	            cols.append('price')
	            df = df[cols]

	        return df