|
|
import ast
from collections import Counter

import numpy as np
import pandas as pd
from scipy.spatial import cKDTree
|
|
|
|
|
# Load the two input tables from the working directory.
df_amenities = pd.read_csv("df_amenities.csv")
df_banks = pd.read_csv("df_banks.csv")

# Each "fsq_category_labels" cell arrives from the CSV as text and is parsed
# back into a Python object (presumably a list of label strings -- confirm
# against the file that produced the CSV).
# SECURITY: this used to call eval() on every cell, which executes whatever
# expression the data file contains. ast.literal_eval only accepts Python
# literals and raises ValueError/SyntaxError on anything else.
df_amenities["fsq_category_labels"] = df_amenities["fsq_category_labels"].apply(
    ast.literal_eval
)
|
|
|
|
|
# Raw (lat, lon) coordinate arrays, one row per bank / amenity, in the same
# row order as the dataframes they come from.
bank_coords = df_banks[['lat', 'lon']].to_numpy()
amenity_coords = df_amenities[['lat', 'lon']].to_numpy()

# KD-trees over those arrays; indices returned by tree queries are row
# positions in the corresponding dataframe.
tree_banks = cKDTree(bank_coords)
tree_amenities = cKDTree(amenity_coords)
|
|
|
|
|
# Category names for which a per-category count feature is emitted.
# The order here fixes the order of the resulting feature keys.
DATASET_COLUMNS = [
    'Dining and Drinking',
    'Community and Government',
    'Retail',
    'Business and Professional Services',
    'Landmarks and Outdoors',
    'Arts and Entertainment',
    'Health and Medicine',
    'Travel and Transportation',
    'Sports and Recreation',
    'Event',
]
|
|
|
|
|
def compute_features(candidate_point, radius=0.005):
    """Build the neighbourhood feature dictionary for one candidate location.

    Both module-level KD-trees are queried for every point lying within
    ``radius`` of the candidate. ``radius`` and the returned distances are
    plain Euclidean values in the units of the ``lat``/``lon`` columns the
    trees were built from; no geodesic correction is applied.

    Args:
        candidate_point: A ``(lat, lon)`` pair.
        radius: Search radius handed to ``query_ball_point`` for both trees.

    Returns:
        A dict with the keys, in this order:

        * ``num_banks_in_radius`` -- number of banks found.
        * ``mean_dist_banks`` / ``min_dist_bank`` -- mean and minimum
          distance to those banks, or ``radius`` for both when none is found.
        * ``total_amenities`` -- number of amenities found.
        * ``category_diversity`` -- number of distinct top-level categories
          among those amenities.
        * ``num_<category>`` -- one count per entry of ``DATASET_COLUMNS``
          (0 when the category does not occur).
    """
    lat, lon = candidate_point
    query_point = [lat, lon]

    # --- Banks around the candidate -------------------------------------
    bank_idxs = tree_banks.query_ball_point(query_point, r=radius)
    n_banks = len(bank_idxs)

    if n_banks > 0:
        neighbors = df_banks.iloc[bank_idxs]
        # Compute the distance vector once and reuse it for both statistics.
        bank_dists = np.sqrt(
            (neighbors['lat'] - lat) ** 2 + (neighbors['lon'] - lon) ** 2
        )
        mean_dist_banks = np.mean(bank_dists)
        min_dist_bank = np.min(bank_dists)
    else:
        # No bank inside the search circle: report the radius itself.
        mean_dist_banks = radius
        min_dist_bank = radius

    # --- Amenities around the candidate ---------------------------------
    amenity_idxs = tree_amenities.query_ball_point(query_point, r=radius)
    amenities = df_amenities.iloc[amenity_idxs]

    # Only the first label of each amenity is used, reduced to the text
    # before its first ">" separator; amenities without labels are skipped.
    top_level_categories = [
        labels[0].split(">")[0].strip()
        for labels in amenities['fsq_category_labels']
        if len(labels) > 0
    ]
    count_per_category = Counter(top_level_categories)

    features = {
        'num_banks_in_radius': n_banks,
        'mean_dist_banks': mean_dist_banks,
        'min_dist_bank': min_dist_bank,
        'total_amenities': len(amenities),
        # A Counter has one key per distinct category seen.
        'category_diversity': len(count_per_category),
    }

    for feat in DATASET_COLUMNS:
        features[f'num_{feat}'] = count_per_category.get(feat, 0)

    return features