import ast
from collections import Counter

import numpy as np
import pandas as pd
from scipy.spatial import cKDTree

df_amenities = pd.read_csv("df_amenities.csv")
df_banks = pd.read_csv("df_banks.csv")

# Each cell of "fsq_category_labels" is parsed from text into a Python object
# (the code below indexes it and calls len() on it, so a list of label strings
# is expected -- TODO confirm against the CSV).
# NOTE(security): this used to be eval(x), which would execute any expression
# found in the CSV. ast.literal_eval only accepts Python literals.
df_amenities["fsq_category_labels"] = df_amenities["fsq_category_labels"].apply(
    ast.literal_eval
)

# KD-trees over the raw (lat, lon) columns for fast radius queries.
bank_coords = df_banks[["lat", "lon"]].values
tree_banks = cKDTree(bank_coords)

amenity_coords = df_amenities[["lat", "lon"]].values
tree_amenities = cKDTree(amenity_coords)

# Top-level category names that each get their own "num_<name>" feature.
DATASET_COLUMNS = [
    "Dining and Drinking",
    "Community and Government",
    "Retail",
    "Business and Professional Services",
    "Landmarks and Outdoors",
    "Arts and Entertainment",
    "Health and Medicine",
    "Travel and Transportation",
    "Sports and Recreation",
    "Event",
]


def compute_features(candidate_point, radius=0.005):
    """Build neighbourhood features for a candidate location.

    Distances are plain Euclidean distances in the same units as the
    ``lat``/``lon`` columns (no geodesic correction is applied), and
    ``radius`` is expressed in those same units.

    Args:
        candidate_point: A ``(lat, lon)`` pair.
        radius: Search radius around the candidate point.

    Returns:
        A dict with the keys ``num_banks_in_radius``, ``mean_dist_banks``,
        ``min_dist_bank``, ``total_amenities``, ``category_diversity`` and one
        ``num_<category>`` count for every entry of ``DATASET_COLUMNS``.
        When no bank lies within ``radius``, both distance features are set
        to ``radius``.
    """
    lat, lon = candidate_point
    query_point = [lat, lon]

    # Banks within the radius.
    bank_idxs = tree_banks.query_ball_point(query_point, r=radius)
    n_banks = len(bank_idxs)
    if n_banks > 0:
        neighbors = df_banks.iloc[bank_idxs]
        # Compute the distance vector once and reuse it for both statistics.
        bank_dists = np.sqrt(
            (neighbors["lat"] - lat) ** 2 + (neighbors["lon"] - lon) ** 2
        )
        mean_dist_banks = np.mean(bank_dists)
        min_dist_bank = np.min(bank_dists)
    else:
        # No bank nearby: fall back to the search radius for both distances.
        mean_dist_banks = radius
        min_dist_bank = radius

    # Amenities within the radius.
    amenity_idxs = tree_amenities.query_ball_point(query_point, r=radius)
    amenities = df_amenities.iloc[amenity_idxs]
    total_amenities = len(amenities)

    # Only the first label of each amenity is used, reduced to the part
    # before the first ">" (its top-level category). Amenities with no
    # labels are skipped.
    top_level_categories = [
        cats[0].split(">")[0].strip()
        for cats in amenities["fsq_category_labels"]
        if len(cats) > 0
    ]
    count_per_category = Counter(top_level_categories)

    features = {
        "num_banks_in_radius": n_banks,
        "mean_dist_banks": mean_dist_banks,
        "min_dist_bank": min_dist_bank,
        "total_amenities": total_amenities,
        # Number of distinct top-level categories seen in the radius.
        "category_diversity": len(count_per_category),
    }

    # Fixed set of per-category counts so every call returns the same keys;
    # categories absent from the neighbourhood get 0.
    for feat in DATASET_COLUMNS:
        features[f"num_{feat}"] = count_per_category.get(feat, 0)

    return features