Improve multi distance calculator
Browse files- apps/multi_points_distance_calculator.py +71 -24
- utils/utils_functions.py +111 -26
apps/multi_points_distance_calculator.py
CHANGED
|
@@ -15,16 +15,16 @@ st.write(
|
|
| 15 |
dataset1_sample_file_path = "samples/Dataset1.xlsx"
|
| 16 |
dataset2_sample_file_path = "samples/Dataset2.xlsx"
|
| 17 |
|
| 18 |
-
|
| 19 |
|
| 20 |
-
with
|
| 21 |
st.download_button(
|
| 22 |
label="Dataset1 Sample File",
|
| 23 |
data=open(dataset1_sample_file_path, "rb").read(),
|
| 24 |
file_name="Dataset1.xlsx",
|
| 25 |
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
| 26 |
)
|
| 27 |
-
with
|
| 28 |
st.download_button(
|
| 29 |
label="Dataset2 Sample File",
|
| 30 |
data=open(dataset2_sample_file_path, "rb").read(),
|
|
@@ -32,51 +32,98 @@ with col2:
|
|
| 32 |
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
| 33 |
)
|
| 34 |
|
|
|
|
| 35 |
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
-
# Upload Dataset 2
|
| 41 |
-
st.subheader("Upload Dataset 2 (Comparison Points)")
|
| 42 |
-
file2 = st.file_uploader("Upload second dataset (Excel)", type=["xlsx"], key="file2")
|
| 43 |
|
| 44 |
if file1 and file2:
|
|
|
|
| 45 |
try:
|
| 46 |
# Read the datasets
|
| 47 |
df1 = pd.read_excel(file1)
|
| 48 |
df2 = pd.read_excel(file2)
|
| 49 |
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
lat_col2 = st.selectbox("Select 'Latitude' Column", df2.columns, key="lat2")
|
| 58 |
-
long_col2 = st.selectbox("Select 'Longitude' Column", df2.columns, key="long2")
|
| 59 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
# Calculate distances when button is clicked
|
| 61 |
if st.button("Calculate Distances"):
|
| 62 |
-
df_distances, df_closest = calculate_distances(
|
| 63 |
-
df1,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
)
|
| 65 |
|
| 66 |
-
# Display all distances
|
| 67 |
-
st.subheader("All Distances")
|
| 68 |
-
st.dataframe(df_distances)
|
| 69 |
|
| 70 |
# Display closest points
|
| 71 |
st.subheader("Closest Matches")
|
| 72 |
st.dataframe(df_closest)
|
|
|
|
|
|
|
| 73 |
|
| 74 |
-
# Downloadable CSV
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
st.download_button(
|
| 76 |
label="Download Closest Matches as CSV",
|
| 77 |
data=df_closest.to_csv(index=False),
|
| 78 |
file_name="closest_matches.csv",
|
| 79 |
mime="text/csv",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
)
|
| 81 |
|
| 82 |
except Exception as e:
|
|
|
|
| 15 |
dataset1_sample_file_path = "samples/Dataset1.xlsx"
|
| 16 |
dataset2_sample_file_path = "samples/Dataset2.xlsx"
|
| 17 |
|
| 18 |
+
download_col1, download_col2 = st.columns(2)
|
| 19 |
|
| 20 |
+
with download_col1:
|
| 21 |
st.download_button(
|
| 22 |
label="Dataset1 Sample File",
|
| 23 |
data=open(dataset1_sample_file_path, "rb").read(),
|
| 24 |
file_name="Dataset1.xlsx",
|
| 25 |
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
| 26 |
)
|
| 27 |
+
with download_col2:
|
| 28 |
st.download_button(
|
| 29 |
label="Dataset2 Sample File",
|
| 30 |
data=open(dataset2_sample_file_path, "rb").read(),
|
|
|
|
| 32 |
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
| 33 |
)
|
| 34 |
|
| 35 |
+
upload_data1_col, upload_data2_col = st.columns(2)
|
| 36 |
|
| 37 |
+
with upload_data1_col:
|
| 38 |
+
# Upload Dataset 1
|
| 39 |
+
st.subheader("Upload Dataset 1 (Reference Points)")
|
| 40 |
+
file1 = st.file_uploader("Upload first dataset (Excel)", type=["xlsx"], key="file1")
|
| 41 |
+
|
| 42 |
+
with upload_data2_col:
|
| 43 |
+
# Upload Dataset 2
|
| 44 |
+
st.subheader("Upload Dataset 2 (Comparison Points)")
|
| 45 |
+
file2 = st.file_uploader(
|
| 46 |
+
"Upload second dataset (Excel)", type=["xlsx"], key="file2"
|
| 47 |
+
)
|
| 48 |
|
|
|
|
|
|
|
|
|
|
| 49 |
|
| 50 |
if file1 and file2:
|
| 51 |
+
param_col1, param_col2 = st.columns(2)
|
| 52 |
try:
|
| 53 |
# Read the datasets
|
| 54 |
df1 = pd.read_excel(file1)
|
| 55 |
df2 = pd.read_excel(file2)
|
| 56 |
|
| 57 |
+
with param_col1:
|
| 58 |
+
st.subheader("Select Columns for Dataset 1")
|
| 59 |
+
code_col1 = st.selectbox("Select 'CODE' Column", df1.columns, key="code1")
|
| 60 |
+
lat_col1 = st.selectbox("Select 'Latitude' Column", df1.columns, key="lat1")
|
| 61 |
+
long_col1 = st.selectbox(
|
| 62 |
+
"Select 'Longitude' Column", df1.columns, key="long1"
|
| 63 |
+
)
|
|
|
|
|
|
|
| 64 |
|
| 65 |
+
with param_col2:
|
| 66 |
+
st.subheader("Select Columns for Dataset 2")
|
| 67 |
+
code_col2 = st.selectbox("Select 'CODE' Column", df2.columns, key="code2")
|
| 68 |
+
lat_col2 = st.selectbox("Select 'Latitude' Column", df2.columns, key="lat2")
|
| 69 |
+
long_col2 = st.selectbox(
|
| 70 |
+
"Select 'Longitude' Column", df2.columns, key="long2"
|
| 71 |
+
)
|
| 72 |
+
min_distance = st.number_input(
|
| 73 |
+
"Minimum Distance (km)", min_value=0.0, value=5.0
|
| 74 |
+
)
|
| 75 |
# Calculate distances when button is clicked
|
| 76 |
if st.button("Calculate Distances"):
|
| 77 |
+
df_distances, df_closest, df_closest_min_distance = calculate_distances(
|
| 78 |
+
df1,
|
| 79 |
+
df2,
|
| 80 |
+
code_col1,
|
| 81 |
+
lat_col1,
|
| 82 |
+
long_col1,
|
| 83 |
+
code_col2,
|
| 84 |
+
lat_col2,
|
| 85 |
+
long_col2,
|
| 86 |
+
min_distance,
|
| 87 |
)
|
| 88 |
|
| 89 |
+
# # Display all distances
|
| 90 |
+
# st.subheader("All Distances")
|
| 91 |
+
# st.dataframe(df_distances)
|
| 92 |
|
| 93 |
# Display closest points
|
| 94 |
st.subheader("Closest Matches")
|
| 95 |
st.dataframe(df_closest)
|
| 96 |
+
st.subheader("Closest Matches below Min Distance")
|
| 97 |
+
st.dataframe(df_closest_min_distance)
|
| 98 |
|
| 99 |
+
# Downloadable All distances CSV
|
| 100 |
+
st.download_button(
|
| 101 |
+
label="Download All Distances as CSV",
|
| 102 |
+
data=df_distances.to_csv(index=False),
|
| 103 |
+
file_name="all_distances.csv",
|
| 104 |
+
mime="text/csv",
|
| 105 |
+
on_click="ignore",
|
| 106 |
+
type="primary",
|
| 107 |
+
)
|
| 108 |
+
|
| 109 |
+
# Downloadable Closest matches CSV
|
| 110 |
st.download_button(
|
| 111 |
label="Download Closest Matches as CSV",
|
| 112 |
data=df_closest.to_csv(index=False),
|
| 113 |
file_name="closest_matches.csv",
|
| 114 |
mime="text/csv",
|
| 115 |
+
on_click="ignore",
|
| 116 |
+
type="primary",
|
| 117 |
+
)
|
| 118 |
+
|
| 119 |
+
# Downloadable Closest matches below Min Distance CSV
|
| 120 |
+
st.download_button(
|
| 121 |
+
label=f"Download Closest Matches below {min_distance}km as CSV",
|
| 122 |
+
data=df_closest_min_distance.to_csv(index=False),
|
| 123 |
+
file_name=f"closest_matches_{min_distance}km.csv",
|
| 124 |
+
mime="text/csv",
|
| 125 |
+
on_click="ignore",
|
| 126 |
+
type="primary",
|
| 127 |
)
|
| 128 |
|
| 129 |
except Exception as e:
|
utils/utils_functions.py
CHANGED
|
@@ -1,41 +1,126 @@
|
|
|
|
|
|
|
|
| 1 |
import pandas as pd
|
| 2 |
from geopy.distance import geodesic
|
| 3 |
|
| 4 |
-
|
| 5 |
# Function to calculate distances while preserving all original columns
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
def calculate_distances(
|
| 7 |
df1: pd.DataFrame,
|
| 8 |
df2: pd.DataFrame,
|
| 9 |
-
code_col1,
|
| 10 |
-
lat_col1,
|
| 11 |
-
long_col1,
|
| 12 |
-
code_col2,
|
| 13 |
-
lat_col2,
|
| 14 |
-
long_col2,
|
| 15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
distances = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
coord1 = (row1[lat_col1], row1[long_col1])
|
| 21 |
-
coord2 = (row2[lat_col2], row2[long_col2])
|
| 22 |
-
distance_km = geodesic(coord1, coord2).kilometers # Compute distance
|
| 23 |
-
|
| 24 |
-
# Combine all original columns + distance
|
| 25 |
-
combined_row = {
|
| 26 |
-
**row1.to_dict(), # Keep all columns from Dataset1
|
| 27 |
-
**{
|
| 28 |
-
f"{col}_Dataset2": row2[col] for col in df2.columns
|
| 29 |
-
}, # Keep all columns from Dataset2
|
| 30 |
-
"Distance_km": distance_km,
|
| 31 |
-
}
|
| 32 |
-
distances.append(combined_row)
|
| 33 |
|
| 34 |
df_distances = pd.DataFrame(distances)
|
| 35 |
|
| 36 |
-
# Find
|
| 37 |
-
df_closest
|
| 38 |
df_distances.groupby(code_col1)["Distance_km"].idxmin()
|
| 39 |
]
|
| 40 |
|
| 41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import warnings
|
| 2 |
+
|
| 3 |
import pandas as pd
|
| 4 |
from geopy.distance import geodesic
|
| 5 |
|
|
|
|
| 6 |
# Function to calculate distances while preserving all original columns
|
| 7 |
+
# def calculate_distances(
|
| 8 |
+
# df1: pd.DataFrame,
|
| 9 |
+
# df2: pd.DataFrame,
|
| 10 |
+
# code_col1,
|
| 11 |
+
# lat_col1,
|
| 12 |
+
# long_col1,
|
| 13 |
+
# code_col2,
|
| 14 |
+
# lat_col2,
|
| 15 |
+
# long_col2,
|
| 16 |
+
# min_distance: int = 1,
|
| 17 |
+
# ):
|
| 18 |
+
# distances = []
|
| 19 |
+
|
| 20 |
+
# for _, row1 in df1.iterrows():
|
| 21 |
+
# for _, row2 in df2.iterrows():
|
| 22 |
+
# coord1 = (row1[lat_col1], row1[long_col1])
|
| 23 |
+
# coord2 = (row2[lat_col2], row2[long_col2])
|
| 24 |
+
# distance_km = geodesic(coord1, coord2).kilometers # Compute distance
|
| 25 |
+
|
| 26 |
+
# # Combine all original columns + distance
|
| 27 |
+
# combined_row = {
|
| 28 |
+
# **row1.to_dict(), # Keep all columns from Dataset1
|
| 29 |
+
# **{
|
| 30 |
+
# f"{col}_Dataset2": row2[col] for col in df2.columns
|
| 31 |
+
# }, # Keep all columns from Dataset2
|
| 32 |
+
# "Distance_km": distance_km,
|
| 33 |
+
# }
|
| 34 |
+
# distances.append(combined_row)
|
| 35 |
+
|
| 36 |
+
# df_distances = pd.DataFrame(distances)
|
| 37 |
+
|
| 38 |
+
# # Find the closest point for each Point1
|
| 39 |
+
# df_closest: pd.DataFrame = df_distances.loc[
|
| 40 |
+
# df_distances.groupby(code_col1)["Distance_km"].idxmin()
|
| 41 |
+
# ]
|
| 42 |
+
|
| 43 |
+
# # Find the distances below min_distance
|
| 44 |
+
# df_closest_min_distance = df_distances[df_distances["Distance_km"] < min_distance]
|
| 45 |
+
|
| 46 |
+
# return df_distances, df_closest, df_closest_min_distance
|
| 47 |
+
|
| 48 |
+
|
| 49 |
def calculate_distances(
    df1: pd.DataFrame,
    df2: pd.DataFrame,
    code_col1: str,
    lat_col1: str,
    long_col1: str,
    code_col2: str,
    lat_col2: str,
    long_col2: str,
    min_distance: float = 1.0,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Calculate geodesic distances between every pair of points in two datasets.

    Every row of ``df1`` is paired with every row of ``df2``. Each output row
    holds all columns of the ``df1`` row, all columns of the ``df2`` row
    (renamed with a ``_Dataset2`` suffix) and a ``Distance_km`` column.

    Args:
        df1: First DataFrame containing reference points
        df2: Second DataFrame containing points to compare
        code_col1: Column name in df1 containing point identifiers
        lat_col1: Column name in df1 containing latitude
        long_col1: Column name in df1 containing longitude
        code_col2: Column name in df2 containing point identifiers
        lat_col2: Column name in df2 containing latitude
        long_col2: Column name in df2 containing longitude
        min_distance: Distance threshold in kilometers; pairs strictly
            closer than this are returned in the third result

    Returns:
        tuple: (all_distances, closest_matches, matches_below_threshold)

        * all_distances: one row per (df1 row, df2 row) pair that produced
          a distance.
        * closest_matches: for each distinct ``code_col1`` value, the row of
          all_distances with the smallest ``Distance_km``.
        * matches_below_threshold: every row of all_distances (not only the
          closest matches) whose ``Distance_km`` is below ``min_distance``.

    Raises:
        ValueError: If a required column is missing from ``df1`` or ``df2``,
            or if no pair produced a distance (including empty inputs).

    Warns:
        UserWarning: For each pair skipped because ``geodesic`` raised
            ``ValueError`` on its coordinates.
    """
    # Validate input columns
    missing_cols_1 = {code_col1, lat_col1, long_col1} - set(df1.columns)
    if missing_cols_1:
        raise ValueError(f"df1 is missing required columns: {missing_cols_1}")
    missing_cols_2 = {code_col2, lat_col2, long_col2} - set(df2.columns)
    if missing_cols_2:
        raise ValueError(f"df2 is missing required columns: {missing_cols_2}")

    # Convert each row to a dict once, outside the nested loop, instead of
    # re-reading df1.iloc[i] / df2.iloc[j][col] for every pair. Going through
    # to_dict("records") also keeps each column's own value type rather than
    # passing the values through a per-row Series.
    rows1 = df1.to_dict("records")
    rows2 = [
        {f"{col}_Dataset2": value for col, value in row.items()}
        for row in df2.to_dict("records")
    ]

    # (latitude, longitude) tuples, in the same order as rows1 / rows2
    coords1 = list(df1[[lat_col1, long_col1]].itertuples(index=False, name=None))
    coords2 = list(df2[[lat_col2, long_col2]].itertuples(index=False, name=None))

    # Calculate all pairwise distances
    distances = []
    for row1, coord1 in zip(rows1, coords1):
        for row2, coord2 in zip(rows2, coords2):
            # Only the distance computation is guarded, so a ValueError from
            # anything else is not silently reported as "invalid coordinates".
            try:
                distance_km = geodesic(coord1, coord2).kilometers
            except ValueError as e:
                warnings.warn(
                    f"Skipping invalid coordinates: {coord1} or {coord2}: {e}",
                    stacklevel=2,  # attribute the warning to the caller
                )
                continue
            distances.append({**row1, **row2, "Distance_km": distance_km})

    if not distances:
        raise ValueError("No valid coordinate pairs were processed")

    df_distances = pd.DataFrame(distances)

    # Find closest matches: one row per code_col1 value, minimum distance
    closest_index = df_distances.groupby(code_col1)["Distance_km"].idxmin()
    df_closest = df_distances.loc[closest_index]

    # Filter all pairs (not just the closest ones) by the distance threshold
    df_closest_min_distance = df_distances[df_distances["Distance_km"] < min_distance]

    return df_distances, df_closest, df_closest_min_distance
|