Spaces:

DavMelchi
/

db_query

Running

App Files Files Community

DavMelchi committed on Jul 15

Commit

b5e340d

1 Parent(s): 962e6f5

improve clustering app

Browse files

Files changed (3) hide show

apps/clustering.py +138 -15
requirements.txt +0 -0
samples/Site_Clustering.xlsx +0 -0

apps/clustering.py CHANGED Viewed

@@ -4,10 +4,11 @@ import numpy as np
 import pandas as pd
 import plotly.express as px
 import streamlit as st
 from sklearn.cluster import KMeans
-def cluster_sites(
     df: pd.DataFrame,
     lat_col: str,
     lon_col: str,
@@ -23,20 +24,103 @@ def cluster_sites(
     else:
         grouped = [("All", df)]
     for region, group in grouped:
-        coords = group[[lat_col, lon_col]].to_numpy()
-        n_clusters = max(1, int(np.ceil(len(group) / max_sites)))
-        if len(group) < max_sites:
-            labels = np.zeros(len(group), dtype=int)
-        else:
-            kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
-            labels = kmeans.fit_predict(coords)
         group = group.copy()
-        group["Cluster"] = [f"C{cluster_id + l}" for l in labels]
-        clusters.append(group)
-        cluster_id += len(set(labels))
     return pd.concat(clusters)
@@ -57,7 +141,18 @@ st.write(
                       """
 )
-uploaded_file = st.file_uploader("Upload your Excel file", type=["xlsx"])
 if uploaded_file:
     df = pd.read_excel(uploaded_file)
@@ -73,30 +168,58 @@ if uploaded_file:
         max_sites = st.number_input(
             "Max sites per cluster", min_value=5, max_value=100, value=25
         )
         mix_regions = st.checkbox(
             "Allow mixing different regions in clusters", value=False
         )
         submitted = st.form_submit_button("Run Clustering")
     if submitted:
-        clustered_df = cluster_sites(
-            df, lat_col, lon_col, region_col, max_sites, mix_regions
-        )
         st.success("Clustering completed!")
         st.write(clustered_df.head())
         # Plot
         fig = px.scatter_map(
             clustered_df,
             lat=lat_col,
             lon=lon_col,
             color="Cluster",
             hover_name=code_col,
             hover_data=[region_col],
             zoom=5,
             height=600,
         )
         fig.update_layout(mapbox_style="open-street-map")
         st.plotly_chart(fig)
         # Download button

 import pandas as pd
 import plotly.express as px
 import streamlit as st
+from hilbertcurve.hilbertcurve import HilbertCurve
 from sklearn.cluster import KMeans
+def cluster_sites_hilbert_curve_same_size(
     df: pd.DataFrame,
     lat_col: str,
     lon_col: str,
     else:
         grouped = [("All", df)]
+    # Create Hilbert Curve (higher p = more precision)
+    p = 16  # Adjust based on your coordinate precision needs
+    hilbert_curve = HilbertCurve(p, 2)  # 2D curve
     for region, group in grouped:
+        if len(group) == 0:
+            continue
+        # Normalize coordinates to [0, 2^p-1] range
+        lat_min, lat_max = group[lat_col].min(), group[lat_col].max()
+        lon_min, lon_max = group[lon_col].min(), group[lon_col].max()
         group = group.copy()
+        group["x"] = ((group[lat_col] - lat_min) / (lat_max - lat_min + 1e-10)) * (
+            2**p - 1
+        )
+        group["y"] = ((group[lon_col] - lon_min) / (lon_max - lon_min + 1e-10)) * (
+            2**p - 1
+        )
+        # Calculate Hilbert distance
+        group["hilbert"] = group.apply(
+            lambda row: hilbert_curve.distance_from_point(
+                [int(row["x"]), int(row["y"])]
+            ),
+            axis=1,
+        )
+        # Sort by Hilbert value
+        group = group.sort_values("hilbert")
+        # Create fixed-size clusters
+        for i in range(0, len(group), max_sites):
+            cluster = group.iloc[i : i + max_sites].copy()
+            cluster["Cluster"] = f"C{cluster_id}"
+            clusters.append(cluster)
+            cluster_id += 1
+    result = pd.concat(clusters)
+    return result.drop(columns=["x", "y", "hilbert"], errors="ignore")
def cluster_sites_kmeans_lower_to_fixed_size(
    df: pd.DataFrame,
    lat_col: str,
    lon_col: str,
    region_col: str,
    max_sites: int = 25,
    mix_regions: bool = False,
) -> pd.DataFrame:
    """Cluster sites with repeated KMeans so no cluster exceeds ``max_sites``.

    Each group of sites is split with KMeans into
    ``ceil(n_remaining / max_sites)`` clusters. Clusters that fit within
    ``max_sites`` are kept; the sites of oversized clusters are pooled and
    clustered again until everything has been assigned.

    Args:
        df: Input sites; not modified.
        lat_col: Name of the latitude column.
        lon_col: Name of the longitude column.
        region_col: Name of the column used to split sites into groups.
        max_sites: Maximum number of sites allowed in one cluster.
        mix_regions: When False, each ``region_col`` group is clustered
            separately; when True, all rows are clustered together.

    Returns:
        A copy of the clustered rows with a ``"Cluster"`` column holding
        labels ``"C0"``, ``"C1"``, ... numbered in the order the clusters
        were accepted. If no row was clustered (e.g. empty input), an empty
        frame with the input columns plus ``"Cluster"`` is returned.

    Raises:
        ValueError: If ``max_sites`` is smaller than 1.
    """
    if max_sites < 1:
        raise ValueError("max_sites must be at least 1")

    if mix_regions:
        grouped = [("All", df)]
    else:
        grouped = df.groupby(region_col)

    # Accepted clusters, in acceptance order; labels are assigned at the end.
    accepted = []

    for _, group in grouped:
        remaining_sites = group
        while len(remaining_sites) > 0:
            n_remaining = len(remaining_sites)
            if n_remaining <= max_sites:
                # Everything left fits in a single cluster.
                accepted.append(remaining_sites)
                break

            n_clusters = int(np.ceil(n_remaining / max_sites))
            kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
            labels = np.asarray(
                kmeans.fit_predict(remaining_sites[[lat_col, lon_col]].to_numpy())
            )

            # Positional mask of rows assigned in this round. Positional
            # filtering (instead of dropping by index label) stays correct
            # even when the input index contains duplicate labels.
            kept = np.zeros(n_remaining, dtype=bool)
            for cluster_num in range(n_clusters):
                members = labels == cluster_num
                size = int(members.sum())
                # Skip labels KMeans left unused: accepting an empty cluster
                # would burn a cluster id without making any progress.
                if 0 < size <= max_sites:
                    accepted.append(remaining_sites[members])
                    kept |= members

            if not kept.any():
                # No cluster fit, so another identical KMeans round would
                # loop forever. This can presumably only happen when KMeans
                # returns fewer distinct clusters than requested (e.g. many
                # duplicate coordinates). Fall back to fixed-size chunks.
                for start in range(0, n_remaining, max_sites):
                    accepted.append(remaining_sites.iloc[start : start + max_sites])
                break

            remaining_sites = remaining_sites[~kept]

    if not accepted:
        # pd.concat raises on an empty list; return an empty, well-formed frame.
        empty = df.iloc[0:0].copy()
        empty["Cluster"] = pd.Series(dtype="object")
        return empty

    clusters = []
    for cluster_id, sites in enumerate(accepted):
        labelled = sites.copy()
        labelled["Cluster"] = f"C{cluster_id}"
        clusters.append(labelled)
    return pd.concat(clusters)
                       """
 )
+# Download Sample file
+clustering_sample_file_path = "samples/Site_Clustering.xlsx"
+# Create a download button
+st.download_button(
+    label="Download Clustering Sample File",
+    data=open(clustering_sample_file_path, "rb").read(),
+    file_name="Site_Clustering.xlsx",
+    mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+)
+uploaded_file = st.file_uploader("Upload your Excel file ", type=["xlsx"])
 if uploaded_file:
     df = pd.read_excel(uploaded_file)
         max_sites = st.number_input(
             "Max sites per cluster", min_value=5, max_value=100, value=25
         )
+        cluster_method = st.selectbox(
+            "Select clustering method",
+            ["Hilbert Curve Same Size", "KMeans Lower To Fixed Size"],
+        )
         mix_regions = st.checkbox(
             "Allow mixing different regions in clusters", value=False
         )
         submitted = st.form_submit_button("Run Clustering")
     if submitted:
+        if cluster_method == "Hilbert Curve Same Size":
+            clustered_df = cluster_sites_hilbert_curve_same_size(
+                df, lat_col, lon_col, region_col, max_sites, mix_regions
+            )
+        elif cluster_method == "KMeans Lower To Fixed Size":
+            clustered_df = cluster_sites_kmeans_lower_to_fixed_size(
+                df, lat_col, lon_col, region_col, max_sites, mix_regions
+            )
         st.success("Clustering completed!")
         st.write(clustered_df.head())
         # Plot
+        clustered_df["size"] = 10
         fig = px.scatter_map(
             clustered_df,
             lat=lat_col,
             lon=lon_col,
             color="Cluster",
+            size="size",
             hover_name=code_col,
             hover_data=[region_col],
             zoom=5,
             height=600,
         )
         fig.update_layout(mapbox_style="open-street-map")
+        fig.update_traces(marker=dict(size=15))
+        st.plotly_chart(fig)
+        # Show cluster size per cluster plot
+        cluster_size = clustered_df["Cluster"].value_counts().sort_index()
+        fig = px.bar(cluster_size, x=cluster_size.index, y=cluster_size.values)
+        fig.update_layout(title="Cluster Size")
+        st.plotly_chart(fig)
+        # Show cluster size per region plot
+        cluster_size_per_region = (
+            clustered_df.groupby([region_col, "Cluster"])
+            .size()
+            .reset_index(name="count")
+        )
+        fig = px.bar(cluster_size_per_region, x="Cluster", y="count", color=region_col)
+        fig.update_layout(title="Cluster Size per Region")
         st.plotly_chart(fig)
         # Download button

requirements.txt CHANGED Viewed

Binary files a/requirements.txt and b/requirements.txt differ

samples/Site_Clustering.xlsx ADDED Viewed

Binary file (39.9 kB). View file