Comparing runtimes of different stop detection algorithms on toy datasets

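Here we compare the runtimes of four stop detection algorithms: Lachesis, grid-based, temporal DBSCAN (TA-DBSCAN), and HDBSCAN. We first time each algorithm on a single user's trajectory for one day, and then look at how runtime scales with the number of pings per user.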

[1]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
# Imports
import nomad.io.base as loader
import geopandas as gpd
from shapely.geometry import box
import pandas as pd
import numpy as np
from nomad.stop_detection.viz import plot_stops_barcode, plot_pings, plot_stops, plot_time_barcode
import nomad.stop_detection.dbscan as DBSCAN
import nomad.stop_detection.lachesis as LACHESIS
import nomad.stop_detection.grid_based as GRID_BASED
import nomad.stop_detection.hdbscan as HDBSCAN
import nomad.filters as filters
import nomad.stop_detection.postprocessing as post
import time
from tqdm import tqdm

# Load data
import nomad.data as data_folder
from pathlib import Path

# The building footprints live in the same directory as the nomad.data module.
data_dir = Path(data_folder.__file__).parent
buildings_path = data_dir / 'garden-city-buildings-mercator.parquet'
city = gpd.read_parquet(buildings_path)

# Bounding box of all buildings, grown by 15 units with mitred (square) corners;
# it is drawn as the grey backdrop of the map further down.
city_bounds = box(*city.total_bounds)
outer_box = city_bounds.buffer(15, join_style='mitre')

# Directory holding the parquet trajectory data.
filepath_root = 'gc_data_long/'

# Column-name mapping handed to the nomad functions through `traj_cols`.
tc = {
    "user_id": "gc_identifier",
    "timestamp": "unix_ts",
    "x": "dev_x", "y": "dev_y",
    "ha": "ha",
    "date": "date",
}

# Single-user sample: one user, restricted to 2024-01-01 via the `filters` argument.
users = ['admiring_brattain']
date_filter = ('date', '==', '2024-01-01')
traj = loader.sample_from_file(
    filepath_root,
    format='parquet',
    users=users,
    filters=date_filter,
    traj_cols=tc,
)

# Lachesis (sequential stop detection)
# Timed with time.perf_counter(): it is the clock intended for measuring short
# durations, whereas time.time() can tick too coarsely to resolve a run that only
# takes a few milliseconds (the previously saved timings show such artifacts).
start_time = time.perf_counter()
stops = LACHESIS.lachesis(traj, delta_roam=20, dt_max = 60, dur_min=5, complete_output=True, keep_col_names=True, traj_cols=tc)
execution_time_lachesis = time.perf_counter() - start_time
print(f"Lachesis execution time: {execution_time_lachesis} seconds")

# Density based stop detection (Temporal DBSCAN)
# Three perf_counter() timestamps bracket the two phases, so that
# clustering time + label extraction time == total execution time exactly.
# perf_counter() is used because time.time() can be too coarse for runs this short
# (the previously saved output reported a clustering time of exactly 0.0 seconds).
start_time = time.perf_counter()
user_data_tadb = traj.assign(cluster=DBSCAN.ta_dbscan_labels(traj, time_thresh=240, dist_thresh=15, min_pts=3, traj_cols=tc))
start_time_post = time.perf_counter()  # end of clustering == start of label extraction
cluster_labels_tadb = user_data_tadb['cluster']
end_time = time.perf_counter()

clustering_time_tadbscan = start_time_post - start_time
post_time_tadbscan = end_time - start_time_post
execution_time_tadbscan = end_time - start_time
print(f"TA-DBSCAN execution time: {execution_time_tadbscan} seconds")
print(f"TA-DBSCAN clustering time: {clustering_time_tadbscan} seconds")
print(f"TA-DBSCAN label extraction time: {post_time_tadbscan} seconds")

# Grid-based
# Timed with time.perf_counter() because time.time() can be too coarse to resolve
# a run of only a few milliseconds.
# NOTE: the timed span includes the H3 tessellation step, whereas the per-user
# benchmark loop later in this notebook computes 'h3_cell' before starting the clock.
start_time = time.perf_counter()
traj['h3_cell'] = filters.to_tessellation(traj, index="h3", res=10, traj_cols=tc, data_crs='EPSG:3857')
stops_gb = GRID_BASED.grid_based(traj, time_thresh=240, complete_output=True, traj_cols=tc, location_id='h3_cell')
execution_time_grid = time.perf_counter() - start_time
print(f"Grid-Based execution time: {execution_time_grid} seconds")

# HDBSCAN
# Three perf_counter() timestamps bracket the two phases, so that
# clustering time + label extraction time == total execution time exactly.
# perf_counter() is used because time.time() can be too coarse for very short
# phases (the previously saved output reported a label extraction time of exactly 0.0 seconds).
start_time = time.perf_counter()
user_data_hdb = traj.assign(cluster=HDBSCAN.hdbscan_labels(traj, time_thresh=240, min_pts=3, min_cluster_size=2, traj_cols=tc))
start_time_post = time.perf_counter()  # end of clustering == start of label extraction
cluster_labels_hdb = user_data_hdb['cluster']
end_time = time.perf_counter()

clustering_time_hdbscan = start_time_post - start_time
post_time_hdbscan = end_time - start_time_post
execution_time_hdbscan = end_time - start_time
print(f"HDBSCAN execution time: {execution_time_hdbscan} seconds")
print(f"HDBSCAN clustering time: {clustering_time_hdbscan} seconds")
print(f"HDBSCAN label extraction time: {post_time_hdbscan} seconds")
Lachesis execution time: 0.01671910285949707 seconds
TA-DBSCAN execution time: 0.015048027038574219 seconds
TA-DBSCAN clustering time: 0.0 seconds
TA-DBSCAN label extraction time: 0.015048027038574219 seconds
Grid-Based execution time: 0.01604151725769043 seconds
HDBSCAN execution time: 0.24063372611999512 seconds
HDBSCAN clustering time: 0.24063372611999512 seconds
HDBSCAN label extraction time: 0.0 seconds

Summary of Single-User Performance

Lachesis

[2]:
# Two stacked panels: a tall map on top and a thin time barcode below (10:1 height ratio).
fig, axes = plt.subplots(
    nrows=2,
    ncols=1,
    figsize=(6, 6.5),
    gridspec_kw=dict(height_ratios=[10, 1]),
)
ax_map, ax_barcode = axes

# Map backdrop: the padded bounding box in light grey, with the buildings drawn over it.
backdrop = gpd.GeoDataFrame(geometry=[outer_box], crs='EPSG:3857')
backdrop.plot(ax=ax_map, color='#d3d3d3')
city.plot(ax=ax_map, edgecolor='white', linewidth=1, color='#8c8c8c')

# Lachesis stops are drawn first, then the raw pings on the same axes.
plot_stops(stops, ax=ax_map, cmap='Reds')
plot_pings(traj, ax=ax_map, s=6, point_color='black', cmap='twilight', traj_cols=tc)
ax_map.set_axis_off()

# Barcode panel: ping timestamps set the x-limits, the stops are overlaid without changing them.
ping_times = traj[tc['timestamp']]
plot_time_barcode(ping_times, ax=ax_barcode, set_xlim=True)
plot_stops_barcode(stops, ax=ax_barcode, cmap='Reds', set_xlim=False, timestamp='unix_ts')

plt.tight_layout(pad=0.1)
plt.show()
../_images/source_benchmarking_of_stop_detection_algorithms_5_0.png
[3]:
# Total single-user runtime of each algorithm, one line per algorithm.
print("Summary of Single-User Performance")
single_user_totals = [
    ("Lachesis", execution_time_lachesis),
    ("TA-DBSCAN", execution_time_tadbscan),
    ("Grid-Based", execution_time_grid),
    ("HDBSCAN", execution_time_hdbscan),
]
for label, seconds in single_user_totals:
    print(f"{label} execution time: {seconds} seconds")
Summary of Single-User Performance
Lachesis execution time: 0.01671910285949707 seconds
TA-DBSCAN execution time: 0.015048027038574219 seconds
Grid-Based execution time: 0.01604151725769043 seconds
HDBSCAN execution time: 0.24063372611999512 seconds
[4]:
# Per-phase runtimes, grouped by algorithm; a dashed rule separates the groups.
separator = "--------------------------------"
runtime_breakdown = [
    [("Lachesis clustering", execution_time_lachesis)],
    [("TA-DBSCAN clustering", clustering_time_tadbscan),
     ("TA-DBSCAN label extraction", post_time_tadbscan)],
    [("Grid-Based clustering", execution_time_grid)],
    [("HDBSCAN clustering", clustering_time_hdbscan),
     ("HDBSCAN label extraction", post_time_hdbscan)],
]

print("Runtime Disaggregation")
for position, phases in enumerate(runtime_breakdown):
    if position > 0:
        print(separator)
    for label, seconds in phases:
        print(f"{label} time: {seconds} seconds")
Runtime Disaggregation
Lachesis clustering time: 0.01671910285949707 seconds
--------------------------------
TA-DBSCAN clustering time: 0.0 seconds
TA-DBSCAN label extraction time: 0.015048027038574219 seconds
--------------------------------
Grid-Based clustering time: 0.01604151725769043 seconds
--------------------------------
HDBSCAN clustering time: 0.24063372611999512 seconds
HDBSCAN label extraction time: 0.0 seconds

Pings vs Runtime

[5]:
# Multi-user sample for the scaling benchmark (no user or date restriction; fixed seed).
traj = loader.sample_from_file(
    filepath_root,
    format='parquet',
    traj_cols=tc,
    seed=10,
)

# H3 cells for grid_based stop detection method
h3_cells = filters.to_tessellation(
    traj,
    index="h3",
    res=10,
    traj_cols=tc,
    data_crs='EPSG:3857',
)
traj['h3_cell'] = h3_cells

# Number of pings per user; drives the x-axis of the runtime plot.
user_ids = traj['gc_identifier']
pings_per_user = user_ids.value_counts()
[6]:
# Benchmark each algorithm on every user's trajectory and record its wall-clock runtime.
# Runtime is machine-dependent: originally noted as approximately 5 minutes for 40 users;
# the saved output of this cell shows roughly 24 s for 37 users.
#
# time.perf_counter() is used for every measurement because time.time() can be too
# coarse to resolve calls that take only a few milliseconds.
records = []  # one plain dict per (user, algorithm) measurement
for user, n_pings in tqdm(pings_per_user.items(), total=len(pings_per_user)):
    user_data = traj.query("gc_identifier == @user")

    # For location based (the 'h3_cell' column is expected to be present on traj already)
    start_time = time.perf_counter()
    stops_gb = GRID_BASED.grid_based(user_data, time_thresh=240, complete_output=True, traj_cols=tc, location_id='h3_cell')
    execution_time = time.perf_counter() - start_time
    records.append({'user': user, 'algo': 'grid_based', 'execution_time': execution_time, 'n_pings': n_pings})

    # For Lachesis
    start_time = time.perf_counter()
    stops_lac = LACHESIS.lachesis(user_data, delta_roam=30, dt_max=240, complete_output=True, traj_cols=tc)
    execution_time = time.perf_counter() - start_time
    records.append({'user': user, 'algo': 'lachesis', 'execution_time': execution_time, 'n_pings': n_pings})

    # For TADbscan
    start_time = time.perf_counter()
    user_data_tadb = user_data.assign(cluster=DBSCAN.ta_dbscan_labels(user_data, time_thresh=240, dist_thresh=15, min_pts=3, traj_cols=tc))
    # - post-processing: drop pings labelled -1 (included in the timed span)
    stops_tadb = user_data_tadb[user_data_tadb.cluster != -1]
    execution_time = time.perf_counter() - start_time
    records.append({'user': user, 'algo': 'tadbscan', 'execution_time': execution_time, 'n_pings': n_pings})

    # For HDBSCAN
    start_time = time.perf_counter()
    user_data_hdb = user_data.assign(cluster=HDBSCAN.hdbscan_labels(user_data, time_thresh=240, min_pts=3, min_cluster_size=2, traj_cols=tc))
    # - post-processing: drop pings labelled -1 (included in the timed span)
    stops_hdb = user_data_hdb[user_data_hdb.cluster != -1]
    execution_time = time.perf_counter() - start_time
    records.append({'user': user, 'algo': 'hdbscan', 'execution_time': execution_time, 'n_pings': n_pings})

# Build the frame once from the collected records (columns: user, algo, execution_time, n_pings).
results = pd.DataFrame(records)
100%|██████████████████████████████████████████████████████████████████████████████████| 37/37 [00:24<00:00,  1.50it/s]
100%|███████████████████████████████████████████████████| 4/4 [00:00<00:00,  7.10it/s]
[7]:
# Runtime as a function of trajectory size, one line per algorithm.
algos = ['grid_based', 'lachesis', 'tadbscan', 'hdbscan']
# Fixed algorithm -> colour mapping so colours do not depend on row order in `results`.
palette = dict(zip(algos, sns.color_palette(n_colors=len(algos))))

fig, ax = plt.subplots(figsize=(5, 5))
sns.lineplot(
    data=results,
    marker='o',
    x='n_pings',
    y='execution_time',
    hue='algo',
    hue_order=algos,   # legend order follows `algos`
    palette=palette,   # previously built but never passed to the plot
    ax=ax,
)
ax.set_xlabel('Number of pings')
ax.set_ylabel('Execution time (s)')
ax.set_title('n_pings vs execution_time')
plt.show()
../_images/source_benchmarking_of_stop_detection_algorithms_11_0.png