Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Vectorized sspd #26

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 12 additions & 12 deletions data/benchmark.csv
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
,Euclidean,Spherical
discret_frechet,0.0659620761871,-1.0
dtw,0.0781569480896,0.114996194839
edr,0.0695221424103,0.106939792633
erp,0.171737909317,0.319380998611
frechet,29.1885719299,-1.0
hausdorff,0.310199975967,0.780081987381
lcss,0.0711951255798,0.111418008804
sowd_grid_5,0.164781093597,0.159924983978
sowd_grid_6,0.973792076111,0.954225063324
sowd_grid_7,7.62574410439,7.78553795815
sspd,0.314118862152,0.807314872742
,c-Euclidean,c-Spherical,p-Euclidean,p-Spherical
sspd,0.040505208000000015,0.34403866699999996,1.281539208,7.2932007919999995
frechet,6.306265333000001,-1.0,22.235781750000005,-1.0
discret_frechet,0.03310412500000126,-1.0,1.3742333329999994,-1.0
hausdorff,0.0402967500000031,0.35264737499999654,1.270863583999997,5.3321753749999985
dtw,0.0402214160000014,0.05442316599999941,1.3347869999999986,1.1385135829999982
lcss,0.03770791600000223,0.05140970800000133,0.9286488329999969,0.7077315000000013
edr,0.036235290999997005,0.050285250000001724,1.031809249999995,0.8270454170000008
erp,0.1168113750000046,0.17084124999999517,0.7297154169999942,2.5876744999999985
sowd_grid_5,0.027730791999999838,0.026353374999999346,0.06974483300000145,0.06926791700000479
sowd_grid_6,0.15643404199999367,0.15622233400000596,0.7711204580000057,0.7712213329999997
sowd_grid_7,1.1579893339999998,1.1593307499999952,7.283603374999998,7.2944072920000025
4,951 changes: 4,951 additions & 0 deletions data/benchmark_spherical_sspd_results.csv

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions data/benchmark_sspd.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
,c-Spherical,p-Spherical,p-Spherical-Vectorized
sspd,0.33881316699999964,7.558915042000001,0.6724925420000005
spd_pt,na,4.7675891660000005,2.637069334000003
spd_closest_pt,na,5.633197959,3.3368709999999986
8 changes: 8 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
numpy~=1.23.5
Cython~=0.29.32
geohash2~=1.1
Shapely~=1.8.5.post1
pandas~=1.5.1
scipy~=1.9.3
scikit-learn~=1.1.3
geopy~=2.3.0
16 changes: 8 additions & 8 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,19 +18,19 @@

setup(
name="traj_dist",
version="1.1",
version="1.2",
license="MIT",
author="Brendan Guillouet",
author_email="[email protected]",
cmdclass={'build_ext': build_ext},
ext_modules=ext_modules,
include_dirs=[numpy.get_include()],
install_requires=["numpy>=1.14.0", "Cython>=0.27.3", "Shapely>=1.6.4", "geohash2==1.1", 'pandas>=0.20.3',
'scipy>=0.19.1'],
install_requires=["numpy>=1.23.5", "Cython>=0.27.3", "Shapely>=1.8.5.post1", "geohash2==1.1", "pandas>=1.5.1",
"scipy>=1.9.3", "scikit-learn>=1.1.3"],
description="Distance to compare 2D-trajectories in Python/Cython",
download_url = 'https://github.com/user/reponame/archive/v_01.tar.gz', # I explain this later on
keywords = ['trajectory',"distance","haversine"],
packages=["traj_dist", "traj_dist.cydist", "traj_dist.pydist"],
url = 'https://github.com/bguillouet/traj-dist',
download_urt = 'https://github.com/bguillouet/traj-dist/archive/1.1.tar.gz'
download_url='https://github.com/user/reponame/archive/v_01.tar.gz', # I explain this later on
keywords=['trajectory', "distance", "haversine"],
packages=["traj_dist", "traj_dist.pydist"],
url='https://github.com/bguillouet/traj-dist',
download_urt='https://github.com/bguillouet/traj-dist/archive/1.1.tar.gz'
)
171 changes: 171 additions & 0 deletions tests/sspd_tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
import pickle
import unittest

import geopy as geopy
import geopy.distance as geopy_distance
import numpy as np
import pandas as pd

from traj_dist.cydist.sspd import c_g_sspd

from traj_dist.pydist.basic_spherical import great_circle_distance
from traj_dist.pydist.sspd import s_sspd, s_pt_to_traj_dist, s_closest_pt
from traj_dist.pydist.sspd_spherical_vectorized import s_sspd_vectorized, s_pt_to_traj_dist_vectorized, \
s_closest_pt_vectorized, s_spd_vectorized_distances, s_pt_to_traj_dist_threshold_vectorized
from traj_dist.trajectory_utils import remove_duplicate_adjacent_pts

# results recorded before any code changes were applied (via create_test_data)
results_file_name = f'../data/benchmark_spherical_sspd_results.csv'
trajectories_file_name = f'../data/benchmark_trajectories.pkl'
columns = ['i', 'j', 'c_dist', 'p_dist']
column_dtypes = {'i': int, 'j': int, 'c_dist': float, 'p_dist': float}
nb_traj = 100


class SphericalSSPDTests(unittest.TestCase):
@classmethod
def setUpClass(cls) -> None:
# load test data
with open(trajectories_file_name, 'rb') as file:
traj_list = pickle.load(file, encoding="bytes")
cls.traj_list = traj_list[:nb_traj]

cls.df_expected_results = pd.read_csv(results_file_name, dtype=column_dtypes, index_col=0)
cls.max_diff = max(abs(cls.df_expected_results['c_dist'] - cls.df_expected_results['p_dist']))

def test_init(self):
""" Test that the sspd code changes haven't affected the calculations' results """
print(self.max_diff)
for _, row in self.df_expected_results.iterrows():
i = int(row["i"])
j = int(row["j"])
p_dist = s_sspd(self.traj_list[i], self.traj_list[j])
expected_res = row["p_dist"]
self.assertAlmostEqual(p_dist, expected_res)

def test_sspd_vectorized(self):
""" Test the correctness of sspd vectorized implementation"""
for _, row in self.df_expected_results.iterrows():
i = int(row["i"])
j = int(row["j"])
traj_i = self.traj_list[i]
traj_j = self.traj_list[j]
p_dist_vectorized = s_sspd_vectorized(traj_i, traj_j)
expected_res = row["p_dist"] # saved results
self.assertAlmostEqual(p_dist_vectorized, expected_res)

def test_spd_vectorized_distances(self):
""" Test the correctness of spd vectorized implementation"""
for _, row in self.df_expected_results.iterrows():
i = int(row["i"])
j = int(row["j"])
traj_i = self.traj_list[i]
traj_j = self.traj_list[j]
p_dist_vectorized_1, *rest = s_spd_vectorized_distances(traj_i, traj_j)
p_dist_vectorized_2, *rest = s_spd_vectorized_distances(traj_j, traj_i)
expected_res = row["p_dist"] # saved results
self.assertAlmostEqual(p_dist_vectorized_1 + p_dist_vectorized_2, expected_res)

def test_sspd_vectorized_remove_duplicate_pts(self):
for _, row in self.df_expected_results.iterrows():
i = int(row["i"])
j = int(row["j"])
traj_i = remove_duplicate_adjacent_pts(self.traj_list[i])
traj_j = remove_duplicate_adjacent_pts(self.traj_list[j])
p_dist_vectorized = s_sspd_vectorized(traj_i, traj_j)
expected_dist = s_sspd(traj_i, traj_j)
self.assertAlmostEqual(p_dist_vectorized, expected_dist)

def test_s_pt_to_traj_dist_vectorized(self):
""" Test s_pt_to_traj_dist vectorized and non-vectorized calculations are (almost) equal """
for _, row in self.df_expected_results.iterrows():
i = int(row["i"])
j = int(row["j"])
traj_i = self.traj_list[i]
traj_j = self.traj_list[j]

for pt_i in range(len(traj_i)):
expected_res = s_pt_to_traj_dist(traj_i[pt_i], traj_j)
p_dist_vectorized = s_pt_to_traj_dist_vectorized(traj_i[pt_i], traj_j)
self.assertAlmostEqual(p_dist_vectorized, expected_res)

def test_s_pt_to_traj_dist_correctness(self):
"""
Creates trajectories 0f 2 points on the same latitude, 1 km apart from one another
and a third point, between the 2 points and 0.5 km south
Test that the distance from the 3rd point to the trajectory is approx 0.5 km
Compare the s_pt_to_traj_dist_vectorized distance to great_circle_distance (should be almost equal)
"""
for lat in range(-50, 50, 2):
for long in range(-120, 120, 10):
start = geopy.Point(latitude=lat, longitude=long)
# Define a general distance objects, initialized with a distance of 1 km and 0.5 km.
d1 = geopy_distance.distance(meters=1000)
d2 = geopy_distance.distance(meters=500)
traj_pt = d1.destination(point=start, bearing=90) # east
pt_on_traj = d2.destination(point=start, bearing=90) # east
pt = d2.destination(point=pt_on_traj, bearing=180) # south
traj_i = np.array([[long, lat], [traj_pt.longitude, traj_pt.latitude]])
dist = s_pt_to_traj_dist_vectorized(np.array([pt.longitude, pt.latitude]), traj_i)
dist_expected = great_circle_distance(pt.longitude, pt.latitude,
pt_on_traj.longitude, pt_on_traj.latitude)
self.assertAlmostEqual(dist, dist_expected, places=2) # centimeters' accuracy

def test_s_closest_pt_to_traj_vectorized(self):
for _, row in self.df_expected_results.iterrows():
i = int(row["i"])
j = int(row["j"])
traj_i = self.traj_list[i]
traj_j = self.traj_list[j]

for pt_i in range(len(traj_i)):
lon1, lat1, idx1 = s_closest_pt(traj_i[pt_i], traj_j)
lon2, lat2, idx2 = s_closest_pt_vectorized(traj_i[pt_i], traj_j)

self.assertAlmostEqual(lon1, lon2)
self.assertAlmostEqual(lat1, lat2)
self.assertEqual(idx1, idx2)

def test_s_pt_to_traj_dist_threshold_vectorized(self):
threshold_dist = 500

for _, row in self.df_expected_results.iterrows():
i = int(row["i"])
j = int(row["j"])
traj_i = self.traj_list[i]
traj_j = self.traj_list[j]

for pt_i in range(len(traj_i)):
expected_res = s_pt_to_traj_dist(traj_i[pt_i], traj_j)
dist, idx, pt_idx = s_pt_to_traj_dist_threshold_vectorized(threshold_dist, traj_i[pt_i], traj_j)
if idx is None:
self.assertGreaterEqual(expected_res, threshold_dist)
self.assertGreaterEqual(dist, threshold_dist)
else:
self.assertLessEqual(idx, len(traj_j) - 1)
self.assertGreaterEqual(idx, 0)
self.assertLessEqual(dist, threshold_dist)


def create_test_data():
"""Computes distance between each pair of the two list of trajectories"""
file = open(trajectories_file_name, 'rb')
traj_list = pickle.load(file, encoding="bytes")
traj_list = traj_list[:nb_traj]
results = []
for i in range(nb_traj):
traj_list_i = traj_list[i]
for j in range(i + 1, nb_traj):
traj_list_j = traj_list[j]
c_dist = c_g_sspd(traj_list_i, traj_list_j)
p_dist = s_sspd(traj_list_i, traj_list_j)
results.append([i, j, c_dist, p_dist])
results_df = pd.DataFrame(results, columns=columns)
max_diff = max(abs(results_df['c_dist']-results_df['p_dist']))
print(f'precision difference: {max_diff}')
# results_df.to_csv(results_file_name)


if __name__ == '__main__':
# create_test_data()
unittest.main()
26 changes: 21 additions & 5 deletions traj_dist/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,28 @@
import pandas as pd
import numpy as np

traj_list = pickle.load(open("/Users/bguillouet/These/trajectory_distance/data/benchmark_trajectories.pkl", "rb"))[:100]
file_name = f'../data/benchmark_trajectories.pkl'
file = open(file_name, 'rb')
traj_list = pickle.load(file, encoding="bytes")
traj_list = traj_list[:100]

time_dict = collections.defaultdict(dict)

for distance in ["sspd", "frechet", "discret_frechet", "hausdorff", "dtw", "lcss", "edr", "erp"]:
t_euclidean = timeit.timeit(lambda: tdist.pdist(traj_list, metric=distance), number=1)
t_p_euclidean = timeit.timeit(lambda: tdist.pdist(traj_list, metric=distance, impl="p_impl"), number=1)

if not (distance in ["frechet", "discret_frechet"]):
t_spherical = timeit.timeit(
lambda: tdist.pdist(traj_list, metric=distance, type_d="spherical"), number=1)
t_p_spherical = timeit.timeit(
lambda: tdist.pdist(traj_list, metric=distance, type_d="spherical", impl="p_impl"), number=1)

else:
t_spherical = -1
time_dict[distance] = {"Euclidean": t_euclidean, "Spherical": t_spherical}
t_p_spherical = -1
time_dict[distance] = {"c-Euclidean": t_euclidean, "c-Spherical": t_spherical,
"p-Euclidean": t_p_euclidean, "p-Spherical": t_p_spherical}

t_cells_conversion_dic = collections.defaultdict(int)
for precision in [5, 6, 7]:
Expand All @@ -32,12 +41,19 @@
lambda: tdist.pdist(cells_list, metric="sowd_grid", type_d="euclidean", converted=True), number=1, )
t_spherical = timeit.timeit(
lambda: tdist.pdist(cells_list, metric="sowd_grid", type_d="spherical", converted=True), number=1, )
t_p_euclidean = timeit.timeit(
lambda: tdist.pdist(cells_list, metric="sowd_grid", type_d="euclidean", impl="p_impl", converted=True), number=1, )
t_spherical = timeit.timeit(
lambda: tdist.pdist(cells_list, metric="sowd_grid", type_d="spherical", converted=True), number=1, )
t_p_spherical = timeit.timeit(
lambda: tdist.pdist(cells_list, metric="sowd_grid", type_d="spherical", impl="p_impl", converted=True), number=1, )

time_dict["sowd_grid_%d"%precision] = {"Euclidean": t_euclidean, "Spherical": t_spherical}
time_dict["sowd_grid_%d"%precision] = {"c-Euclidean": t_euclidean, "c-Spherical": t_spherical,
"p-Euclidean": t_p_euclidean, "p-Spherical": t_p_spherical}


df_cells_conversion = pd.Series(t_cells_conversion_dic)
df_cells_conversion.to_csv("/Users/bguillouet/These/trajectory_distance/data/benchmark_cells_conversion.csv")
df_cells_conversion.to_csv("../data/benchmark_cells_conversion.csv")

df = pd.DataFrame(time_dict).transpose()
df.to_csv("/Users/bguillouet/These/trajectory_distance/data/benchmark.csv")
df.to_csv("../data/benchmark.csv")
56 changes: 56 additions & 0 deletions traj_dist/benchmark_sspd.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import pickle
from traj_dist.pydist.sspd import s_pt_to_traj_dist, s_closest_pt

import traj_dist.distance as tdist
import timeit
import collections
import pandas as pd

from traj_dist.pydist.sspd_spherical_vectorized import s_pt_to_traj_dist_vectorized, s_closest_pt_vectorized


def dist_spd_pt(s_spd_pt_func, trajectories):
for i in range(nb_traj):
traj_i = trajectories[i]
for j in range(i + 1, nb_traj):
traj_j = trajectories[j]

for pt_i in range(len(traj_i)):
s_spd_pt_func(traj_i[pt_i], traj_j)


if __name__ == '__main__':
file_name = f'../data/benchmark_trajectories.pkl'
file = open(file_name, 'rb')
traj_list = pickle.load(file, encoding="bytes")
nb_traj = 100
traj_list = traj_list[:nb_traj]

time_dict = collections.defaultdict(dict)

t_spherical = timeit.timeit(
lambda: tdist.pdist(traj_list, metric="sspd", type_d="spherical"), number=1)
t_p_spherical = timeit.timeit(
lambda: tdist.pdist(traj_list, metric="sspd", type_d="spherical", impl="p_impl"), number=1)
t_p_spherical_vectorized = timeit.timeit(
lambda: tdist.pdist(traj_list, metric="sspd_vectorized", type_d="spherical", impl="p_impl"), number=1)

t_pt_spherical = timeit.timeit(lambda: dist_spd_pt(s_pt_to_traj_dist, traj_list), number=1)
t_pt_spherical_vectorized = timeit.timeit(lambda: dist_spd_pt(s_pt_to_traj_dist_vectorized, traj_list), number=1)

t_pt_closest_spherical = timeit.timeit(lambda: dist_spd_pt(s_closest_pt, traj_list), number=1)
t_pt_closest_spherical_vectorized = timeit.timeit(lambda: dist_spd_pt(s_closest_pt_vectorized, traj_list), number=1)

time_dict["sspd"] = {"c-Spherical": t_spherical,
"p-Spherical": t_p_spherical,
"p-Spherical-Vectorized": t_p_spherical_vectorized}
time_dict["spd_pt"] = {"c-Spherical": 'na',
"p-Spherical": t_pt_spherical,
"p-Spherical-Vectorized": t_pt_spherical_vectorized}
time_dict["spd_closest_pt"] = {"c-Spherical": 'na',
"p-Spherical": t_pt_closest_spherical,
"p-Spherical-Vectorized": t_pt_closest_spherical_vectorized}

df = pd.DataFrame(time_dict).transpose()
df.to_csv("../data/benchmark_sspd.csv")

21 changes: 21 additions & 0 deletions traj_dist/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@

from enum import Enum


class EarthRadius(Enum):
"""
earth radius in meters
"""
MEAN = 0
EQUATORIAL = 1


earth_radius_type = EarthRadius.MEAN


def earth_radius():
if earth_radius_type == EarthRadius.MEAN:
return 6371008.8
elif earth_radius_type == EarthRadius.EQUATORIAL:
return 6378137.0
return EarthRadius.MEAN
Loading