From e124f7cbbf5bc997ff9188912da5514eb33249d6 Mon Sep 17 00:00:00 2001 From: belinhacbr Date: Mon, 24 Sep 2018 19:29:09 +0200 Subject: [PATCH] use dataframes to read and manipulate the csvs --- .gitignore | 3 +- deliverytimes.py | 97 ++++++++----------------- requirements.txt | 2 + run.sh | 6 ++ utilities.py | 182 ++++++++++------------------------------------- 5 files changed, 80 insertions(+), 210 deletions(-) create mode 100755 run.sh diff --git a/.gitignore b/.gitignore index 04de45a..d774e50 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,5 @@ __pycache__/ database_data.py -*.csv \ No newline at end of file +*.csv +.vscode diff --git a/deliverytimes.py b/deliverytimes.py index 0748dff..82ab488 100644 --- a/deliverytimes.py +++ b/deliverytimes.py @@ -1,4 +1,6 @@ from geopy.distance import geodesic +import pandas as pd + from datetime import datetime, timedelta import time @@ -10,12 +12,14 @@ driverpositions = driverpositions() stoplocations = stoplocations() -AVERAGE_WALKING_SPEED = 5 # km/h -R = 50 # radius of 30 meters +AVERAGE_WALKING_SPEED = 5 # km/h +R = 50 # radius of 50 meters + def driver_is_inside_location_area(driver_position, location_position): return geodesic(driver_position, location_position).meters <= R + def estimated_duration(timestamps_list): if not timestamps_list: return 0 @@ -23,8 +27,10 @@ def estimated_duration(timestamps_list): durations = [int(timedelta.total_seconds(timestamps[1] - timestamps[0])) for timestamps in timestamps_list] return max(durations) + def driver_is_moving_at_walking_speed(speed): - return speed < AVERAGE_WALKING_SPEED + 2 # buffer + return speed < AVERAGE_WALKING_SPEED + 2 # buffer + def enter_and_leave_timestamps(driver_positions, stoplocation_position, deliverystatustimestamp): enter_timestamp = 0 @@ -35,7 +41,7 @@ def enter_and_leave_timestamps(driver_positions, stoplocation_position, delivery # -- What does "best one" mean? 
timestamps_list = [] - for e in driver_positions: + for idx, e in driver_positions.iterrows(): if enter_timestamp == 0: # Optimizing enter_timestamp # Idea: Use speed data to shrink R, decreasing the area. @@ -47,7 +53,7 @@ def enter_and_leave_timestamps(driver_positions, stoplocation_position, delivery timestamps_list.append((enter_timestamp, leave_timestamp)) enter_timestamp = 0 leave_timestamp = 0 - + # Optimizing leave_timestamp # Use deliverystatustimestamp if driver registered delivery before leaving the area # Comment: Some drivers might register delivery before actually delivering. E.g., a driver might drive to @@ -57,73 +63,32 @@ def enter_and_leave_timestamps(driver_positions, stoplocation_position, delivery return timestamps_list -# def fill_in_delivery_time_estimates(): -# for routeplanid, routes in routeplans.items(): -# for routeid, driver in routes.items(): -# for driverid, stoplocs in driver.items(): -# for stoplocationid, duration in stoplocs.items(): -# stoplocation = {} - -# try: -# # some stoplocationids are not in stoplocations -# stoplocation = stoplocations[routeplanid][routeid][stoplocationid] -# driver_positions = driverpositions[routeid][driverid] -# except: -# continue - -# deliverystatustimestamp = stoplocations[routeplanid][routeid][stoplocationid]["deliverystatustimestamp"] -# timestamps = enter_and_leave_timestamps( -# driver_positions, -# stoplocation["position"], -# deliverystatustimestamp -# ) +estimated_durations_data = [] -# routeplans[routeplanid][routeid][driverid][stoplocationid]["estimated_duration"] = estimated_duration(timestamps) - -estimated_durations = [] def fill_in_delivery_time_estimates(): - for item in routeplans: - try: - # item[0]: routeplanid - # item[1]: routeid - # item[2]: driverid - # item[3]: stoplocationid - # item[4]: duration - - # some stoplocationids are not in stoplocations - routeplanid = int(item[0]) - routeid = int(item[1]) - driverid = int(item[2]) - stoplocationid = int(item[3]) - - stoplocation = 
stoplocations[routeplanid][routeid][stoplocationid] - driver_positions = driverpositions[routeid][driverid] - stoplocation_position = stoplocation["position"] - deliverystatustimestamp = stoplocation["deliverystatustimestamp"] - - timestamps = enter_and_leave_timestamps( - driver_positions, - stoplocation_position, - deliverystatustimestamp - ) - - estimated_durations.append(item + [estimated_duration(timestamps)]) - except: + for idx, routeplan in routeplans.iterrows(): + # some stoplocationids are not in stoplocations + stoplocation = stoplocations[ stoplocations['stoplocationid'] == routeplan['stoplocationid'] ] + driver_positions = driverpositions[ driverpositions['routeid'] == routeplan['routeid'] ][ driverpositions['driverid'] == routeplan['driverid'] ] + if stoplocation.empty or driverpositions.empty: continue + stoplocation_position = stoplocation["position"].iloc[0] + deliverystatustimestamp = stoplocation["deliverystatustimestamp"].iloc[0] -fill_in_delivery_time_estimates() - -#create_csv(routeplans, stoplocations) + timestamps = enter_and_leave_timestamps( + driver_positions, + stoplocation_position, + deliverystatustimestamp + ) + estimated_durations_data.append([routeplan['routeplanid'], estimated_duration(timestamps)]) -# for routeplanid, routes in routeplans.items(): -# for routeid, driver in routes.items(): -# for driverid, stoplocs in driver.items(): -# for stoplocationid, durations in stoplocs.items(): -# if len(durations) > 1: -# print(durations) -print(estimated_durations) +if __name__ == '__main__': + print('starting...') + fill_in_delivery_time_estimates() + estimated_durations = pd.DataFrame(estimated_durations_data, columns=['routeplanid', 'estimated_duration']) + create_csv(routeplans, stoplocations, driverpositions, estimated_durations) -print("--- %s seconds ---" % (time.time() - start_time)) + print("--- %s seconds ---" % (time.time() - start_time)) diff --git a/requirements.txt b/requirements.txt index 37cf5d8..a89237c 100644 --- 
a/requirements.txt +++ b/requirements.txt @@ -8,3 +8,5 @@ psycopg2==2.7.5 pylint==2.1.1 six==1.11.0 wrapt==1.10.11 +pandas==0.23.4 +numpy==1.15.1 diff --git a/run.sh b/run.sh new file mode 100755 index 0000000..9b13be4 --- /dev/null +++ b/run.sh @@ -0,0 +1,6 @@ +if [ ! -d "venv" ]; then + virtualenv venv -p python3 +fi +source venv/bin/activate +pip install -r requirements.txt +python deliverytimes.py diff --git a/utilities.py b/utilities.py index 5660a40..9ed9f7b 100644 --- a/utilities.py +++ b/utilities.py @@ -1,161 +1,57 @@ -import csv -from datetime import datetime - -def create_csv(routeplans_dict, stoplocations_dict): - with open('estimated_delivery_times.csv', 'w', newline='') as csvfile: - fieldnames = ['routeplanid', 'routeid', 'driverid', 'stoplocationid', 'deliverystatus', 'duration', 'estimated_duration'] - writer = csv.DictWriter(csvfile, fieldnames = fieldnames) - - writer.writeheader() - - for routeplanid, routes in routeplans_dict.items(): - for routeid, drivers in routes.items(): - for driverid, stoplocs in drivers.items(): - for stoplocationid, durations in stoplocs.items(): - deliverystatus = '(null)' - if stoplocationid in stoplocations_dict[routeplanid][routeid]: - deliverystatus = stoplocations_dict[routeplanid][routeid][stoplocationid]["deliverystatus"] - writer.writerow({ - 'routeplanid': routeplanid, - 'routeid': routeid, - 'driverid': driverid, - 'stoplocationid': stoplocationid, - 'deliverystatus': deliverystatus, - 'duration': durations["duration"], - 'estimated_duration': durations["estimated_duration"] if "estimated_duration" in durations else 0 - }) +import pandas as pd + + +def create_csv(routeplans, stoplocations, driverpositions, estimated_durations): + df = routeplans.merge(stoplocations[['stoplocationid', 'deliverystatus']], on='stoplocationid', how='left') + df = df.merge(estimated_durations[['routeplanid', 'estimated_duration']], on='routeplanid', how='left') + print(df) + df.to_csv('estimated_delivery_times.csv', index = 
False) + def remove_tzinfo(date_string): return date_string.split("+")[0] + def remove_microseconds(date_string): return date_string.split(".")[0] -# def routeplans(): -# routeplans = {} - -# with open("routeplans.csv", "r") as f: -# reader = csv.reader(f, delimiter="\t") -# for i, line in enumerate(reader): -# if i == 0: # skip headers -# continue -# if line[2] == '(null)': # no driver -# continue -# routeplanid = int(line[0]) -# routeid = int(line[1]) -# driverid = int(line[2]) -# stoplocationid = int(line[3]) -# duration = int(line[4]) if line[4] != '(null)' else 0 - -# if routeplanid in routeplans: -# routes = routeplans[routeplanid] - -# if routeid in routes: -# drivers = routes[routeid] - -# if driverid in drivers: -# stoplocations = drivers[driverid] - -# stoplocations[stoplocationid] = {"duration": duration} -# else: -# drivers[driverid] = {stoplocationid: {"duration": duration}} -# else: -# routes[routeid] = { -# driverid: { -# stoplocationid: { -# "duration": duration -# } -# } -# } -# else: -# routeplans[routeplanid] = { -# routeid: { -# driverid: { -# stoplocationid: { -# "duration": duration -# } -# } -# } -# } - -# return routeplans def routeplans(): - routeplans = [] + csv = "routeplans_test.csv" + routeplans = pd.read_csv(csv, sep='\t', header=0, na_values=["(null)"]) + print('%s read shape: %s' % (csv, routeplans.shape)) + routeplans.dropna(subset=["driverid"], inplace=True) # ignoring routes without driver + print(routeplans.ftypes) + return routeplans - with open("routeplans_test.csv", "r") as f: - reader = csv.reader(f, delimiter="\t") - for i, line in enumerate(reader): - if i == 0: # skip headers - continue - if line[2] == '(null)': # no driver - continue - routeplans.append(line) +def get_routeids(stoplocations): + return stoplocations.groupby(['routeid', 'stoplocationid']) - return routeplans def stoplocations(): - stoplocations = {} - - with open("stoplocations_test.csv", "r") as f: - reader = csv.reader(f, delimiter="\t") - for i, line in 
enumerate(reader): - if i == 0: - continue - if line[3] == '(null)' or line[4] == '(null)': - continue - routeplanid = int(line[0]) - routeid = int(line[1]) - stoplocationid = int(line[2]) - data = { - "position": (float(line[3]), float(line[4])), - "deliverystatus": int(line[5]), - "deliverystatustimestamp": datetime.fromisoformat(line[6] if line[6] != '(null)' else 0) - } - - if routeplanid in stoplocations: - routes = stoplocations[routeplanid] - - if routeid in routes: - stoplocs = routes[routeid] - stoplocs[stoplocationid] = data # Assumption: only unique stoplocationids - else: - routes[routeid] = {stoplocationid: data} - else: - stoplocations[routeplanid] = {routeid: {stoplocationid: data}} - + csv = "stoplocations_test.csv" + stoplocations = pd.read_csv(csv, sep='\t', header=0, na_values=["(null)"]) + print('%s read shape: %s' % (csv, stoplocations.shape)) + stoplocations.dropna(subset=["latitude", "longitude"], inplace=True) # ignoring locations without latitude or longitude + stoplocations['position'] = stoplocations.apply(lambda x: (x['latitude'], x['longitude']), axis=1) + stoplocations['deliverystatustimestamp'] = pd.to_datetime(stoplocations['deliverystatustimestamp'].apply(lambda x: remove_microseconds(remove_tzinfo(x)))) + print(stoplocations.ftypes) return stoplocations -def driverpositions(): - driverpositions = {} - - with open("driverpositions_test.csv", "r") as f: - reader = csv.reader(f, delimiter="\t") # use "," when using Terje's file - for i, line in enumerate(reader): - if i == 0: # skip headers - continue - routeid = int(line[0]) - driverid = int(line[1]) - data = { - "position": (float(line[2]), float(line[3])), - "logtime": datetime.fromisoformat(remove_microseconds(remove_tzinfo(line[4]))), - "speed": float(line[6]) if line[6] else 0.0 - } - - if routeid in driverpositions: - drivers = driverpositions[routeid] - - if driverid in drivers: - drivers[driverid].append(data) - else: - drivers[driverid] = [data] - else: - 
driverpositions[routeid] = {driverid: [data]} - - # sort - for routeid, driver in driverpositions.items(): - for driverid, position_data in driver.items(): - driverpositions[routeid][driverid] = sorted(position_data, key=lambda position_datum: position_datum["logtime"]) +def get_drivers(driverpositions): + return driverpositions.groupby(['routeid', 'driverid']) + + +def driverpositions(): + csv = "driverpositions_test.csv" + driverpositions = pd.read_csv(csv, sep='\t', header=0, na_values=["(null)"]) # use "," when using Terje's file + print('%s read shape: %s' % (csv, driverpositions.shape)) + driverpositions.dropna(subset=["latitude", "longitude"], inplace=True) # ignoring positions without latitude or longitude + driverpositions['position'] = driverpositions.apply(lambda x: (x['latitude'], x['longitude']), axis=1) + driverpositions['logtime'] = pd.to_datetime(driverpositions['logtime'].apply(lambda x: remove_microseconds(remove_tzinfo(x)))) + driverpositions['speed'] = driverpositions['speed'].fillna(value=0) + driverpositions = driverpositions.sort_values(by=['routeid', 'driverid', 'logtime']) + print(driverpositions.ftypes) return driverpositions