use dataframes to read and manipulate the csvs

Zeniten · Sep 25, 2018 · e124f7c · e124f7c
1 parent c51822a
commit e124f7c
Show file tree

Hide file tree

Showing 5 changed files with 80 additions and 210 deletions.
diff --git a/.gitignore b/.gitignore
@@ -3,4 +3,5 @@ __pycache__/
 
 database_data.py
 
-*.csv
+*.csv
+.vscode
diff --git a/deliverytimes.py b/deliverytimes.py
@@ -1,4 +1,6 @@
 from geopy.distance import geodesic
+import pandas as pd
+
 from datetime import datetime, timedelta
 import time
 
@@ -10,21 +12,25 @@
 driverpositions = driverpositions()
 stoplocations = stoplocations()
 
-AVERAGE_WALKING_SPEED = 5 # km/h
-R = 50 # radius of 30 meters
+AVERAGE_WALKING_SPEED = 5   # km/h
+R = 50  # radius of the stop-location area, in meters
+
 
 def driver_is_inside_location_area(driver_position, location_position):
     return geodesic(driver_position, location_position).meters <= R
 
+
 def estimated_duration(timestamps_list):
     if not timestamps_list:
         return 0
     else:
         durations = [int(timedelta.total_seconds(timestamps[1] - timestamps[0])) for timestamps in timestamps_list]
         return max(durations)
 
+
 def driver_is_moving_at_walking_speed(speed):
-    return speed < AVERAGE_WALKING_SPEED + 2 # buffer
+    return speed < AVERAGE_WALKING_SPEED + 2    # buffer
+
 
 def enter_and_leave_timestamps(driver_positions, stoplocation_position, deliverystatustimestamp):
     enter_timestamp = 0
@@ -35,7 +41,7 @@ def enter_and_leave_timestamps(driver_positions, stoplocation_position, delivery
     # -- What does "best one" mean?
     timestamps_list = []
 
-    for e in driver_positions:
+    for idx, e in driver_positions.iterrows():
         if enter_timestamp == 0:
             # Optimizing enter_timestamp
             # Idea: Use speed data to shrink R, decreasing the area.
@@ -47,7 +53,7 @@ def enter_and_leave_timestamps(driver_positions, stoplocation_position, delivery
                 timestamps_list.append((enter_timestamp, leave_timestamp))
                 enter_timestamp = 0
                 leave_timestamp = 0
-    
+
     # Optimizing leave_timestamp
     # Use deliverystatustimestamp if driver registered delivery before leaving the area
     # Comment: Some drivers might register delivery before actually delivering. E.g., a driver might drive to
@@ -57,73 +63,32 @@ def enter_and_leave_timestamps(driver_positions, stoplocation_position, delivery
 
     return timestamps_list
 
-# def fill_in_delivery_time_estimates():
-#     for routeplanid, routes in routeplans.items():
-#         for routeid, driver in routes.items():
-#             for driverid, stoplocs in driver.items():
-#                 for stoplocationid, duration in stoplocs.items():
-#                     stoplocation = {}
-
-#                     try:
-#                         # some stoplocationids are not in stoplocations
-#                         stoplocation = stoplocations[routeplanid][routeid][stoplocationid]
-#                         driver_positions = driverpositions[routeid][driverid]
-#                     except:
-#                         continue
-
-#                     deliverystatustimestamp = stoplocations[routeplanid][routeid][stoplocationid]["deliverystatustimestamp"]
 
-#                     timestamps = enter_and_leave_timestamps(
-#                         driver_positions,
-#                         stoplocation["position"],
-#                         deliverystatustimestamp
-#                     )
+estimated_durations_data = []
 
-#                     routeplans[routeplanid][routeid][driverid][stoplocationid]["estimated_duration"] = estimated_duration(timestamps)
-
-estimated_durations = []
 
 def fill_in_delivery_time_estimates():
-    for item in routeplans:
-        try:
-            # item[0]: routeplanid
-            # item[1]: routeid
-            # item[2]: driverid
-            # item[3]: stoplocationid
-            # item[4]: duration
-
-            # some stoplocationids are not in stoplocations
-            routeplanid = int(item[0])
-            routeid = int(item[1])
-            driverid = int(item[2])
-            stoplocationid = int(item[3])
-
-            stoplocation = stoplocations[routeplanid][routeid][stoplocationid]
-            driver_positions = driverpositions[routeid][driverid]
-            stoplocation_position = stoplocation["position"]
-            deliverystatustimestamp = stoplocation["deliverystatustimestamp"]
-
-            timestamps = enter_and_leave_timestamps(
-                driver_positions,
-                stoplocation_position,
-                deliverystatustimestamp
-            )
-
-            estimated_durations.append(item + [estimated_duration(timestamps)])
-        except:
+    for idx, routeplan in routeplans.iterrows():
+        # some stoplocationids are not in stoplocations
+        stoplocation = stoplocations[ stoplocations['stoplocationid'] == routeplan['stoplocationid'] ]
+        driver_positions = driverpositions[ driverpositions['routeid'] == routeplan['routeid'] ][ driverpositions['driverid'] == routeplan['driverid'] ]
+        if stoplocation.empty or driverpositions.empty:
             continue
+        stoplocation_position = stoplocation["position"].iloc[0]
+        deliverystatustimestamp = stoplocation["deliverystatustimestamp"].iloc[0]
 
-fill_in_delivery_time_estimates()
-
-#create_csv(routeplans, stoplocations)
+        timestamps = enter_and_leave_timestamps(
+            driver_positions,
+            stoplocation_position,
+            deliverystatustimestamp
+        )
+        estimated_durations_data.append([routeplan['routeplanid'], estimated_duration(timestamps)])
 
-# for routeplanid, routes in routeplans.items():
-#     for routeid, driver in routes.items():
-#         for driverid, stoplocs in driver.items():
-#             for stoplocationid, durations in stoplocs.items():
-#                 if len(durations) > 1:
-#                     print(durations)
 
-print(estimated_durations)
+if __name__ == '__main__':
+    print('starting...')
+    fill_in_delivery_time_estimates()
+    estimated_durations = pd.DataFrame(estimated_durations_data, columns=['routeplanid', 'estimated_duration'])
+    create_csv(routeplans, stoplocations, driverpositions, estimated_durations)
 
-print("--- %s seconds ---" % (time.time() - start_time))
+    print("--- %s seconds ---" % (time.time() - start_time))
diff --git a/requirements.txt b/requirements.txt
@@ -8,3 +8,5 @@ psycopg2==2.7.5
 pylint==2.1.1
 six==1.11.0
 wrapt==1.10.11
+pandas==0.23.4
+numpy==1.15.1
diff --git a/run.sh b/run.sh
@@ -0,0 +1,6 @@
+if [ ! -d "venv" ]; then
+    virtualenv venv -p python3
+fi
+source venv/bin/activate
+pip install -r requirements.txt
+python deliverytimes.py
diff --git a/utilities.py b/utilities.py
@@ -1,161 +1,57 @@
-import csv
-from datetime import datetime
-
-def create_csv(routeplans_dict, stoplocations_dict):
-    with open('estimated_delivery_times.csv', 'w', newline='') as csvfile:
-        fieldnames = ['routeplanid', 'routeid', 'driverid', 'stoplocationid', 'deliverystatus', 'duration', 'estimated_duration']
-        writer = csv.DictWriter(csvfile, fieldnames = fieldnames)
-
-        writer.writeheader()
-
-        for routeplanid, routes in routeplans_dict.items():
-            for routeid, drivers in routes.items():
-                for driverid, stoplocs in drivers.items():
-                    for stoplocationid, durations in stoplocs.items():
-                        deliverystatus = '(null)'
-                        if stoplocationid in stoplocations_dict[routeplanid][routeid]:
-                            deliverystatus = stoplocations_dict[routeplanid][routeid][stoplocationid]["deliverystatus"]
-                        writer.writerow({
-                            'routeplanid': routeplanid,
-                            'routeid': routeid,
-                            'driverid': driverid,
-                            'stoplocationid': stoplocationid,
-                            'deliverystatus': deliverystatus,
-                            'duration': durations["duration"],
-                            'estimated_duration': durations["estimated_duration"] if "estimated_duration" in durations else 0
-                        })
+import pandas as pd
+
+
+def create_csv(routeplans, stoplocations, driverpositions, estimated_durations):
+    df = routeplans.merge(stoplocations[['stoplocationid', 'deliverystatus']], on='stoplocationid', how='left')
+    df = df.merge(estimated_durations[['routeplanid', 'estimated_duration']], on='routeplanid', how='left')
+    print(df)
+    df.to_csv('estimated_delivery_times.csv', index = False)
+
 
 def remove_tzinfo(date_string):
     return date_string.split("+")[0]
 
+
 def remove_microseconds(date_string):
     return date_string.split(".")[0]
 
-# def routeplans():
-#     routeplans = {}
-
-#     with open("routeplans.csv", "r") as f:
-#         reader = csv.reader(f, delimiter="\t")
-#         for i, line in enumerate(reader):
-#             if i == 0:  # skip headers
-#                 continue
-#             if line[2] == '(null)':     # no driver
-#                 continue
-#             routeplanid = int(line[0])
-#             routeid = int(line[1])
-#             driverid = int(line[2])
-#             stoplocationid = int(line[3])
-#             duration = int(line[4]) if line[4] != '(null)' else 0
-
-#             if routeplanid in routeplans:
-#                 routes = routeplans[routeplanid]
-
-#                 if routeid in routes:
-#                     drivers = routes[routeid]
-
-#                     if driverid in drivers:
-#                         stoplocations = drivers[driverid]
-
-#                         stoplocations[stoplocationid] = {"duration": duration}
-#                     else:
-#                         drivers[driverid] = {stoplocationid: {"duration": duration}}
-#                 else:
-#                     routes[routeid] = {
-#                         driverid: {
-#                             stoplocationid: {
-#                                 "duration": duration
-#                             }
-#                         }
-#                     }
-#             else:
-#                 routeplans[routeplanid] = {
-#                     routeid: {
-#                         driverid: {
-#                             stoplocationid: {
-#                                 "duration": duration
-#                             }
-#                         }
-#                     }
-#                 }
-
-#     return routeplans
 
 def routeplans():
-    routeplans = []
+    csv = "routeplans_test.csv"
+    routeplans = pd.read_csv(csv, sep='\t', header=0, na_values=["(null)"])
+    print('%s read shape: %s' % (csv, routeplans.shape))
+    routeplans.dropna(subset=["driverid"], inplace=True)      # ignoring routes without driver
+    print(routeplans.ftypes)
+    return routeplans
 
-    with open("routeplans_test.csv", "r") as f:
-        reader = csv.reader(f, delimiter="\t")
-        for i, line in enumerate(reader):
-            if i == 0:  # skip headers
-                continue
-            if line[2] == '(null)':     # no driver
-                continue
 
-            routeplans.append(line)
+def get_routeids(stoplocations):
+    return stoplocations.groupby(['routeid', 'stoplocationid'])
 
-    return routeplans
 
 def stoplocations():
-    stoplocations = {}
-
-    with open("stoplocations_test.csv", "r") as f:
-        reader = csv.reader(f, delimiter="\t")
-        for i, line in enumerate(reader):
-            if i == 0:
-                continue
-            if line[3] == '(null)' or line[4] == '(null)':
-                continue
-            routeplanid = int(line[0])
-            routeid = int(line[1])
-            stoplocationid = int(line[2])
-            data = {
-                "position": (float(line[3]), float(line[4])),
-                "deliverystatus": int(line[5]),
-                "deliverystatustimestamp": datetime.fromisoformat(line[6] if line[6] != '(null)' else 0)
-            }
-
-            if routeplanid in stoplocations:
-                routes = stoplocations[routeplanid]
-
-                if routeid in routes:
-                    stoplocs = routes[routeid]
-                    stoplocs[stoplocationid] = data # Assumption: only unique stoplocationids
-                else:
-                    routes[routeid] = {stoplocationid: data}
-            else:
-                stoplocations[routeplanid] = {routeid: {stoplocationid: data}}
-
+    csv = "stoplocations_test.csv"
+    stoplocations = pd.read_csv(csv, sep='\t', header=0, na_values=["(null)"])
+    print('%s read shape: %s' % (csv, stoplocations.shape))
+    stoplocations.dropna(subset=["latitude", "longitude"], inplace=True)      # ignoring locations without latitude or longitude
+    stoplocations['position'] = stoplocations.apply(lambda x: (x['latitude'], x['longitude']), axis=1)
+    stoplocations['deliverystatustimestamp'] = pd.to_datetime(stoplocations['deliverystatustimestamp'].apply(lambda x: remove_microseconds(remove_tzinfo(x))))
+    print(stoplocations.ftypes)
     return stoplocations
 
-def driverpositions():
-    driverpositions = {}
-
-    with open("driverpositions_test.csv", "r") as f:
-        reader = csv.reader(f, delimiter="\t") # use "," when using Terje's file
-        for i, line in enumerate(reader):
-            if i == 0:  # skip headers
-                continue
-            routeid = int(line[0])
-            driverid = int(line[1])
-            data = {
-                "position": (float(line[2]), float(line[3])),
-                "logtime": datetime.fromisoformat(remove_microseconds(remove_tzinfo(line[4]))),
-                "speed": float(line[6]) if line[6] else 0.0
-            }
-
-            if routeid in driverpositions:
-                drivers = driverpositions[routeid]
-
-                if driverid in drivers:
-                    drivers[driverid].append(data)
-                else:
-                    drivers[driverid] = [data]
-            else:
-                driverpositions[routeid] = {driverid: [data]}
-
-    # sort
-    for routeid, driver in driverpositions.items():
-        for driverid, position_data in driver.items():
-            driverpositions[routeid][driverid] = sorted(position_data, key=lambda position_datum: position_datum["logtime"])
 
+def get_drivers(driverpositions):
+    return driverpositions.groupby(['routeid', 'driverid'])
+
+
+def driverpositions():
+    csv = "driverpositions_test.csv"
+    driverpositions = pd.read_csv(csv, sep='\t', header=0, na_values=["(null)"])  # use "," when using Terje's file
+    print('%s read shape: %s' % (csv, driverpositions.shape))
+    driverpositions.dropna(subset=["latitude", "longitude"], inplace=True)      # ignoring positions without latitude or longitude
+    driverpositions['position'] = driverpositions.apply(lambda x: (x['latitude'], x['longitude']), axis=1)
+    driverpositions['logtime'] = pd.to_datetime(driverpositions['logtime'].apply(lambda x: remove_microseconds(remove_tzinfo(x))))
+    driverpositions['speed'] = driverpositions['speed'].fillna(value=0)
+    driverpositions = driverpositions.sort_values(by=['routeid', 'driverid', 'logtime'])
+    print(driverpositions.ftypes)
     return driverpositions
-Original file line number
+Diff line change
@@ Expand Up / @@ -3,4 +3,5 @@ __pycache__/ @@
     database_data.py
-    *.csv
+    *.csv
+    .vscode