From 0dea3508d300320fa0d4ff958cffcbdb5f275b7f Mon Sep 17 00:00:00 2001
From: Kim Montgomery
Date: Sun, 9 Aug 2020 20:19:04 -0600
Subject: [PATCH 1/3] Adding similar event data recipe

---
 .DS_Store                             | Bin 0 -> 6148 bytes
 data/similar_event_feature_creator.py | 149 ++++++++++++++++++++++++++
 scorers/.DS_Store                     | Bin 0 -> 6148 bytes
 3 files changed, 149 insertions(+)
 create mode 100644 .DS_Store
 create mode 100755 data/similar_event_feature_creator.py
 create mode 100644 scorers/.DS_Store

diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..d1737db992955355365bb6e169cac437cec3ce17
GIT binary patch
literal 6148
[binary .DS_Store contents omitted]

diff --git a/data/similar_event_feature_creator.py b/data/similar_event_feature_creator.py
new file mode 100755
index 00000000..d14d279c
--- /dev/null
+++ b/data/similar_event_feature_creator.py
@@ -0,0 +1,149 @@
+"""Manually add features based on the average target value for similar events to a dataset"""
+
+"""
+
+This recipe adds the average value of the target for recent similar events (where similar events have the same
+values for the categorical variables on the event list).
+
+Settings for Driverless AI:
+1. Update folder_path to the folder containing the data file and data_file to the filename.
+2. Edit the seconds ahead list so that it lists the number of seconds ahead of time,
+that predictions must be made. A separate file, with separate predict ahead intervals will be
+created for each value on the list. For instance [24*3600, 7*24*3600] would create separate files
+with day ahead and week ahead features.
+3. Specify the target column.
+4. Specify the datetime column.
+5. Specify the columns used to define similar events as events.
+6. Specify the time intervals over which events will be averaged in seconds as the event_intervals.
+e.g. [1*24*3600, 3*24*3600, 7*24*3600] creates event features averaged over 1, 3, and 7 days.
+7. Minimum number of event categories to consider in creating the lagged features. If n=2, all combinations of 2, 3, ... N events from the
+events list are used to define similar events when creating features.
+8. Upload under 'ADD DATASET' -> 'UPLOAD DATA RECIPE'
+"""
+
+import datatable as dt
+import numpy as np
+import os
+
+from h2oaicore.data import CustomData
+from h2oaicore.systemutils import config
+
+
+class MyData(CustomData):
+
+    @staticmethod
+    def create_data():
+
+        _modules_needed_by_name = ['datetime']
+
+        import datetime
+        import pandas as pd
+        from collections import defaultdict
+        from itertools import combinations
+
+        """
+        Update the below as needed
+        """
+        # Path to the data
+        folder_path = 'tmp/'
+        # Data file
+        data_file = 'OTG_data_with_datetime.csv'  # Data file
+
+        # Number of seconds ahead that predictions should be made
+        seconds_ahead_list = [2*24*3600]
+        # Target column
+        target = "Meals Served"
+        # Datetime column
+        datetime_column = "datetime"
+        # Event group columns
+        events = ['Meal Period', 'Concept/Truck', 'Service Location', 'Menu Item Name']
+        # time period over which to average events
+        event_intervals = [1*24*3600, 3*24*3600, 7*24*3600]
+
+        # minimum number of events to include in combinations
+        min_event_combo_number = max(len(events) - 1, 1)
+
+        # Try to calculate a datetime
+        def create_datetime(x):
+
+            try:
+                answer = pd.to_datetime(str(x))
+            except:
+                answer = x
+
+            return answer
+
+
+        # Create datasets with minimum features calculated the given number of days ahead
+        dataset_dict = {}
+        for seconds_ahead in seconds_ahead_list:
+
+            train = pd.read_csv(os.path.join(folder_path, data_file))
+
+            # Convert the datetime column to datetime values
+            train['datetime'] = train[datetime_column].apply(create_datetime)
+
+            # Calculate all combinations of the event columns that will be used to define a similar event
+            event_combinations = []
+            for num_in_set in range(min_event_combo_number, len(events) + 1):
+                event_combinations += list(combinations(events, num_in_set))
+
+            for event_categories in event_combinations:
+
+                event_categories = list(event_categories)
+
+                event_prefix = "previous_"
+                for item in event_categories:
+                    event_prefix += str(item.replace(' ', '')) + '_'
+
+                temp_shift = train.copy()
+
+                # Save separate dataframes for each unique event type
+                unique_categories = temp_shift[event_categories].drop_duplicates()
+
+                split_set = {}
+
+                # Split the training set by category
+                for ii in range(len(unique_categories)):
+                    AA = temp_shift.copy()
+                    for jj in range(len(event_categories)):
+                        AA = AA[AA[event_categories[jj]] == unique_categories.iloc[ii, jj]]
+                    split_set[tuple(unique_categories.iloc[ii,:])] = AA
+
+                def mean(x):
+                    x = list(x)
+                    try:
+                        answer = sum(x) / float(len(x))
+                    except:
+                        answer = np.nan
+                    return answer
+
+                def most_recent(row, seconds_ahead, event_interval, event_categories):
+                    # Find the average target value over the given event interval
+                    try:
+                        train_category = split_set[tuple(row[event_categories])].copy()
+
+                        train_category = train_category[((row['datetime'] - train_category['datetime']).apply(lambda x: x.total_seconds()) >= seconds_ahead) &
+                                                        ((row['datetime'] - train_category['datetime']).apply(lambda x: x.total_seconds()) <= seconds_ahead + event_interval)]
+                        answer = mean(train_category[target])
+
+                    except:
+                        answer = np.nan
+
+                    return answer
+
+                # Average recent events over each interval length
+                for average_interval in event_intervals:
+
+                    temp_shift[event_prefix + 'event_ave_' + str(average_interval)] = temp_shift.apply(lambda row: most_recent(row, seconds_ahead, average_interval, event_categories), axis=1)
+
+
+                train = temp_shift.copy()
+
+
+            # Save the dataset corresponding to the number of seconds ahead the predictions are being made
+            new_name = data_file.split('.')[0] + '_' + str(min_event_combo_number) + '_event_lags_'
+            dataset_dict[new_name + str(seconds_ahead)] = train
+
+
+        return dataset_dict
\ No newline at end of file
diff --git a/scorers/.DS_Store b/scorers/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..ce4e1540dcae59e0442808d6b3bf6721a5066889
GIT binary patch
literal 6148
[binary .DS_Store contents omitted]

From: Kim Montgomery
Date: Sun, 9 Aug 2020 23:10:51 -0600
Subject: [PATCH 2/3] Adding similar event data recipe

---
 data/similar_event_feature_creator.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/data/similar_event_feature_creator.py b/data/similar_event_feature_creator.py
index d14d279c..bd18bec4 100755
--- a/data/similar_event_feature_creator.py
+++ b/data/similar_event_feature_creator.py
@@ -92,6 +92,7 @@ def create_datetime(x):
 
                 event_categories = list(event_categories)
 
+                # Create a string indicating the categories included in the definition of a similar event
                 event_prefix = "previous_"
                 for item in event_categories:
                     event_prefix += str(item.replace(' ', '')) + '_'
@@ -110,6 +111,7 @@ def create_datetime(x):
                         AA = AA[AA[event_categories[jj]] == unique_categories.iloc[ii, jj]]
                     split_set[tuple(unique_categories.iloc[ii,:])] = AA
 
+                # Try to calculate the mean
                 def mean(x):
                     x = list(x)
                     try:

From 7ebef7ddaba9a2d8401390c810a0d8f92b4ac031 Mon Sep 17 00:00:00 2001
From: Kim Montgomery
Date: Sun, 9 Aug 2020 23:16:18 -0600
Subject: [PATCH 3/3] Adding similar event data recipe

---
 data/similar_event_feature_creator.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/data/similar_event_feature_creator.py b/data/similar_event_feature_creator.py
index bd18bec4..fcfc39f4 100755
--- a/data/similar_event_feature_creator.py
+++ b/data/similar_event_feature_creator.py
@@ -1,13 +1,13 @@
-"""Manually add features based on the average target value for similar events to a dataset"""
+"""Manually add features based on the average target value for similar events"""
 
 """
 
 This recipe adds the average value of the target for recent similar events (where similar events have the same
-values for the categorical variables on the event list).
+values for the categorical variables in the event list).
 
 Settings for Driverless AI:
 1. Update folder_path to the folder containing the data file and data_file to the filename.
-2. Edit the seconds ahead list so that it lists the number of seconds ahead of time,
+2. Edit the seconds ahead list so that it lists the number of seconds ahead of time
 that predictions must be made. A separate file, with separate predict ahead intervals will be
 created for each value on the list. For instance [24*3600, 7*24*3600] would create separate files
 with day ahead and week ahead features.
@@ -74,7 +74,7 @@ def create_datetime(x):
             return answer
 
 
-        # Create datasets with minimum features calculated the given number of days ahead
+        # Create datasets with features calculated the given number of days ahead
         dataset_dict = {}
         for seconds_ahead in seconds_ahead_list:
 
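
The core transformation these patches add is easier to see outside the Driverless AI recipe plumbing. The sketch below is a minimal standalone illustration, not part of the patches: for each row it averages the target over rows that share the same event category and whose timestamps are at least seconds_ahead seconds older, within a window of event_interval seconds. The toy DataFrame, the column names ('datetime', 'event', 'target'), and the helper name similar_event_average are assumptions made for this example only.

import pandas as pd

df = pd.DataFrame({
    'datetime': pd.to_datetime(['2020-08-01', '2020-08-02', '2020-08-03',
                                '2020-08-04', '2020-08-05']),
    'event': ['lunch', 'lunch', 'dinner', 'lunch', 'lunch'],
    'target': [100, 120, 80, 110, 130],
})

seconds_ahead = 1 * 24 * 3600    # predictions are made one day in advance
event_interval = 3 * 24 * 3600   # average over the three days before that lead time


def similar_event_average(row, history, seconds_ahead, event_interval):
    # Seconds elapsed between each historical row and the row being scored
    age = (row['datetime'] - history['datetime']).dt.total_seconds()
    # Same event category, old enough to be known at prediction time,
    # and inside the averaging window
    mask = (
        (history['event'] == row['event'])
        & (age >= seconds_ahead)
        & (age <= seconds_ahead + event_interval)
    )
    return history.loc[mask, 'target'].mean()   # NaN if no similar events


df['previous_event_ave'] = df.apply(
    similar_event_average, axis=1, args=(df, seconds_ahead, event_interval))
print(df)

On this toy frame the 2020-08-02 lunch row gets 100.0 (only the 2020-08-01 lunch row is at least a day older and inside the window), and rows with no qualifying history get NaN. The recipe builds one such column for every combination of the event columns and every averaging interval, with the column name encoding both.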