From 0dea3508d300320fa0d4ff958cffcbdb5f275b7f Mon Sep 17 00:00:00 2001
From: Kim Montgomery
Date: Sun, 9 Aug 2020 20:19:04 -0600
Subject: [PATCH 1/3] Adding similar event data recipe

---
 .DS_Store                             | Bin 0 -> 6148 bytes
 data/similar_event_feature_creator.py | 149 ++++++++++++++++++++++++++
 scorers/.DS_Store                     | Bin 0 -> 6148 bytes
 3 files changed, 149 insertions(+)
 create mode 100644 .DS_Store
 create mode 100755 data/similar_event_feature_creator.py
 create mode 100644 scorers/.DS_Store

diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..d1737db992955355365bb6e169cac437cec3ce17
GIT binary patch
literal 6148
[binary .DS_Store contents omitted]

diff --git a/data/similar_event_feature_creator.py b/data/similar_event_feature_creator.py
new file mode 100755
index 00000000..d14d279c
--- /dev/null
+++ b/data/similar_event_feature_creator.py
@@ -0,0 +1,149 @@
+"""Manually add features based on the average target value for similar events to a dataset"""
+
+"""
+
+This recipe adds the average value of the target for recent similar events (where similar events have the same
+values for the categorical variables on the event list).
+
+Settings for Driverless AI:
+1. Update folder_path to the folder containing the data file and data_file to the filename.
+2. Edit the seconds ahead list so that it lists the number of seconds ahead of time,
+that predictions must be made. A separate file, with separate predict ahead intervals will be
+created for each value on the list. For instance [24*3600, 7*24*3600] would create separate files
+with day ahead and week ahead features.
+3. Specify the target column.
+4. Specify the datetime column.
+5. Specify the columns used to define similar events as events.
+6. Specify the time intervals over which events will be averaged in seconds as the event_intervals.
+e.g. [1*24*3600, 3*24*3600, 7*24*3600] creates event features averaged over 1, 3, and 7 days.
+7. Minimum number of event categories to consider in creating the lagged features. If n=2, all combinations of 2, 3, ... N events from the
+events list are used to define similar events when creating features.
+8. Upload under 'ADD DATASET' -> 'UPLOAD DATA RECIPE'
+"""
+
+import datatable as dt
+import numpy as np
+import os
+
+from h2oaicore.data import CustomData
+from h2oaicore.systemutils import config
+
+
+class MyData(CustomData):
+
+    @staticmethod
+    def create_data():
+
+        _modules_needed_by_name = ['datetime']
+
+        import datetime
+        import pandas as pd
+        from collections import defaultdict
+        from itertools import combinations
+
+        """
+        Update the below as needed
+        """
+        # Path to the data
+        folder_path = 'tmp/'
+        # Data file
+        data_file = 'OTG_data_with_datetime.csv'  # Data file
+
+        # Number of seconds ahead that predictions should be made
+        seconds_ahead_list = [2*24*3600]
+        # Target column
+        target = "Meals Served"
+        # Datetime column
+        datetime_column = "datetime"
+        # Event group columns
+        events = ['Meal Period', 'Concept/Truck', 'Service Location', 'Menu Item Name']
+        # time period over which to average events
+        event_intervals = [1*24*3600, 3*24*3600, 7*24*3600]
+
+        # minimum number of events to include in combinations
+        min_event_combo_number = max(len(events) - 1, 1)
+
+        # Try to calculate a datetime
+        def create_datetime(x):
+
+            try:
+                answer = pd.to_datetime(str(x))
+            except:
+                answer = x
+
+            return answer
+
+
+        # Create datasets with minimum features calculated the given number of days ahead
+        dataset_dict = {}
+        for seconds_ahead in seconds_ahead_list:
+
+            train = pd.read_csv(os.path.join(folder_path, data_file))
+
+            # Convert the datetime column to datetime values
+            train['datetime'] = train[datetime_column].apply(create_datetime)
+
+            # Calculate all combinations of the event columns that will be used to define a similar event
+            event_combinations = []
+            for num_in_set in range(min_event_combo_number, len(events) + 1):
+                event_combinations += list(combinations(events, num_in_set))
+
+            for event_categories in event_combinations:
+
+                event_categories = list(event_categories)
+
+                event_prefix = "previous_"
+                for item in event_categories:
+                    event_prefix += str(item.replace(' ', '')) + '_'
+
+                temp_shift = train.copy()
+
+                # Save separate dataframes for each unique event type
+                unique_categories = temp_shift[event_categories].drop_duplicates()
+
+                split_set = {}
+
+                # Split the training set by category
+                for ii in range(len(unique_categories)):
+                    AA = temp_shift.copy()
+                    for jj in range(len(event_categories)):
+                        AA = AA[AA[event_categories[jj]] == unique_categories.iloc[ii, jj]]
+                    split_set[tuple(unique_categories.iloc[ii,:])] = AA
+
+                def mean(x):
+                    x = list(x)
+                    try:
+                        answer = sum(x) / float(len(x))
+                    except:
+                        answer = np.nan
+                    return answer
+
+                def most_recent(row, seconds_ahead, event_interval, event_categories):
+                    # Find the average target value over the given event interval
+                    try:
+                        train_category = split_set[tuple(row[event_categories])].copy()
+
+                        train_category = train_category[((row['datetime'] - train_category['datetime']).apply(lambda x: x.total_seconds()) >= seconds_ahead) &
+                                                        ((row['datetime'] - train_category['datetime']).apply(lambda x: x.total_seconds()) <= seconds_ahead + event_interval)]
+                        answer = mean(train_category[target])
+
+                    except:
+                        answer = np.nan
+
+                    return answer
+
+                # Average recent events over each interval length
+                for average_interval in event_intervals:
+
+                    temp_shift[event_prefix + 'event_ave_' + str(average_interval)] = temp_shift.apply(lambda row: most_recent(row, seconds_ahead, average_interval, event_categories), axis=1)
+
+
+                train = temp_shift.copy()
+
+
+            # Save the dataset corresponding to the number of seconds ahead the predictions are being made
+            new_name = data_file.split('.')[0] + '_' + str(min_event_combo_number) + '_event_lags_'
+            dataset_dict[new_name + str(seconds_ahead)] = train
+
+
+        return dataset_dict
\ No newline at end of file
diff --git a/scorers/.DS_Store b/scorers/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..ce4e1540dcae59e0442808d6b3bf6721a5066889
GIT binary patch
literal 6148
[binary .DS_Store contents omitted]

From: Kim Montgomery
Date: Sun, 9 Aug 2020 23:10:51 -0600
Subject: [PATCH 2/3] Adding similar event data recipe

---
 data/similar_event_feature_creator.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/data/similar_event_feature_creator.py b/data/similar_event_feature_creator.py
index d14d279c..bd18bec4 100755
--- a/data/similar_event_feature_creator.py
+++ b/data/similar_event_feature_creator.py
@@ -92,6 +92,7 @@ def create_datetime(x):
 
                 event_categories = list(event_categories)
 
+                # Create a string indicating the categories included in the definition of a similar event
                 event_prefix = "previous_"
                 for item in event_categories:
                     event_prefix += str(item.replace(' ', '')) + '_'
@@ -110,6 +111,7 @@ def create_datetime(x):
                         AA = AA[AA[event_categories[jj]] == unique_categories.iloc[ii, jj]]
                     split_set[tuple(unique_categories.iloc[ii,:])] = AA
 
+                # Try to calculate the mean
                 def mean(x):
                     x = list(x)
                     try:

From 7ebef7ddaba9a2d8401390c810a0d8f92b4ac031 Mon Sep 17 00:00:00 2001
From: Kim Montgomery
Date: Sun, 9 Aug 2020 23:16:18 -0600
Subject: [PATCH 3/3] Adding similar event data recipe

---
 data/similar_event_feature_creator.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/data/similar_event_feature_creator.py b/data/similar_event_feature_creator.py
index bd18bec4..fcfc39f4 100755
--- a/data/similar_event_feature_creator.py
+++ b/data/similar_event_feature_creator.py
@@ -1,13 +1,13 @@
-"""Manually add features based on the average target value for similar events to a dataset"""
+"""Manually add features based on the average target value for similar events"""
 
 """
 
 This recipe adds the average value of the target for recent similar events (where similar events have the same
-values for the categorical variables on the event list).
+values for the categorical variables in the event list).
 
 Settings for Driverless AI:
 1. Update folder_path to the folder containing the data file and data_file to the filename.
-2. Edit the seconds ahead list so that it lists the number of seconds ahead of time,
+2. Edit the seconds ahead list so that it lists the number of seconds ahead of time
 that predictions must be made. A separate file, with separate predict ahead intervals will be
 created for each value on the list. For instance [24*3600, 7*24*3600] would create separate files
 with day ahead and week ahead features.
@@ -74,7 +74,7 @@ def create_datetime(x):
             return answer
 
 
-        # Create datasets with minimum features calculated the given number of days ahead
+        # Create datasets with features calculated the given number of days ahead
         dataset_dict = {}
         for seconds_ahead in seconds_ahead_list:
 
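
The core transformation these patches add is easier to see outside the Driverless AI recipe plumbing. The sketch below is a minimal standalone illustration, not part of the patches: for each row it averages the target over rows that share the same event category and whose timestamps are at least seconds_ahead seconds older, within a window of event_interval seconds. The toy DataFrame, the column names ('datetime', 'event', 'target'), and the helper name similar_event_average are assumptions made for this example only.

import pandas as pd

df = pd.DataFrame({
    'datetime': pd.to_datetime(['2020-08-01', '2020-08-02', '2020-08-03',
                                '2020-08-04', '2020-08-05']),
    'event': ['lunch', 'lunch', 'dinner', 'lunch', 'lunch'],
    'target': [100, 120, 80, 110, 130],
})

seconds_ahead = 1 * 24 * 3600    # predictions are made one day in advance
event_interval = 3 * 24 * 3600   # average over the three days before that lead time


def similar_event_average(row, history, seconds_ahead, event_interval):
    # Seconds elapsed between each historical row and the row being scored
    age = (row['datetime'] - history['datetime']).dt.total_seconds()
    # Same event category, old enough to be known at prediction time,
    # and inside the averaging window
    mask = (
        (history['event'] == row['event'])
        & (age >= seconds_ahead)
        & (age <= seconds_ahead + event_interval)
    )
    return history.loc[mask, 'target'].mean()   # NaN if no similar events


df['previous_event_ave'] = df.apply(
    similar_event_average, axis=1, args=(df, seconds_ahead, event_interval))
print(df)

On this toy frame the 2020-08-02 lunch row gets 100.0 (only the 2020-08-01 lunch row is at least a day older and inside the window), and rows with no qualifying history get NaN. The recipe builds one such column for every combination of the event columns and every averaging interval, with the column name encoding both.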