@@ -100,6 +100,13 @@ def load_data(file_type):
100
100
#Print the schema and count the number of columns in the test data
101
101
len (test_data .columns ), test_data .printSchema ()
102
102
103
+ #rename the target column 'QuoteConversion_Flag' to 'label' in the train data
104
+ df = df .withColumnRenamed ('QuoteConversion_Flag' ,'label' )
105
+ #rename Id number ('QuoteNumber') to 'Id' in data train
106
+ df = df .withColumnRenamed ('QuoteNumber' ,'Id' )
107
+
108
+ #rename Id number ('QuoteNumber') to 'Id' in data test
109
+ test_data = test_data .withColumnRenamed ('QuoteNumber' ,'Id' )
103
110
104
111
#drop column Original_Quote_Date from data train
105
112
df_final = df .drop ('Original_Quote_Date' )
@@ -605,6 +612,15 @@ def Main_feature_engineering(df,df2):
605
612
b = feature_engineering (df2 )
606
613
return a ,b
607
614
615
+ #call function feature engineering
616
+ % time data2 , test2 = Main_feature_engineering (df_final , test_data )
617
+
618
+ #view result of feature engineering in data train
619
+ data2 .select ('Id' , 'features' ).show (5 )
620
+
621
+ #view result of feature engineering in data test
622
+ test2 .select ('Id' , 'features' ).show (5 )
623
+
608
624
#Split Data train to train and test
609
625
#Split df_final into train and test sets: 70% train and 30% test. Fix the seed at 24 so the random split stays the same on every run.
610
626
#the seed can be set to any value
0 commit comments