Skip to content

Commit 3a7abd8

Browse files
authored
Update Classification_Using _Pyspark.py
1 parent e250126 commit 3a7abd8

File tree

1 file changed

+16
-0
lines changed

1 file changed

+16
-0
lines changed

Classification_Using _Pyspark.py

+16
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,13 @@ def load_data(file_type):
100100
#Print the schema and count the number of columns in the test data
101101
len(test_data.columns), test_data.printSchema()
102102

103+
#rename the target column ('QuoteConversion_Flag') to 'label' in data train
104+
df = df.withColumnRenamed('QuoteConversion_Flag','label')
105+
#rename Id number ('QuoteNumber') to 'Id' in data train
106+
df = df.withColumnRenamed('QuoteNumber','Id')
107+
108+
#rename Id number ('QuoteNumber') to 'Id' in data test
109+
test_data = test_data.withColumnRenamed('QuoteNumber','Id')
103110

104111
#drop column Original_Quote_Date from data train
105112
df_final=df.drop('Original_Quote_Date')
@@ -605,6 +612,15 @@ def Main_feature_engineering(df,df2):
605612
b=feature_engineering(df2)
606613
return a,b
607614

615+
#call function feature engineering
616+
%time data2, test2=Main_feature_engineering(df_final, test_data)
617+
618+
#view result of feature engineering in data train
619+
data2.select('Id', 'features').show(5)
620+
621+
#view result of feature engineering in data test
622+
test2.select('Id', 'features').show(5)
623+
608624
#Split Data train to train and test
609625
#Split df_final into train and test sets: 70% train and 30% test. Set the seed to 24 so the random split does not change between runs.
610626
#the seed can be set to any value

0 commit comments

Comments
 (0)