app.py
import pandas as pd
import streamlit as st
import streamlit.components.v1 as components
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
# Cleaned Glue output and the original raw extract
bankdata_clean = pd.read_csv('run-1608626939481-part-r-00000', sep=',')
# Trim extreme balances to limit the influence of outliers
bankdata = bankdata_clean[(bankdata_clean['balance'] < 2000) & (bankdata_clean['balance'] > -1000)]
bankdata1 = pd.read_csv('bank2.csv', sep=',')
# Encoded sidebar answers: housing-loan and personal-loan flags
VAL = 0
VAL1 = 0
#--------------CREATE MODEL---SCALE-BALANCE CLASSES------------#
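# Note (assumption, not in the original app): on classic Streamlit releases the
# model retrains on every widget interaction; decorating make_model with
# @st.cache(allow_output_mutation=True) would cache the fitted pipeline instead.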
def make_model(bankdata):
    # Column 2 of the Glue output holds the Term_deposit target;
    # columns 0, 1, 3, 4 hold the features
    final_data_features = bankdata.iloc[:, [0, 1, 3, 4]]
    final_data_target = bankdata.iloc[:, 2]
    X_train, X_test, y_train, y_test = train_test_split(
        final_data_features, final_data_target,
        random_state=1, shuffle=True, test_size=0.1)
    # Balance the classes: oversample the minority with SMOTE,
    # then undersample the majority
    over = SMOTE()
    under = RandomUnderSampler()
    steps = [('o', over), ('u', under)]
    SAMPLER = ImbPipeline(steps=steps)
    X_SMOTE, y_SMOTE = SAMPLER.fit_resample(X_train, y_train)
    # Scale all four features, then fit a random forest
    pre_process = ColumnTransformer(
        [('scale', StandardScaler(), ['housing_loan', 'personal_loan', 'age', 'balance'])])
    RF = Pipeline([('PR', pre_process), ('RANDOM_FOREST', RandomForestClassifier(random_state=0))])
    RF_MODEL = RF.fit(X_SMOTE, y_SMOTE)
    return RF_MODEL
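# A minimal evaluation sketch (assumption: not part of the original app).
# Because make_model uses random_state=1 for its split, re-creating the same
# split here recovers the identical held-out 10% for scoring. The helper name
# evaluate_model is illustrative, not an existing function in this repo.
def evaluate_model(bankdata):
    from sklearn.metrics import accuracy_score
    features = bankdata.iloc[:, [0, 1, 3, 4]]
    target = bankdata.iloc[:, 2]
    _, X_test, _, y_test = train_test_split(
        features, target, random_state=1, shuffle=True, test_size=0.1)
    model = make_model(bankdata)  # trains on the matching 90% split
    return accuracy_score(y_test, model.predict(X_test))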
#---------------------LAYOUT----------------------------#
st.title('CUSTOMER CLASSIFICATION')
st.text('''The data is related to the direct marketing campaigns of a
Portuguese banking institution. The marketing campaigns were based
on phone calls. Often, more than one contact with the same client
was required in order to assess whether the product (a bank term
deposit) would be subscribed ('yes') or not ('no').''')
st.subheader('To view the entire dataset with the original features, the exploratory data analysis, feature selection, adjustments for class imbalance, etc., see the')
components.html("""<a href="https://nbviewer.jupyter.org/github/savio0694/BANK-CUSTOMER-CLASSIFICATION-WEBAPP/blob/main/customer-classification.ipynb" target="_blank">JUPYTER NOTEBOOK</a> """)
st.text('')
st.subheader('A sample of raw data')
st.dataframe(bankdata1.head(10))
st.subheader('The raw data is initially kept in the "in" folder of an Amazon S3 bucket. Amazon S3 provides a simple web-services interface that you can use to store and retrieve any amount of data, at any time, from anywhere on the web.')
st.image('https://github.com/savio0694/BANK-CUSTOMER-CLASSIFICATION-WEBAPP/blob/main/images/Capture2.PNG?raw=true')
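# Hedged aside: the app reads a local copy of the Glue output, but the same
# file could be pulled straight from S3 with boto3. The bucket and key below
# are assumptions inferred from the Glue script shown further down:
# import boto3
# boto3.client('s3').download_file(
#     'customers123', 'out/run-1608626939481-part-r-00000',
#     'run-1608626939481-part-r-00000')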
st.subheader('The raw data is read and transformed using AWS Glue. AWS Glue is a serverless data integration service that makes it easy to discover, prepare, and combine data for analytics, machine learning, and application development.')
st.image('https://github.com/savio0694/BANK-CUSTOMER-CLASSIFICATION-WEBAPP/blob/main/images/Capture6.png?raw=true')
st.image('https://github.com/savio0694/BANK-CUSTOMER-CLASSIFICATION-WEBAPP/blob/main/images/Capture3.PNG?raw=true')
st.image('https://github.com/savio0694/BANK-CUSTOMER-CLASSIFICATION-WEBAPP/blob/main/images/Capture1.PNG?raw=true')
st.subheader('We create a crawler to crawl the S3 bucket and define an ETL job to run our extract, transform, load (ETL) Spark script, given below. Apache Spark is a unified analytics engine for big data processing, with built-in modules for streaming, SQL, machine learning, and graph processing.')
st.text('''#########################################
### IMPORT LIBRARIES AND SET VARIABLES
#########################################
#Import pyspark modules
from pyspark.context import SparkContext
import pyspark.sql.functions as f
from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
#Import glue modules
from awsglue.utils import getResolvedOptions
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.job import Job
#Initialize contexts and session
spark_context = SparkContext.getOrCreate()
glue_context = GlueContext(spark_context)
session = glue_context.spark_session
#Parameters
glue_db = "db1"
glue_tbl = "in"
s3_write_path = "s3://customers123/out/"
#########################################
### EXTRACT (READ DATA)
#########################################
dynamic_frame_read = glue_context.create_dynamic_frame.from_catalog(database = glue_db, table_name = glue_tbl)
#Convert dynamic frame to data frame to use standard pyspark functions
data = dynamic_frame_read.toDF()
#########################################
### TRANSFORM (MODIFY DATA)
#########################################
df=data.select('age','balance','housing','loan','y')
indexers=[StringIndexer(inputCol=columns,outputCol=columns+'-out') for columns in ['housing','loan','y']]
pipeline=Pipeline(stages=indexers)
df=pipeline.fit(df).transform(df)
df=df.select('age','balance','housing-out','loan-out','y-out')
df=df.withColumnRenamed('housing-out','housing_loan') \
.withColumnRenamed('loan-out','personal_loan') \
.withColumnRenamed('y-out','Term_deposit')
#df = df.filter((df.balance > -1000) & (df.balance < 1800))
#########################################
### LOAD (WRITE DATA)
#########################################
#Convert back to dynamic frame
dynamic_frame_write = DynamicFrame.fromDF(df, glue_context, "dynamic_frame_write")
#Write data back to S3
glue_context.write_dynamic_frame.from_options(
frame = dynamic_frame_write,
connection_type = "s3",
connection_options = {
"path": s3_write_path,
},
format = "csv"
)'''
)
st.subheader('The Spark job creates the output file in the "out" folder in S3')
st.image('https://github.com/savio0694/BANK-CUSTOMER-CLASSIFICATION-WEBAPP/blob/main/images/Capture5.PNG?raw=true')
st.subheader('A sample of the transformed data (final features and target)')
st.dataframe(bankdata.head(8))
st.subheader('FINAL RESULT AFTER RUNNING THE RandomForestClassifier')
housing = st.sidebar.radio(
    "Does the customer have a housing loan?",
    ('YES', 'NO'))
if housing == 'YES':
    VAL = 1
else:
    VAL = 0
personal = st.sidebar.radio(
    "Does the customer have a personal loan?",
    ('YES', 'NO'))
if personal == 'YES':
    VAL1 = 1
else:
    VAL1 = 0
age = st.sidebar.slider('How old is the customer?', 18, 100, 25)
balance = st.sidebar.slider("What is the customer's balance?", -500, 2000, 250)
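# Hedged style note (not in the original app): the two if/else blocks above
# could be collapsed into a single lookup, e.g.
# VAL = {'YES': 1, 'NO': 0}[housing]
# VAL1 = {'YES': 1, 'NO': 0}[personal]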
#---------------TRAIN MODEL--------------------------#
MODEL = make_model(bankdata)
#--------------MAKE-TEST_DATAFRAME-------------#
def make_frame(VAL, VAL1, age, balance):
    # Assemble a single-row frame matching the training feature names
    data = {
        "housing_loan": VAL,
        "personal_loan": VAL1,
        "age": age,
        "balance": balance
    }
    df = pd.DataFrame(data, index=[1])
    return df
df = make_frame(VAL, VAL1, age, balance)
PREDICTION = MODEL.predict(df)
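# Hedged extra (not in the original layout): RandomForestClassifier exposes
# predict_proba, so the pipeline can also report a confidence score for the
# positive (term deposit) class.
PROBA = MODEL.predict_proba(df)[0]
st.text('Estimated probability of a term deposit: {:.2f}'.format(PROBA[1]))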
#---FINAL DECISION-------#
def final_decision(PREDICTION):
    # Class 0 = no term deposit, class 1 = term deposit
    if PREDICTION[0] == 0:
        return "The customer is UNLIKELY to subscribe to a term deposit"
    else:
        return "The customer is LIKELY to subscribe to a term deposit"

st.title(final_decision(PREDICTION))