# Imported packages
import numpy as np
import pandas as pd
from math import log
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
"""Stage 1: Sigmoid function
Description
In this project, we will work on a classification algorithm that makes
predictions when a dependent variable assumes discrete values. Logistic
regression is arguably the simplest solution. In the case of binary
classification (class 0 or class 1), it uses a sigmoid function to estimate how
likely an observation belongs to class 1.
we will work with the Wisconsin Breast Cancer Dataset from the sklearn library.
We also want to standardize the features as they are measured in different
units using Z-standardization
Objectives
1 - Create the CustomLogisticRegression class
2 - Create the __init__ method
3 - Create the sigmoid method
4 - Create the predict_proba method
"""
"""Stage 2: Gradient descent with MSE
Description
In this stage, we need to estimate the coef_ (weight) values by gradient descent
on the Mean squared error cost function. Gradient descent is an optimization
technique for finding the local minimum of a cost function by first-order
differentiating. To be precise, we're going to implement the Stochastic
gradient descent (SGD).
Objectives
1 - Implement the fit_mse method
2 - Implement the predict method
"""
"""Stage 3: Log-Loss
Description
The Mean squared error cost function produces a non-convex graph with the local
and global minimums when applied to a sigmoid function. If a weight value is
close to a local minimum, gradient descent minimizes the cost function by the
local (not global) minimum. This presents grave limitations to the Mean squared
error cost function if we apply it to binary classification tasks. The Log-loss
cost function may help to overcome this issue.
Objectives
Implement the fit_log_loss method in class CustomLogisticRegression
"""
class CustomLogisticRegression:
"""A simple logistic regression model."""
    def __init__(self, fit_intercept=True, l_rate=0.01, n_epoch=100):
        self.fit_intercept = fit_intercept
        self.l_rate = l_rate
        self.n_epoch = n_epoch
        self.coef_ = None  # learned weights, set by the fit_* methods
        self.epoch = []  # one list of per-observation errors per epoch
    def sigmoid(self, t):
        """Apply the sigmoid (logistic) function, which maps the linear
        combination <t> to a probability in the open interval (0, 1)."""
        return 1 / (1 + np.exp(-t))
    def predict_proba(self, row, coef_):
        """Predict the probability that <row> belongs to class 1, given the
        weights <coef_> for each feature."""
        if self.fit_intercept:
            # coef_[0] is the intercept; the remaining weights pair up with
            # the features
            t = np.dot(row, coef_[1:]) + coef_[0]
        else:
            t = np.dot(row, coef_)
        return self.sigmoid(t)
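    # Illustrative usage of predict_proba with hypothetical numbers, assuming
    # fit_intercept=True (so coef_[0] is the intercept):
    #   row = np.array([0.1, -0.2, 0.3]); coef_ = np.array([0.5, 1.0, -1.0, 2.0])
    #   predict_proba(row, coef_)
    #   == sigmoid(0.5 + 1.0 * 0.1 + (-1.0) * (-0.2) + 2.0 * 0.3)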
    def fit_mse(self, X_train, y_train):
        """Estimate the optimal weight values with stochastic gradient
        descent on the Mean squared error cost function and store them in
        <self.coef_>. All weight values start at zero.
        """
        # Number of weights: one per feature, plus one for the intercept
        if self.fit_intercept:
            count = X_train.shape[1] + 1
        else:
            count = X_train.shape[1]
        # Initialize the weights
        coef_ = np.zeros(count)
        # Number of training rows
        N = len(X_train)
        # Training loop: one SGD pass over the training set per epoch
        for _ in range(self.n_epoch):
            errors = []
            for i, (_, row) in enumerate(X_train.iterrows()):
                y_hat = self.predict_proba(row, coef_)
                residual = y_hat - y_train.iloc[i]
                # Chain-rule factor shared by all weight updates
                gradient = residual * y_hat * (1 - y_hat)
                if self.fit_intercept:
                    for ind, value in enumerate(row, start=1):
                        coef_[ind] -= self.l_rate * gradient * value
                    coef_[0] -= self.l_rate * gradient
                else:
                    for ind, value in enumerate(row):
                        coef_[ind] -= self.l_rate * gradient * value
                # Per-observation contribution to the epoch's MSE
                errors.append(float(residual ** 2 / N))
            self.epoch.append(errors)
        self.coef_ = coef_
    def fit_log_loss(self, X_train, y_train):
        """Estimate the optimal weight values with stochastic gradient
        descent on the Log-loss cost function and store them in
        <self.coef_>. All weight values start at zero.
        """
        # Number of weights: one per feature, plus one for the intercept
        if self.fit_intercept:
            count = X_train.shape[1] + 1
        else:
            count = X_train.shape[1]
        # Initialize the weights
        coef_ = np.zeros(count)
        # Number of training rows
        N = len(X_train)
        # Training loop: one SGD pass over the training set per epoch
        for _ in range(self.n_epoch):
            errors = []
            for i, (_, row) in enumerate(X_train.iterrows()):
                y_hat = self.predict_proba(row, coef_)
                residual = y_hat - y_train.iloc[i]
                if self.fit_intercept:
                    for ind, value in enumerate(row, start=1):
                        coef_[ind] -= self.l_rate * residual * value / N
                    coef_[0] -= self.l_rate * residual / N
                else:
                    for ind, value in enumerate(row):
                        coef_[ind] -= self.l_rate * residual * value / N
                # Per-observation contribution to the epoch's Log-loss
                error = -(y_train.iloc[i] * log(y_hat)
                          + (1 - y_train.iloc[i]) * log(1 - y_hat)) / N
                errors.append(float(error))
            self.epoch.append(errors)
        self.coef_ = coef_
    def predict(self, X_test, cut_off=0.5):
        """After the optimal weight values have been determined with
        self.fit_mse() or self.fit_log_loss() and <self.coef_> has been
        updated, predict whether each observation belongs to class 1 or
        class 0. Predictions can only take two values: 0 or 1.
        """
        predictions = self.predict_proba(X_test.to_numpy(), self.coef_)
        predictions[predictions >= cut_off] = 1
        predictions[predictions < cut_off] = 0
        return predictions
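# A tiny illustrative check of the cut-off logic used in predict(), with
# hypothetical probabilities rather than model output: under the default
# cut_off=0.5, the probabilities 0.3, 0.5 and 0.9 map to classes 0, 1 and 1.
_demo_probs = np.array([0.3, 0.5, 0.9])
assert np.array_equal(np.where(_demo_probs >= 0.5, 1, 0), [0, 1, 1])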
"""Stage 4: Compare between models
Description
In previous stages, we have successfully carried out the Stochastic gradient
descent on the Mean squared error and Log-loss cost functions.
At this stage, we need to train three models:
1 - Our implementation of logistic regression with the fit_mse cost function
2 - The same logistic regression with the fit_log_loss cost function
3 - The sklearn logistic regression algorithm
Objectives
1 - Load the Breast Cancer Wisconsin dataset. Select worst concave points,
worst perimeter, and worst radius as features and target as the target variable
2 - Standardize X
3 - Split the dataset including the target variable into training and test sets.
Set train_size=0.8 and random_state=43.
4 - Fit a model with the training set using the fit_log_loss method
5 - Fit a model with the training set using the fit_mse method
6 - Import LogisticRegression from sklearn.linear_model and fit it with the
training set
7 - Determine the error values during the first and the last epoch of training
custom logistic regression for fit_mse method
8 - Repeat the same operation for fit_log_loss method
9 - Predict y_hat values for the test set with all three models
10 - Calculate the accuracy scores for the test set for all models
11 - Print the accuracy scores of all models and the errors from the first and
the last epochs of the training custom models as a Python dictionary.
Use the following parameters for all three models:
n_epoch = 1000 (only for the two custom models)
fit_intercept = True
l_rate = 0.01
Additionally, answer the following questions:
1 - What is the minimum MSE value for the first epoch?
2 - What is the minimum MSE value for the last epoch?
3 - What is the maximum Log-loss value for the first epoch?
4 - What is the maximum Log-loss value for the last epoch?
5 - Has the range of the MSE values expanded or narrowed?
6 - Has the range of the Log-loss values expanded or narrowed?
"""
# Load the dataset; copy the selected columns so that standardizing them
# does not modify a slice of the original frame (and does not trigger a
# SettingWithCopyWarning)
data = load_breast_cancer(as_frame=True)
X = data.data[['worst concave points', 'worst perimeter', 'worst radius']].copy()
y = data.target
# Standardize X (z-score: subtract the mean, divide by the standard deviation)
for feature in X.columns:
    X[feature] = (X[feature] - X[feature].mean()) / X[feature].std()
# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,
                                                    random_state=43)
# Fit a model for each cost function method
log_loss_model = CustomLogisticRegression(fit_intercept=True, l_rate=0.01,
n_epoch=1000)
log_loss_model.fit_log_loss(X_train, y_train)
mse_model = CustomLogisticRegression(fit_intercept=True, l_rate=0.01,
n_epoch=1000)
mse_model.fit_mse(X_train, y_train)
# Fit a model using sklearn
sklearn_model = LogisticRegression(fit_intercept=True)
sklearn_model.fit(X_train, y_train)
# Determine the error values for the first and the last epoch of training
# for the fit_mse and fit_log_loss models
mse_first_epoch_error = mse_model.epoch[0]
mse_last_epoch_error = mse_model.epoch[-1]
log_loss_first_epoch_error = log_loss_model.epoch[0]
log_loss_last_epoch_error = log_loss_model.epoch[-1]
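# Note: <model.epoch> holds one list per training epoch, and each inner list
# holds one error value per training row, so the four variables above are
# whole per-epoch error distributions rather than single numbers.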
# Predict y_hat values for the test set with all three models
mse_y_hat = mse_model.predict(X_test)
log_loss_y_hat = log_loss_model.predict(X_test)
sklearn_y_hat = sklearn_model.predict(X_test)
# Calculate the accuracy scores for the test set for all three models
mse_accuracy = accuracy_score(y_test.to_numpy(), mse_y_hat)
log_loss_accuracy = accuracy_score(y_test.to_numpy(), log_loss_y_hat)
sklearn_accuracy = accuracy_score(y_test.to_numpy(), sklearn_y_hat)
# Print the required dictionary
output_dict = {'mse_accuracy': mse_accuracy,
'logloss_accuracy': log_loss_accuracy,
'sklearn_accuracy': sklearn_accuracy,
'mse_error_first': mse_first_epoch_error,
'mse_error_last': mse_last_epoch_error,
'logloss_error_first': log_loss_first_epoch_error,
'logloss_error_last': log_loss_last_epoch_error}
print(output_dict, end='\n\n')
# Print the answers to the questions
min_mse_first = format(min(mse_first_epoch_error), '.5f')
min_mse_last = format(min(mse_last_epoch_error), '.5f')
max_logloss_first = format(max(log_loss_first_epoch_error), '.5f')
max_logloss_last = format(max(log_loss_last_epoch_error), '.5f')
print(f"""Answers to the questions:
1) {min_mse_first}
2) {min_mse_last}
3) {max_logloss_first}
4) {max_logloss_last}
5) expanded
6) expanded
""")