Breast_Cancer_Tumor_Prediction.py
'''
Building Binary Classification Models to Identify Malignant and Benign
Breast Cancer Tumors
Vanessa Gutierrez
'''
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def prepare_dataset(dataset_path):
    '''
    Read a comma separated text file where
    - the first field is the sample code ID number
    - the 11th (last) field is a class label: 2 for benign or 4 for malignant
    - the remaining fields are values for clump thickness, uniformity of cell size and
      shape, marginal adhesion, single epithelial cell size, bare nuclei, bland chromatin,
      normal nucleoli, and mitoses.
    Return two numpy arrays X and y where
    - X is two dimensional. X[i,:] is the ith tumor example; its row contains the
      nine data points listed above for that tumor
    - y is one dimensional. y[i] is the class label of the tumor described at X[i,:],
      set to 2 for benign or 4 for malignant
    @param dataset_path: full path of the dataset text file
    @return X, y
    '''
    # Read data from file, separated by commas. genfromtxt already returns a
    # numpy array; with dtype=int, any non-numeric field (such as the '?'
    # placeholders the Wisconsin file uses for missing values) is read as -1.
    X_All = np.genfromtxt(dataset_path, delimiter=",", dtype=int)
    # Drop the ID (column 0) and the class label (column 10), keeping the nine features
    X = X_All[:, 1:10]
    # The class label, 2 (benign) or 4 (malignant), is the last column
    Y = X_All[:, 10]
    return (X, Y)
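# Illustrative sketch (a hypothetical input line, matching the format described
# in the docstring above): a CSV line such as "1000025,5,1,1,1,2,1,3,1,1,2"
# would yield the feature row X[i,:] == [5, 1, 1, 1, 2, 1, 3, 1, 1]
# (ID and label stripped) and y[i] == 2 (benign).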
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def partition_dataset(X_All, Y_All):
    ''' Takes in the data set's X and Y, randomizes the order of the rows,
    assigns 80% of the data to training and 20% to testing, and returns
    the resulting training and testing X and Y arrays.
    @param X_All: an array with one tumor's data points per row,
        with rows in the same order as the data was provided.
    @param Y_All: an array with one tumor's classification, 2 for benign or
        4 for malignant, per row, with each row's classification corresponding
        to the data in the same row of X_All
    @return
        X_Train : randomized tumor records selected for the training set
        X_Test : randomized tumor records selected for the testing set
        Y_Train : the labels (2 or 4) corresponding to the records in X_Train
        Y_Test : the labels (2 or 4) corresponding to the records in X_Test
    '''
    # Get the total number of records
    n = len(X_All)
    # Calculate the number of records equal to 80% of the total; note the use of int
    n80 = int(n * .8)
    # Create a random permutation of the row indices 0..n-1
    randomOrder = np.random.permutation(n)
    # Shuffle the data, keeping the X and Y rows aligned with each other
    randomX = X_All[randomOrder]
    randomY = Y_All[randomOrder]
    # Split the shuffled data into training and testing sets 80:20
    X_Train = randomX[:n80]
    X_Test = randomX[n80:]
    Y_Train = randomY[:n80]
    Y_Test = randomY[n80:]
    return (X_Train, X_Test, Y_Train, Y_Test)
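# Worked example (assuming the full Wisconsin (Original) file of 699 records):
# n80 = int(699 * .8) = 559, so the split yields 559 training rows and 140 testing rows.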
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def build_NB_classifier(X_training, y_training):
    '''
    Build a Naive Bayes classifier based on the training set X_training, y_training.
    @param
        X_training: X_training[i,:] is the row containing the data for one tumor
        y_training: y_training[i] is the class label of the tumor whose data is at X_training[i,:]
    @return
        best_clf : the classifier with the highest validation accuracy built in this function
        best_validation_accuracy : that classifier's validation accuracy
        CV_results : the (accuracy, classifier) pairs from cross validation
    '''
    CV_results = []
    kf = KFold(n_splits=10)
    # Use k-fold splits to train and validate 10 different Naive Bayes classifiers
    for train, valid in kf.split(X_training):
        # Fit a fresh classifier on the training data from the current split.
        # (A new GaussianNB per fold is required: re-fitting one shared model object
        # would make every entry in CV_results reference the same, last-fitted classifier.)
        clf = GaussianNB().fit(X_training[train], y_training[train])
        # Record the validation accuracy together with the classifier
        CV_results.append((accuracy_score(y_training[valid], clf.predict(X_training[valid])), clf))
    best_validation_accuracy = 0
    # Keep the classifier with the best validation accuracy from the k-fold validation
    for result in CV_results:
        if result[0] > best_validation_accuracy:
            best_validation_accuracy = result[0]
            best_clf = result[1]
    return (best_clf, best_validation_accuracy, CV_results)
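# Note on fold sizes (a sketch, assuming the 559-row training set from the 80% split
# above): KFold(n_splits=10) yields ten (train, valid) index pairs, so each fold
# validates on 55 or 56 held-out rows and fits on the remaining ~503.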
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def build_DT_classifier(X_training, y_training):
    '''
    Build a Decision Tree classifier based on the training set X_training, y_training.
    @param
        X_training: X_training[i,:] is the row containing the data for one tumor
        y_training: y_training[i] is the class label of the tumor whose data is at X_training[i,:]
    @return
        best_clf : the classifier with the highest validation accuracy built in this function
        best_validation_accuracy : that classifier's validation accuracy
        CV_results : the (accuracy, classifier) pairs from cross validation
    '''
    CV_results = []
    kf = KFold(n_splits=10)
    # Use k-fold splits to train and validate 10 different Decision Tree classifiers
    for train, valid in kf.split(X_training):
        # Fit a fresh classifier per fold so each entry in CV_results is a distinct model
        clf = DecisionTreeClassifier().fit(X_training[train], y_training[train])
        # Record the validation accuracy together with the classifier
        CV_results.append((accuracy_score(y_training[valid], clf.predict(X_training[valid])), clf))
    best_validation_accuracy = 0
    # Keep the classifier with the best validation accuracy from the k-fold validation
    for result in CV_results:
        if result[0] > best_validation_accuracy:
            best_validation_accuracy = result[0]
            best_clf = result[1]
    return (best_clf, best_validation_accuracy, CV_results)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def build_NN_classifier(X_training, y_training):
    '''
    Build a Nearest Neighbors classifier based on the training set X_training, y_training.
    @param
        X_training: X_training[i,:] is the row containing the data for one tumor
        y_training: y_training[i] is the class label of the tumor whose data is at X_training[i,:]
    @return
        best_clf_K : the classifier with the highest validation accuracy built in this function
        best_accuracy_K : that classifier's validation accuracy
        best_K : the number of neighbors it uses
    '''
    kf = KFold(n_splits=10)
    NN_results = []
    # Use cross validation to find the best model for each odd k from 1 to 15
    # (stepping by 2 skips the even k's for k nearest neighbors)
    for i in range(1, 16, 2):
        CV_results = []
        # Use k-fold splits to train and validate 10 different kNN classifiers with i neighbors
        for train, valid in kf.split(X_training):
            # Fit a fresh classifier per fold so each entry in CV_results is a distinct model
            clf = KNeighborsClassifier(n_neighbors=i).fit(X_training[train], y_training[train])
            # Record the validation accuracy together with the classifier
            CV_results.append((accuracy_score(y_training[valid], clf.predict(X_training[valid])), clf))
        best_accuracy_cv = 0
        # Keep the most accurate classifier for i neighbors (highest validation accuracy)
        for result in CV_results:
            if result[0] > best_accuracy_cv:
                best_accuracy_cv = result[0]
                best_clf_cv = result[1]
        # Save the number of neighbors and the validation accuracy of that classifier
        NN_results += [(i, best_accuracy_cv, best_clf_cv)]
    # Find the "best of the best": among the most accurate classifiers for each
    # k value, keep the one with the highest validation accuracy
    best_accuracy_K = 0
    for K in NN_results:
        if K[1] > best_accuracy_K:
            best_accuracy_K = K[1]
            best_K = K[0]
            best_clf_K = K[2]
    return (best_clf_K, best_accuracy_K, best_K)
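# The candidate neighbor counts examined above are k = 1, 3, 5, 7, 9, 11, 13, 15;
# with binary labels (2 or 4), an odd k guarantees the majority vote cannot tie.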
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def build_SVM_classifier(X_training, y_training):
    '''
    Build a Support Vector Machine classifier based on the training set X_training, y_training.
    @param
        X_training: X_training[i,:] is the row containing the data for one tumor
        y_training: y_training[i] is the class label of the tumor whose data is at X_training[i,:]
    @return
        best_clf_K : the classifier with the highest validation accuracy built in this function
        best_validation_K : that classifier's validation accuracy
        best_K : the kernel it uses
    '''
    kf = KFold(n_splits=10)
    SVM_results = []
    # Use cross validation to find the best model for each kernel type
    for kernel in ('linear', 'rbf', 'sigmoid'):
        CV_results = []
        # Use k-fold splits to train and validate 10 different SVM classifiers for this kernel
        for train, valid in kf.split(X_training):
            # Fit a fresh classifier per fold so each entry in CV_results is a distinct model
            clf = SVC(kernel=kernel).fit(X_training[train], y_training[train])
            # Record the validation accuracy together with the classifier
            CV_results.append((accuracy_score(y_training[valid], clf.predict(X_training[valid])), clf))
        best_validation_accuracy_cv = 0
        # Keep the most accurate classifier for this kernel type (highest validation accuracy)
        for result in CV_results:
            if result[0] > best_validation_accuracy_cv:
                best_validation_accuracy_cv = result[0]
                best_clf_cv = result[1]
        # Save the kernel type and validation accuracy of that classifier
        SVM_results += [(kernel, best_validation_accuracy_cv, best_clf_cv)]
    # Find the "best of the best": among the most accurate classifiers for each
    # kernel type, keep the one with the highest validation accuracy
    best_validation_K = 0
    for K in SVM_results:
        if K[1] > best_validation_K:
            best_K = K[0]
            best_validation_K = K[1]
            best_clf_K = K[2]
    return (best_clf_K, best_validation_K, best_K)
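# For reference, the three kernels compared above are scikit-learn built-ins:
# 'linear' (no feature mapping), 'rbf' (Gaussian radial basis function), and
# 'sigmoid' (tanh-based); 'poly' is another built-in that could be added to the tuple.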
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
if __name__ == "__main__":
    print("Breast Cancer Tumor Prediction")
    print("Author: Vanessa Gutierrez")
    print("This program builds various binary classifiers to predict the diagnoses of breast cancer tumors as malignant or benign, using the following tumor measurements:")
    print("Clump Thickness, Uniformity of Cell Size, Uniformity of Cell Shape, Marginal Adhesion, Single Epithelial Cell Size, Bare Nuclei, Bland Chromatin, Normal Nucleoli and Mitoses.")
    print("Using the Breast Cancer Wisconsin (Original) Dataset found at: \nhttp://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Original%29\n")
    X, Y = prepare_dataset("Breast_Cancer_Wisconsin_Data.csv")
    X_Train, X_Test, Y_Train, Y_Test = partition_dataset(X, Y)
    # print(X_Train, X_Test, Y_Train, Y_Test)
    print("--------------------------------------------")
    print("NAIVE BAYES CLASSIFIER")
    # Create a Naive Bayes classifier and get its validation accuracy
    NB_clf, NB_validation_accuracy, NB_CV_results = build_NB_classifier(X_Train, Y_Train)
    print("Naive Bayes Classifier best accuracy on validation data in k-fold cross validation:", NB_validation_accuracy)
    # Run the Naive Bayes classifier on the testing data
    NB_testing_accuracy = accuracy_score(Y_Test, NB_clf.predict(X_Test))
    print("Naive Bayes Classifier Testing Data Accuracy: ", NB_testing_accuracy)
    print("--------------------------------------------")
    print("DECISION TREE CLASSIFIER")
    # Create a Decision Tree classifier and get its validation accuracy
    DT_clf, DT_validation_accuracy, DT_CV_results = build_DT_classifier(X_Train, Y_Train)
    print("Decision Tree Classifier best accuracy on validation data in k-fold cross validation:", DT_validation_accuracy)
    # Run the Decision Tree classifier on the testing data
    DT_testing_accuracy = accuracy_score(Y_Test, DT_clf.predict(X_Test))
    print("Decision Tree Classifier Testing Data Accuracy: ", DT_testing_accuracy)
    print("--------------------------------------------")
    print("K NEAREST NEIGHBORS (kNN) CLASSIFIER")
    # Create a K-Nearest Neighbors classifier and get its validation accuracy;
    # the third return value is the number of neighbors the best classifier uses
    NN_clf, NN_validation_accuracy, NN_best_K = build_NN_classifier(X_Train, Y_Train)
    print("K-Nearest Neighbor Classifier best accuracy on validation data in k-fold cross validation:", NN_validation_accuracy)
    print("Number of neighbors used by the best kNN classifier:", NN_best_K)
    # Run the K-Nearest Neighbor classifier on the testing data
    NN_testing_accuracy = accuracy_score(Y_Test, NN_clf.predict(X_Test))
    print("K-Nearest Neighbor Classifier Testing Data Accuracy: ", NN_testing_accuracy)
    print("--------------------------------------------")
    print("SUPPORT VECTOR MACHINE (SVM) CLASSIFIER")
    # Create an SVM classifier and get its validation accuracy;
    # the third return value is the kernel the best classifier uses
    SVM_clf, SVM_validation_accuracy, SVM_best_kernel = build_SVM_classifier(X_Train, Y_Train)
    print("Support Vector Machine Classifier best accuracy on validation data in k-fold cross validation:", SVM_validation_accuracy)
    print("Kernel used by the best SVM classifier:", SVM_best_kernel)
    # Run the SVM classifier on the testing data
    SVM_testing_accuracy = accuracy_score(Y_Test, SVM_clf.predict(X_Test))
    print("Support Vector Machine Testing Data Accuracy: ", SVM_testing_accuracy)