From da20b085b33531a7b4e81c75f8011e438ee9b596 Mon Sep 17 00:00:00 2001 From: Bikash Karmokar Date: Thu, 18 Nov 2021 22:41:07 +0600 Subject: [PATCH] Added liblinear solver in the LogisticRegression initialization. Added training accuracy and testing accuracy print in the function __VotingClassifier__(). In the main function called ensemble.__VotingClassifier__(). And finally, added requirements.txt file as I tested it with the latest scikit-learn version. Signed-off-by: Bikash Karmokar --- .gitignore | 141 +++++++++++++++++++++++++++++++++++++++++++++++ blending.py | 41 +++++++------- requirements.txt | Bin 0 -> 204 bytes stacking.py | 44 ++++++++------- voting.py | 41 ++++++++------ 5 files changed, 209 insertions(+), 58 deletions(-) create mode 100644 .gitignore create mode 100644 requirements.txt diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7c03df3 --- /dev/null +++ b/.gitignore @@ -0,0 +1,141 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# pycharm +.idea diff --git a/blending.py b/blending.py index fa9fd3d..24f73c7 100644 --- a/blending.py +++ b/blending.py @@ -9,6 +9,7 @@ from sklearn.naive_bayes import GaussianNB from sklearn.linear_model import LogisticRegression + class Ensemble: def __init__(self): self.x_train = None @@ -19,18 +20,19 @@ def __init__(self): def load_data(self): x, y = load_breast_cancer(return_X_y=True) self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(x, y, test_size=0.15, random_state=23) - self.x_train, self.x_val, self.y_train, self.y_val = train_test_split(self.x_train, self.y_train, test_size=0.3, random_state=23) - + self.x_train, self.x_val, self.y_train, self.y_val = train_test_split(self.x_train, self.y_train, test_size=0.3, + random_state=23) + def BlendingClassifier(self): # Define weak learners weak_learners = [('dt', DecisionTreeClassifier()), - ('knn', KNeighborsClassifier()), - ('rf', RandomForestClassifier()), - ('gb', GradientBoostingClassifier()), - ('gn', GaussianNB())] - - # Finaler learner or meta model + ('knn', KNeighborsClassifier()), + ('rf', RandomForestClassifier()), + ('gb', GradientBoostingClassifier()), + ('gn', GaussianNB())] + + # Final learner or meta model final_learner = LogisticRegression() train_meta_model = None @@ -38,12 +40,12 @@ def BlendingClassifier(self): # Start stacking for clf_id, clf in weak_learners: - + # Predictions for each classifier based on k-fold val_predictions, test_predictions = 
self.train_level_0(clf) - + # Stack predictions which will form - # the inputa data for the data model + # the input data for the data model if isinstance(train_meta_model, np.ndarray): train_meta_model = np.vstack((train_meta_model, val_predictions)) else: @@ -55,25 +57,24 @@ def BlendingClassifier(self): test_meta_model = np.vstack((test_meta_model, test_predictions)) else: test_meta_model = test_predictions - + # Transpose train_meta_model train_meta_model = train_meta_model.T # Transpose test_meta_model test_meta_model = test_meta_model.T - + # Training level 1 self.train_level_1(final_learner, train_meta_model, test_meta_model) - def train_level_0(self, clf): # Train with base x_train clf.fit(self.x_train, self.y_train) - + # Generate predictions for the holdout set (validation) # These predictions will build the input for the meta model val_predictions = clf.predict(self.x_val) - + # Generate predictions for original test set # These predictions will be used to test the meta model test_predictions = clf.predict(self.x_test) @@ -83,13 +84,13 @@ def train_level_0(self, clf): def train_level_1(self, final_learner, train_meta_model, test_meta_model): # Train is carried out with final learner or meta model final_learner.fit(train_meta_model, self.y_val) - + # Getting train and test accuracies from meta_model - print(f"Train accuracy: {final_learner.score(train_meta_model, self.y_val)}") + print(f"Train accuracy: {final_learner.score(train_meta_model, self.y_val)}") print(f"Test accuracy: {final_learner.score(test_meta_model, self.y_test)}") - + if __name__ == "__main__": ensemble = Ensemble() ensemble.load_data() - ensemble.BlendingClassifier() \ No newline at end of file + ensemble.BlendingClassifier() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..6daaea4630f13b1de276785ddac01a6755d453ab GIT binary patch literal 204 zcmZ9GOAdlS5JcbFgri_!FmdH!R807cVq{!!d9bGW#E?$+rm9}`G&XeH@YK2&P0rCa 
zBjLb{FHhbKo90YCnc8z#uGI!oF&;kzaG^9 literal 0 HcmV?d00001 diff --git a/stacking.py b/stacking.py index 0eb8efb..ee3a927 100644 --- a/stacking.py +++ b/stacking.py @@ -9,6 +9,7 @@ from sklearn.naive_bayes import GaussianNB from sklearn.linear_model import LogisticRegression + class Ensemble: def __init__(self): self.x_train = None @@ -20,17 +21,17 @@ def __init__(self): def load_data(self): x, y = load_breast_cancer(return_X_y=True) self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(x, y, test_size=0.3, random_state=23) - + def StackingClassifier(self): # Define weak learners weak_learners = [('dt', DecisionTreeClassifier()), - ('knn', KNeighborsClassifier()), - ('rf', RandomForestClassifier()), - ('gb', GradientBoostingClassifier()), - ('gn', GaussianNB())] - - # Finaler learner or meta model + ('knn', KNeighborsClassifier()), + ('rf', RandomForestClassifier()), + ('gb', GradientBoostingClassifier()), + ('gn', GaussianNB())] + + # Final learner or meta model final_learner = LogisticRegression() train_meta_model = None @@ -40,12 +41,12 @@ def StackingClassifier(self): for clf_id, clf in weak_learners: # Predictions for each classifier based on k-fold predictions_clf = self.k_fold_cross_validation(clf) - + # Predictions for test set for each classifier based on train of level 0 test_predictions_clf = self.train_level_0(clf) - + # Stack predictions which will form - # the inputa data for the data model + # the input data for the data model if isinstance(train_meta_model, np.ndarray): train_meta_model = np.vstack((train_meta_model, predictions_clf)) else: @@ -57,19 +58,18 @@ def StackingClassifier(self): test_meta_model = np.vstack((test_meta_model, test_predictions_clf)) else: test_meta_model = test_predictions_clf - + # Transpose train_meta_model train_meta_model = train_meta_model.T # Transpose test_meta_model test_meta_model = test_meta_model.T - + # Training level 1 self.train_level_1(final_learner, train_meta_model, test_meta_model) - def 
k_fold_cross_validation(self, clf): - + predictions_clf = None # Number of samples per fold @@ -87,14 +87,16 @@ def k_fold_cross_validation(self, clf): test = self.x_train[(batch_size * fold): (batch_size * (fold + 1)), :] batch_start = batch_size * fold batch_finish = batch_size * (fold + 1) - + # test & training samples for each fold iteration fold_x_test = self.x_train[batch_start:batch_finish, :] - fold_x_train = self.x_train[[index for index in range(self.x_train.shape[0]) if index not in range(batch_start, batch_finish)], :] + fold_x_train = self.x_train[[index for index in range(self.x_train.shape[0]) if + index not in range(batch_start, batch_finish)], :] # test & training targets for each fold iteration fold_y_test = self.y_train[batch_start:batch_finish] - fold_y_train = self.y_train[[index for index in range(self.x_train.shape[0]) if index not in range(batch_start, batch_finish)]] + fold_y_train = self.y_train[ + [index for index in range(self.x_train.shape[0]) if index not in range(batch_start, batch_finish)]] # Fit current classifier clf.fit(fold_x_train, fold_y_train) @@ -113,18 +115,18 @@ def train_level_0(self, clf): clf.fit(self.x_train, self.y_train) # Get predictions from full real test set y_pred = clf.predict(self.x_test) - + return y_pred def train_level_1(self, final_learner, train_meta_model, test_meta_model): # Train is carried out with final learner or meta model final_learner.fit(train_meta_model, self.y_train) # Getting train and test accuracies from meta_model - print(f"Train accuracy: {final_learner.score(train_meta_model, self.y_train)}") + print(f"Train accuracy: {final_learner.score(train_meta_model, self.y_train)}") print(f"Test accuracy: {final_learner.score(test_meta_model, self.y_test)}") - + if __name__ == "__main__": ensemble = Ensemble() ensemble.load_data() - ensemble.StackingClassifier() \ No newline at end of file + ensemble.StackingClassifier() diff --git a/voting.py b/voting.py index ab62aa1..46167c5 100644 --- 
a/voting.py +++ b/voting.py @@ -5,8 +5,8 @@ from sklearn.neighbors import KNeighborsClassifier from sklearn.linear_model import LogisticRegression -from sklearn.model_selection import GridSearchCV from sklearn.ensemble import VotingClassifier +from sklearn.metrics import accuracy_score class Ensemble: @@ -24,38 +24,38 @@ def load_data(self): def __Classifiers__(name=None): # See for reproducibility random_state = 23 - + if name == 'decision_tree': return DecisionTreeClassifier(random_state=random_state) if name == 'kneighbors': return KNeighborsClassifier() if name == 'logistic_regression': - return LogisticRegression(random_state=random_state) + return LogisticRegression(random_state=random_state, solver='liblinear') def __DecisionTreeClassifier__(self): - + # Decision Tree Classifier decision_tree = Ensemble.__Classifiers__(name='decision_tree') - + # Train Decision Tree decision_tree.fit(self.x_train, self.y_train) def __KNearestNeighborsClassifier__(self): - + # K-Nearest Neighbors Classifier knn = Ensemble.__Classifiers__(name='kneighbors') - + # Train K-Nearest Neighbos knn.fit(self.x_train, self.y_train) def __LogisticRegression__(self): - + # Decision Tree Classifier logistic_regression = Ensemble.__Classifiers__(name='logistic_regression') - + # Init Grid Search logistic_regression.fit(self.x_train, self.y_train) - + def __VotingClassifier__(self): # Instantiate classifiers @@ -64,15 +64,22 @@ def __VotingClassifier__(self): logistic_regression = Ensemble.__Classifiers__(name='logistic_regression') # Voting Classifier initialization - vc = VotingClassifier(estimators=[('decision_tree', decision_tree), - ('knn', knn), ('logistic_regression', - logistic_regression)], voting='soft') - - # Init Grid Search + vc = VotingClassifier(estimators=[('decision_tree', decision_tree), + ('knn', knn), ('logistic_regression', + logistic_regression)], voting='soft') + + # Fitting the vc model vc.fit(self.x_train, self.y_train) -if __name__ == "__main__": + # Getting train 
and test accuracies from meta_model + y_pred_train = vc.predict(self.x_train) + y_pred = vc.predict(self.x_test) + print(f"Train accuracy: {accuracy_score(self.y_train, y_pred_train)}") + print(f"Test accuracy: {accuracy_score(self.y_test, y_pred)}") + + +if __name__ == "__main__": ensemble = Ensemble() ensemble.load_data() - ensemble.__StackingClassifier__() \ No newline at end of file + ensemble.__VotingClassifier__()