From da20b085b33531a7b4e81c75f8011e438ee9b596 Mon Sep 17 00:00:00 2001
From: Bikash Karmokar <bksbd007@gmail.com>
Date: Thu, 18 Nov 2021 22:41:07 +0600
Subject: [PATCH] Added liblinear solver in the LogisticRegression
 initialization. Added training accuracy and testing accuracy printing in the
 function __VotingClassifier__(). The main block now calls
 ensemble.__VotingClassifier__(). And finally, added a requirements.txt file as
 I tested it with the latest scikit-learn version.

Signed-off-by: Bikash Karmokar <bksbd007@gmail.com>
---
 .gitignore       | 141 +++++++++++++++++++++++++++++++++++++++++++++++
 blending.py      |  41 +++++++-------
 requirements.txt | Bin 0 -> 204 bytes
 stacking.py      |  44 ++++++++-------
 voting.py        |  41 ++++++++------
 5 files changed, 209 insertions(+), 58 deletions(-)
 create mode 100644 .gitignore
 create mode 100644 requirements.txt

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..7c03df3
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,141 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# pycharm
+.idea
diff --git a/blending.py b/blending.py
index fa9fd3d..24f73c7 100644
--- a/blending.py
+++ b/blending.py
@@ -9,6 +9,7 @@
 from sklearn.naive_bayes import GaussianNB
 from sklearn.linear_model import LogisticRegression
 
+
 class Ensemble:
     def __init__(self):
         self.x_train = None
@@ -19,18 +20,19 @@ def __init__(self):
     def load_data(self):
         x, y = load_breast_cancer(return_X_y=True)
         self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(x, y, test_size=0.15, random_state=23)
-        self.x_train, self.x_val, self.y_train, self.y_val = train_test_split(self.x_train, self.y_train, test_size=0.3, random_state=23)
-    
+        self.x_train, self.x_val, self.y_train, self.y_val = train_test_split(self.x_train, self.y_train, test_size=0.3,
+                                                                              random_state=23)
+
     def BlendingClassifier(self):
 
         # Define weak learners
         weak_learners = [('dt', DecisionTreeClassifier()),
-                        ('knn', KNeighborsClassifier()),
-                        ('rf', RandomForestClassifier()),
-                        ('gb', GradientBoostingClassifier()),
-                        ('gn', GaussianNB())]
-        
-        # Finaler learner or meta model
+                         ('knn', KNeighborsClassifier()),
+                         ('rf', RandomForestClassifier()),
+                         ('gb', GradientBoostingClassifier()),
+                         ('gn', GaussianNB())]
+
+        # Final learner or meta model
         final_learner = LogisticRegression()
 
         train_meta_model = None
@@ -38,12 +40,12 @@ def BlendingClassifier(self):
 
         # Start stacking
         for clf_id, clf in weak_learners:
-            
+
             # Predictions for each classifier based on k-fold
             val_predictions, test_predictions = self.train_level_0(clf)
-            
+
             # Stack predictions which will form 
-            # the inputa data for the data model
+            # the input data for the meta model
             if isinstance(train_meta_model, np.ndarray):
                 train_meta_model = np.vstack((train_meta_model, val_predictions))
             else:
@@ -55,25 +57,24 @@ def BlendingClassifier(self):
                 test_meta_model = np.vstack((test_meta_model, test_predictions))
             else:
                 test_meta_model = test_predictions
-        
+
         # Transpose train_meta_model
         train_meta_model = train_meta_model.T
 
         # Transpose test_meta_model
         test_meta_model = test_meta_model.T
-        
+
         # Training level 1
         self.train_level_1(final_learner, train_meta_model, test_meta_model)
 
-
     def train_level_0(self, clf):
         # Train with base x_train
         clf.fit(self.x_train, self.y_train)
-        
+
         # Generate predictions for the holdout set (validation)
         # These predictions will build the input for the meta model
         val_predictions = clf.predict(self.x_val)
-        
+
         # Generate predictions for original test set
         # These predictions will be used to test the meta model
         test_predictions = clf.predict(self.x_test)
@@ -83,13 +84,13 @@ def train_level_0(self, clf):
     def train_level_1(self, final_learner, train_meta_model, test_meta_model):
         # Train is carried out with final learner or meta model
         final_learner.fit(train_meta_model, self.y_val)
-       
+
         # Getting train and test accuracies from meta_model
-        print(f"Train accuracy: {final_learner.score(train_meta_model,  self.y_val)}")
+        print(f"Train accuracy: {final_learner.score(train_meta_model, self.y_val)}")
         print(f"Test accuracy: {final_learner.score(test_meta_model, self.y_test)}")
-        
+
 
 if __name__ == "__main__":
     ensemble = Ensemble()
     ensemble.load_data()
-    ensemble.BlendingClassifier()
\ No newline at end of file
+    ensemble.BlendingClassifier()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6daaea4630f13b1de276785ddac01a6755d453ab
GIT binary patch
literal 204
zcmZ9GOAdlS5JcbFgri_!FmdH!R807cVq{!!d9bGW#E?$+rm9}`G&XeH@YK2&P0rCa
zBjLb{FHhbKo90YCnc8z#uGI!oF&<p`D6?`hV@1yF{O6pdxrNg1vN1RJ2PzRGx6~J%
O#;Y^v#*V^#tM>;kzaG^9

literal 0
HcmV?d00001

diff --git a/stacking.py b/stacking.py
index 0eb8efb..ee3a927 100644
--- a/stacking.py
+++ b/stacking.py
@@ -9,6 +9,7 @@
 from sklearn.naive_bayes import GaussianNB
 from sklearn.linear_model import LogisticRegression
 
+
 class Ensemble:
     def __init__(self):
         self.x_train = None
@@ -20,17 +21,17 @@ def __init__(self):
     def load_data(self):
         x, y = load_breast_cancer(return_X_y=True)
         self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(x, y, test_size=0.3, random_state=23)
-    
+
     def StackingClassifier(self):
 
         # Define weak learners
         weak_learners = [('dt', DecisionTreeClassifier()),
-                        ('knn', KNeighborsClassifier()),
-                        ('rf', RandomForestClassifier()),
-                        ('gb', GradientBoostingClassifier()),
-                        ('gn', GaussianNB())]
-        
-        # Finaler learner or meta model
+                         ('knn', KNeighborsClassifier()),
+                         ('rf', RandomForestClassifier()),
+                         ('gb', GradientBoostingClassifier()),
+                         ('gn', GaussianNB())]
+
+        # Final learner or meta model
         final_learner = LogisticRegression()
 
         train_meta_model = None
@@ -40,12 +41,12 @@ def StackingClassifier(self):
         for clf_id, clf in weak_learners:
             # Predictions for each classifier based on k-fold
             predictions_clf = self.k_fold_cross_validation(clf)
-            
+
             # Predictions for test set for each classifier based on train of level 0
             test_predictions_clf = self.train_level_0(clf)
-            
+
             # Stack predictions which will form 
-            # the inputa data for the data model
+            # the input data for the meta model
             if isinstance(train_meta_model, np.ndarray):
                 train_meta_model = np.vstack((train_meta_model, predictions_clf))
             else:
@@ -57,19 +58,18 @@ def StackingClassifier(self):
                 test_meta_model = np.vstack((test_meta_model, test_predictions_clf))
             else:
                 test_meta_model = test_predictions_clf
-        
+
         # Transpose train_meta_model
         train_meta_model = train_meta_model.T
 
         # Transpose test_meta_model
         test_meta_model = test_meta_model.T
-        
+
         # Training level 1
         self.train_level_1(final_learner, train_meta_model, test_meta_model)
 
-
     def k_fold_cross_validation(self, clf):
-        
+
         predictions_clf = None
 
         # Number of samples per fold
@@ -87,14 +87,16 @@ def k_fold_cross_validation(self, clf):
                 test = self.x_train[(batch_size * fold): (batch_size * (fold + 1)), :]
                 batch_start = batch_size * fold
                 batch_finish = batch_size * (fold + 1)
-            
+
             # test & training samples for each fold iteration
             fold_x_test = self.x_train[batch_start:batch_finish, :]
-            fold_x_train = self.x_train[[index for index in range(self.x_train.shape[0]) if index not in range(batch_start, batch_finish)], :]
+            fold_x_train = self.x_train[[index for index in range(self.x_train.shape[0]) if
+                                         index not in range(batch_start, batch_finish)], :]
 
             # test & training targets for each fold iteration
             fold_y_test = self.y_train[batch_start:batch_finish]
-            fold_y_train = self.y_train[[index for index in range(self.x_train.shape[0]) if index not in range(batch_start, batch_finish)]]
+            fold_y_train = self.y_train[
+                [index for index in range(self.x_train.shape[0]) if index not in range(batch_start, batch_finish)]]
 
             # Fit current classifier
             clf.fit(fold_x_train, fold_y_train)
@@ -113,18 +115,18 @@ def train_level_0(self, clf):
         clf.fit(self.x_train, self.y_train)
         # Get predictions from full real test set
         y_pred = clf.predict(self.x_test)
-        
+
         return y_pred
 
     def train_level_1(self, final_learner, train_meta_model, test_meta_model):
         # Train is carried out with final learner or meta model
         final_learner.fit(train_meta_model, self.y_train)
         # Getting train and test accuracies from meta_model
-        print(f"Train accuracy: {final_learner.score(train_meta_model,  self.y_train)}")
+        print(f"Train accuracy: {final_learner.score(train_meta_model, self.y_train)}")
         print(f"Test accuracy: {final_learner.score(test_meta_model, self.y_test)}")
-        
+
 
 if __name__ == "__main__":
     ensemble = Ensemble()
     ensemble.load_data()
-    ensemble.StackingClassifier()
\ No newline at end of file
+    ensemble.StackingClassifier()
diff --git a/voting.py b/voting.py
index ab62aa1..46167c5 100644
--- a/voting.py
+++ b/voting.py
@@ -5,8 +5,8 @@
 from sklearn.neighbors import KNeighborsClassifier
 from sklearn.linear_model import LogisticRegression
 
-from sklearn.model_selection import GridSearchCV
 from sklearn.ensemble import VotingClassifier
+from sklearn.metrics import accuracy_score
 
 
 class Ensemble:
@@ -24,38 +24,38 @@ def load_data(self):
     def __Classifiers__(name=None):
         # See for reproducibility
         random_state = 23
-        
+
         if name == 'decision_tree':
             return DecisionTreeClassifier(random_state=random_state)
         if name == 'kneighbors':
             return KNeighborsClassifier()
         if name == 'logistic_regression':
-            return LogisticRegression(random_state=random_state)
+            return LogisticRegression(random_state=random_state, solver='liblinear')
 
     def __DecisionTreeClassifier__(self):
-        
+
         # Decision Tree Classifier
         decision_tree = Ensemble.__Classifiers__(name='decision_tree')
-        
+
         # Train Decision Tree
         decision_tree.fit(self.x_train, self.y_train)
 
     def __KNearestNeighborsClassifier__(self):
-        
+
         # K-Nearest Neighbors Classifier
         knn = Ensemble.__Classifiers__(name='kneighbors')
-        
+
         # Train K-Nearest Neighbos
         knn.fit(self.x_train, self.y_train)
 
     def __LogisticRegression__(self):
-        
+
         # Decision Tree Classifier
         logistic_regression = Ensemble.__Classifiers__(name='logistic_regression')
-        
+
         # Init Grid Search
         logistic_regression.fit(self.x_train, self.y_train)
-    
+
     def __VotingClassifier__(self):
 
         # Instantiate classifiers
@@ -64,15 +64,22 @@ def __VotingClassifier__(self):
         logistic_regression = Ensemble.__Classifiers__(name='logistic_regression')
 
         # Voting Classifier initialization
-        vc = VotingClassifier(estimators=[('decision_tree', decision_tree), 
-                                        ('knn', knn), ('logistic_regression', 
-                                        logistic_regression)], voting='soft')
-        
-        # Init Grid Search
+        vc = VotingClassifier(estimators=[('decision_tree', decision_tree),
+                                          ('knn', knn), ('logistic_regression',
+                                                         logistic_regression)], voting='soft')
+
+        # Fitting the vc model
         vc.fit(self.x_train, self.y_train)
 
-if __name__ == "__main__":
+        # Getting train and test accuracies from the voting classifier
+        y_pred_train = vc.predict(self.x_train)
+        y_pred = vc.predict(self.x_test)
 
+        print(f"Train accuracy: {accuracy_score(self.y_train, y_pred_train)}")
+        print(f"Test accuracy: {accuracy_score(self.y_test, y_pred)}")
+
+
+if __name__ == "__main__":
     ensemble = Ensemble()
     ensemble.load_data()
-    ensemble.__StackingClassifier__()
\ No newline at end of file
+    ensemble.__VotingClassifier__()