Merge pull request #75 from wwu-mmll/feature/connectome_based_predictive_modeling

NilsWinter · web-flow · commit b3e29cd7f007 · 2024-11-04T14:22:24.000+01:00
Add CPM feature selection as model wrapper
diff --git a/examples/advanced/connectome_based_predictive_modeling_example.py b/examples/advanced/connectome_based_predictive_modeling_example.py
@@ -0,0 +1,34 @@
+"""
+Connectome-based predictive modeling
+
+CPM is a method described in the following Nature Protocols article: https://www.nature.com/articles/nprot.2016.178
+It has been used in a number of publications to predict behavior from connectivity data.
+CPM works similarly to a feature selection method. First, relevant edges (connectivity values) are identified through
+correlation analysis. Every edge is correlated with the predictive target. Only significant edges will be used in the
+subsequent steps. Next, the edge values for all significant positive and for all significant negative correlations are
+summed to create two new features. Lastly, these two features are used as input to a subsequent estimator.
+
+In this example, no connectivity data is used, but the method will still work.
+This example is just supposed to show how to use CPM as a feature selection and integration tool in PHOTONAI.
+"""
+
+from sklearn.datasets import load_breast_cancer
+from sklearn.model_selection import KFold
+
+from photonai import Hyperpipe, PipelineElement
+
+
+# No connectivity data is used; the breast cancer features stand in for edge values.
+X, y = load_breast_cancer(return_X_y=True)
+
+pipe = Hyperpipe("cpm_feature_selection_pipe", project_folder='./tmp',
+                 outer_cv=KFold(n_splits=5, shuffle=True, random_state=15),
+                 inner_cv=KFold(n_splits=5, shuffle=True, random_state=15),
+                 metrics=["balanced_accuracy"], best_config_metric="balanced_accuracy")
+
+# Candidate values for the correlation method and the p-value threshold of the CPM step.
+cpm_hyperparameters = {'corr_method': ['pearson', 'spearman'], 'p_threshold': [0.01, 0.05]}
+pipe += PipelineElement('CPMFeatureSelection', hyperparameters=cpm_hyperparameters)
+pipe += PipelineElement('LogisticRegression')
+
+pipe.fit(X, y)
diff --git a/photonai/base/registry/PhotonCore.json b/photonai/base/registry/PhotonCore.json
@@ -490,5 +490,9 @@
   "LocallyLinearEmbedding":[
     "sklearn.manifold.LocallyLinearEmbedding",
     "Transformer"
+  ],
+  "CPMFeatureSelection":[
+    "photonai.modelwrapper.cpm_feature_selection.CPMFeatureSelection",
+    "Estimator"
   ]
 }
diff --git a/photonai/modelwrapper/cpm_feature_selection.py b/photonai/modelwrapper/cpm_feature_selection.py
@@ -0,0 +1,139 @@
+import numpy as np
+from sklearn.base import BaseEstimator, TransformerMixin
+from scipy.stats import beta, spearmanr
+
+from photonai.photonlogger.logger import logger
+
+
+class CPMFeatureSelection(BaseEstimator, TransformerMixin):
+    """Feature selection using Connectome-Based Predictive Modeling (CPM).
+    Loosely based on this paper https://www.nature.com/articles/nprot.2016.178#Sec10
+
+    Correlate every feature (edge) with the target and keep the significant edges only. The values of the
+    significant positive and of the significant negative edges are summed separately into two new features.
+    """
+    _estimator_type = "transformer"
+
+    def __init__(self, p_threshold: float = .05, corr_method: str = 'pearson'):
+        """Store the hyperparameters and reset the fitted attributes.
+
+        Parameters:
+            p_threshold:
+                Upper bound for p_values.
+            corr_method:
+                Correlation coefficient method. Can be 'pearson' or 'spearman'.
+
+        Raises:
+            NotImplementedError: If corr_method is neither 'pearson' nor 'spearman'.
+        """
+        self.p_threshold = p_threshold
+        self.corr_method = corr_method
+        if corr_method not in ['pearson', 'spearman']:
+            raise NotImplementedError("corr_method has to be either 'pearson' or 'spearman'.")
+        # fitted state, populated by fit()
+        self.significant_edges = None
+        self.positive_edges = None
+        self.negative_edges = None
+        self.n_original_features = None
+
+    def fit(self, X: np.ndarray, y: np.ndarray):
+        """Calculate correlation coefficients between features of X and y.
+
+        Parameters:
+            X:
+                The input samples of shape [n_samples, n_original_features]
+            y:
+                The input targets of shape [n_samples] or [n_samples, 1]
+
+        Raises:
+            NotImplementedError: If corr_method is neither 'pearson' nor 'spearman'.
+        """
+        # corr_method may have been reassigned after __init__ (e.g. via set_params), so check it again
+        if self.corr_method not in ['pearson', 'spearman']:
+            raise NotImplementedError("corr_method has to be either 'pearson' or 'spearman'.")
+        corr = self._columnwise_pearson if self.corr_method == 'pearson' else self._columnwise_spearman
+
+        self.n_original_features = X.shape[1]
+        # flatten column-vector targets so that r and p are 1-D and usable as column masks
+        r, p = corr(X, np.ravel(y))
+        self.significant_edges = p < self.p_threshold
+        self.positive_edges = r > 0
+        self.negative_edges = r < 0
+        return self
+
+    @staticmethod
+    def _columnwise_pearson(X, y):
+        """Compute Pearson's correlation coefficient between y and every column of X efficiently.
+
+        :param X: ndarray of shape [n_samples, n_features]
+        :param y: ndarray of shape [n_samples]
+        :return: r_values: array of correlation coefficients
+                 p_values: array of corresponding p-values
+        """
+        n_samples = X.shape[0]
+        # z-score both inputs; the mean product of the z-scores is then Pearson's r
+        X = (X - X.mean(axis=0)) / X.std(axis=0)
+        y = (y - y.mean(axis=0)) / y.std(axis=0)
+        r_values = np.dot(X.T, y) / n_samples
+
+        # two-sided p-values from the beta distribution, as described in the scipy pearsonr documentation:
+        # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.pearsonr.html
+        dist = beta(n_samples / 2 - 1, n_samples / 2 - 1, loc=-1, scale=2)
+        p_values = 2 * dist.cdf(-np.abs(r_values))
+        return r_values, p_values
+
+    @staticmethod
+    def _columnwise_spearman(X, y):
+        """Compute Spearman's correlation coefficient and p-value between y and every column of X."""
+        # ToDo: make more efficient by not relying on for loop
+        n_features = X.shape[1]
+        r_values, p_values = np.zeros(n_features), np.zeros(n_features)
+        for i in range(n_features):
+            # unpack the result: the `.statistic` attribute is missing in older scipy releases
+            r_values[i], p_values[i] = spearmanr(X[:, i], y)
+        return r_values, p_values
+
+    def transform(self, X: np.ndarray) -> np.ndarray:
+        """Sum over significant positive and significant negative edges.
+
+        Parameters:
+            X:
+                The input samples of shape [n_samples, n_original_features]
+
+        Returns:
+            array of shape [n_samples, 2].
+        """
+        # `&`, not `==`: an edge is summed only if it is significant AND has the matching sign
+        positive = self.significant_edges & self.positive_edges
+        negative = self.significant_edges & self.negative_edges
+        return np.stack([np.sum(X[:, positive], axis=1), np.sum(X[:, negative], axis=1)], axis=1)
+
+    def inverse_transform(self, X: np.ndarray) -> np.ndarray:
+        """Reverse to original dimension.
+
+        Parameters:
+            X:
+                A single transformed sample of shape [2] or [1, 2].
+
+        Returns:
+            Array of shape [1, n_original_features]. The first value of X is inserted at every significant
+            positive edge, the second value at every significant negative edge and zeros everywhere else.
+
+        Raises:
+            ValueError: If X does not have exactly two features or contains more than one sample.
+        """
+        if len(X.shape) == 1:
+            X = X.reshape(1, -1)
+        if X.shape[1] != 2:
+            msg = "X needs to have 2 features (which correspond to the sum of positive and negative edges)."
+            logger.error(msg)
+            raise ValueError(msg)
+        if X.shape[0] > 1:
+            msg = "X can only contain one array with shape [1, 2]."
+            logger.error(msg)
+            raise ValueError(msg)
+
+        Xt = np.zeros((X.shape[0], self.n_original_features))
+        Xt[:, self.significant_edges & self.positive_edges] = X[:, 0]
+        Xt[:, self.significant_edges & self.negative_edges] = X[:, 1]
+        return Xt
diff --git a/test/modelwrapper_tests/test_cpm_feature_selection.py b/test/modelwrapper_tests/test_cpm_feature_selection.py
@@ -0,0 +1,85 @@
+import numpy as np
+
+from scipy.stats import pearsonr, spearmanr
+
+from sklearn.model_selection import KFold, ShuffleSplit
+from sklearn.datasets import load_breast_cancer, load_diabetes
+
+from photonai import Hyperpipe, PipelineElement
+from photonai.helper.photon_base_test import PhotonBaseTest
+
+from photonai.modelwrapper.cpm_feature_selection import CPMFeatureSelection
+
+
+class CPMFeatureSelectionTest(PhotonBaseTest):
+    """Tests for CPMFeatureSelection, used stand-alone and as an element of a Hyperpipe."""
+
+    @classmethod
+    def setUpClass(cls) -> None:
+        cls.file = __file__
+        super(CPMFeatureSelectionTest, cls).setUpClass()
+
+    def setUp(self):
+        super(CPMFeatureSelectionTest, self).setUp()
+        self.X_classif, self.y_classif = load_breast_cancer(return_X_y=True)
+        self.X_regr, self.y_regr = load_diabetes(return_X_y=True)
+        shared = dict(outer_cv=ShuffleSplit(test_size=0.2, n_splits=1, random_state=15),
+                      inner_cv=KFold(n_splits=3, shuffle=True, random_state=15),
+                      project_folder=self.tmp_folder_path)
+        self.pipe_classif = Hyperpipe("cpm_feature_selection_pipe_classif",
+                                      metrics=["accuracy"], best_config_metric="accuracy", **shared)
+        self.pipe_regr = Hyperpipe("cpm_feature_selection_pipe_regr", metrics=["mean_absolute_error"],
+                                   best_config_metric="mean_absolute_error", **shared)
+
+    def test_cpm_regression(self):
+        self.pipe_regr += PipelineElement('CPMFeatureSelection', hyperparameters={})
+        self.pipe_regr += PipelineElement('LinearRegression')
+        self.pipe_regr.fit(self.X_regr, self.y_regr)
+
+    def test_cpm_classification(self):
+        self.pipe_classif += PipelineElement('CPMFeatureSelection',
+                                             hyperparameters={'corr_method': ['pearson', 'spearman']})
+        self.pipe_classif += PipelineElement('LogisticRegression')
+        self.pipe_classif.fit(self.X_classif, self.y_classif)
+
+    def test_columnwise_correlation(self):
+        # the column-wise implementations have to agree with scipy for the first and the last feature
+        for cpm_corr_method, scipy_corr_method in [(CPMFeatureSelection._columnwise_pearson, pearsonr),
+                                                   (CPMFeatureSelection._columnwise_spearman, spearmanr)]:
+            r_values, p_values = cpm_corr_method(self.X_classif, self.y_classif)
+            for column in (0, -1):
+                expected = scipy_corr_method(self.X_classif[:, column], self.y_classif)
+                self.assertAlmostEqual(r_values[column], expected.statistic)
+                self.assertAlmostEqual(p_values[column], expected.pvalue)
+
+    def test_cpm_inverse(self):
+        # synthetic data with one clearly positive and one clearly negative edge, so both back-projected
+        # values must show up in X_back no matter which edges a real dataset happens to contain
+        rng = np.random.RandomState(15)
+        y = rng.normal(size=200)
+        X = rng.normal(size=(200, 6))
+        X[:, 0] += 5 * y
+        X[:, 1] -= 5 * y
+        cpm = PipelineElement('CPMFeatureSelection', hyperparameters={'corr_method': ['pearson']})
+        cpm.fit(X, y)
+        X_transformed, _, _ = cpm.transform(X)
+        X_back, _, _ = cpm.inverse_transform(np.asarray([3, -3]))
+        self.assertEqual(X_transformed.shape[1], 2)
+        self.assertEqual(X.shape[1], X_back.shape[1])
+        self.assertEqual(np.min(X_back), -3)
+        self.assertEqual(np.max(X_back), 3)
+        with self.assertRaises(ValueError):
+            cpm.inverse_transform(X_transformed)
+        with self.assertRaises(ValueError):
+            cpm.inverse_transform(X_transformed.T)
+
+    def test_wrong_corr_method(self):
+        with self.assertRaises(NotImplementedError):
+            PipelineElement('CPMFeatureSelection', corr_method='Pearsons')
+
+    def test_cpm_transform(self):
+        element = PipelineElement('CPMFeatureSelection', hyperparameters={})
+        element.fit(self.X_classif, self.y_classif)
+        X_transformed, _, _ = element.transform(self.X_classif)
+        self.assertEqual(X_transformed.shape[0], self.X_classif.shape[0])
+        self.assertEqual(X_transformed.shape[1], 2)

Original file line number	Diff line number	Diff line change
`@@ -490,5 +490,9 @@`
`490`	`490`	`"LocallyLinearEmbedding":[`
`491`	`491`	`"sklearn.manifold.LocallyLinearEmbedding",`
`492`	`492`	`"Transformer"`
	`493`	`+ ],`
	`494`	`+ "CPMFeatureSelection":[`
	`495`	`+ "photonai.modelwrapper.cpm_feature_selection.CPMFeatureSelection",`
	`496`	`+ "Estimator"`
`493`	`497`	`]`
`494`	`498`	`}`