Skip to content

Commit b3e29cd

Browse files
authored
Merge pull request #75 from wwu-mmll/feature/connectome_based_predictive_modeling
Add CPM feature selection as model wrapper
2 parents 7baa097 + 5a69f9b commit b3e29cd

File tree

4 files changed

+262
-0
lines changed

4 files changed

+262
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
"""
2+
Connectome-based predictive modeling
3+
4+
CPM is a method described in the following Nature Protocols article: https://www.nature.com/articles/nprot.2016.178
5+
It has been used in a number of publications to predict behavior from connectivity data.
6+
CPM works similar to a feature selection method. First, relevant edges (connectivity values) are identified through
7+
correlation analysis. Every edge is correlated with the predictive target. Only significant edges will be used in the
8+
subsequent steps. Next, the edge values for all significant positive and for all significant negative correlations are
9+
summed to create two new features. Lastly, these two features are used as input to another classifier.
10+
11+
In this example, no connectivity data is used, but the method will still work.
12+
This example is just supposed to show how to use CPM as feature selection and integration tool in PHOTONAI.
13+
"""
14+
15+
from sklearn.datasets import load_breast_cancer
16+
from sklearn.model_selection import KFold
17+
18+
from photonai import Hyperpipe, PipelineElement
19+
20+
21+
X, y = load_breast_cancer(return_X_y=True)
22+
23+
pipe = Hyperpipe("cpm_feature_selection_pipe",
24+
outer_cv=KFold(n_splits=5, shuffle=True, random_state=15),
25+
inner_cv=KFold(n_splits=5, shuffle=True, random_state=15),
26+
metrics=["balanced_accuracy"], best_config_metric="balanced_accuracy",
27+
project_folder='./tmp')
28+
29+
pipe += PipelineElement('CPMFeatureSelection', hyperparameters={'corr_method': ['pearson', 'spearman'],
30+
'p_threshold': [0.01, 0.05]})
31+
32+
pipe += PipelineElement('LogisticRegression')
33+
34+
pipe.fit(X, y)

photonai/base/registry/PhotonCore.json

+4
Original file line numberDiff line numberDiff line change
@@ -490,5 +490,9 @@
490490
"LocallyLinearEmbedding":[
491491
"sklearn.manifold.LocallyLinearEmbedding",
492492
"Transformer"
493+
],
494+
"CPMFeatureSelection":[
495+
"photonai.modelwrapper.cpm_feature_selection.CPMFeatureSelection",
496+
"Estimator"
493497
]
494498
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
import numpy as np
2+
from sklearn.base import BaseEstimator, TransformerMixin
3+
from scipy.stats import beta, spearmanr
4+
5+
from photonai.photonlogger.logger import logger
6+
7+
8+
class CPMFeatureSelection(BaseEstimator, TransformerMixin):
9+
"""Feature Selection using Connectome-Based Predictive Modeling.
10+
loosely based on this paper https://www.nature.com/articles/nprot.2016.178#Sec10
11+
12+
Correlate all features with target and select significant features only.
13+
Sum significant edges for positive correlations and negative correlations separately.
14+
"""
15+
_estimator_type = "transformer"
16+
17+
def __init__(self, p_threshold: float = .05, corr_method: str = 'pearson'):
18+
"""
19+
Initialize the object.
20+
21+
Parameters:
22+
p_threshold:
23+
Upper bound for p_values.
24+
corr_method:
25+
Correlation coefficient method. Can be 'pearson' or 'spearman'.
26+
27+
"""
28+
self.p_threshold = p_threshold
29+
self.corr_method = corr_method
30+
if corr_method not in ['pearson', 'spearman']:
31+
raise NotImplementedError("corr_method has to be either 'pearson' or 'spearman'.")
32+
33+
self.significant_edges = None
34+
self.positive_edges = None
35+
self.negative_edges = None
36+
self.n_original_features = None
37+
38+
def fit(self, X: np.ndarray, y: np.ndarray):
39+
"""Calculate correlation coefficients between features of X and y.
40+
41+
Parameters:
42+
X:
43+
The input samples of shape [n_samples, n_original_features]
44+
45+
y:
46+
The input targets of shape [n_samples, 1]
47+
48+
"""
49+
n_samples, self.n_original_features = X.shape
50+
51+
if self.corr_method == 'pearson':
52+
corr = self._columnwise_pearson
53+
elif self.corr_method == 'spearman':
54+
corr = self._columnwise_spearman
55+
else:
56+
corr = None
57+
58+
r, p = corr(X, y)
59+
self.significant_edges = p < self.p_threshold
60+
self.positive_edges = r > 0
61+
self.negative_edges = r < 0
62+
return self
63+
64+
@staticmethod
65+
def _columnwise_pearson(X, y):
66+
"""
67+
Compute Pearson's correlation coefficient between y and every column of X efficiently
68+
69+
:param X: ndarray
70+
:param y: ndarray
71+
:return: r_values: array of correlation coefficients
72+
p_values: array of corresponding p-values
73+
"""
74+
n_samples = X.shape[0]
75+
X = (X - X.mean(axis=0)) / X.std(axis=0)
76+
y = (y - y.mean(axis=0)) / y.std(axis=0)
77+
r_values = np.dot(X.T, y) / n_samples
78+
79+
# I used the p-value calculation described here
80+
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.pearsonr.html
81+
dist = beta(n_samples / 2 - 1, n_samples / 2 - 1, loc=-1, scale=2)
82+
p_values = 2 * dist.cdf(-np.abs(r_values))
83+
return r_values, p_values
84+
85+
@staticmethod
86+
def _columnwise_spearman(X, y):
87+
# ToDo: make more efficient by not relying on for loop
88+
n_features = X.shape[1]
89+
r_values, p_values = np.zeros(n_features), np.zeros(n_features)
90+
for i in range(n_features):
91+
corr = spearmanr(X[:, i], y)
92+
r_values[i], p_values[i] = corr.statistic, corr.pvalue
93+
return r_values, p_values
94+
95+
def transform(self, X: np.ndarray) -> np.ndarray:
96+
"""Sum over significant positive and significant negative edges.
97+
98+
Parameters:
99+
X
100+
The input samples of shape [n_samples, n_original_features]
101+
102+
Returns:
103+
array of shape [n_samples, 2].
104+
105+
"""
106+
return np.stack([np.sum(X[:, (self.significant_edges == self.positive_edges)], axis=1),
107+
np.sum(X[:, (self.significant_edges == self.negative_edges)], axis=1)], axis=1)
108+
109+
def inverse_transform(self, X: np.ndarray) -> np.ndarray:
110+
"""Reverse to original dimension.
111+
112+
Parameters:
113+
X:
114+
The input samples of shape [n_samples, 2].
115+
116+
Returns:
117+
Array of shape [1, n_original_features]
118+
with columns of zeros inserted where features haven't been included in the sum of positive or
119+
negative edges. First value of input is inserted where a significant positive edge had been identified.
120+
Second value of the input is inserted where a significant negative edge had been identified.
121+
122+
"""
123+
if len(X.shape) == 1:
124+
X = X.reshape(1, -1)
125+
126+
if X.shape[1] != 2:
127+
msg = "X needs to have 2 features (which correspond to the sum of positive and negative edges)."
128+
logger.error(msg)
129+
raise ValueError(msg)
130+
131+
if X.shape[0] > 1:
132+
msg = "X can only contain one array with shape [1, 2]."
133+
logger.error(msg)
134+
raise ValueError(msg)
135+
136+
Xt = np.zeros((X.shape[0], self.n_original_features))
137+
Xt[:, (self.significant_edges == self.positive_edges)] = X[:, 0]
138+
Xt[:, (self.significant_edges == self.negative_edges)] = X[:, 1]
139+
return Xt
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
import numpy as np
2+
3+
from scipy.stats import pearsonr, spearmanr
4+
5+
from sklearn.model_selection import KFold, ShuffleSplit
6+
from sklearn.datasets import load_breast_cancer, load_diabetes
7+
8+
from photonai import Hyperpipe, PipelineElement
9+
from photonai.helper.photon_base_test import PhotonBaseTest
10+
11+
from photonai.modelwrapper.cpm_feature_selection import CPMFeatureSelection
12+
13+
14+
class CPMFeatureSelectionTest(PhotonBaseTest):
15+
16+
@classmethod
17+
def setUpClass(cls) -> None:
18+
cls.file = __file__
19+
super(CPMFeatureSelectionTest, cls).setUpClass()
20+
21+
def setUp(self):
22+
super(CPMFeatureSelectionTest, self).setUp()
23+
self.X_classif, self.y_classif = load_breast_cancer(return_X_y=True)
24+
self.X_regr, self.y_regr = load_diabetes(return_X_y=True)
25+
self.pipe_classif = Hyperpipe("cpm_feature_selection_pipe_classif",
26+
outer_cv=ShuffleSplit(test_size=0.2, n_splits=1, random_state=15),
27+
inner_cv= KFold(n_splits=3, shuffle=True, random_state=15),
28+
metrics=["accuracy"], best_config_metric="accuracy",
29+
project_folder=self.tmp_folder_path)
30+
self.pipe_regr = Hyperpipe("cpm_feature_selection_pipe_regr",
31+
outer_cv=ShuffleSplit(test_size=0.2, n_splits=1, random_state=15),
32+
inner_cv= KFold(n_splits=3, shuffle=True, random_state=15),
33+
metrics=["mean_absolute_error"], best_config_metric="mean_absolute_error",
34+
project_folder=self.tmp_folder_path)
35+
36+
def test_cpm_regression(self):
37+
self.pipe_regr += PipelineElement('CPMFeatureSelection', hyperparameters={})
38+
self.pipe_regr += PipelineElement('LinearRegression')
39+
self.pipe_regr.fit(self.X_regr, self.y_regr)
40+
41+
def test_cpm_classification(self):
42+
self.pipe_classif += PipelineElement('CPMFeatureSelection',
43+
hyperparameters={'corr_method': ['pearson', 'spearman']})
44+
self.pipe_classif += PipelineElement('LogisticRegression')
45+
self.pipe_classif.fit(self.X_classif, self.y_classif)
46+
47+
def test_columnwise_correlation(self):
48+
for cpm_corr_method, scipy_corr_method in [(CPMFeatureSelection._columnwise_pearson, pearsonr),
49+
(CPMFeatureSelection._columnwise_spearman, spearmanr)]:
50+
r_values, p_values = cpm_corr_method(self.X_classif, self.y_classif)
51+
r_scipy_first = scipy_corr_method(self.X_classif[:, 0], self.y_classif)
52+
r_scipy_last = scipy_corr_method(self.X_classif[:, -1], self.y_classif)
53+
self.assertAlmostEqual(r_values[0], r_scipy_first.statistic)
54+
self.assertAlmostEqual(p_values[0], r_scipy_first.pvalue)
55+
self.assertAlmostEqual(r_values[-1], r_scipy_last.statistic)
56+
self.assertAlmostEqual(p_values[-1], r_scipy_last.pvalue)
57+
58+
def test_cpm_inverse(self):
59+
cpm = PipelineElement('CPMFeatureSelection',
60+
hyperparameters={'corr_method': ['pearson']})
61+
62+
cpm.fit(self.X_classif, self.y_classif)
63+
X_transformed, _, _ = cpm.transform(self.X_classif)
64+
X_back, _, _ = cpm.inverse_transform(np.asarray([3, -3]))
65+
self.assertEqual(X_transformed.shape[1], 2)
66+
self.assertEqual(self.X_classif.shape[1], X_back.shape[1])
67+
self.assertEqual(np.min(X_back), -3)
68+
self.assertEqual(np.max(X_back), 3)
69+
70+
with self.assertRaises(ValueError):
71+
cpm.inverse_transform(X_transformed)
72+
73+
with self.assertRaises(ValueError):
74+
cpm.inverse_transform(X_transformed.T)
75+
76+
def test_wrong_corr_method(self):
77+
with self.assertRaises(NotImplementedError):
78+
PipelineElement('CPMFeatureSelection', corr_method='Pearsons')
79+
80+
def test_cpm_transform(self):
81+
element = PipelineElement('CPMFeatureSelection', hyperparameters={})
82+
element.fit(self.X_classif, self.y_classif)
83+
X_transformed, _, _ = element.transform(self.X_classif)
84+
self.assertEqual(X_transformed.shape[0], self.X_classif.shape[0])
85+
self.assertEqual(X_transformed.shape[1], 2)

0 commit comments

Comments
 (0)