diff --git a/esda/__init__.py b/esda/__init__.py index cd15d5b0..8ada54a8 100644 --- a/esda/__init__.py +++ b/esda/__init__.py @@ -19,7 +19,7 @@ from .gamma import Gamma from .util import fdr from .smaup import Smaup -from .lee import Spatial_Pearson, Local_Spatial_Pearson +from .lee import Spatial_Pearson, Spatial_Pearson_Local from .silhouettes import (path_silhouette, boundary_silhouette, silhouette_alist, nearest_label) from . import adbscan diff --git a/esda/lee.py b/esda/lee.py index 51d0363d..4ea9b3b8 100644 --- a/esda/lee.py +++ b/esda/lee.py @@ -3,6 +3,103 @@ from sklearn.base import BaseEstimator from sklearn import preprocessing from sklearn import utils +from itertools import chain + +try: + from joblib import Parallel, delayed + _HAS_JOBLIB = True +except ModuleNotFoundError: + _HAS_JOBLIB = False + +class Pearson_Local(BaseEstimator): + """This splits the pearson's R into its individual site components""" + + def __init__(self, conditional_inference=False, permutations=999, n_jobs=-1): + self.permutations = permutations + self.conditional_inference = conditional_inference + self.n_jobs = n_jobs + + def fit(self, x, y=None): + if y is not None: + x,y = utils.check_X_y(x, y, ensure_2d=False, estimator=self) + X = numpy.column_stack((x, y)) + else: + X = utils.check_array(x, ensure_2d=True, ensure_min_features=2, estimator=self) + n,p = X.shape + Z = preprocessing.StandardScaler().fit_transform(X) + self.associations_ = self._statistic(Z) + + if y is not None: + self.associations_ = self.associations_[0,1] + + if (self.permutations is None) or (self.permutations < 1): + self.reference_distribution_ = None + self.significance_ = numpy.nan + return self + + if self.conditional_inference is not False: + if self.conditional_inference == 'x': + permuter = ((numpy.arange(n), numpy.random.permutation(n)) + for _ in range(self.permutations)) + elif self.conditional_inference == 'y': + permuter = ((numpy.random.permutation(n), numpy.arange(n)) + for _ in
range(self.permutations)) + elif self.conditional_inference == True: + permute_x = ((numpy.random.permutation(n), numpy.arange(n)) + for _ in range(self.permutations // 2)) + permute_y = ((numpy.arange(n), numpy.random.permutation(n)) + for _ in range(self.permutations // 2)) + permuter = chain(permute_x, permute_y) + else: + permuter = ((numpy.random.permutation(n), numpy.random.permutation(n)) + for _ in range(self.permutations)) + n_jobs = self.n_jobs + if not _HAS_JOBLIB and (n_jobs != 1): + warn('The joblib package is required to run parallel' + ' simulations for this model. Please install ' + ' joblib to enable parallel processing for simulations.') + n_jobs = 1 + if n_jobs == 1: + simulations = [self._statistic(numpy.column_stack((Z[rx, 0], + Z[ry, 1]))) + for rx,ry in permuter] + else: + simulations = Parallel(n_jobs=n_jobs)( + delayed(self._statistic)(numpy.column_stack((Z[rx, 0], + Z[ry, 1])) + ) + for rx,ry in permuter + ) + if self.conditional_inference == True: + rz = numpy.arctanh(simulations) + firsthalf, secondhalf = numpy.array_split(rz, 2) + from scipy.stats import ttest_rel + post_hoc_test = ttest_rel(firsthalf, secondhalf) + if post_hoc_test.pvalue < .01: + warn('The null hypothesis that permutations of X yield equivalent' + ' correlations to permutations of y is very unlikely given' + ' the permutations conducted (p = {p}). It is very likely' + ' that any inference based on the conditional permutation' + ' will be incorrect. Use conditional_inference=False in' + ' this case.'.format(p=post_hoc_test.pvalue)) + self.reference_distribution_ = numpy.row_stack(simulations).T + above = self.reference_distribution_ >= self.associations_.reshape(-1,1) + larger = above.sum(axis=1) + extreme = numpy.minimum(larger, self.permutations - larger) + self.significance_ = (extreme + 1.) / (self.permutations + 1.)
+ self.reference_distribution_ = self.reference_distribution_.T + return self + + + + + + @staticmethod + def _statistic(Z): + N,P = Z.shape + # is in shape n, p, p + return (Z.T * Z.T[:,None]) / N + class Spatial_Pearson(BaseEstimator): """Global Spatial Pearson Statistic""" @@ -72,8 +169,12 @@ def fit(self, x, y): self.connectivity.sum(axis=1)) if (self.permutations is None): + self.reference_distribution_ = None + self.significance_ = numpy.nan return self elif self.permutations < 1: + self.reference_distribution_ = None + self.significance_ = numpy.nan return self if self.permutations: @@ -93,7 +194,7 @@ def _statistic(Z,W): return (Z.T @ ctc @ Z) / (ones.T @ ctc @ ones) -class Local_Spatial_Pearson(BaseEstimator): +class Spatial_Pearson_Local(BaseEstimator): """Local Spatial Pearson Statistic""" def __init__(self, connectivity=None, permutations=999): @@ -218,6 +319,7 @@ def fit(self, x, y): self.reference_distribution_ = self.reference_distribution_.T else: self.reference_distribution_ = None + self.significance_ = numpy.nan return self @staticmethod diff --git a/tests/test_lee.py b/tests/test_lee.py new file mode 100644 index 00000000..18f4f798 --- /dev/null +++ b/tests/test_lee.py @@ -0,0 +1,84 @@ +import unittest +import libpysal +import geopandas +from libpysal.common import pandas, RTOL, ATOL +from esda
import lee +import numpy + + +PANDAS_EXTINCT = pandas is None + +class Lee_Tester(unittest.TestCase): + def setUp(self): + self.data = geopandas.read_file(libpysal.examples.get_path("columbus.shp")) + self.w = libpysal.weights.Queen.from_dataframe(self.data) + self.w.transform = 'r' + self.x = self.data[['HOVAL']].values + self.y = self.data[['CRIME']].values + + def test_global(self): + numpy.random.seed(2478879) + result = lee.Spatial_Pearson(connectivity=self.w.sparse).fit(self.x, self.y) + known = numpy.array([[ 0.30136527, -0.23625603], + [-0.23625603, 0.53512008]]) + numpy.testing.assert_allclose(known,result.association_, rtol=RTOL, atol=ATOL) + numpy.testing.assert_array_equal(result.reference_distribution_.shape, (999,2,2)) + first_rep = numpy.array([[ 0.22803705, -0.08053692], + [-0.08053692, 0.18897318]]) + + second_rep = numpy.array([[ 0.14179274, -0.06962692], + [-0.06962692, 0.13688337]]) + numpy.testing.assert_allclose(first_rep, result.reference_distribution_[0], + rtol=RTOL, atol=ATOL) + numpy.testing.assert_allclose(second_rep, result.reference_distribution_[1], + rtol=RTOL, atol=ATOL) + + known_significance = numpy.array([[0.125, 0.026], + [0.026, 0.001]]) + numpy.testing.assert_allclose(known_significance, result.significance_, + rtol=RTOL, atol=ATOL) + + def test_local(self): + numpy.random.seed(2478879) + result = lee.Spatial_Pearson_Local(connectivity=self.w.sparse).fit(self.x, self.y) + known_locals = numpy.array([ 0.10246023, -0.24169198, -0.1308714 , + 0.00895543, -0.16080899, -0.00950808, + -0.14615398, -0.0627634 , 0.00661232, + -0.42354628, -0.73121006, 0.02060548, + 0.05187356, 0.06515283, -0.64400723, + -0.37489818, -2.06573667, -0.10931854, + 0.50823848, -0.06338637, -0.10559429, + 0.03282849, -0.86618915, -0.62333825, + -0.40910044,-0.41866868, -0.00702983, + -0.4246288 , -0.52142507, -0.22481772, + 0.1931263 , -1.39355214, 0.02036755, + 0.22896308, -0.00240854, -0.30405211, + -0.66950406, -0.21481868, -0.60320158, + -0.38117303, 
-0.45584563, 0.32019362, + -0.02818729, -0.02214172, 0.05587915, + 0.0295999 , -0.78818135, 0.16854472, + 0.2378127 ]) + numpy.testing.assert_allclose(known_locals, result.associations_, + rtol=RTOL, atol=ATOL) + significances = numpy.array([0.154, 0.291, 0.358, 0.231, 0.146, + 0.335, 0.325, 0.388, 0.244, 0.111, + 0.019, 0.165, 0.136, 0.073, 0.014, + 0.029, 0.002, 0.376, 0.003, 0.265, + 0.449, 0.121, 0.072, 0.006, 0.036, + 0.06 , 0.355, 0.01 , 0.017, 0.168, + 0.022, 0.003, 0.217, 0.016, 0.337, + 0.137, 0.015, 0.128, 0.11 , 0.09 , + 0.168, 0.031, 0.457, 0.44 , 0.141, + 0.249, 0.158, 0.018, 0.031]) + numpy.testing.assert_allclose(significances, result.significance_, + rtol=RTOL, atol=ATOL) + +suite = unittest.TestSuite() +test_classes = [Lee_Tester] +for i in test_classes: + a = unittest.TestLoader().loadTestsFromTestCase(i) + suite.addTest(a) + +if __name__ == '__main__': + runner = unittest.TextTestRunner() + runner.run(suite)