From b35e0abe3f546e115172eb3d5cc2c94159cefd94 Mon Sep 17 00:00:00 2001
From: Daniel Lusk
Date: Thu, 25 May 2023 12:05:53 +0200
Subject: [PATCH 01/14] fix deprecated shapely syntax

---
 spacv/grid_builder.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/spacv/grid_builder.py b/spacv/grid_builder.py
index bb75782..984663a 100644
--- a/spacv/grid_builder.py
+++ b/spacv/grid_builder.py
@@ -1,9 +1,10 @@
-import numpy as np
 import geopandas as gpd
-from shapely.geometry import Polygon, asPolygon
+import numpy as np
 from matplotlib.collections import PolyCollection
+from shapely.geometry import Polygon
 from sklearn.neighbors import BallTree
-from .utils import convert_geodataframe, geometry_to_2d, convert_numpy
+
+from .utils import convert_geodataframe, convert_numpy, geometry_to_2d
 
 __all__ = [
     "construct_blocks",
@@ -109,7 +110,7 @@ def construct_square_grid(XYs, tiles_x, tiles_y):
     top_left = np.add(minx, np.multiply(rows, dx)), np.add(miny, np.multiply(columns+1, dy))
 
     polys = np.vstack([bottom_left, bottom_right, top_right, top_left]).reshape(4,2,-1)
-    polys = [asPolygon(polys[:,:,i]) for i in range(tiles_x*tiles_y)]
+    polys = [Polygon(polys[:,:,i]) for i in range(tiles_x*tiles_y)]
 
     grid = gpd.GeoDataFrame({'geometry':polys})
 

From a8aeb9f066c1bd76761e83d8b9f6864c6b1b9620 Mon Sep 17 00:00:00 2001
From: Daniel Lusk
Date: Thu, 25 May 2023 12:58:24 +0200
Subject: [PATCH 02/14] fix deprecated np type

---
 spacv/base_classes.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacv/base_classes.py b/spacv/base_classes.py
index 45609f0..6354cdd 100644
--- a/spacv/base_classes.py
+++ b/spacv/base_classes.py
@@ -71,7 +71,7 @@ def _remove_buffered_indices(self, XYs, test_indices, buffer_radius, geometry_bu
             return test_indices, train_exclude
         else:
             # Yield empty array because no training data removed in dead zone when buffer is zero
-            _ = np.empty([], dtype=np.int)
+            _ = np.empty([], dtype=int)
             return test_indices, _
 
     @abstractmethod

From b29761224ef795e9861428829956033c07031e28 Mon Sep 17 00:00:00 2001
From: Daniel Lusk
Date: Thu, 25 May 2023 12:59:02 +0200
Subject: [PATCH 03/14] fix error when no deadzones (zero buffer)

---
 spacv/base_classes.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/spacv/base_classes.py b/spacv/base_classes.py
index 6354cdd..eb76c2a 100644
--- a/spacv/base_classes.py
+++ b/spacv/base_classes.py
@@ -44,8 +44,11 @@ def split(self, XYs):
         indices = XYs.index.values
 
         for test_indices, train_excluded in self._iter_test_indices(XYs):
-            # Exclude the training indices within buffer
-            train_excluded = np.concatenate([test_indices, train_excluded])
+            if train_excluded.ndim and train_excluded.size:
+                # Exclude the training indices within buffer
+                train_excluded = np.concatenate([test_indices, train_excluded])
+            else:
+                train_excluded = test_indices
             train_index = np.setdiff1d(
                 np.union1d(
                     indices,

From 5571e8b485857aa17fc65484795537287a983b2c Mon Sep 17 00:00:00 2001
From: Daniel Lusk
Date: Thu, 25 May 2023 13:00:00 +0200
Subject: [PATCH 04/14] formatting

---
 spacv/base_classes.py | 9 ++++++---
 spacv/spacv.py        | 10 ++++++----
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/spacv/base_classes.py b/spacv/base_classes.py
index eb76c2a..e0b23e1 100644
--- a/spacv/base_classes.py
+++ b/spacv/base_classes.py
@@ -1,8 +1,11 @@
-from abc import ABC, abstractmethod, ABCMeta
-import numpy as np
+from abc import ABC, ABCMeta, abstractmethod
+
 import geopandas as gpd
+import numpy as np
 from sklearn.model_selection import
BaseCrossValidator -from .utils import convert_geoseries, convert_geodataframe + +from .utils import convert_geodataframe, convert_geoseries + class BaseSpatialCV(BaseCrossValidator, metaclass=ABCMeta): """ diff --git a/spacv/spacv.py b/spacv/spacv.py index 6f27581..6bc5df4 100644 --- a/spacv/spacv.py +++ b/spacv/spacv.py @@ -1,11 +1,13 @@ -import warnings import numbers -import numpy as np +import warnings + import geopandas as gpd +import numpy as np from sklearn.cluster import MiniBatchKMeans + from .base_classes import BaseSpatialCV -from .grid_builder import construct_blocks, assign_pt_to_grid -from .utils import geometry_to_2d, convert_geodataframe, load_custom_polygon +from .grid_builder import assign_pt_to_grid, construct_blocks +from .utils import convert_geodataframe, geometry_to_2d, load_custom_polygon __all__ = [ "HBLOCK", From 04bf503bc98942792586fa170041af126f47f354 Mon Sep 17 00:00:00 2001 From: Daniel Lusk Date: Sat, 27 May 2023 12:37:14 +0200 Subject: [PATCH 05/14] formatting updates for consistencty (black) --- spacv/base_classes.py | 65 +++++---- spacv/grid_builder.py | 231 +++++++++++++++++-------------- spacv/spacv.py | 284 +++++++++++++++++++------------------- spacv/tests/test_cv.py | 151 +++++++++++--------- spacv/tests/test_grids.py | 156 ++++++++++----------- spacv/tests/test_viz.py | 2 +- spacv/utils.py | 26 ++-- spacv/visualisation.py | 222 ++++++++++++++++------------- 8 files changed, 605 insertions(+), 532 deletions(-) diff --git a/spacv/base_classes.py b/spacv/base_classes.py index e0b23e1..cc296f4 100644 --- a/spacv/base_classes.py +++ b/spacv/base_classes.py @@ -1,4 +1,4 @@ -from abc import ABC, ABCMeta, abstractmethod +from abc import ABCMeta, abstractmethod import geopandas as gpd import numpy as np @@ -11,21 +11,20 @@ class BaseSpatialCV(BaseCrossValidator, metaclass=ABCMeta): """ Base class for partitioning-based spatial cross-validation approaches. """ - def __init__( - self - ): + + def __init__(self): self.buffer_radius = buffer_radius - + def split(self, XYs): """ Generate indices to split data into training and test set. - + Parameters ---------- XYs : GeoSeries GeoSeries containing shapely Points that identify Easting and Northing coordinates of data points. - + Yields ------ train : ndarray @@ -35,80 +34,80 @@ def split(self, XYs): """ XYs = convert_geoseries(XYs).reset_index(drop=True) minx, miny, maxx, maxy = XYs.total_bounds - + buffer_radius = self.buffer_radius - if buffer_radius > maxx-minx or buffer_radius > maxy-miny: + if buffer_radius > maxx - minx or buffer_radius > maxy - miny: raise ValueError( "buffer_radius too large and excludes all points. Given {}.".format( self.buffer_radius ) ) - num_samples = XYs.shape[0] + # num_samples = XYs.shape[0] indices = XYs.index.values - - for test_indices, train_excluded in self._iter_test_indices(XYs): + + for test_indices, train_excluded in self._iter_test_indices(XYs): if train_excluded.ndim and train_excluded.size: # Exclude the training indices within buffer train_excluded = np.concatenate([test_indices, train_excluded]) else: train_excluded = test_indices train_index = np.setdiff1d( - np.union1d( - indices, - train_excluded - ), np.intersect1d(indices, train_excluded) - ) + np.union1d(indices, train_excluded), + np.intersect1d(indices, train_excluded), + ) if len(train_index) < 1: raise ValueError( "Training set is empty. Try lowering buffer_radius to include more training instances." 
) - test_index = indices[test_indices] + test_index = indices[test_indices] yield train_index, test_index - - def _remove_buffered_indices(self, XYs, test_indices, buffer_radius, geometry_buffer): + + def _remove_buffered_indices( + self, XYs, test_indices, buffer_radius, geometry_buffer + ): # Remove training points from dead zone buffer - if buffer_radius > 0: - # Buffer grid and clip training instances - candidate_deadzone = XYs.loc[~XYs.index.isin( test_indices )] + if buffer_radius > 0: + # Buffer grid and clip training instances + candidate_deadzone = XYs.loc[~XYs.index.isin(test_indices)] candidate_deadzone = convert_geodataframe(candidate_deadzone) geometry_buffer = convert_geodataframe(geometry_buffer) deadzone_points = gpd.sjoin(candidate_deadzone, geometry_buffer) - train_exclude = deadzone_points.loc[~deadzone_points.index.isin(test_indices)].index.values + train_exclude = deadzone_points.loc[ + ~deadzone_points.index.isin(test_indices) + ].index.values return test_indices, train_exclude else: # Yield empty array because no training data removed in dead zone when buffer is zero _ = np.empty([], dtype=int) return test_indices, _ - + @abstractmethod def _iter_test_indices(self, XYs): """ - Generates integer indices corresponding to test sets and + Generates integer indices corresponding to test sets and training indices to be excluded from model training. - + Parameters ---------- X : GeoSeries GeoSeries containing shapely Points that identify Easting and Northing coordinates of data points. - + Yields ------ test_indices : array The testing set indices for that fold. train_exclude : array The training set indices to exclude for that fold. - """ - + """ + def get_n_splits(self): """ Returns the number of folds used in the cross-validation. - + Returns ------- n_splits : int Returns the number of folds in the cross-validator. """ return self.n_splits - - \ No newline at end of file diff --git a/spacv/grid_builder.py b/spacv/grid_builder.py index 984663a..81e4bb5 100644 --- a/spacv/grid_builder.py +++ b/spacv/grid_builder.py @@ -6,20 +6,27 @@ from .utils import convert_geodataframe, convert_numpy, geometry_to_2d -__all__ = [ - "construct_blocks", - "construct_square_grid", - "construct_hex_grid" -] - -def construct_blocks(XYs, tiles_x, tiles_y, method='unique', shape='square', - direction='diagonal', data=None, n_groups=5, n_sims=10, - distance_metric='euclidean', random_state=None): +__all__ = ["construct_blocks", "construct_square_grid", "construct_hex_grid"] + + +def construct_blocks( + XYs, + tiles_x, + tiles_y, + method="unique", + shape="square", + direction="diagonal", + data=None, + n_groups=5, + n_sims=10, + distance_metric="euclidean", + random_state=None, +): """ Build a grid over study area with user-defined number of tiles. This function exposes a number of configurable parameters to design the shape - of grid, but also the fold assignment technique. - + of grid, but also the fold assignment technique. + Parameters ---------- XYs : Geoseries series @@ -39,7 +46,7 @@ def construct_blocks(XYs, tiles_x, tiles_y, method='unique', shape='square', If int, random_state is the seed used by the random number generator. If None, the random number generator is the RandomState instance used by `np.random`. 
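# Editor's note: a minimal, hypothetical usage sketch for construct_blocks
# (not part of the patch; the random points are illustration data only).
import geopandas as gpd
import numpy as np

from spacv.grid_builder import construct_blocks

xs = np.random.randint(0, 1000, 50)
ys = np.random.randint(0, 1000, 50)
pts = gpd.GeoSeries(gpd.points_from_xy(xs, ys))

# 5 x 5 square grid where every tile is its own fold
grid = construct_blocks(pts, tiles_x=5, tiles_y=5, method="unique", shape="square")
print(grid[["geometry", "grid_id"]].head())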
- + Returns ------- grid : GeoDataFrame Dataframe @@ -48,39 +55,43 @@ def construct_blocks(XYs, tiles_x, tiles_y, method='unique', shape='square', """ # Construct grid of polygons of defined size and shape grid = construct_grid(XYs, tiles_x, tiles_y, shape) - + # Set grid assignment method - if method == 'unique': - grid['grid_id'] = grid.index - elif method == 'systematic': - if shape != 'square': - raise Exception("systematic grid assignment method does not work for irregular grids.") - grid['grid_id'] = assign_systematic(grid, tiles_x, tiles_y, direction) - elif method == 'random': - grid['grid_id'] = assign_randomized(grid, n_groups, random_state) - elif method == 'optimized_random': - grid['grid_id'] = assign_optimized_random(grid, XYs, data, - n_groups, - n_sims, - distance_metric) + if method == "unique": + grid["grid_id"] = grid.index + elif method == "systematic": + if shape != "square": + raise Exception( + "systematic grid assignment method does not work for irregular grids." + ) + grid["grid_id"] = assign_systematic(grid, tiles_x, tiles_y, direction) + elif method == "random": + grid["grid_id"] = assign_randomized(grid, n_groups, random_state) + elif method == "optimized_random": + grid["grid_id"] = assign_optimized_random( + grid, XYs, data, n_groups, n_sims, distance_metric + ) else: - raise ValueError("Method not recognised. Choose between: unique, systematic, random or optimized_random.") + raise ValueError( + "Method not recognised. Choose between: unique, systematic, random or optimized_random." + ) return grid - + + def construct_grid(XYs, tiles_x, tiles_y, shape): """ Choose grid shape to build across bounds of study area. """ - if shape == 'square': + if shape == "square": return construct_square_grid(XYs, tiles_x, tiles_y) - if shape == 'hex': + if shape == "hex": return construct_hex_grid(XYs, tiles_x, tiles_y) - + def construct_square_grid(XYs, tiles_x, tiles_y): """ Build square grid over study area with user-defined number of tiles. - + Parameters ---------- XYs : Geoseries series @@ -89,37 +100,48 @@ def construct_square_grid(XYs, tiles_x, tiles_y): Integer declaring number of tiles along X axis. tiles_y : integer Integer declaring number of tiles along Y axis. - + Returns ------- grid : GeoDataFrame Dataframe GeoDataFrame with square grids as shapely polygons. 
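# Editor's note: background for the asPolygon -> Polygon change in PATCH 01.
# The shapely array adapters (asPolygon and friends) were deprecated in 1.8
# and removed in 2.0; Polygon() accepts the same (N, 2) coordinate array
# directly. A minimal sketch:
import numpy as np
from shapely.geometry import Polygon

coords = np.array([[0.0, 0.0], [1.0, 0.0], [1.0, 1.0], [0.0, 1.0]])
square = Polygon(coords)  # replaces the deprecated asPolygon(coords)
print(square.area)        # 1.0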
- + """ minx, miny, maxx, maxy = XYs.total_bounds - rows = np.array(list(np.arange(0, tiles_x))*tiles_y) + rows = np.array(list(np.arange(0, tiles_x)) * tiles_y) columns = np.repeat(np.arange(0, tiles_y), tiles_x) - + dx = (maxx - minx) / tiles_x dy = (maxy - miny) / tiles_y - bottom_left = np.add(minx, np.multiply(rows, dx)), np.add(miny, np.multiply(columns, dy)) - bottom_right = np.add(minx, np.multiply(rows+1, dx)), np.add(miny, np.multiply(columns, dy)) - top_right = np.add(minx, np.multiply(rows+1, dx)), np.add(miny, np.multiply(columns+1, dy)) - top_left = np.add(minx, np.multiply(rows, dx)), np.add(miny, np.multiply(columns+1, dy)) + bottom_left = np.add(minx, np.multiply(rows, dx)), np.add( + miny, np.multiply(columns, dy) + ) + bottom_right = np.add(minx, np.multiply(rows + 1, dx)), np.add( + miny, np.multiply(columns, dy) + ) + top_right = np.add(minx, np.multiply(rows + 1, dx)), np.add( + miny, np.multiply(columns + 1, dy) + ) + top_left = np.add(minx, np.multiply(rows, dx)), np.add( + miny, np.multiply(columns + 1, dy) + ) - polys = np.vstack([bottom_left, bottom_right, top_right, top_left]).reshape(4,2,-1) - polys = [Polygon(polys[:,:,i]) for i in range(tiles_x*tiles_y)] + polys = np.vstack([bottom_left, bottom_right, top_right, top_left]).reshape( + 4, 2, -1 + ) + polys = [Polygon(polys[:, :, i]) for i in range(tiles_x * tiles_y)] - grid = gpd.GeoDataFrame({'geometry':polys}) + grid = gpd.GeoDataFrame({"geometry": polys}) return grid + def construct_hex_grid(XYs, tiles_x, tiles_y): """ Build hexagon grid over study area with user-defined number of tiles. - + Parameters ---------- XYs : Geoseries series @@ -128,15 +150,15 @@ def construct_hex_grid(XYs, tiles_x, tiles_y): Integer declaring number of tiles along X axis. tiles_y : integer Integer declaring number of tiles along Y axis. - + Returns ------- grid : GeoDataFrame Dataframe GeoDataFrame with hexagon grids as shapely polygons. 
- + """ minx, miny, maxx, maxy = XYs.total_bounds - padding = 1.e-9 * (maxx - minx) + padding = 1.0e-9 * (maxx - minx) minx -= padding maxx += padding dx = (maxx - minx) / tiles_x @@ -147,56 +169,55 @@ def construct_hex_grid(XYs, tiles_x, tiles_y): n = tiles_x1 * tiles_y1 + tiles_x * tiles_y offsets = np.zeros((n, 2), float) - offsets[:tiles_x1 * tiles_y1, 0] = np.repeat(np.arange(tiles_x1), tiles_y1) - offsets[:tiles_x1 * tiles_y1, 1] = np.tile(np.arange(tiles_y1), tiles_x1) - offsets[tiles_x1 * tiles_y1:, 0] = np.repeat(np.arange(tiles_x) + 0.5, tiles_y) - offsets[tiles_x1 * tiles_y1:, 1] = np.tile(np.arange(tiles_y), tiles_x) + 0.5 - + offsets[: tiles_x1 * tiles_y1, 0] = np.repeat(np.arange(tiles_x1), tiles_y1) + offsets[: tiles_x1 * tiles_y1, 1] = np.tile(np.arange(tiles_y1), tiles_x1) + offsets[tiles_x1 * tiles_y1 :, 0] = np.repeat(np.arange(tiles_x) + 0.5, tiles_y) + offsets[tiles_x1 * tiles_y1 :, 1] = np.tile(np.arange(tiles_y), tiles_x) + 0.5 + offsets[:, 0] *= dx offsets[:, 1] *= dy offsets[:, 0] += minx offsets[:, 1] += miny polygon = [dx, dy / 3] * np.array( - [[.5, -.5], [.5, .5], [0., 1.], [-.5, .5], [-.5, -.5], [0., -1.]]) + [[0.5, -0.5], [0.5, 0.5], [0.0, 1.0], [-0.5, 0.5], [-0.5, -0.5], [0.0, -1.0]] + ) - collection = PolyCollection([polygon],offsets=offsets) + collection = PolyCollection([polygon], offsets=offsets) hex_polys = collection.get_paths()[0].vertices hex_array = [] - for xs,ys in collection.get_offsets(): - hex_x = np.add(hex_polys[:,0], xs) - hex_y = np.add(hex_polys[:,1], ys) + for xs, ys in collection.get_offsets(): + hex_x = np.add(hex_polys[:, 0], xs) + hex_y = np.add(hex_polys[:, 1], ys) hex_array.append(Polygon(np.vstack([hex_x, hex_y]).T)) - hex_grid = gpd.GeoDataFrame({'geometry':hex_array}) - + hex_grid = gpd.GeoDataFrame({"geometry": hex_array}) + return hex_grid -def assign_systematic(grid, tiles_x, tiles_y, direction='diagonal'): + +def assign_systematic(grid, tiles_x, tiles_y, direction="diagonal"): """ Set grid pattern as systematic by assigning grid IDs along diagonals, normal and anti-diagonal. """ # Reshape length of grid to matrix - sys_matrix = np.arange(0, tiles_x * tiles_y) \ - .reshape(tiles_y, tiles_x) + sys_matrix = np.arange(0, tiles_x * tiles_y).reshape(tiles_y, tiles_x) length, width = sys_matrix.shape # Set systematic pattern, diagonal or anti-diagonal - if direction == 'diagonal': - diags = [sys_matrix.diagonal(i) - for i in range(width-1, -length,-1)] - elif direction == 'anti': - diags = [sys_matrix[::-1,:].diagonal(i) - for i in range(-length+1, width)] + if direction == "diagonal": + diags = [sys_matrix.diagonal(i) for i in range(width - 1, -length, -1)] + elif direction == "anti": + diags = [sys_matrix[::-1, :].diagonal(i) for i in range(-length + 1, width)] else: - raise ValueError("Direction of systematic pattern not recognised. Choose between: diagonal or anti.") + raise ValueError( + "Direction of systematic pattern not recognised. Choose between: diagonal or anti." + ) # Construct lookup between diagonal element indices and grid dataframe - systematic_lookup = dict([ - tuple([element, key]) - for key, diag in enumerate(diags) - for element in diag - ]) + systematic_lookup = dict( + [tuple([element, key]) for key, diag in enumerate(diags) for element in diag] + ) grid_id = grid.index.map(systematic_lookup) return grid_id @@ -207,17 +228,20 @@ def assign_randomized(grid, n_groups=5, random_state=None): Set grid pattern as randomized by randomly assigning grid IDs. 
""" np.random.seed(random_state) - + # Determine number of randomized groups n_random_grps = np.arange(0, n_groups) n_grids = grid.shape[0] - + # Allocate random group id to each grid row grid_id = np.random.choice(n_random_grps, size=n_grids, replace=True) - + return grid_id -def assign_optimized_random(grid, XYs, data, n_groups=5, n_sims=10, distance_metric='euclidean'): + +def assign_optimized_random( + grid, XYs, data, n_groups=5, n_sims=10, distance_metric="euclidean" +): """ Set grid pattern as optimized random by taking grid IDs that minimize dissimilarity between folds. """ @@ -227,53 +251,52 @@ def assign_optimized_random(grid, XYs, data, n_groups=5, n_sims=10, distance_met " dissimilarity when using optimized_random method." ) data = convert_numpy(data) - # Build dictionary of grid IDs with paired SSR for dissimilarity + # Build dictionary of grid IDs with paired SSR for dissimilarity optimized_grid = {} for sim in range(n_sims): grid_id = assign_randomized(grid, n_groups) - grid['grid_id'] = grid_id + grid["grid_id"] = grid_id folds = assign_pt_to_grid(XYs, grid, distance_metric) # Scale for SSR calculation X = (data - data.mean(axis=0)) / data.std(axis=0) Xbar = X.mean(axis=0) - X_grid_means = np.array([ X[v].mean(axis=0) - for _, v in folds.groupby('grid_id').groups.items()]) - # Calculate dissimilarity between folds and mean values across all data - sse = sum( - sum((X_grid_means - Xbar)**2) + X_grid_means = np.array( + [X[v].mean(axis=0) for _, v in folds.groupby("grid_id").groups.items()] ) - optimized_grid.update( {sim : {'sse': sse, 'grid_id': grid_id}} ) + # Calculate dissimilarity between folds and mean values across all data + sse = sum(sum((X_grid_means - Xbar) ** 2)) + optimized_grid.update({sim: {"sse": sse, "grid_id": grid_id}}) # Take the optimized grid as one that minimises dissimilarity between folds - minimised_obj = min(optimized_grid, key = lambda x : optimized_grid[x]['sse']) - grid_id = optimized_grid[minimised_obj]['grid_id'] + minimised_obj = min(optimized_grid, key=lambda x: optimized_grid[x]["sse"]) + grid_id = optimized_grid[minimised_obj]["grid_id"] return grid_id -def assign_pt_to_grid(XYs, grid, distance_metric='euclidean', random_state=None): +def assign_pt_to_grid(XYs, grid, distance_metric="euclidean", random_state=None): """ - Spatial join pts to grids. Reassign border points to nearest grid based on centroid distance. + Spatial join pts to grids. Reassign border points to nearest grid based on centroid distance. 
""" np.random.seed(random_state) - XYs = convert_geodataframe(XYs) - # Equate spatial reference systems if defined + XYs = convert_geodataframe(XYs) + # Equate spatial reference systems if defined if not grid.crs == XYs.crs: - grid.crs = XYs.crs - XYs = gpd.sjoin(XYs, grid, how='left' , op='within')[['geometry', 'grid_id']] - + grid.crs = XYs.crs + XYs = gpd.sjoin(XYs, grid, how="left", op="within")[["geometry", "grid_id"]] + # In rare cases, points will sit at the border separating two grids - if XYs['grid_id'].isna().any(): + if XYs["grid_id"].isna().any(): # Find border pts and assign to nearest grid centroid grid_centroid = grid.geometry.centroid grid_centroid = geometry_to_2d(grid_centroid) - border_pt_index = XYs['grid_id'].isna() + border_pt_index = XYs["grid_id"].isna() border_pts = XYs[border_pt_index].geometry - border_pts = geometry_to_2d(border_pts) - + border_pts = geometry_to_2d(border_pts) + # Update border pt grid IDs - tree = BallTree(grid_centroid, metric=distance_metric) - grid_id = tree.query(border_pts, k=1, return_distance=False).flatten() - grid_id = grid.loc[grid_id, 'grid_id'].values - XYs.loc[border_pt_index, 'grid_id'] = grid_id - return XYs \ No newline at end of file + tree = BallTree(grid_centroid, metric=distance_metric) + grid_id = tree.query(border_pts, k=1, return_distance=False).flatten() + grid_id = grid.loc[grid_id, "grid_id"].values + XYs.loc[border_pt_index, "grid_id"] = grid_id + return XYs diff --git a/spacv/spacv.py b/spacv/spacv.py index 6bc5df4..faeabd9 100644 --- a/spacv/spacv.py +++ b/spacv/spacv.py @@ -7,25 +7,21 @@ from .base_classes import BaseSpatialCV from .grid_builder import assign_pt_to_grid, construct_blocks -from .utils import convert_geodataframe, geometry_to_2d, load_custom_polygon +from .utils import geometry_to_2d, load_custom_polygon + +__all__ = ["HBLOCK", "SKCV", "RepeatedSKCV", "UserDefinedSCV"] -__all__ = [ - "HBLOCK", - "SKCV", - "RepeatedSKCV", - "UserDefinedSCV" -] class HBLOCK(BaseSpatialCV): """ H-Blocking spatial cross-validator. Partitions study area into a series of grid polygons that are assigned into different folds based on several user-defined options. HBLOCK - exposes several parameters for choosing block sizes, types and + exposes several parameters for choosing block sizes, types and fold assignment. - + Yields indices to split data into training and test sets. - + Parameters ---------- tiles_x : integer, default=5 @@ -36,14 +32,14 @@ class HBLOCK(BaseSpatialCV): Specify shape of grid polygons, square or hex. method : string, default='unique' Choose grid ID assignment method to build folds. Options - are: unique, where every polygon in the grid is a fold; - systematic, where folds reflect diagonal or anti-diagonal - patterns across the study area; random, where folds are + are: unique, where every polygon in the grid is a fold; + systematic, where folds reflect diagonal or anti-diagonal + patterns across the study area; random, where folds are randomly assigned into groups determined by n_groups parameter; and optimized_random, where random assignment of grids into groups are optimized by reducing disimilarity between folds. buffer_radius : integer, default=0 - Buffer radius (dead zone) to exclude training points that are + Buffer radius (dead zone) to exclude training points that are within a defined distance of test data within a fold. 
direction : string, default='diagonal' Choose direction of pattern for systematic grid assignment, @@ -52,7 +48,7 @@ class HBLOCK(BaseSpatialCV): Number of folds to randomly assign grid polygons into. data : array Array containing covariates used in predictive task. Used to - calculate disimilarity of feature space between folds to + calculate disimilarity of feature space between folds to find the optimized random grid assignment. n_sims : integer, default=10 Number of iterations in which to find optimized grid assignment @@ -64,26 +60,27 @@ class HBLOCK(BaseSpatialCV): unprojected spaces. random_state : int, default=None random_state is the seed used by the random number generator. - + Examples - -------- - - + -------- + + """ + def __init__( self, tiles_x=5, tiles_y=5, - shape='square', - method='unique', + shape="square", + method="unique", buffer_radius=0, - direction='diagonal', + direction="diagonal", n_groups=5, data=None, n_sims=10, - distance_metric='euclidean', - random_state=None - ): + distance_metric="euclidean", + random_state=None, + ): self.tiles_x = tiles_x self.tiles_y = tiles_y self.shape = shape @@ -94,105 +91,104 @@ def __init__( self.data = data self.n_sims = n_sims self.distance_metric = distance_metric - self.n_splits = tiles_x*tiles_y + self.n_splits = tiles_x * tiles_y self.random_state = random_state - + def _iter_test_indices(self, XYs): """ - Generates integer indices corresponding to test sets and + Generates integer indices corresponding to test sets and training indices to be excluded from model training. - + Parameters ---------- XYs : GeoSeries GeoSeries containing shapely Points that identify Easting and Northing coordinates of data points. - + Yields ------ test_indices : array The testing set indices for that fold. train_exclude : array The training set indices to exclude for that fold. - """ + """ # Define grid type used in CV procedure - grid = construct_blocks(XYs, - tiles_x = self.tiles_x, - tiles_y = self.tiles_y, - shape = self.shape, - method = self.method, - direction = self.direction, - n_groups = self.n_groups, - data = self.data, - n_sims = self.n_sims, - random_state = self.random_state) - + grid = construct_blocks( + XYs, + tiles_x=self.tiles_x, + tiles_y=self.tiles_y, + shape=self.shape, + method=self.method, + direction=self.direction, + n_groups=self.n_groups, + data=self.data, + n_sims=self.n_sims, + random_state=self.random_state, + ) + # Convert to GDF to use Geopandas functions - XYs = gpd.GeoDataFrame(({'geometry':XYs})) - + XYs = gpd.GeoDataFrame(({"geometry": XYs})) + # Assign pts to grids XYs = assign_pt_to_grid(XYs, grid, self.distance_metric) grid_ids = np.unique(grid.grid_id) # Yield test indices and optionally training indices within buffer for grid_id in grid_ids: - test_indices = XYs.loc[XYs['grid_id'] == grid_id ].index.values + test_indices = XYs.loc[XYs["grid_id"] == grid_id].index.values # Remove empty grids if len(test_indices) < 1: continue grid_poly_buffer = grid.loc[[grid_id]].buffer(self.buffer_radius) - test_indices, train_exclude = \ - super()._remove_buffered_indices(XYs, test_indices, - self.buffer_radius, grid_poly_buffer) - yield test_indices, train_exclude - + test_indices, train_exclude = super()._remove_buffered_indices( + XYs, test_indices, self.buffer_radius, grid_poly_buffer + ) + yield test_indices, train_exclude + + class SKCV(BaseSpatialCV): """ - Spatial K-fold cross-validator. 
Modification of standard - CV to overcome biased prediction performance of estimates - due to autocorrelation in spatial data. Overoptimistic bias - in performance is prevented by ensuring spatial proximity of + Spatial K-fold cross-validator. Modification of standard + CV to overcome biased prediction performance of estimates + due to autocorrelation in spatial data. Overoptimistic bias + in performance is prevented by ensuring spatial proximity of test data, and maintaing a training set that is within a certain spatial distance from the test dataset. - + When K=N, SKCV becomes a spatial leave-one-out (SLOO) cross-validator. - + Yields indices to split data into training and test sets. - + Parameters ---------- n_splits : int, default=10 Number of folds. Must be at least 2. buffer_radius : integer, default=0 - Buffer radius (dead zone) to exclude training points that are + Buffer radius (dead zone) to exclude training points that are within a defined distance of test data within a fold. random_state : int, RandomState instance or None, optional, default=None random_state is the seed used by the random number generator. Examples - -------- - + -------- + """ - def __init__( - self, - n_splits=10, - buffer_radius = 0, - random_state = None - ): + + def __init__(self, n_splits=10, buffer_radius=0, random_state=None): self.n_splits = n_splits self.buffer_radius = buffer_radius self.random_state = random_state - + def _iter_test_indices(self, XYs): """ Generates integer indices corresponding to test sets. - + Parameters ---------- X : GeoSeries GeoSeries containing shapely Points that identify Easting and Northing coordinates of data points. - + Yields ------ test_indices : array @@ -200,16 +196,18 @@ def _iter_test_indices(self, XYs): train_exclude : array The training set indices to exclude for that fold. """ - - if self.n_splits > len(XYs) : + + if self.n_splits > len(XYs): raise ValueError( "Number of specified n_splits (folds) is larger than number of data points. 
Given {} observations and {} folds.".format( len(XYs), self.n_splits ) ) sloo = len(XYs) == self.n_splits - lattice = any(XYs.geom_type == 'Polygon') or any(XYs.geom_type == 'MultiPolygon') - + lattice = any(XYs.geom_type == "Polygon") or any( + XYs.geom_type == "MultiPolygon" + ) + # If K = N, SLOO if sloo: num_samples = XYs.shape[0] @@ -217,40 +215,48 @@ def _iter_test_indices(self, XYs): else: # Partion XYs space into folds XYs_to_2d = geometry_to_2d(XYs) - km_skcv = MiniBatchKMeans(n_clusters = self.n_splits, random_state=self.random_state) + km_skcv = MiniBatchKMeans( + n_clusters=self.n_splits, random_state=self.random_state + ) labels = km_skcv.fit(XYs_to_2d).labels_ uniques, counts = np.unique(labels, return_counts=True) - check_fold_n = (counts < 2) + check_fold_n = counts < 2 if check_fold_n.any(): - warn = "{} folds contain less than three points and do not form polygons.".format( check_fold_n.sum() ) + warn = "{} folds contain less than three points and do not form polygons.".format( + check_fold_n.sum() + ) warnings.warn(warn) - indices_from_folds = [np.argwhere(labels == i).reshape(-1) - for i in uniques] - for fold_indices in indices_from_folds: + indices_from_folds = [np.argwhere(labels == i).reshape(-1) for i in uniques] + for fold_indices in indices_from_folds: if sloo: test_indices = np.array([fold_indices]) fold_polygon = XYs.loc[test_indices].buffer(self.buffer_radius) elif lattice: test_indices = np.array(fold_indices) - fold_polygon = XYs.loc[test_indices].unary_union.buffer(self.buffer_radius) - else: # skcv + fold_polygon = XYs.loc[test_indices].unary_union.buffer( + self.buffer_radius + ) + else: # skcv test_indices = np.array(fold_indices) - fold_polygon = XYs.loc[test_indices].unary_union.convex_hull.buffer(self.buffer_radius) - - test_indices, train_exclude = \ - super()._remove_buffered_indices(XYs, test_indices, - self.buffer_radius, fold_polygon) + fold_polygon = XYs.loc[test_indices].unary_union.convex_hull.buffer( + self.buffer_radius + ) + + test_indices, train_exclude = super()._remove_buffered_indices( + XYs, test_indices, self.buffer_radius, fold_polygon + ) yield test_indices, train_exclude - + + class RepeatedSKCV(SKCV): """ - Repeats Spatial K-Fold cross-validation (SKCV) n times with different + Repeats Spatial K-Fold cross-validation (SKCV) n times with different K-means randomization in each repetition. Given the sensitivity of K-means to the initial, randomly initialised starting values of the centroid seeds, - RepeatedSKCV repeats SKCV a number of times, yielding a generator of + RepeatedSKCV repeats SKCV a number of times, yielding a generator of n_repeats * K test and training splits. - + Parameters ---------- n_splits : int, default=10 @@ -264,14 +270,8 @@ class RepeatedSKCV(SKCV): **cvargs : additional params Constructor parameters for cv. 
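# Editor's note: a hedged usage sketch for the three cross-validators
# reformatted above (SKCV, HBLOCK, RepeatedSKCV), mirroring the calls in the
# test suite; the random points are hypothetical.
import geopandas as gpd
import numpy as np

import spacv

xs = np.random.randint(0, 3000, 30)
ys = np.random.randint(0, 3000, 30)
gdf = gpd.GeoDataFrame({"geometry": gpd.points_from_xy(xs, ys)})

skcv = spacv.SKCV(n_splits=3, buffer_radius=450, random_state=123)
hblock = spacv.HBLOCK(tiles_x=3, tiles_y=3, method="random", n_groups=3)
rep = spacv.RepeatedSKCV(n_repeats=2, n_splits=3)

for cv in (skcv, hblock, rep):
    for train_idx, test_idx in cv.split(gdf):
        pass  # feed train_idx / test_idx to a sklearn estimator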
""" - - def __init__( - self, - n_repeats=10, - n_splits=10, - random_state=None, - **cvargs - ): + + def __init__(self, n_repeats=10, n_splits=10, random_state=None, **cvargs): if not isinstance(n_repeats, numbers.Integral): raise ValueError("Number of repetitions must be of Integral type.") if n_repeats <= 0: @@ -280,102 +280,100 @@ def __init__( self.n_repeats = n_repeats self.n_splits = n_splits self.cvargs = cvargs - + def split(self, XYs): n_repeats = self.n_repeats for idx in range(n_repeats): cv = self.cv(self.n_splits, **self.cvargs) for train_index, test_index in cv.split(XYs): yield train_index, test_index - + + class UserDefinedSCV(BaseSpatialCV): """ Spatial cross-validation using user-defined polygons. - + Yields indices to split data into training and test sets. - + Parameters ---------- custom_polygons : string, GeoSeries File path to user defined grid polygons used to assign data points into folds. buffer_radius : integer, default=0 - Buffer radius (dead zone) to exclude training points that are + Buffer radius (dead zone) to exclude training points that are within a defined distance of test data within a fold. distance_metric : string, default='euclidean' Distance metric used to reconcile points that sit at exact border between two grids. Defaults to euclidean assuming projected coordinate system, otherwise use haversine for unprojected spaces. - + Yields ------ test_indices : array The testing set indices for that fold. train_exclude : array The training set indices to exclude for that fold. - + """ - def __init__( - self, - custom_polygons, - buffer_radius = 0, - distance_metric = 'euclidean' - ): + + def __init__(self, custom_polygons, buffer_radius=0, distance_metric="euclidean"): self.buffer_radius = buffer_radius self.custom_polygons = load_custom_polygon(custom_polygons) self.distance_metric = distance_metric - + def _iter_test_indices(self, XYs): """ - Generates integer indices corresponding to test sets and + Generates integer indices corresponding to test sets and training indices to be excluded from model training. - + Parameters ---------- XYs : GeoSeries GeoSeries containing shapely Points that identify Easting and Northing coordinates of data points. - + Yields ------ test_indices : array The testing set indices for that fold. train_exclude : array The training set indices to exclude for that fold. - """ + """ grid = self.custom_polygons - grid['grid_id'] = grid.index + grid["grid_id"] = grid.index grid_ids = np.unique(grid.grid_id) - + XYs = assign_pt_to_grid(XYs, grid, self.distance_metric) - + # Yield test indices and optionally training indices within buffer for grid_id in grid_ids: - test_indices = XYs.loc[XYs['grid_id'] == grid_id ].index.values + test_indices = XYs.loc[XYs["grid_id"] == grid_id].index.values # Remove empty grids if len(test_indices) < 1: - continue + continue grid_poly_buffer = grid.loc[[grid_id]].buffer(self.buffer_radius) - test_indices, train_exclude = \ - super()._remove_buffered_indices(XYs, test_indices, - self.buffer_radius, grid_poly_buffer) - yield test_indices, train_exclude - -def compute_gcv(y, X): - """ - Compute generalized cross-validation (GCV) for i.i.d data. GSV - is an approximation of leave-one-out (LOO) CV. - - - ADD: accomodate space. 
- """ - y = y.reshape(-1, 1) - ols = spreg.ols.OLS(y, X) - hat = X.dot(np.linalg.inv(X.T.dot(X)).dot(X.T)) - mse = np.mean( (y - ols.predy)**2) - n = len(y) - h_value = np.trace( np.identity(n) - hat ) / n - gcv = mse / h_value**2 - - return gcv \ No newline at end of file + test_indices, train_exclude = super()._remove_buffered_indices( + XYs, test_indices, self.buffer_radius, grid_poly_buffer + ) + yield test_indices, train_exclude + + +# def compute_gcv(y, X): +# """ +# Compute generalized cross-validation (GCV) for i.i.d data. GSV +# is an approximation of leave-one-out (LOO) CV. + + +# ADD: accomodate space. +# """ +# y = y.reshape(-1, 1) +# ols = spreg.ols.OLS(y, X) +# hat = X.dot(np.linalg.inv(X.T.dot(X)).dot(X.T)) +# mse = np.mean((y - ols.predy) ** 2) +# n = len(y) +# h_value = np.trace(np.identity(n) - hat) / n +# gcv = mse / h_value**2 + +# return gcv diff --git a/spacv/tests/test_cv.py b/spacv/tests/test_cv.py index f32bbf7..be42e28 100644 --- a/spacv/tests/test_cv.py +++ b/spacv/tests/test_cv.py @@ -1,33 +1,61 @@ import unittest -import numpy as np + import geopandas as gpd +import numpy as np import spacv + class SKCV_Tester(unittest.TestCase): def setUp(self): np.random.seed(10) x = np.random.randint(0, 3000, 30) y = np.random.randint(0, 3000, 30) - - self.gdf = gpd.GeoDataFrame( - {'geometry' : gpd.points_from_xy(x,y)} - ) - + + self.gdf = gpd.GeoDataFrame({"geometry": gpd.points_from_xy(x, y)}) + self.fold_test_one = np.array([1, 5, 13, 16, 17, 18, 21, 22, 23, 25, 26, 28]) self.fold_test_two = np.array([2, 4, 6, 7, 11, 12, 14, 27, 29]) self.fold_test_three = np.array([0, 3, 8, 9, 10, 15, 19, 20, 24]) - - self.fold_train_one = np.array([0, 2, 3, 4, 6, 8, 9, 10, 11, 12, 14, 15, 19, 20, 24, 27, 29]) - self.fold_train_two = np.array([0, 1, 3, 5, 8, 9, 10, 13, 15, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 28]) - self.fold_train_three = np.array([1, 2, 4, 5, 6, 7, 11, 12, 13, 14, 16, 17, 18, 21, 22, 23, 25, 26, 27, 28, 29]) - + + self.fold_train_one = np.array( + [0, 2, 3, 4, 6, 8, 9, 10, 11, 12, 14, 15, 19, 20, 24, 27, 29] + ) + self.fold_train_two = np.array( + [0, 1, 3, 5, 8, 9, 10, 13, 15, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 28] + ) + self.fold_train_three = np.array( + [ + 1, + 2, + 4, + 5, + 6, + 7, + 11, + 12, + 13, + 14, + 16, + 17, + 18, + 21, + 22, + 23, + 25, + 26, + 27, + 28, + 29, + ] + ) + def test_skcv(self): np.random.seed(10) - scv = spacv.SKCV(n_splits = 3, buffer_radius = 450, random_state=123) + scv = spacv.SKCV(n_splits=3, buffer_radius=450, random_state=123) self.assertEqual(scv.n_splits, 3) self.assertEqual(scv.buffer_radius, 450) self.assertEqual(scv.random_state, 123) - + fold_train, fold_test = [], [] for train, test in scv.split(self.gdf): fold_train.append(train) @@ -36,120 +64,115 @@ def test_skcv(self): scv_fold_one = fold_test[0] scv_fold_two = fold_test[1] scv_fold_three = fold_test[2] - + scv_train_one = fold_train[0] scv_train_two = fold_train[1] scv_train_three = fold_train[2] - + np.testing.assert_equal(self.fold_test_one, scv_fold_one) np.testing.assert_equal(self.fold_test_two, scv_fold_two) np.testing.assert_equal(self.fold_test_three, scv_fold_three) - + # Training pts removed in deadzone buffer check np.testing.assert_equal(self.fold_train_one, scv_train_one) np.testing.assert_equal(self.fold_train_two, scv_train_two) np.testing.assert_equal(self.fold_train_three, scv_train_three) - - + + class HBLOCK_Tester(unittest.TestCase): def setUp(self): np.random.seed(10) x = np.random.randint(0, 3000, 10) y = np.random.randint(0, 3000, 10) - - 
self.gdf = gpd.GeoDataFrame( - {'geometry' : gpd.points_from_xy(x,y)} - ) - self.fold_test_one = np.array([0,4,6]) - self.fold_test_two = np.array([2,3,7,8]) - self.fold_test_three = np.array([1,5,9]) - - self.fold_train_one = np.array([1,2,3,5,7,8, 9]) - self.fold_train_two = np.array([0,1,4,5,6,9]) - self.fold_train_three = np.array([0,2,3,4,6,7,8]) - + self.gdf = gpd.GeoDataFrame({"geometry": gpd.points_from_xy(x, y)}) + + self.fold_test_one = np.array([0, 4, 6]) + self.fold_test_two = np.array([2, 3, 7, 8]) + self.fold_test_three = np.array([1, 5, 9]) + + self.fold_train_one = np.array([1, 2, 3, 5, 7, 8, 9]) + self.fold_train_two = np.array([0, 1, 4, 5, 6, 9]) + self.fold_train_three = np.array([0, 2, 3, 4, 6, 7, 8]) + def test_hblock(self): - - scv = spacv.HBLOCK(buffer_radius=0, tiles_x=3, tiles_y=3, - method='random', n_groups=3, random_state=12) + scv = spacv.HBLOCK( + buffer_radius=0, + tiles_x=3, + tiles_y=3, + method="random", + n_groups=3, + random_state=12, + ) self.assertEqual(scv.tiles_x, 3) self.assertEqual(scv.tiles_y, 3) - self.assertEqual(scv.method, 'random') - self.assertEqual(scv.n_groups, 3) - self.assertEqual(scv.random_state, 12) - + self.assertEqual(scv.method, "random") + self.assertEqual(scv.n_groups, 3) + self.assertEqual(scv.random_state, 12) + fold_train, fold_test = [], [] for train, test in scv.split(self.gdf): fold_train.append(train) fold_test.append(test) - + scv_fold_one = fold_test[0] scv_fold_two = fold_test[1] scv_fold_three = fold_test[2] - + scv_train_one = fold_train[0] scv_train_two = fold_train[1] scv_train_three = fold_train[2] - + np.testing.assert_equal(self.fold_test_one, scv_fold_one) np.testing.assert_equal(self.fold_test_two, scv_fold_two) np.testing.assert_equal(self.fold_test_three, scv_fold_three) - + # Training pts removed in deadzone buffer check np.testing.assert_equal(self.fold_train_one, scv_train_one) np.testing.assert_equal(self.fold_train_two, scv_train_two) np.testing.assert_equal(self.fold_train_three, scv_train_three) - + class SKCVUnprojected_Tester(unittest.TestCase): def setUp(self): - pass - + def test_unprojected_coords_skcv(self): - pass - + + class SLOO_Tester(unittest.TestCase): def setUp(self): - pass - + def test_sloo(self): - pass - + + class LatticeSKCV_Tester(unittest.TestCase): def setUp(self): - pass - + def test_lattice_skcv(self): - pass - + class RepeatSKCV_Tester(unittest.TestCase): def setUp(self): - pass - + def test_repeated_skcv(self): - pass - + + class UserDefinedSCV_Tester(unittest.TestCase): def setUp(self): - pass - + def test_userdefined_skcv(self): - pass - - + suite = unittest.TestSuite() test_classes = [SKCV_Tester, HBLOCK_Tester] for i in test_classes: @@ -158,4 +181,4 @@ def test_userdefined_skcv(self): if __name__ == "__main__": runner = unittest.TextTestRunner() - runner.run(suite) \ No newline at end of file + runner.run(suite) diff --git a/spacv/tests/test_grids.py b/spacv/tests/test_grids.py index de0bf79..0d87bdc 100644 --- a/spacv/tests/test_grids.py +++ b/spacv/tests/test_grids.py @@ -1,124 +1,129 @@ import unittest -import numpy as np + import geopandas as gpd -from spacv.grid_builder import * +import numpy as np +from spacv.grid_builder import construct_blocks + class SquareUniqueGrid_Tester(unittest.TestCase): def setUp(self): np.random.seed(10) self.tiles_x = 5 self.tiles_y = 10 - self.method = 'unique' + self.method = "unique" x = np.random.randint(0, 3000, 100) y = np.random.randint(0, 3000, 100) - self.XYs = gpd.GeoDataFrame( - {'geometry' : gpd.points_from_xy(x,y)} - ) - + 
self.XYs = gpd.GeoDataFrame({"geometry": gpd.points_from_xy(x, y)}) + def test_unique_grid(self): np.random.seed(10) grid = construct_blocks(self.XYs, self.tiles_x, self.tiles_y, self.method) # Test count of unique folds - np.testing.assert_equal(grid.shape[0], 5 * 10 ) - np.testing.assert_equal( len(grid.grid_id.unique()), 50 ) - - + np.testing.assert_equal(grid.shape[0], 5 * 10) + np.testing.assert_equal(len(grid.grid_id.unique()), 50) + + class SquareRandomGrid_Tester(unittest.TestCase): - def setUp(self): + def setUp(self): np.random.seed(10) self.tiles_x = 4 self.tiles_y = 4 - self.method = 'random' + self.method = "random" self.n_groups = 5 - self.shape = 'square' + self.shape = "square" self.random_state = 123 - - self.grid_assignment = np.array([ - [2, 4, 2, 1], - [3, 2, 3, 1], - [1, 0, 1, 1], - [0, 0, 1, 3] - ]) + + self.grid_assignment = np.array( + [[2, 4, 2, 1], [3, 2, 3, 1], [1, 0, 1, 1], [0, 0, 1, 3]] + ) x = np.random.randint(0, 1000, 100) y = np.random.randint(0, 1000, 100) - self.XYs = gpd.GeoDataFrame( - {'geometry' : gpd.points_from_xy(x,y)} - ) - + self.XYs = gpd.GeoDataFrame({"geometry": gpd.points_from_xy(x, y)}) + def test_random_grid(self): np.random.seed(10) - - grid = construct_blocks(XYs = self.XYs, - tiles_x = self.tiles_x, - tiles_y = self.tiles_y, - method = self.method, - n_groups = self.n_groups, - shape = self.shape, - random_state = self.random_state) - - np.testing.assert_equal(grid.shape[0], 4 * 4 ) - np.testing.assert_equal( len(grid.grid_id.unique()), 5 ) - - np.testing.assert_equal(grid.grid_id.values.reshape(4,4), self.grid_assignment) - - + + grid = construct_blocks( + XYs=self.XYs, + tiles_x=self.tiles_x, + tiles_y=self.tiles_y, + method=self.method, + n_groups=self.n_groups, + shape=self.shape, + random_state=self.random_state, + ) + + np.testing.assert_equal(grid.shape[0], 4 * 4) + np.testing.assert_equal(len(grid.grid_id.unique()), 5) + + np.testing.assert_equal(grid.grid_id.values.reshape(4, 4), self.grid_assignment) + + class SquareSystematicGrid_Tester(unittest.TestCase): - def setUp(self): + def setUp(self): np.random.seed(10) self.tiles_x = 5 self.tiles_y = 5 - self.method = 'systematic' - self.direction = 'diagonal' - self.shape = 'square' - - self.grid_assignment = np.array([[4, 3, 2, 1, 0], - [5, 4, 3, 2, 1], - [6, 5, 4, 3, 2], - [7, 6, 5, 4, 3], - [8, 7, 6, 5, 4]]) + self.method = "systematic" + self.direction = "diagonal" + self.shape = "square" + self.grid_assignment = np.array( + [ + [4, 3, 2, 1, 0], + [5, 4, 3, 2, 1], + [6, 5, 4, 3, 2], + [7, 6, 5, 4, 3], + [8, 7, 6, 5, 4], + ] + ) x = np.random.randint(0, 1000, 100) y = np.random.randint(0, 1000, 100) - self.XYs = gpd.GeoDataFrame( - {'geometry' : gpd.points_from_xy(x,y)} - ) - + self.XYs = gpd.GeoDataFrame({"geometry": gpd.points_from_xy(x, y)}) + def test_systematic_grid(self): np.random.seed(10) - - grid = construct_blocks(XYs = self.XYs, - tiles_x = self.tiles_x, - tiles_y = self.tiles_y, - method = self.method, - direction = self.direction, - shape = self.shape) - - np.testing.assert_equal(grid.shape[0], 5 * 5 ) - np.testing.assert_equal(grid.grid_id.values.reshape(5,5), self.grid_assignment) - - + + grid = construct_blocks( + XYs=self.XYs, + tiles_x=self.tiles_x, + tiles_y=self.tiles_y, + method=self.method, + direction=self.direction, + shape=self.shape, + ) + + np.testing.assert_equal(grid.shape[0], 5 * 5) + np.testing.assert_equal(grid.grid_id.values.reshape(5, 5), self.grid_assignment) + + class SquareOptimizedRandomGrid_Tester(unittest.TestCase): - def setUp(self): + def 
setUp(self): pass - + def test_random_optimized(self): pass - + + class HexUniqueGrid_Tester(unittest.TestCase): def setUp(self): np.random.seed(10) pass - + def test_hex_unique_grid(self): np.random.seed(10) - - + + suite = unittest.TestSuite() -test_classes = [SquareUniqueGrid_Tester, SquareRandomGrid_Tester, SquareSystematicGrid_Tester] +test_classes = [ + SquareUniqueGrid_Tester, + SquareRandomGrid_Tester, + SquareSystematicGrid_Tester, +] for i in test_classes: a = unittest.TestLoader().loadTestsFromTestCase(i) suite.addTest(a) @@ -126,8 +131,3 @@ def test_hex_unique_grid(self): if __name__ == "__main__": runner = unittest.TextTestRunner() runner.run(suite) - - - - - diff --git a/spacv/tests/test_viz.py b/spacv/tests/test_viz.py index f8436be..24002b7 100644 --- a/spacv/tests/test_viz.py +++ b/spacv/tests/test_viz.py @@ -1,3 +1,3 @@ import matplotlib -matplotlib.use("agg") # To prevent plots from using display \ No newline at end of file +matplotlib.use("agg") # To prevent plots from using display diff --git a/spacv/utils.py b/spacv/utils.py index 3c5a89c..aa2bca4 100644 --- a/spacv/utils.py +++ b/spacv/utils.py @@ -1,39 +1,43 @@ -import numpy as np import geopandas as gpd +import numpy as np import pandas as pd -from shapely.geometry import Point, Polygon, LineString, MultiPolygon +from shapely.geometry import LineString, MultiPolygon, Point, Polygon + def geometry_to_2d(geometry): - return np.array(list(map(lambda x : (x.x, x.y), geometry))) + return np.array(list(map(lambda x: (x.x, x.y), geometry))) + def convert_geoseries(XYs): if isinstance(XYs, gpd.GeoDataFrame): XYs = XYs.geometry if isinstance(XYs, np.ndarray): - XYs = gpd.GeoSeries( gpd.points_from_xy(XYs) ) + XYs = gpd.GeoSeries(gpd.points_from_xy(XYs)) if isinstance(XYs, gpd.GeoDataFrame): - if any(XYs.geom_type == 'Polygon') or any(XYs.geom_type == 'MultiPolygon'): - XYs = XYs.geometry.centroid + if any(XYs.geom_type == "Polygon") or any(XYs.geom_type == "MultiPolygon"): + XYs = XYs.geometry.centroid return XYs + def convert_geodataframe(XYs): if isinstance(XYs, gpd.GeoSeries): - XYs = gpd.GeoDataFrame({'geometry':XYs}) + XYs = gpd.GeoDataFrame({"geometry": XYs}) if isinstance(XYs, np.ndarray): - XYs = gpd.GeoDataFrame({'geometry': gpd.points_from_xy(XYs) }) + XYs = gpd.GeoDataFrame({"geometry": gpd.points_from_xy(XYs)}) if isinstance(XYs, (Point, Polygon, LineString, MultiPolygon)): - XYs = gpd.GeoDataFrame({'geometry': [XYs] }) + XYs = gpd.GeoDataFrame({"geometry": [XYs]}) return XYs + def convert_numpy(X): if isinstance(X, (pd.DataFrame, pd.Series, gpd.GeoDataFrame)): return X.values else: return X - + + def load_custom_polygon(custom_poly): if isinstance(custom_poly, (gpd.GeoDataFrame, gpd.GeoSeries)): return custom_poly.reset_index(drop=True) if isinstance(custom_poly, str): return gpd.read_file(custom_poly).reset_index(drop=True) - \ No newline at end of file diff --git a/spacv/visualisation.py b/spacv/visualisation.py index 17f3e05..b688ee4 100644 --- a/spacv/visualisation.py +++ b/spacv/visualisation.py @@ -1,30 +1,33 @@ -import numpy as np +import geopandas as gpd import matplotlib.pyplot as plt +import numpy as np from matplotlib.colors import ListedColormap -import geopandas as gpd -from sklearn.neighbors import BallTree -from scipy.spatial.distance import pdist, squareform from scipy.optimize import curve_fit +from scipy.spatial.distance import pdist, squareform +from sklearn.neighbors import BallTree + from .utils import geometry_to_2d try: import seaborn as sns - sns.set_style("whitegrid") # pretty plots + + 
sns.set_style("whitegrid") # pretty plots except ModuleNotFoundError: pass - + __all__ = [ "variogram_at_lag", "compute_semivariance", "plot_autocorrelation_ranges", "aoa", - "plot_aoa" + "plot_aoa", ] - + + def variogram_at_lag(XYs, x, lags, bw): """ Return semivariance values for defined lag of distance. - + Parameters ---------- XYs : Geoseries series @@ -34,8 +37,8 @@ def variogram_at_lag(XYs, x, lags, bw): lags : array Array of distance lags in metres to obtain semivariances. bw : integer or float - Bandwidth, plus and minus lags to calculate semivariance. - + Bandwidth, plus and minus lags to calculate semivariance. + Returns ------- semivariances : Array of floats @@ -45,46 +48,50 @@ def variogram_at_lag(XYs, x, lags, bw): x = np.asarray(x) paired_distances = pdist(XYs) pd_m = squareform(paired_distances) - semivariances = np.empty(( len(lags) ), dtype=np.float64) + semivariances = np.empty((len(lags)), dtype=np.float64) for i, lag in enumerate(lags): # Mask pts outside bandwidth - lower = pd_m >= lag-bw - upper = pd_m <= lag+bw + lower = pd_m >= lag - bw + upper = pd_m <= lag + bw mask = np.logical_and(lower, upper) semivariances[i] = compute_semivariance(x, mask, bw) return np.c_[semivariances, lags].T + def compute_semivariance(x, mask, bw): """ Calculate semivariance for masked elements. """ semis, counts = [], [] - for i in range( len(x) ): + for i in range(len(x)): xi = np.array([x[i]]) - mask_i = mask[i,:] + mask_i = mask[i, :] mask_i[:i] = False xj = x[mask_i] - ss = (xi - xj)**2 - filter_empty = ss > 0. + ss = (xi - xj) ** 2 + filter_empty = ss > 0.0 if len(ss[filter_empty]) > 0: - counts.append( len(ss[filter_empty])) + counts.append(len(ss[filter_empty])) semis.append(ss[filter_empty]) - semivariance = np.sum( np.concatenate(semis)) / (2.0 * sum(counts) ) + semivariance = np.sum(np.concatenate(semis)) / (2.0 * sum(counts)) return semivariance + def variogram(func): def send_params(*params): new_args = params[1:] mapping = map(lambda h: func(h, *new_args), params[0]) return np.fromiter(mapping, dtype=float) + return send_params + @variogram def spherical(h, r, sill, nugget=0): """ Spherical variogram model function. Calculates the dependent variable for a given lag (h). The nugget (b) defaults to be 0. - + Parameters ---------- h : float @@ -92,28 +99,29 @@ def spherical(h, r, sill, nugget=0): r : float Effective range of autocorrelation. sill : float - The sill of the variogram, where the semivariance begins to saturate. + The sill of the variogram, where the semivariance begins to saturate. nugget : float, default=0 The nugget of the variogram. This is the value of independent - variable at the distance of zero. + variable at the distance of zero. Returns ------- gamma : numpy float Coefficients that describe effective range of spatial autocorrelation """ - a = r / 1. + a = r / 1.0 if h <= r: return nugget + sill * ((1.5 * (h / a)) - (0.5 * ((h / a) ** 3.0))) else: return nugget + sill - + + def plot_autocorrelation_ranges(XYs, X, lags, bw, **kwargs): """ - Plot spatial autocorrelation ranges for input covariates. Suggested - block size is proposed by taking the median autocorrelation range + Plot spatial autocorrelation ranges for input covariates. Suggested + block size is proposed by taking the median autocorrelation range across the data and is reported by the horiztonal line. This function works best for projected coordinate systems. 
- + Parameters ---------- XYs : Geoseries series @@ -123,55 +131,63 @@ def plot_autocorrelation_ranges(XYs, X, lags, bw, **kwargs): lags : array Array of distance lags in metres to obtain semivariances. bw : integer or float - Bandwidth, plus and minus lags to calculate semivariance. - + Bandwidth, plus and minus lags to calculate semivariance. + Returns ------- fig : matplotlip Figure instance Figure of spatial weight network. ax : matplotlib Axes instance - Axes in which the figure is plotted. + Axes in which the figure is plotted. """ - alpha = kwargs.pop('alpha', .7) - font_size = kwargs.pop('font_size', 14) - block_suggestion_color = kwargs.pop('color', 'red') - figsize = kwargs.pop('figsize', (8,6)) - + alpha = kwargs.pop("alpha", 0.7) + font_size = kwargs.pop("font_size", 14) + block_suggestion_color = kwargs.pop("color", "red") + figsize = kwargs.pop("figsize", (8, 6)) + ranges = [] for col in X.values.T: # Fit spherical model and extract effective range parameter semis = variogram_at_lag(XYs, col, lags, bw) - sv,h = semis[:,0], semis[:,1] + sv, h = semis[:, 0], semis[:, 1] start_params = [np.nanmax(h), np.nanmax(sv)] bounds = (0, start_params) - cof, _ = curve_fit(spherical, h, sv, sigma=None, - p0 = start_params, bounds=bounds, - method='trf') + cof, _ = curve_fit( + spherical, h, sv, sigma=None, p0=start_params, bounds=bounds, method="trf" + ) effective_range = cof[0] ranges.append(effective_range) x_labs = X.columns f, ax = plt.subplots(1, figsize=figsize) - ax.bar(x_labs, ranges, color='skyblue', alpha=alpha) + ax.bar(x_labs, ranges, color="skyblue", alpha=alpha) ax.set_ylabel("Ranges (m)") ax.set_xlabel("Variables") median_eff_range = np.median(ranges) - ax.text(0, median_eff_range + (np.max(ranges) / 100 * 3), '{:.3f}m'.format(median_eff_range), color=block_suggestion_color, size=font_size ) - ax.axhline(median_eff_range, color=block_suggestion_color, linestyle='--') + ax.text( + 0, + median_eff_range + (np.max(ranges) / 100 * 3), + "{:.3f}m".format(median_eff_range), + color=block_suggestion_color, + size=font_size, + ) + ax.axhline(median_eff_range, color=block_suggestion_color, linestyle="--") return f, ax - -def aoa(new_data, - training_data, - model=None, - thres=0.95, - fold_indices=None, - distance_metric='euclidean' - ): + + +def aoa( + new_data, + training_data, + model=None, + thres=0.95, + fold_indices=None, + distance_metric="euclidean", +): """ Area of Applicability (AOA) measure for spatial prediction models from - Meyer and Pebesma (2020). The AOA defines the area for which, on average, - the cross-validation error of the model applies, which is crucial for + Meyer and Pebesma (2020). The AOA defines the area for which, on average, + the cross-validation error of the model applies, which is crucial for cases where spatial predictions are used to inform decision-making. - + Parameters ---------- new_data : GeoDataFrame @@ -194,26 +210,28 @@ def aoa(new_data, Binary mask that occludes points outside predictive area of applicability. 
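# Editor's note: the dissimilarity index (DI) logic of aoa() from
# Meyer & Pebesma (2020), reduced to its core steps on toy arrays.
import numpy as np
from scipy.spatial.distance import pdist, squareform
from sklearn.neighbors import BallTree

rng = np.random.default_rng(1)
train = rng.normal(size=(20, 3))  # standardized training predictors
new = rng.normal(size=(5, 3))     # standardized prediction locations

# Distance from each new point to its nearest training point...
mindist, _ = BallTree(train).query(new, k=1, return_distance=True)

# ...scaled by the mean pairwise distance within the training data.
train_dist = squareform(pdist(train))
np.fill_diagonal(train_dist, np.nan)
train_dist_avgmean = np.mean(np.nanmean(train_dist, axis=1))
DI = mindist.reshape(-1) / train_dist_avgmean

# The AOA keeps points whose DI does not exceed the 0.95-quantile threshold.
thres = np.quantile(np.nanmin(train_dist, axis=1) / train_dist_avgmean, q=0.95)
print(DI.round(2), DI <= thres)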
""" if len(training_data) <= 1: - raise Exception('At least two training instances need to be specified.') - - # Scale data + raise Exception("At least two training instances need to be specified.") + + # Scale data training_data = (training_data - np.mean(training_data)) / np.std(training_data) new_data = (new_data - np.mean(new_data)) / np.std(new_data) # Calculate nearest training instance to test data, return Euclidean distances - tree = BallTree(training_data, metric=distance_metric) + tree = BallTree(training_data, metric=distance_metric) mindist, _ = tree.query(new_data, k=1, return_distance=True) - # Build matrix of pairwise distances + # Build matrix of pairwise distances paired_distances = pdist(training_data) train_dist = squareform(paired_distances) np.fill_diagonal(train_dist, np.nan) - + # Remove data points that are within the same fold - if fold_indices: + if fold_indices: # Get number of training instances in each fold instances_in_folds = [len(fold) for fold in fold_indices] - instance_fold_id = np.repeat(np.arange(0, len(fold_indices)), instances_in_folds) + instance_fold_id = np.repeat( + np.arange(0, len(fold_indices)), instances_in_folds + ) # Create mapping between training instance and fold ID fold_indices = np.concatenate(fold_indices) @@ -221,20 +239,22 @@ def aoa(new_data, # Mask training points in same fold for DI measure calculation for i, row in enumerate(train_dist): - mask = folds[:,0] == folds[:,0][i] + mask = folds[:, 0] == folds[:, 0][i] train_dist[i, mask] = np.nan # Scale distance to nearest training point by average distance across training data train_dist_mean = np.nanmean(train_dist, axis=1) train_dist_avgmean = np.mean(train_dist_mean) - mindist /= train_dist_avgmean + mindist /= train_dist_avgmean # Define threshold for AOA train_dist_min = np.nanmin(train_dist, axis=1) - aoa_train_stats = np.quantile(train_dist_min / train_dist_avgmean, - q = np.array([0.25, 0.5, 0.75, 0.9, 0.95, 0.99, 1])) - thres = np.quantile(train_dist_min / train_dist_avgmean, q = thres) - + # aoa_train_stats = np.quantile( + # train_dist_min / train_dist_avgmean, + # q=np.array([0.25, 0.5, 0.75, 0.9, 0.95, 0.99, 1]), + # ) + thres = np.quantile(train_dist_min / train_dist_avgmean, q=thres) + # We choose the AOA as the area where the DI does not exceed the threshold DIs = mindist.reshape(-1) masked_result = np.repeat(1, len(mindist)) @@ -242,7 +262,10 @@ def aoa(new_data, return DIs, masked_result -def plot_aoa(new_data, training_data, columns, figsize=(14,4), fold_indices=None, **kwargs): + +def plot_aoa( + new_data, training_data, columns, figsize=(14, 4), fold_indices=None, **kwargs +): """ Parameters ---------- @@ -257,43 +280,46 @@ def plot_aoa(new_data, training_data, columns, figsize=(14,4), fold_indices=None Width, height of figure in inches. fold_indices : iterable, default=None Iterable consisting of training indices that identify instances in the - folds. - + folds. + Returns ------- fig : matplotlip Figure instance Figure of spatial weight network. ax : matplotlib Axes instance - Axes in which the figure is plotted. + Axes in which the figure is plotted. 
""" # Pop geometry for use later in plotting new_data = new_data.copy() - new_data_geometry = new_data.pop('geometry') - + new_data_geometry = new_data.pop("geometry") + # Subset to variables new_data_aoa = new_data[columns] training_data_aoa = training_data[columns] - - DIs, masked_result = aoa(new_data_aoa, - training_data_aoa, - fold_indices=fold_indices) - - new_data.loc[:, 'DI'] = DIs - new_data.loc[:, 'AOA'] = masked_result - new_data.loc[:, 'geometry'] = new_data_geometry - new_data = gpd.GeoDataFrame(new_data, geometry=new_data['geometry']) - - f,ax = plt.subplots(1, 2, figsize=figsize) - - new_data.plot(ax=ax[0], column='DI', legend=True, cmap='viridis') - new_data.plot(ax=ax[1], column='AOA', categorical=True, - legend=True, cmap=ListedColormap(['red', 'blue'])) - training_data.plot(ax=ax[0], alpha=.3) - training_data.plot(ax=ax[1], alpha=.3) - - ax[0].set_aspect('auto') - ax[1].set_aspect('auto') - ax[0].set_title('Dissimilarity index (DI)') - ax[1].set_title('AOA'); - - return f, ax \ No newline at end of file + + DIs, masked_result = aoa(new_data_aoa, training_data_aoa, fold_indices=fold_indices) + + new_data.loc[:, "DI"] = DIs + new_data.loc[:, "AOA"] = masked_result + new_data.loc[:, "geometry"] = new_data_geometry + new_data = gpd.GeoDataFrame(new_data, geometry=new_data["geometry"]) + + f, ax = plt.subplots(1, 2, figsize=figsize) + + new_data.plot(ax=ax[0], column="DI", legend=True, cmap="viridis") + new_data.plot( + ax=ax[1], + column="AOA", + categorical=True, + legend=True, + cmap=ListedColormap(["red", "blue"]), + ) + training_data.plot(ax=ax[0], alpha=0.3) + training_data.plot(ax=ax[1], alpha=0.3) + + ax[0].set_aspect("auto") + ax[1].set_aspect("auto") + ax[0].set_title("Dissimilarity index (DI)") + ax[1].set_title("AOA") + + return f, ax From 930d25c75611f13e00c3248eab6a90619f9f575e Mon Sep 17 00:00:00 2001 From: Daniel Lusk Date: Wed, 31 May 2023 18:01:49 +0200 Subject: [PATCH 06/14] fix semivariance indexing bug that prevented curve_fit from fitting --- spacv/visualisation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacv/visualisation.py b/spacv/visualisation.py index b688ee4..20c0c6c 100644 --- a/spacv/visualisation.py +++ b/spacv/visualisation.py @@ -149,7 +149,7 @@ def plot_autocorrelation_ranges(XYs, X, lags, bw, **kwargs): for col in X.values.T: # Fit spherical model and extract effective range parameter semis = variogram_at_lag(XYs, col, lags, bw) - sv, h = semis[:, 0], semis[:, 1] + sv, h = semis[0], semis[1] start_params = [np.nanmax(h), np.nanmax(sv)] bounds = (0, start_params) cof, _ = curve_fit( From f41312486afa9ced3398078e898c8977e688413b Mon Sep 17 00:00:00 2001 From: Daniel Lusk Date: Wed, 31 May 2023 18:05:40 +0200 Subject: [PATCH 07/14] allow autocorrelation to be calculated using geographic coordinates; add rudimentary variogram plotting; add multiprocessing --- spacv/visualisation.py | 140 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 124 insertions(+), 16 deletions(-) diff --git a/spacv/visualisation.py b/spacv/visualisation.py index 20c0c6c..15d8954 100644 --- a/spacv/visualisation.py +++ b/spacv/visualisation.py @@ -1,9 +1,15 @@ +import multiprocessing +from typing import Tuple, Union + import geopandas as gpd import matplotlib.pyplot as plt import numpy as np +from matplotlib.axis import Axis from matplotlib.colors import ListedColormap +from matplotlib.figure import Figure from scipy.optimize import curve_fit from scipy.spatial.distance import pdist, squareform +from sklearn.metrics.pairwise import 
haversine_distances from sklearn.neighbors import BallTree from .utils import geometry_to_2d @@ -24,7 +30,13 @@ ] -def variogram_at_lag(XYs, x, lags, bw): +def variogram_at_lag( + XYs: gpd.GeoSeries, + x: Union[list, np.ndarray], + lags: np.ndarray, + bw: Union[int, float], + distance_metric: str = "euclidean", +) -> np.ndarray: """ Return semivariance values for defined lag of distance. @@ -38,6 +50,10 @@ def variogram_at_lag(XYs, x, lags, bw): Array of distance lags in metres to obtain semivariances. bw : integer or float Bandwidth, plus and minus lags to calculate semivariance. + distance_metric : string + Distance function to calculate pairwise distances. Must be "euclidean" for + points in euclidean space, or "haversine" for points in a geographic CRS. + Defaults to "euclidean". Returns ------- @@ -46,15 +62,23 @@ def variogram_at_lag(XYs, x, lags, bw): """ XYs = geometry_to_2d(XYs) x = np.asarray(x) - paired_distances = pdist(XYs) - pd_m = squareform(paired_distances) + + if distance_metric == "euclidean": + paired_distances = pdist(XYs) + pd_m = squareform(paired_distances) + elif distance_metric == "haversine": + pd_m = haversine_distances(np.radians([*XYs])) + pd_m = pd_m * 6371000 # multiply by Earth radius to get meters + semivariances = np.empty((len(lags)), dtype=np.float64) + for i, lag in enumerate(lags): # Mask pts outside bandwidth lower = pd_m >= lag - bw upper = pd_m <= lag + bw mask = np.logical_and(lower, upper) semivariances[i] = compute_semivariance(x, mask, bw) + return np.c_[semivariances, lags].T @@ -115,7 +139,29 @@ def spherical(h, r, sill, nugget=0): return nugget + sill -def plot_autocorrelation_ranges(XYs, X, lags, bw, **kwargs): +def calculate_range(args): + XYs, col, lags, bw, distance_metric = args + semis = variogram_at_lag(XYs, col, lags, bw, distance_metric) + sv, h = semis[0], semis[1] + start_params = [np.nanmax(h), np.nanmax(sv)] + bounds = (0, start_params) + cof, _ = curve_fit( + spherical, h, sv, sigma=None, p0=start_params, bounds=bounds, method="trf" + ) + effective_range = cof[0] + return effective_range + + +def plot_autocorrelation_ranges( + XYs: gpd.GeoSeries, + X: Union[np.ndarray, gpd.GeoDataFrame], + lags: np.ndarray, + bw: Union[int, float], + distance_metric: str = "euclidean", + workers: int = 1, + verbose: bool = False, + **kwargs, +) -> Tuple[Figure, Axis, list]: """ Plot spatial autocorrelation ranges for input covariates. Suggested block size is proposed by taking the median autocorrelation range @@ -132,6 +178,14 @@ def plot_autocorrelation_ranges(XYs, X, lags, bw, **kwargs): Array of distance lags in metres to obtain semivariances. bw : integer or float Bandwidth, plus and minus lags to calculate semivariance. + distance_metric : string + Distance function to calculate pairwise distances. Must be "euclidean" for + points in euclidean space, or "haversine" for points in a geographic CRS. + Defaults to "euclidean". + workers : int + Use multiprocessing for >1 worker. -1 uses all available cores. Defaults to 1. + verbose : bool + Print name of current column being processed. Returns ------- @@ -139,6 +193,7 @@ def plot_autocorrelation_ranges(XYs, X, lags, bw, **kwargs): Figure of spatial weight network. ax : matplotlib Axes instance Axes in which the figure is plotted. 
+ ranges : list """ alpha = kwargs.pop("alpha", 0.7) font_size = kwargs.pop("font_size", 14) @@ -146,17 +201,32 @@ def plot_autocorrelation_ranges(XYs, X, lags, bw, **kwargs): figsize = kwargs.pop("figsize", (8, 6)) ranges = [] - for col in X.values.T: - # Fit spherical model and extract effective range parameter - semis = variogram_at_lag(XYs, col, lags, bw) - sv, h = semis[0], semis[1] - start_params = [np.nanmax(h), np.nanmax(sv)] - bounds = (0, start_params) - cof, _ = curve_fit( - spherical, h, sv, sigma=None, p0=start_params, bounds=bounds, method="trf" - ) - effective_range = cof[0] - ranges.append(effective_range) + + if workers == -1 or workers > 1: + pool = multiprocessing.Pool(workers) + results = [] + + for i, col in enumerate(X.values.T): + if verbose: + print(f"{i}: {X.columns[i]}") + # Fit spherical model and extract effective range parameter + args = (XYs, col, lags, bw, distance_metric) + results.append(pool.apply_async(calculate_range, (args,))) + + for result in results: + ranges.append(result.get()) + + pool.close() + pool.join() + else: + for i, col in enumerate(X.values.T): + if verbose: + print(f"{i}: {X.columns[i]}") + # Fit spherical model and extract effective range parameter + args = (XYs, col, lags, bw, distance_metric) + eff_range = calculate_range(args) + ranges.append(eff_range) + x_labs = X.columns f, ax = plt.subplots(1, figsize=figsize) ax.bar(x_labs, ranges, color="skyblue", alpha=alpha) @@ -171,7 +241,7 @@ def plot_autocorrelation_ranges(XYs, X, lags, bw, **kwargs): size=font_size, ) ax.axhline(median_eff_range, color=block_suggestion_color, linestyle="--") - return f, ax + return f, ax, ranges def aoa( @@ -323,3 +393,41 @@ def plot_aoa( ax[1].set_title("AOA") return f, ax + + +def plot_variogram( + XYs: gpd.GeoSeries, + col: Union[gpd.GeoSeries, np.ndarray], + lags: np.ndarray, + bw: Union[int, float], + distance_metric: str = "euclidean", +) -> np.ndarray: + """ + Return semivariance values for defined lag of distance. + + Parameters + ---------- + XYs : Geoseries series + Series containing X and Y coordinates. + col : array or list + Array (N,) containing variable. + lags : array + Array of distance lags in metres to obtain semivariances. + bw : integer or float + Bandwidth, plus and minus lags to calculate semivariance. + distance_metric : string + Distance function to calculate pairwise distances. Must be "euclidean" for + points in euclidean space, or "haversine" for points in a geographic CRS. + Defaults to "euclidean". + + Returns + ------- + semis : Array of floats + Array of semivariances at defined lag points for given variable. 
+ """ + semis = variogram_at_lag(XYs, col, lags, bw, distance_metric) + sv, h = semis[0], semis[1] + plt.scatter(h, sv) + plt.show() + + return semis From 5d9e82012db79962e566c370989ad82143665681 Mon Sep 17 00:00:00 2001 From: Daniel Lusk Date: Wed, 31 May 2023 19:59:47 +0200 Subject: [PATCH 08/14] update deprecated gpd.sjoin param --- spacv/grid_builder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacv/grid_builder.py b/spacv/grid_builder.py index 81e4bb5..d323ece 100644 --- a/spacv/grid_builder.py +++ b/spacv/grid_builder.py @@ -283,7 +283,7 @@ def assign_pt_to_grid(XYs, grid, distance_metric="euclidean", random_state=None) # Equate spatial reference systems if defined if not grid.crs == XYs.crs: grid.crs = XYs.crs - XYs = gpd.sjoin(XYs, grid, how="left", op="within")[["geometry", "grid_id"]] + XYs = gpd.sjoin(XYs, grid, how="left", predicate="within")[["geometry", "grid_id"]] # In rare cases, points will sit at the border separating two grids if XYs["grid_id"].isna().any(): From cbfb36c8b6c4042c4ccd13168e2099a4061d082a Mon Sep 17 00:00:00 2001 From: Daniel Lusk Date: Wed, 31 May 2023 20:00:36 +0200 Subject: [PATCH 09/14] project epsg 4326 into meters when performing geometric operations --- spacv/grid_builder.py | 10 +++++++++- spacv/spacv.py | 11 ++++++++++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/spacv/grid_builder.py b/spacv/grid_builder.py index d323ece..95fd347 100644 --- a/spacv/grid_builder.py +++ b/spacv/grid_builder.py @@ -288,7 +288,15 @@ def assign_pt_to_grid(XYs, grid, distance_metric="euclidean", random_state=None) # In rare cases, points will sit at the border separating two grids if XYs["grid_id"].isna().any(): # Find border pts and assign to nearest grid centroid - grid_centroid = grid.geometry.centroid + # First project to pseudo-mercator if using 4326 + epsg = grid.crs.to_epsg() + if epsg == 4326: + grid = grid.to_crs(3857) + grid_centroid = grid.geometry.centroid + grid = grid.to_crs(epsg) # return to original EPSG + else: + grid_centroid = grid.geometry.centroid + grid_centroid = geometry_to_2d(grid_centroid) border_pt_index = XYs["grid_id"].isna() border_pts = XYs[border_pt_index].geometry diff --git a/spacv/spacv.py b/spacv/spacv.py index faeabd9..206116e 100644 --- a/spacv/spacv.py +++ b/spacv/spacv.py @@ -139,7 +139,16 @@ def _iter_test_indices(self, XYs): # Remove empty grids if len(test_indices) < 1: continue - grid_poly_buffer = grid.loc[[grid_id]].buffer(self.buffer_radius) + + epsg = grid.crs.to_epsg() + if epsg == 4326: + grid = grid.to_crs(3857) + grid_poly_buffer = grid.loc[[grid_id]].buffer(self.buffer_radius) + grid_poly_buffer = grid_poly_buffer.to_crs(epsg) + grid = grid.to_crs(epsg) # return to original EPSG + else: + grid_poly_buffer = grid.loc[[grid_id]].buffer(self.buffer_radius) + test_indices, train_exclude = super()._remove_buffered_indices( XYs, test_indices, self.buffer_radius, grid_poly_buffer ) From 4b9efe175df211fe6f8b5f126ea917e6b7863fe5 Mon Sep 17 00:00:00 2001 From: Daniel Lusk Date: Sat, 10 Jun 2023 09:08:35 +0200 Subject: [PATCH 10/14] Catch failed semivariance calculation and log error --- spacv/visualisation.py | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/spacv/visualisation.py b/spacv/visualisation.py index 15d8954..8661f46 100644 --- a/spacv/visualisation.py +++ b/spacv/visualisation.py @@ -1,3 +1,4 @@ +import logging import multiprocessing from typing import Tuple, Union @@ -32,10 +33,11 @@ def variogram_at_lag( XYs: 
gpd.GeoSeries, - x: Union[list, np.ndarray], + x: Union[list, np.ndarray, gpd.GeoSeries], lags: np.ndarray, bw: Union[int, float], distance_metric: str = "euclidean", + col_name: str = None, ) -> np.ndarray: """ Return semivariance values for defined lag of distance. @@ -44,7 +46,7 @@ def variogram_at_lag( ---------- XYs : Geoseries series Series containing X and Y coordinates. - X : array or list + X : array, list, or Geoseries Array (N,) containing variable. lags : array Array of distance lags in metres to obtain semivariances. @@ -77,12 +79,12 @@ def variogram_at_lag( lower = pd_m >= lag - bw upper = pd_m <= lag + bw mask = np.logical_and(lower, upper) - semivariances[i] = compute_semivariance(x, mask, bw) + semivariances[i] = compute_semivariance(x, mask, bw, col_name) return np.c_[semivariances, lags].T -def compute_semivariance(x, mask, bw): +def compute_semivariance(x, mask, bw, col_name): """ Calculate semivariance for masked elements. """ @@ -97,8 +99,14 @@ def compute_semivariance(x, mask, bw): if len(ss[filter_empty]) > 0: counts.append(len(ss[filter_empty])) semis.append(ss[filter_empty]) - semivariance = np.sum(np.concatenate(semis)) / (2.0 * sum(counts)) - return semivariance + try: + semivariance = np.sum(np.concatenate(semis)) / (2.0 * sum(counts)) + return semivariance + except ValueError: + logging.error( + f"Could not calculate semivariances for {col_name}. Using 0 instead." + ) + return 0 def variogram(func): @@ -140,8 +148,8 @@ def spherical(h, r, sill, nugget=0): def calculate_range(args): - XYs, col, lags, bw, distance_metric = args - semis = variogram_at_lag(XYs, col, lags, bw, distance_metric) + XYs, col, lags, bw, distance_metric, col_name = args + semis = variogram_at_lag(XYs, col, lags, bw, distance_metric, col_name) sv, h = semis[0], semis[1] start_params = [np.nanmax(h), np.nanmax(sv)] bounds = (0, start_params) @@ -207,10 +215,11 @@ def plot_autocorrelation_ranges( results = [] for i, col in enumerate(X.values.T): + col_name = X.columns[i] if verbose: - print(f"{i}: {X.columns[i]}") + print(f"{i}: {col_name}") # Fit spherical model and extract effective range parameter - args = (XYs, col, lags, bw, distance_metric) + args = (XYs, col, lags, bw, distance_metric, col_name) results.append(pool.apply_async(calculate_range, (args,))) for result in results: @@ -220,10 +229,11 @@ def plot_autocorrelation_ranges( pool.join() else: for i, col in enumerate(X.values.T): + col_name = X.columns[i] if verbose: - print(f"{i}: {X.columns[i]}") + print(f"{i}: {col_name}") # Fit spherical model and extract effective range parameter - args = (XYs, col, lags, bw, distance_metric) + args = (XYs, col, lags, bw, distance_metric, col_name) eff_range = calculate_range(args) ranges.append(eff_range) From 6a894fbf0be2f2f07e4e1010ef0f5046763d31fc Mon Sep 17 00:00:00 2001 From: Daniel Lusk Date: Sat, 17 Jun 2023 16:03:29 +0200 Subject: [PATCH 11/14] add type hinting to HBLOCK --- spacv/spacv.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/spacv/spacv.py b/spacv/spacv.py index 206116e..1d39b05 100644 --- a/spacv/spacv.py +++ b/spacv/spacv.py @@ -1,5 +1,6 @@ import numbers import warnings +from typing import Optional, Union import geopandas as gpd import numpy as np @@ -69,17 +70,17 @@ class HBLOCK(BaseSpatialCV): def __init__( self, - tiles_x=5, - tiles_y=5, - shape="square", - method="unique", - buffer_radius=0, - direction="diagonal", - n_groups=5, - data=None, - n_sims=10, - distance_metric="euclidean", - random_state=None, + tiles_x: int 
= 5, + tiles_y: int = 5, + shape: str = "square", + method: str = "unique", + buffer_radius: Union[int, float] = 0, + direction: str = "diagonal", + n_groups: int = 5, + data: Optional[np.ndarray] = None, + n_sims: int = 10, + distance_metric: str = "euclidean", + random_state: Optional[int] = None, ): self.tiles_x = tiles_x self.tiles_y = tiles_y From e280142c49106b7ccb1ebb746365d48a60c94fdf Mon Sep 17 00:00:00 2001 From: Daniel Lusk Date: Sun, 10 Sep 2023 08:43:39 +0200 Subject: [PATCH 12/14] drop nans and only use a subset of 30000 points for AC ranges calc --- spacv/visualisation.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/spacv/visualisation.py b/spacv/visualisation.py index 8661f46..a8d1306 100644 --- a/spacv/visualisation.py +++ b/spacv/visualisation.py @@ -65,6 +65,17 @@ def variogram_at_lag( XYs = geometry_to_2d(XYs) x = np.asarray(x) + # Remove nans from x and XYs for variogram calculation + nan_idx = np.isnan(x) + x = x[~nan_idx] + XYs = XYs[~nan_idx] + + # Select only 30000 random points from x and XYs to calculate pairwise distances + if len(x) > 30000: + idx = np.random.choice(len(x), 30000, replace=False) + x = x[idx] + XYs = XYs[idx] + if distance_metric == "euclidean": paired_distances = pdist(XYs) pd_m = squareform(paired_distances) @@ -318,7 +329,7 @@ def aoa( folds = np.vstack((fold_indices, instance_fold_id)).T # Mask training points in same fold for DI measure calculation - for i, row in enumerate(train_dist): + for i, _ in enumerate(train_dist): mask = folds[:, 0] == folds[:, 0][i] train_dist[i, mask] = np.nan From 472743b19917a3e5b891220002d32286b50efb64 Mon Sep 17 00:00:00 2001 From: Daniel Lusk Date: Sun, 19 May 2024 12:18:29 +0200 Subject: [PATCH 13/14] update version to reflect changes --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index d0ea701..1f77f43 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( name="spacv", - version="0.0.21", + version="0.0.22", description="Spatial cross-validation in Python", long_description=long_description, long_description_content_type="text/markdown", From 954b1332d60ac753ceb9b90fb73b2a447957cd22 Mon Sep 17 00:00:00 2001 From: Daniel Lusk Date: Sun, 19 May 2024 13:17:21 +0200 Subject: [PATCH 14/14] remove pygeos from requirements now that it has been integrated with Shapely --- requirements.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 248d848..289910f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,5 +4,4 @@ pandas geopandas shapely scikit-learn -scipy -pygeos \ No newline at end of file +scipy \ No newline at end of file
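
Taken together, the patches above make a handful of small additions to spacv's public surface, sketched below against hypothetical data. The file name samples.gpkg and the column elevation are placeholders, not part of the patches; any point GeoDataFrame in EPSG:4326 with numeric covariates would do. First, the new plot_variogram helper from the visualisation module:

    import geopandas as gpd
    import numpy as np
    from spacv.visualisation import plot_variogram

    gdf = gpd.read_file("samples.gpkg")      # hypothetical points in EPSG:4326
    lags = np.arange(5_000, 100_000, 5_000)  # lag distances in metres
    bw = 2_500                               # half-width of each lag bin

    # Scatter the empirical semivariances against lag distance; with a
    # geographic CRS, "haversine" yields great-circle distances in metres.
    semis = plot_variogram(gdf.geometry, gdf["elevation"], lags, bw,
                           distance_metric="haversine")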
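
Continuing from the same gdf, lags, and bw, the reworked plot_autocorrelation_ranges can fit the per-column spherical models in parallel, subsamples inputs above 30000 points before the pairwise-distance step, and returns the fitted effective ranges alongside the figure. The extra covariate names below are again placeholders:

    from spacv.visualisation import plot_autocorrelation_ranges

    X = gdf[["elevation", "ndvi", "slope"]]  # hypothetical covariate columns

    fig, ax, ranges = plot_autocorrelation_ranges(
        gdf.geometry, X, lags, bw,
        distance_metric="haversine", workers=4, verbose=True,
    )

The median of the returned ranges is the block-size suggestion drawn as the dashed line on the bar chart.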
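
The buffering fixes in spacv/base_classes.py and spacv/spacv.py can be exercised through HBLOCK, assuming spacv re-exports it at package level (import from spacv.spacv otherwise). For EPSG:4326 inputs the grid is projected to EPSG:3857 before buffering, so buffer_radius is interpreted in metres, and a zero buffer no longer raises:

    import spacv

    # 4 x 4 blocked folds with a 25 km dead zone around each test block.
    skcv = spacv.HBLOCK(tiles_x=4, tiles_y=4, buffer_radius=25_000,
                        random_state=1)
    for train_idx, test_idx in skcv.split(gdf.geometry):
        print(len(train_idx), len(test_idx))  # buffered points appear in neither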
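
Finally, the reformatted AOA utilities plot the dissimilarity index next to the applicability mask. A minimal sketch under the same assumptions, with prediction_grid.gpkg standing in for a prediction-surface GeoDataFrame that shares the covariate columns and carries its own geometry column:

    from spacv.visualisation import plot_aoa

    new_data = gpd.read_file("prediction_grid.gpkg")  # hypothetical grid

    # Left panel: dissimilarity index (DI); right panel: binary AOA mask,
    # with the training locations overlaid on both.
    fig, ax = plot_aoa(new_data, gdf, columns=["elevation", "ndvi"])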