Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Fix deprecations; support geographic coordinates; add some type hinting/formatting #6

Open
wants to merge 14 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,4 @@ pandas
geopandas
shapely
scikit-learn
scipy
pygeos
scipy
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

setup(
name="spacv",
version="0.0.21",
version="0.0.22",
description="Spatial cross-validation in Python",
long_description=long_description,
long_description_content_type="text/markdown",
Expand Down
81 changes: 43 additions & 38 deletions spacv/base_classes.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,30 @@
from abc import ABC, abstractmethod, ABCMeta
import numpy as np
from abc import ABCMeta, abstractmethod

import geopandas as gpd
import numpy as np
from sklearn.model_selection import BaseCrossValidator
from .utils import convert_geoseries, convert_geodataframe

from .utils import convert_geodataframe, convert_geoseries


class BaseSpatialCV(BaseCrossValidator, metaclass=ABCMeta):
"""
Base class for partitioning-based spatial cross-validation approaches.
"""
def __init__(self, buffer_radius=0):
    """
    Store the settings shared by partitioning-based spatial splitters.

    Parameters
    ----------
    buffer_radius : number, default=0
        Radius of the dead zone around each test fold. ``split`` reads it
        from ``self.buffer_radius`` and rejects values larger than the
        extent of the data's bounding box. A value of 0 disables the
        dead zone.
    """
    # The previous signature took no arguments, yet the body read a bare
    # ``buffer_radius`` name, which raised NameError whenever this
    # initializer actually ran. Accepting it as an optional keyword keeps
    # zero-argument construction working.
    self.buffer_radius = buffer_radius

def split(self, XYs):
"""
Generate indices to split data into training and test set.

Parameters
----------
XYs : GeoSeries
GeoSeries containing shapely Points that identify Easting
and Northing coordinates of data points.

Yields
------
train : ndarray
Expand All @@ -32,77 +34,80 @@ def split(self, XYs):
"""
XYs = convert_geoseries(XYs).reset_index(drop=True)
minx, miny, maxx, maxy = XYs.total_bounds

buffer_radius = self.buffer_radius
if buffer_radius > maxx-minx or buffer_radius > maxy-miny:
if buffer_radius > maxx - minx or buffer_radius > maxy - miny:
raise ValueError(
"buffer_radius too large and excludes all points. Given {}.".format(
self.buffer_radius
)
)
num_samples = XYs.shape[0]
# num_samples = XYs.shape[0]
indices = XYs.index.values

for test_indices, train_excluded in self._iter_test_indices(XYs):
# Exclude the training indices within buffer
train_excluded = np.concatenate([test_indices, train_excluded])

for test_indices, train_excluded in self._iter_test_indices(XYs):
if train_excluded.ndim and train_excluded.size:
# Exclude the training indices within buffer
train_excluded = np.concatenate([test_indices, train_excluded])
else:
train_excluded = test_indices
train_index = np.setdiff1d(
np.union1d(
indices,
train_excluded
), np.intersect1d(indices, train_excluded)
)
np.union1d(indices, train_excluded),
np.intersect1d(indices, train_excluded),
)
if len(train_index) < 1:
raise ValueError(
"Training set is empty. Try lowering buffer_radius to include more training instances."
)
test_index = indices[test_indices]
test_index = indices[test_indices]
yield train_index, test_index

def _remove_buffered_indices(
    self, XYs, test_indices, buffer_radius, geometry_buffer
):
    """
    Identify training points that lie inside the dead-zone buffer.

    Parameters
    ----------
    XYs : GeoSeries
        All data points; its index labels are the ones compared against
        ``test_indices``.
    test_indices : array-like
        Index labels of the points in the current test fold.
    buffer_radius : number
        Dead-zone radius. The spatial lookup only runs when it is > 0.
    geometry_buffer : GeoSeries or GeoDataFrame
        Geometry that training points are spatially joined against.
        NOTE(review): presumably the test-fold geometry already buffered
        by ``buffer_radius`` -- confirm against the callers.

    Returns
    -------
    test_indices : array-like
        The ``test_indices`` argument, returned unchanged.
    train_exclude : ndarray
        Index labels of non-test points matched by the spatial join, or an
        empty 1-D integer array when ``buffer_radius`` is not positive.
    """
    if not buffer_radius > 0:
        # No dead zone, so nothing to exclude. Return a genuinely empty
        # 1-D array: ``np.empty([])`` would be a 0-d array holding one
        # uninitialised value, which ``np.concatenate`` rejects, and the
        # ``np.int`` alias no longer exists in current NumPy releases.
        return test_indices, np.empty(0, dtype=int)

    # Only points outside the test fold are candidates for removal.
    candidate_deadzone = XYs.loc[~XYs.index.isin(test_indices)]
    candidate_deadzone = convert_geodataframe(candidate_deadzone)
    geometry_buffer = convert_geodataframe(geometry_buffer)

    # Keep the candidates that the spatial join matches to the buffer.
    deadzone_points = gpd.sjoin(candidate_deadzone, geometry_buffer)
    outside_test = ~deadzone_points.index.isin(test_indices)
    train_exclude = deadzone_points.loc[outside_test].index.values
    return test_indices, train_exclude

@abstractmethod
def _iter_test_indices(self, XYs):
    """
    Generates integer indices corresponding to test sets and
    training indices to be excluded from model training.

    Parameters
    ----------
    XYs : GeoSeries
        GeoSeries containing shapely Points that identify Easting
        and Northing coordinates of data points.

    Yields
    ------
    test_indices : array
        The testing set indices for that fold.
    train_exclude : array
        The training set indices to exclude for that fold.
    """

def get_n_splits(self, X=None, y=None, groups=None):
    """
    Returns the number of folds used in the cross-validation.

    Parameters
    ----------
    X : object, optional
        Ignored; accepted so the method can be called with the
        ``get_n_splits(X, y, groups)`` signature used by scikit-learn.
    y : object, optional
        Ignored; present for scikit-learn compatibility.
    groups : object, optional
        Ignored; present for scikit-learn compatibility.

    Returns
    -------
    n_splits : int
        Returns the number of folds in the cross-validator.
    """
    # The fold count is read from the ``n_splits`` attribute of the
    # instance and does not depend on the data.
    return self.n_splits


Loading