Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix is_time to avoid memory overload #397

Open
wants to merge 19 commits into
base: master
Choose a base branch
from
Open
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 40 additions & 9 deletions clisops/utils/dataset_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import cf_xarray as cfxr # noqa
import cftime
import dask.array as da
import fsspec
import numpy as np
import xarray as xr
Expand Down Expand Up @@ -61,6 +62,13 @@ def get_coord_by_type(
elif isinstance(ds, xr.Dataset):
# Not all coordinate variables are always classified as such
coord_vars = list(ds.coords) + list(ds.data_vars)
# make sure we skip the main variable!
try:
var = get_main_variable(ds)
except ValueError:
warnings.warn(f"No variable found for dataset '{ds}'.")
else:
coord_vars.remove(var)
else:
raise TypeError("Not an xarray.Dataset or xarray.DataArray.")
for coord_id in coord_vars:
Expand Down Expand Up @@ -95,11 +103,12 @@ def get_coord_by_type(

# Select coordinate with most dims (matching with main variable dims)
for coord_id in coords:
if all([dim in main_var_dims for dim in ds.coords[coord_id].dims]):
if return_further_matches:
return coord_id, [x for x in coords if x != coord_id]
else:
return coord_id
if coord_id in ds.coords:
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

make sure coord_id is in ds.coords (lat_bnds is not)

if all([dim in main_var_dims for dim in ds.coords[coord_id].dims]):
if return_further_matches:
return coord_id, [x for x in coords if x != coord_id]
else:
return coord_id
# If the decision making fails, pass the first match
if return_further_matches:
return coords[0], coords[1:]
Expand Down Expand Up @@ -207,13 +216,38 @@ def is_level(coord):
return False


def _is_time(coord):
"""
Check if a coordinate uses cftime datetime objects.
Handles Dask-backed arrays for lazy evaluation.
Comment on lines +225 to +226
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
Check if a coordinate uses cftime datetime objects.
Handles Dask-backed arrays for lazy evaluation.
Check if a coordinate uses cftime datetime objects.
Handles Dask-backed arrays for lazy evaluation.

"""
if coord.size == 0:
return False # Empty array

if isinstance(coord.dtype.type(), cftime.datetime):
return True

# Safely get first element without loading entire array
first_value = coord.isel({dim: 0 for dim in coord.dims}).values

# Compute only if it's a Dask array
if isinstance(first_value, da.Array):
first_value = first_value.compute()

return isinstance(first_value.item(0), cftime.datetime)


def is_time(coord):
"""
Determines if a coordinate is time.

:param coord: coordinate of xarray dataset e.g. coord = ds.coords[coord_id]
:return: (bool) True if the coordinate is time.
"""
if False and coord.ndim >= 2:
# skip variables with more than two dimensions: lat_bnds, lon_bnds, time_bnds, t, ...
return False

if "time" in coord.cf.coordinates and coord.name in coord.cf.coordinates["time"]:
return True

Expand All @@ -226,14 +260,11 @@ def is_time(coord):
if np.issubdtype(coord.dtype, np.datetime64):
return True

if isinstance(np.atleast_1d(coord.values)[0], cftime.datetime):
return True

if hasattr(coord, "axis"):
if coord.axis == "T":
return True

return False
return _is_time(coord)


def is_realization(coord):
Expand Down
Loading