diff --git a/.vscode/launch.json b/.vscode/launch.json index 1edaef0..c05cb0a 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -198,6 +198,7 @@ "convert-s2-optimized", // "https://objects.eodc.eu/e05ab01a9d56408d82ac32d69a5aae2a:202509-s02msil2a/08/products/cpm_v256/S2A_MSIL2A_20250908T100041_N0511_R122_T32TQM_20250908T115116.zarr", "https://objects.eodc.eu/e05ab01a9d56408d82ac32d69a5aae2a:202511-s02msil2a-eu/15/products/cpm_v262/S2B_MSIL2A_20251115T091139_N0511_R050_T35SLU_20251115T111807.zarr", + // "https://objects.eodc.eu:443/e05ab01a9d56408d82ac32d69a5aae2a:202512-s02msil2a-eu/16/products/cpm_v262/S2B_MSIL2A_20251216T102339_N0511_R065_T32TNS_20251216T123617.zarr", // "https://objects.eodc.eu/e05ab01a9d56408d82ac32d69a5aae2a:202511-s02msil2a-eu/16/products/cpm_v262/S2A_MSIL2A_20251116T085431_N0511_R107_T35SQD_20251116T103813.zarr", // "s3://esa-zarr-sentinel-explorer-fra/tests-output/sentinel-2-l2a-opt/S2A_MSIL2A_20250908T100041_N0511_R122_T32TQM_20250908T115116.zarr", // "s3://esa-zarr-sentinel-explorer-fra/tests-output/sentinel-2-l2a-staging/S2B_MSIL2A_20251115T091139_N0511_R050_T35SLU_20251115T111807.zarr", @@ -211,7 +212,7 @@ // "--omit-nodes", // "quality/l2a_quicklook", "--dask-cluster", - "--verbose" + // "--verbose" ], "cwd": "${workspaceFolder}", "justMyCode": false, diff --git a/pyproject.toml b/pyproject.toml index fe932fb..03c46df 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,9 +27,9 @@ classifiers = [ ] requires-python = ">=3.11" dependencies = [ - "pydantic-zarr>=0.8.0", + "pydantic-zarr>=0.9.1", "pydantic>=2.12", - "zarr>=3.1.1", + "zarr>=3.1.4", "xarray>=2025.7.1", "dask[array,distributed]>=2025.5.1", "numpy>=2.3.1", @@ -172,6 +172,8 @@ module = ["zarr.*", "xarray.*", "rioxarray.*", "cf_xarray.*", "dask.*"] ignore_missing_imports = true [tool.pytest.ini_options] +filterwarnings = "error" +log_level = "WARNING" minversion = "7.0" addopts = "-ra -q --strict-markers --strict-config" testpaths = ["tests"] diff --git a/src/eopf_geozarr/cli.py b/src/eopf_geozarr/cli.py index e3db9e3..32ad608 100755 --- a/src/eopf_geozarr/cli.py +++ b/src/eopf_geozarr/cli.py @@ -1155,8 +1155,16 @@ def add_s2_optimization_commands(subparsers: argparse._SubParsersAction) -> None choices=range(1, 10), help="Compression level 1-9 (default: 3)", ) + s2_parser.add_argument( + "--omit-nodes", help="The names of groups or arrays to skip.", default="", type=str + ) s2_parser.add_argument("--skip-validation", action="store_true", help="Skip output validation") s2_parser.add_argument("--verbose", action="store_true", help="Enable verbose output") + s2_parser.add_argument( + "--allow-json-nan", + action="store_true", + help="Allow invalid float values (nan, inf) in output JSON", + ) s2_parser.add_argument( "--keep-scale-offset", action="store_true", @@ -1184,18 +1192,26 @@ def convert_s2_optimized_command(args: argparse.Namespace) -> None: # Load input dataset log.info("Loading Sentinel-2 dataset from", input_path=args.input_path) storage_options = get_storage_options(str(args.input_path)) + store = args.input_path dt_input = xr.open_datatree( - str(args.input_path), engine="zarr", chunks="auto", storage_options=storage_options + store, + engine="zarr", + chunks="auto", + storage_options=storage_options, ) + omit_nodes = set(args.omit_nodes.split()) + # Convert convert_s2_optimized( dt_input=dt_input, output_path=args.output_path, enable_sharding=args.enable_sharding, spatial_chunk=args.spatial_chunk, + omit_nodes=omit_nodes, compression_level=args.compression_level, 
validate_output=not args.skip_validation, + allow_json_nan=args.allow_json_nan, keep_scale_offset=args.keep_scale_offset, ) diff --git a/src/eopf_geozarr/conversion/geozarr.py b/src/eopf_geozarr/conversion/geozarr.py index b6d5279..e09d700 100644 --- a/src/eopf_geozarr/conversion/geozarr.py +++ b/src/eopf_geozarr/conversion/geozarr.py @@ -1189,6 +1189,7 @@ def cleanup_prefix(prefix: str) -> None: engine="zarr", decode_coords="all", chunks="auto", + consolidated=False, storage_options=store_storage_options, ) break diff --git a/src/eopf_geozarr/data_api/geozarr/common.py b/src/eopf_geozarr/data_api/geozarr/common.py index f8c1e99..c6a9b16 100644 --- a/src/eopf_geozarr/data_api/geozarr/common.py +++ b/src/eopf_geozarr/data_api/geozarr/common.py @@ -5,6 +5,7 @@ import io import urllib import urllib.request +from collections.abc import Mapping from dataclasses import dataclass from typing import ( TYPE_CHECKING, @@ -24,12 +25,18 @@ from pydantic.experimental.missing_sentinel import MISSING from typing_extensions import Protocol, TypedDict, runtime_checkable -from eopf_geozarr.data_api.geozarr.projjson import ProjJSON # noqa: TC001 +from eopf_geozarr.data_api.geozarr.projjson import ProjJSON +from eopf_geozarr.data_api.geozarr.types import ( + CF_SCALE_OFFSET_KEYS, + CFScaleOffset, + EmptyDict, +) if TYPE_CHECKING: from collections.abc import Mapping + @dataclass(frozen=True) class UNSET_TYPE: """ @@ -289,3 +296,38 @@ def check_grid_mapping(model: TDataSetLike) -> TDataSetLike: def is_none(data: object) -> TypeGuard[None]: return data is None + + +def extract_scale_offset( + data: Mapping[str, object], +) -> tuple[dict[str, object], CFScaleOffset | EmptyDict]: + """ + Extract scale/offset information from a mapping, returning the remaining data and the scale/offset info. + + Parameters + ---------- + data : Mapping[str, object] + The input mapping from which to extract scale/offset information. + + Returns + ------- + tuple[dict[str, object], CFScaleOffset | EmptyDict] + A tuple containing the remaining data (with scale/offset keys removed) and the extracted scale/offset info. + """ + scale_offset: CFScaleOffset = {} # type: ignore[typeddict-item] + remaining_data: dict[str, object] = {} + + if set(data.keys()).isdisjoint(CF_SCALE_OFFSET_KEYS): + return dict(data), {} + + if set(data.keys()).issuperset(CF_SCALE_OFFSET_KEYS): + for key, value in data.items(): + if key in CF_SCALE_OFFSET_KEYS: + scale_offset[key] = value # type: ignore[literal-required] + else: + remaining_data[key] = value + return remaining_data, scale_offset + + raise ValueError( + "Incomplete scale/offset information: both 'scale_factor' and 'add_offset' must be present."
+ ) diff --git a/src/eopf_geozarr/data_api/geozarr/multiscales/geozarr.py b/src/eopf_geozarr/data_api/geozarr/multiscales/geozarr.py index b8ea6d1..e2f564d 100644 --- a/src/eopf_geozarr/data_api/geozarr/multiscales/geozarr.py +++ b/src/eopf_geozarr/data_api/geozarr/multiscales/geozarr.py @@ -28,7 +28,7 @@ def valid_zcm(self) -> Self: Ensure that the ZCM metadata, if present, is valid """ if self.layout is not MISSING: - zcm.Multiscales(**self.model_dump()) + zcm.Multiscales(layout=self.layout, resampling_method=self.resampling_method) return self @@ -38,7 +38,11 @@ def valid_tms(self) -> Self: Ensure that the TMS metadata, if present, is valid """ if self.tile_matrix_set is not MISSING: - tms.Multiscales(**self.model_dump()) + tms.Multiscales( + tile_matrix_set=self.tile_matrix_set, + tile_matrix_limits=self.tile_matrix_limits, + resampling_method=self.resampling_method, # type: ignore[arg-type] + ) return self diff --git a/src/eopf_geozarr/data_api/geozarr/multiscales/zcm.py b/src/eopf_geozarr/data_api/geozarr/multiscales/zcm.py index cc34b0a..175ee4b 100644 --- a/src/eopf_geozarr/data_api/geozarr/multiscales/zcm.py +++ b/src/eopf_geozarr/data_api/geozarr/multiscales/zcm.py @@ -2,7 +2,7 @@ from typing import Final, Literal, NotRequired -from pydantic import BaseModel, field_validator +from pydantic import BaseModel, field_validator, model_serializer from pydantic.experimental.missing_sentinel import MISSING from typing_extensions import TypedDict @@ -73,6 +73,15 @@ class Transform(BaseModel): scale: tuple[float, ...] | MISSING = MISSING translation: tuple[float, ...] | MISSING = MISSING + @model_serializer + def serialize_model(self) -> dict[str, tuple[float, ...]]: + result: dict[str, tuple[float, ...]] = {} + if self.scale is not MISSING: + result["scale"] = self.scale + if self.translation is not MISSING: + result["translation"] = self.translation + return result + class TransformJSON(TypedDict): scale: NotRequired[tuple[float, ...]] diff --git a/src/eopf_geozarr/data_api/geozarr/types.py b/src/eopf_geozarr/data_api/geozarr/types.py index 0d9adaf..613dbbd 100644 --- a/src/eopf_geozarr/data_api/geozarr/types.py +++ b/src/eopf_geozarr/data_api/geozarr/types.py @@ -2,7 +2,9 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Final, Literal, NotRequired, TypedDict +from typing import TYPE_CHECKING, Final, Literal, NotRequired + +from typing_extensions import TypedDict if TYPE_CHECKING: from collections.abc import Mapping @@ -16,7 +18,7 @@ class TileMatrixLimitJSON(TypedDict): maxTileRow: int -CF_SCALE_OFFSET_KEYS: Final[set[str]] = {"scale_factor", "add_offset", "dtype"} +CF_SCALE_OFFSET_KEYS: Final[set[str]] = {"scale_factor", "add_offset"} XARRAY_ENCODING_KEYS: Final[set[str]] = { "chunks", @@ -25,16 +27,32 @@ class TileMatrixLimitJSON(TypedDict): "filters", "shards", "_FillValue", + "dtype", } | CF_SCALE_OFFSET_KEYS +class CFScaleOffset(TypedDict): + """ + Metadata defining scale/offset encoding for array values. Defined by the CF + conventions and found in EOPF Sentinel products in Zarr array attributes. + """ + + scale_factor: float + add_offset: float + dtype: str + + +class EmptyDict(TypedDict, closed=True): # type: ignore[call-arg] + """A dict with no keys.""" + + class XarrayDataArrayEncoding(TypedDict): """ The dict form of the encoding for xarray.DataArray """ chunks: NotRequired[tuple[int, ...]] - preferred_chunks: NotRequired[tuple[int, ...]] + preferred_chunks: NotRequired[dict[str, int]] compressors: NotRequired[tuple[object, ...] 
| None] filters: NotRequired[tuple[object, ...]] shards: NotRequired[tuple[int, ...] | None] diff --git a/src/eopf_geozarr/data_api/s2.py b/src/eopf_geozarr/data_api/s2.py index 92a033a..1f3ed8e 100644 --- a/src/eopf_geozarr/data_api/s2.py +++ b/src/eopf_geozarr/data_api/s2.py @@ -9,6 +9,7 @@ from typing import Annotated, Any, Literal from pydantic import BaseModel, Field +from pyproj import CRS from typing_extensions import TypedDict from eopf_geozarr.data_api.geozarr.common import ( @@ -24,182 +25,21 @@ # ============================================================================ -class BandMetadata(BaseModel): - """Metadata for a single Sentinel-2 spectral band. +class OtherMetadataDict(TypedDict, total=False): + """Sentinel-2 other_metadata attributes container. - Contains physical and spectral characterization of a band. - """ - - bandwidth: Annotated[float | str, Field(description="Bandwidth in nm")] - central_wavelength: Annotated[float, Field(description="Central wavelength in nm")] - onboard_compression_rate: Annotated[str | float, Field(description="Compression rate")] - onboard_integration_time: Annotated[str | float, Field(description="Integration time")] - physical_gain: Annotated[str | float, Field(description="Physical gain factor")] - spectral_response_step: Annotated[str, Field(description="Spectral response step")] - spectral_response_values: Annotated[str, Field(description="Spectral response curve values")] - units: Annotated[str, Field(description="Unit of measurement")] - wavelength_max: Annotated[float, Field(description="Maximum wavelength in nm")] - wavelength_min: Annotated[float, Field(description="Minimum wavelength in nm")] - - -class BandDescription(BaseModel): - """Collection of band metadata for all Sentinel-2 bands. + This is intentionally flexible to accommodate variations across different + Sentinel-2 product types (L1C, L2A) and processing versions. - Maps Sentinel-2 band identifiers to their spectral and technical metadata. - All standard bands should be present in Sentinel-2 L1C and L2A products. - - Preserves both numeric (01, 02, ..., 8A) and named (b01, b02, ..., b8a) band keys - from the original JSON structure for round-trip compatibility. + Only horizontal_CRS_code is required as it's the load-bearing element + needed by the API to extract CRS information. 
""" - # Numeric band keys (JSON format) - band_01: Annotated[BandMetadata, Field(alias="01")] - band_02: Annotated[BandMetadata, Field(alias="02")] - band_03: Annotated[BandMetadata, Field(alias="03")] - band_04: Annotated[BandMetadata, Field(alias="04")] - band_05: Annotated[BandMetadata, Field(alias="05")] - band_06: Annotated[BandMetadata, Field(alias="06")] - band_07: Annotated[BandMetadata, Field(alias="07")] - band_08: Annotated[BandMetadata, Field(alias="08")] - band_09: Annotated[BandMetadata, Field(alias="09")] - band_10: Annotated[BandMetadata, Field(alias="10")] - band_11: Annotated[BandMetadata, Field(alias="11")] - band_12: Annotated[BandMetadata, Field(alias="12")] - band_8a: Annotated[BandMetadata, Field(alias="8A")] - - # Named band keys (human-readable format) - band_b01: Annotated[BandMetadata, Field(alias="b01")] - band_b02: Annotated[BandMetadata, Field(alias="b02")] - band_b03: Annotated[BandMetadata, Field(alias="b03")] - band_b04: Annotated[BandMetadata, Field(alias="b04")] - band_b05: Annotated[BandMetadata, Field(alias="b05")] - band_b06: Annotated[BandMetadata, Field(alias="b06")] - band_b07: Annotated[BandMetadata, Field(alias="b07")] - band_b08: Annotated[BandMetadata, Field(alias="b08")] - band_b09: Annotated[BandMetadata, Field(alias="b09")] - band_b10: Annotated[BandMetadata, Field(alias="b10")] - band_b11: Annotated[BandMetadata, Field(alias="b11")] - band_b12: Annotated[BandMetadata, Field(alias="b12")] - band_b8a: Annotated[BandMetadata, Field(alias="b8a")] - - model_config = {"populate_by_name": True} - - -class OtherMetadata(BaseModel): - """Sentinel-2 product metadata container. - - Stores various metadata about the product including: - - Quality information (L0/L2A, processing details) - - Band descriptions (spectral characteristics) - - Atmospheric corrections (AOT, water vapor) - - Geolocation and timing information - """ + # Required field - needed for CRS extraction + horizontal_CRS_code: str - # Core metadata fields - AOT_retrieval_model: Annotated[ - str, Field(description="Aerosol Optical Thickness retrieval model") - ] - L0_ancillary_data_quality: Annotated[ - str, Field(description="L0 ancillary data quality indicator") - ] - L0_ephemeris_data_quality: Annotated[str, Field(description="L0 ephemeris data quality")] - NUC_table_ID: Annotated[int | str, Field(description="Non-Uniformity Correction table ID")] - SWIR_rearrangement_flag: Annotated[ - str | None, Field(description="SWIR band rearrangement flag") - ] - UTM_zone_identification: Annotated[str, Field(description="UTM zone identifier")] - absolute_location_assessment_from_AOCS: Annotated[str, Field(description="Location assessment")] - - # Band information - band_description: Annotated[ - BandDescription, Field(description="Spectral band metadata for all bands") - ] - - # Accuracy declarations - declared_accuracy_of_AOT_model: Annotated[float | None, Field(description="AOT model accuracy")] - declared_accuracy_of_radiative_transfer_model: Annotated[ - float | None, Field(description="Radiative transfer accuracy") - ] - declared_accuracy_of_water_vapour_model: Annotated[ - float | None, Field(description="Water vapor model accuracy") - ] - - # Correction flags - electronic_crosstalk_correction_flag: Annotated[ - str | bool, Field(description="Electronic crosstalk correction") - ] - optical_crosstalk_correction_flag: Annotated[ - str | bool, Field(description="Optical crosstalk correction") - ] - onboard_compression_flag: Annotated[ - str | bool, Field(description="Onboard compression 
applied") - ] - onboard_equalization_flag: Annotated[ - str | bool, Field(description="Onboard equalization applied") - ] - - # Product and geometry information - eopf_category: Annotated[str, Field(description="EOPF product category")] - geometric_refinement: Annotated[ - dict[str, Any] | str | None, - Field(description="Geometric refinement information"), - ] - history: Annotated[list[dict[str, Any]] | str | None, Field(description="Processing history")] - horizontal_CRS_code: Annotated[str, Field(description="Coordinate Reference System code")] - horizontal_CRS_name: Annotated[str, Field(description="CRS name")] - mean_sensing_time: Annotated[str | None, Field(description="Mean acquisition time")] - - # Sun/sensor geometry - mean_sun_azimuth_angle_in_deg_for_all_bands_all_detectors: Annotated[ - float, Field(description="Mean sun azimuth in degrees") - ] - mean_sun_zenith_angle_in_deg_for_all_bands_all_detectors: Annotated[ - float, Field(description="Mean sun zenith in degrees") - ] - - # Atmospheric parameters - mean_value_of_aerosol_optical_thickness: Annotated[ - float | None, Field(description="Mean AOT value") - ] - mean_value_of_total_water_vapour_content: Annotated[ - float | None, Field(description="Mean water vapor content") - ] - - # Meteo information (flexible structure) - meteo: Annotated[dict[str, Any] | None, Field(description="Meteorological parameters")] - - # Quality assessments - multispectral_registration_assessment: Annotated[ - str | None, Field(description="Registration quality") - ] - product_quality_status: Annotated[str, Field(description="Product quality status")] - planimetric_stability_assessment_from_AOCS: Annotated[ - str | None, Field(description="Planimetric stability") - ] - - # Data degradation - percentage_of_degraded_MSI_data: Annotated[ - float | None, Field(description="Percentage of degraded data") - ] - - # Ozone information - ozone_source: Annotated[str | None, Field(description="Ozone data source")] - ozone_value: Annotated[float | str | None, Field(description="Ozone value")] - - # Reference band - spectral_band_of_reference: Annotated[str, Field(description="Reference spectral band")] - - # Radiometric correction - reflectance_correction_factor_from_the_Sun_Earth_distance_variation_computed_using_the_acquisition_date: Annotated[ - float | None, - Field( - default=None, - alias="reflectance_correction_factor_from_the_Sun-Earth_distance_variation_computed_using_the_acquisition_date", - description="Reflectance correction factor for Sun-Earth distance variation", - ), - ] - - model_config = {"populate_by_name": True} + # All other fields are optional and unvalidated + # Different S2 products have different metadata structures class Sentinel2ArrayAttributes(BaseModel): @@ -442,7 +282,7 @@ class Sentinel2DataArrayAttrs(BaseDataArrayAttrs): class Sentinel2RootAttrs(BaseModel): """Root-level attributes for Sentinel-2 DataTree.""" - other_metadata: dict[str, object] # no validation + other_metadata: OtherMetadataDict stac_discovery: dict[str, object] # no validation @@ -690,3 +530,11 @@ def quality(self) -> Sentinel2QualityGroup: def conditions(self) -> Sentinel2ConditionsGroup: """Get conditions group.""" return self.members["conditions"] + + @property + def crs(self) -> CRS: + """Get the coordinate reference system (CRS) for this product""" + crs_code = self.attributes.other_metadata["horizontal_CRS_code"] + # Handle both "EPSG:32635" and "32635" formats + crs_code = crs_code.removeprefix("EPSG:") # Remove "EPSG:" prefix + return 
CRS.from_epsg(int(crs_code)) diff --git a/src/eopf_geozarr/s2_optimization/s2_converter.py b/src/eopf_geozarr/s2_optimization/s2_converter.py index 774720d..ae6895c 100644 --- a/src/eopf_geozarr/s2_optimization/s2_converter.py +++ b/src/eopf_geozarr/s2_optimization/s2_converter.py @@ -5,25 +5,77 @@ from __future__ import annotations import time -from typing import Any, TypedDict +from functools import partial +from pathlib import Path +from typing import Any, Final, Literal, TypedDict import structlog import xarray as xr import zarr from pydantic import TypeAdapter from pyproj import CRS +from zarr.core.dtype import Float32, TimeDelta64, ZDType +from zarr.core.metadata import ArrayV2Metadata, ArrayV3Metadata +from zarr.core.sync import sync +from zarr.storage._common import make_store from eopf_geozarr.conversion.fs_utils import get_storage_options from eopf_geozarr.conversion.geozarr import get_zarr_group +from eopf_geozarr.data_api.geozarr.common import extract_scale_offset from eopf_geozarr.data_api.s1 import Sentinel1Root from eopf_geozarr.data_api.s2 import Sentinel2Root +from eopf_geozarr.zarrio import ArrayReencoderConfig, convert_compression, reencode_group -from .s2_multiscale import create_multiscale_from_datatree +from .s2_multiscale import ( + auto_chunks, + calculate_simple_shard_dimensions, + create_multiscale_levels, + create_multiscales_metadata, + write_geo_metadata, +) log = structlog.get_logger() - -def initialize_crs_from_dataset(dt_input: xr.DataTree) -> CRS | None: +TimeUnit = Literal[ + "days", + "hours", + "minutes", + "seconds", + "milliseconds", + "microseconds", + "nanoseconds", +] + +TimeAbbreviation = Literal["D", "h", "m", "s", "ms", "us", "ns"] + +TIME_UNIT: Final[tuple[str, ...]] = ( + "days", + "hours", + "minutes", + "seconds", + "milliseconds", + "microseconds", + "nanoseconds", +) + +TIME_ABBREVIATION: Final[tuple[str, ...]] = ("D", "h", "m", "s", "ms", "us", "ns") + +_NETCDF_TO_NUMPY_TIME_UNITS: dict[TimeUnit, TimeAbbreviation] = { + "days": "D", + "hours": "h", + "minutes": "m", + "seconds": "s", + "milliseconds": "ms", + "microseconds": "us", + "nanoseconds": "ns", +} + + +def _netcdf_unit_to_numpy_time_unit(unit: TimeUnit) -> TimeAbbreviation: + return _NETCDF_TO_NUMPY_TIME_UNITS[unit] + + +def initialize_crs_from_dataset(dt_input: xr.DataTree) -> CRS: """ Initialize CRS from dataset by checking data variables. @@ -92,86 +144,160 @@ def initialize_crs_from_dataset(dt_input: xr.DataTree) -> CRS | None: log.info("Initialized CRS from EPSG code", epsg=epsg) except Exception: pass - else: - return crs + raise ValueError("No CRS found.") - log.warning("Could not initialize CRS from dataset") - return None +class ConvertS2Params(TypedDict): + enable_sharding: bool + spatial_chunk: int + compression_level: int + max_retries: int -def convert_s2( - dt_input: xr.DataTree, - output_path: str, - validate_output: bool, - enable_sharding: bool, - spatial_chunk: int, -) -> xr.DataTree: + +def add_crs_and_grid_mapping(group: zarr.Group, crs: CRS) -> None: + """ + Add crs and grid mapping elements to a dataset. """ - Convert S2 dataset to optimized structure. 
+ ds = xr.open_dataset(group.store, group=group.path, engine="zarr", consolidated=False) + write_geo_metadata(ds, crs=crs) + + for var in ds.data_vars: + new_attrs = ds[var].attrs.copy() + new_attrs["coordinates"] = "spatial_ref" + group[var].attrs.update(new_attrs | {"grid_mapping": "spatial_ref"}) + + group.create_array( + "spatial_ref", + shape=ds["spatial_ref"].shape, + dtype=ds["spatial_ref"].dtype, + attributes=ds["spatial_ref"].attrs, + compressors=None, + filters=None, + ) - Args: - dt_input: Input Sentinel-2 DataTree - output_path: Output path for optimized dataset - validate_output: Whether to validate the output - verbose: Enable verbose logging + # Set grid_mapping attribute on the group itself + group.attrs.update({"grid_mapping": "spatial_ref"}) - Returns: - Optimized DataTree - """ - start_time = time.time() - log.info( - "Starting S2 optimized conversion", - num_groups=len(dt_input.groups), - output_path=output_path, - ) +def array_reencoder( + key: str, + metadata: ArrayV2Metadata, + *, + config: ArrayReencoderConfig | None = None, +) -> ArrayV3Metadata: + """ + Generate Zarr V3 Metadata from a key and a Zarr V2 metadata document. + """ + if config is None: + config = {} + keep_scale_offset = config.get("keep_scale_offset", False) + spatial_chunk = config.get("spatial_chunk", 256) + enable_sharding = config.get("enable_sharding", False) + compression_level = config.get("compression_level") + + attrs, maybe_scale_offset = extract_scale_offset(metadata.attributes.copy()) + out_dtype: ZDType[Any, Any] + # If the array used CF scale/offset encoding, we change the output dtype to Float32 + out_dtype = metadata.dtype + if not keep_scale_offset and len(maybe_scale_offset) > 0: + out_dtype = Float32() + + # handle xarray datetime/timedelta encoding + # If the array has time-related units, ensure the dtype attribute matches the actual dtype + if attrs.get("units") in TIME_UNIT: + numpy_time_unit = _netcdf_unit_to_numpy_time_unit(attrs["units"]) # type: ignore[arg-type] + # Check if this is a timedelta or datetime based on: + # 1. The zarr dtype (if it's a native time type like TimeDelta64) + # 2. The existing dtype attribute (for int64-encoded times) + # 3. 
The standard_name attribute (e.g., "forecast_period" indicates timedelta) + existing_dtype_attr = str(attrs.get("dtype", "")) + standard_name = str(attrs.get("standard_name", "")) + is_timedelta = ( + isinstance(metadata.dtype, TimeDelta64) + or "timedelta" in existing_dtype_attr + or standard_name == "forecast_period" # CF convention for time since forecast + ) - # Validate input is S2 - if not is_sentinel2_dataset(get_zarr_group(dt_input)): - raise ValueError("Input dataset is not a Sentinel-2 product") + if is_timedelta: + # This is a timedelta array - set or correct the dtype attribute + # Note: xarray/pandas only support timedelta64 with s/ms/us/ns, not m/h/D + # Use 'ns' (the pandas default) as the safest option + attrs["dtype"] = "timedelta64[ns]" + else: + # This is a datetime array - set or correct the dtype attribute + attrs["dtype"] = f"datetime64[{numpy_time_unit}]" - # Step 1: Process data while preserving original structure - log.info("Step 1: Processing data with original structure preserved") - - # Step 2: Create multiscale pyramids for each group in the original structure - log.info("Step 2: Creating multiscale pyramids (preserving original hierarchy)") - datasets = create_multiscale_from_datatree( - dt_input, - output_group=zarr.open_group(output_path), - spatial_chunk=spatial_chunk, - enable_sharding=enable_sharding, - keep_scale_offset=False, + dimension_names: None | tuple[str, ...] = attrs.pop("_ARRAY_DIMENSIONS", None) # type: ignore[assignment] + compressor_converted = convert_compression( + metadata.compressor, compression_level=compression_level + ) + chunk_key_encoding: dict[str, str | dict[str, object]] = { + "name": "default", + "configuration": {"separator": "/"}, + } + + # Zarr v2 allows `None` as a fill value, but for Zarr v3 a fill value consistent with the + # array's data type must be provided. We use the zarr-python model of the data type to get + # a fill value here. + if metadata.fill_value is None: + fill_value = metadata.dtype.default_scalar() + else: + fill_value = metadata.fill_value + + group_name = str(Path(key).parent) + # sentinel-specific logic: if this array is a variable stored in a measurements group + # then we will apply particular chunking + # Check if the group name contains measurements and the last component is a resolution (r10m, r20m, etc.) + in_measurements_group = ( + "measurements" in group_name + and Path(group_name).name.startswith("r") + and Path(group_name).name.endswith("m") ) - log.info("Created multiscale pyramids", num_groups=len(datasets)) + chunk_shape: tuple[int, ...] = metadata.chunks - # Step 3: Root-level consolidation - log.info("Step 3: Final root-level metadata consolidation") - simple_root_consolidation(output_path, datasets) + # check if this is a coordinate variable (the array's own name matches its single dimension name) + is_coord_var = [Path(key).name] == metadata.attributes.get("_ARRAY_DIMENSIONS") - # Step 4: Validation - if validate_output: - log.info("Step 4: Validating optimized dataset") - validation_results = validate_optimized_dataset(output_path) - if not validation_results["is_valid"]: - log.warning("Validation issues found", issues=validation_results["issues"]) + if in_measurements_group: + chunk_shape = auto_chunks(metadata.shape, spatial_chunk) - # Create result DataTree - result_dt = create_result_datatree(output_path) + if is_coord_var: + chunk_shape = metadata.shape - total_time = time.time() - start_time - log.info("Optimization complete", duration_seconds=round(total_time, 2)) + subchunk_shape: tuple[int, ...]
| None = None - optimization_summary(dt_input, result_dt, output_path) + if in_measurements_group and metadata.ndim >= 2 and enable_sharding and not is_coord_var: + subchunk_shape = chunk_shape + chunk_shape = calculate_simple_shard_dimensions(metadata.shape, chunk_shape) - return result_dt + chunk_grid: dict[str, str | dict[str, object]] - -class ConvertS2Params(TypedDict): - enable_sharding: bool - spatial_chunk: int - compression_level: int - max_retries: int + chunk_grid = {"name": "regular", "configuration": {"chunk_shape": chunk_shape}} + if enable_sharding and subchunk_shape is not None: + codecs = ( + { + "name": "sharding_indexed", + "configuration": { + "chunk_shape": subchunk_shape, + "index_codecs": ({"name": "bytes"}, {"name": "crc32c"}), + "index_location": "end", + "codecs": ({"name": "bytes"}, *compressor_converted), + }, + }, + ) + else: + codecs = ({"name": "bytes"}, *compressor_converted) # type: ignore[assignment] + return ArrayV3Metadata( + shape=metadata.shape, + data_type=out_dtype, + chunk_key_encoding=chunk_key_encoding, + chunk_grid=chunk_grid, + fill_value=fill_value, + dimension_names=dimension_names, + codecs=codecs, + attributes=attrs, + ) def convert_s2_optimized( @@ -182,8 +308,10 @@ def convert_s2_optimized( spatial_chunk: int, compression_level: int, validate_output: bool, + omit_nodes: set[str] | None = None, keep_scale_offset: bool, max_retries: int = 3, + allow_json_nan: bool = False, ) -> xr.DataTree: """ Convenience function for S2 optimization. @@ -198,11 +326,19 @@ def convert_s2_optimized( keep_scale_offset: Whether to preserve scale-offset encoding of the source data. max_retries: Maximum number of retries for network operations - Returns: + Returns + ------- + xr.DataTree Optimized DataTree """ + if omit_nodes is None: + omit_nodes = set() + start_time = time.time() + zg = get_zarr_group(dt_input) + s2root_model = Sentinel2Root.from_zarr(zg) + crs = s2root_model.crs log.info( "Starting S2 optimized conversion", @@ -210,34 +346,55 @@ def convert_s2_optimized( output_path=output_path, ) # Validate input is S2 - if not is_sentinel2_dataset(get_zarr_group(dt_input)): + if not is_sentinel2_dataset(zg): raise ValueError("Input dataset is not a Sentinel-2 product") - # Initialize CRS from dataset - crs = initialize_crs_from_dataset(dt_input) - - # Step 1: Process data while preserving original structure - log.info("Step 1: Processing data with original structure preserved") + out_store = sync(make_store(output_path)) - # Step 2: Create multiscale pyramids for each group in the original structure - log.info("Step 2: Creating multiscale pyramids (preserving original hierarchy)") + log.info("Re-encoding source data to Zarr V3") - output_group = zarr.open_group(output_path) + # Create a partial function by specifying parameters for our array encoder + _array_reencoder = partial( + array_reencoder, + config={ + "spatial_chunk": spatial_chunk, + "enable_sharding": enable_sharding, + "compression_level": compression_level, + "keep_scale_offset": keep_scale_offset, + }, + ) - datasets = create_multiscale_from_datatree( - dt_input, - output_group=output_group, - spatial_chunk=spatial_chunk, - enable_sharding=enable_sharding, - crs=crs, - keep_scale_offset=keep_scale_offset, + out_group = reencode_group( + zg, + out_store, + path="", + overwrite=True, + array_reencoder=_array_reencoder, # type: ignore[arg-type] + omit_nodes=omit_nodes, + allow_json_nan=allow_json_nan, ) + if "measurements" in out_group: + log.info("Adding CRS elements to datasets in 
measurements") + for _, subgroup in out_group["measurements"].groups(): + for _, dataset in subgroup.groups(): + add_crs_and_grid_mapping(dataset, crs=crs) + + if "quality" in out_group: + log.info("Adding CRS elements to quality datasets") + for _, subgroup in out_group["quality"].groups(): + for _, dataset in subgroup.groups(): + add_crs_and_grid_mapping(dataset, crs=crs) + + # Create multiscale pyramids for each group in the original structure + log.info("Adding multiscale levels") - log.info("Created multiscale pyramids", num_groups=len(datasets)) + # Create multiscale metadata + create_multiscale_levels(out_group, "measurements/reflectance") + create_multiscales_metadata(out_group["measurements/reflectance"]) - # Step 3: Root-level consolidation log.info("Step 3: Final root-level metadata consolidation") - simple_root_consolidation(output_path, datasets) + # Pass empty dict since all groups are already created by reencode_group + simple_root_consolidation(output_path, {}) # Step 4: Validation if validate_output: diff --git a/src/eopf_geozarr/s2_optimization/s2_multiscale.py b/src/eopf_geozarr/s2_optimization/s2_multiscale.py index b4b465b..4d8899a 100644 --- a/src/eopf_geozarr/s2_optimization/s2_multiscale.py +++ b/src/eopf_geozarr/s2_optimization/s2_multiscale.py @@ -14,7 +14,7 @@ from dask import delayed from dask.array import from_delayed from pydantic.experimental.missing_sentinel import MISSING -from pyproj import CRS +from zarr.codecs import BloscCodec from eopf_geozarr.conversion.geozarr import ( _create_tile_matrix_limits, @@ -40,6 +40,7 @@ from collections.abc import Hashable, Mapping import zarr + from pyproj import CRS from eopf_geozarr.types import OverviewLevelJSON @@ -64,153 +65,130 @@ def get_grid_spacing(ds: xr.DataArray, coords: tuple[Hashable, ...]) -> tuple[fl return tuple(np.abs(ds.coords[coord][0].data - ds.coords[coord][1].data) for coord in coords) -def create_multiscale_from_datatree( - dt_input: xr.DataTree, - *, - output_group: zarr.Group, - enable_sharding: bool, - spatial_chunk: int, - crs: CRS | None = None, - keep_scale_offset: bool, -) -> dict[str, dict]: +def create_multiscale_levels(group: zarr.Group, path: str) -> None: """ - Create multiscale versions preserving original structure. - Keeps all original groups, adds r120m, r360m, r720m downsampled versions. - - Args: - dt_input: Input DataTree with original structure - output_path: Base output path - enable_sharding: Enable Zarr v3 sharding - spatial_chunk: Spatial chunk size - crs: Coordinate Reference System for datasets - - Returns: - Dictionary of processed groups + Add additional multiscale levels to an existing Zarr group """ - processed_groups = {} - # The scale levels in the output data. 10, 20, 60 already exist in the source data. 
- - # Step 1: Copy all original groups as-is - for group_path in dt_input.groups: - if group_path == ".": - continue + ds_levels = (20, 60, 120, 360, 720) + # Construct the full path to the group containing resolution levels + full_path = f"{group.path}/{path}" if group.path else path + + for cur_factor, next_factor in pairwise((10, *ds_levels)): + cur_group_name = f"r{cur_factor}m" + next_group_name = f"r{next_factor}m" + # Open the current resolution level as a dataset + cur_group_path = f"{full_path}/{cur_group_name}" + cur_ds = xr.open_dataset(group.store, group=cur_group_path, engine="zarr") + + scale = next_factor // cur_factor + to_downsample: dict[str, xr.DataArray] = {} + to_copy: dict[str, xr.DataArray] = {} + + # Iterate over all variables (data_vars and coords) + for var_name in list(cur_ds.data_vars) + list(cur_ds.coords): + # Convert to string for type safety + var_name_str = str(var_name) + var = cur_ds[var_name] + next_level_path = f"{path}/{next_group_name}" + + # Skip if already exists in next level + if f"{next_level_path}/{var_name_str}" in group: + continue - group_node = dt_input[group_path] + # Decide whether to downsample or copy + # Spatial data variables (2D+) should be downsampled + if var_name in cur_ds.data_vars and var.ndim >= 2: + to_downsample[var_name_str] = var + # Everything else (coordinates, 0D/1D variables like spatial_ref) should be copied + else: + to_copy[var_name_str] = var - # Skip parent groups that have children (only process leaf groups) - if hasattr(group_node, "children") and len(group_node.children) > 0: - continue + log.info("downsampling %s into %s", tuple(sorted(to_downsample.keys())), next_group_name) + if to_copy: + log.info("copying %s into %s", tuple(sorted(to_copy.keys())), next_group_name) - dataset = group_node.to_dataset() + # Only process if there's something to downsample or copy + if to_downsample or to_copy: + # Create downsampled dataset with data variables to downsample + downsampled_ds = create_downsampled_resolution_group( + xr.Dataset(data_vars=to_downsample), factor=scale + ) - # Skip empty groups - if not dataset.data_vars: - log.info("Skipping empty group: {}", group_path=group_path) - continue + # Add variables to copy (like spatial_ref) + for var_name, var in to_copy.items(): + if var_name not in downsampled_ds.coords: + downsampled_ds = downsampled_ds.assign_coords({var_name: var}) + next_group_path = f"{full_path}/{next_group_name}" + downsampled_ds.to_zarr( + group.store, group=next_group_path, consolidated=False, mode="a", compute=True + ) - log.info("Copying original group: {}", group_path=group_path) - # Determine if this is a measurement-related resolution group - group_name = group_path.split("/")[-1] - is_measurement_group = ( - group_name.startswith("r") - and group_name.endswith("m") - and "/measurements/" in group_path +def update_encoding( + array: xr.DataArray, encoding: XarrayDataArrayEncoding +) -> XarrayDataArrayEncoding: + """ + Update an xarray encoding of a variable against a dataarray. Used when ensuring that a downsampled + dataarray has an encoding consistent with both the source array and also its newly reduced shape. + Shape-related quantities like chunks and shards need to be adjusted to match the shape of the array. 
+ All other elements of the encoding are preserved + """ + new_encoding: XarrayDataArrayEncoding = {**encoding} + if "chunks" in new_encoding: + new_encoding["chunks"] = tuple( + min(s, c) for s, c in zip(array.shape, new_encoding["chunks"], strict=True) ) - - if is_measurement_group: - # Measurement groups: apply custom encoding - encoding = create_measurements_encoding( - dataset, - spatial_chunk=spatial_chunk, - enable_sharding=enable_sharding, - keep_scale_offset=keep_scale_offset, + if new_encoding.get("shards") is not None: + # new shards are the largest multiple of the new chunks that fits inside the new shape + new_shards = tuple( + (shape // chunk) * chunk + for chunk, shape in zip(new_encoding["chunks"], array.shape, strict=True) ) - # convert float64 arrays to float32 - for data_var in dataset.data_vars: - if dataset[data_var].dtype in (np.dtype("f8")): - dataset[data_var] = dataset[data_var].astype("float32") - else: - # Non-measurement groups: preserve original encoding - encoding = create_original_encoding(dataset) - - ds_out = stream_write_dataset( - dataset, - path=group_path, - group=output_group, - encoding=encoding, - enable_sharding=enable_sharding, - crs=crs, - ) - processed_groups[group_path] = ds_out - # Step 2: Create downsampled resolution groups ONLY for measurements - # Find all resolution-based groups under /measurements/ and organize by base path - resolution_groups: dict[str, xr.Dataset] = {} - base_path = "/measurements/reflectance" - for group_path in processed_groups: - # Only process groups under /measurements/reflectance - if not group_path.startswith(base_path): - continue - - group_name = group_path.split("/")[-1] - if group_name in ["r10m", "r20m", "r60m"]: - resolution_groups[group_name] = processed_groups[group_path] - - scale_levels = tuple(pyramid_levels.values()) - - # iterate over source, dest pairs: (60, 120), (120, 360), ... 
- for source_level, dest_level in pairwise(scale_levels[2:]): - dest_level_name = f"r{dest_level}m" - dest_level_path = f"{base_path}/{dest_level_name}" - - source_ds = resolution_groups[f"r{source_level}m"] - - downsample_factor = dest_level // source_level - log.info("Creating level with resolution", level=dest_level_name, resolution=dest_level) - - # Create downsampled dataset - downsampled_dataset = create_downsampled_resolution_group( - source_ds, factor=downsample_factor - ) - - log.info("Writing level to path", level=dest_level_name, output_path=dest_level_path) + # calculate the number of inner chunks within the shard + num_subchunks = np.prod( + tuple( + shard // chunk + for shard, chunk in zip(new_shards, new_encoding["chunks"], strict=True) + ) + ) + # If we would generate shards with a single chunk, there is no longer value in sharding, and we should + # use regular chunking + if num_subchunks == 1: + new_encoding["shards"] = None + else: + new_encoding["shards"] = new_shards + if "preferred_chunks" in new_encoding: + new_encoding["preferred_chunks"] = { + k: min(array.shape[array.dims.index(k)], v) + for k, v in new_encoding["preferred_chunks"].items() + } + return new_encoding - # Create encoding - encoding = create_measurements_encoding( - downsampled_dataset, - spatial_chunk=spatial_chunk, - enable_sharding=enable_sharding, - keep_scale_offset=keep_scale_offset, - ) - # Write dataset - ds_out = stream_write_dataset( - downsampled_dataset, - path=dest_level_path, - group=output_group, - encoding=encoding, - enable_sharding=enable_sharding, - crs=crs, - ) +def auto_chunks(shape: tuple[int, ...], target_chunk_size: int) -> tuple[int, ...]: + """ + Compute a chunk size from a shape and a target chunk size. This logic is application-specific. + For 0D data, the empty tuple is returned. + For 1D data, the minimum of the length of the data and the target chunk size is returned. + For 2D, 3D, etc, a chunk edge length is computed based on each of the last two dimensions, and + all other dimensions have chunks set to 1. 
+ """ + ndim = len(shape) - # Store results - processed_groups[dest_level_path] = ds_out - resolution_groups[dest_level_name] = ds_out + if ndim == 0: + return () - # Step 3: Add multiscales metadata to parent groups - log.info("Adding multiscales metadata to parent groups") + if ndim == 1: + return (min(target_chunk_size, shape[0]),) - # Get the parent group (it was created when writing the resolution groups) - parent_group = output_group[base_path] + height, width = shape[-2:] - dt_multiscale = add_multiscales_metadata_to_parent( - parent_group, - resolution_groups, - multiscales_flavor={"ogc_tms", "experimental_multiscales_convention"}, + spatial_chunk_aligned = min( + target_chunk_size, + calculate_aligned_chunk_size(width, target_chunk_size), + calculate_aligned_chunk_size(height, target_chunk_size), ) - processed_groups[base_path] = dt_multiscale - - return processed_groups + return ((1,) * (ndim - 2)) + (spatial_chunk_aligned, spatial_chunk_aligned) def create_measurements_encoding( @@ -267,7 +245,8 @@ def create_measurements_encoding( keep_keys = XARRAY_ENCODING_KEYS - {"compressors", "shards", "chunks"} if not keep_scale_offset: - keep_keys = keep_keys - CF_SCALE_OFFSET_KEYS + # Remove scale/offset keys AND dtype to prevent transformation + keep_keys = keep_keys - CF_SCALE_OFFSET_KEYS - {"dtype"} for key in keep_keys: if key in var_data.encoding: @@ -341,11 +320,10 @@ def calculate_simple_shard_dimensions( return tuple(shard_dims) -def add_multiscales_metadata_to_parent( - group: zarr.Group, - res_groups: Mapping[str, xr.Dataset], +def create_multiscales_metadata( + out_group: zarr.Group, multiscales_flavor: set[MultiscalesFlavor] | None = None, -) -> xr.DataTree: +) -> None: """Add GeoZarr-compliant multiscales metadata to parent group.""" # Sort by resolution (finest to coarsest) if multiscales_flavor is None: @@ -358,44 +336,30 @@ def add_multiscales_metadata_to_parent( "r360m": 360, "r720m": 720, } - - all_resolutions = sorted(set(res_groups.keys()), key=lambda x: res_order.get(x, 999)) - - if len(all_resolutions) < 2: - log.info( - "Skipping {} - only one resolution available", - base_path=group.path, - ) - return None - - # Get CRS and bounds from first available dataset (load from output path) - first_res = all_resolutions[0] - first_dataset = res_groups[first_res] + res_groups = tuple(out_group[k] for k in res_order) + first_dataset = xr.open_zarr(out_group.store, group=next(iter(res_groups)).path) # Get CRS and bounds native_crs = first_dataset.rio.crs if hasattr(first_dataset, "rio") else None if native_crs is None: - log.info("No CRS found, skipping multiscales metadata", base_path=group.path) - return None + raise ValueError("Cannot determine native CRS for multiscales metadata") - native_bounds = None try: native_bounds = first_dataset.rio.bounds() except (AttributeError, TypeError): + # Try alternative method or construct from coordinates x_coords = first_dataset.x.values y_coords = first_dataset.y.values native_bounds = (x_coords.min(), y_coords.min(), x_coords.max(), y_coords.max()) # Create overview_levels structure following the multiscales v1.0 specification overview_levels: list[OverviewLevelJSON] = [] - for res_name in all_resolutions: + for res_group in res_groups: + res_name = res_group.basename # Use resolution order for consistent scale calculations res_meters = res_order[res_name] - dataset = res_groups[res_name] - - if dataset is None: - continue + dataset = xr.open_zarr(res_group.store, group=res_group.path) # Get first data variable to extract 
dimensions first_var = next(iter(dataset.data_vars.values())) @@ -457,7 +421,7 @@ def add_multiscales_metadata_to_parent( zoom = max(zoom_for_width, zoom_for_height) # Calculate relative scale and translation vs first resolution - finest_res_meters = res_order[all_resolutions[0]] + finest_res_meters = res_order[res_groups[0].basename] relative_scale = res_meters / finest_res_meters relative_translation = (res_meters - finest_res_meters) / 2 @@ -492,10 +456,6 @@ def add_multiscales_metadata_to_parent( overview_levels.append(layout_entry) - if len(overview_levels) < 2: - log.info(" Could not create overview levels for {}", base_path=group.path) - return None - multiscales: dict[str, Any] = {"multiscales": {}} layout: list[zcm.ScaleLevel] | MISSING = MISSING tile_matrix_set: tms.TileMatrixSet | MISSING = MISSING @@ -543,7 +503,7 @@ def add_multiscales_metadata_to_parent( scale_level_data: dict[str, Any] = {"asset": asset} if i > 0: # Not the first (base) resolution - derived_from = derivation_chain.get(asset, str(all_resolutions[0])) + derived_from = derivation_chain.get(asset, str(res_groups[0].basename)) multiscale_transform = zcm.Transform( scale=(overview_level["scale_relative"],) * 2, translation=(overview_level["translation_relative"],) * 2, @@ -561,6 +521,7 @@ def add_multiscales_metadata_to_parent( scale_level = zcm.ScaleLevel(**scale_level_data) layout.append(scale_level) + # Create convention metadata for all three conventions multiscale_attrs = MultiscaleGroupAttrs( zarr_conventions=( @@ -576,33 +537,28 @@ def add_multiscales_metadata_to_parent( ), ) - # Write multiscale attributes directly to the parent group - attrs_to_write = multiscale_attrs.model_dump() - + # Add multiscale attributes + out_group.attrs.update(multiscale_attrs.model_dump()) # Add spatial and proj attributes at group level following specifications if native_crs and native_bounds: # Add spatial convention attributes - attrs_to_write["spatial:dimensions"] = ["y", "x"] # Required field - attrs_to_write["spatial:bbox"] = list(native_bounds) # [xmin, ymin, xmax, ymax] - attrs_to_write["spatial:registration"] = "pixel" # Default registration type + spatial_attrs: dict[str, object] = { + "spatial:dimensions": ("y", "x"), + "spatial:bbox": tuple(native_bounds), + "spatial:registration": "pixel", + } # Add proj convention attributes if hasattr(native_crs, "to_epsg") and native_crs.to_epsg(): - attrs_to_write["proj:code"] = f"EPSG:{native_crs.to_epsg()}" + spatial_attrs["proj:code"] = f"EPSG:{native_crs.to_epsg()}" elif hasattr(native_crs, "to_wkt"): - attrs_to_write["proj:wkt2"] = native_crs.to_wkt() + spatial_attrs["proj:wkt2"] = native_crs.to_wkt() - # Write attributes directly to the zarr group - group.attrs.update(attrs_to_write) - - log.info("Added %s multiscale levels to %s", len(overview_levels), group.path) - - return None # No DataTree to return since we wrote directly to the group + out_group.attrs.update(spatial_attrs) def create_original_encoding(dataset: xr.Dataset) -> dict[str, XarrayDataArrayEncoding]: """Write a group preserving its original chunking and encoding.""" - from zarr.codecs import BloscCodec # Simple encoding that preserves original structure compressor = BloscCodec(cname="zstd", clevel=3, shuffle="shuffle", blocksize=0) @@ -632,47 +588,38 @@ def create_original_encoding(dataset: xr.Dataset) -> dict[str, XarrayDataArrayEn def create_downsampled_resolution_group(source_dataset: xr.Dataset, factor: int) -> xr.Dataset: """Create a downsampled version of a dataset by given factor.""" - if not 
source_dataset or len(source_dataset.data_vars) == 0: - return xr.Dataset() - - # Get reference dimensions - ref_var = next(iter(source_dataset.data_vars.values())) - if ref_var.ndim < 2: - return xr.Dataset() - - current_height, current_width = ref_var.shape[-2:] - target_height = current_height // factor - target_width = current_width // factor - - if target_height < 1 or target_width < 1: - return xr.Dataset() - - # Downsample all variables using existing lazy operations - lazy_vars = {} - for var_name, var_data in source_dataset.data_vars.items(): - if var_data.ndim < 2: - continue - var_typ = determine_variable_type(var_name, var_data) - if var_typ == "quality_mask": - lazy_downsampled = var_data.coarsen({"x": factor, "y": factor}, boundary="trim").max() - elif var_typ == "reflectance": - lazy_downsampled = var_data.coarsen({"x": factor, "y": factor}, boundary="trim").mean() - elif var_typ == "classification": - lazy_downsampled = var_data.coarsen({"x": factor, "y": factor}, boundary="trim").reduce( - subsample_2 - ) - elif var_typ == "probability": - lazy_downsampled = var_data.coarsen({"x": factor, "y": factor}, boundary="trim").mean() - else: - raise ValueError(f"Unknown variable type {var_typ}") - # preserve encoding - lazy_downsampled.encoding = var_data.encoding - # Ensure that dtype is preserved - lazy_vars[var_name] = lazy_downsampled.astype(var_data.dtype) + # Downsample all variables + lazy_vars: dict[str, xr.DataArray] = {} + for data_var_name, data_var in source_dataset.data_vars.items(): + if data_var.ndim >= 2: + var_typ = determine_variable_type(data_var_name, data_var) + if var_typ == "quality_mask": + lazy_downsampled = data_var.coarsen( + {"x": factor, "y": factor}, boundary="trim" + ).max() + elif var_typ == "reflectance": + lazy_downsampled = data_var.coarsen( + {"x": factor, "y": factor}, boundary="trim" + ).mean() + elif var_typ == "classification": + lazy_downsampled = data_var.coarsen( + {"x": factor, "y": factor}, boundary="trim" + ).reduce(subsample_2) + elif var_typ == "probability": + lazy_downsampled = data_var.coarsen( + {"x": factor, "y": factor}, boundary="trim" + ).mean() + else: + raise ValueError(f"Unknown variable type {var_typ}") - if not lazy_vars: - return xr.Dataset() + lazy_downsampled = lazy_downsampled.astype(data_var.dtype) + lazy_downsampled.encoding = update_encoding(lazy_downsampled, data_var.encoding) + for coord_name, coord in lazy_downsampled.coords.items(): + lazy_downsampled.coords[coord_name].encoding = update_encoding( + coord, data_var.coords[coord_name].encoding + ) + lazy_vars[data_var_name] = lazy_downsampled # Create dataset with lazy variables and coordinates return xr.Dataset(lazy_vars, attrs=source_dataset.attrs) @@ -798,7 +745,12 @@ def stream_write_dataset( dataset_path=path, ) return xr.open_dataset( - group.store, engine="zarr", chunks={}, decode_coords="all", group=path + group.store, + engine="zarr", + chunks={}, + decode_coords="all", + consolidated=False, + group=f"{group.path}/{path}", ) log.info("Streaming computation and write to {}", dataset_path=path) @@ -824,7 +776,6 @@ def stream_write_dataset( group=path, compute=False, # Create job first for progress tracking ) - write_job = write_job.persist() if DISTRIBUTED_AVAILABLE: try: @@ -863,8 +814,9 @@ def stream_write_dataset( def write_geo_metadata( dataset: xr.Dataset, + crs: CRS, + *, grid_mapping_var_name: str = "spatial_ref", - crs: CRS | None = None, ) -> None: """ Write geographic metadata to the dataset. 
@@ -874,81 +826,69 @@ def write_geo_metadata( grid_mapping_var_name: Name for grid mapping variable crs: Coordinate Reference System to use (if None, attempts to detect from dataset) """ - # Use provided CRS or try to detect from dataset - if crs is None: - for var in dataset.data_vars.values(): - if hasattr(var, "rio") and var.rio.crs: - crs = var.rio.crs - break - if "proj:epsg" in var.attrs: - epsg = var.attrs["proj:epsg"] - crs = CRS.from_epsg(epsg) - break - - if crs is not None: - # Write CRS using rioxarray - # NOTE: for now rioxarray only supports writing grid mapping using CF conventions - dataset.rio.write_crs(crs, grid_mapping_name=grid_mapping_var_name, inplace=True) - dataset.rio.write_grid_mapping(grid_mapping_var_name, inplace=True) - dataset.attrs["grid_mapping"] = grid_mapping_var_name - - for var in dataset.data_vars.values(): - var.rio.write_grid_mapping(grid_mapping_var_name, inplace=True) - var.attrs["grid_mapping"] = grid_mapping_var_name - - # Also add proj: and spatial: zarr conventions at dataset level - # TODO : Remove once rioxarray supports writing these conventions directly - # https://github.com/corteva/rioxarray/pull/883 - - # Add spatial convention attributes - dataset.attrs["spatial:dimensions"] = ["y", "x"] # Required field - dataset.attrs["spatial:registration"] = "pixel" # Default registration type - - # Calculate and add spatial bbox if coordinates are available - if "x" in dataset.coords and "y" in dataset.coords: - x_coords = dataset.coords["x"].values - y_coords = dataset.coords["y"].values - x_min, x_max = float(x_coords.min()), float(x_coords.max()) - y_min, y_max = float(y_coords.min()), float(y_coords.max()) - dataset.attrs["spatial:bbox"] = [x_min, y_min, x_max, y_max] - - # Calculate spatial transform (affine transformation) - spatial_transform = None - if hasattr(dataset, "rio") and hasattr(dataset.rio, "transform"): - try: - rio_transform = dataset.rio.transform - if callable(rio_transform): - rio_transform = rio_transform() - spatial_transform = list(rio_transform)[:6] - except (AttributeError, TypeError): - # Fallback: construct from coordinate spacing - pixel_size_x = float(get_grid_spacing(dataset, ("x",))[0]) - pixel_size_y = float(get_grid_spacing(dataset, ("y",))[0]) - spatial_transform = [ - pixel_size_x, - 0.0, - x_min, - 0.0, - -pixel_size_y, - y_max, - ] - - # Only add spatial:transform if we have valid transform data (not all zeros) - if spatial_transform is not None and not all(t == 0 for t in spatial_transform): - dataset.attrs["spatial:transform"] = spatial_transform - - # Add spatial shape if data variables exist - if dataset.data_vars: - first_var = next(iter(dataset.data_vars.values())) - if first_var.ndim >= 2: - height, width = first_var.shape[-2:] - dataset.attrs["spatial:shape"] = [height, width] + # Write CRS using rioxarray + # NOTE: for now rioxarray only supports writing grid mapping using CF conventions + dataset.rio.write_crs(crs, grid_mapping_name=grid_mapping_var_name, inplace=True) + dataset.rio.write_grid_mapping(grid_mapping_var_name, inplace=True) + dataset.attrs["grid_mapping"] = grid_mapping_var_name + + for var in dataset.data_vars.values(): + var.rio.write_grid_mapping(grid_mapping_var_name, inplace=True) + var.attrs["grid_mapping"] = grid_mapping_var_name + + # Also add proj: and spatial: zarr conventions at dataset level + # TODO : Remove once rioxarray supports writing these conventions directly + # https://github.com/corteva/rioxarray/pull/883 + + # Add spatial convention attributes + 
dataset.attrs["spatial:dimensions"] = ["y", "x"] # Required field + dataset.attrs["spatial:registration"] = "pixel" # Default registration type + + # Calculate and add spatial bbox if coordinates are available + if "x" in dataset.coords and "y" in dataset.coords: + x_coords = dataset.coords["x"].values + y_coords = dataset.coords["y"].values + x_min, x_max = float(x_coords.min()), float(x_coords.max()) + y_min, y_max = float(y_coords.min()), float(y_coords.max()) + dataset.attrs["spatial:bbox"] = [x_min, y_min, x_max, y_max] - # Add proj convention attributes - if hasattr(crs, "to_epsg") and crs.to_epsg(): - dataset.attrs["proj:code"] = f"EPSG:{crs.to_epsg()}" - elif hasattr(crs, "to_wkt"): - dataset.attrs["proj:wkt2"] = crs.to_wkt() + # Calculate spatial transform (affine transformation) + spatial_transform = None + if hasattr(dataset, "rio") and hasattr(dataset.rio, "transform"): + try: + rio_transform = dataset.rio.transform + if callable(rio_transform): + rio_transform = rio_transform() + spatial_transform = list(rio_transform)[:6] + except (AttributeError, TypeError): + # Fallback: construct from coordinate spacing + pixel_size_x = float(get_grid_spacing(dataset, ("x",))[0]) + pixel_size_y = float(get_grid_spacing(dataset, ("y",))[0]) + spatial_transform = [ + pixel_size_x, + 0.0, + x_min, + 0.0, + -pixel_size_y, + y_max, + ] + + # Only add spatial:transform if we have valid transform data (not all zeros) + if spatial_transform is not None and not all(t == 0 for t in spatial_transform): + dataset.attrs["spatial:transform"] = spatial_transform + + # Add spatial shape if data variables exist + if dataset.data_vars: + first_var = next(iter(dataset.data_vars.values())) + if first_var.ndim >= 2: + height, width = first_var.shape[-2:] + dataset.attrs["spatial:shape"] = [height, width] + + # Add proj convention attributes + if hasattr(crs, "to_epsg") and crs.to_epsg(): + dataset.attrs["proj:code"] = f"EPSG:{crs.to_epsg()}" + elif hasattr(crs, "to_wkt"): + dataset.attrs["proj:wkt2"] = crs.to_wkt() def rechunk_dataset_for_encoding( diff --git a/src/eopf_geozarr/zarrio.py b/src/eopf_geozarr/zarrio.py new file mode 100644 index 0000000..365f049 --- /dev/null +++ b/src/eopf_geozarr/zarrio.py @@ -0,0 +1,256 @@ +""" +This module contains zarr-specific IO routines +""" + +from __future__ import annotations + +from dataclasses import replace +from typing import TYPE_CHECKING, Any, NotRequired, Protocol, TypedDict + +import numcodecs +import structlog +import zarr +from zarr.core.group import GroupMetadata +from zarr.core.metadata.v3 import ArrayV3Metadata +from zarr.core.sync import sync +from zarr.dtype import Float32, ZDType +from zarr.storage._common import make_store_path + +from eopf_geozarr.data_api.geozarr.common import extract_scale_offset + +if TYPE_CHECKING: + from collections.abc import Mapping + + from zarr.core.metadata.v2 import ArrayV2Metadata + + +class ChunkEncodingSpec(TypedDict): + write_chunks: tuple[int, ...] + read_chunks: NotRequired[tuple[int, ...]] + + +def convert_compression( + compressor: numcodecs.abc.Codec | dict[str, object] | None, *, compression_level: int | None = None +) -> tuple[dict[str, Any], ...]: + """ + Convert the compression parameters for a Zarr V2 array into a tuple of Zarr V3 codecs. + + Parameters + ---------- + compressor : numcodecs.abc.Codec | dict[str, object] | None + The compressor of the Zarr V2 array, or None if the source array is uncompressed. + compression_level: int | None = None + The new compression level to use. + Only applies to the specific case of blosc -> blosc compression. 
+ """ + + if compressor is None: + return () + + SHUFFLE = ("noshuffle", "shuffle", "bitshuffle") + if isinstance(compressor, numcodecs.abc.Codec): + old_config = compressor.get_config() + else: + old_config = compressor + if old_config["id"] == "blosc": + new_level = old_config["clevel"] if compression_level is None else compression_level + new_codec = { + "name": "blosc", + "configuration": { + "cname": old_config["cname"], + "clevel": new_level, + "blocksize": old_config["blocksize"], + "shuffle": SHUFFLE[old_config["shuffle"]], + }, + } + return (new_codec,) + + raise ValueError(f"Only blosc -> blosc or None -> () is supported. Got {compressor=}") + + +class ArrayReencoderConfig(TypedDict): + keep_scale_offset: NotRequired[bool] + spatial_chunk: NotRequired[int] + enable_sharding: NotRequired[bool] + compression_level: NotRequired[int] + + +def default_array_reencoder( + key: str, + metadata: ArrayV2Metadata, +) -> ArrayV3Metadata: + """ + Re-encode a zarr array into a new zarr v3 array. + """ + keep_scale_offset = False + new_codecs = convert_compression(metadata.compressor) + if metadata.fill_value is None: + fill_value = metadata.dtype.default_scalar() + else: + fill_value = metadata.fill_value + attrs, maybe_scale_offset = extract_scale_offset(metadata.attributes.copy()) + out_dtype: ZDType[Any, Any] + # If the array used CF scale/offset encoding, we change the output dtype to Float32 + out_dtype = metadata.dtype + if not keep_scale_offset and len(maybe_scale_offset) > 0: + out_dtype = Float32() + + dimension_names = attrs.pop("_ARRAY_DIMENSIONS", None) + chunk_grid_shape = metadata.chunks + codecs = ({"name": "bytes"}, *new_codecs) + + return ArrayV3Metadata( + shape=metadata.shape, + data_type=out_dtype, + chunk_key_encoding={"name": "default", "configuration": {"separator": "/"}}, + chunk_grid={ + "name": "regular", + "configuration": {"chunk_shape": chunk_grid_shape}, + }, + fill_value=fill_value, + dimension_names=dimension_names, + codecs=codecs, + attributes=attrs, + ) + + +def replace_json_invalid_floats(obj: object) -> object: + """ + Recursively replace NaN and Infinity float values in a JSON-like object + with their string representations, to make them JSON-compliant. + + Parameters + ---------- + obj : object + The JSON-like object to process + + Returns + ------- + object + The processed object with NaN and Infinity replaced + """ + if isinstance(obj, float): + if obj != obj: # NaN check + return "NaN" + if obj == float("inf"): + return "Infinity" + if obj == float("-inf"): + return "-Infinity" + return obj + if isinstance(obj, dict): + return {k: replace_json_invalid_floats(v) for k, v in obj.items()} + if isinstance(obj, list | tuple): + return [replace_json_invalid_floats(item) for item in obj] + return obj + + +class ArrayReencoder(Protocol): + @staticmethod + def __call__( + key: str, + metadata: ArrayV2Metadata, + ) -> ArrayV3Metadata: ... + + +def reencode_group( + group: zarr.Group, + store: zarr.storage.StoreLike, + path: str, + *, + overwrite: bool = False, + use_consolidated_for_children: bool = False, + omit_nodes: set[str] | None = None, + array_reencoder: ArrayReencoder = default_array_reencoder, + allow_json_nan: bool = False, +) -> zarr.Group: + """ + Re-encode a Zarr group, applying a re-encoding to all sub-groups and sub-arrays. 
+ + Parameters + ---------- + group : zarr.Group + The Zarr group to re-encode + store : zarr.storage.StoreLike + The store to write into + path : str + The path in the new store to use + overwrite : bool, default = False + Whether to overwrite contents of the new store + use_consolidated_for_children : bool, default = False + Whether to use consolidated metadata when listing the members of the source group. + omit_nodes : set[str] | None, default = None + The names of groups or arrays to omit from re-encoding. None is treated as an empty set. + array_reencoder : ArrayReencoder, default = default_array_reencoder + A function that takes an array's key and its Zarr v2 array metadata and returns the new + Zarr v3 array metadata. Use this parameter to define per-array re-encoding logic. + allow_json_nan : bool, default = False + Whether to allow NaN and Infinity values in JSON metadata. If False, float("NaN") will be + encoded as the string "NaN", float("Inf") as "Infinity", and float("-Inf") as "-Infinity". + """ + if omit_nodes is None: + omit_nodes = set() + + log = structlog.get_logger() + + # Convert store-like to a proper Store object + store_path = sync(make_store_path(store)) + store = store_path.store + + members = dict( + group.members(max_depth=None, use_consolidated_for_children=use_consolidated_for_children) + ) + + log.info("Begin re-encoding Zarr group %s", group) + new_members: dict[str, ArrayV3Metadata | GroupMetadata] = { + path: GroupMetadata(zarr_format=3, attributes=group.attrs.asdict()) + } + chunks_to_encode: list[str] = [] + for name in omit_nodes: + if not any(k.startswith(name) for k in members): + log.warning( + "The name %s was provided in omit_nodes but no such array or group exists.", name + ) + for name, member in members.items(): + if any(name.startswith(v) for v in omit_nodes): + log.info( + "Skipping node %s because it matches an entry in the omit_nodes parameter", + name, + ) + continue + + log.info("Re-encoding member %s", name) + new_path = f"{path}/{name}" + source_meta = member.metadata + + if not allow_json_nan: + log.info("Replacing any nan and inf float values with string equivalents") + # replace any invalid floats with string encodings using the + # zarr v3 fill value encoding for floats + new_attrs = replace_json_invalid_floats(source_meta.attributes) + source_meta = replace(source_meta, attributes=new_attrs) + + if isinstance(member, zarr.Array): + new_meta = array_reencoder(member.path, source_meta) + new_members[new_path] = new_meta + chunks_to_encode.append(name) + else: + new_members[new_path] = GroupMetadata( + zarr_format=3, + attributes=member.attrs.asdict(), + ) + log.info("Creating new Zarr hierarchy structure at %s", f"{store}/{path}") + tree = dict(zarr.create_hierarchy(store=store, nodes=new_members, overwrite=overwrite)) + new_group: zarr.Group = tree[path] + for name in chunks_to_encode: + log.info("Re-encoding chunks for array %s", name) + old_array = group[name] + new_array = new_group[name] + + new_array[...] = old_array[...]
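# Note: the loop above copies each array in one shot; old_array[...] materializes the full source array in memory before new_array[...] writes it out, so very large arrays may warrant a chunk-wise copy instead.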
+ + # return the root group + return tree[path] + + +class ArrayEncoding(TypedDict): + dimension_names: NotRequired[None | tuple[str | None, ...]] + attributes: NotRequired[Mapping[str, object] | None] diff --git a/tests/__init__.py b/tests/__init__.py index 4057dff..19d152c 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -165,7 +165,7 @@ def _verify_multiscale_structure(output_path: pathlib.Path, group: str) -> None: assert "y" in ds.dims, f"Missing 'y' dimension in {level_path}" # Store shape for progression verification - level_shapes[level_num] = (ds.dims["y"], ds.dims["x"]) + level_shapes[level_num] = (ds.sizes["y"], ds.sizes["x"]) print(f" Level {level_num}: {level_shapes[level_num]} pixels") ds.close() diff --git a/tests/_test_data/optimized_geozarr_examples/S2A_MSIL2A_20251008T100041_N0511_R122_T32TQM_20251008T122613.json b/tests/_test_data/optimized_geozarr_examples/S2A_MSIL2A_20251008T100041_N0511_R122_T32TQM_20251008T122613.json index f85b343..49bef77 100644 --- a/tests/_test_data/optimized_geozarr_examples/S2A_MSIL2A_20251008T100041_N0511_R122_T32TQM_20251008T122613.json +++ b/tests/_test_data/optimized_geozarr_examples/S2A_MSIL2A_20251008T100041_N0511_R122_T32TQM_20251008T122613.json @@ -29,6 +29,16 @@ "endian": "little" }, "name": "bytes" + }, + { + "configuration": { + "blocksize": 0, + "clevel": 1, + "cname": "zstd", + "shuffle": "bitshuffle", + "typesize": 28 + }, + "name": "blosc" } ], "data_type": { @@ -70,6 +80,16 @@ "endian": "little" }, "name": "bytes" + }, + { + "configuration": { + "blocksize": 0, + "clevel": 1, + "cname": "zstd", + "shuffle": "bitshuffle", + "typesize": 12 + }, + "name": "blosc" } ], "data_type": { @@ -111,6 +131,16 @@ "endian": "little" }, "name": "bytes" + }, + { + "configuration": { + "blocksize": 0, + "clevel": 1, + "cname": "zstd", + "shuffle": "bitshuffle", + "typesize": 8 + }, + "name": "blosc" } ], "data_type": "int64", @@ -127,7 +157,6 @@ }, "mean_sun_angles": { "attributes": { - "_FillValue": "AAAAAAAA+H8=", "_eopf_attrs": { "coordinates": [ "angle" @@ -136,7 +165,7 @@ "angle" ], "eopf_is_masked": true, - "fill_value": NaN + "fill_value": "NaN" }, "unit": "deg" }, @@ -164,9 +193,9 @@ { "configuration": { "blocksize": 0, - "clevel": 3, + "clevel": 1, "cname": "zstd", - "shuffle": "shuffle", + "shuffle": "bitshuffle", "typesize": 8 }, "name": "blosc" @@ -176,7 +205,7 @@ "dimension_names": [ "angle" ], - "fill_value": 0.0, + "fill_value": "NaN", "node_type": "array", "shape": [ 2 @@ -186,7 +215,6 @@ }, "mean_viewing_incidence_angles": { "attributes": { - "_FillValue": "AAAAAAAA+H8=", "_eopf_attrs": { "coordinates": [ "angle", @@ -197,7 +225,7 @@ "angle" ], "eopf_is_masked": true, - "fill_value": NaN + "fill_value": "NaN" }, "unit": "deg" }, @@ -226,9 +254,9 @@ { "configuration": { "blocksize": 0, - "clevel": 3, + "clevel": 1, "cname": "zstd", - "shuffle": "shuffle", + "shuffle": "bitshuffle", "typesize": 8 }, "name": "blosc" @@ -239,7 +267,7 @@ "band", "angle" ], - "fill_value": 0.0, + "fill_value": "NaN", "node_type": "array", "shape": [ 13, @@ -250,7 +278,6 @@ }, "sun_angles": { "attributes": { - "_FillValue": "AAAAAAAA+H8=", "_eopf_attrs": { "coordinates": [ "angle", @@ -263,7 +290,7 @@ "x" ], "eopf_is_masked": true, - "fill_value": NaN + "fill_value": "NaN" } }, "chunk_grid": { @@ -292,9 +319,9 @@ { "configuration": { "blocksize": 0, - "clevel": 3, + "clevel": 1, "cname": "zstd", - "shuffle": "shuffle", + "shuffle": "bitshuffle", "typesize": 8 }, "name": "blosc" @@ -306,7 +333,7 @@ "y", "x" ], - "fill_value": 0.0, + 
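Illustrative usage of the zarrio helpers introduced above (a minimal sketch; the store paths, the target path name, and the omitted node are assumptions chosen for the example):

import zarr

from eopf_geozarr.zarrio import convert_compression, reencode_group, replace_json_invalid_floats

# Open an existing Zarr v2 hierarchy (illustrative path).
source = zarr.open_group("S2_example_source.zarr", mode="r")

# Re-encode it as a Zarr v3 hierarchy in a new store, skipping one subtree and
# keeping NaN/Infinity out of the JSON metadata.
reencoded = reencode_group(
    source,
    store="S2_example_reencoded.zarr",
    path="reencoded",
    overwrite=True,
    omit_nodes={"quality/l2a_quicklook"},
    allow_json_nan=False,
)

# Non-finite floats in attributes become the strings used by the Zarr v3
# fill-value encoding.
assert replace_json_invalid_floats({"fill_value": float("nan")}) == {"fill_value": "NaN"}

# A Zarr v2 blosc compressor config maps onto a single v3 blosc codec; the
# clevel is overridden here to 1.
codecs = convert_compression(
    {"id": "blosc", "cname": "zstd", "clevel": 3, "shuffle": 2, "blocksize": 0},
    compression_level=1,
)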
"fill_value": "NaN", "node_type": "array", "shape": [ 2, @@ -318,7 +345,6 @@ }, "viewing_incidence_angles": { "attributes": { - "_FillValue": "AAAAAAAA+H8=", "_eopf_attrs": { "coordinates": [ "angle", @@ -335,7 +361,7 @@ "x" ], "eopf_is_masked": true, - "fill_value": NaN + "fill_value": "NaN" } }, "chunk_grid": { @@ -366,9 +392,9 @@ { "configuration": { "blocksize": 0, - "clevel": 3, + "clevel": 1, "cname": "zstd", - "shuffle": "shuffle", + "shuffle": "bitshuffle", "typesize": 8 }, "name": "blosc" @@ -382,7 +408,7 @@ "y", "x" ], - "fill_value": 0.0, + "fill_value": "NaN", "node_type": "array", "shape": [ 13, @@ -416,6 +442,16 @@ "endian": "little" }, "name": "bytes" + }, + { + "configuration": { + "blocksize": 0, + "clevel": 1, + "cname": "zstd", + "shuffle": "bitshuffle", + "typesize": 8 + }, + "name": "blosc" } ], "data_type": "int64", @@ -452,6 +488,16 @@ "endian": "little" }, "name": "bytes" + }, + { + "configuration": { + "blocksize": 0, + "clevel": 1, + "cname": "zstd", + "shuffle": "bitshuffle", + "typesize": 8 + }, + "name": "blosc" } ], "data_type": "int64", @@ -541,9 +587,9 @@ { "configuration": { "blocksize": 0, - "clevel": 3, + "clevel": 1, "cname": "zstd", - "shuffle": "shuffle", + "shuffle": "bitshuffle", "typesize": 1 }, "name": "blosc" @@ -625,9 +671,9 @@ { "configuration": { "blocksize": 0, - "clevel": 3, + "clevel": 1, "cname": "zstd", - "shuffle": "shuffle", + "shuffle": "bitshuffle", "typesize": 1 }, "name": "blosc" @@ -709,9 +755,9 @@ { "configuration": { "blocksize": 0, - "clevel": 3, + "clevel": 1, "cname": "zstd", - "shuffle": "shuffle", + "shuffle": "bitshuffle", "typesize": 1 }, "name": "blosc" @@ -793,9 +839,9 @@ { "configuration": { "blocksize": 0, - "clevel": 3, + "clevel": 1, "cname": "zstd", - "shuffle": "shuffle", + "shuffle": "bitshuffle", "typesize": 1 }, "name": "blosc" @@ -837,6 +883,16 @@ "endian": "little" }, "name": "bytes" + }, + { + "configuration": { + "blocksize": 0, + "clevel": 1, + "cname": "zstd", + "shuffle": "bitshuffle", + "typesize": 8 + }, + "name": "blosc" } ], "data_type": "int64", @@ -873,6 +929,16 @@ "endian": "little" }, "name": "bytes" + }, + { + "configuration": { + "blocksize": 0, + "clevel": 1, + "cname": "zstd", + "shuffle": "bitshuffle", + "typesize": 8 + }, + "name": "blosc" } ], "data_type": "int64", @@ -956,9 +1022,9 @@ { "configuration": { "blocksize": 0, - "clevel": 3, + "clevel": 1, "cname": "zstd", - "shuffle": "shuffle", + "shuffle": "bitshuffle", "typesize": 1 }, "name": "blosc" @@ -1040,9 +1106,9 @@ { "configuration": { "blocksize": 0, - "clevel": 3, + "clevel": 1, "cname": "zstd", - "shuffle": "shuffle", + "shuffle": "bitshuffle", "typesize": 1 }, "name": "blosc" @@ -1124,9 +1190,9 @@ { "configuration": { "blocksize": 0, - "clevel": 3, + "clevel": 1, "cname": "zstd", - "shuffle": "shuffle", + "shuffle": "bitshuffle", "typesize": 1 }, "name": "blosc" @@ -1208,9 +1274,9 @@ { "configuration": { "blocksize": 0, - "clevel": 3, + "clevel": 1, "cname": "zstd", - "shuffle": "shuffle", + "shuffle": "bitshuffle", "typesize": 1 }, "name": "blosc" @@ -1292,9 +1358,9 @@ { "configuration": { "blocksize": 0, - "clevel": 3, + "clevel": 1, "cname": "zstd", - "shuffle": "shuffle", + "shuffle": "bitshuffle", "typesize": 1 }, "name": "blosc" @@ -1376,9 +1442,9 @@ { "configuration": { "blocksize": 0, - "clevel": 3, + "clevel": 1, "cname": "zstd", - "shuffle": "shuffle", + "shuffle": "bitshuffle", "typesize": 1 }, "name": "blosc" @@ -1420,6 +1486,16 @@ "endian": "little" }, "name": "bytes" + }, + { + "configuration": { + "blocksize": 0, + 
"clevel": 1, + "cname": "zstd", + "shuffle": "bitshuffle", + "typesize": 8 + }, + "name": "blosc" } ], "data_type": "int64", @@ -1456,6 +1532,16 @@ "endian": "little" }, "name": "bytes" + }, + { + "configuration": { + "blocksize": 0, + "clevel": 1, + "cname": "zstd", + "shuffle": "bitshuffle", + "typesize": 8 + }, + "name": "blosc" } ], "data_type": "int64", @@ -1539,9 +1625,9 @@ { "configuration": { "blocksize": 0, - "clevel": 3, + "clevel": 1, "cname": "zstd", - "shuffle": "shuffle", + "shuffle": "bitshuffle", "typesize": 1 }, "name": "blosc" @@ -1623,9 +1709,9 @@ { "configuration": { "blocksize": 0, - "clevel": 3, + "clevel": 1, "cname": "zstd", - "shuffle": "shuffle", + "shuffle": "bitshuffle", "typesize": 1 }, "name": "blosc" @@ -1707,9 +1793,9 @@ { "configuration": { "blocksize": 0, - "clevel": 3, + "clevel": 1, "cname": "zstd", - "shuffle": "shuffle", + "shuffle": "bitshuffle", "typesize": 1 }, "name": "blosc" @@ -1751,6 +1837,16 @@ "endian": "little" }, "name": "bytes" + }, + { + "configuration": { + "blocksize": 0, + "clevel": 1, + "cname": "zstd", + "shuffle": "bitshuffle", + "typesize": 8 + }, + "name": "blosc" } ], "data_type": "int64", @@ -1787,6 +1883,16 @@ "endian": "little" }, "name": "bytes" + }, + { + "configuration": { + "blocksize": 0, + "clevel": 1, + "cname": "zstd", + "shuffle": "bitshuffle", + "typesize": 8 + }, + "name": "blosc" } ], "data_type": "int64", @@ -1897,9 +2003,9 @@ { "configuration": { "blocksize": 0, - "clevel": 3, + "clevel": 1, "cname": "zstd", - "shuffle": "shuffle", + "shuffle": "bitshuffle", "typesize": 1 }, "name": "blosc" @@ -1941,6 +2047,16 @@ "endian": "little" }, "name": "bytes" + }, + { + "configuration": { + "blocksize": 0, + "clevel": 1, + "cname": "zstd", + "shuffle": "bitshuffle", + "typesize": 8 + }, + "name": "blosc" } ], "data_type": "int64", @@ -1977,6 +2093,16 @@ "endian": "little" }, "name": "bytes" + }, + { + "configuration": { + "blocksize": 0, + "clevel": 1, + "cname": "zstd", + "shuffle": "bitshuffle", + "typesize": 8 + }, + "name": "blosc" } ], "data_type": "int64", @@ -2063,9 +2189,9 @@ { "configuration": { "blocksize": 0, - "clevel": 3, + "clevel": 1, "cname": "zstd", - "shuffle": "shuffle", + "shuffle": "bitshuffle", "typesize": 1 }, "name": "blosc" @@ -2107,6 +2233,16 @@ "endian": "little" }, "name": "bytes" + }, + { + "configuration": { + "blocksize": 0, + "clevel": 1, + "cname": "zstd", + "shuffle": "bitshuffle", + "typesize": 8 + }, + "name": "blosc" } ], "data_type": "int64", @@ -2143,6 +2279,16 @@ "endian": "little" }, "name": "bytes" + }, + { + "configuration": { + "blocksize": 0, + "clevel": 1, + "cname": "zstd", + "shuffle": "bitshuffle", + "typesize": 8 + }, + "name": "blosc" } ], "data_type": "int64", @@ -2222,9 +2368,9 @@ { "configuration": { "blocksize": 0, - "clevel": 3, + "clevel": 1, "cname": "zstd", - "shuffle": "shuffle", + "shuffle": "bitshuffle", "typesize": 1 }, "name": "blosc" @@ -2266,6 +2412,16 @@ "endian": "little" }, "name": "bytes" + }, + { + "configuration": { + "blocksize": 0, + "clevel": 1, + "cname": "zstd", + "shuffle": "bitshuffle", + "typesize": 8 + }, + "name": "blosc" } ], "data_type": "int64", @@ -2302,6 +2458,16 @@ "endian": "little" }, "name": "bytes" + }, + { + "configuration": { + "blocksize": 0, + "clevel": 1, + "cname": "zstd", + "shuffle": "bitshuffle", + "typesize": 8 + }, + "name": "blosc" } ], "data_type": "int64", @@ -2371,7 +2537,6 @@ "GRIB_totalNumber": 0, "GRIB_typeOfLevel": "surface", "GRIB_units": "~", - "_FillValue": "AAAAAAAA+H8=", "_eopf_attrs": { "coordinates": [ 
"latitude", @@ -2387,7 +2552,7 @@ "longitude" ], "eopf_is_masked": true, - "fill_value": NaN, + "fill_value": "NaN", "long_name": "Total Aerosol Optical Depth at 1240nm", "standard_name": "unknown", "units": "~" @@ -2422,9 +2587,9 @@ { "configuration": { "blocksize": 0, - "clevel": 3, + "clevel": 1, "cname": "zstd", - "shuffle": "shuffle", + "shuffle": "bitshuffle", "typesize": 4 }, "name": "blosc" @@ -2435,7 +2600,7 @@ "latitude", "longitude" ], - "fill_value": 0.0, + "fill_value": "NaN", "node_type": "array", "shape": [ 9, @@ -2473,7 +2638,6 @@ "GRIB_totalNumber": 0, "GRIB_typeOfLevel": "surface", "GRIB_units": "~", - "_FillValue": "AAAAAAAA+H8=", "_eopf_attrs": { "coordinates": [ "latitude", @@ -2489,7 +2653,7 @@ "longitude" ], "eopf_is_masked": true, - "fill_value": NaN, + "fill_value": "NaN", "long_name": "Total Aerosol Optical Depth at 469nm", "standard_name": "unknown", "units": "~" @@ -2524,9 +2688,9 @@ { "configuration": { "blocksize": 0, - "clevel": 3, + "clevel": 1, "cname": "zstd", - "shuffle": "shuffle", + "shuffle": "bitshuffle", "typesize": 4 }, "name": "blosc" @@ -2537,7 +2701,7 @@ "latitude", "longitude" ], - "fill_value": 0.0, + "fill_value": "NaN", "node_type": "array", "shape": [ 9, @@ -2575,7 +2739,6 @@ "GRIB_totalNumber": 0, "GRIB_typeOfLevel": "surface", "GRIB_units": "~", - "_FillValue": "AAAAAAAA+H8=", "_eopf_attrs": { "coordinates": [ "latitude", @@ -2591,7 +2754,7 @@ "longitude" ], "eopf_is_masked": true, - "fill_value": NaN, + "fill_value": "NaN", "long_name": "Total Aerosol Optical Depth at 550nm", "standard_name": "unknown", "units": "~" @@ -2626,9 +2789,9 @@ { "configuration": { "blocksize": 0, - "clevel": 3, + "clevel": 1, "cname": "zstd", - "shuffle": "shuffle", + "shuffle": "bitshuffle", "typesize": 4 }, "name": "blosc" @@ -2639,7 +2802,7 @@ "latitude", "longitude" ], - "fill_value": 0.0, + "fill_value": "NaN", "node_type": "array", "shape": [ 9, @@ -2677,7 +2840,6 @@ "GRIB_totalNumber": 0, "GRIB_typeOfLevel": "surface", "GRIB_units": "~", - "_FillValue": "AAAAAAAA+H8=", "_eopf_attrs": { "coordinates": [ "latitude", @@ -2693,7 +2855,7 @@ "longitude" ], "eopf_is_masked": true, - "fill_value": NaN, + "fill_value": "NaN", "long_name": "Total Aerosol Optical Depth at 670nm", "standard_name": "unknown", "units": "~" @@ -2728,9 +2890,9 @@ { "configuration": { "blocksize": 0, - "clevel": 3, + "clevel": 1, "cname": "zstd", - "shuffle": "shuffle", + "shuffle": "bitshuffle", "typesize": 4 }, "name": "blosc" @@ -2741,7 +2903,7 @@ "latitude", "longitude" ], - "fill_value": 0.0, + "fill_value": "NaN", "node_type": "array", "shape": [ 9, @@ -2779,7 +2941,6 @@ "GRIB_totalNumber": 0, "GRIB_typeOfLevel": "surface", "GRIB_units": "~", - "_FillValue": "AAAAAAAA+H8=", "_eopf_attrs": { "coordinates": [ "latitude", @@ -2795,7 +2956,7 @@ "longitude" ], "eopf_is_masked": true, - "fill_value": NaN, + "fill_value": "NaN", "long_name": "Total Aerosol Optical Depth at 865nm", "standard_name": "unknown", "units": "~" @@ -2830,9 +2991,9 @@ { "configuration": { "blocksize": 0, - "clevel": 3, + "clevel": 1, "cname": "zstd", - "shuffle": "shuffle", + "shuffle": "bitshuffle", "typesize": 4 }, "name": "blosc" @@ -2843,7 +3004,7 @@ "latitude", "longitude" ], - "fill_value": 0.0, + "fill_value": "NaN", "node_type": "array", "shape": [ 9, @@ -2881,7 +3042,6 @@ "GRIB_totalNumber": 0, "GRIB_typeOfLevel": "surface", "GRIB_units": "~", - "_FillValue": "AAAAAAAA+H8=", "_eopf_attrs": { "coordinates": [ "latitude", @@ -2897,7 +3057,7 @@ "longitude" ], "eopf_is_masked": true, - "fill_value": NaN, + 
"fill_value": "NaN", "long_name": "Black Carbon Aerosol Optical Depth at 550nm", "standard_name": "unknown", "units": "~" @@ -2932,9 +3092,9 @@ { "configuration": { "blocksize": 0, - "clevel": 3, + "clevel": 1, "cname": "zstd", - "shuffle": "shuffle", + "shuffle": "bitshuffle", "typesize": 4 }, "name": "blosc" @@ -2945,7 +3105,7 @@ "latitude", "longitude" ], - "fill_value": 0.0, + "fill_value": "NaN", "node_type": "array", "shape": [ 9, @@ -2983,7 +3143,6 @@ "GRIB_totalNumber": 0, "GRIB_typeOfLevel": "isobaricInhPa", "GRIB_units": "~", - "_FillValue": "AAAAAAAA+H8=", "_eopf_attrs": { "coordinates": [ "latitude", @@ -2999,7 +3158,7 @@ "longitude" ], "eopf_is_masked": true, - "fill_value": NaN, + "fill_value": "NaN", "long_name": "Dust Aerosol Optical Depth at 550nm", "standard_name": "unknown", "units": "~" @@ -3034,9 +3193,9 @@ { "configuration": { "blocksize": 0, - "clevel": 3, + "clevel": 1, "cname": "zstd", - "shuffle": "shuffle", + "shuffle": "bitshuffle", "typesize": 4 }, "name": "blosc" @@ -3047,7 +3206,7 @@ "latitude", "longitude" ], - "fill_value": 0.0, + "fill_value": "NaN", "node_type": "array", "shape": [ 9, @@ -3058,7 +3217,6 @@ }, "isobaricInhPa": { "attributes": { - "_FillValue": "AAAAAAAA+H8=", "long_name": "pressure", "positive": "down", "standard_name": "air_pressure", @@ -3086,8 +3244,8 @@ } ], "data_type": "float64", - "dimension_names": null, - "fill_value": 0.0, + "dimension_names": [], + "fill_value": "NaN", "node_type": "array", "shape": [], "storage_transformers": [], @@ -3095,7 +3253,6 @@ }, "latitude": { "attributes": { - "_FillValue": "AAAAAAAA+H8=", "long_name": "latitude", "standard_name": "latitude", "stored_direction": "decreasing", @@ -3121,13 +3278,23 @@ "endian": "little" }, "name": "bytes" + }, + { + "configuration": { + "blocksize": 0, + "clevel": 1, + "cname": "zstd", + "shuffle": "bitshuffle", + "typesize": 8 + }, + "name": "blosc" } ], "data_type": "float64", "dimension_names": [ "latitude" ], - "fill_value": 0.0, + "fill_value": "NaN", "node_type": "array", "shape": [ 9 @@ -3137,7 +3304,6 @@ }, "longitude": { "attributes": { - "_FillValue": "AAAAAAAA+H8=", "long_name": "longitude", "standard_name": "longitude", "units": "degrees_east" @@ -3162,13 +3328,23 @@ "endian": "little" }, "name": "bytes" + }, + { + "configuration": { + "blocksize": 0, + "clevel": 1, + "cname": "zstd", + "shuffle": "bitshuffle", + "typesize": 8 + }, + "name": "blosc" } ], "data_type": "float64", "dimension_names": [ "longitude" ], - "fill_value": 0.0, + "fill_value": "NaN", "node_type": "array", "shape": [ 9 @@ -3203,7 +3379,7 @@ } ], "data_type": "int64", - "dimension_names": null, + "dimension_names": [], "fill_value": 0, "node_type": "array", "shape": [], @@ -3239,7 +3415,6 @@ "GRIB_totalNumber": 0, "GRIB_typeOfLevel": "surface", "GRIB_units": "~", - "_FillValue": "AAAAAAAA+H8=", "_eopf_attrs": { "coordinates": [ "latitude", @@ -3255,7 +3430,7 @@ "longitude" ], "eopf_is_masked": true, - "fill_value": NaN, + "fill_value": "NaN", "long_name": "Organic Matter Aerosol Optical Depth at 550nm", "standard_name": "unknown", "units": "~" @@ -3290,9 +3465,9 @@ { "configuration": { "blocksize": 0, - "clevel": 3, + "clevel": 1, "cname": "zstd", - "shuffle": "shuffle", + "shuffle": "bitshuffle", "typesize": 4 }, "name": "blosc" @@ -3303,7 +3478,7 @@ "latitude", "longitude" ], - "fill_value": 0.0, + "fill_value": "NaN", "node_type": "array", "shape": [ 9, @@ -3341,7 +3516,6 @@ "GRIB_totalNumber": 0, "GRIB_typeOfLevel": "surface", "GRIB_units": "~", - "_FillValue": "AAAAAAAA+H8=", 
"_eopf_attrs": { "coordinates": [ "latitude", @@ -3357,7 +3531,7 @@ "longitude" ], "eopf_is_masked": true, - "fill_value": NaN, + "fill_value": "NaN", "long_name": "Sea Salt Aerosol Optical Depth at 550nm", "standard_name": "unknown", "units": "~" @@ -3392,9 +3566,9 @@ { "configuration": { "blocksize": 0, - "clevel": 3, + "clevel": 1, "cname": "zstd", - "shuffle": "shuffle", + "shuffle": "bitshuffle", "typesize": 4 }, "name": "blosc" @@ -3405,7 +3579,7 @@ "latitude", "longitude" ], - "fill_value": 0.0, + "fill_value": "NaN", "node_type": "array", "shape": [ 9, @@ -3419,7 +3593,7 @@ "dtype": "timedelta64[ns]", "long_name": "time since forecast_reference_time", "standard_name": "forecast_period", - "units": "days" + "units": "minutes" }, "chunk_grid": { "configuration": { @@ -3442,7 +3616,7 @@ } ], "data_type": "int64", - "dimension_names": null, + "dimension_names": [], "fill_value": 0, "node_type": "array", "shape": [], @@ -3478,7 +3652,6 @@ "GRIB_totalNumber": 0, "GRIB_typeOfLevel": "surface", "GRIB_units": "~", - "_FillValue": "AAAAAAAA+H8=", "_eopf_attrs": { "coordinates": [ "latitude", @@ -3494,7 +3667,7 @@ "longitude" ], "eopf_is_masked": true, - "fill_value": NaN, + "fill_value": "NaN", "long_name": "Sulphate Aerosol Optical Depth at 550nm", "standard_name": "unknown", "units": "~" @@ -3529,9 +3702,9 @@ { "configuration": { "blocksize": 0, - "clevel": 3, + "clevel": 1, "cname": "zstd", - "shuffle": "shuffle", + "shuffle": "bitshuffle", "typesize": 4 }, "name": "blosc" @@ -3542,7 +3715,7 @@ "latitude", "longitude" ], - "fill_value": 0.0, + "fill_value": "NaN", "node_type": "array", "shape": [ 9, @@ -3553,7 +3726,6 @@ }, "surface": { "attributes": { - "_FillValue": "AAAAAAAA+H8=", "long_name": "original GRIB coordinate for key: level(surface)", "units": "1" }, @@ -3578,8 +3750,8 @@ } ], "data_type": "float64", - "dimension_names": null, - "fill_value": 0.0, + "dimension_names": [], + "fill_value": "NaN", "node_type": "array", "shape": [], "storage_transformers": [], @@ -3613,7 +3785,7 @@ } ], "data_type": "int64", - "dimension_names": null, + "dimension_names": [], "fill_value": 0, "node_type": "array", "shape": [], @@ -3648,7 +3820,7 @@ } ], "data_type": "int64", - "dimension_names": null, + "dimension_names": [], "fill_value": 0, "node_type": "array", "shape": [], @@ -3684,7 +3856,6 @@ "GRIB_totalNumber": 0, "GRIB_typeOfLevel": "surface", "GRIB_units": "m**2 s**-2", - "_FillValue": "AAAAAAAA+H8=", "_eopf_attrs": { "coordinates": [ "latitude", @@ -3700,7 +3871,7 @@ "longitude" ], "eopf_is_masked": true, - "fill_value": NaN, + "fill_value": "NaN", "long_name": "Geopotential", "standard_name": "geopotential", "units": "m**2 s**-2" @@ -3735,9 +3906,9 @@ { "configuration": { "blocksize": 0, - "clevel": 3, + "clevel": 1, "cname": "zstd", - "shuffle": "shuffle", + "shuffle": "bitshuffle", "typesize": 4 }, "name": "blosc" @@ -3748,7 +3919,7 @@ "latitude", "longitude" ], - "fill_value": 0.0, + "fill_value": "NaN", "node_type": "array", "shape": [ 9, @@ -3774,7 +3945,6 @@ "members": { "isobaricInhPa": { "attributes": { - "_FillValue": "AAAAAAAA+H8=", "long_name": "pressure", "positive": "down", "standard_name": "air_pressure", @@ -3802,8 +3972,8 @@ } ], "data_type": "float64", - "dimension_names": null, - "fill_value": 0.0, + "dimension_names": [], + "fill_value": "NaN", "node_type": "array", "shape": [], "storage_transformers": [], @@ -3811,7 +3981,6 @@ }, "latitude": { "attributes": { - "_FillValue": "AAAAAAAA+H8=", "long_name": "latitude", "standard_name": "latitude", "stored_direction": 
"decreasing", @@ -3837,13 +4006,23 @@ "endian": "little" }, "name": "bytes" + }, + { + "configuration": { + "blocksize": 0, + "clevel": 1, + "cname": "zstd", + "shuffle": "bitshuffle", + "typesize": 8 + }, + "name": "blosc" } ], "data_type": "float64", "dimension_names": [ "latitude" ], - "fill_value": 0.0, + "fill_value": "NaN", "node_type": "array", "shape": [ 9 @@ -3853,7 +4032,6 @@ }, "longitude": { "attributes": { - "_FillValue": "AAAAAAAA+H8=", "long_name": "longitude", "standard_name": "longitude", "units": "degrees_east" @@ -3878,13 +4056,23 @@ "endian": "little" }, "name": "bytes" + }, + { + "configuration": { + "blocksize": 0, + "clevel": 1, + "cname": "zstd", + "shuffle": "bitshuffle", + "typesize": 8 + }, + "name": "blosc" } ], "data_type": "float64", "dimension_names": [ "longitude" ], - "fill_value": 0.0, + "fill_value": "NaN", "node_type": "array", "shape": [ 9 @@ -3921,7 +4109,6 @@ "GRIB_totalNumber": 0, "GRIB_typeOfLevel": "surface", "GRIB_units": "Pa", - "_FillValue": "AAAAAAAA+H8=", "_eopf_attrs": { "coordinates": [ "latitude", @@ -3937,7 +4124,7 @@ "longitude" ], "eopf_is_masked": true, - "fill_value": NaN, + "fill_value": "NaN", "long_name": "Mean sea level pressure", "standard_name": "air_pressure_at_mean_sea_level", "units": "Pa" @@ -3972,9 +4159,9 @@ { "configuration": { "blocksize": 0, - "clevel": 3, + "clevel": 1, "cname": "zstd", - "shuffle": "shuffle", + "shuffle": "bitshuffle", "typesize": 4 }, "name": "blosc" @@ -3985,7 +4172,7 @@ "latitude", "longitude" ], - "fill_value": 0.0, + "fill_value": "NaN", "node_type": "array", "shape": [ 9, @@ -4021,7 +4208,7 @@ } ], "data_type": "int64", - "dimension_names": null, + "dimension_names": [], "fill_value": 0, "node_type": "array", "shape": [], @@ -4057,7 +4244,6 @@ "GRIB_totalNumber": 0, "GRIB_typeOfLevel": "isobaricInhPa", "GRIB_units": "%", - "_FillValue": "AAAAAAAA+H8=", "_eopf_attrs": { "coordinates": [ "latitude", @@ -4073,7 +4259,7 @@ "longitude" ], "eopf_is_masked": true, - "fill_value": NaN, + "fill_value": "NaN", "long_name": "Relative humidity", "standard_name": "relative_humidity", "units": "%" @@ -4108,9 +4294,9 @@ { "configuration": { "blocksize": 0, - "clevel": 3, + "clevel": 1, "cname": "zstd", - "shuffle": "shuffle", + "shuffle": "bitshuffle", "typesize": 4 }, "name": "blosc" @@ -4121,7 +4307,7 @@ "latitude", "longitude" ], - "fill_value": 0.0, + "fill_value": "NaN", "node_type": "array", "shape": [ 9, @@ -4135,7 +4321,7 @@ "dtype": "timedelta64[ns]", "long_name": "time since forecast_reference_time", "standard_name": "forecast_period", - "units": "days" + "units": "minutes" }, "chunk_grid": { "configuration": { @@ -4158,7 +4344,7 @@ } ], "data_type": "int64", - "dimension_names": null, + "dimension_names": [], "fill_value": 0, "node_type": "array", "shape": [], @@ -4167,7 +4353,6 @@ }, "surface": { "attributes": { - "_FillValue": "AAAAAAAA+H8=", "long_name": "original GRIB coordinate for key: level(surface)", "units": "1" }, @@ -4192,8 +4377,8 @@ } ], "data_type": "float64", - "dimension_names": null, - "fill_value": 0.0, + "dimension_names": [], + "fill_value": "NaN", "node_type": "array", "shape": [], "storage_transformers": [], @@ -4228,7 +4413,6 @@ "GRIB_totalNumber": 0, "GRIB_typeOfLevel": "surface", "GRIB_units": "kg m**-2", - "_FillValue": "AAAAAAAA+H8=", "_eopf_attrs": { "coordinates": [ "latitude", @@ -4244,7 +4428,7 @@ "longitude" ], "eopf_is_masked": true, - "fill_value": NaN, + "fill_value": "NaN", "long_name": "Total column ozone", "standard_name": "atmosphere_mass_content_of_ozone", 
"units": "kg m**-2" @@ -4279,9 +4463,9 @@ { "configuration": { "blocksize": 0, - "clevel": 3, + "clevel": 1, "cname": "zstd", - "shuffle": "shuffle", + "shuffle": "bitshuffle", "typesize": 4 }, "name": "blosc" @@ -4292,7 +4476,7 @@ "latitude", "longitude" ], - "fill_value": 0.0, + "fill_value": "NaN", "node_type": "array", "shape": [ 9, @@ -4330,7 +4514,6 @@ "GRIB_totalNumber": 0, "GRIB_typeOfLevel": "surface", "GRIB_units": "kg m**-2", - "_FillValue": "AAAAAAAA+H8=", "_eopf_attrs": { "coordinates": [ "latitude", @@ -4346,7 +4529,7 @@ "longitude" ], "eopf_is_masked": true, - "fill_value": NaN, + "fill_value": "NaN", "long_name": "Total column vertically-integrated water vapour", "standard_name": "lwe_thickness_of_atmosphere_mass_content_of_water_vapor", "units": "kg m**-2" @@ -4381,9 +4564,9 @@ { "configuration": { "blocksize": 0, - "clevel": 3, + "clevel": 1, "cname": "zstd", - "shuffle": "shuffle", + "shuffle": "bitshuffle", "typesize": 4 }, "name": "blosc" @@ -4394,7 +4577,7 @@ "latitude", "longitude" ], - "fill_value": 0.0, + "fill_value": "NaN", "node_type": "array", "shape": [ 9, @@ -4431,7 +4614,7 @@ } ], "data_type": "int64", - "dimension_names": null, + "dimension_names": [], "fill_value": 0, "node_type": "array", "shape": [], @@ -4467,7 +4650,6 @@ "GRIB_totalNumber": 0, "GRIB_typeOfLevel": "surface", "GRIB_units": "m s**-1", - "_FillValue": "AAAAAAAA+H8=", "_eopf_attrs": { "coordinates": [ "latitude", @@ -4483,7 +4665,7 @@ "longitude" ], "eopf_is_masked": true, - "fill_value": NaN, + "fill_value": "NaN", "long_name": "10 metre U wind component", "standard_name": "unknown", "units": "m s**-1" @@ -4518,9 +4700,9 @@ { "configuration": { "blocksize": 0, - "clevel": 3, + "clevel": 1, "cname": "zstd", - "shuffle": "shuffle", + "shuffle": "bitshuffle", "typesize": 4 }, "name": "blosc" @@ -4531,7 +4713,7 @@ "latitude", "longitude" ], - "fill_value": 0.0, + "fill_value": "NaN", "node_type": "array", "shape": [ 9, @@ -4569,7 +4751,6 @@ "GRIB_totalNumber": 0, "GRIB_typeOfLevel": "surface", "GRIB_units": "m s**-1", - "_FillValue": "AAAAAAAA+H8=", "_eopf_attrs": { "coordinates": [ "latitude", @@ -4585,7 +4766,7 @@ "longitude" ], "eopf_is_masked": true, - "fill_value": NaN, + "fill_value": "NaN", "long_name": "10 metre V wind component", "standard_name": "unknown", "units": "m s**-1" @@ -4620,9 +4801,9 @@ { "configuration": { "blocksize": 0, - "clevel": 3, + "clevel": 1, "cname": "zstd", - "shuffle": "shuffle", + "shuffle": "bitshuffle", "typesize": 4 }, "name": "blosc" @@ -4633,7 +4814,7 @@ "latitude", "longitude" ], - "fill_value": 0.0, + "fill_value": "NaN", "node_type": "array", "shape": [ 9, @@ -4670,7 +4851,7 @@ } ], "data_type": "int64", - "dimension_names": null, + "dimension_names": [], "fill_value": 0, "node_type": "array", "shape": [], @@ -4851,54 +5032,54 @@ { "cellSize": 0.0, "id": "r10m", - "matrixHeight": 1, - "matrixWidth": 1, + "matrixHeight": 45, + "matrixWidth": 45, "pointOfOrigin": [ 0.0, 0.0 ], "scaleDenominator": 0.0, - "tileHeight": 10980, - "tileWidth": 10980 + "tileHeight": 244, + "tileWidth": 244 }, { "cellSize": 0.0, "id": "r20m", - "matrixHeight": 1, - "matrixWidth": 1, + "matrixHeight": 30, + "matrixWidth": 30, "pointOfOrigin": [ 0.0, 0.0 ], "scaleDenominator": 0.0, - "tileHeight": 5490, - "tileWidth": 5490 + "tileHeight": 183, + "tileWidth": 183 }, { "cellSize": 0.0, "id": "r60m", - "matrixHeight": 1, - "matrixWidth": 1, + "matrixHeight": 10, + "matrixWidth": 10, "pointOfOrigin": [ 0.0, 0.0 ], "scaleDenominator": 0.0, - "tileHeight": 1830, - "tileWidth": 1830 + 
"tileHeight": 183, + "tileWidth": 183 }, { "cellSize": 0.0, "id": "r120m", - "matrixHeight": 1, - "matrixWidth": 1, + "matrixHeight": 5, + "matrixWidth": 5, "pointOfOrigin": [ 0.0, 0.0 ], "scaleDenominator": 0.0, - "tileHeight": 915, - "tileWidth": 915 + "tileHeight": 183, + "tileWidth": 183 }, { "cellSize": 0.0, @@ -4910,8 +5091,8 @@ 0.0 ], "scaleDenominator": 0.0, - "tileHeight": 256, - "tileWidth": 256 + "tileHeight": 183, + "tileWidth": 183 }, { "cellSize": 0.0, @@ -4969,28 +5150,11 @@ "members": { "r10m": { "attributes": { - "grid_mapping": "spatial_ref", - "proj:code": "EPSG:32632", - "spatial:bbox": [ - 0.0, - 0.0, - 0.0, - 0.0 - ], - "spatial:dimensions": [ - "y", - "x" - ], - "spatial:registration": "pixel", - "spatial:shape": [ - 10980, - 10980 - ] + "grid_mapping": "spatial_ref" }, "members": { "b02": { "attributes": { - "_FillValue": "AAAAAAAAAAA=", "_eopf_attrs": { "add_offset": -0.1, "coordinates": [ @@ -5009,6 +5173,7 @@ "scale_factor": 0.0001, "units": "digital_counts" }, + "coordinates": "spatial_ref", "dtype": " zarr.Group: """ source_path: pathlib.Path = request.param store = {} - return GroupSpecV3(**read_json(source_path)).to_zarr(store, path="") + # Need to ignore warnings about fixed-length string dtypes + with pytest.warns(zarr.errors.UnstableSpecificationWarning): + return GroupSpecV3(**read_json(source_path)).to_zarr(store, path="") @pytest.fixture(params=optimized_geozarr_example_paths, ids=get_stem) diff --git a/tests/test_cli_e2e.py b/tests/test_cli_e2e.py index 2fc277c..210c9d2 100644 --- a/tests/test_cli_e2e.py +++ b/tests/test_cli_e2e.py @@ -10,20 +10,20 @@ import subprocess from pathlib import Path +import jsondiff import pytest import xarray as xr import zarr from pydantic_zarr.core import tuplify_json -from pydantic_zarr.v3 import GroupSpec - -from tests.test_data_api.conftest import view_json_diff +from pydantic_zarr.experimental.v3 import GroupSpec +@pytest.mark.filterwarnings("ignore:.*:zarr.errors.UnstableSpecificationWarning") def test_convert_s2_optimized(s2_group_example: Path, tmp_path: Path) -> None: """ Test the convert-s2-optimized CLI command on a local copy of sentinel data """ - output_path = tmp_path + output_path = tmp_path / (s2_group_example.stem + '_geozarr.zarr') cmd = [ "python", @@ -38,6 +38,7 @@ def test_convert_s2_optimized(s2_group_example: Path, tmp_path: Path) -> None: assert res.returncode == 0, res.stderr +@pytest.mark.filterwarnings("ignore:.*:zarr.errors.UnstableSpecificationWarning") def test_cli_convert_real_sentinel2_data(s2_group_example: Path, tmp_path: Path) -> None: """ Test CLI conversion using a Sentinel-2 hierarchy saved locally. 
@@ -46,7 +47,12 @@ def test_cli_convert_real_sentinel2_data(s2_group_example: Path, tmp_path: Path) output_path = tmp_path / "s2b_geozarr_cli_test.zarr" # Detect product level (L1C vs L2A) by checking which quicklook group exists - dt_source = xr.open_datatree(s2_group_example, engine="zarr") + dt_source = xr.open_datatree( + s2_group_example, + engine="zarr", + consolidated=False, + decode_timedelta=True, + ) has_l2a_quicklook = "/quality/l2a_quicklook" in dt_source.groups has_l1c_quicklook = "/quality/l1c_quicklook" in dt_source.groups @@ -139,9 +145,29 @@ def test_cli_convert_real_sentinel2_data(s2_group_example: Path, tmp_path: Path) observed_structure_json = tuplify_json( GroupSpec.from_zarr(zarr.open_group(output_path)).model_dump() ) - assert expected_structure_json == observed_structure_json, view_json_diff( - expected_structure_json, observed_structure_json - ) + observed_structure = GroupSpec(**observed_structure_json) + observed_structure_flat = observed_structure.to_flat() + expected_structure = GroupSpec(**expected_structure_json) + expected_structure_flat = expected_structure.to_flat() + + o_keys = set(observed_structure_flat.keys()) + e_keys = set(expected_structure_flat.keys()) + + # Check that all of the keys are the same + assert o_keys == e_keys + + differences: dict[str, dict[str, object]] = {} + for k in o_keys: + expected_val = expected_structure_flat[k] + observed_val = observed_structure_flat[k] + + expected_val_json = tuplify_json(expected_val.model_dump()) + observed_val_json = tuplify_json(observed_val.model_dump()) + + if expected_val_json != observed_val_json: + differences[k] = jsondiff.diff(expected_val_json, observed_val_json) + + assert differences == {} def test_cli_help_commands() -> None: @@ -270,6 +296,7 @@ def test_cli_convert_with_crs_groups(s2_group_example, tmp_path: Path) -> None: assert (output_path / "zarr.json").exists(), "Main zarr.json not found" +@pytest.mark.filterwarnings("ignore:.*:zarr.errors.ZarrUserWarning") def test_cli_crs_groups_empty_list(tmp_path: str) -> None: """Test CLI with --crs-groups but no groups specified (empty list).""" # Create a minimal test dataset diff --git a/tests/test_data_api/test_geozarr/test_common.py b/tests/test_data_api/test_geozarr/test_common.py index 71505d8..33cf108 100644 --- a/tests/test_data_api/test_geozarr/test_common.py +++ b/tests/test_data_api/test_geozarr/test_common.py @@ -40,7 +40,10 @@ def test_datarraylike(obj: DataArray_V2 | DataArray_V3) -> None: assert isinstance(obj, DataArrayLike) -@pytest.mark.parametrize("obj", [GroupSpec_V2(), GroupSpec_V3()]) +@pytest.mark.parametrize( + "obj", + [GroupSpec_V2(attributes={}, members={}), GroupSpec_V3(attributes={}, members={})], +) def test_grouplike(obj: GroupSpec_V3[Any, Any] | GroupSpec_V2[Any, Any]) -> None: """ Test that the GroupLike protocol works correctly @@ -76,6 +79,7 @@ def test_check_standard_name_invalid() -> None: check_standard_name("invalid_standard_name") +@pytest.mark.filterwarnings("ignore:.*:zarr.errors.UnstableSpecificationWarning") def test_multiscales_round_trip(s2_optimized_geozarr_group_example: zarr.Group) -> None: """ Ensure that we can round-trip multiscale metadata through the `Multiscales` model. 
diff --git a/tests/test_data_api/test_geozarr/test_multiscales/test_geozarr.py b/tests/test_data_api/test_geozarr/test_multiscales/test_geozarr.py index 78bb3af..aaeacba 100644 --- a/tests/test_data_api/test_geozarr/test_multiscales/test_geozarr.py +++ b/tests/test_data_api/test_geozarr/test_multiscales/test_geozarr.py @@ -18,7 +18,6 @@ def test_multiscale_group_attrs(multiscale_flavor: set[Literal["zcm", "tms"]]) - zcm_meta: dict[str, object] = {} tms_meta: dict[str, object] = {} zarr_conventions_meta: MISSING | tuple[Any, ...] = MISSING - if "zcm" in multiscale_flavor: layout = ( zcm.ScaleLevel( diff --git a/tests/test_data_api/test_v2.py b/tests/test_data_api/test_v2.py index 5855fd2..8c20255 100644 --- a/tests/test_data_api/test_v2.py +++ b/tests/test_data_api/test_v2.py @@ -1,5 +1,6 @@ from __future__ import annotations +from collections.abc import Mapping # noqa: TC003 from typing import Any import numpy as np @@ -13,6 +14,11 @@ ) +class GroupWithDataArrays(GroupSpec): + attributes: Any = {} # noqa: RUF012 + members: Mapping[str, DataArray] + + def test_invalid_dimension_names() -> None: msg = r"The _ARRAY_DIMENSIONS attribute has length 3, which does not match the number of dimensions for this array \(got 2\)" with pytest.raises(ValidationError, match=msg): @@ -35,7 +41,7 @@ def test_valid(data_shape: tuple[int, ...]) -> None: f"dim_{idx}": DataArray.from_array(np.arange(s), dimension_names=(f"dim_{idx}",)) for idx, s in enumerate(data_shape) } - group = GroupSpec[Any, DataArray](members={"base": base_array, **coords_arrays}) + group = GroupWithDataArrays(members={"base": base_array, **coords_arrays}) assert check_valid_coordinates(group) == group @staticmethod @@ -57,7 +63,8 @@ def test_invalid_coordinates( f"dim_{idx}": DataArray.from_array(np.arange(s + 1), dimension_names=(f"dim_{idx}",)) for idx, s in enumerate(data_shape) } - group = GroupSpec[Any, DataArray](members={"base": base_array, **coords_arrays}) + + group = GroupWithDataArrays(members={"base": base_array, **coords_arrays}) msg = "Dimension .* for array 'base' has a shape mismatch:" with pytest.raises(ValueError, match=msg): check_valid_coordinates(group) diff --git a/tests/test_data_api/test_v3.py b/tests/test_data_api/test_v3.py index 1054a41..10aa872 100644 --- a/tests/test_data_api/test_v3.py +++ b/tests/test_data_api/test_v3.py @@ -1,3 +1,4 @@ +from collections.abc import Mapping from typing import Any import numpy as np @@ -13,6 +14,11 @@ ) +class GroupWithDataArrays(GroupSpec): + attributes: Any = {} # noqa: RUF012 + members: Mapping[str, DataArray] + + class TestCheckValidCoordinates: @staticmethod @pytest.mark.parametrize("data_shape", [(10,), (10, 12)]) @@ -29,7 +35,7 @@ def test_valid(data_shape: tuple[int, ...]) -> None: f"dim_{idx}": DataArray.from_array(np.arange(s), dimension_names=(f"dim_{idx}",)) for idx, s in enumerate(data_shape) } - group = GroupSpec[Any, DataArray](members={"base": base_array, **coords_arrays}) + group = GroupWithDataArrays(members={"base": base_array, **coords_arrays}) assert check_valid_coordinates(group) == group @staticmethod @@ -51,12 +57,13 @@ def test_invalid_coordinates( f"dim_{idx}": DataArray.from_array(np.arange(s + 1), dimension_names=(f"dim_{idx}",)) for idx, s in enumerate(data_shape) } - group = GroupSpec[Any, DataArray](members={"base": base_array, **coords_arrays}) + group = GroupWithDataArrays(members={"base": base_array, **coords_arrays}) msg = "Dimension .* for array 'base' has a shape mismatch:" with pytest.raises(ValueError, match=msg): 
check_valid_coordinates(group) +@pytest.mark.filterwarnings("ignore:.*:zarr.errors.UnstableSpecificationWarning") def test_dataarray_round_trip(s2_geozarr_group_example: Any) -> None: """ Ensure that we can round-trip dataarray attributes through the `Multiscales` model. diff --git a/tests/test_integration_sentinel1.py b/tests/test_integration_sentinel1.py index bf34ffc..e6f3e13 100644 --- a/tests/test_integration_sentinel1.py +++ b/tests/test_integration_sentinel1.py @@ -208,6 +208,7 @@ def test_invalid_gcp_group_raises_error(temp_output_dir, sample_sentinel1_datatr ) +@pytest.mark.filterwarnings("ignore::rasterio.errors.NotGeoreferencedWarning") @pytest.mark.parametrize( "polarization_group", [ diff --git a/tests/test_reprojection_validation.py b/tests/test_reprojection_validation.py index 8159d73..4432057 100755 --- a/tests/test_reprojection_validation.py +++ b/tests/test_reprojection_validation.py @@ -8,6 +8,7 @@ import numpy as np import pandas as pd +import pytest import xarray as xr from eopf_geozarr.conversion import create_geozarr_dataset @@ -140,6 +141,7 @@ def build(self) -> xr.DataTree: return dt +@pytest.mark.filterwarnings("ignore::rasterio.errors.NotGeoreferencedWarning") def test_titiler_compatibility(tmp_path: Path) -> None: """Test that reprojected Sentinel-1 data is compatible with titiler-eopf operations.""" print("🧪 Testing titiler-eopf compatibility with reprojected Sentinel-1 data...") diff --git a/tests/test_s2_converter_simplified.py b/tests/test_s2_converter_simplified.py index 16dcc48..50b2772 100644 --- a/tests/test_s2_converter_simplified.py +++ b/tests/test_s2_converter_simplified.py @@ -6,17 +6,18 @@ import json from pathlib import Path -from unittest.mock import Mock, patch import numpy as np import pytest import xarray as xr +import zarr +from structlog.testing import capture_logs as cap_logs from eopf_geozarr.s2_optimization.s2_converter import ( convert_s2_optimized, - initialize_crs_from_dataset, simple_root_consolidation, ) +from eopf_geozarr.s2_optimization.s2_multiscale import auto_chunks @pytest.fixture @@ -48,159 +49,10 @@ def mock_s2_dataset() -> xr.DataTree: return dt -class TestS2FunctionalAPI: - """Test the S2 functional API.""" - - def test_is_sentinel2_dataset_placeholder(self) -> None: - """Placeholder test for is_sentinel2_dataset. - - The actual is_sentinel2_dataset function uses complex pydantic validation - that requires a fully structured zarr group matching Sentinel1Root or - Sentinel2Root models. Testing this would require creating a complete - mock sentinel dataset, which is better done in integration tests. 
- """ - # This test is kept as a placeholder to maintain test structure - assert True - - -class TestCRSInitialization: - """Test CRS initialization functionality.""" - - def test_initialize_crs_from_cpm_260_metadata(self) -> None: - """Test CRS initialization from CPM >= 2.6.0 metadata with integer EPSG.""" - # Create a DataTree with CPM 2.6.0+ style metadata (integer format) - dt = xr.DataTree() - dt.attrs = { - "other_metadata": { - "horizontal_CRS_code": 32632 # EPSG:32632 (WGS 84 / UTM zone 32N) - } - } - - crs = initialize_crs_from_dataset(dt) - - assert crs is not None - assert crs.to_epsg() == 32632 - - def test_initialize_crs_from_cpm_260_metadata_string(self) -> None: - """Test CRS initialization from CPM >= 2.6.0 metadata with string EPSG.""" - # Create a DataTree with CPM 2.6.0+ style metadata (string format) - dt = xr.DataTree() - dt.attrs = { - "other_metadata": { - "horizontal_CRS_code": "EPSG:32632" # String format - } - } - - crs = initialize_crs_from_dataset(dt) - - assert crs is not None - assert crs.to_epsg() == 32632 - - def test_initialize_crs_from_cpm_260_metadata_invalid_epsg(self) -> None: - """Test CRS initialization with invalid EPSG code in CPM 2.6.0 metadata.""" - # Create a DataTree with invalid EPSG code - dt = xr.DataTree() - dt.attrs = { - "other_metadata": { - "horizontal_CRS_code": 999999 # Invalid EPSG code - } - } - - # Should fall through to other methods or return None - crs = initialize_crs_from_dataset(dt) - - # CRS should be None since there's no other CRS information - assert crs is None - - def test_initialize_crs_from_rio_accessor(self) -> None: - """Test CRS initialization from rioxarray accessor.""" - # Create a dataset with rioxarray CRS - coords = { - "x": (["x"], np.linspace(0, 1000, 10)), - "y": (["y"], np.linspace(0, 1000, 10)), - } - data_vars = { - "b02": (["y", "x"], np.random.rand(10, 10)), - } - ds = xr.Dataset(data_vars, coords=coords) - ds = ds.rio.write_crs("EPSG:32633") - - # Create DataTree without CPM 2.6.0 metadata - dt = xr.DataTree(ds) - - crs = initialize_crs_from_dataset(dt) - - assert crs is not None - assert crs.to_epsg() == 32633 - - def test_initialize_crs_from_proj_epsg_attribute(self) -> None: - """Test CRS initialization from proj:epsg attribute.""" - # Create a dataset with proj:epsg attribute - coords = { - "x": (["x"], np.linspace(0, 1000, 10)), - "y": (["y"], np.linspace(0, 1000, 10)), - } - data_vars = { - "b02": (["y", "x"], np.random.rand(10, 10)), - } - ds = xr.Dataset(data_vars, coords=coords) - ds["b02"].attrs["proj:epsg"] = 32634 - - # Create DataTree - dt = xr.DataTree(ds) - - crs = initialize_crs_from_dataset(dt) - - assert crs is not None - assert crs.to_epsg() == 32634 - - def test_initialize_crs_no_crs_information(self) -> None: - """Test CRS initialization when no CRS information is available.""" - # Create a dataset without any CRS information - coords = { - "x": (["x"], np.linspace(0, 1000, 10)), - "y": (["y"], np.linspace(0, 1000, 10)), - } - data_vars = { - "b02": (["y", "x"], np.random.rand(10, 10)), - } - ds = xr.Dataset(data_vars, coords=coords) - - # Create DataTree without any CRS metadata - dt = xr.DataTree(ds) - - crs = initialize_crs_from_dataset(dt) - - assert crs is None - - def test_initialize_crs_priority_cpm_260_over_rio(self) -> None: - """Test that CPM 2.6.0 metadata takes priority over rio accessor.""" - # Create a dataset with both CPM 2.6.0 metadata and rio CRS - coords = { - "x": (["x"], np.linspace(0, 1000, 10)), - "y": (["y"], np.linspace(0, 1000, 10)), - } - data_vars = { - 
"b02": (["y", "x"], np.random.rand(10, 10)), - } - ds = xr.Dataset(data_vars, coords=coords) - ds = ds.rio.write_crs("EPSG:32633") # Different EPSG - - # Create DataTree with CPM 2.6.0 metadata - dt = xr.DataTree(ds) - dt.attrs = { - "other_metadata": { - "horizontal_CRS_code": 32632 # This should take priority - } - } - - crs = initialize_crs_from_dataset(dt) - - assert crs is not None - # CPM 2.6.0 metadata should take priority - assert crs.to_epsg() == 32632 - - +@pytest.mark.filterwarnings("ignore:Object at .*:zarr.errors.ZarrUserWarning") +@pytest.mark.filterwarnings( + "ignore:Consolidated metadata is currently .*:zarr.errors.ZarrUserWarning" +) def test_simple_root_consolidation_success(tmp_path: Path) -> None: """ Test that simple_root_consolidation produces consolidated metadata at the root, and for the @@ -238,65 +90,119 @@ def test_simple_root_consolidation_success(tmp_path: Path) -> None: assert atmos_zmeta["consolidated_metadata"] is None +@pytest.mark.filterwarnings("ignore:.*Expected 'MISSING' sentinel:UserWarning") +@pytest.mark.filterwarnings("ignore:.*:zarr.errors.UnstableSpecificationWarning") +@pytest.mark.filterwarnings("ignore:.*:xarray.coding.common.SerializationWarning") +@pytest.mark.filterwarnings( + "ignore:Failed to open Zarr store with consolidated metadata:RuntimeWarning" +) +@pytest.mark.filterwarnings("ignore:.*:zarr.errors.ZarrUserWarning") +@pytest.mark.filterwarnings("ignore:Pydantic serializer warnings:UserWarning") class TestConvenienceFunction: """Test the convenience function.""" - @patch("eopf_geozarr.s2_optimization.s2_converter.create_result_datatree") - @patch("eopf_geozarr.s2_optimization.s2_converter.zarr.open_group") - @patch("eopf_geozarr.s2_optimization.s2_converter.initialize_crs_from_dataset") - @patch("eopf_geozarr.s2_optimization.s2_converter.get_zarr_group") - @patch("eopf_geozarr.s2_optimization.s2_converter.is_sentinel2_dataset") - @patch("eopf_geozarr.s2_optimization.s2_converter.create_multiscale_from_datatree") - @patch("eopf_geozarr.s2_optimization.s2_converter.simple_root_consolidation") + @pytest.mark.parametrize("compression_level", [3]) + @pytest.mark.parametrize("spatial_chunk", [256]) + @pytest.mark.parametrize("enable_sharding", [False, True]) def test_convert_s2_optimized_convenience_function( self, - mock_consolidation: Mock, - mock_multiscale: Mock, - mock_is_s2: Mock, - mock_get_zarr_group: Mock, - mock_init_crs: Mock, - mock_zarr_open_group: Mock, - mock_create_result_datatree: Mock, + s2_group_example: Path, + tmp_path: Path, + spatial_chunk: int, + compression_level: int, + enable_sharding: bool, ) -> None: - """Test the convenience function parameter passing.""" - # Setup mocks - mock_multiscale.return_value = {} - mock_is_s2.return_value = True - mock_get_zarr_group.return_value = Mock() - mock_init_crs.return_value = None # Return None for CRS - - # Mock zarr.open_group to return a group with proper groups() and members() methods - mock_zarr_group = Mock() - mock_zarr_group.groups.return_value = iter([]) # Return empty iterator for groups - mock_zarr_group.members.return_value = {} # Return empty dict for members - mock_zarr_open_group.return_value = mock_zarr_group - - # Mock create_result_datatree to return a mock DataTree with groups attribute - mock_result_dt = Mock() - mock_result_dt.groups = [] # Empty list for groups - mock_create_result_datatree.return_value = mock_result_dt - - # Test parameter passing - Mock DataTree with groups attribute - dt_input = Mock() - dt_input.groups = 
["/measurements/reflectance/r10m"] - output_path = "/test/path" - - convert_s2_optimized( - dt_input, - output_path=output_path, - enable_sharding=False, - spatial_chunk=512, - compression_level=2, - max_retries=5, - validate_output=False, - keep_scale_offset=False, + """Test the convenience function with real S2 data.""" + # Open the S2 example as a DataTree + dt_input = xr.open_datatree( + s2_group_example, + engine="zarr", + consolidated=False, + decode_timedelta=True, + ) + output_path = str(tmp_path / "test_output.zarr") + + # Run the conversion + with cap_logs(): + result = convert_s2_optimized( + dt_input, + output_path=output_path, + enable_sharding=enable_sharding, + spatial_chunk=spatial_chunk, + compression_level=compression_level, + max_retries=5, + validate_output=False, + keep_scale_offset=False, + ) + + # Verify the output was created + assert Path(output_path).exists() + + # Verify the result is a DataTree + assert isinstance(result, xr.DataTree) + + # Verify basic structure - output should have multiscale groups + output_dt = xr.open_datatree(output_path, engine="zarr") + assert len(output_dt.groups) > 0 + + # Open the Zarr store to verify metadata + # We know the S2 data has measurements/reflectance group structure + output_zarr = zarr.open_group(output_path, mode="r") + + reflectance_group = output_zarr["measurements/reflectance"] + reencoded_array = reflectance_group["r10m/b03"] + downsampled_array = reflectance_group["r720m/b03"] + + # For re-encoded arrays, chunks should match output of auto_chunks + assert reencoded_array.chunks == auto_chunks(reencoded_array.shape, spatial_chunk) + + # For downsampled arrays, chunks should be min(spatial_chunk, array_shape) + # because the array might be smaller than the chunk size + expected_downsampled_chunks = ( + min(spatial_chunk, downsampled_array.shape[0]), + min(spatial_chunk, downsampled_array.shape[1]), + ) + assert downsampled_array.chunks == expected_downsampled_chunks + + # Sharding should only be present on arrays large enough for the chunk size + # Reencoded arrays are typically large (10980x10980), so should have sharding + if enable_sharding: + assert reencoded_array.shards is not None, "Large reencoded array should have sharding" + else: + assert reencoded_array.shards is None, ( + "Reencoded array should not have sharding when disabled" + ) + + # Downsampled arrays (r720m) might be too small for sharding + # Only check for sharding if the array is large enough (shape >= spatial_chunk in both dims) + if ( + enable_sharding + and downsampled_array.shape[0] >= spatial_chunk + and downsampled_array.shape[1] >= spatial_chunk + ): + assert downsampled_array.shards is not None, ( + f"Downsampled array with shape {downsampled_array.shape} should have sharding" + ) + elif ( + not enable_sharding + or downsampled_array.shape[0] < spatial_chunk + or downsampled_array.shape[1] < spatial_chunk + ): + assert downsampled_array.shards is None, ( + f"Downsampled array with shape {downsampled_array.shape} should not have sharding " + f"(enable_sharding={enable_sharding}, spatial_chunk={spatial_chunk})" + ) + + assert reencoded_array.compressors[0].to_dict()["name"] == "blosc" + assert downsampled_array.compressors[0].to_dict()["name"] == "blosc" + + assert ( + reencoded_array.compressors[0].to_dict()["configuration"]["clevel"] == compression_level + ) + assert ( + downsampled_array.compressors[0].to_dict()["configuration"]["clevel"] + == compression_level ) - - # Verify multiscale function was called with correct args - 
mock_multiscale.assert_called_once() - call_kwargs = mock_multiscale.call_args.kwargs - assert call_kwargs["enable_sharding"] is False - assert call_kwargs["spatial_chunk"] == 512 if __name__ == "__main__": diff --git a/tests/test_s2_multiscale.py b/tests/test_s2_multiscale.py index 82adf64..0a8c262 100644 --- a/tests/test_s2_multiscale.py +++ b/tests/test_s2_multiscale.py @@ -15,14 +15,172 @@ from pydantic_zarr.v3 import GroupSpec from structlog.testing import capture_logs +from eopf_geozarr.s2_optimization.s2_converter import convert_s2_optimized from eopf_geozarr.s2_optimization.s2_multiscale import ( calculate_aligned_chunk_size, calculate_simple_shard_dimensions, create_downsampled_resolution_group, create_measurements_encoding, - create_multiscale_from_datatree, ) +try: + from pyproj import CRS as ProjCRS +except ImportError: + ProjCRS = None + + +def add_spatial_ref_to_group(group: zarr.Group, epsg_code: int | None = None) -> None: + """ + Add spatial_ref coordinate to all resolution levels in a group. + + This manually creates the spatial_ref coordinate that rioxarray would create, + avoiding the need for rasterio dependency in the core re-encoding logic. + + Parameters + ---------- + group : zarr.Group + The group containing resolution level subgroups (e.g., r10m, r20m) + epsg_code : int, optional + EPSG code to use. If None, will try to detect from data variable attributes. + """ + if ProjCRS is None: + pytest.skip("pyproj not available") + return + + # Iterate over all resolution levels + for _, member in group.members(): + if not isinstance(member, zarr.Group): + continue + + # Check if we can determine EPSG from array attributes in the zarr group + detected_epsg = epsg_code + if detected_epsg is None: + for _, array in member.arrays(): + array_attrs = dict(array.attrs) + if "proj:epsg" in array_attrs: + detected_epsg = int(array_attrs["proj:epsg"]) + break + + if detected_epsg is None: + continue + + # Create CRS from EPSG + crs = ProjCRS.from_epsg(detected_epsg) + cf_attrs = crs.to_cf() + + # Create spatial_ref array directly in zarr (scalar int64 array) + # This avoids issues with xarray's to_zarr mode="a" handling + spatial_ref_array = member.create_array( + "spatial_ref", + shape=(), + dtype="int64", + fill_value=0, + overwrite=True, + ) + spatial_ref_array.attrs.update(cf_attrs) + spatial_ref_array[()] = 0 # Set the scalar value + + +@pytest.mark.filterwarnings("ignore:.*:zarr.errors.ZarrUserWarning") +def test_add_spatial_ref_matches_rasterio(tmp_path: pathlib.Path) -> None: + """Test that our manual spatial_ref creation matches rasterio's output.""" + try: + import rioxarray + except ImportError: + pytest.skip("rioxarray not available") + + if ProjCRS is None: + pytest.skip("pyproj not available") + + # Create a test dataset with spatial dimensions + x = np.arange(0, 100, 10, dtype=np.float64) + y = np.arange(0, 100, 10, dtype=np.float64) + data = np.random.rand(len(y), len(x)).astype(np.float32) + + test_epsg = 32632 # UTM zone 32N + + ds = xr.Dataset( + { + "test_var": (["y", "x"], data, {"proj:epsg": test_epsg}), + }, + coords={"x": x, "y": y}, + ) + + # Create two stores: one for rasterio, one for manual + rasterio_store = tmp_path / "rasterio.zarr" + manual_store = tmp_path / "manual.zarr" + + # Create group structure with a resolution level for manual approach + manual_root = zarr.create_group(str(manual_store)) + _ = manual_root.create_group("r10m") # Create the subgroup structure + + # Write dataset using rioxarray (which creates spatial_ref automatically) + 
ds_with_crs = ds.rio.write_crs(test_epsg) + ds_with_crs.to_zarr(str(rasterio_store), group="r10m", mode="w") + + # Write dataset without spatial_ref, then add it manually + ds.to_zarr(str(manual_store), group="r10m", mode="a") + add_spatial_ref_to_group(manual_root, epsg_code=test_epsg) + + # Reopen groups to access arrays (without consolidated metadata since we added spatial_ref manually) + rasterio_res_group_reopened = zarr.open_group( + str(rasterio_store), mode="r", use_consolidated=False + )["r10m"] + manual_res_group_reopened = zarr.open_group( + str(manual_store), mode="r", use_consolidated=False + )["r10m"] + + # Get spatial_ref arrays + rasterio_spatial_ref_member = rasterio_res_group_reopened["spatial_ref"] + manual_spatial_ref_member = manual_res_group_reopened["spatial_ref"] + + # Ensure they are arrays + assert isinstance(rasterio_spatial_ref_member, zarr.Array) + assert isinstance(manual_spatial_ref_member, zarr.Array) + + # Check that both exist and have the same shape (scalar) + assert rasterio_spatial_ref_member.shape == () + assert manual_spatial_ref_member.shape == () + + # Check that both have the same dtype + assert rasterio_spatial_ref_member.dtype == manual_spatial_ref_member.dtype + + # Check that both have the same value + rasterio_value = rasterio_spatial_ref_member[()] + manual_value = manual_spatial_ref_member[()] + assert rasterio_value == manual_value + + # Check that CF attributes match + rasterio_attrs = dict(rasterio_spatial_ref_member.attrs) + manual_attrs = dict(manual_spatial_ref_member.attrs) + + # Both should have grid_mapping_name + assert "grid_mapping_name" in rasterio_attrs + assert "grid_mapping_name" in manual_attrs + assert rasterio_attrs["grid_mapping_name"] == manual_attrs["grid_mapping_name"] + + # Both should have crs_wkt, but the format may differ (WKT1 vs WKT2) + # Verify both are valid WKT and represent the same CRS + assert "crs_wkt" in rasterio_attrs + assert "crs_wkt" in manual_attrs + + # Parse both WKT strings and verify they represent the same EPSG code + rasterio_crs = ProjCRS.from_wkt(str(rasterio_attrs["crs_wkt"])) + manual_crs = ProjCRS.from_wkt(str(manual_attrs["crs_wkt"])) + + # Both should have the same EPSG code + assert rasterio_crs.to_epsg() == test_epsg + assert manual_crs.to_epsg() == test_epsg + + # Verify the manual implementation has the expected CF attribute keys + # (pyproj's to_cf() generates these) + expected_cf_keys = {"grid_mapping_name", "crs_wkt"} + assert expected_cf_keys.issubset(set(manual_attrs.keys())) + + # The manual implementation should have all standard CF-compliant attributes + # that pyproj generates (there may be additional ones) + assert len(manual_attrs) >= len(expected_cf_keys) + @pytest.fixture def sample_dataset(s2_group_example: pathlib.Path) -> xr.Dataset: @@ -77,6 +235,7 @@ def test_calculate_simple_shard_dimensions() -> None: assert shard_dims[1] == 768 # 3 * 256 = 768 +@pytest.mark.filterwarnings("ignore:.*:zarr.errors.ZarrUserWarning") @pytest.mark.parametrize("keep_scale_offset", [True, False]) def test_create_measurements_encoding(keep_scale_offset: bool, sample_dataset: xr.Dataset) -> None: """Test measurements encoding creation with xy-aligned sharding.""" @@ -147,48 +306,54 @@ def test_calculate_aligned_chunk_size() -> None: @pytest.mark.filterwarnings("ignore:.*:RuntimeWarning") @pytest.mark.filterwarnings("ignore:.*:FutureWarning") @pytest.mark.filterwarnings("ignore:.*:UserWarning") -def test_create_multiscale_from_datatree( - s2_group_example: zarr.Group, +def 
test_convert_s2_optimized(
-    s2_group_example: zarr.Group,
+    s2_group_example: Path,
     tmp_path: pathlib.Path,
 ) -> None:
     """Test multiscale creation from DataTree."""
-    output_path = str(tmp_path / "output.zarr")
     input_group = zarr.open_group(s2_group_example)
-    output_group = zarr.create_group(output_path)
-    dt_input = xr.open_datatree(input_group.store, engine="zarr", chunks="auto")
 
-    # Capture log output using structlog's testing context manager
+    output_path = tmp_path / "output.zarr"
+
+    dt_input = xr.open_datatree(input_group.store, engine="zarr", chunks="auto")
     with capture_logs():
-        create_multiscale_from_datatree(
+        convert_s2_optimized(
             dt_input,
-            output_group=output_group,
+            output_path=str(output_path),
             enable_sharding=True,
             spatial_chunk=256,
+            compression_level=1,
+            validate_output=False,
             keep_scale_offset=False,
         )
-
-    observed_group = zarr.open_group(output_path, use_consolidated=False)
-
-    observed_structure_json = GroupSpec.from_zarr(observed_group).model_dump()
+    observed_group = zarr.open_group(output_path)
+    observed_structure = GroupSpec.from_zarr(observed_group)
 
     # Comparing JSON objects is sensitive to the difference between tuples and lists, but we
     # don't care about that here, so we convert all lists to tuples before creating the GroupSpec
+    observed_structure_json = observed_structure.model_dump()
     observed_structure = GroupSpec(**tuplify_json(observed_structure_json))
     observed_structure_flat = observed_structure.to_flat()
 
     expected_structure_path = Path("tests/_test_data/optimized_geozarr_examples/") / (
         s2_group_example.stem + ".json"
     )
-    # Uncomment this section to write out the expected structure from the observed structure
-    # This is useful when the expected structure needs to be updated
-    # expected_structure_path.write_text(
-    #     json.dumps(observed_structure_json, indent=2, sort_keys=True)
-    # )
 
     expected_structure_json = tuplify_json(json.loads(expected_structure_path.read_text()))
     expected_structure = GroupSpec(**expected_structure_json)
     expected_structure_flat = expected_structure.to_flat()
 
+    # Uncomment this section to write out the expected structure from the observed structure
+    # This is useful when the expected structure needs to be updated
+    # expected_structure_path.write_text(
+    #     json.dumps(observed_structure_json, indent=2, sort_keys=True)
+    # )
+
     # check that all multiscale levels have the same data type
     # this check is redundant with the later check, but it's expedient to check this here.
# eventually this check should be spun out into its own test
@@ -212,7 +377,44 @@ def test_create_multiscale_from_datatree(
     o_keys = set(observed_structure_flat.keys())
     e_keys = set(expected_structure_flat.keys())
 
+    # Write the observed and expected structures to the temporary test directory for debugging.
+    # The observed tree JSON is also useful when the expected structure needs to be updated.
+    observed_tree_path = tmp_path / "observed_tree.json"
+    observed_flat_path = tmp_path / "observed_flat.json"
+    expected_flat_path = tmp_path / "expected_flat.json"
+
+    observed_tree_path.write_text(
+        json.dumps(observed_structure_json, indent=2, sort_keys=True)
+    )
+
+    observed_flat_path.write_text(
+        json.dumps({k: v.model_dump() for k, v in observed_structure_flat.items()})
+    )
+    expected_flat_path.write_text(
+        json.dumps({k: v.model_dump() for k, v in expected_structure_flat.items()})
+    )
+
     # Check that all of the keys are the same
-    assert o_keys == e_keys
-    # Check that all values are the same
-    assert [k for k in o_keys if expected_structure_flat[k] != observed_structure_flat[k]] == []
+    if o_keys != e_keys:
+        (tmp_path / "extra_observed_keys.json").write_text(
+            json.dumps(
+                {k: v.model_dump() for k, v in observed_structure_flat.items() if k in o_keys - e_keys},
+                indent=2,
+            )
+        )
+        (tmp_path / "extra_expected_keys.json").write_text(
+            json.dumps(
+                {k: v.model_dump() for k, v in expected_structure_flat.items() if k in e_keys - o_keys},
+                indent=2,
+            )
+        )
+
+        raise AssertionError(
+            "Key mismatch.\n"
+            f"Extra in observed: {sorted(o_keys - e_keys)[:20]}\n"
+            f"Missing in observed: {sorted(e_keys - o_keys)[:20]}\n"
+            f"Check {observed_tree_path!s} to see the observed JSON structure."
+        )
+
+    # Check that all values are the same for common keys
+    common_keys = o_keys & e_keys
+    mismatch_values = [
+        k for k in common_keys if expected_structure_flat.get(k) != observed_structure_flat.get(k)
+    ]
+
+    if mismatch_values:
+        mismatch_values_path = tmp_path / "mismatch_values.json"
+        mismatch_values_path.write_text(
+            json.dumps(
+                {
+                    k: {
+                        "expected": expected_structure_flat[k].model_dump(),
+                        "observed": observed_structure_flat[k].model_dump(),
+                    }
+                    for k in mismatch_values
+                },
+                indent=2,
+            )
+        )
+        raise AssertionError(
+            f"Value mismatches: {mismatch_values[:20]}\n"
+            f"Check {mismatch_values_path!s} to see the mismatched values."
+        )
diff --git a/tests/test_s2_multiscale_geo_metadata.py b/tests/test_s2_multiscale_geo_metadata.py
index 4a6cc0a..98dd6ed 100644
--- a/tests/test_s2_multiscale_geo_metadata.py
+++ b/tests/test_s2_multiscale_geo_metadata.py
@@ -11,6 +11,7 @@ import pytest
 import xarray as xr
 import zarr
 
+from pyproj import CRS
 from eopf_geozarr.s2_optimization.s2_multiscale import (
     create_measurements_encoding,
@@ -19,6 +20,29 @@
 )
 
 
+def get_crs_from_dataset(ds: xr.Dataset) -> CRS | None:
+    """
+    Extract CRS from a dataset.
+
+    Tries multiple sources in priority order:
+    1. Existing rio.crs
+    2. proj:epsg in variable attributes
+
+    Returns None if no CRS can be determined.
+ """ + # First try existing rio CRS + if hasattr(ds, "rio") and ds.rio.crs is not None: + return ds.rio.crs + + # Try to find EPSG in variable attributes + for var in ds.data_vars.values(): + if "proj:epsg" in var.attrs: + epsg_code = var.attrs["proj:epsg"] + return CRS.from_epsg(epsg_code) + + return None + + @pytest.fixture def sample_dataset_with_crs() -> xr.Dataset: """Create a sample dataset with CRS information.""" @@ -87,8 +111,12 @@ class TestWriteGeoMetadata: def test_write_geo_metadata_with_rio_crs(self, sample_dataset_with_crs: xr.Dataset) -> None: """Test _write_geo_metadata with dataset that has rioxarray CRS.""" + # Get CRS from dataset + crs = get_crs_from_dataset(sample_dataset_with_crs) + assert crs is not None + # Call the method - write_geo_metadata(sample_dataset_with_crs) + write_geo_metadata(sample_dataset_with_crs, crs=crs) # Verify CRS was written assert hasattr(sample_dataset_with_crs, "rio") @@ -106,8 +134,12 @@ def test_write_geo_metadata_with_epsg_attrs( or sample_dataset_with_epsg_attrs.rio.crs is None ) + # Get CRS from dataset attributes + crs = get_crs_from_dataset(sample_dataset_with_epsg_attrs) + assert crs is not None + # Call the method - write_geo_metadata(sample_dataset_with_epsg_attrs) + write_geo_metadata(sample_dataset_with_epsg_attrs, crs=crs) # Verify CRS was written from attributes assert hasattr(sample_dataset_with_epsg_attrs, "rio") @@ -120,21 +152,25 @@ def test_write_geo_metadata_no_crs(self, sample_dataset_no_crs: xr.Dataset) -> N # Verify initial state - no CRS assert not hasattr(sample_dataset_no_crs, "rio") or sample_dataset_no_crs.rio.crs is None - # Call the method - should not fail but also not add CRS - write_geo_metadata(sample_dataset_no_crs) + # Try to get CRS - should return None + crs = get_crs_from_dataset(sample_dataset_no_crs) - # Verify no CRS was added (method handles gracefully) - # The method should not fail even when no CRS is available - # This tests the robustness of the method + # Skip the test if no CRS is available (can't call write_geo_metadata without CRS) + if crs is None: + pytest.skip("No CRS available - write_geo_metadata requires a CRS parameter") def test_write_geo_metadata_custom_grid_mapping_name( self, sample_dataset_with_crs: xr.Dataset ) -> None: """Test _write_geo_metadata with custom grid_mapping variable name.""" + # Get CRS from dataset + crs = get_crs_from_dataset(sample_dataset_with_crs) + assert crs is not None + # Call the method with custom grid mapping name custom_name = "custom_spatial_ref" - write_geo_metadata(sample_dataset_with_crs, custom_name) + write_geo_metadata(sample_dataset_with_crs, crs=crs, grid_mapping_var_name=custom_name) # Verify CRS was written assert hasattr(sample_dataset_with_crs, "rio") @@ -145,13 +181,17 @@ def test_write_geo_metadata_preserves_existing_data( ) -> None: """Test that _write_geo_metadata preserves existing data variables and coordinates.""" + # Get CRS from dataset + crs = get_crs_from_dataset(sample_dataset_with_crs) + assert crs is not None + # Store original data original_vars = list(sample_dataset_with_crs.data_vars.keys()) original_coords = list(sample_dataset_with_crs.coords.keys()) original_b02_data = sample_dataset_with_crs["b02"].values.copy() # Call the method - write_geo_metadata(sample_dataset_with_crs) + write_geo_metadata(sample_dataset_with_crs, crs=crs) # Verify all original data is preserved assert list(sample_dataset_with_crs.data_vars.keys()) == original_vars @@ -163,24 +203,29 @@ def test_write_geo_metadata_empty_dataset(self) -> None: 
empty_ds = xr.Dataset({}, coords={}) - # Call the method - should handle gracefully - write_geo_metadata(empty_ds) + # Try to get CRS - should return None + crs = get_crs_from_dataset(empty_ds) - # Verify method doesn't fail with empty dataset - # This tests robustness + # Skip the test if no CRS is available + if crs is None: + pytest.skip("No CRS available - write_geo_metadata requires a CRS parameter") def test_write_geo_metadata_rio_write_crs_called( self, sample_dataset_with_crs: xr.Dataset ) -> None: """Test that rio.write_crs is called correctly.""" + # Get CRS from dataset + crs = get_crs_from_dataset(sample_dataset_with_crs) + assert crs is not None + # Mock the rio.write_crs method with patch.object(sample_dataset_with_crs.rio, "write_crs") as mock_write_crs: # Call the method - write_geo_metadata(sample_dataset_with_crs) + write_geo_metadata(sample_dataset_with_crs, crs=crs) # Verify rio.write_crs was called with correct arguments - mock_write_crs.assert_called_once() + mock_write_crs.assert_called() call_args = mock_write_crs.call_args assert call_args[1]["inplace"] is True # inplace=True should be passed @@ -201,12 +246,18 @@ def test_write_geo_metadata_crs_from_multiple_sources(self) -> None: ds = ds.rio.write_crs("EPSG:4326") # Rio CRS ds["b08"].attrs["proj:epsg"] = 32632 # EPSG attribute + # Get CRS - should prioritize rio.crs + crs = get_crs_from_dataset(ds) + assert crs is not None + assert crs.to_epsg() == 4326 # Should be 4326 (rio CRS), not 32632 + # Call the method - write_geo_metadata(ds) + write_geo_metadata(ds, crs=crs) # Verify rio CRS was used (priority over attributes) assert ds.rio.crs.to_epsg() == 4326 # Should still be 4326, not 32632 + @pytest.mark.filterwarnings("ignore::RuntimeWarning") def test_write_geo_metadata_integration_with_stream_write(self, tmp_path: Path) -> None: """Test that _write_geo_metadata is properly integrated in _stream_write_dataset.""" @@ -223,6 +274,10 @@ def test_write_geo_metadata_integration_with_stream_write(self, tmp_path: Path) ds = xr.Dataset(data_vars, coords=coords) ds = ds.rio.write_crs("EPSG:32632") + # Get CRS for passing to stream_write_dataset + crs = get_crs_from_dataset(ds) + assert crs is not None + # Create encoding for the dataset encoding = create_measurements_encoding(ds, spatial_chunk=1024, enable_sharding=True) @@ -235,6 +290,7 @@ def test_write_geo_metadata_integration_with_stream_write(self, tmp_path: Path) group=zarr.create_group(tmp_path), encoding=encoding, enable_sharding=True, + crs=crs, ) # Re-open the written dataset to verify CRS was persisted @@ -271,8 +327,9 @@ def test_write_geo_metadata_invalid_crs( # Method should raise an exception for invalid CRS (normal behavior) from pyproj.exceptions import CRSError - with pytest.raises(CRSError): - write_geo_metadata(ds) + # Try to get CRS - should raise error with invalid EPSG + with pytest.raises((CRSError, ValueError, TypeError)): + get_crs_from_dataset(ds) def test_write_geo_metadata_mixed_crs_variables( self, @@ -295,10 +352,14 @@ def test_write_geo_metadata_mixed_crs_variables( ds["var1"].attrs["proj:epsg"] = 32632 ds["var2"].attrs["proj:epsg"] = 4326 - # Call the method (should use the first CRS found) - write_geo_metadata(ds) + # Get CRS (should use the first CRS found) + crs = get_crs_from_dataset(ds) + assert crs is not None + + # Call the method + write_geo_metadata(ds, crs=crs) - # Verify a CRS was applied (should be the first one found) + # Verify a CRS was applied assert hasattr(ds, "rio") def test_write_geo_metadata_maintains_dataset_attrs( 
@@ -313,8 +374,12 @@ def test_write_geo_metadata_maintains_dataset_attrs( original_attrs = sample_dataset_with_crs.attrs.copy() + # Get CRS from dataset + crs = get_crs_from_dataset(sample_dataset_with_crs) + assert crs is not None + # Call the method - write_geo_metadata(sample_dataset_with_crs) + write_geo_metadata(sample_dataset_with_crs, crs=crs) # Verify dataset attributes are preserved for key, value in original_attrs.items(): diff --git a/tests/test_zarrio.py b/tests/test_zarrio.py new file mode 100644 index 0000000..49bd8d6 --- /dev/null +++ b/tests/test_zarrio.py @@ -0,0 +1,300 @@ +"""Tests for the zarrio module""" + +from __future__ import annotations + +import numcodecs +import pytest +import zarr +from zarr.core.dtype import Float32, UInt16 +from zarr.core.metadata import ArrayV2Metadata, ArrayV3Metadata + +from eopf_geozarr.s2_optimization.s2_converter import array_reencoder +from eopf_geozarr.zarrio import ( + convert_compression, + default_array_reencoder, + reencode_group, + replace_json_invalid_floats, +) + + +def test_convert_compression_blosc() -> None: + """ + Test that convert_compression properly maps zarr + v2 blosc to zarr v3 blosc + """ + source = {"id": "blosc", "cname": "zstd", "clevel": 3, "shuffle": 2, "blocksize": 0} + + expected = ( + { + "name": "blosc", + "configuration": { + "cname": "zstd", + "clevel": 3, + "shuffle": "bitshuffle", + "blocksize": 0, + }, + }, + ) + + observed = convert_compression(source) + assert observed == expected + + +def test_convert_compression_none() -> None: + """ + Test that convert_compression maps None to an empty tuple + """ + source = None + expected = () + observed = convert_compression(source) + assert observed == expected + + +def test_convert_compression_fails() -> None: + """ + Test that convert compression will fail on out of band input + """ + source = {"id": "gzip"} + with pytest.raises(ValueError, match=r"Only blosc -> blosc or None -> ()"): + convert_compression(source) + + +def test_reencode_array_fill_value_none() -> None: + """ + Test that None fill_value is converted to dtype default + """ + array_a = zarr.create_array( + {}, shape=(10,), dtype="uint8", zarr_format=2, compressors=None, fill_value=None + ) + + meta = default_array_reencoder("test_array", array_a.metadata) + assert meta.fill_value == 0 + + +def test_reencode_array_fill_value_custom() -> None: + """ + Test that a custom fill_value is preserved + """ + array_a = zarr.create_array( + {}, shape=(10,), dtype="uint8", zarr_format=2, compressors=None, fill_value=255 + ) + + meta = default_array_reencoder("test_array", array_a.metadata) + assert meta.fill_value == 255 + + +@pytest.mark.parametrize("keep_scale_offset", [True, False]) +def test_reencode_array_scale_offset(keep_scale_offset: bool) -> None: + v2_meta = ArrayV2Metadata( + shape=(10,), + dtype=UInt16(), + chunks=(5,), + order="C", + compressor=None, + fill_value=0, + attributes={"scale_factor": 0.1, "add_offset": 10, "dtype": ">u2"}, + ) + observed = array_reencoder( + "test_array", v2_meta, config={"keep_scale_offset": keep_scale_offset} + ) + expect_dtype = v2_meta.dtype if keep_scale_offset else Float32() + assert observed.data_type == expect_dtype + + +def test_reencode_array_blosc_compression_converts() -> None: + """ + Test that blosc compression is properly converted from v2 to v3 + """ + compressor = numcodecs.Blosc(cname="zstd", clevel=5, shuffle=numcodecs.Blosc.BITSHUFFLE) + + # Verify the conversion function works + converted = convert_compression(compressor) + assert len(converted) 
== 1 + assert converted[0]["name"] == "blosc" + assert converted[0]["configuration"]["cname"] == "zstd" + assert converted[0]["configuration"]["clevel"] == 5 + assert converted[0]["configuration"]["shuffle"] == "bitshuffle" + + +def test_reencode_group_basic() -> None: + """ + Test that reencode_group creates a zarr v3 group from a v2 group + """ + group_v2 = zarr.create_group({}, zarr_format=2) + group_v3 = reencode_group(group_v2, {}, "") + + assert group_v3.metadata.zarr_format == 3 + + +def test_reencode_group_preserves_attributes() -> None: + """ + Test that group attributes are preserved during re-encoding + """ + group_v2 = zarr.create_group({}, zarr_format=2) + group_v2.attrs["foo"] = "bar" + group_v2.attrs["number"] = 42 + + group_v3 = reencode_group(group_v2, {}, "") + + assert group_v3.attrs["foo"] == "bar" + assert group_v3.attrs["number"] == 42 + + +def test_reencode_group_with_array() -> None: + """ + Test that arrays within a group are re-encoded + """ + group_v2 = zarr.create_group({}, zarr_format=2) + group_v2.create_array("data", shape=(10,), dtype="float32") + + group_v3 = reencode_group(group_v2, {}, "") + + assert "data" in group_v3 + assert group_v3["data"].metadata.zarr_format == 3 + + +def test_reencode_group_array_data_preserved() -> None: + """ + Test that array data is preserved during re-encoding + """ + group_v2 = zarr.create_group({}, zarr_format=2) + array_v2 = group_v2.create_array("data", shape=(5,), dtype="int32") + array_v2[:] = [1, 2, 3, 4, 5] + + group_v3 = reencode_group(group_v2, {}, "") + + assert (group_v3["data"][:] == [1, 2, 3, 4, 5]).all() + + +def test_reencode_group_array_attributes_preserved() -> None: + """ + Test that array attributes are preserved during re-encoding + """ + group_v2 = zarr.create_group({}, zarr_format=2) + array_v2 = group_v2.create_array("data", shape=(5,), dtype="int32") + array_v2.attrs["units"] = "meters" + + group_v3 = reencode_group(group_v2, {}, "") + + assert group_v3["data"].attrs["units"] == "meters" + + +def test_reencode_group_with_nested_group() -> None: + """ + Test that nested groups are re-encoded + """ + group_v2 = zarr.create_group({}, zarr_format=2) + group_v2.create_group("subgroup") + + group_v3 = reencode_group(group_v2, {}, "") + + assert "subgroup" in group_v3 + assert isinstance(group_v3["subgroup"], zarr.Group) + + +def test_reencode_group_nested_group_attributes() -> None: + """ + Test that nested group attributes are preserved + """ + group_v2 = zarr.create_group({}, zarr_format=2) + sub = group_v2.create_group("subgroup") + sub.attrs["nested"] = "value" + + group_v3 = reencode_group(group_v2, {}, "") + + assert group_v3["subgroup"].attrs["nested"] == "value" + + +def test_reencode_group_with_array_dimensions() -> None: + """ + Test that _ARRAY_DIMENSIONS attribute is converted to dimension_names + """ + group_v2 = zarr.create_group({}, zarr_format=2) + array_v2 = group_v2.create_array("data", shape=(10, 20), dtype="float32") + array_v2.attrs["_ARRAY_DIMENSIONS"] = ("x", "y") + + group_v3 = reencode_group(group_v2, {}, "") + + assert group_v3["data"].metadata.dimension_names == ("x", "y") + + +def test_reencode_group_removes_array_dimensions_from_attrs() -> None: + """ + Test that _ARRAY_DIMENSIONS is removed from attributes after conversion + """ + group_v2 = zarr.create_group({}, zarr_format=2) + array_v2 = group_v2.create_array("data", shape=(10,), dtype="float32") + array_v2.attrs["_ARRAY_DIMENSIONS"] = ("x",) + array_v2.attrs["other"] = "value" + + group_v3 = reencode_group(group_v2, {}, "") 
+
+    assert "_ARRAY_DIMENSIONS" not in group_v3["data"].attrs
+    assert group_v3["data"].attrs["other"] == "value"
+
+
+def test_reencode_group_deep_hierarchy() -> None:
+    """
+    Test that deeply nested hierarchies are fully re-encoded
+    """
+    group_v2 = zarr.create_group({}, zarr_format=2)
+    level1 = group_v2.create_group("level1")
+    level2 = level1.create_group("level2")
+    level2.create_array("deep_array", shape=(5,), dtype="int32")
+
+    group_v3 = reencode_group(group_v2, {}, "")
+
+    assert "level1/level2/deep_array" in group_v3
+    assert group_v3["level1/level2/deep_array"].metadata.zarr_format == 3
+
+
+def test_reencode_group_with_custom_array_reencoder() -> None:
+    """
+    Test that a custom array_reencoder function is applied
+    """
+    group_v2 = zarr.create_group({}, zarr_format=2)
+    old_node = group_v2.create_array(
+        "data", shape=(100,), chunks=(10,), dtype="float32", attributes={"foo": 10}
+    )
+    new_chunks = (25,)
+
+    def custom_array_encoder(key: str, metadata: ArrayV2Metadata) -> ArrayV3Metadata:
+        # Re-chunk the array while preserving shape, dtype, fill value, and attributes
+        return ArrayV3Metadata(
+            shape=metadata.shape,
+            data_type=metadata.dtype,
+            fill_value=metadata.fill_value,
+            chunk_grid={"name": "regular", "configuration": {"chunk_shape": new_chunks}},
+            chunk_key_encoding={"name": "default", "configuration": {"separator": "/"}},
+            codecs=({"name": "bytes"},),
+            dimension_names=None,
+            attributes=metadata.attributes,
+        )
+
+    group_v3 = reencode_group(group_v2, {}, "", array_reencoder=custom_array_encoder)
+
+    new_node = group_v3["data"]
+    assert new_node.metadata.chunk_grid.chunk_shape == new_chunks
+    assert new_node.attrs.asdict() == old_node.attrs.asdict()
+
+
+def test_replace_json_invalid_floats() -> None:
+    """
+    Test that non-JSON floats (nan, inf) are replaced with their string representations
+    """
+    data: dict[str, object] = {
+        "nan": float("nan"),
+        "nested_nan": {"nan": float("nan")},
+        "nan_in_list": [float("nan")],
+        "inf": float("inf"),
+        "-inf": float("-inf"),
+    }
+    expected = {
+        "nan": "NaN",
+        "nested_nan": {"nan": "NaN"},
+        "nan_in_list": ["NaN"],
+        "inf": "Infinity",
+        "-inf": "-Infinity",
+    }
+    observed = replace_json_invalid_floats(data)
+    assert observed == expected
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])