Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
64 commits
Select commit Hold shift + click to select a range
4cfc279
eagerly compute multiscales
d-v-b Dec 5, 2025
73c8b27
directly copy chunk bytes and metadata documents
d-v-b Dec 5, 2025
f9e6823
Merge branch 'main' of https://github.com/EOPF-Explorer/data-model in…
d-v-b Dec 5, 2025
f6eec7b
lint
d-v-b Dec 8, 2025
b21e936
untrack that which should not be tracked
d-v-b Dec 8, 2025
5a6443c
add spatial_ref after group re-encoding
d-v-b Dec 8, 2025
fe294b3
simplify tests
d-v-b Dec 8, 2025
2d36d78
fill value 0.0 -> nan in example JSON documents
d-v-b Dec 9, 2025
a4a3743
update optimized geozarr example json
d-v-b Dec 9, 2025
269968d
forward propagate attrs
d-v-b Dec 9, 2025
60036c0
update tests
d-v-b Dec 9, 2025
896e275
update test JSON models to have correct string fill value
d-v-b Dec 11, 2025
02985dc
simplify crs handling
d-v-b Dec 11, 2025
b184fe8
add module docstring
d-v-b Dec 11, 2025
a6dc580
remove typo
d-v-b Dec 11, 2025
25c1a52
tweak pydantic zarr usage in tests
d-v-b Dec 11, 2025
d4852d6
simplify tests
d-v-b Dec 11, 2025
f5dc9ae
zarrio tests
d-v-b Dec 11, 2025
4ba6eb1
fixes in zarrio
d-v-b Dec 11, 2025
b257098
Merge branch 'main' of https://github.com/EOPF-Explorer/data-model in…
d-v-b Dec 11, 2025
28c9282
silence warnings
d-v-b Dec 11, 2025
1414497
silence warnings
d-v-b Dec 11, 2025
d325f6d
silence warnings
d-v-b Dec 11, 2025
62c56d8
treat warnings as errors in tests
d-v-b Dec 11, 2025
aeaaeed
add omit-nodes parameter to reencode-group
d-v-b Dec 11, 2025
6a324a5
bump to latest version of pydantic-zarr
d-v-b Dec 12, 2025
1ec1e21
wip functional zarr transformers
d-v-b Dec 14, 2025
39bc936
update multiscales and expected JSON output
d-v-b Dec 14, 2025
c79b41f
preserve encoding when downsampling
d-v-b Dec 14, 2025
f9fa078
define correct encoding for downsampled data variables
d-v-b Dec 15, 2025
7192d04
fixes to avoid failing warnings in tests
d-v-b Dec 15, 2025
3688cca
smarter xarray encoding propagation
d-v-b Dec 15, 2025
f3849bb
ignore rio xarray warning
d-v-b Dec 15, 2025
f51541e
Merge branch 'main' of https://github.com/eopf-explorer/data-model in…
d-v-b Dec 15, 2025
373bfbe
update expected JSON outputs
d-v-b Dec 15, 2025
59cd14a
filter more warnings
d-v-b Dec 15, 2025
fed6121
lint
d-v-b Dec 15, 2025
5117005
Merge branch 'perf/direct-zarr-io' of https://github.com/d-v-b/data-m…
d-v-b Dec 15, 2025
e995a98
fix bugs in chunks / sharding, and ensure that small arrays are not s…
d-v-b Dec 15, 2025
efdeb6b
update JSON examples
d-v-b Dec 15, 2025
422a9cd
remove debug statement
d-v-b Dec 15, 2025
186024c
remove caching store
d-v-b Dec 15, 2025
9f1a131
don't downsample existing data vars
d-v-b Dec 16, 2025
f9b4845
add tests for multiscale skipping
d-v-b Dec 16, 2025
a8a9ee2
improve automatic chunking and add tests
d-v-b Dec 17, 2025
00659f7
update JSON examples
d-v-b Dec 17, 2025
f534648
update test to check for auto_chunks output
d-v-b Dec 17, 2025
4252194
add option to replace invalid JSON floats (NaN and infs) with strings
d-v-b Dec 17, 2025
a1375b7
thread allow_json_nan kwarg to cli
d-v-b Dec 17, 2025
9cab718
zarrio tests
d-v-b Dec 17, 2025
8c3052f
Merge branch 'main' of https://github.com/EOPF-Explorer/data-model in…
d-v-b Dec 19, 2025
db7fad6
Merge branch 'perf/direct-zarr-io' of https://github.com/d-v-b/data-m…
d-v-b Dec 19, 2025
aab0873
add multiscale metadata to output
d-v-b Jan 7, 2026
1d09dc8
write out multiscale metadata
d-v-b Jan 9, 2026
633969a
update launch configuration for GeoZarr conversion: modify output fil…
emmanuelmathot Jan 11, 2026
a383f6e
Merge branch 'main' of https://github.com/eopf-explorer/data-model in…
d-v-b Jan 13, 2026
b18f427
wire up array encoder to take a configuration
d-v-b Jan 14, 2026
7bac747
work towards matching expected output
d-v-b Jan 14, 2026
3ec7e93
add CRS implementation
d-v-b Jan 16, 2026
a276bd6
exclude dtype from encoding when stripping scale - offset
d-v-b Jan 20, 2026
e881a35
simplify test_s2_multiscale.py
d-v-b Jan 20, 2026
3ea6b68
get latest example JSON documents
d-v-b Jan 20, 2026
0a8a8bb
update to include b08 at lower scale levels
d-v-b Jan 20, 2026
c28bdb4
update conversion and tests
d-v-b Jan 21, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,7 @@
"convert-s2-optimized",
// "https://objects.eodc.eu/e05ab01a9d56408d82ac32d69a5aae2a:202509-s02msil2a/08/products/cpm_v256/S2A_MSIL2A_20250908T100041_N0511_R122_T32TQM_20250908T115116.zarr",
"https://objects.eodc.eu/e05ab01a9d56408d82ac32d69a5aae2a:202511-s02msil2a-eu/15/products/cpm_v262/S2B_MSIL2A_20251115T091139_N0511_R050_T35SLU_20251115T111807.zarr",
// "https://objects.eodc.eu:443/e05ab01a9d56408d82ac32d69a5aae2a:202512-s02msil2a-eu/16/products/cpm_v262/S2B_MSIL2A_20251216T102339_N0511_R065_T32TNS_20251216T123617.zarr",
// "https://objects.eodc.eu/e05ab01a9d56408d82ac32d69a5aae2a:202511-s02msil2a-eu/16/products/cpm_v262/S2A_MSIL2A_20251116T085431_N0511_R107_T35SQD_20251116T103813.zarr",
// "s3://esa-zarr-sentinel-explorer-fra/tests-output/sentinel-2-l2a-opt/S2A_MSIL2A_20250908T100041_N0511_R122_T32TQM_20250908T115116.zarr",
// "s3://esa-zarr-sentinel-explorer-fra/tests-output/sentinel-2-l2a-staging/S2B_MSIL2A_20251115T091139_N0511_R050_T35SLU_20251115T111807.zarr",
Expand All @@ -211,7 +212,7 @@
// "--omit-nodes",
// "quality/l2a_quicklook",
"--dask-cluster",
"--verbose"
// "--verbose"
],
"cwd": "${workspaceFolder}",
"justMyCode": false,
Expand Down
6 changes: 4 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@ classifiers = [
]
requires-python = ">=3.11"
dependencies = [
"pydantic-zarr>=0.8.0",
"pydantic-zarr>=0.9.1",
"pydantic>=2.12",
"zarr>=3.1.1",
"zarr>=3.1.4",
"xarray>=2025.7.1",
"dask[array,distributed]>=2025.5.1",
"numpy>=2.3.1",
Expand Down Expand Up @@ -172,6 +172,8 @@ module = ["zarr.*", "xarray.*", "rioxarray.*", "cf_xarray.*", "dask.*"]
ignore_missing_imports = true

[tool.pytest.ini_options]
filterwarnings = "error"
log_level = "WARNING"
minversion = "7.0"
addopts = "-ra -q --strict-markers --strict-config"
testpaths = ["tests"]
Expand Down
18 changes: 17 additions & 1 deletion src/eopf_geozarr/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -1155,8 +1155,16 @@ def add_s2_optimization_commands(subparsers: argparse._SubParsersAction) -> None
choices=range(1, 10),
help="Compression level 1-9 (default: 3)",
)
s2_parser.add_argument(
"--omit-nodes", help="The names of groups or arrays to skip.", default="", type=str
)
Comment on lines +1158 to +1160
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this argument solves #81. You would pass --omit-nodes "quality/l2a_quicklook" to omit that group

cc @emmanuelmathot

s2_parser.add_argument("--skip-validation", action="store_true", help="Skip output validation")
s2_parser.add_argument("--verbose", action="store_true", help="Enable verbose output")
s2_parser.add_argument(
"--allow-json-nan",
action="store_true",
help="Allow invalid float values (nan, inf) in output JSON",
)
s2_parser.add_argument(
"--keep-scale-offset",
action="store_true",
Expand Down Expand Up @@ -1184,18 +1192,26 @@ def convert_s2_optimized_command(args: argparse.Namespace) -> None:
# Load input dataset
log.info("Loading Sentinel-2 dataset from", input_path=args.input_path)
storage_options = get_storage_options(str(args.input_path))
store = args.input_path
dt_input = xr.open_datatree(
str(args.input_path), engine="zarr", chunks="auto", storage_options=storage_options
store,
engine="zarr",
chunks="auto",
storage_options=storage_options,
)

omit_nodes = set(args.omit_nodes.split())

# Convert
convert_s2_optimized(
dt_input=dt_input,
output_path=args.output_path,
enable_sharding=args.enable_sharding,
spatial_chunk=args.spatial_chunk,
omit_nodes=omit_nodes,
compression_level=args.compression_level,
validate_output=not args.skip_validation,
allow_json_nan=args.allow_json_nan,
keep_scale_offset=args.keep_scale_offset,
)

Expand Down
1 change: 1 addition & 0 deletions src/eopf_geozarr/conversion/geozarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -1189,6 +1189,7 @@ def cleanup_prefix(prefix: str) -> None:
engine="zarr",
decode_coords="all",
chunks="auto",
consolidated=False,
storage_options=store_storage_options,
)
break
Expand Down
44 changes: 43 additions & 1 deletion src/eopf_geozarr/data_api/geozarr/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import io
import urllib
import urllib.request
from collections.abc import Mapping
from dataclasses import dataclass
from typing import (
TYPE_CHECKING,
Expand All @@ -24,12 +25,18 @@
from pydantic.experimental.missing_sentinel import MISSING
from typing_extensions import Protocol, TypedDict, runtime_checkable

from eopf_geozarr.data_api.geozarr.projjson import ProjJSON # noqa: TC001
from eopf_geozarr.data_api.geozarr.projjson import ProjJSON
from eopf_geozarr.data_api.geozarr.types import (
CF_SCALE_OFFSET_KEYS,
CFScaleOffset,
EmptyDict,
)

if TYPE_CHECKING:
from collections.abc import Mapping



@dataclass(frozen=True)
class UNSET_TYPE:
"""
Expand Down Expand Up @@ -289,3 +296,38 @@ def check_grid_mapping(model: TDataSetLike) -> TDataSetLike:

def is_none(data: object) -> TypeGuard[None]:
return data is None


def extract_scale_offset(
    data: Mapping[str, object],
) -> tuple[dict[str, object], CFScaleOffset | EmptyDict]:
    """
    Split a mapping into its CF scale/offset keys and everything else.

    Parameters
    ----------
    data : Mapping[str, object]
        The input mapping from which to extract scale/offset information.

    Returns
    -------
    tuple[dict[str, object], CFScaleOffset | EmptyDict]
        A tuple of (the remaining data with scale/offset keys removed, the
        extracted scale/offset info). The second element is an empty dict
        when ``data`` contains no scale/offset keys at all.

    Raises
    ------
    ValueError
        If ``data`` contains some, but not all, of the keys in
        ``CF_SCALE_OFFSET_KEYS``.
    """
    keys = set(data.keys())

    # No scale/offset keys at all: nothing to extract.
    if keys.isdisjoint(CF_SCALE_OFFSET_KEYS):
        return dict(data), {}

    # All scale/offset keys present: partition the mapping in one pass.
    if keys.issuperset(CF_SCALE_OFFSET_KEYS):
        scale_offset: CFScaleOffset = {}  # type: ignore[typeddict-item]
        remaining_data: dict[str, object] = {}
        for key, value in data.items():
            if key in CF_SCALE_OFFSET_KEYS:
                scale_offset[key] = value  # type: ignore[literal-required]
            else:
                remaining_data[key] = value
        return remaining_data, scale_offset

    # Partial scale/offset metadata is ambiguous (we can neither decode nor
    # safely ignore it), so refuse it outright.
    raise ValueError(
        "Incomplete scale/offset information: both 'scale_factor' and "
        "'add_offset' must be present."
    )
8 changes: 6 additions & 2 deletions src/eopf_geozarr/data_api/geozarr/multiscales/geozarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def valid_zcm(self) -> Self:
Ensure that the ZCM metadata, if present, is valid
"""
if self.layout is not MISSING:
zcm.Multiscales(**self.model_dump())
zcm.Multiscales(layout=self.layout, resampling_method=self.resampling_method)

return self

Expand All @@ -38,7 +38,11 @@ def valid_tms(self) -> Self:
Ensure that the TMS metadata, if present, is valid
"""
if self.tile_matrix_set is not MISSING:
tms.Multiscales(**self.model_dump())
tms.Multiscales(
tile_matrix_set=self.tile_matrix_set,
tile_matrix_limits=self.tile_matrix_limits,
resampling_method=self.resampling_method, # type: ignore[arg-type]
)

return self

Expand Down
11 changes: 10 additions & 1 deletion src/eopf_geozarr/data_api/geozarr/multiscales/zcm.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from typing import Final, Literal, NotRequired

from pydantic import BaseModel, field_validator
from pydantic import BaseModel, field_validator, model_serializer
from pydantic.experimental.missing_sentinel import MISSING
from typing_extensions import TypedDict

Expand Down Expand Up @@ -73,6 +73,15 @@ class Transform(BaseModel):
scale: tuple[float, ...] | MISSING = MISSING
translation: tuple[float, ...] | MISSING = MISSING

    @model_serializer
    def serialize_model(self) -> dict[str, tuple[float, ...]]:
        """
        Serialize only the fields that were actually set.

        Emits the ``scale`` / ``translation`` keys only when the
        corresponding field is not the MISSING sentinel, so unset fields do
        not appear in the serialized output at all.
        """
        result: dict[str, tuple[float, ...]] = {}
        if self.scale is not MISSING:
            result["scale"] = self.scale
        if self.translation is not MISSING:
            result["translation"] = self.translation
        return result


class TransformJSON(TypedDict):
scale: NotRequired[tuple[float, ...]]
Expand Down
24 changes: 21 additions & 3 deletions src/eopf_geozarr/data_api/geozarr/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@

from __future__ import annotations

from typing import TYPE_CHECKING, Final, Literal, NotRequired, TypedDict
from typing import TYPE_CHECKING, Final, Literal, NotRequired

from typing_extensions import TypedDict

if TYPE_CHECKING:
from collections.abc import Mapping
Expand All @@ -16,7 +18,7 @@ class TileMatrixLimitJSON(TypedDict):
maxTileRow: int


CF_SCALE_OFFSET_KEYS: Final[set[str]] = {"scale_factor", "add_offset", "dtype"}
CF_SCALE_OFFSET_KEYS: Final[set[str]] = {"scale_factor", "add_offset"}

XARRAY_ENCODING_KEYS: Final[set[str]] = {
"chunks",
Expand All @@ -25,16 +27,32 @@ class TileMatrixLimitJSON(TypedDict):
"filters",
"shards",
"_FillValue",
"dtype",
} | CF_SCALE_OFFSET_KEYS


class CFScaleOffset(TypedDict):
    """
    Metadata defining scale/offset encoding for array values. Defined by the CF
    conventions and found in EOPF Sentinel products in Zarr array attributes.
    """

    # Multiplicative factor for decoding stored values (per the CF
    # conventions' packed-data scheme).
    scale_factor: float
    # Additive offset for decoding stored values (per the CF conventions'
    # packed-data scheme).
    add_offset: float


class EmptyDict(TypedDict, closed=True):  # type: ignore[call-arg]
    """A dict with no keys.

    ``closed=True`` (PEP 728) forbids any extra keys, so the only value that
    satisfies this type is the empty dict. The ``type: ignore`` silences type
    checkers that do not yet recognize the ``closed`` keyword.
    """


class XarrayDataArrayEncoding(TypedDict):
"""
The dict form of the encoding for xarray.DataArray
"""

chunks: NotRequired[tuple[int, ...]]
preferred_chunks: NotRequired[tuple[int, ...]]
preferred_chunks: NotRequired[dict[str, int]]
compressors: NotRequired[tuple[object, ...] | None]
filters: NotRequired[tuple[object, ...]]
shards: NotRequired[tuple[int, ...] | None]
Expand Down
Loading