Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
64 commits
Select commit Hold shift + click to select a range
4cfc279
eagerly compute multiscales
d-v-b Dec 5, 2025
73c8b27
directly copy chunk bytes and metadata documents
d-v-b Dec 5, 2025
f9e6823
Merge branch 'main' of https://github.com/EOPF-Explorer/data-model in…
d-v-b Dec 5, 2025
f6eec7b
lint
d-v-b Dec 8, 2025
b21e936
untrack that which should not be tracked
d-v-b Dec 8, 2025
5a6443c
add spatial_ref after group re-encoding
d-v-b Dec 8, 2025
fe294b3
simplify tests
d-v-b Dec 8, 2025
2d36d78
fill value 0.0 -> nan in example JSON documents
d-v-b Dec 9, 2025
a4a3743
update optimized geozarr example json
d-v-b Dec 9, 2025
269968d
forward propagate attrs
d-v-b Dec 9, 2025
60036c0
update tests
d-v-b Dec 9, 2025
896e275
update test JSON models to have correct string fill value
d-v-b Dec 11, 2025
02985dc
simplify crs handling
d-v-b Dec 11, 2025
b184fe8
add module docstring
d-v-b Dec 11, 2025
a6dc580
remove typo
d-v-b Dec 11, 2025
25c1a52
tweak pydantic zarr usage in tests
d-v-b Dec 11, 2025
d4852d6
simplify tests
d-v-b Dec 11, 2025
f5dc9ae
zarrio tests
d-v-b Dec 11, 2025
4ba6eb1
fixes in zarrio
d-v-b Dec 11, 2025
b257098
Merge branch 'main' of https://github.com/EOPF-Explorer/data-model in…
d-v-b Dec 11, 2025
28c9282
silence warnings
d-v-b Dec 11, 2025
1414497
silence warnings
d-v-b Dec 11, 2025
d325f6d
silence warnings
d-v-b Dec 11, 2025
62c56d8
treat warnings as errors in tests
d-v-b Dec 11, 2025
aeaaeed
add omit-nodes parameter to reencode-group
d-v-b Dec 11, 2025
6a324a5
bump to latest version of pydantic-zarr
d-v-b Dec 12, 2025
1ec1e21
wip functional zarr transformers
d-v-b Dec 14, 2025
39bc936
update multiscales and expected JSON output
d-v-b Dec 14, 2025
c79b41f
preserve encoding when downsampling
d-v-b Dec 14, 2025
f9fa078
define correct encoding for downsampled data variables
d-v-b Dec 15, 2025
7192d04
fixes to avoid failing warnings in tests
d-v-b Dec 15, 2025
3688cca
smarter xarray encoding propagation
d-v-b Dec 15, 2025
f3849bb
ignore rio xarray warning
d-v-b Dec 15, 2025
f51541e
Merge branch 'main' of https://github.com/eopf-explorer/data-model in…
d-v-b Dec 15, 2025
373bfbe
update expected JSON outputs
d-v-b Dec 15, 2025
59cd14a
filter more warnings
d-v-b Dec 15, 2025
fed6121
lint
d-v-b Dec 15, 2025
5117005
Merge branch 'perf/direct-zarr-io' of https://github.com/d-v-b/data-m…
d-v-b Dec 15, 2025
e995a98
fix bugs in chunks / sharding, and ensure that small arrays are not s…
d-v-b Dec 15, 2025
efdeb6b
update JSON examples
d-v-b Dec 15, 2025
422a9cd
remove debug statement
d-v-b Dec 15, 2025
186024c
remove caching store
d-v-b Dec 15, 2025
9f1a131
don't downsample existing data vars
d-v-b Dec 16, 2025
f9b4845
add tests for multiscale skipping
d-v-b Dec 16, 2025
a8a9ee2
improve automatic chunking and add tests
d-v-b Dec 17, 2025
00659f7
update JSON examples
d-v-b Dec 17, 2025
f534648
update test to check for auto_chunks output
d-v-b Dec 17, 2025
4252194
add option to replace invalid JSON floats (NaN and infs) with strings
d-v-b Dec 17, 2025
a1375b7
thread allow_json_nan kwarg to cli
d-v-b Dec 17, 2025
9cab718
zarrio tests
d-v-b Dec 17, 2025
8c3052f
Merge branch 'main' of https://github.com/EOPF-Explorer/data-model in…
d-v-b Dec 19, 2025
db7fad6
Merge branch 'perf/direct-zarr-io' of https://github.com/d-v-b/data-m…
d-v-b Dec 19, 2025
aab0873
add multiscale metadata to output
d-v-b Jan 7, 2026
1d09dc8
write out multiscale metadata
d-v-b Jan 9, 2026
633969a
update launch configuration for GeoZarr conversion: modify output fil…
emmanuelmathot Jan 11, 2026
a383f6e
Merge branch 'main' of https://github.com/eopf-explorer/data-model in…
d-v-b Jan 13, 2026
b18f427
wire up array encoder to take a configuration
d-v-b Jan 14, 2026
7bac747
work towards matching expected output
d-v-b Jan 14, 2026
3ec7e93
add CRS implementation
d-v-b Jan 16, 2026
a276bd6
exclude dtype from encoding when stripping scale - offset
d-v-b Jan 20, 2026
e881a35
simplify test_s2_multiscale.py
d-v-b Jan 20, 2026
3ea6b68
get latest example JSON documents
d-v-b Jan 20, 2026
0a8a8bb
update to include b08 at lower scale levels
d-v-b Jan 20, 2026
c28bdb4
update conversion and tests
d-v-b Jan 21, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions src/eopf_geozarr/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -1159,6 +1159,9 @@ def add_s2_optimization_commands(subparsers: argparse._SubParsersAction) -> None
choices=range(1, 10),
help="Compression level 1-9 (default: 3)",
)
s2_parser.add_argument(
"--omit-nodes", help="The names of groups or arrays to skip.", default="", type=str
)
Comment on lines +1158 to +1160
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This argument solves #81. You would pass --omit-nodes "quality/l2a_quicklook" to omit that group.

cc @emmanuelmathot

s2_parser.add_argument("--skip-validation", action="store_true", help="Skip output validation")
s2_parser.add_argument("--verbose", action="store_true", help="Enable verbose output")
s2_parser.add_argument(
Expand Down Expand Up @@ -1189,12 +1192,15 @@ def convert_s2_optimized_command(args: argparse.Namespace) -> None:
storage_options=storage_options,
)

omit_nodes = set(args.omit_nodes.split())

# Convert
convert_s2_optimized(
dt_input=dt_input,
output_path=args.output_path,
enable_sharding=args.enable_sharding,
spatial_chunk=args.spatial_chunk,
omit_nodes=omit_nodes,
compression_level=args.compression_level,
validate_output=not args.skip_validation,
)
Expand Down
33 changes: 23 additions & 10 deletions src/eopf_geozarr/s2_optimization/s2_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,7 @@ def convert_s2_optimized(
spatial_chunk: int,
compression_level: int,
validate_output: bool,
omit_nodes: set[str] | None = None,
max_retries: int = 3,
) -> xr.DataTree:
"""
Expand All @@ -224,6 +225,9 @@ def convert_s2_optimized(
Optimized DataTree
"""

if omit_nodes is None:
omit_nodes = set()

start_time = time.time()
zg = get_zarr_group(dt_input)
s2root_model = Sentinel2Root.from_zarr(zg)
Expand Down Expand Up @@ -258,17 +262,26 @@ def chunk_reencoder(array: zarr.Array[Any]) -> ChunkEncodingSpec:
return {"write_chunks": (1,) * (array.ndim - 2) + (spatial_chunk, spatial_chunk)}
return {"write_chunks": array.chunks}

out_group = reencode_group(zg, out_store, "", overwrite=True, chunk_reencoder=chunk_reencoder)

log.info("Adding CRS elements to datasets in measurements")
for _, subgroup in out_group["measurements"].groups():
for _, dataset in subgroup.groups():
add_crs_and_grid_mapping(dataset, crs=crs)
out_group = reencode_group(
zg,
out_store,
path="",
overwrite=True,
chunk_reencoder=chunk_reencoder,
omit_nodes=omit_nodes,
)

log.info("Adding CRS elements to quality datasets")
for _, subgroup in out_group["quality"].groups():
for _, dataset in subgroup.groups():
add_crs_and_grid_mapping(dataset, crs=crs)
if "measurements" not in omit_nodes:
log.info("Adding CRS elements to datasets in measurements")
for _, subgroup in out_group["measurements"].groups():
for _, dataset in subgroup.groups():
add_crs_and_grid_mapping(dataset, crs=crs)

if "quality" not in omit_nodes:
log.info("Adding CRS elements to quality datasets")
for _, subgroup in out_group["quality"].groups():
for _, dataset in subgroup.groups():
add_crs_and_grid_mapping(dataset, crs=crs)

# Step 2: Create multiscale pyramids for each group in the original structure
log.info("Adding multiscale levels")
Expand Down
42 changes: 37 additions & 5 deletions src/eopf_geozarr/zarrio.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,34 +105,66 @@ def reencode_array(

def reencode_group(
group: zarr.Group,
store: Any,
store: zarr.storage.StoreLike,
path: str,
*,
overwrite: bool = False,
use_consolidated_for_children: bool = False,
omit_nodes: set[str] | None = None,
chunk_reencoder: Callable[[zarr.Array[Any]], ChunkEncodingSpec] | None = None,
) -> zarr.Group:
"""
Re-encode a Zarr group, applying a re-encoding to all sub-groups and sub-arrays.

Parameters
----------
group : zarr.Group
The Zarr group to re-encode
store : zarr.storage.StoreLike
The store to write into
path : str
The path in the new store to use
overwrite : bool, default = False
Whether to overwrite contents of the new store
    omit_nodes : set[str] | None, default = None
        The names of groups or arrays to omit from re-encoding. ``None`` is treated as the empty set.
    chunk_reencoder : Callable[[zarr.Array[Any]], ChunkEncodingSpec] | None, default = None
A function that takes a Zarr array object and returns a ChunkEncodingSpec, which is a dict
that defines a new chunk encoding. Use this parameter to define per-array chunk encoding
logic.

"""
if omit_nodes is None:
omit_nodes = set()

log = structlog.get_logger()

# Convert store-like to a proper Store object
store_path = sync(make_store_path(store))
store = store_path.store

all_members = dict(
members = dict(
group.members(max_depth=None, use_consolidated_for_children=use_consolidated_for_children)
)

log = structlog.get_logger()
log.info("Begin re-encoding Zarr group %s", group)
new_members: dict[str, ArrayV3Metadata | GroupMetadata] = {
path: GroupMetadata(zarr_format=3, attributes=group.attrs.asdict())
}
chunks_to_encode: list[str] = []
for name, member in all_members.items():
log.info("re-encoding member %s", name)
for name in omit_nodes:
if not any(k.startswith(name) for k in members):
log.warning(
"The name %s was provided in omit_nodes but no such array or group exists.", name
)
for name, member in members.items():
if any(name.startswith(v) for v in omit_nodes):
log.info(
"Skipping node %s because it is contained in a subgroup declared in the omit_groups parameter",
name,
)
continue
log.info("Re-encoding member %s", name)
new_path = f"{path}/{name}"
member_attrs = member.attrs.asdict()
if isinstance(member, zarr.Array):
Expand Down
Loading