From c4f8a1b5301c24f8785d953e8b0bbfd1cab61eaa Mon Sep 17 00:00:00 2001 From: Pontus Lurcock Date: Fri, 26 Sep 2025 16:49:45 +0200 Subject: [PATCH 1/2] Support NetCDF as output format Now xcengine images can write NetCDF as well as Zarr by setting a dataset attribute in the notebook. Tested but not yet documented. --- test/test_util.py | 39 +++++++++++++++++++++++---------------- xcengine/util.py | 23 +++++++++++++++-------- 2 files changed, 38 insertions(+), 24 deletions(-) diff --git a/test/test_util.py b/test/test_util.py index 98090e3..68f9a74 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -38,14 +38,15 @@ def test_clear_directory(tmp_path): assert os.listdir(tmp_path) == [] -@pytest.mark.parametrize("write_zarrs", [False, True]) -def test_write_stac(tmp_path, dataset, write_zarrs): - datasets = {"ds1": dataset, "ds2": dataset} - if write_zarrs: +@pytest.mark.parametrize("write_datasets", [False, True]) +def test_write_stac(tmp_path, dataset, write_datasets): + datasets = {"ds1": dataset, "ds2": dataset.copy()} + datasets["ds2"].attrs["xcengine_output_format"] = "netcdf" + if write_datasets: output_path = tmp_path / "output" output_path.mkdir() - for ds_id, ds in datasets.items(): - ds.to_zarr(output_path / (ds_id + ".zarr")) + datasets["ds1"].to_zarr(output_path / ("ds1.zarr")) + datasets["ds2"].to_netcdf(output_path / ("ds2.nc")) write_stac(datasets, tmp_path) catalog = pystac.Catalog.from_file(tmp_path / "catalog.json") @@ -57,21 +58,27 @@ def test_write_stac(tmp_path, dataset, write_zarrs): for item in items } assert data_asset_hrefs == { - ds_id: [ - str(Path(tmp_path / ds_id / f"{ds_id}.zarr").resolve(strict=False)) - ] - for ds_id in datasets.keys() + "ds1": [str((tmp_path / "ds1" / "ds1.zarr").resolve(strict=False))], + "ds2": [str((tmp_path / "ds2" / "ds2.nc").resolve(strict=False))], } @pytest.mark.parametrize("eoap_mode", [False, True]) -def test_save_datasets(tmp_path, dataset, eoap_mode): - datasets = {"ds1": dataset, "ds2": dataset} +@pytest.mark.parametrize("ds2_format", [None, "zarr", "netcdf"]) +def test_save_datasets(tmp_path, dataset, eoap_mode, ds2_format): + datasets = {"ds1": dataset, "ds2": dataset.copy()} + if ds2_format is not None: + datasets["ds2"].attrs["xcengine_output_format"] = ds2_format save_datasets(datasets, tmp_path, eoap_mode) - for ds_id in datasets.keys(): - assert ( - tmp_path / (ds_id if eoap_mode else "output") / (ds_id + ".zarr") - ).is_dir() + def outdir(ds_id): + return tmp_path / (ds_id if eoap_mode else "output") + assert (outdir("ds1") / "ds1.zarr").is_dir() + ds2_suffix = "nc" if ds2_format == "netcdf" else "zarr" + ds2_path = outdir("ds2") / f"ds2.{ds2_suffix}" + if ds2_format == "netcdf": + assert ds2_path.is_file() + else: + assert ds2_path.is_dir() catalogue_path = tmp_path / "catalog.json" if eoap_mode: assert catalogue_path.is_file() diff --git a/xcengine/util.py b/xcengine/util.py index ba2f173..18376b8 100644 --- a/xcengine/util.py +++ b/xcengine/util.py @@ -31,18 +31,20 @@ def write_stac( href=f"{catalog_path}", ) for ds_name, ds in datasets.items(): - zarr_name = ds_name + ".zarr" - zarr_path = stac_root / "output" / zarr_name + output_format = ds.attrs.get("xcengine_output_format", "zarr") + suffix = "nc" if output_format == "netcdf" else "zarr" + output_name = f"{ds_name}.{suffix}" + output_path = stac_root / "output" / output_name asset_parent = stac_root / ds_name asset_parent.mkdir(parents=True, exist_ok=True) - asset_path = asset_parent / zarr_name - if zarr_path.exists(): + asset_path = asset_parent / output_name + if output_path.exists(): # If a Zarr for this asset is present in the output directory, # move it into the corresponding STAC subdirectory. If not, # we write the same STAC items with the same asset links anyway # and assume that the caller will take care of actually writing # the asset. - zarr_path.rename(asset_path) + output_path.rename(asset_path) asset = pystac.Asset( roles=["data", "visual"], href=str(asset_path), @@ -52,7 +54,7 @@ def write_stac( # https://planetarycomputer.microsoft.com/api/stac/v1/collections/terraclimate # uses the similar "application/vnd+zarr" but RFC 6838 mandates # "." rather than "+". - media_type="application/vnd.zarr", + media_type="application/x-netcdf" if output_format == "netcdf" else "application/vnd.zarr", title=ds.attrs.get("title", ds_name), ) bb = namedtuple("Bounds", ["left", "bottom", "right", "top"])( @@ -92,9 +94,14 @@ def save_datasets( for ds_id, ds in datasets.items(): output_subpath = output_path / (ds_id if eoap_mode else "output") output_subpath.mkdir(parents=True, exist_ok=True) - dataset_path = output_subpath / (ds_id + ".zarr") + output_format = ds.attrs.get("xcengine_output_format", "zarr") + suffix = "nc" if output_format == "netcdf" else "zarr" + dataset_path = output_subpath / f"{ds_id}.{suffix}" saved_datasets[ds_id] = dataset_path - ds.to_zarr(dataset_path) + if output_format == "netcdf": + ds.to_netcdf(dataset_path) + else: + ds.to_zarr(dataset_path) # The "finished" file is a flag to indicate to a runner when # processing is complete, though the xcetool runner doesn't yet use it. (output_path / "finished").touch() From 3d082bb8f535e19bd0f33ea685c425f21cb37cc7 Mon Sep 17 00:00:00 2001 From: Pontus Lurcock Date: Fri, 26 Sep 2025 17:12:55 +0200 Subject: [PATCH 2/2] Document NetCDF output; update changelog --- CHANGES.md | 1 + README.md | 12 ++++++++++++ 2 files changed, 13 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index bcd22cd..96cfe67 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -14,6 +14,7 @@ * Support writing of stage-out STAC by notebook (#32) * Make viewer work on non-default ports (#21) * Improve dynamic example notebook +* Support NetCDF output (#28) ## Changes in 0.1.0 diff --git a/README.md b/README.md index 3e5acc5..eb21b6c 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,18 @@ parameters cell and make them available as command-line parameters for the output script and container, and as workflow parameters for the application package. +# Customizing output formats + +An xcengine-generated script or image can automatically write all +`xarray.Dataset` objects from the notebook code to disk, for example to be +staged out as EO Application Package outputs. By default, Zarr format is +used, but this can be changed to NetCDF on a per-dataset basis by applying +an attribute: + +```python +my_dataset.attrs["xcengine_output_format"] = "netcdf" +``` + # xcetool usage xcengine provides a command-line tool called `xcetool`, which has several