From 08baa866f76166dbe75a0cc8123eb5a7acb2e15c Mon Sep 17 00:00:00 2001 From: Stefaan Lippens Date: Mon, 17 Jun 2024 16:51:15 +0200 Subject: [PATCH 1/3] Issue #573 metadata_from_stac: add support for "item_assets" --- CHANGELOG.md | 2 + openeo/metadata.py | 77 ++++++++++++++++- openeo/testing.py | 36 ++++++++ tests/__init__.py | 2 + tests/conftest.py | 6 ++ .../data/stac/collections/agera5_daily01.json | 82 +++++++++++++++++++ tests/test_metadata.py | 31 ++++++- 7 files changed, 233 insertions(+), 3 deletions(-) create mode 100644 openeo/testing.py create mode 100644 tests/data/stac/collections/agera5_daily01.json diff --git a/CHANGELOG.md b/CHANGELOG.md index 95cc8a1a6..73d6d7b0f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Document PEP 723 based Python UDF dependency declarations ([Open-EO/openeo-geopyspark-driver#237](https://github.com/Open-EO/openeo-geopyspark-driver/issues/237)) - Added more `openeo.api.process.Parameter` helpers to easily create "bounding_box", "date", "datetime", "geojson" and "temporal_interval" parameters for UDP construction. - Added convenience method `Connection.load_stac_from_job(job)` to easily load the results of a batch job with the `load_stac` process ([#566](https://github.com/Open-EO/openeo-python-client/issues/566)) +- `load_stac`/`metadata_from_stac`: add support for extracting band info from "item_assets" in collection metadata ([#573](https://github.com/Open-EO/openeo-python-client/issues/573)) +- Added initial `openeo.testing` submodule for reusable test utilities ### Changed diff --git a/openeo/metadata.py b/openeo/metadata.py index ba0b8aede..d7a537f06 100644 --- a/openeo/metadata.py +++ b/openeo/metadata.py @@ -2,9 +2,11 @@ import logging import warnings -from typing import Any, Callable, List, NamedTuple, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, NamedTuple, Optional, Set, Tuple, Union import pystac +import pystac.extensions.eo +import pystac.extensions.item_assets from openeo.internal.jupyter import render_component from openeo.util import deep_get @@ -107,6 +109,7 @@ class Band(NamedTuple): class BandDimension(Dimension): + # TODO #575 support unordered bands and avoid assumption that band order is known. def __init__(self, name: str, bands: List[Band]): super().__init__(type="bands", name=name) self.bands = bands @@ -534,6 +537,8 @@ def metadata_from_stac(url: str) -> CubeMetadata: :return: A :py:class:`CubeMetadata` containing the DataCube band metadata from the url. """ + # TODO move these nested functions and other logic to _StacMetadataParser + def get_band_metadata(eo_bands_location: dict) -> List[Band]: # TODO: return None iso empty list when no metadata? return [ @@ -573,6 +578,10 @@ def is_band_asset(asset: pystac.Asset) -> bool: for asset_band in asset_bands: if asset_band.name not in get_band_names(bands): bands.append(asset_band) + if collection.ext.has("item_assets"): + # TODO #575 support unordered band names and avoid conversion to a list. + bands = list(_StacMetadataParser().get_bands_from_item_assets(collection.ext.item_assets)) + elif isinstance(stac_object, pystac.Catalog): catalog = stac_object bands = get_band_metadata(catalog.extra_fields.get("summaries", {})) @@ -586,3 +595,69 @@ def is_band_asset(asset: pystac.Asset) -> bool: temporal_dimension = TemporalDimension(name="t", extent=[None, None]) metadata = CubeMetadata(dimensions=[band_dimension, temporal_dimension]) return metadata + + +class _StacMetadataParser: + """ + Helper to extract openEO metadata from STAC metadata resource + """ + + def __init__(self): + # TODO: toggles for how to handle strictness, warnings, logging, etc + pass + + def _get_band_from_eo_bands_item(self, eo_band: Union[dict, pystac.extensions.eo.Band]) -> Band: + if isinstance(eo_band, pystac.extensions.eo.Band): + return Band( + name=eo_band.name, + common_name=eo_band.common_name, + wavelength_um=eo_band.center_wavelength, + ) + elif isinstance(eo_band, dict) and "name" in eo_band: + return Band( + name=eo_band["name"], + common_name=eo_band.get("common_name"), + wavelength_um=eo_band.get("center_wavelength"), + ) + else: + raise ValueError(eo_band) + + def get_bands_from_eo_bands(self, eo_bands: List[Union[dict, pystac.extensions.eo.Band]]) -> List[Band]: + """ + Extract bands from STAC `eo:bands` array + + :param eo_bands: List of band objects, as dict or `pystac.extensions.eo.Band` instances + """ + # TODO: option to skip bands that failed to parse in some way? + return [self._get_band_from_eo_bands_item(band) for band in eo_bands] + + def _get_bands_from_item_asset( + self, item_asset: pystac.extensions.item_assets.AssetDefinition + ) -> Union[List[Band], None]: + """Get bands from a STAC 'item_assets' asset definition.""" + if item_asset.ext.has("eo"): + if item_asset.ext.eo.bands is not None: + return self.get_bands_from_eo_bands(item_asset.ext.eo.bands) + elif "eo:bands" in item_asset.properties: + # TODO: skip this in strict mode? + _log.warning("Extracting band info from 'eo:bands' metadata, but 'eo' STAC extension was not declared.") + return self.get_bands_from_eo_bands(item_asset.properties["eo:bands"]) + + def get_bands_from_item_assets( + self, item_assets: Dict[str, pystac.extensions.item_assets.AssetDefinition] + ) -> Set[Band]: + """ + Get bands extracted from "item_assets" objects (defined by "item-assets" extension, + in combination with "eo" extension) at STAC Collection top-level, + + Note that "item_assets" in STAC is a mapping, so the band order is undefined, + which is why we return a set of bands here. + + :param item_assets: a STAC `item_assets` mapping + """ + bands = set() + for item_asset in item_assets.values(): + asset_bands = self._get_bands_from_item_asset(item_asset) + if asset_bands: + bands.update(asset_bands) + return bands diff --git a/openeo/testing.py b/openeo/testing.py new file mode 100644 index 000000000..17d8cdc49 --- /dev/null +++ b/openeo/testing.py @@ -0,0 +1,36 @@ +""" +Utilities for testing of openEO client workflows. +""" + +import json +from pathlib import Path +from typing import Callable, Optional, Union + + +class TestDataLoader: + """ + Helper to resolve paths to test data files, load them as JSON, optionally preprocess them, etc. + + It's intended to be used as a pytest fixture, e.g. from conftest.py: + + @pytest.fixture + def test_data() -> TestDataLoader: + return TestDataLoader(root=Path(__file__).parent / "data") + + .. versionadded:: 0.30.0 + + """ + + def __init__(self, root: Union[str, Path]): + self.data_root = Path(root) + + def get_path(self, filename: Union[str, Path]) -> Path: + """Get absolute path to a test data file""" + return self.data_root / filename + + def load_json(self, filename: Union[str, Path], preprocess: Optional[Callable[[str], str]] = None) -> dict: + """Parse data from a test JSON file""" + data = self.get_path(filename).read_text(encoding="utf8") + if preprocess: + data = preprocess(data) + return json.loads(data) diff --git a/tests/__init__.py b/tests/__init__.py index 5e6f0702c..e7ec88f5e 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -5,11 +5,13 @@ def get_test_resource(relative_path: str) -> Path: + # TODO: migrate to TestDataLoader dir = Path(os.path.dirname(os.path.realpath(__file__))) return dir / relative_path def load_json_resource(relative_path, preprocess: Callable = None) -> dict: + # TODO: migrate to TestDataLoader with get_test_resource(relative_path).open("r+") as f: data = f.read() if preprocess: diff --git a/tests/conftest.py b/tests/conftest.py index 5e18e5ca6..28e22fb00 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -4,6 +4,7 @@ import pytest +from openeo.testing import TestDataLoader from openeo.util import ensure_dir pytest_plugins = "pytester" @@ -25,3 +26,8 @@ def tmp_openeo_config_home(tmp_path): path = ensure_dir(Path(str(tmp_path)) / "openeo-conf") with mock.patch.dict("os.environ", {"OPENEO_CONFIG_HOME": str(path)}): yield path + + +@pytest.fixture +def test_data() -> TestDataLoader: + return TestDataLoader(root=Path(__file__).parent / "data") diff --git a/tests/data/stac/collections/agera5_daily01.json b/tests/data/stac/collections/agera5_daily01.json new file mode 100644 index 000000000..2ffa7fb22 --- /dev/null +++ b/tests/data/stac/collections/agera5_daily01.json @@ -0,0 +1,82 @@ +{ + "type": "Collection", + "id": "agera5_daily", + "stac_version": "1.0.0", + "description": "ERA5", + "links": [ + { + "rel": "self", + "type": "application/json", + "href": "https://stac.test/collections/agera5_daily" + } + ], + "stac_extensions": [ + "https://stac-extensions.github.io/item-assets/v1.0.0/schema.json", + "https://stac-extensions.github.io/eo/v1.1.0/schema.json" + ], + "item_assets": { + "2m_temperature_min": { + "type": "image/tiff; application=geotiff", + "title": "2m temperature min 24h", + "eo:bands": [ + { + "name": "2m_temperature_min", + "description": "temperature 2m above ground (Kelvin)" + } + ] + }, + "2m_temperature_max": { + "type": "image/tiff; application=geotiff", + "eo:bands": [ + { + "name": "2m_temperature_max", + "description": "temperature 2m above ground (Kelvin)" + } + ] + }, + "dewpoint_temperature_mean": { + "type": "image/tiff; application=geotiff", + "title": "2m dewpoint temperature", + "eo:bands": [ + { + "name": "dewpoint_temperature_mean", + "description": "dewpoint temperature 2m above ground (Kelvin)" + } + ] + }, + "vapour_pressure": { + "eo:bands": [ + { + "name": "vapour_pressure" + } + ] + } + }, + "title": "agERA5 data", + "extent": { + "spatial": { + "bbox": [ + [ + -180, + -90, + 180, + 90 + ] + ] + }, + "temporal": { + "interval": [ + [ + "2010-01-01T00:00:00Z", + "2024-06-12T00:00:00Z" + ] + ] + } + }, + "keywords": [ + "ERA5" + ], + "summaries": {}, + "assets": {}, + "license": "proprietary" +} diff --git a/tests/test_metadata.py b/tests/test_metadata.py index caa88820b..8cf3de90b 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -1,7 +1,8 @@ from __future__ import annotations import json -from typing import List +import re +from typing import List, Union import pytest @@ -835,8 +836,34 @@ def filter_bbox(self, bbox): ], ) def test_metadata_from_stac(tmp_path, test_stac, expected): - path = tmp_path / "stac.json" path.write_text(json.dumps(test_stac)) metadata = metadata_from_stac(path) assert metadata.band_names == expected + + +@pytest.mark.parametrize("eo_extension_is_declared", [False, True]) +def test_metadata_from_stac_collection_bands_from_item_assets(test_data, tmp_path, eo_extension_is_declared, caplog): + stac_data = test_data.load_json("stac/collections/agera5_daily01.json") + stac_data["stac_extensions"] = [ + ext + for ext in stac_data["stac_extensions"] + if (not ext.startswith("https://stac-extensions.github.io/eo/") or eo_extension_is_declared) + ] + assert ( + any(ext.startswith("https://stac-extensions.github.io/eo/") for ext in stac_data["stac_extensions"]) + == eo_extension_is_declared + ) + path = tmp_path / "stac.json" + path.write_text(json.dumps(stac_data)) + + metadata = metadata_from_stac(path) + assert sorted(metadata.band_names) == [ + "2m_temperature_max", + "2m_temperature_min", + "dewpoint_temperature_mean", + "vapour_pressure", + ] + + if not eo_extension_is_declared: + assert "Extracting band info from 'eo:bands' metadata, but 'eo' STAC extension was not declared." in caplog.text From 0448aff5d709cbe06aed185d2a2e6e582971017f Mon Sep 17 00:00:00 2001 From: Stefaan Lippens Date: Mon, 17 Jun 2024 17:16:03 +0200 Subject: [PATCH 2/3] Issue #573 Skip pystac 1.9.0 features on Python 3.7/3.8 --- openeo/metadata.py | 12 +++++++++--- tests/test_metadata.py | 2 ++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/openeo/metadata.py b/openeo/metadata.py index d7a537f06..d660b753c 100644 --- a/openeo/metadata.py +++ b/openeo/metadata.py @@ -578,7 +578,7 @@ def is_band_asset(asset: pystac.Asset) -> bool: for asset_band in asset_bands: if asset_band.name not in get_band_names(bands): bands.append(asset_band) - if collection.ext.has("item_assets"): + if _PYSTAC_1_9_EXTENSION_INTERFACE and collection.ext.has("item_assets"): # TODO #575 support unordered band names and avoid conversion to a list. bands = list(_StacMetadataParser().get_bands_from_item_assets(collection.ext.item_assets)) @@ -597,6 +597,11 @@ def is_band_asset(asset: pystac.Asset) -> bool: return metadata +# Sniff for PySTAC extension API since version 1.9.0 (which is not available below Python 3.9) +# TODO: remove this once support for Python 3.7 and 3.8 is dropped +_PYSTAC_1_9_EXTENSION_INTERFACE = hasattr(pystac.Item, "ext") + + class _StacMetadataParser: """ Helper to extract openEO metadata from STAC metadata resource @@ -635,12 +640,13 @@ def _get_bands_from_item_asset( self, item_asset: pystac.extensions.item_assets.AssetDefinition ) -> Union[List[Band], None]: """Get bands from a STAC 'item_assets' asset definition.""" - if item_asset.ext.has("eo"): + if _PYSTAC_1_9_EXTENSION_INTERFACE and item_asset.ext.has("eo"): if item_asset.ext.eo.bands is not None: return self.get_bands_from_eo_bands(item_asset.ext.eo.bands) elif "eo:bands" in item_asset.properties: # TODO: skip this in strict mode? - _log.warning("Extracting band info from 'eo:bands' metadata, but 'eo' STAC extension was not declared.") + if _PYSTAC_1_9_EXTENSION_INTERFACE: + _log.warning("Extracting band info from 'eo:bands' metadata, but 'eo' STAC extension was not declared.") return self.get_bands_from_eo_bands(item_asset.properties["eo:bands"]) def get_bands_from_item_assets( diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 8cf3de90b..92bfbeecb 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -7,6 +7,7 @@ import pytest from openeo.metadata import ( + _PYSTAC_1_9_EXTENSION_INTERFACE, Band, BandDimension, CollectionMetadata, @@ -842,6 +843,7 @@ def test_metadata_from_stac(tmp_path, test_stac, expected): assert metadata.band_names == expected +@pytest.mark.skipif(not _PYSTAC_1_9_EXTENSION_INTERFACE, reason="Requires PySTAC 1.9+ extension interface") @pytest.mark.parametrize("eo_extension_is_declared", [False, True]) def test_metadata_from_stac_collection_bands_from_item_assets(test_data, tmp_path, eo_extension_is_declared, caplog): stac_data = test_data.load_json("stac/collections/agera5_daily01.json") From 84a9f5b074f500a79c25f7cd9d0712240a540b17 Mon Sep 17 00:00:00 2001 From: Stefaan Lippens Date: Tue, 18 Jun 2024 09:24:16 +0200 Subject: [PATCH 3/3] Issue #573 avoid repeated warnings from get_bands_from_item_assets e.g. when eo extension is not declared --- openeo/metadata.py | 9 ++++++--- tests/test_metadata.py | 7 +++++-- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/openeo/metadata.py b/openeo/metadata.py index d660b753c..4c69129c1 100644 --- a/openeo/metadata.py +++ b/openeo/metadata.py @@ -1,5 +1,6 @@ from __future__ import annotations +import functools import logging import warnings from typing import Any, Callable, Dict, List, NamedTuple, Optional, Set, Tuple, Union @@ -637,7 +638,7 @@ def get_bands_from_eo_bands(self, eo_bands: List[Union[dict, pystac.extensions.e return [self._get_band_from_eo_bands_item(band) for band in eo_bands] def _get_bands_from_item_asset( - self, item_asset: pystac.extensions.item_assets.AssetDefinition + self, item_asset: pystac.extensions.item_assets.AssetDefinition, *, _warn: Callable[[str], None] = _log.warning ) -> Union[List[Band], None]: """Get bands from a STAC 'item_assets' asset definition.""" if _PYSTAC_1_9_EXTENSION_INTERFACE and item_asset.ext.has("eo"): @@ -646,7 +647,7 @@ def _get_bands_from_item_asset( elif "eo:bands" in item_asset.properties: # TODO: skip this in strict mode? if _PYSTAC_1_9_EXTENSION_INTERFACE: - _log.warning("Extracting band info from 'eo:bands' metadata, but 'eo' STAC extension was not declared.") + _warn("Extracting band info from 'eo:bands' metadata, but 'eo' STAC extension was not declared.") return self.get_bands_from_eo_bands(item_asset.properties["eo:bands"]) def get_bands_from_item_assets( @@ -662,8 +663,10 @@ def get_bands_from_item_assets( :param item_assets: a STAC `item_assets` mapping """ bands = set() + # Trick to just warn once per collection + _warn = functools.lru_cache()(_log.warning) for item_asset in item_assets.values(): - asset_bands = self._get_bands_from_item_asset(item_asset) + asset_bands = self._get_bands_from_item_asset(item_asset, _warn=_warn) if asset_bands: bands.update(asset_bands) return bands diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 92bfbeecb..afe0c687c 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -867,5 +867,8 @@ def test_metadata_from_stac_collection_bands_from_item_assets(test_data, tmp_pat "vapour_pressure", ] - if not eo_extension_is_declared: - assert "Extracting band info from 'eo:bands' metadata, but 'eo' STAC extension was not declared." in caplog.text + warn_count = sum( + "Extracting band info from 'eo:bands' metadata, but 'eo' STAC extension was not declared." in m + for m in caplog.messages + ) + assert warn_count == (0 if eo_extension_is_declared else 1)