diff --git a/CHANGELOG.md b/CHANGELOG.md index 4c317488..f8866611 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), ### Changed - Item IDs no longer contain the production datetime ([#88](https://github.com/stactools-packages/modis/pull/88)) +- Make XML metadata optional - extract metadata from HDF file if XML is not available ([#XX](https://github.com/stactools-packages/modis/pull/XX)) ### Fixed diff --git a/src/stactools/modis/builder.py b/src/stactools/modis/builder.py index 8304f9b0..961f63f1 100644 --- a/src/stactools/modis/builder.py +++ b/src/stactools/modis/builder.py @@ -110,7 +110,13 @@ def add_hdf_or_xml_href( xml_href = f"{href}.xml" else: raise ValueError(f"Invalid HDF or XML href: {href}") - self.add_xml_asset(xml_href) + + # Add XML asset if it exists, otherwise extract metadata from HDF + if os.path.exists(xml_href): + self.add_xml_asset(xml_href) + else: + self.metadata = Metadata.from_hdf_href(hdf_href, self.read_href_modifier) + self.add_hdf_asset( hdf_href, cog_directory=cog_directory, create_cogs=create_cogs ) diff --git a/src/stactools/modis/metadata.py b/src/stactools/modis/metadata.py index def1f084..e1f6b052 100644 --- a/src/stactools/modis/metadata.py +++ b/src/stactools/modis/metadata.py @@ -5,6 +5,7 @@ import fsspec import numpy as np +import rasterio from lxml import etree from rasterio import Affine from rasterio.crs import CRS @@ -232,6 +233,8 @@ def from_cog_tags(cls, cog_tags: Dict[str, str]) -> "Metadata": geometry, bbox = cls._geometry_and_bbox( collection, horizontal_tile, vertical_tile ) + qa_percent = cog_tags.get("QAPERCENTNOTPRODUCEDCLOUD") + qa_percent_not_produced_cloud = int(qa_percent) if qa_percent else None return Metadata( id=os.path.splitext(cog_tags["LOCALGRANULEID"])[0], product=product, @@ -242,7 +245,7 @@ def from_cog_tags(cls, cog_tags: Dict[str, str]) -> "Metadata": end_datetime=end_datetime, created=None, updated=None, - qa_percent_not_produced_cloud=int(cog_tags["QAPERCENTNOTPRODUCEDCLOUD"]), + qa_percent_not_produced_cloud=qa_percent_not_produced_cloud, qa_percent_cloud_cover=None, horizontal_tile=horizontal_tile, vertical_tile=vertical_tile, @@ -252,6 +255,30 @@ def from_cog_tags(cls, cog_tags: Dict[str, str]) -> "Metadata": collection=collection, ) + @classmethod + def from_hdf_href( + cls, href: str, read_href_modifier: Optional[ReadHrefModifier] = None + ) -> "Metadata": + """Reads metadata from an HDF file when XML is not available. + + Args: + href (str): The href of the HDF file + read_href_modifier (Optional[Callable[[str], str]]): Optional + function to modify the read href + + Returns: + Metadata: Information that will map to Item attributes. + """ + if read_href_modifier: + read_href = read_href_modifier(href) + else: + read_href = href + + with rasterio.open(read_href) as dataset: + hdf_tags = dataset.tags() + + return cls.from_cog_tags(hdf_tags) + @property def datetime(self) -> Optional[datetime.datetime]: """Returns a single nominal datetime for this metadata file. diff --git a/tests/test_stac.py b/tests/test_stac.py index da6e9658..77e90c1b 100644 --- a/tests/test_stac.py +++ b/tests/test_stac.py @@ -207,6 +207,25 @@ def test_raster_footprint_geometry() -> None: item.validate() +def test_create_item_from_hdf_without_xml(tmp_path: Path) -> None: + hdf_file = "MOD10A2.A2022033.h09v05.061.2022042050729.hdf" + source_hdf_path = test_data.get_path(f"data-files/{hdf_file}") + + temp_hdf_path = tmp_path / hdf_file + shutil.copyfile(source_hdf_path, temp_hdf_path) + + temp_xml_path = tmp_path / f"{hdf_file}.xml" + assert not temp_xml_path.exists() + + item = stactools.modis.stac.create_item(str(temp_hdf_path)) + + assert item is not None + assert item.id.startswith("MOD10A2.A2022033.h09v05") + assert "hdf" in item.assets + assert "metadata" not in item.assets + item.validate() + + @pytest.mark.parametrize("file_name", PROJECTION_EDGE_FILES) def test_raster_footprint_at_projection_edge(file_name: str) -> None: path = test_data.get_path(file_name)