fixing a bug in thredds loader that limited crawling ability
dchandan committed Jan 17, 2024
1 parent 2bc22dc commit 6d65466
Showing 3 changed files with 27 additions and 13 deletions.
Makefile (5 changes: 4 additions & 1 deletion)
@@ -11,10 +11,13 @@ DOCKER_TAG := ghcr.io/crim-ca/stac-populator:$(APP_VERSION)
 IMP_DIR := $(APP_NAME)/implementations
 STAC_HOST ?= http://localhost:8880/stac
 # CATALOG = https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/catalog/birdhouse/testdata/xclim/cmip6/catalog.html
-CATALOG = https://daccs.cs.toronto.edu/twitcher/ows/proxy/thredds/catalog/datasets/CMIP6/catalog.html
+# CATALOG = https://daccs.cs.toronto.edu/twitcher/ows/proxy/thredds/catalog/datasets/CMIP6/catalog.html
+# CATALOG = https://daccs.cs.toronto.edu/twitcher/ows/proxy/thredds/catalog/datasets/CMIP6/CMIP/NOAA-GFDL/catalog.html
+# CATALOG = https://daccs.cs.toronto.edu/twitcher/ows/proxy/thredds/catalog/datasets/CMIP6/CMIP/AS-RCEC/catalog.html
 
 # CATALOG = https://daccs.cs.toronto.edu/twitcher/ows/proxy/thredds/catalog/datasets/CMIP6/CMIP/NUIST/catalog.html
+CATALOG = https://daccs.cs.toronto.edu/twitcher/ows/proxy/thredds/catalog/datasets/CMIP6/CMIP/MIROC/catalog.html
 
 PYESSV_ARCHIVE_DIR ?= ~/.esdoc/pyessv-archive
 PYESSV_ARCHIVE_REF ?= https://github.com/ES-DOC/pyessv-archive
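A side note on this catalog switching: because CATALOG is an ordinary make variable, a catalog can also be chosen per invocation without editing the file, e.g. `make <target> CATALOG=https://daccs.cs.toronto.edu/twitcher/ows/proxy/thredds/catalog/datasets/CMIP6/CMIP/NUIST/catalog.html` (the exact target name depends on rules not shown in this hunk). Command-line assignments override both `=` and `?=` definitions.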
STACpopulator/input.py (25 changes: 16 additions & 9 deletions)
@@ -60,6 +60,7 @@ class THREDDSCatalog(TDSCatalog):
     Because of how :class:`TDSCatalog` automatically loads and parses right away from ``__init__`` call,
     we need to hack around how the ``session`` attribute gets defined.
     """
+
     def __init__(self, catalog_url: str, session: Optional[Session] = None) -> None:
         self._session = session
         super().__init__(catalog_url)
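For context on the docstring's "hack": `TDSCatalog.__init__` downloads and parses the catalog as soon as it runs, so a caller-supplied `requests.Session` must be stashed on the instance before the parent constructor executes, and the parent's attempt to assign its own session must be intercepted. Below is a minimal sketch of one way to do that with a property/setter pair; it illustrates the idea and is not necessarily the repository's exact implementation.

from typing import Optional

import requests
from requests import Session
from siphon.catalog import TDSCatalog


class SessionAwareCatalog(TDSCatalog):
    """Reuses a caller-supplied requests.Session for catalog fetches."""

    def __init__(self, catalog_url: str, session: Optional[Session] = None) -> None:
        # Stash the session *before* super().__init__, because TDSCatalog
        # fetches and parses the catalog inside its own constructor.
        self._session = session
        super().__init__(catalog_url)

    @property
    def session(self) -> Session:
        # Lazily create a session if the caller did not supply one.
        if self._session is None:
            self._session = requests.Session()
        return self._session

    @session.setter
    def session(self, session: Session) -> None:
        # The parent class assigns its own session while initializing; keep
        # the caller-supplied one if it exists instead of overwriting it.
        if self._session is None:
            self._session = session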
@@ -91,7 +92,8 @@ def __init__(
         :type depth: int, optional
         """
         super().__init__()
-        self._depth = depth if depth is not None else 1000
+        self._max_depth = depth if depth is not None else 1000
+        self._depth = 0
 
         self.thredds_catalog_URL = self.validate_catalog_url(thredds_catalog_url)
@@ -134,18 +136,23 @@ def reset(self):
         """Reset the generator."""
         self.catalog_head = self.catalog
 
-    def __iter__(self) -> Iterator[Tuple[str, MutableMapping[str, Any]]]:
+    def __iter__(self) -> Iterator[Tuple[str, str, MutableMapping[str, Any]]]:
         """Return a generator walking a THREDDS data catalog for datasets."""
+
+        if self._depth > self._max_depth:
+            return
+
         if self.catalog_head.datasets.items():
             for item_name, ds in self.catalog_head.datasets.items():
                 attrs = self.extract_metadata(ds)
-                yield item_name, attrs
+                yield item_name, ds.url_path, attrs
+                # yield item_name, ds.url_path, []
 
-        if self._depth > 0:
-            for name, ref in self.catalog_head.catalog_refs.items():
-                self.catalog_head = ref.follow()
-                self._depth -= 1
-                yield from self
+        for name, ref in self.catalog_head.catalog_refs.items():
+            self.catalog_head = ref.follow()
+            self._depth += 1
+            yield from self
+            self._depth -= 1
 
     def __getitem__(self, dataset):
         return self.catalog.datasets[dataset]
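The crawling limitation named in the commit title is visible in the removed lines: `self._depth` acted as a single shared budget that was decremented before each recursive descent and never restored, so walking one branch permanently consumed depth that its sibling branches still needed, and the `if self._depth > 0` guard then ended the crawl early. The rewritten loop instead compares the current depth against a fixed `_max_depth` and restores the counter after each recursive call returns. Here is a standalone sketch of that traversal pattern on a toy catalog structure (all names are illustrative only):

from typing import Dict, Iterator, List, Tuple


def make_node(datasets: List[str], children: Dict[str, dict]) -> dict:
    # Toy stand-in for a THREDDS catalog node: named datasets plus child catalogs.
    return {"datasets": datasets, "children": children}


def walk(node: dict, depth: int, max_depth: int) -> Iterator[Tuple[str, int]]:
    """Yield (dataset_name, depth), pruning branches deeper than max_depth."""
    if depth > max_depth:
        return
    for name in node["datasets"]:
        yield name, depth
    for child in node["children"].values():
        # Passing depth + 1 as an argument gives each branch its own depth
        # accounting automatically; the loader's per-instance counter has to
        # emulate this by incrementing before the recursive yield and
        # decrementing once it returns.
        yield from walk(child, depth + 1, max_depth)


leaf_a = make_node(["ds_a"], {})  # sits at depth 2
leaf_b = make_node(["ds_b"], {})  # sits at depth 1
root = make_node([], {"a": make_node([], {"aa": leaf_a}), "b": leaf_b})

print(list(walk(root, depth=0, max_depth=1)))  # [('ds_b', 1)]; ds_a is pruned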
STACpopulator/populator_base.py (10 changes: 7 additions & 3 deletions)
@@ -17,7 +17,6 @@
 from STACpopulator.models import AnyGeometry
 from STACpopulator.stac_utils import get_logger, load_config, url_validate
 
-
 LOGGER = get_logger(__name__)
@@ -144,9 +143,12 @@ def publish_stac_collection(self, collection_data: dict[str, Any]) -> None:
         post_stac_collection(self.stac_host, collection_data, self.update, session=self._session)
 
     def ingest(self) -> None:
+        counter = 0
         LOGGER.info("Data ingestion")
-        for item_name, item_data in self._ingest_pipeline:
-            LOGGER.info(f"Creating STAC representation for {item_name}")
+        for item_name, item_loc, item_data in self._ingest_pipeline:
+            LOGGER.info(f"New data item: {item_name}")
+            if item_loc:
+                LOGGER.info(f"Data location: {item_loc}")
             stac_item = self.create_stac_item(item_name, item_data)
             if stac_item:
                 post_stac_item(
@@ -157,3 +159,5 @@ def ingest(self) -> None:
                     update=self.update,
                     session=self._session,
                 )
+            counter += 1
+        LOGGER.info(f"Processed {counter} data items")

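Since the loader's `__iter__` now yields 3-tuples, anything driving `ingest` must follow the `(item_name, item_loc, item_data)` contract, with an empty `item_loc` permitted (it is only logged when truthy). A minimal sketch of a conforming loader follows; the class and its data are hypothetical, not taken from the repository.

from typing import Any, Iterator, MutableMapping, Tuple


class StaticLoader:
    """Hypothetical minimal loader satisfying the 3-tuple contract ingest() expects."""

    def __init__(self, items: MutableMapping[str, Tuple[str, MutableMapping[str, Any]]]) -> None:
        self.items = items

    def __iter__(self) -> Iterator[Tuple[str, str, MutableMapping[str, Any]]]:
        for name, (url_path, attrs) in self.items.items():
            # item_loc may be falsy; ingest() only logs "Data location" when it is set.
            yield name, url_path, attrs


loader = StaticLoader({"ds1": ("birdhouse/ds1.nc", {"variable": "tas"})})
for item_name, item_loc, item_data in loader:
    print(item_name, item_loc, item_data)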