From 112fc4c91aae85a6bffb92d2299e055f4bc102a4 Mon Sep 17 00:00:00 2001 From: David Huard Date: Thu, 16 Nov 2023 12:24:55 -0500 Subject: [PATCH 1/6] Add `ncattrs` function to get nc attributes without the need for siphon. --- CHANGES.md | 2 ++ STACpopulator/stac_utils.py | 50 ++++++++++++++++++++++++++++++ tests/test_standalone_stac_item.py | 37 ++++------------------ 3 files changed, 58 insertions(+), 31 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 0b6b22d..64ae4ab 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -4,6 +4,8 @@ +* New function `ncattrs` to get attributes from netCDF files hosted on a THREDDS server. + ## [0.2.0](https://github.com/crim-ca/stac-populator/tree/0.2.0) (2023-11-10) diff --git a/STACpopulator/stac_utils.py b/STACpopulator/stac_utils.py index c8c8aaa..7c45287 100644 --- a/STACpopulator/stac_utils.py +++ b/STACpopulator/stac_utils.py @@ -79,6 +79,56 @@ def collection2literal(collection, property="label"): return Literal[terms] +def thredds_catalog_attrs(url: str) -> dict: + """Return attributes from the catalog.xml THREDDS server response.""" + import xmltodict + import requests + + xml = requests.get(url).text + + raw = xmltodict.parse( + xml, + process_namespaces=True, + namespaces={ + "http://www.unidata.ucar.edu/namespaces/thredds/InvCatalog/v1.0": None, + "https://www.unidata.ucar.edu/namespaces/thredds/InvCatalog/v1.0": None, + }, + ) + return raw + + +def ncattrs(url: str) -> dict: + """Return attributes from a THREDDS netCDF dataset.""" + import requests + import xncml + import urllib + + pr = urllib.parse.urlparse(url) + + parts = url.split("/") + nc = parts[-1] + + # Get catalog information about available services + catalog = "/".join(parts[:-1]) + "/catalog.xml" + cattrs = thredds_catalog_attrs(catalog)["catalog"] + + cid = cattrs["dataset"]["@ID"] + + # Get service URLs for the dataset + access_urls = {} + for service in cattrs["service"]["service"]: + access_urls[service["@serviceType"]] = 
f'{pr.scheme}://{pr.netloc}{service["@base"]}{cid}/{nc}' + + # Get dataset attributes + r = requests.get(access_urls["NCML"]) + attrs = xncml.Dataset.from_text(r.text).to_cf_dict() + attrs["attributes"] = numpy_to_python_datatypes(attrs["attributes"]) + + # Include service attributes + attrs["access_urls"] = access_urls + return attrs + + def ncattrs_to_geometry(attrs: MutableMapping[str, Any]) -> MutableMapping[str, Any]: """Create Polygon geometry from CFMetadata.""" attrs = attrs["groups"]["CFMetadata"]["attributes"] diff --git a/tests/test_standalone_stac_item.py b/tests/test_standalone_stac_item.py index 3163cd5..0ab4e33 100644 --- a/tests/test_standalone_stac_item.py +++ b/tests/test_standalone_stac_item.py @@ -1,16 +1,15 @@ import json import pytest -import requests import os import tempfile from urllib.parse import quote -import xncml - from STACpopulator.implementations.CMIP6_UofT.add_CMIP6 import CMIP6ItemProperties, CMIP6populator from STACpopulator.input import THREDDSLoader from STACpopulator.models import GeoJSONPolygon -from STACpopulator.stac_utils import STAC_item_from_metadata +from STACpopulator.stac_utils import STAC_item_from_metadata, ncattrs +from pystac.validation import JsonSchemaSTACValidator +from pystac import STACObjectType CUR_DIR = os.path.dirname(__file__) @@ -21,35 +20,11 @@ def quote_none_safe(url): @pytest.mark.online def test_standalone_stac_item_thredds_ncml(): - thredds_url = "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds" - thredds_path = "birdhouse/testdata/xclim/cmip6" - thredds_nc = "sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc" - thredds_catalog = f"{thredds_url}/catalog/{thredds_path}/catalog.html" - thredds_ds = f"{thredds_path}/{thredds_nc}" - thredds_ncml_url = ( - f"{thredds_url}/ncml/{thredds_path}/{thredds_nc}" - f"?catalog={quote_none_safe(thredds_catalog)}&dataset={quote_none_safe(thredds_ds)}" - ) - - # FIXME: avoid hackish workarounds - data = requests.get(thredds_ncml_url).text - attrs = 
xncml.Dataset.from_text(data).to_cf_dict() - attrs["access_urls"] = { # FIXME: all following should be automatically added, but they are not! - "HTTPServer": f"{thredds_url}/fileServer/{thredds_path}/{thredds_nc}", - "OPENDAP": f"{thredds_url}/dodsC/{thredds_path}/{thredds_nc}", - "WCS": f"{thredds_url}/wcs/{thredds_path}/{thredds_nc}?service=WCS&version=1.0.0&request=GetCapabilities", - "WMS": f"{thredds_url}/wms/{thredds_path}/{thredds_nc}?service=WMS&version=1.3.0&request=GetCapabilities", - "NetcdfSubset": f"{thredds_url}/ncss/{thredds_path}/{thredds_nc}/dataset.html", - } - + url = "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc" + attrs = ncattrs(url) stac_item_id = CMIP6populator.make_cmip6_item_id(attrs["attributes"]) stac_item = STAC_item_from_metadata(stac_item_id, attrs, CMIP6ItemProperties, GeoJSONPolygon) - - ref_file = os.path.join(CUR_DIR, "data/stac_item_testdata_xclim_cmip6_ncml.json") - with open(ref_file, mode="r", encoding="utf-8") as ff: - reference = json.load(ff) - - assert stac_item.to_dict() == reference + assert stac_item.validate() class MockedNoSTACUpload(CMIP6populator): From e758feab71d14bfa77cbb293ddb3d788ed1a34b4 Mon Sep 17 00:00:00 2001 From: David Huard Date: Thu, 16 Nov 2023 13:39:52 -0500 Subject: [PATCH 2/6] use parametrize to feed url to test. Move imports to top of file. 
--- STACpopulator/stac_utils.py | 10 ++++------ tests/conftest.py | 1 + tests/test_standalone_stac_item.py | 9 ++++++--- 3 files changed, 11 insertions(+), 9 deletions(-) create mode 100644 tests/conftest.py diff --git a/STACpopulator/stac_utils.py b/STACpopulator/stac_utils.py index 7c45287..6742d7a 100644 --- a/STACpopulator/stac_utils.py +++ b/STACpopulator/stac_utils.py @@ -5,6 +5,10 @@ import re import sys from typing import Any, Literal, MutableMapping +import requests +import xncml +import xmltodict +import urllib import numpy as np import pystac @@ -81,9 +85,6 @@ def collection2literal(collection, property="label"): def thredds_catalog_attrs(url: str) -> dict: """Return attributes from the catalog.xml THREDDS server response.""" - import xmltodict - import requests - xml = requests.get(url).text raw = xmltodict.parse( @@ -99,9 +100,6 @@ def thredds_catalog_attrs(url: str) -> dict: def ncattrs(url: str) -> dict: """Return attributes from a THREDDS netCDF dataset.""" - import requests - import xncml - import urllib pr = urllib.parse.urlparse(url) diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..5871ed8 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1 @@ +import pytest diff --git a/tests/test_standalone_stac_item.py b/tests/test_standalone_stac_item.py index 0ab4e33..8b07ad4 100644 --- a/tests/test_standalone_stac_item.py +++ b/tests/test_standalone_stac_item.py @@ -18,13 +18,16 @@ def quote_none_safe(url): return quote(url, safe="") +@pytest.mark.parametrize("url", ["https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/birdhouse/testdata/xclim/cmip6" + "/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc"]) @pytest.mark.online -def test_standalone_stac_item_thredds_ncml(): - url = "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc" +def test_standalone_stac_item_thredds_ncml(url): attrs = ncattrs(url) stac_item_id = 
CMIP6populator.make_cmip6_item_id(attrs["attributes"]) stac_item = STAC_item_from_metadata(stac_item_id, attrs, CMIP6ItemProperties, GeoJSONPolygon) - assert stac_item.validate() + schemas_validated = stac_item.validate() + assert len(schemas_validated) >= 1 + assert "item.json" in schemas_validated[0] class MockedNoSTACUpload(CMIP6populator): From a10a810b7371b73720e18a2f7a9cb7250a65ecdd Mon Sep 17 00:00:00 2001 From: David Huard Date: Thu, 16 Nov 2023 14:33:14 -0500 Subject: [PATCH 3/6] Add support in ncattrs for URLs with query parameters. --- STACpopulator/stac_utils.py | 27 ++++++++++++++++++++++----- tests/conftest.py | 1 - tests/test_standalone_stac_item.py | 6 ++++-- 3 files changed, 26 insertions(+), 8 deletions(-) diff --git a/STACpopulator/stac_utils.py b/STACpopulator/stac_utils.py index 6742d7a..dbf961e 100644 --- a/STACpopulator/stac_utils.py +++ b/STACpopulator/stac_utils.py @@ -102,20 +102,31 @@ def ncattrs(url: str) -> dict: """Return attributes from a THREDDS netCDF dataset.""" pr = urllib.parse.urlparse(url) + scheme, netloc, path, params, query, frag = pr - parts = url.split("/") - nc = parts[-1] + # URL is a reference to a catalog item + if query: + q = urllib.parse.parse_qs(query) + nc = q["dataset"][0].split("/")[-1] + + if path.endswith("catalog.html"): + path = path.replace("catalog.html", "catalog.xml") + else: + nc = path.split("/")[-1] + path = path.replace(nc, "catalog.xml") # Get catalog information about available services - catalog = "/".join(parts[:-1]) + "/catalog.xml" + catalog = urllib.parse.urlunparse((scheme, netloc, path, "", query, "")) cattrs = thredds_catalog_attrs(catalog)["catalog"] - cid = cattrs["dataset"]["@ID"] + if not query: + cid += f"/{nc}" + # Get service URLs for the dataset access_urls = {} for service in cattrs["service"]["service"]: - access_urls[service["@serviceType"]] = f'{pr.scheme}://{pr.netloc}{service["@base"]}{cid}/{nc}' + access_urls[service["@serviceType"]] = 
f'{scheme}://{netloc}{service["@base"]}{cid}' # Get dataset attributes r = requests.get(access_urls["NCML"]) @@ -263,9 +274,12 @@ def STAC_item_from_metadata(iid: str, attrs: MutableMapping[str, Any], item_prop media_types = { "HTTPServer": "application/x-netcdf", "OPENDAP": pystac.MediaType.HTML, + "NCML": pystac.MediaType.XML, "WCS": pystac.MediaType.XML, "WMS": pystac.MediaType.XML, "NetcdfSubset": "application/x-netcdf", + "ISO": pystac.MediaType.XML, + "UDDC": pystac.MediaType.HTML } asset_roles = { @@ -274,4 +288,7 @@ def STAC_item_from_metadata(iid: str, attrs: MutableMapping[str, Any], item_prop "WCS": ["data"], "WMS": ["visual"], "NetcdfSubset": ["data"], + "NCML": ["metadata"], + "ISO": ["metadata"], + "UDDC": ["metadata"] } diff --git a/tests/conftest.py b/tests/conftest.py index 5871ed8..e69de29 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1 +0,0 @@ -import pytest diff --git a/tests/test_standalone_stac_item.py b/tests/test_standalone_stac_item.py index 8b07ad4..5d243ce 100644 --- a/tests/test_standalone_stac_item.py +++ b/tests/test_standalone_stac_item.py @@ -12,14 +12,16 @@ from pystac import STACObjectType CUR_DIR = os.path.dirname(__file__) +TEST_NC_URLS = ["https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/birdhouse/testdata/xclim/cmip6" + "/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc", + ] def quote_none_safe(url): return quote(url, safe="") -@pytest.mark.parametrize("url", ["https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/birdhouse/testdata/xclim/cmip6" - "/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc"]) +@pytest.mark.parametrize("url", TEST_NC_URLS) @pytest.mark.online def test_standalone_stac_item_thredds_ncml(url): attrs = ncattrs(url) From b8b2ffaa76baf61b1017adcfeeaf1107ed8ade40 Mon Sep 17 00:00:00 2001 From: David Huard Date: Thu, 16 Nov 2023 14:36:09 -0500 Subject: [PATCH 4/6] add test_stac_utils --- tests/test_stac_utils.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 
tests/test_stac_utils.py diff --git a/tests/test_stac_utils.py b/tests/test_stac_utils.py new file mode 100644 index 0000000..0a403d2 --- /dev/null +++ b/tests/test_stac_utils.py @@ -0,0 +1,27 @@ +import pytest +from STACpopulator.stac_utils import thredds_catalog_attrs, ncattrs + +TEST_NC_URLS = ["https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/birdhouse/testdata/xclim/cmip6" + "/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc", + "https://psl.noaa.gov/thredds/catalog/Datasets/20thC_ReanV2/Monthlies/gaussian/monolevel/catalog.html?dataset=Datasets/20thC_ReanV2/Monthlies/gaussian/monolevel/air.2m.mon.mean.nc"] + +TEST_CATALOG_URLS = ["https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/birdhouse/testdata/xclim/cmip6/catalog.xml"] + + + +@pytest.mark.parametrize("url", TEST_CATALOG_URLS) +@pytest.mark.online +def test_thredds_catalog_attrs(url): + attrs = thredds_catalog_attrs(url) + assert "service" in attrs["catalog"] + assert "dataset" in attrs["catalog"] + assert isinstance(attrs["catalog"]["service"]["service"], list) + + +@pytest.mark.parametrize("url", TEST_NC_URLS) +@pytest.mark.online +def test_ncattrs(url): + url = "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc" + attrs = ncattrs(url) + assert "access_urls" in attrs + assert "attributes" in attrs From ab8264be92fe7f54d4bdad8a6c2b84500be08b0a Mon Sep 17 00:00:00 2001 From: David Huard Date: Thu, 16 Nov 2023 14:44:08 -0500 Subject: [PATCH 5/6] add pystac[validation] to dev dependencies --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 0a1a65e..c3524df 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -82,6 +82,7 @@ dev = [ "coverage", "responses", "bump-my-version", + "pystac[validation]" ] [tool.pytest.ini_options] From 658e4b47801d60634e66c918cafd3105b4ec6889 Mon Sep 17 00:00:00 2001 From: David Huard Date: Fri, 17 Nov 2023 16:00:36 -0500 Subject: [PATCH 
6/6] Store server responses for local tests. Create stac_item description on disk for test comparisons. Normalize serviceType using Enum. Add utility functions to bypass siphon. --- STACpopulator/stac_utils.py | 131 ++++-- tests/conftest.py | 74 ++++ ...n_CCCma-CanESM5_ssp245_r13i1p2f1_2020.json | 1 + tests/data/responses.yaml | 379 ++++++++++++++++++ tests/test_stac_utils.py | 60 ++- tests/test_standalone_stac_item.py | 22 +- 6 files changed, 628 insertions(+), 39 deletions(-) create mode 100644 tests/data/references/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.json create mode 100644 tests/data/responses.yaml diff --git a/STACpopulator/stac_utils.py b/STACpopulator/stac_utils.py index f995b66..3a82759 100644 --- a/STACpopulator/stac_utils.py +++ b/STACpopulator/stac_utils.py @@ -8,13 +8,13 @@ import requests import xncml import xmltodict -import urllib - +import urllib.parse +from pathlib import Path import numpy as np import pystac import yaml from colorlog import ColoredFormatter - +from enum import Enum from STACpopulator.models import STACItem LOGGER = logging.getLogger(__name__) @@ -84,7 +84,13 @@ def collection2literal(collection, property="label"): def thredds_catalog_attrs(url: str) -> dict: - """Return attributes from the catalog.xml THREDDS server response.""" + """Return attributes from the catalog.xml THREDDS server response. + + Parameters + ---------- + url : str + Link to the THREDDS catalog URL. 
+ """ xml = requests.get(url).text raw = xmltodict.parse( @@ -98,8 +104,8 @@ def thredds_catalog_attrs(url: str) -> dict: return raw -def ncattrs(url: str) -> dict: - """Return attributes from a THREDDS netCDF dataset.""" +def catalog_url(url: str) -> (str, str): + """Given a THREDDS link to a netCDF file, return a link to its catalog and the file name.""" pr = urllib.parse.urlparse(url) scheme, netloc, path, params, query, frag = pr @@ -111,30 +117,76 @@ def ncattrs(url: str) -> dict: if path.endswith("catalog.html"): path = path.replace("catalog.html", "catalog.xml") + + # Ideally we would create targeted queries for one dataset, but we're missing the dataset name. + # query = "" else: nc = path.split("/")[-1] path = path.replace(nc, "catalog.xml") # Get catalog information about available services catalog = urllib.parse.urlunparse((scheme, netloc, path, "", query, "")) - cattrs = thredds_catalog_attrs(catalog)["catalog"] - cid = cattrs["dataset"]["@ID"] - if not query: - cid += f"/{nc}" + return catalog, nc + + +def access_urls(catalog_url: str, ds: str) -> dict: + """Return THREDDS endpoints for the catalog and dataset. + + Parameters + ---------- + catalog_url : str + URI to the THREDDS catalog. + ds : str + Dataset path relative to the catalog. + """ + # Get catalog information about available services + cattrs = thredds_catalog_attrs(catalog_url)["catalog"] + + pr = urllib.parse.urlparse(str(catalog_url)) + + cid = cattrs["dataset"]["@ID"] + if not pr.query: + cid += f"/{ds}" # Get service URLs for the dataset access_urls = {} for service in cattrs["service"]["service"]: - access_urls[service["@serviceType"]] = f'{scheme}://{netloc}{service["@base"]}{cid}' + type = ServiceType.from_value(service["@serviceType"]).value + access_urls[type] = f'{pr.scheme}://{pr.netloc}{service["@base"]}{cid}' + + return access_urls + + +def ncml_attrs(ncml_url: str) -> dict: + """Return attributes from the NcML response of a THREDDS dataset. 
+ + Parameters + ---------- + ncml_url : str + URI to the NcML dataset description, either a remote server URL or path to a local xml file. + """ + xml = requests.get(ncml_url).text # Get dataset attributes - r = requests.get(access_urls["NCML"]) - attrs = xncml.Dataset.from_text(r.text).to_cf_dict() + attrs = xncml.Dataset.from_text(xml).to_cf_dict() attrs["attributes"] = numpy_to_python_datatypes(attrs["attributes"]) + return attrs + + +def ds_attrs(url: str) -> dict: + """Return attributes from the NcML response of a THREDDS dataset and access URLs from the THREDDS server. + + Parameters + ---------- + url : str + URL to the THREDDS netCDF file + """ + urls = access_urls(*catalog_url(url)) + attrs = ncml_attrs(urls["NcML"]) # Include service attributes - attrs["access_urls"] = access_urls + attrs["access_urls"] = urls return attrs @@ -263,18 +315,10 @@ def STAC_item_from_metadata(iid: str, attrs: MutableMapping[str, Any], item_prop return item -asset_name_remaps = { - "httpserver_service": "HTTPServer", - "opendap_service": "OPENDAP", - "wcs_service": "WCS", - "wms_service": "WMS", - "nccs_service": "NetcdfSubset", -} - media_types = { "HTTPServer": "application/x-netcdf", - "OPENDAP": pystac.MediaType.HTML, - "NCML": pystac.MediaType.XML, + "OpenDAP": pystac.MediaType.HTML, + "NcML": pystac.MediaType.XML, "WCS": pystac.MediaType.XML, "WMS": pystac.MediaType.XML, "NetcdfSubset": "application/x-netcdf", @@ -284,11 +328,46 @@ def STAC_item_from_metadata(iid: str, attrs: MutableMapping[str, Any], item_prop asset_roles = { "HTTPServer": ["data"], - "OPENDAP": ["data"], + "OpenDAP": ["data"], "WCS": ["data"], "WMS": ["visual"], "NetcdfSubset": ["data"], - "NCML": ["metadata"], + "NcML": ["metadata"], "ISO": ["metadata"], "UDDC": ["metadata"] } + + +class ServiceType(Enum): + adde = "ADDE" + dap4 = "DAP4" + dods = "DODS" # same as OpenDAP + opendap = "OpenDAP" + opendapg = "OpenDAPG" + netcdfsubset = "NetcdfSubset" + cdmremote = "CdmRemote" + cdmfeature = "CdmFeature" 
+ ncjson = "ncJSON" + h5service = "H5Service" + httpserver = "HTTPServer" + ftp = "FTP" + gridftp = "GridFTP" + file = "File" + iso = "ISO" + las = "LAS" + ncml = "NcML" + uddc = "UDDC" + wcs = "WCS" + wms = "WMS" + wsdl = "WSDL" + webform = "WebForm" + catalog = "Catalog" + compound = "Compound" + resolver = "Resolver" + thredds = "THREDDS" + + @classmethod + def from_value(cls, value): + """Return value irrespective of case.""" + return cls[value.lower()] + diff --git a/tests/conftest.py b/tests/conftest.py index e69de29..05885d0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -0,0 +1,74 @@ +import urllib.parse +import json +import pytest +import responses +from responses import _recorder +from pathlib import Path +import requests +from STACpopulator.stac_utils import catalog_url, access_urls, ds_attrs +from STACpopulator.implementations.CMIP6_UofT.add_CMIP6 import CMIP6ItemProperties, CMIP6populator +from STACpopulator.models import GeoJSONPolygon +from STACpopulator.stac_utils import STAC_item_from_metadata + + +URLS = ["https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/birdhouse/testdata/xclim/cmip6" + "/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc", + ] +URLS = ["https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/birdhouse/testdata/xclim/cmip6" + "/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc", + "https://psl.noaa.gov/thredds/catalog/Datasets/20thC_ReanV2/Monthlies/gaussian/monolevel/catalog.html?dataset=Datasets/20thC_ReanV2/Monthlies/gaussian/monolevel/air.2m.mon.mean.nc"] + +DATA = Path(__file__).parent / "data" + + +def reference_path_from_url(url): + """Return local path to json dict representation of STAC item.""" + catalog_link, nc = catalog_url(url) + nc = Path(nc) + parts = catalog_link.split("/") + return DATA.joinpath("references", parts[-2], nc.with_suffix(".json")) + + +@_recorder.record(file_path=DATA / "responses.yaml") +def store_responses(): + """Store server responses. 
+ + Run this if new URLs are added, if remote THREDDS servers are updated or their configuration changed. + """ + for url in URLS: + # Request to catalog link + catalog_link, nc = catalog_url(url) + requests.get(catalog_link) + + # Request to NcML link + ncml_link = access_urls(catalog_link, nc)["NCML"] + requests.get(ncml_link) + + +@responses.activate +def create_reference_items(overwrite=False): + """Store json representation of STAC item dict created from stored XML responses. + + - Run after store_responses() to update the expected STAC item representation. + - Run if the STAC item representation changes. + """ + # Get server responses from files stored on disk + responses._add_from_file(file_path=DATA / "responses.yaml") + + for url in URLS: + # Request to catalog link + catalog_link, nc = catalog_url(url) + + # Request to NcML link + ncml_link = access_urls(catalog_link, nc)["NcML"] + + reference_path = reference_path_from_url(url) + + if overwrite or not reference_path.exists(): + reference_path.parent.mkdir(parents=True, exist_ok=True) + attrs = ds_attrs(ncml_link, catalog_link) + + if "cmip6" in url: + stac_item_id = CMIP6populator.make_cmip6_item_id(attrs["attributes"]) + stac_item = STAC_item_from_metadata(stac_item_id, attrs, CMIP6ItemProperties, GeoJSONPolygon) + reference_path.write_text(json.dumps(stac_item.to_dict())) diff --git a/tests/data/references/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.json b/tests/data/references/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.json new file mode 100644 index 0000000..65fb16c --- /dev/null +++ b/tests/data/references/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.json @@ -0,0 +1 @@ +{"type": "Feature", "stac_version": "1.0.0", "id": "ScenarioMIP_CCCma_CanESM5_ssp245_r13i1p2f1_SImon_siconc_gn", "properties": {"start_datetime": "2019-12-06T12:00:00Z", "end_datetime": "2020-11-04T12:00:00Z", "datetime": null, "cmip6:Conventions": "CF-1.7 CMIP-6.2", "cmip6:activity_id": "ScenarioMIP", 
"cmip6:creation_date": "2019-09-25T23:01:33Z", "cmip6:data_specs_version": "01.00.30", "cmip6:experiment": "update of RCP4.5 based on SSP2", "cmip6:experiment_id": "ssp245", "cmip6:frequency": "mon", "cmip6:further_info_url": "https://furtherinfo.es-doc.org/CMIP6.CCCma.CanESM5.ssp245.none.r13i1p2f1", "cmip6:grid_label": "gn", "cmip6:institution": "Canadian Centre for Climate Modelling and Analysis, Environment and Climate Change Canada, Victoria, BC V8P 5C2, Canada", "cmip6:institution_id": "CCCma", "cmip6:nominal_resolution": "100 km", "cmip6:realm": ["seaIce"], "cmip6:source": "CanESM5 (2019): \naerosol: interactive\natmos: CanAM5 (T63L49 native atmosphere, T63 Linear Gaussian Grid; 128 x 64 longitude/latitude; 49 levels; top level 1 hPa)\natmosChem: specified oxidants for aerosols\nland: CLASS3.6/CTEM1.2\nlandIce: specified ice sheets\nocean: NEMO3.4.1 (ORCA1 tripolar grid, 1 deg with refinement to 1/3 deg within 20 degrees of the equator; 361 x 290 longitude/latitude; 45 vertical levels; top grid cell 0-6.19 m)\nocnBgchem: Canadian Model of Ocean Carbon (CMOC); NPZD ecosystem with OMIP prescribed carbonate chemistry\nseaIce: LIM2", "cmip6:source_id": "CanESM5", "cmip6:source_type": ["AOGCM"], "cmip6:sub_experiment": "none", "cmip6:sub_experiment_id": "none", "cmip6:table_id": "SImon", "cmip6:variable_id": "siconc", "cmip6:variant_label": "r13i1p2f1", "cmip6:initialization_index": 1, "cmip6:physics_index": 2, "cmip6:realization_index": 13, "cmip6:forcing_index": 1, "cmip6:tracking_id": "hdl:21.14100/9e4f804b-c161-44fa-acd1-c2e94e220c95", "cmip6:version": "v20190429", "cmip6:product": "model-output", "cmip6:license": "CMIP6 model data produced by The Government of Canada (Canadian Centre for Climate Modelling and Analysis, Environment and Climate Change Canada) is licensed under a Creative Commons Attribution ShareAlike 4.0 International License (https://creativecommons.org/licenses). 
Consult https://pcmdi.llnl.gov/CMIP6/TermsOfUse for terms of use governing CMIP6 output, including citation requirements and proper acknowledgment. Further information about this data, including some limitations, can be found via the further_info_url (recorded as a global attribute in this file) and at https:///pcmdi.llnl.gov/. The data producers and data providers make no warranty, either express or implied, including, but not limited to, warranties of merchantability and fitness for a particular purpose. All liabilities arising from the supply of the information (including any liability arising in negligence) are excluded to the fullest extent permitted by law.", "cmip6:grid": "ORCA1 tripolar grid, 1 deg with refinement to 1/3 deg within 20 degrees of the equator; 361 x 290 longitude/latitude; 45 vertical levels; top grid cell 0-6.19 m", "cmip6:mip_era": "CMIP6"}, "geometry": {"type": "Polygon", "coordinates": [[[0.049800001084804535, -78.39350128173828], [0.049800001084804535, 89.74176788330078], [359.99493408203125, 89.74176788330078], [359.99493408203125, -78.39350128173828], [0.049800001084804535, -78.39350128173828]]]}, "links": [{"rel": "source", "href": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/fileServer/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc", "type": "application/x-netcdf", "title": "birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc"}], "assets": {"HTTPServer": {"href": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/fileServer/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc", "type": "application/x-netcdf", "roles": ["data"]}, "OpenDAP": {"href": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/dodsC/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc", "type": "text/html", "roles": ["data"]}, "NcML": {"href": 
"https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/ncml/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc", "type": "application/xml", "roles": ["metadata"]}, "UDDC": {"href": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/uddc/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc", "type": "text/html", "roles": ["metadata"]}, "ISO": {"href": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/iso/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc", "type": "application/xml", "roles": ["metadata"]}, "WCS": {"href": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/wcs/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc", "type": "application/xml", "roles": ["data"]}, "WMS": {"href": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/wms/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc", "type": "application/xml", "roles": ["visual"]}, "NetcdfSubset": {"href": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/ncss/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc", "type": "application/x-netcdf", "roles": ["data"]}}, "bbox": [0.049800001084804535, -78.39350128173828, 359.99493408203125, 89.74176788330078], "stac_extensions": []} \ No newline at end of file diff --git a/tests/data/responses.yaml b/tests/data/responses.yaml new file mode 100644 index 0000000..ce8286b --- /dev/null +++ b/tests/data/responses.yaml @@ -0,0 +1,379 @@ +responses: +- response: + auto_calculate_content_length: false + body: "\r\n\r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n all\r\n \r\n \r\n 3.583\r\n 2021-02-15T14:37:21.508Z\r\n \r\n \r\n 865.3\r\n 2021-02-15T14:37:21.496Z\r\n \r\n \r\ + \n\r\n" + content_type: text/plain + method: GET + status: 200 + url: https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/birdhouse/testdata/xclim/cmip6/catalog.xml +- response: + 
auto_calculate_content_length: false + body: "\r\n\r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n all\r\n \r\n \r\n 3.583\r\n 2021-02-15T14:37:21.508Z\r\n \r\n \r\n 865.3\r\n 2021-02-15T14:37:21.496Z\r\n \r\n \r\ + \n\r\n" + content_type: text/plain + method: GET + status: 200 + url: https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/birdhouse/testdata/xclim/cmip6/catalog.xml +- response: + auto_calculate_content_length: false + body: "\r\n\r\n \r\n \r\n \r\n\ + \ \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\ + \n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n\ + \ \r\n\ + \ \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n\ + \ \r\n \r\n \r\n \r\ + \n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n\ + \ \r\n\ + \ \r\n \r\n \r\n \r\n \r\ + \n \r\n \r\n \r\n \r\n \r\n \ + \ \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\ + \n \r\n \ + \ \r\n \ + \ \r\ + \n \r\n \r\n \r\n \r\ + \n \r\n \r\n \ + \ \r\n \ + \ \r\ + \n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n\ + \ \r\ + \n \r\ + \n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\ + \n \r\n \r\n \r\n \r\n \r\ + \n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\ + \n \r\ + \n \r\n \r\n\ + \ \r\n \r\n \r\n \r\n \r\n \ + \ \r\n \r\n \r\n \r\n \r\n \r\n\r\ + \n\r\n" + content_type: text/plain + method: GET + status: 200 + url: https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/ncml/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc +- response: + auto_calculate_content_length: false + body: "\r\n\r\n \r\n \ + \ \r\ + \n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n 62.12\r\n 2020-04-13T15:46:10.168Z\r\n \r\n \ + \ all\r\n Grid\r\n \ + \ \r\n \r\n\r\n" + content_type: text/plain + method: GET + status: 200 + url: 
https://psl.noaa.gov/thredds/catalog/Datasets/20thC_ReanV2/Monthlies/gaussian/monolevel/catalog.xml?dataset=Datasets/20thC_ReanV2/Monthlies/gaussian/monolevel/air.2m.mon.mean.nc +- response: + auto_calculate_content_length: false + body: "\r\n\r\n \r\n \ + \ \r\ + \n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n 62.12\r\n 2020-04-13T15:46:10.168Z\r\n \r\n \ + \ all\r\n Grid\r\n \ + \ \r\n \r\n\r\n" + content_type: text/plain + method: GET + status: 200 + url: https://psl.noaa.gov/thredds/catalog/Datasets/20thC_ReanV2/Monthlies/gaussian/monolevel/catalog.xml?dataset=Datasets/20thC_ReanV2/Monthlies/gaussian/monolevel/air.2m.mon.mean.nc +- response: + auto_calculate_content_length: false + body: "\r\n\r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n\ + \ \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\ + \n \r\n \r\n \r\n \r\n\ + \ \r\n\ + \ \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n\ + \ \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\ + \n \r\n \r\n \r\n \r\n \r\ + \n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n\ + \ \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\ + \n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\ + \n\r\n" + content_type: text/plain + method: GET + status: 200 + url: https://psl.noaa.gov/thredds/ncml/Datasets/20thC_ReanV2/Monthlies/gaussian/monolevel/air.2m.mon.mean.nc diff --git a/tests/test_stac_utils.py b/tests/test_stac_utils.py index 0a403d2..9f1a8cb 100644 --- a/tests/test_stac_utils.py +++ b/tests/test_stac_utils.py @@ -1,5 +1,10 @@ import pytest -from STACpopulator.stac_utils import thredds_catalog_attrs, ncattrs +import requests +import responses +from pathlib import Path +from STACpopulator.stac_utils import catalog_url, access_urls +from STACpopulator.stac_utils import thredds_catalog_attrs, ncml_attrs, ds_attrs +from conftest import DATA TEST_NC_URLS = ["https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/birdhouse/testdata/xclim/cmip6" 
"/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc", @@ -10,8 +15,10 @@ @pytest.mark.parametrize("url", TEST_CATALOG_URLS) -@pytest.mark.online +@responses.activate def test_thredds_catalog_attrs(url): + responses._add_from_file(file_path=DATA / "responses.yaml") + attrs = thredds_catalog_attrs(url) assert "service" in attrs["catalog"] assert "dataset" in attrs["catalog"] @@ -19,9 +26,48 @@ def test_thredds_catalog_attrs(url): @pytest.mark.parametrize("url", TEST_NC_URLS) -@pytest.mark.online -def test_ncattrs(url): - url = "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc" - attrs = ncattrs(url) - assert "access_urls" in attrs +@responses.activate +def test_catalog_url(url): + responses._add_from_file(file_path=DATA / "responses.yaml") + + link, ds = catalog_url(url) + resp = requests.get(link) + resp.raise_for_status() + + +@pytest.mark.parametrize("url", TEST_NC_URLS) +@responses.activate +def test_access_urls(url): + responses._add_from_file(file_path=DATA / "responses.yaml") + + link, ds = catalog_url(url) + urls = access_urls(link, ds) + assert "NcML" in urls + # assert "OPENDAP" in urls + assert "HTTPServer" in urls + + +@pytest.mark.parametrize("url", TEST_NC_URLS) +@responses.activate +def test_ncml_attrs(url): + responses._add_from_file(file_path=DATA / "responses.yaml") + + link, ds = catalog_url(url) + urls = access_urls(link, ds) + attrs = ncml_attrs(urls["NcML"]) + assert "attributes" in attrs + + +@pytest.mark.parametrize("url", TEST_NC_URLS) +@responses.activate +def test_ds_attrs(url): + responses._add_from_file(file_path=DATA / "responses.yaml") + + attrs = ds_attrs(url) + + assert "attributes" in attrs + assert "access_urls" in attrs + + + diff --git a/tests/test_standalone_stac_item.py b/tests/test_standalone_stac_item.py index 5d243ce..8fe1ac2 100644 --- a/tests/test_standalone_stac_item.py +++ b/tests/test_standalone_stac_item.py @@ -3,34 +3,44 @@ import os 
import tempfile from urllib.parse import quote +import responses +from pathlib import Path from STACpopulator.implementations.CMIP6_UofT.add_CMIP6 import CMIP6ItemProperties, CMIP6populator from STACpopulator.input import THREDDSLoader from STACpopulator.models import GeoJSONPolygon -from STACpopulator.stac_utils import STAC_item_from_metadata, ncattrs -from pystac.validation import JsonSchemaSTACValidator -from pystac import STACObjectType +from STACpopulator.stac_utils import STAC_item_from_metadata, ds_attrs +import pystac +from conftest import reference_path_from_url, DATA CUR_DIR = os.path.dirname(__file__) TEST_NC_URLS = ["https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/birdhouse/testdata/xclim/cmip6" "/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc", ] - def quote_none_safe(url): return quote(url, safe="") @pytest.mark.parametrize("url", TEST_NC_URLS) -@pytest.mark.online +@responses.activate def test_standalone_stac_item_thredds_ncml(url): - attrs = ncattrs(url) + responses._add_from_file(file_path=DATA / "responses.yaml") + attrs = ds_attrs(url) stac_item_id = CMIP6populator.make_cmip6_item_id(attrs["attributes"]) stac_item = STAC_item_from_metadata(stac_item_id, attrs, CMIP6ItemProperties, GeoJSONPolygon) + + assert isinstance(stac_item, pystac.Item) + schemas_validated = stac_item.validate() assert len(schemas_validated) >= 1 assert "item.json" in schemas_validated[0] + # Compare with stored item dict + path = reference_path_from_url(url) + expected = json.loads(path.read_text()) + assert stac_item.to_dict() == expected + class MockedNoSTACUpload(CMIP6populator): def load_config(self):