Skip to content

Commit

Permalink
refactor metadata extraction and add flag to preview for metadata
Browse files Browse the repository at this point in the history
  • Loading branch information
satchelbaldwin committed Mar 25, 2024
1 parent 8c1085a commit a3b74c1
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 34 deletions.
35 changes: 35 additions & 0 deletions api/dataset/metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
from typing import Any
import xarray
import numpy


def extract_esgf_specific_fields(ds: xarray.Dataset) -> dict[str, Any]:
    """Map ESGF/CMIP global attributes of *ds* onto HMI dataset field names.

    Missing string-valued attributes fall back to "UNKNOWN"; a missing
    variable_id falls back to the empty string.
    """
    # output field -> (source attribute on the dataset, fallback value)
    field_sources = {
        "dataSourceDate": ("creation_date", "UNKNOWN"),
        "datasetUrl": ("further_info_url", "UNKNOWN"),
        "source": ("source", "UNKNOWN"),
        "variableId": ("variable_id", ""),
    }
    return {
        field: ds.attrs.get(attr, fallback)
        for field, (attr, fallback) in field_sources.items()
    }


def extract_metadata(ds: xarray.Dataset) -> dict[str, Any]:
    """Build a JSON-serializable metadata summary of *ds*.

    Returns a dict with:
      - "format": always "netcdf"
      - "dataStructure": per-variable attrs (numpy scalars unwrapped),
        index names, and coordinate names
      - "raw": the dataset-level attrs, numpy scalars unwrapped
    """

    def _plain(value):
        # numpy scalar types are not JSON-serializable; unwrap to native Python
        if isinstance(value, numpy.generic):
            return value.item()
        return value

    def _describe_variable(name):
        var = ds[name]
        attrs = {}
        for key in var.attrs:
            # _ChunkSizes is an unserializable ndarray, safely ignorable
            if key == "_ChunkSizes":
                continue
            attrs[key] = _plain(var.attrs[key])
        return {
            "attrs": attrs,
            "indexes": list(var.indexes.keys()),
            "coordinates": list(var.coords.keys()),
        }

    return {
        "format": "netcdf",
        "dataStructure": {name: _describe_variable(name) for name in ds.variables},
        "raw": {key: _plain(value) for key, value in ds.attrs.items()},
    }
36 changes: 4 additions & 32 deletions api/dataset/terarium_hmi.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import xarray
from api.dataset.models import DatasetSubsetOptions
from api.dataset.metadata import extract_metadata, extract_esgf_specific_fields
from api.search.providers.era5 import ERA5SearchData
from api.settings import default_settings
import requests
Expand Down Expand Up @@ -58,36 +59,10 @@ def enumerate_dataset_skeleton(
"userId": "",
"fileNames": [],
"columns": [],
"metadata": {
"format": "netcdf",
"metadata": extract_metadata(ds)
| {
"parentDatasetId": parent_id,
"variableId": ds.attrs.get("variable_id", ""),
"preview": preview,
"dataStructure": {
k: {
"attrs": {
ak: (
ds[k].attrs[ak].item()
if isinstance(ds[k].attrs[ak], numpy.generic)
else ds[k].attrs[ak]
)
for ak in ds[k].attrs
# _ChunkSizes is an unserializable ndarray, safely ignorable
if ak != "_ChunkSizes"
},
"indexes": [i for i in ds[k].indexes.keys()],
"coordinates": [i for i in ds[k].coords.keys()],
}
for k in ds.variables.keys()
},
"raw": {
k: (
ds.attrs[k].item()
if isinstance(ds.attrs[k], numpy.generic)
else ds.attrs[k]
)
for k in ds.attrs.keys()
},
},
"grounding": {},
}
Expand All @@ -112,10 +87,7 @@ def construct_hmi_dataset(
additional_fields = {
"name": f"{dataset_name}-subset-{subset_uuid}",
"description": generate_description(ds, dataset_id, opts),
"dataSourceDate": ds.attrs.get("creation_date", "UNKNOWN"),
"datasetUrl": ds.attrs.get("further_info_url", "UNKNOWN"),
"source": ds.attrs.get("source", "UNKNOWN"),
}
} | extract_esgf_specific_fields(ds)
additional_metadata = {
"parentDatasetId": parent_dataset_id,
"subsetDetails": repr(opts),
Expand Down
12 changes: 11 additions & 1 deletion api/preview/render.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import datetime
import io
import base64
from typing import Any
from api.search.provider import AccessURLs
import cartopy.crs as ccrs
import xarray
from api.dataset.metadata import extract_esgf_specific_fields, extract_metadata
from matplotlib import pyplot as plt
from api.dataset.remote import (
cleanup_potential_artifacts,
Expand All @@ -25,16 +27,24 @@ def render_preview_for_dataset(
variable_index: str = "",
time_index: str = "",
timestamps: str = "",
analyze: bool = False,
**kwargs,
):
job_id = kwargs["job_id"]
try:
ds: xarray.Dataset | None = None
extra_metadata_discovery: dict[str, Any] = {}
# AccessURLs list or UUID str -- UUID str is terarium handle.
if isinstance(dataset, list):
ds = open_dataset(dataset, job_id)
elif isinstance(dataset, str):
ds = open_remote_dataset_hmi(dataset, job_id)
if analyze:
print("attempting to extract more information", flush=True)
extra_metadata_discovery = {
"metadata": extract_metadata(ds) | extract_esgf_specific_fields(ds)
}

if timestamps != "":
if len(timestamps.split(",")) != 2:
return {
Expand All @@ -45,7 +55,7 @@ def render_preview_for_dataset(
except KeyError as e:
return {"error": f"{e}"}
cleanup_potential_artifacts(job_id)
return {"previews": png}
return {"previews": png} | extra_metadata_discovery
except IOError as e:
return {"error": f"upstream hosting is likely having a problem. {e}"}

Expand Down
3 changes: 2 additions & 1 deletion api/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ async def esgf_preview(
variable_id: str = "",
time_index: str = "",
timestamps: str = "",
analyze: bool = False,
redis=Depends(get_redis),
):
dataset = (
Expand All @@ -107,7 +108,7 @@ async def esgf_preview(
)
job = create_job(
func=render_preview_for_dataset,
args=[dataset, variable_id, time_index, timestamps],
args=[dataset, variable_id, time_index, timestamps, analyze],
redis=redis,
queue="preview",
)
Expand Down

0 comments on commit a3b74c1

Please sign in to comment.