diff --git a/contributor_folders/aidan/__pycache__/dataset.cpython-312.pyc b/contributor_folders/aidan/__pycache__/dataset.cpython-312.pyc index 48c2861..a6342db 100644 Binary files a/contributor_folders/aidan/__pycache__/dataset.cpython-312.pyc and b/contributor_folders/aidan/__pycache__/dataset.cpython-312.pyc differ diff --git a/contributor_folders/aidan/dataset.py b/contributor_folders/aidan/dataset.py index 6c29316..d2e5000 100644 --- a/contributor_folders/aidan/dataset.py +++ b/contributor_folders/aidan/dataset.py @@ -14,13 +14,19 @@ class SpatialBounds(BaseModel): class Access(BaseModel): platform: str path: str +<<<<<<< HEAD +======= access_function: Optional[str] = "" other_args: Optional[dict] = {} +>>>>>>> aa213b2d152fb29a697d7271cdf076f9f2c1a546 class Variable(BaseModel): standard_name: str description: str +<<<<<<< HEAD +======= units: str +>>>>>>> aa213b2d152fb29a697d7271cdf076f9f2c1a546 class Variables(BaseModel): variables: List[Variable] diff --git a/contributor_folders/aidan/datasets.ipynb b/contributor_folders/aidan/datasets.ipynb index b748633..167ba13 100644 --- a/contributor_folders/aidan/datasets.ipynb +++ b/contributor_folders/aidan/datasets.ipynb @@ -2,7 +2,11 @@ "cells": [ { "cell_type": "code", +<<<<<<< HEAD + "execution_count": 2, +======= "execution_count": 1, +>>>>>>> aa213b2d152fb29a697d7271cdf076f9f2c1a546 "id": "f571ec6c-d60e-4049-a198-25b4dad1b7bb", "metadata": {}, "outputs": [], @@ -13,11 +17,36 @@ }, { "cell_type": "code", +<<<<<<< HEAD + "execution_count": 3, +======= "execution_count": 2, +>>>>>>> aa213b2d152fb29a697d7271cdf076f9f2c1a546 "id": "e1f9d631-4ba9-435c-bddf-97962db3ccce", "metadata": {}, "outputs": [], "source": [ +<<<<<<< HEAD + "d = Dataset(\n", + " name=\"Multi-Scale Ultra High Resolution (MUR) Sea Surface Temperature (SST)\",\n", + " description=\"A global, gap-free, gridded, daily 1 km Sea Surface Temperature (SST) dataset created by merging multiple Level-2 satellite SST datasets. Those input datasets include the NASA Advanced Microwave Scanning Radiometer-EOS (AMSR-E), the JAXA Advanced Microwave Scanning Radiometer 2 (AMSR-2) on GCOM-W1, the Moderate Resolution Imaging Spectroradiometers (MODIS) on the NASA Aqua and Terra platforms, the US Navy microwave WindSat radiometer, the Advanced Very High Resolution Radiometer (AVHRR) on several NOAA satellites, and in situ SST observations from the NOAA iQuam project. Data are available from 2002 to present in Zarr format. The original source of the MUR data is the NASA JPL Physical Oceanography DAAC.\",\n", + " spatial_bounds=SpatialBounds(\n", + " min_lat=1.0,\n", + " min_lon=1.0,\n", + " max_lat=1.0,\n", + " max_lon=1.0\n", + " ),\n", + " temporal_bounds=TemporalBounds(\n", + " start_time=\"1234\",\n", + " end_time=\"4567\"\n", + " ),\n", + " variables=Variables(\n", + " variables=[Variable(standard_name=\"water temp\", description=\"how hot da water\")]\n", + " ),\n", + " access=Access(\n", + " platform=\"aws\",\n", + " path=\"s3://path_to_file.zarr\"\n", +======= "mur = Dataset(\n", " name=\"GHRSST Level 4 MUR Global Foundation Sea Surface Temperature Analysis (v4.1)\",\n", " description=\"The GHRSST MUR Level 4 sea surface temperature dataset provides global 0.01° analyses using wavelet-based optimal interpolation, combining nighttime SST observations from multiple satellite instruments and in situ sources, with retrospective (four-day latency) and near-real-time (one-day latency) products. It also includes ice concentration data for high-latitude SST improvements, SST anomalies, and the temporal distance to the nearest IR measurement for each pixel.\",\n", @@ -74,12 +103,41 @@ " platform=\"aws\",\n", " path=\"s3://mur-sst/zarr-v1/\",\n", " access_function=\"load_mur\"\n", +>>>>>>> aa213b2d152fb29a697d7271cdf076f9f2c1a546 " )\n", ")" ] }, { "cell_type": "code", +<<<<<<< HEAD + "execution_count": 4, + "id": "0440e690-0fc9-48af-a10a-41283d7bc009", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'name': 'Multi-Scale Ultra High Resolution (MUR) Sea Surface Temperature (SST)',\n", + " 'description': 'A global, gap-free, gridded, daily 1 km Sea Surface Temperature (SST) dataset created by merging multiple Level-2 satellite SST datasets. Those input datasets include the NASA Advanced Microwave Scanning Radiometer-EOS (AMSR-E), the JAXA Advanced Microwave Scanning Radiometer 2 (AMSR-2) on GCOM-W1, the Moderate Resolution Imaging Spectroradiometers (MODIS) on the NASA Aqua and Terra platforms, the US Navy microwave WindSat radiometer, the Advanced Very High Resolution Radiometer (AVHRR) on several NOAA satellites, and in situ SST observations from the NOAA iQuam project. Data are available from 2002 to present in Zarr format. The original source of the MUR data is the NASA JPL Physical Oceanography DAAC.',\n", + " 'temporal_bounds': {'start_time': '1234', 'end_time': '4567'},\n", + " 'spatial_bounds': {'min_lat': 1.0,\n", + " 'min_lon': 1.0,\n", + " 'max_lat': 1.0,\n", + " 'max_lon': 1.0},\n", + " 'variables': {'variables': [{'standard_name': 'water temp',\n", + " 'description': 'how hot da water'}]},\n", + " 'access': {'platform': 'aws', 'path': 's3://path_to_file.zarr'}}" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "json.loads(d.json())" +======= "execution_count": 3, "id": "d600f8e8-6e99-448e-85f2-c24e1255f908", "metadata": {}, @@ -249,11 +307,15 @@ " )\n", "\n", ")" +>>>>>>> aa213b2d152fb29a697d7271cdf076f9f2c1a546 ] }, { "cell_type": "code", "execution_count": 5, +<<<<<<< HEAD + "id": "ed735e0c-28b6-41e0-a36e-e551627b1af5", +======= "id": "c617b4e2-588a-41f3-9944-e8db06ac04fe", "metadata": {}, "outputs": [], @@ -325,11 +387,27 @@ "cell_type": "code", "execution_count": 7, "id": "05959572-9556-4a9a-aa9a-ee11e296575d", +>>>>>>> aa213b2d152fb29a697d7271cdf076f9f2c1a546 "metadata": {}, "outputs": [ { "data": { "text/plain": [ +<<<<<<< HEAD + "{'datasets': [{'name': 'Multi-Scale Ultra High Resolution (MUR) Sea Surface Temperature (SST)',\n", + " 'description': 'A global, gap-free, gridded, daily 1 km Sea Surface Temperature (SST) dataset created by merging multiple Level-2 satellite SST datasets. Those input datasets include the NASA Advanced Microwave Scanning Radiometer-EOS (AMSR-E), the JAXA Advanced Microwave Scanning Radiometer 2 (AMSR-2) on GCOM-W1, the Moderate Resolution Imaging Spectroradiometers (MODIS) on the NASA Aqua and Terra platforms, the US Navy microwave WindSat radiometer, the Advanced Very High Resolution Radiometer (AVHRR) on several NOAA satellites, and in situ SST observations from the NOAA iQuam project. Data are available from 2002 to present in Zarr format. The original source of the MUR data is the NASA JPL Physical Oceanography DAAC.',\n", + " 'temporal_bounds': {'start_time': '1234', 'end_time': '4567'},\n", + " 'spatial_bounds': {'min_lat': 1.0,\n", + " 'min_lon': 1.0,\n", + " 'max_lat': 1.0,\n", + " 'max_lon': 1.0},\n", + " 'variables': {'variables': [{'standard_name': 'water temp',\n", + " 'description': 'how hot da water'}]},\n", + " 'access': {'platform': 'aws', 'path': 's3://path_to_file.zarr'}}]}" + ] + }, + "execution_count": 5, +======= "{'datasets': [{'name': 'Indian Ocean grid',\n", " 'description': 'Our Indian Ocean IO.zarr is a 1972-2022 blended dataset for the Arabian Sea and Bay of Bengal formated as a .zarr file, containing daily cleaned and interpolated data from variables across multiple sources, mostly from processed NASA/NOAA and Copernicus collections and the ERA5 reanalysis products.',\n", " 'temporal_bounds': {'start_time': '1979-01-01', 'end_time': '2022-12-31'},\n", @@ -521,11 +599,15 @@ ] }, "execution_count": 7, +>>>>>>> aa213b2d152fb29a697d7271cdf076f9f2c1a546 "metadata": {}, "output_type": "execute_result" } ], "source": [ +<<<<<<< HEAD + "json.loads(DatasetCollection(datasets=[d]).json())" +======= "json.loads(dataset_collection.json())" ] }, @@ -540,12 +622,17 @@ "\n", "with open(dataset_path, \"w\") as f:\n", " f.write(dataset_collection.model_dump_json(indent=2)) " +>>>>>>> aa213b2d152fb29a697d7271cdf076f9f2c1a546 ] }, { "cell_type": "code", "execution_count": null, +<<<<<<< HEAD + "id": "cca20b5c-7b2d-4b2d-844c-011d301c2ba8", +======= "id": "e71e91c1-07dd-48cb-bbbe-afbfb83c7f07", +>>>>>>> aa213b2d152fb29a697d7271cdf076f9f2c1a546 "metadata": {}, "outputs": [], "source": [] diff --git a/contributor_folders/aidan/explore_langchain.ipynb b/contributor_folders/aidan/explore_langchain.ipynb index 928d92c..e9d7a78 100644 --- a/contributor_folders/aidan/explore_langchain.ipynb +++ b/contributor_folders/aidan/explore_langchain.ipynb @@ -115,7 +115,11 @@ }, { "cell_type": "code", +<<<<<<< HEAD + "execution_count": 14, +======= "execution_count": 2, +>>>>>>> aa213b2d152fb29a697d7271cdf076f9f2c1a546 "id": "c48c0b5a-95e0-45ed-bca7-033baecac369", "metadata": {}, "outputs": [], @@ -125,7 +129,11 @@ }, { "cell_type": "code", +<<<<<<< HEAD + "execution_count": 15, +======= "execution_count": 3, +>>>>>>> aa213b2d152fb29a697d7271cdf076f9f2c1a546 "id": "0001cc2c-3fd2-4665-9f7f-7dfabf86fb44", "metadata": {}, "outputs": [], @@ -135,7 +143,11 @@ }, { "cell_type": "code", +<<<<<<< HEAD + "execution_count": 16, +======= "execution_count": 4, +>>>>>>> aa213b2d152fb29a697d7271cdf076f9f2c1a546 "id": "86001908-aa14-44b5-9f56-06e7bbe12970", "metadata": {}, "outputs": [], @@ -158,19 +170,32 @@ }, { "cell_type": "code", +<<<<<<< HEAD + "execution_count": null, +======= "execution_count": 7, +>>>>>>> aa213b2d152fb29a697d7271cdf076f9f2c1a546 "id": "3acde5d4-3299-4a30-8624-2f40af33bf17", "metadata": {}, "outputs": [], "source": [ +<<<<<<< HEAD + "read_arraylake_dataset(\n", + " \"earthmover-public/era5-surface-aws\",\n", + " \"spatial\",\n", + " \"sst\"\n", +======= "ds = read_arraylake_dataset(\n", " \"earthmover-public/era5-surface-aws\",\n", " \"spatial\"\n", +>>>>>>> aa213b2d152fb29a697d7271cdf076f9f2c1a546 ")" ] }, { "cell_type": "code", +<<<<<<< HEAD +======= "execution_count": 9, "id": "8a4c3fb5-5ccc-49ae-a523-86cfb11e0dba", "metadata": {}, @@ -3543,6 +3568,7 @@ }, { "cell_type": "code", +>>>>>>> aa213b2d152fb29a697d7271cdf076f9f2c1a546 "execution_count": 20, "id": "00fff333-8015-4776-92bc-88700297235a", "metadata": {}, diff --git a/contributor_folders/finn/data_readin.ipynb b/contributor_folders/finn/data_readin.ipynb index 2ea91de..f6972b9 100644 --- a/contributor_folders/finn/data_readin.ipynb +++ b/contributor_folders/finn/data_readin.ipynb @@ -8,12 +8,9 @@ "outputs": [], "source": [ "from __future__ import annotations\n", - "from typing import Tuple\n", - "\n", + "from typing import Optional, Union, Tuple, Dict, Any\n", "import xarray as xr\n", - "import numpy as np\n", - "import cf_xarray\n", - "import s3fs, gcsfs, fsspec, zarr" + "import numpy as np" ] }, { @@ -25,9 +22,6 @@ "source": [ "### helper functions to normalize coords\n", "\n", - "# --------------------- helpers (unchanged from previous) --------------------- #\n", - "# Note: These helpers are kept as they perform general-purpose coordinate and variable handling.\n", - "\n", "def _select_variable(ds: xr.Dataset, var: Union[str, Dict[str, str]]) -> str:\n", " \"\"\"\n", " Pick a variable name from a Dataset.\n", @@ -60,15 +54,22 @@ " raise KeyError(f\"Could not locate variable from hints {var}. Variables: {list(ds.data_vars)}\")\n", "\n", "\n", - "def _normalize_coord_names(ds: xr.Dataset) -> xr.Dataset:\n", + "def _get_coord_names(ds: xr.Dataset) -> Tuple[str, str]:\n", " \"\"\"\n", - " Standardize coordinate names to 'latitude', 'longitude', 'time'.\n", + " Get the longitude and latitude coordinate names from the dataset.\n", + " Supports both long ('longitude', 'latitude') and short ('lon', 'lat') names.\n", + " \n", + " Returns\n", + " -------\n", + " tuple of (lon_name, lat_name)\n", " \"\"\"\n", - " rename_map = {}\n", - " for alias, standard in {\"lat\": \"latitude\", \"lon\": \"longitude\"}.items():\n", - " if alias in ds.coords and standard not in ds.coords:\n", - " rename_map[alias] = standard\n", - " return ds.rename(rename_map) if rename_map else ds\n", + " lon_name = next((name for name in ['longitude', 'lon'] if name in ds.coords), None)\n", + " lat_name = next((name for name in ['latitude', 'lat'] if name in ds.coords), None)\n", + " \n", + " if not lon_name or not lat_name:\n", + " raise ValueError(f\"Could not find longitude/latitude coordinates. Found: {list(ds.coords)}\")\n", + " \n", + " return lon_name, lat_name\n", "\n", "\n", "def _infer_target_lon_frame(lon_min: float, lon_max: float) -> str:\n", @@ -81,11 +82,11 @@ "def _coerce_longitudes(ds: xr.Dataset, target_frame: str, assume_frame: Optional[str] = None) -> xr.Dataset:\n", " \"\"\"\n", " Coerce dataset longitudes to a target frame ('0-360' or '-180-180').\n", + " Works with either 'longitude' or 'lon' coordinate names.\n", " \"\"\"\n", - " if \"longitude\" not in ds.coords:\n", - " return ds\n", - "\n", - " lon = ds[\"longitude\"].values\n", + " lon_name, _ = _get_coord_names(ds)\n", + " \n", + " lon = ds[lon_name].values\n", " if assume_frame:\n", " current = assume_frame\n", " else:\n", @@ -99,159 +100,7412 @@ " else: # target is -180-180\n", " lon_new = ((lon + 180) % 360) - 180\n", " \n", - " ds = ds.assign_coords(longitude=lon_new)\n", - " return ds.sortby(\"longitude\")\n", + " ds = ds.assign_coords({lon_name: lon_new})\n", + " return ds.sortby(lon_name)\n", "\n", "\n", "def _ensure_lat_monotonic(ds: xr.Dataset) -> xr.Dataset:\n", " \"\"\"\n", " Ensures the latitude coordinate is monotonically increasing.\n", + " Works with either 'latitude' or 'lat' coordinate names.\n", " \"\"\"\n", - " if \"latitude\" in ds.coords and ds[\"latitude\"].ndim == 1 and ds[\"latitude\"].values[0] > ds[\"latitude\"].values[-1]:\n", - " return ds.sortby(\"latitude\")\n", + " _, lat_name = _get_coord_names(ds)\n", + " \n", + " if ds[lat_name].ndim == 1 and ds[lat_name].values[0] > ds[lat_name].values[-1]:\n", + " return ds.sortby(lat_name)\n", " return ds\n", "\n", "\n", "def _slice_longitude(ds: xr.Dataset, lon_min: float, lon_max: float) -> xr.Dataset:\n", " \"\"\"\n", " Slice longitude robustly, handling wrap-around for ranges like 350E to 10E.\n", + " Works with either 'longitude' or 'lon' coordinate names.\n", " \"\"\"\n", + " lon_name, _ = _get_coord_names(ds)\n", + " \n", " if lon_min <= lon_max:\n", - " return ds.sel(longitude=slice(lon_min, lon_max))\n", + " return ds.sel(**{lon_name: slice(lon_min, lon_max)})\n", " \n", - " lon = ds[\"longitude\"]\n", - " part1 = ds.sel(longitude=slice(lon_min, float(lon.max())))\n", - " part2 = ds.sel(longitude=slice(float(lon.min()), lon_max))\n", - " return xr.concat([part1, part2], dim=\"longitude\")" + " lon = ds[lon_name]\n", + " part1 = ds.sel(**{lon_name: slice(lon_min, float(lon.max()))})\n", + " part2 = ds.sel(**{lon_name: slice(float(lon.min()), lon_max)})\n", + " return xr.concat([part1, part2], dim=lon_name)" ] }, { "cell_type": "code", "execution_count": 3, - "id": "d5ebf53d-efcf-4498-aac2-86dba953ab22", + "id": "6391de80-3409-4398-82cc-5a880af149d3", "metadata": {}, "outputs": [], "source": [ - "def load_aws_dataset(\n", - " s3_path: str,\n", - " variable_of_interest: Union[str, Dict[str, str]],\n", - " region_of_interest: Optional[Dict[str, float]] = None,\n", - " time_of_interest: Optional[Union[slice, Tuple[str, str]]] = None,\n", + "def load_climate_data(\n", + " cloud_path: str,\n", + " variable: Union[str, Dict[str, str]],\n", + " lon_range: Optional[Tuple[float, float]] = None,\n", + " lat_range: Optional[Tuple[float, float]] = None,\n", " *,\n", - " group: Optional[str] = None,\n", - " consolidated: Optional[bool] = None,\n", - " chunks: Optional[Dict] = None,\n", - " assume_lon: Optional[str] = None, # \"0-360\" or \"-180-180\" if you know...\n", - " return_dataset: bool = False,\n", - " save_to: Optional[Union[str, pathlib.Path]] = None,\n", - ") -> Union[xr.DataArray, xr.Dataset]:\n", + " time_range: Optional[Tuple[str, str]] = None,\n", + " resample_to: Optional[str] = None,\n", + " chunks: Optional[Dict[str, int]] = None,\n", + "):\n", " \"\"\"\n", - " Load and subset a Zarr dataset from a public AWS S3 bucket.\n", - "\n", + " Load climate data from cloud storage (S3 or GCS) with consistent processing.\n", + " \n", " Parameters\n", " ----------\n", - " s3_path:\n", - " The full S3 path to the Zarr store (e.g., \"s3://era5-pds/zarr/...\").\n", - " variable_of_interest:\n", - " - Name of the variable in the dataset (e.g., \"sst\", \"tos\", \"t2m\"), OR\n", - " - A mapping of CF/long-name hints to try, e.g.:\n", - " {\"standard_name\": \"sea_surface_temperature\"}\n", - " region_of_interest:\n", - " Dict with geographic bounds: {\"lat_min\": -90, \"lat_max\": 90, \"lon_min\": 0, \"lon_max\": 360}.\n", - " Longitudes may be 0–360 or −180–180. Function will reconcile.\n", - " time_of_interest:\n", - " Either a Python slice (e.g., slice(\"1990-01-01\",\"2000-12-31\")) or a 2-tuple of ISO strings.\n", - " group:\n", - " Zarr group within the store (e.g., \"spatial\" for ERA5).\n", - " consolidated:\n", - " Whether the Zarr store is consolidated. If None, attempts sensible defaults.\n", - " chunks:\n", - " Dask chunking dict, e.g., {\"time\": 2400}.\n", - " assume_lon:\n", - " If set, forces interpretation of dataset longitudes as \"0-360\" or \"-180-180\".\n", - " return_dataset:\n", - " If True, return the full Dataset. Otherwise return the selected DataArray.\n", - " save_to:\n", - " Optional path to save the subset as NetCDF.\n", - "\n", + " cloud_path : str\n", + " Full URL to the Zarr store (e.g., \"s3://...\" or \"gs://...\")\n", + " variable : str or dict\n", + " Variable name or CF-style selector (e.g., {\"standard_name\": \"air_temperature\"})\n", + " lon_range : tuple of float\n", + " (min_longitude, max_longitude) in dataset's native frame\n", + " lat_range : tuple of float\n", + " (min_latitude, max_latitude)\n", + " time_range : tuple of str, optional\n", + " (start_date, end_date) as ISO strings\n", + " convert_kelvin_to_celsius : bool, default True\n", + " If True, convert temperature data from Kelvin to Celsius\n", + " resample_to : str, optional\n", + " If provided, resample time dimension (e.g., \"MS\" for month start)\n", + " chunks : dict, optional\n", + " Dask chunks specification (e.g., {\"time\": 1024})\n", + " \n", " Returns\n", " -------\n", - " xr.DataArray or xr.Dataset\n", - " The subsetted data.\n", + " xr.Dataset\n", + " Processed dataset with consistent dimensions\n", " \"\"\"\n", - " # normalize input params\n", - " if isinstance(time_of_interest, tuple):\n", - " time_of_interest = slice(time_of_interest[0], time_of_interest[1])\n", - "\n", - " region = region_of_interest or {}\n", - " lat_min = region.get(\"lat_min\", None)\n", - " lat_max = region.get(\"lat_max\", None)\n", - " lon_min = region.get(\"lon_min\", None)\n", - " lon_max = region.get(\"lon_max\", None)\n", - "\n", - " # config aws\n", - " storage_options = {\"anon\": True}\n", - " if consolidated is None:\n", - " consolidated = False if (group is not None and \"era5\" in s3_path.lower()) else True\n", - "\n", - " # open dataset\n", + " \n", + " # Open dataset\n", " ds = xr.open_dataset(\n", - " s3_path,\n", + " cloud_path,\n", " engine=\"zarr\",\n", " chunks=chunks,\n", - " consolidated=consolidated,\n", - " backend_kwargs={\n", - " \"storage_options\": storage_options,\n", - " **({\"group\": group} if group else {}),\n", - " },\n", " )\n", + " \n", + " # Get coordinate names\n", + " lon_name, lat_name = _get_coord_names(ds)\n", + " \n", + " # Subset space and time\n", + " region = {}\n", + " if lon_range is not None and lat_range is not None:\n", + " region.update({\n", + " lon_name: slice(*lon_range),\n", + " lat_name: slice(*lat_range)\n", + " })\n", + " if time_range is not None:\n", + " region[\"time\"] = slice(*time_range)\n", + " \n", + " # Only apply selection if we have regions to subset\n", + " if region:\n", + " ds = ds.sel(**region)\n", + " \n", + " # Handle longitude frame and monotonic latitude\n", + " # Handle longitude frame and monotonic latitude\n", + " if lon_range is not None:\n", + " target_frame = _infer_target_lon_frame(*lon_range)\n", + " ds = _coerce_longitudes(ds, target_frame)\n", + " ds = _ensure_lat_monotonic(ds)\n", + " \n", + " # Optional time resampling\n", + " if resample_to:\n", + " ds = ds.resample(time=resample_to).mean()\n", + " \n", + " # Ensure consistent dimension order\n", + " # Get available dimensions\n", + " dims = list(ds.dims)\n", + " # Core dims we want first (if they exist)\n", + " core_dims = [\"time\", \"latitude\", \"longitude\"]\n", + " # Filter out core dims that actually exist\n", + " core_dims = [d for d in core_dims if d in dims]\n", + " # Add any remaining dims at the end\n", + " other_dims = [d for d in dims if d not in core_dims]\n", + " # Combine for final ordering\n", + " final_dims = core_dims + other_dims\n", + " \n", + " ds = ds.transpose(*final_dims)\n", "\n", - " # select variable (cf-aware if possible)\n", - " var = _select_variable(ds, variable_of_interest)\n", - "\n", - " # normalize coordinate names\n", - " ds = _normalize_coord_names(ds)\n", - "\n", - " # fix lon to desired slicing frame, if needed \n", - " if (lon_min is not None) and (lon_max is not None):\n", - " ds = _coerce_longitudes(ds, target_frame=_infer_target_lon_frame(lon_min, lon_max), assume_frame=assume_lon)\n", - "\n", - " # wnsure latitude selection works if lat is descending (ERA5 style)\n", - " if (lat_min is not None) and (lat_max is not None):\n", - " ds = _ensure_lat_monotonic(ds)\n", - "\n", - " # apply coord selections\n", - " sel = ds\n", - " if time_of_interest is not None and (\"time\" in sel.dims or \"time\" in sel.coords):\n", - " sel = sel.sel(time=time_of_interest)\n", - " if (lat_min is not None) and (lat_max is not None) and \"latitude\" in sel.coords:\n", - " sel = sel.sel(latitude=slice(min(lat_min, lat_max), max(lat_min, lat_max)))\n", - " if (lon_min is not None) and (lon_max is not None) and \"longitude\" in sel.coords:\n", - " sel = _slice_longitude(sel, lon_min, lon_max)\n", - "\n", - " # return subset da/ds\n", - " out = sel if return_dataset else sel[var]\n", - " if save_to is not None:\n", - " save_path = pathlib.Path(save_to).expanduser().resolve()\n", - " save_path.parent.mkdir(parents=True, exist_ok=True)\n", - " out.to_netcdf(save_path)\n", - " return out" + " if variable:\n", + " var = _select_variable(ds, variable)\n", + " ds = ds[var]\n", + " \n", + " return ds" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "a4811e9a-5912-4094-90ed-9d6c478c4a7b", + "metadata": {}, + "outputs": [], + "source": [ + "era5_data = load_climate_data(\n", + " cloud_path=\"gs://weatherbench2/datasets/era5/1959-2023_01_10-6h-240x121_equiangular_with_poles_conservative.zarr\",\n", + " variable=None,\n", + " lon_range=(0, 90), \n", + " lat_range=(-20, 60), \n", + " time_range=(\"2020-01-01\", \"2020-12-31\"),\n", + " resample_to=\"MS\", \n", + " chunks={\"time\": 1024})" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "e1d1e0db-a5b1-40a0-b43c-91a46fc95c0f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset> Size: 38MB\n",
+       "Dimensions:                                           (time: 12, longitude: 61,\n",
+       "                                                       latitude: 54, level: 13)\n",
+       "Coordinates:\n",
+       "  * latitude                                          (latitude) float64 432B ...\n",
+       "  * level                                             (level) int64 104B 50 ....\n",
+       "  * longitude                                         (longitude) float64 488B ...\n",
+       "  * time                                              (time) datetime64[ns] 96B ...\n",
+       "Data variables: (12/62)\n",
+       "    10m_u_component_of_wind                           (time, latitude, longitude) float32 158kB dask.array<chunksize=(12, 54, 61), meta=np.ndarray>\n",
+       "    10m_v_component_of_wind                           (time, latitude, longitude) float32 158kB dask.array<chunksize=(12, 54, 61), meta=np.ndarray>\n",
+       "    10m_wind_speed                                    (time, latitude, longitude) float32 158kB dask.array<chunksize=(12, 54, 61), meta=np.ndarray>\n",
+       "    2m_dewpoint_temperature                           (time, latitude, longitude) float32 158kB dask.array<chunksize=(12, 54, 61), meta=np.ndarray>\n",
+       "    2m_temperature                                    (time, latitude, longitude) float32 158kB dask.array<chunksize=(12, 54, 61), meta=np.ndarray>\n",
+       "    above_ground                                      (time, latitude, longitude, level) float32 2MB dask.array<chunksize=(12, 54, 61, 13), meta=np.ndarray>\n",
+       "    ...                                                ...\n",
+       "    slope_of_sub_gridscale_orography                  (time, latitude, longitude) float32 158kB dask.array<chunksize=(12, 54, 61), meta=np.ndarray>\n",
+       "    soil_type                                         (time, latitude, longitude) float32 158kB dask.array<chunksize=(12, 54, 61), meta=np.ndarray>\n",
+       "    standard_deviation_of_filtered_subgrid_orography  (time, latitude, longitude) float32 158kB dask.array<chunksize=(12, 54, 61), meta=np.ndarray>\n",
+       "    standard_deviation_of_orography                   (time, latitude, longitude) float32 158kB dask.array<chunksize=(12, 54, 61), meta=np.ndarray>\n",
+       "    type_of_high_vegetation                           (time, latitude, longitude) float32 158kB dask.array<chunksize=(12, 54, 61), meta=np.ndarray>\n",
+       "    type_of_low_vegetation                            (time, latitude, longitude) float32 158kB dask.array<chunksize=(12, 54, 61), meta=np.ndarray>
" + ], + "text/plain": [ + " Size: 38MB\n", + "Dimensions: (time: 12, longitude: 61,\n", + " latitude: 54, level: 13)\n", + "Coordinates:\n", + " * latitude (latitude) float64 432B ...\n", + " * level (level) int64 104B 50 ....\n", + " * longitude (longitude) float64 488B ...\n", + " * time (time) datetime64[ns] 96B ...\n", + "Data variables: (12/62)\n", + " 10m_u_component_of_wind (time, latitude, longitude) float32 158kB dask.array\n", + " 10m_v_component_of_wind (time, latitude, longitude) float32 158kB dask.array\n", + " 10m_wind_speed (time, latitude, longitude) float32 158kB dask.array\n", + " 2m_dewpoint_temperature (time, latitude, longitude) float32 158kB dask.array\n", + " 2m_temperature (time, latitude, longitude) float32 158kB dask.array\n", + " above_ground (time, latitude, longitude, level) float32 2MB dask.array\n", + " ... ...\n", + " slope_of_sub_gridscale_orography (time, latitude, longitude) float32 158kB dask.array\n", + " soil_type (time, latitude, longitude) float32 158kB dask.array\n", + " standard_deviation_of_filtered_subgrid_orography (time, latitude, longitude) float32 158kB dask.array\n", + " standard_deviation_of_orography (time, latitude, longitude) float32 158kB dask.array\n", + " type_of_high_vegetation (time, latitude, longitude) float32 158kB dask.array\n", + " type_of_low_vegetation (time, latitude, longitude) float32 158kB dask.array" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "era5_data" ] }, { "cell_type": "code", - "execution_count": null, - "id": "3b9e2d45-2f97-40d3-9596-634bf031320a", + "execution_count": 9, + "id": "9b05cb01-a62f-4570-bf06-305d7e887b0a", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "chl_data = load_climate_data(\n", + " cloud_path=\"gcs://nmfs_odp_nwfsc/CB/mind_the_chl_gap/IO.zarr\",\n", + " variable=None,\n", + " time_range=(\"2020-01-01\", \"2020-03-31\"),\n", + " resample_to=\"MS\",)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "4d0835c4-fe09-472a-a0a5-055bac4a86ae", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset> Size: 14MB\n",
+       "Dimensions:                       (time: 3, lat: 177, lon: 241)\n",
+       "Coordinates:\n",
+       "  * lat                           (lat) float32 708B -12.0 -11.75 ... 31.75 32.0\n",
+       "  * lon                           (lon) float32 964B 42.0 42.25 ... 101.8 102.0\n",
+       "  * time                          (time) datetime64[ns] 24B 2020-01-01 ... 20...\n",
+       "Data variables: (12/27)\n",
+       "    CHL                           (time, lat, lon) float32 512kB nan nan ... nan\n",
+       "    CHL_cmes-cloud                (time, lat, lon) float64 1MB 2.0 2.0 ... 2.0\n",
+       "    CHL_cmes-gapfree              (time, lat, lon) float32 512kB nan nan ... nan\n",
+       "    CHL_cmes-level3               (time, lat, lon) float32 512kB nan nan ... nan\n",
+       "    CHL_cmes_flags-gapfree        (time, lat, lon) float32 512kB nan nan ... nan\n",
+       "    CHL_cmes_flags-level3         (time, lat, lon) float32 512kB nan nan ... nan\n",
+       "    ...                            ...\n",
+       "    v_wind                        (time, lat, lon) float32 512kB -3.804 ... 0...\n",
+       "    vg_curr                       (time, lat, lon) float32 512kB -0.1609 ... nan\n",
+       "    wind_dir                      (time, lat, lon) float32 512kB -65.01 ... 6...\n",
+       "    wind_speed                    (time, lat, lon) float32 512kB 4.731 ... 0....\n",
+       "    CHL_cmes-land                 (time, lat, lon) uint8 128kB 2 2 2 2 ... 2 2 2\n",
+       "    topo                          (time, lat, lon) float64 1MB -2.658e+03 ......\n",
+       "Attributes: (12/92)\n",
+       "    Conventions:                     CF-1.8, ACDD-1.3\n",
+       "    DPM_reference:                   GC-UD-ACRI-PUG\n",
+       "    IODD_reference:                  GC-UD-ACRI-PUG\n",
+       "    acknowledgement:                 The Licensees will ensure that original ...\n",
+       "    citation:                        The Licensees will ensure that original ...\n",
+       "    cmems_product_id:                OCEANCOLOUR_GLO_BGC_L3_MY_009_103\n",
+       "    ...                              ...\n",
+       "    time_coverage_end:               2024-04-18T02:58:23Z\n",
+       "    time_coverage_resolution:        P1D\n",
+       "    time_coverage_start:             2024-04-16T21:12:05Z\n",
+       "    title:                           cmems_obs-oc_glo_bgc-plankton_my_l3-mult...\n",
+       "    westernmost_longitude:           -180.0\n",
+       "    westernmost_valid_longitude:     -180.0
" + ], + "text/plain": [ + " Size: 14MB\n", + "Dimensions: (time: 3, lat: 177, lon: 241)\n", + "Coordinates:\n", + " * lat (lat) float32 708B -12.0 -11.75 ... 31.75 32.0\n", + " * lon (lon) float32 964B 42.0 42.25 ... 101.8 102.0\n", + " * time (time) datetime64[ns] 24B 2020-01-01 ... 20...\n", + "Data variables: (12/27)\n", + " CHL (time, lat, lon) float32 512kB nan nan ... nan\n", + " CHL_cmes-cloud (time, lat, lon) float64 1MB 2.0 2.0 ... 2.0\n", + " CHL_cmes-gapfree (time, lat, lon) float32 512kB nan nan ... nan\n", + " CHL_cmes-level3 (time, lat, lon) float32 512kB nan nan ... nan\n", + " CHL_cmes_flags-gapfree (time, lat, lon) float32 512kB nan nan ... nan\n", + " CHL_cmes_flags-level3 (time, lat, lon) float32 512kB nan nan ... nan\n", + " ... ...\n", + " v_wind (time, lat, lon) float32 512kB -3.804 ... 0...\n", + " vg_curr (time, lat, lon) float32 512kB -0.1609 ... nan\n", + " wind_dir (time, lat, lon) float32 512kB -65.01 ... 6...\n", + " wind_speed (time, lat, lon) float32 512kB 4.731 ... 0....\n", + " CHL_cmes-land (time, lat, lon) uint8 128kB 2 2 2 2 ... 2 2 2\n", + " topo (time, lat, lon) float64 1MB -2.658e+03 ......\n", + "Attributes: (12/92)\n", + " Conventions: CF-1.8, ACDD-1.3\n", + " DPM_reference: GC-UD-ACRI-PUG\n", + " IODD_reference: GC-UD-ACRI-PUG\n", + " acknowledgement: The Licensees will ensure that original ...\n", + " citation: The Licensees will ensure that original ...\n", + " cmems_product_id: OCEANCOLOUR_GLO_BGC_L3_MY_009_103\n", + " ... ...\n", + " time_coverage_end: 2024-04-18T02:58:23Z\n", + " time_coverage_resolution: P1D\n", + " time_coverage_start: 2024-04-16T21:12:05Z\n", + " title: cmems_obs-oc_glo_bgc-plankton_my_l3-mult...\n", + " westernmost_longitude: -180.0\n", + " westernmost_valid_longitude: -180.0" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chl_data" + ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python (Pixi)", "language": "python", - "name": "python3" + "name": "pixi-kernel-python3" }, "language_info": { "codemirror_mode": { diff --git a/contributor_folders/finn/testing_marimo.ipynb b/contributor_folders/finn/testing_marimo.ipynb deleted file mode 100644 index 8b13789..0000000 --- a/contributor_folders/finn/testing_marimo.ipynb +++ /dev/null @@ -1 +0,0 @@ -