From 7a9d15c9e13971d880cf219850c30b56d1ab026c Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Mon, 19 Jun 2023 17:57:48 -0400 Subject: [PATCH 01/33] WIP - major refactoring of ECCC --- miranda/eccc/convert.py | 131 ++++ .../eccc/data/eccc_homogenized_cf_attrs.json | 111 +++ .../eccc/data/eccc_obs_summary_cf_attrs.json | 173 +++++ miranda/eccc/eccc_obs_cf_attrs.json | 686 +++++++++++------- miranda/units.py | 1 + templates/eccc_raw_daily_conversion.py | 34 +- templates/eccc_raw_hourly_conversion.py | 68 +- 7 files changed, 909 insertions(+), 295 deletions(-) create mode 100644 miranda/eccc/convert.py create mode 100644 miranda/eccc/data/eccc_homogenized_cf_attrs.json create mode 100644 miranda/eccc/data/eccc_obs_summary_cf_attrs.json diff --git a/miranda/eccc/convert.py b/miranda/eccc/convert.py new file mode 100644 index 00000000..cc86cfe2 --- /dev/null +++ b/miranda/eccc/convert.py @@ -0,0 +1,131 @@ +"""Specialized conversion tools for Environment and Climate Change Canada / Meteorological Service of Canada data.""" + +from __future__ import annotations + +import json +import logging.config +import multiprocessing as mp +import os +import time +from functools import partial +from pathlib import Path + +from miranda.eccc._raw import _convert_station_file +from miranda.eccc._utils import cf_station_metadata +from miranda.scripting import LOGGING_CONFIG + +logging.config.dictConfig(LOGGING_CONFIG) + + +_data_folder = Path(__file__).parent / "data" + +eccc_observation_variables = dict() +eccc_observation_variables["flat"] = [ + v + for v in json.load(open(_data_folder / "eccc_obs_flat_attrs.json"))[ + "variables" + ].keys() +] +eccc_observation_variables["summary"] = [ + attrs["_cf_variable_name"] + for attrs in json.load(open(_data_folder / "eccc_obs_summary_cf_attrs.json"))[ + "variables" + ].values() +] +eccc_observation_variables["homogenized"] = [ + attrs["_cf_variable_name"] + for attrs in 
json.load(open(_data_folder / "eccc_homogenized_cf_attrs.json"))[ + "variables" + ].values() +] + + +def convert_flat_files( + source_files: str | os.PathLike, + output_folder: str | os.PathLike | list[str | int], + variables: str | int | list[str | int], + mode: str = "hourly", + n_workers: int = 4, +) -> None: + """ + + Parameters + ---------- + source_files: str or Path + output_folder: str or Path + variables: str or List[str] + mode: {"hourly", "daily"} + n_workers: int + + Returns + ------- + None + """ + func_time = time.time() + + if mode.lower() in ["h", "hour", "hourly"]: + num_observations = 24 + column_names = ["code", "year", "month", "day", "code_var"] + column_dtypes = [str, float, float, float, str] + elif mode.lower() in ["d", "day", "daily"]: + num_observations = 31 + column_names = ["code", "year", "month", "code_var"] + column_dtypes = [str, float, float, str] + else: + raise NotImplementedError("`mode` must be 'h'/'hourly or 'd'/'daily'.") + + # Preparing the data column headers + for i in range(1, num_observations + 1): + data_entry, flag_entry = f"D{i:0n}", f"F{i:0n}" + column_names.append(data_entry) + column_names.append(flag_entry) + column_dtypes.extend([str, str]) + + if isinstance(variables, (str, int)): + variables = [variables] + + for variable_code in variables: + variable_code = str(variable_code).zfill(3) + metadata = cf_station_metadata(variable_code) + nc_name = metadata["nc_name"] + + rep_nc = Path(output_folder).joinpath(nc_name) + rep_nc.mkdir(parents=True, exist_ok=True) + + # Loop on the files + logging.info( + f"Collecting files for variable '{metadata['standard_name']}' " + f"(filenames containing '{metadata['_table_name']}')." 
+ ) + list_files = list() + if isinstance(source_files, list) or Path(source_files).is_file(): + list_files.append(source_files) + else: + glob_patterns = [g for g in metadata["_table_name"]] + for pattern in glob_patterns: + list_files.extend( + [f for f in Path(source_files).rglob(f"{pattern}*") if f.is_file()] + ) + manager = mp.Manager() + errored_files = manager.list() + converter_func = partial( + _convert_station_file, + output_path=rep_nc, + errored_files=errored_files, + mode=mode, + variable_code=variable_code, + column_names=column_names, + column_dtypes=column_dtypes, + **metadata, + ) + with mp.Pool(processes=n_workers) as pool: + pool.map(converter_func, list_files) + pool.close() + pool.join() + + if errored_files: + logging.warning( + "Some files failed to be properly parsed:\n", ", ".join(errored_files) + ) + + logging.warning(f"Process completed in {time.time() - func_time:.2f} seconds") diff --git a/miranda/eccc/data/eccc_homogenized_cf_attrs.json b/miranda/eccc/data/eccc_homogenized_cf_attrs.json new file mode 100644 index 00000000..92c3b0f1 --- /dev/null +++ b/miranda/eccc/data/eccc_homogenized_cf_attrs.json @@ -0,0 +1,111 @@ +{ + "Header": { + "Conventions": "CF-1.8", + "_product": { + "gen2": "ECCC Adjusted and Homogenized Canadian Climate Data (AHCCD) version 2", + "gen3": "ECCC Adjusted and Homogenized Canadian Climate Data (AHCCD) version 3" + }, + "citation": { + "gen2": "Mekis, É and L.A. Vincent, 2011: An overview of the second generation adjusted daily precipitation dataset for trend analysis in Canada. Atmosphere-Ocean 49(2), 163-177 doi:10.1080/07055900.2011.583910", + "gen3": "Vincent, L.A., M.M. Hartwell and X.L. Wang, 2020: A Third Generation of Homogenized Temperature for Trend Analysis and Monitoring Changes in Canada’s Climate. Atmosphere-Ocean. 
https://doi.org/10.1080/07055900.2020.1765728" + }, + "contact": "info.cccs-ccsc@canada.ca", + "documentation": "https://www.canada.ca/en/environment-climate-change/services/climate-change/canadian-centre-climate-services/display-download/technical-documentation-adjusted-climate-data.html", + "float_missing_value": "1e20", + "frequency": "day", + "institution": "GovCan", + "int_missing_value": "-999", + "license": "https://climate.weather.gc.ca/prods_servs/attachment1_e.html", + "license_type": "permissive", + "organization": "ECCC", + "realm": "atmos", + "table_date": "2023-03-23", + "table_id": "ECCC" + }, + "variable_entry": { + "dm": { + "add_offset": 273.15, + "cell_methods": "time: mean", + "comments": "Station data converted from Mean Temp (°C)", + "frequency": "day", + "grid_mapping": "regular_lon_lat", + "long_name": "Near-Surface Air Temperature", + "original_field": "Mean Temp (°C)", + "out_name": "tas", + "scale_factor": 1, + "standard_name": "air_temperature", + "type": "real", + "units": "K" + }, + "dn": { + "add_offset": 273.15, + "cell_methods": "time: minimum", + "comments": "Station data converted from Min Temp (°C)", + "frequency": "day", + "grid_mapping": "regular_lon_lat", + "long_name": "Daily Minimum Near-Surface Air Temperature", + "original_field": "Min Temp (°C)", + "out_name": "tasmin", + "scale_factor": 1, + "standard_name": "air_temperature", + "type": "real", + "units": "K" + }, + "dr": { + "add_offset": 0, + "cell_methods": "time: mean", + "comments": "Station data converted from Total Rain (mm) using a density of 1000 kg/m³", + "frequency": "day", + "grid_mapping": "regular_lon_lat", + "long_name": "Liquid Precipitation", + "original_field": "Total Rain (mm)", + "out_name": "prlp", + "scale_factor": 1.1574074074074073e-05, + "standard_name": "rainfall_flux", + "type": "real", + "units": "kg m-2 s-1" + }, + "ds": { + "add_offset": 0, + "cell_methods": "time: mean", + "comments": "station data converted from Total Snow (cm) using a 
density of 100 kg/m³", + "frequency": "day", + "grid_mapping": "regular_lon_lat", + "long_name": "Snowfall Flux", + "original_field": "Total Snow (cm)", + "out_name": "prsn", + "scale_factor": 1.1574074074074073e-05, + "standard_name": "snowfall_flux", + "type": "real", + "units": "kg m-2 s-1" + }, + "dt": { + "add_offset": 0, + "cell_methods": "time: mean", + "comments": "Station data converted from Total Precip (mm) using a density of 1000 kg/m³", + "frequency": "day", + "grid_mapping": "regular_lon_lat", + "long_name": "Precipitation", + "original_field": "Total Precip (mm)", + "out_name": "pr", + "scale_factor": 1.1574074074074073e-05, + "standard_name": "precipitation_flux", + "type": "real", + "units": "kg m-2 s-1" + }, + "dx": { + "add_offset": 273.15, + "cell_methods": "time: maximum", + "comments": "station data converted from Max Temp (°C)", + "frequency": "day", + "grid_mapping": "regular_lon_lat", + "long_name": "Daily Maximum Near-Surface Air Temperature", + "original_field": "Max Temp (°C)", + "out_name": "tasmax", + "scale_factor": 1, + "standard_name": "air_temperature", + "type": "real", + "units": "K" + } + } +} diff --git a/miranda/eccc/data/eccc_obs_summary_cf_attrs.json b/miranda/eccc/data/eccc_obs_summary_cf_attrs.json new file mode 100644 index 00000000..b21f224e --- /dev/null +++ b/miranda/eccc/data/eccc_obs_summary_cf_attrs.json @@ -0,0 +1,173 @@ +{ + "Header": { + "Conventions": "CF-1.8", + "contact": "info.cccs-ccsc@canada.ca", + "institution": "GovCan", + "int_missing_value": "-999", + "license": "https://climate.weather.gc.ca/prods_servs/attachment1_e.html", + "license_type": "permissive", + "missing_value": "1e20", + "organization": "ECCC", + "processing_level": "raw", + "realm": "atmos", + "source": "msc", + "table_date": "2023-03-23", + "type": "station-obs" + }, + "variable_entry": { + "cdd": { + "add_offset": 0, + "cell_methods": "time: sum", + "comments": "Station data converted from Cool Deg Days (°C)", + "frequency": "day", + 
"grid_mapping": "regular_lon_lat", + "long_name": "Number of Degrees Celsius Over a Mean Temperature of 18 °C", + "original_variable": "Cool Deg Days (°C)", + "out_name": "cdd", + "scale_factor": 1, + "standard_name": "cooling_degree_days", + "type": "real", + "units": "C" + }, + "hdd": { + "add_offset": 0, + "cell_methods": "time: sum", + "comments": "Station data converted from Heat Deg Days (°C)", + "frequency": "day", + "grid_mapping": "regular_lon_lat", + "long_name": "Number of Degrees Celsius Under a Mean Temperature of 18 °C", + "original_variable": "Heat Deg Days (°C)", + "out_name": "hdd", + "scale_factor": 1, + "standard_name": "heating_degree_days", + "type": "real", + "units": "C" + }, + "pr": { + "add_offset": 0, + "cell_methods": "time: mean", + "comments": "Station data converted from Total Precip (mm) using a density of 1000 kg/m³", + "frequency": "day", + "grid_mapping": "regular_lon_lat", + "long_name": "Precipitation", + "original_variable": "Total Precip (mm)", + "out_name": "pr", + "scale_factor": 1.1574074074074073e-05, + "standard_name": "precipitation_flux", + "type": "real", + "units": "kg m-2 s-1" + }, + "prlp": { + "add_offset": 0, + "cell_methods": "time: mean", + "comments": "Station data converted from Total Rain (mm) using a density of 1000 kg/m³", + "frequency": "day", + "grid_mapping": "regular_lon_lat", + "long_name": "Liquid Precipitation", + "original_variable": "Total Rain (mm)", + "out_name": "prlp", + "scale_factor": 1.1574074074074073e-05, + "standard_name": "rainfall_flux", + "type": "real", + "units": "kg m-2 s-1" + }, + "prsn": { + "add_offset": 0, + "cell_methods": "time: mean", + "comments": "station data converted from Total Snow (cm) using a density of 100 kg/m³", + "frequency": "day", + "grid_mapping": "regular_lon_lat", + "long_name": "Snowfall Flux", + "original_variable": "Total Snow (cm)", + "out_name": "prsn", + "scale_factor": 1.1574074074074073e-05, + "standard_name": "snowfall_flux", + "type": "real", + 
"units": "kg m-2 s-1" + }, + "sfcWindAz": { + "add_offset": 0, + "cell_methods": "time: mean", + "comments": "Station data converted from Dir of Max Gust (10s deg)", + "frequency": "day", + "grid_mapping": "regular_lon_lat", + "long_name": "Direction from which the Daily Maximum Near-Surface Gust Wind Speed maximum Blows", + "original_variable": "Dir of Max Gust (10s deg)", + "out_name": "sfcWindAz", + "scale_factor": 1, + "standard_name": "wind_direction", + "type": "real", + "units": "degree" + }, + "sfcWindMax": { + "add_offset": 0, + "cell_methods": "time: max", + "comments": "Station data converted from Spd of Max Gust (km/h)", + "frequency": "day", + "grid_mapping": "regular_lon_lat", + "long_name": "Daily Maximum Near-Surface Gust Wind Speed maximum", + "original_variable": "Spd of Max Gust (km/h)", + "out_name": "sfcWindMax", + "scale_factor": 0.2777777777777778, + "standard_name": "wind_speed_of_gust maximum", + "type": "real", + "units": "m s-1" + }, + "snd": { + "add_offset": 0, + "cell_methods": "time: mean", + "comments": "Station data converted from Snow on Grnd (cm)", + "frequency": "day", + "grid_mapping": "regular_lon_lat", + "long_name": "Snow Depth", + "original_variable": "Snow on Grnd (cm)", + "out_name": "snd", + "scale_factor": 0.01, + "standard_name": "surface_snow_thickness", + "type": "real", + "units": "m" + }, + "tas": { + "add_offset": 273.15, + "cell_methods": "time: mean", + "comments": "Station data converted from Mean Temp (°C)", + "frequency": "day", + "grid_mapping": "regular_lon_lat", + "long_name": "Near-Surface Air Temperature", + "original_variable": "Mean Temp (°C)", + "out_name": "tas", + "scale_factor": 1, + "standard_name": "air_temperature", + "type": "real", + "units": "K" + }, + "tasmax": { + "add_offset": 273.15, + "cell_methods": "time: maximum", + "comments": "station data converted from Max Temp (°C)", + "frequency": "day", + "grid_mapping": "regular_lon_lat", + "long_name": "Daily Maximum Near-Surface Air 
Temperature", + "original_variable": "Max Temp (°C)", + "out_name": "tasmax", + "scale_factor": 1, + "standard_name": "air_temperature", + "type": "real", + "units": "K" + }, + "tasmin": { + "add_offset": 273.15, + "cell_methods": "time: minimum", + "comments": "Station data converted from Min Temp (°C)", + "frequency": "day", + "grid_mapping": "regular_lon_lat", + "long_name": "Daily Minimum Near-Surface Air Temperature", + "original_variable": "Min Temp (°C)", + "out_name": "tasmin", + "scale_factor": 1, + "standard_name": "air_temperature", + "type": "real", + "units": "K" + } + } +} diff --git a/miranda/eccc/eccc_obs_cf_attrs.json b/miranda/eccc/eccc_obs_cf_attrs.json index 7c882e31..135756c9 100644 --- a/miranda/eccc/eccc_obs_cf_attrs.json +++ b/miranda/eccc/eccc_obs_cf_attrs.json @@ -1,6 +1,8 @@ { "Header": { - "Conventions": "CF-1.8", + "Conventions": "CF-1.9", + "_frequency": true, + "_miranda_version": true, "contact": "climatcentre-climatecentral@ec.gc.ca", "institution": "GovCan", "int_missing_value": "-999", @@ -11,985 +13,1179 @@ "organization": "ECCC", "processing_level": "raw", "realm": "atmos", - "source": "msc", + "source": "MSC", "table_date": "2023-03-23", "type": "station-obs" }, - "variable_entry": { + "dimensions": { + "latitude": { + "_cf_dimension_name": "lat", + "_precision": 4, + "axis": "Y", + "standard_name": "latitude" + }, + "longitude": { + "_cf_dimension_name": "lon", + "_precision": 4, + "axis": "X", + "standard_name": "longitude" + }, + "time": { + "_ensure_correct_time": { + "obs-daily": "1D", + "obs-hourly": "1H" + }, + "_strict_time": false, + "axis": "T", + "long_name": "time", + "standard_name": "time" + } + }, + "variables": { "001": { + "_cf_variable_name": "tasmax", + "_corrected_units": "degC", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "DLY02", "DLY04", "DLY44" ], - "add_offset": 0, - "nc_name": "tasmax", + "_transformation": "op * 0.1 degC", "original_units": "0.1 °C", "original_variable": "Daily 
Maximum Temperature", - "raw_units": "degC", "scale_factor": 0.1, "standard_name": "air_temperature_maximum", "units": "K" }, "002": { + "_cf_variable_name": "tasmin", + "_corrected_units": "degC", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "DLY02", "DLY04", "DLY44" ], - "add_offset": 0, - "nc_name": "tasmin", + "_transformation": "op * 0.1 degC", "original_units": "0.1 °C", "original_variable": "Daily Minimum Temperature", - "raw_units": "degC", "scale_factor": 0.1, "standard_name": "air_temperature_minimum", "units": "K" }, "003": { + "_cf_variable_name": "tas", + "_corrected_units": "degC", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "DLY02", "DLY04", "DLY44" ], - "add_offset": 0, - "nc_name": "tas", + "_transformation": "op * 0.1 degC", "original_units": "0.1 °C", "original_variable": "Daily Mean Temperature", - "raw_units": "degC", "scale_factor": 0.1, "standard_name": "air_temperature", "units": "K" }, "010": { + "_cf_variable_name": "prlptot", + "_corrected_units": "mm", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "DLY02", "DLY04", "DLY44" ], - "add_offset": 0, - "nc_name": "prlptot", + "_transformation": "op * 0.1 mm day-1", "original_units": "0.1 mm day-1", "original_variable": "Daily Total Rainfall", - "raw_units": "mm", "scale_factor": 0.1, "standard_name": "liquid_precipitation_amount", "units": "m" }, "011": { + "_cf_variable_name": "prsntot", + "_corrected_units": "cm", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "DLY02", "DLY04", "DLY44" ], - "add_offset": 0, - "nc_name": "prsntot", + "_transformation": "op * 0.1 cm day-1", "original_units": "0.1 cm day-1", "original_variable": "Daily Total Snowfall", - "raw_units": "cm", "scale_factor": 0.1, "standard_name": "solid_precipitation_amount", "units": "m" }, "012": { + "_cf_variable_name": "prcptot", + "_corrected_units": "mm", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "DLY02", "DLY04", 
"DLY44" ], - "add_offset": 0, - "nc_name": "prcptot", + "_transformation": "op * 0.1 mm day-1", "original_units": "0.1 mm day-1", "original_variable": "Daily Total Precipitation", - "raw_units": "mm", "scale_factor": 0.1, "standard_name": "precipitation_amount", "units": "m" }, "013": { + "_cf_variable_name": "sndtot", + "_corrected_units": "cm", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "DLY02", "DLY04", "DLY44" ], - "add_offset": 0, - "nc_name": "sndtot", + "_transformation": false, "original_units": "cm", "original_variable": "Snow on the Ground", - "raw_units": "cm", "scale_factor": 1, "standard_name": "surface_snow_thickness", "units": "m" }, "014": { + "_cf_variable_name": "thunder", + "_corrected_units": "1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "DLY02", "DLY04", "DLY44" ], - "add_offset": 0, - "nc_name": "thunder", + "_transformation": false, "original_variable": "Thunderstorms", - "raw_units": "1", "scale_factor": 1, "standard_name": "thunderstorm_presence", "units": "1" }, "015": { + "_cf_variable_name": "freezing_rain_drizzle", + "_corrected_units": "1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "DLY02", "DLY04", "DLY44" ], - "add_offset": 0, - "nc_name": "freezing_rain_drizzle", + "_transformation": false, "original_variable": "Freezing rain or drizzle", - "raw_units": "1", "scale_factor": 1, "standard_name": "freeze_rain_drizzle_presence", "units": "1" }, "016": { + "_cf_variable_name": "hail", + "_corrected_units": "1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "DLY02", "DLY04", "DLY44" ], - "add_offset": 0, - "nc_name": "hail", + "_transformation": false, "original_variable": "Hail", - "raw_units": "1", "scale_factor": 1, "standard_name": "hail_presence", "units": "1" }, "017": { + "_cf_variable_name": "fog_ice_fog", + "_corrected_units": "1", "_table_name": [ "DLY02", "DLY04", "DLY44" ], - "add_offset": 0, - "nc_name": "fog_ice_fog", 
"original_variable": "Fog or Ice Fog", - "raw_units": "1", "scale_factor": 1, "standard_name": "fog_ice_fog_presence", "units": "1" }, "018": { + "_cf_variable_name": "smoke_haze", + "_corrected_units": "1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "DLY02", "DLY04" ], - "add_offset": 0, - "nc_name": "smoke_haze", + "_transformation": false, "original_variable": "Smoke or Haze", - "raw_units": "1", "scale_factor": 1, "standard_name": "smoke_haze_presence", "units": "1" }, "019": { + "_cf_variable_name": "blowing_dust_sand", + "_corrected_units": "1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "DLY02", "DLY04" ], - "add_offset": 0, - "nc_name": "blowing_dust_sand", + "_transformation": false, "original_variable": "Blowing Dust or Sand", - "raw_units": "1", "scale_factor": 1, "standard_name": "blowing_dust_sand_presence", "units": "1" }, "020": { + "_cf_variable_name": "blow_snow", + "_corrected_units": "1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "DLY02", "DLY04" ], - "add_offset": 0, - "nc_name": "blow_snow", + "_transformation": false, "original_variable": "Blowing snow", - "raw_units": "1", "scale_factor": 1, "standard_name": "blowing_snow_presence", "units": "1" }, "021": { + "_cf_variable_name": "wind_gt_28kt", + "_corrected_units": "1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "DLY02", "DLY04" ], - "add_offset": 0, - "nc_name": "wind_gt_28kt", + "_transformation": false, "original_variable": "Wind speed >= 28 Knots", - "raw_units": "1", "scale_factor": 1, "standard_name": "wind_exceeding_28_knots", "units": "1" }, "022": { + "_cf_variable_name": "wind_gt_34kt", + "_corrected_units": "1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "DLY02", "DLY04" ], - "add_offset": 0, - "nc_name": "wind_gt_34kt", + "_transformation": false, "original_variable": "Wind speed >= 34 Knots", - "raw_units": "1", "scale_factor": 1, "standard_name": 
"wind_exceeding_34_knots", "units": "1" }, "023": { + "_cf_variable_name": "gust_dir_16pts", + "_corrected_units": "deg", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "DLY02", "DLY04" ], - "add_offset": 0, - "nc_name": "gust_dir_16pts", + "_transformation": "op * 10 deg", "original_units": "10's of degrees", "original_variable": "Direction of extreme gust (16 pts) to December 1976", - "raw_units": "deg", - "scale_factor": 10, "standard_name": "gust_to_direction", "units": "deg" }, "024": { + "_cf_variable_name": "gust_speed", + "_corrected_units": "km h-1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "DLY02", "DLY04" ], - "add_offset": 0, - "nc_name": "gust_speed", + "_transformation": false, "original_units": "km/h", "original_variable": "Speed of extreme gust", - "raw_units": "km h-1", "scale_factor": 1, "standard_name": "wind_speed_of_gust", "units": "m s-1" }, "025": { + "_cf_variable_name": "gust_hour", + "_corrected_units": "h", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "DLY02", "DLY04" ], - "add_offset": 0, - "nc_name": "gust_hour", + "_transformation": false, "original_variable": "UTC hour of extreme gust", - "raw_units": "h", "scale_factor": 1, "standard_name": "hour_of_extreme_gust", "units": "h" }, "061": { + "_cf_variable_name": "rf1_radiation", + "_corrected_units": "MJ m-2", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY11" ], - "add_offset": 0, - "nc_name": "rf1_radiation", + "_transformation": "op / 1000 MJ m-2", "original_units": "0.001 MJ/m", "original_variable": "RF1 global solar radiation", - "raw_units": "W m-2 h-1", - "scale_factor": 277.77777777777777, "standard_name": "solar_radiation_flux", - "units": "W m-2 h-1" + "units": "W h m-2" }, "062": { + "_cf_variable_name": "rf2_radiation", + "_corrected_units": "MJ m-2", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY11" ], - "add_offset": 0, - "nc_name": "rf2_radiation", + 
"_transformation": "op / 1000 MJ m-2", "original_units": "0.001 MJ/m", "original_variable": "RF2 sky (diffuse) radiation", - "raw_units": "W m-2 h-1", - "scale_factor": 277.77777777777777, "standard_name": "solar_radiation_flux", - "units": "W m-2 h-1" + "units": "W h m-2" }, "063": { + "_cf_variable_name": "rf3_radiation", + "_corrected_units": "MJ m-2", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY11" ], - "add_offset": 0, - "nc_name": "rf3_radiation", + "_transformation": "op / 1000 MJ m-2", "original_units": "0.001 MJ/m", "original_variable": "RF3 reflected solar radiation", - "raw_units": "W m-2 h-1", - "scale_factor": 277.77777777777777, "standard_name": "solar_radiation_flux", - "units": "W m-2 h-1" + "units": "W h m-2" }, "064": { + "_cf_variable_name": "rf4_radiation", + "_corrected_units": "MJ m-2", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY11" ], - "add_offset": 0, - "nc_name": "rf4_radiation", + "_transformation": "op / 1000 MJ m-2", "original_units": "0.001 MJ/m", "original_variable": "RF4 net all wave radiation", - "raw_units": "W m-2 h-1", - "scale_factor": 277.77777777777777, "standard_name": "solar_radiation_flux", - "units": "W m-2 h-1" + "units": "W h m-2" }, "067": { + "_cf_variable_name": "rf7_radiation", + "_corrected_units": "lux h", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY11" ], - "add_offset": 0, - "nc_name": "rf7_radiation", + "_transformation": false, "original_units": "0.01 Kilolux_hrs", "original_variable": "RF7 daylight illumination", - "raw_units": "lux h", "scale_factor": 10, "standard_name": "solar_radiation_flux", "units": "lux h" }, "068": { + "_cf_variable_name": "rf8_radiation", + "_corrected_units": "MJ m-2", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY11" ], - "add_offset": 0, - "nc_name": "rf8_radiation", + "_transformation": "op / 1000 MJ m-2", "original_units": "0.001 MJ/m", "original_variable": "RF8 direct solar 
radiation", - "raw_units": "W m-2 h-1", - "scale_factor": 277.77777777777777, "standard_name": "solar_radiation_flux", - "units": "W m-2 h-1" + "units": "W h m-2" }, "069": { + "_cf_variable_name": "wind_dir_45B", + "_corrected_units": "deg", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY15" ], - "add_offset": 0, - "nc_name": "wind_dir_45B", + "_transformation": false, "original_units": "10's of degrees", "original_variable": "Direction - 45B anemometer (8 pts)", - "raw_units": "deg", "scale_factor": 1, "standard_name": "wind_to_direction", "units": "deg" }, "071": { + "_cf_variable_name": "ceiling_hgt", + "_corrected_units": "m", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "ceiling_hgt", + "_transformation": false, "original_units": "30's of meters", "original_variable": "Ceiling height of lowest layer of clouds", - "raw_units": "m", "scale_factor": 30, "standard_name": "ceiling_cloud_height", "units": "m" }, "072": { + "_cf_variable_name": "visibility", + "_corrected_units": "km", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "visibility", + "_transformation": false, "original_units": "0.1 km", "original_variable": "Visibility", - "raw_units": "km", "scale_factor": 0.1, "standard_name": "visibility_in_air", "units": "m" }, "073": { + "_cf_variable_name": "psl", + "_corrected_units": "Pa", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "psl", + "_transformation": false, "original_units": "0.01 kPa", "original_variable": "Sea Level Pressure", - "raw_units": "Pa", "scale_factor": 10, "standard_name": "air_pressure_at_mean_sea_level", "units": "Pa" }, "074": { + "_cf_variable_name": "tds", + "_corrected_units": "degC", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "tds", + 
"_transformation": false, "original_units": "0.1 °C", "original_variable": "Dew Point Temperature", - "raw_units": "degC", "scale_factor": 0.1, "standard_name": "dew_point_temperature", "units": "K" }, "075": { + "_cf_variable_name": "wind_dir_u2a_16", + "_corrected_units": "deg", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "wind_dir_u2a_16", + "_transformation": false, "original_units": "10's of degrees", "original_variable": "Wind Direction at 2 m (U2A Anemometer) (16 pts)", - "raw_units": "deg", "scale_factor": 10, "standard_name": "wind_direction_u2a", "units": "deg" }, "076": { + "_cf_variable_name": "wind_speed_u2a", + "_corrected_units": "km h-1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "wind_speed_u2a", + "_transformation": false, "original_units": "km/h", "original_variable": "Wind Speed - U2A (16 pts) to December 1970", - "raw_units": "km h-1", "scale_factor": 1, "standard_name": "wind_speed_u2a", "units": "m s-1" }, "077": { + "_cf_variable_name": "pressure", + "_corrected_units": "Pa", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "pressure", + "_transformation": false, "original_units": "0.01 kPa", "original_variable": "Station Pressure", - "raw_units": "Pa", "scale_factor": 10, "standard_name": "atmospheric_pressure", "units": "Pa" }, "078": { + "_cf_variable_name": "tas_dry", + "_corrected_units": "degC", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "tas_dry", + "_transformation": false, "original_units": "0.1 °C", "original_variable": "Dry Bulb Temperature", - "raw_units": "degC", "scale_factor": 0.1, "standard_name": "dry_bulb_temperature", "units": "K" }, "079": { + "_cf_variable_name": "tas_wet", + "_corrected_units": "degC", + "_invert_sign": false, + "_offset_time": false, 
"_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "tas_wet", + "_transformation": false, "original_units": "0.1 °C", "original_variable": "Wet Bulb temperature", - "raw_units": "degC", "scale_factor": 0.1, "standard_name": "wet_bulb_temperature", "units": "K" }, "080": { + "_cf_variable_name": "hur", + "_corrected_units": "1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "hur", + "_transformation": false, "original_units": "%", "original_variable": "Relative Humidity", - "raw_units": "1", "scale_factor": 1, "standard_name": "relative_humidity", "units": "1" }, "081": { + "_cf_variable_name": "clo", + "_corrected_units": "1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "clo", + "_transformation": false, "original_units": "%", "original_variable": "Total Cloud Opacity", - "raw_units": "1", "scale_factor": 10, "standard_name": "cloud_albedo", "units": "1" }, "082": { + "_cf_variable_name": "clt", + "_corrected_units": "1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "clt", + "_transformation": false, "original_units": "%", "original_variable": "Total Cloud Amount", - "raw_units": "1", "scale_factor": 10, "standard_name": "cloud_area_fraction", "units": "1" }, "089": { + "_cf_variable_name": "freeze_rain", + "_corrected_units": "1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "freeze_rain", + "_transformation": false, "original_variable": "Freezing Rain", - "raw_units": "1", "scale_factor": 1, "standard_name": "freezing_rain", "units": "1" }, "094": { + "_cf_variable_name": "ice_pellets", + "_corrected_units": "1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "ice_pellets", + "_transformation": false, "original_variable": "Ice Pellets", 
- "raw_units": "1", "scale_factor": 1, "standard_name": "ice_pellet_presence", "units": "1" }, "107": { + "_cf_variable_name": "1low_cloud_opac", + "_corrected_units": "1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "1low_cloud_opac", + "_transformation": false, "original_units": "Tenths", "original_variable": "Lowest cloud layer opacity", - "raw_units": "1", "scale_factor": 10, "standard_name": "low_type_cloud_opacity_fraction", "units": "1" }, "108": { + "_cf_variable_name": "1low_cloud_frac", + "_corrected_units": "1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "1low_cloud_frac", + "_transformation": false, "original_units": "Tenths", "original_variable": "Lowest cloud layer amount or condition", - "raw_units": "1", "scale_factor": 10, "standard_name": "low_type_cloud_area_fraction", "units": "1" }, "109": { + "_cf_variable_name": "1low_cloud_type", + "_corrected_units": "1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "1low_cloud_type", + "_transformation": false, "original_variable": "Lowest cloud layer type", - "raw_units": "1", "scale_factor": 1, "standard_name": "low_type_cloud_type", "units": "1" }, "110": { + "_cf_variable_name": "1low_cloud_hgt", + "_corrected_units": "m", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "1low_cloud_hgt", + "_transformation": false, "original_units": "30's of meters", "original_variable": "Lowest cloud layer height", - "raw_units": "m", "scale_factor": 30, "standard_name": "low_type_cloud_height", "units": "m" }, "111": { + "_cf_variable_name": "2low_cloud_opac", + "_corrected_units": "1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "2low_cloud_opac", + "_transformation": false, "original_units": 
"Tenths", "original_variable": "Second lowest cloud layer opacity", - "raw_units": "1", "scale_factor": 10, "standard_name": "low_type_cloud_opacity_fraction", "units": "1" }, "112": { + "_cf_variable_name": "2low_cloud_frac", + "_corrected_units": "1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "2low_cloud_frac", + "_transformation": false, "original_units": "Tenths", "original_variable": "Second lowest cloud layer amount or condition", - "raw_units": "1", "scale_factor": 10, "standard_name": "low_type_cloud_area_fraction", "units": "1" }, "113": { + "_cf_variable_name": "2low_cloud_type", + "_corrected_units": "1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "2low_cloud_type", + "_transformation": false, "original_units": "", "original_variable": "Second lowest cloud layer type", - "raw_units": "1", "scale_factor": 1, "standard_name": "low_type_cloud_type", "units": "1" }, "114": { + "_cf_variable_name": "2low_cloud_hgt", + "_corrected_units": "m", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "2low_cloud_hgt", + "_transformation": false, "original_units": "30's of meters", "original_variable": "Second lowest cloud layer height", - "raw_units": "m", "scale_factor": 30, "standard_name": "low_type_cloud_height", "units": "m" }, "115": { + "_cf_variable_name": "3low_cloud_opac", + "_corrected_units": "1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "3low_cloud_opac", + "_transformation": false, "original_units": "Tenths", "original_variable": "Thirsd lowest cloud layer opacity", - "raw_units": "1", "scale_factor": 10, "standard_name": "low_type_cloud_opacity_fraction", "units": "1" }, "116": { + "_cf_variable_name": "3low_cloud_frac", + "_corrected_units": "1", + "_invert_sign": false, + "_offset_time": 
false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "3low_cloud_frac", + "_transformation": false, "original_units": "Tenths", "original_variable": "Third lowest cloud layer amount or condition", - "raw_units": "1", "scale_factor": 10, "standard_name": "low_type_cloud_area_fraction", "units": "1" }, "117": { + "_cf_variable_name": "3low_cloud_type", + "_corrected_units": "1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "3low_cloud_type", + "_transformation": false, "original_units": "", "original_variable": "Third lowest cloud layer type", - "raw_units": "1", "scale_factor": 1, "standard_name": "low_type_cloud_type", "units": "1" }, "118": { + "_cf_variable_name": "3low_cloud_hgt", + "_corrected_units": "m", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "3low_cloud_hgt", + "_transformation": false, "original_units": "30's of meters", "original_variable": "Third lowest cloud layer height", - "raw_units": "m", "scale_factor": 30, "standard_name": "low_type_cloud_height", "units": "m" }, "123": { + "_cf_variable_name": "rainfall", + "_corrected_units": "mm h-1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "rainfall", + "_transformation": false, "original_units": "0.1 mm", "original_variable": "Total Rainfall", - "raw_units": "mm h-1", "scale_factor": 0.1, "standard_name": "rainfall_flux", "units": "kg m2 s-1" }, "133": { + "_cf_variable_name": "sun", + "_corrected_units": "h", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY10" ], - "add_offset": 0, - "nc_name": "sun", + "_transformation": false, "original_units": "0.1 hrs", "original_variable": "Sunshine", - "raw_units": "h", "scale_factor": 0.1, "standard_name": "duration_of_sunshine", "units": "s" }, "156": { + "_cf_variable_name": "wind_dir_u2a_36", + "_corrected_units": "deg", + 
"_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "nc_name": "wind_dir_u2a_36", + "_transformation": false, "original_units": "10's of degrees", "original_variable": "Wind Direction - U2A (36 pts) from January 1971", - "raw_units": "deg", + "scale_factor": 10, + "standard_name": "wind_direction_u2a", + "units": "deg" + }, + "209": { + "_cf_variable_name": "wind_character", + "_corrected_units": "", + "_invert_sign": false, + "_offset_time": false, + "_table_name": [ + "HLY01" + ], + "_transformation": false, + "long_name": "wind_direction_u2a", + "original_units": "1, 2", + "original_variable": "Wind character at 10 m", + "scale_factor": 1, + "units": "" + }, + "210": { + "_cf_variable_name": "wind_dir_u2a_36", + "_corrected_units": "deg", + "_invert_sign": false, + "_offset_time": false, + "_table_name": [ + "HLY01" + ], + "_transformation": false, + "original_units": "km/h", + "original_variable": "Wind Direction - U2A (36 pts) from January 1971", "scale_factor": 10, "standard_name": "wind_direction_u2a", "units": "deg" }, "262": { + "_cf_variable_name": "prtot", + "_corrected_units": "mm", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01_RCS" ], - "add_offset": 0, - "nc_name": "prtot", + "_transformation": false, "original_units": "0.1 mm", "original_variable": "Total Precipitation (minutes 00-60)", - "raw_units": "mm", "scale_factor": 0.1, "standard_name": "precipitation_amount", "units": "kg m-2" }, "263": { + "_cf_variable_name": "prtot_q1", + "_corrected_units": "mm", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01_RCS" ], - "add_offset": 0, - "nc_name": "prtot_q1", + "_transformation": false, "original_units": "0.1 mm", "original_variable": "Total Precipitation (minutes 00-15)", - "raw_units": "mm", "scale_factor": 0.1, "standard_name": "precipitation_amount", "units": "kg m-2" }, "264": { + "_cf_variable_name": "prtot_q2", + "_corrected_units": "mm", + "_invert_sign": false, + 
"_offset_time": false, "_table_name": [ "HLY01_RCS" ], - "add_offset": 0, - "nc_name": "prtot_q2", + "_transformation": false, "original_units": "0.1 mm", "original_variable": "Total Precipitation (minutes 15-30)", - "raw_units": "mm", "scale_factor": 0.1, "standard_name": "precipitation_amount", "units": "kg m-2" }, "265": { + "_cf_variable_name": "prtot_q3", + "_corrected_units": "mm", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01_RCS" ], - "add_offset": 0, - "nc_name": "prtot_q3", + "_transformation": false, "original_units": "0.1 mm", "original_variable": "Total Precipitation (minutes 30-45)", - "raw_units": "mm", "scale_factor": 0.1, "standard_name": "precipitation_amount", "units": "kg m-2" }, "266": { + "_cf_variable_name": "prtot_q4", + "_corrected_units": "mm", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01_RCS" ], - "add_offset": 0, - "nc_name": "prtot_q4", + "_transformation": false, "original_units": "0.1 mm", "original_variable": "Total Precipitation (minutes 45-60)", - "raw_units": "mm", "scale_factor": 0.1, "standard_name": "precipitation_amount", "units": "kg m-2" }, "267": { + "_cf_variable_name": "precipitation_weight_q1", + "_corrected_units": "kg m-2", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01_RCS" ], - "add_offset": 0, - "nc_name": "precipitation_weight_q1", + "_transformation": false, "original_units": "0.1 kg/m²", "original_variable": "Precipitation Gauge Weight per Unit Area (at minute 15)", - "raw_units": "kg m-2", "scale_factor": 0.1, "standard_name": "precipitation_amount", "units": "kg m-2" }, "268": { + "_cf_variable_name": "precipitation_weight_q2", + "_corrected_units": "kg m-2", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01_RCS" ], - "add_offset": 0, - "nc_name": "precipitation_weight_q2", + "_transformation": false, "original_units": "0.1 kg/m²", "original_variable": "Precipitation Gauge Weight per Unit Area (at minute 
30)", - "raw_units": "kg m-2", "scale_factor": 0.1, "standard_name": "precipitation_amount", "units": "kg m-2" }, "269": { + "_cf_variable_name": "precipitation_weight_q3", + "_corrected_units": "kg m-2", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01_RCS" ], - "add_offset": 0, - "nc_name": "precipitation_weight_q3", + "_transformation": false, "original_units": "0.1 kg/m²", "original_variable": "Precipitation Gauge Weight per Unit Area (at minute 45)", - "raw_units": "kg m-2", "scale_factor": 0.1, "standard_name": "precipitation_amount", "units": "kg m-2" }, "270": { + "_cf_variable_name": "precipitation_weight_q4", + "_corrected_units": "kg m-2", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01_RCS" ], - "add_offset": 0, - "nc_name": "precipitation_weight_q4", + "_transformation": false, "original_units": "0.1 kg/m²", "original_variable": "Precipitation Gauge Weight per Unit Area (at minute 60)", - "raw_units": "kg m-2", "scale_factor": 0.1, "standard_name": "precipitation_amount", "units": "kg m-2" }, "271": { + "_cf_variable_name": "wind_speed_q1", + "_corrected_units": "km h-1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01_RCS" ], - "add_offset": 0, - "nc_name": "wind_speed_q1", - "nc_units": "m s-1", + "_transformation": false, "original_units": "0.1 km/h", "original_variable": "Wind Speed at 2 m (minutes 00-15)", - "raw_units": "km h-1", "scale_factor": 0.1, - "standard_name": "wind_speed" + "standard_name": "wind_speed", + "units": "m s-1" }, "272": { + "_cf_variable_name": "wind_speed_q2", + "_corrected_units": "km h-1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01_RCS" ], - "add_offset": 0, - "nc_name": "wind_speed_q2", - "nc_units": "m s-1", + "_transformation": false, "original_units": "0.1 km/h", "original_variable": "Wind Speed at 2 m (minutes 15-30)", - "raw_units": "km h-1", "scale_factor": 0.1, - "standard_name": "wind_speed" + 
"standard_name": "wind_speed", + "units": "m s-1" }, "273": { + "_cf_variable_name": "wind_speed_q3", + "_corrected_units": "km h-1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01_RCS" ], - "add_offset": 0, - "nc_name": "wind_speed_q3", - "nc_units": "m s-1", + "_transformation": false, "original_units": "0.1 km/h", "original_variable": "Wind Speed at 2 m (minutes 30-45)", - "raw_units": "km h-1", "scale_factor": 0.1, - "standard_name": "wind_speed" + "standard_name": "wind_speed", + "units": "m s-1" }, "274": { + "_cf_variable_name": "wind_speed_q4", + "_corrected_units": "km h-1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01_RCS" ], - "add_offset": 0, - "nc_name": "wind_speed_q4", - "nc_units": "m s-1", + "_transformation": false, "original_units": "0.1 km/h", "original_variable": "Wind Speed at 2 m (minutes 45-60)", - "raw_units": "km h-1", "scale_factor": 0.1, - "standard_name": "wind_speed" + "standard_name": "wind_speed", + "units": "m s-1" }, "275": { + "_cf_variable_name": "snd_q4", + "_corrected_units": "cm", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01_RCS" ], - "add_offset": 0, - "nc_name": "snd_q4", + "_transformation": false, "original_units": "cm", "original_variable": "Snow Depth (at minute 60)", - "raw_units": "cm", "scale_factor": 1, "standard_name": "surface_snow_thickness", "units": "m" }, "276": { + "_cf_variable_name": "snd_q1", + "_corrected_units": "cm", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01_RCS" ], - "add_offset": 0, - "nc_name": "snd_q1", + "_transformation": false, "original_units": "cm", "original_variable": "Snow Depth (at minute 15)", - "raw_units": "cm", "scale_factor": 1, "standard_name": "surface_snow_thickness", "units": "m" }, "277": { + "_cf_variable_name": "snd_q2", + "_corrected_units": "cm", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01_RCS" ], - "add_offset": 0, - "nc_name": 
"snd_q2", + "_transformation": false, "original_units": "cm", "original_variable": "Snow Depth (at minute 30)", - "raw_units": "cm", "scale_factor": 1, "standard_name": "surface_snow_thickness", "units": "m" }, "278": { + "_cf_variable_name": "snd_q3", + "_corrected_units": "cm", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01_RCS" ], - "add_offset": 0, - "nc_name": "snd_q3", + "_transformation": false, "original_units": "cm", "original_variable": "Snow Depth (at minute 45)", - "raw_units": "cm", "scale_factor": 1, "standard_name": "surface_snow_thickness", "units": "m" }, "279": { + "_cf_variable_name": "wind_dir", + "_corrected_units": "deg", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01_RCS" ], - "add_offset": 0, - "nc_name": "wind_dir", - "nc_units": "deg", + "_transformation": false, "original_units": "Degrees", "original_variable": "Wind Direction at 2 m (minutes 50-60)", - "raw_units": "deg", "scale_factor": 1, - "standard_name": "wind_direction" + "standard_name": "wind_direction", + "units": "deg" }, "280": { + "_cf_variable_name": "wind_speed", + "_corrected_units": "km h-1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01_RCS" ], - "add_offset": 0, - "nc_name": "wind_speed", + "_transformation": false, "original_units": "0.1 km/h", "original_variable": "Wind Speed at 2 m (minutes 50-60)", - "raw_units": "km h-1", "scale_factor": 0.1, "standard_name": "wind_speed", "units": "m s-1" diff --git a/miranda/units.py b/miranda/units.py index 381ac063..4775b62c 100644 --- a/miranda/units.py +++ b/miranda/units.py @@ -7,6 +7,7 @@ import pandas as pd import xarray as xr from xclim.core.calendar import parse_offset +from xclim.core.units import units KiB = int(pow(2, 10)) MiB = int(pow(2, 20)) diff --git a/templates/eccc_raw_daily_conversion.py b/templates/eccc_raw_daily_conversion.py index 4fb64de7..583c8d10 100644 --- a/templates/eccc_raw_daily_conversion.py +++ 
b/templates/eccc_raw_daily_conversion.py @@ -11,25 +11,25 @@ time_step = "daily" n_workers = 3 var_codes = [ - 1, - 2, - 3, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, + # 1, + # 2, + # 3, + # 10, + # 11, + # 12, + # 13, + # 14, + # 15, + # 16, + # 17, + # 18, + # 19, + # 20, + # 21, + # 22, 23, 24, - 25, + # 25, ] in_files = getenv("in") diff --git a/templates/eccc_raw_hourly_conversion.py b/templates/eccc_raw_hourly_conversion.py index 68a24405..0849e182 100644 --- a/templates/eccc_raw_hourly_conversion.py +++ b/templates/eccc_raw_hourly_conversion.py @@ -11,39 +11,41 @@ time_step = "hourly" n_workers = 3 var_codes = [ - 76, - 77, - 78, - 79, - 80, - 89, - 94, - 107, - 108, - 109, - 110, - 123, - 133, - 156, - 262, - 263, - 264, - 265, - 266, - 267, - 268, - 269, - 270, - 271, - 272, - 273, - 274, - 275, - 276, - 277, - 278, - 279, - 280, + 209, + 210 + # 76, + # 77, + # 78, + # 79, + # 80, + # 89, + # 94, + # 107, + # 108, + # 109, + # 110, + # 123, + # 133, + # 156, + # 262, + # 263, + # 264, + # 265, + # 266, + # 267, + # 268, + # 269, + # 270, + # 271, + # 272, + # 273, + # 274, + # 275, + # 276, + # 277, + # 278, + # 279, + # 280, ] in_files = getenv("in") From d1a8c679fb245d1bd5537893a7687b6561ce01b3 Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Tue, 20 Jun 2023 13:53:09 -0400 Subject: [PATCH 02/33] WIP - more units handling --- miranda/eccc/eccc_obs_cf_attrs.json | 178 +++++++++------------------- 1 file changed, 55 insertions(+), 123 deletions(-) diff --git a/miranda/eccc/eccc_obs_cf_attrs.json b/miranda/eccc/eccc_obs_cf_attrs.json index 135756c9..570c318c 100644 --- a/miranda/eccc/eccc_obs_cf_attrs.json +++ b/miranda/eccc/eccc_obs_cf_attrs.json @@ -52,10 +52,9 @@ "DLY04", "DLY44" ], - "_transformation": "op * 0.1 degC", + "_transformation": "op / 10 degC", "original_units": "0.1 °C", "original_variable": "Daily Maximum Temperature", - "scale_factor": 0.1, "standard_name": 
"air_temperature_maximum", "units": "K" }, @@ -69,10 +68,9 @@ "DLY04", "DLY44" ], - "_transformation": "op * 0.1 degC", + "_transformation": "op / 10 degC", "original_units": "0.1 °C", "original_variable": "Daily Minimum Temperature", - "scale_factor": 0.1, "standard_name": "air_temperature_minimum", "units": "K" }, @@ -86,10 +84,9 @@ "DLY04", "DLY44" ], - "_transformation": "op * 0.1 degC", + "_transformation": "op / 10 degC", "original_units": "0.1 °C", "original_variable": "Daily Mean Temperature", - "scale_factor": 0.1, "standard_name": "air_temperature", "units": "K" }, @@ -103,10 +100,9 @@ "DLY04", "DLY44" ], - "_transformation": "op * 0.1 mm day-1", + "_transformation": "op / 10 mm day-1", "original_units": "0.1 mm day-1", "original_variable": "Daily Total Rainfall", - "scale_factor": 0.1, "standard_name": "liquid_precipitation_amount", "units": "m" }, @@ -120,10 +116,9 @@ "DLY04", "DLY44" ], - "_transformation": "op * 0.1 cm day-1", + "_transformation": "op / 10 cm day-1", "original_units": "0.1 cm day-1", "original_variable": "Daily Total Snowfall", - "scale_factor": 0.1, "standard_name": "solid_precipitation_amount", "units": "m" }, @@ -137,10 +132,9 @@ "DLY04", "DLY44" ], - "_transformation": "op * 0.1 mm day-1", + "_transformation": "op / 10 mm day-1", "original_units": "0.1 mm day-1", "original_variable": "Daily Total Precipitation", - "scale_factor": 0.1, "standard_name": "precipitation_amount", "units": "m" }, @@ -157,7 +151,6 @@ "_transformation": false, "original_units": "cm", "original_variable": "Snow on the Ground", - "scale_factor": 1, "standard_name": "surface_snow_thickness", "units": "m" }, @@ -173,7 +166,6 @@ ], "_transformation": false, "original_variable": "Thunderstorms", - "scale_factor": 1, "standard_name": "thunderstorm_presence", "units": "1" }, @@ -189,7 +181,6 @@ ], "_transformation": false, "original_variable": "Freezing rain or drizzle", - "scale_factor": 1, "standard_name": "freeze_rain_drizzle_presence", "units": "1" }, @@ 
-205,7 +196,6 @@ ], "_transformation": false, "original_variable": "Hail", - "scale_factor": 1, "standard_name": "hail_presence", "units": "1" }, @@ -218,7 +208,6 @@ "DLY44" ], "original_variable": "Fog or Ice Fog", - "scale_factor": 1, "standard_name": "fog_ice_fog_presence", "units": "1" }, @@ -233,7 +222,6 @@ ], "_transformation": false, "original_variable": "Smoke or Haze", - "scale_factor": 1, "standard_name": "smoke_haze_presence", "units": "1" }, @@ -248,7 +236,6 @@ ], "_transformation": false, "original_variable": "Blowing Dust or Sand", - "scale_factor": 1, "standard_name": "blowing_dust_sand_presence", "units": "1" }, @@ -263,7 +250,6 @@ ], "_transformation": false, "original_variable": "Blowing snow", - "scale_factor": 1, "standard_name": "blowing_snow_presence", "units": "1" }, @@ -278,7 +264,6 @@ ], "_transformation": false, "original_variable": "Wind speed >= 28 Knots", - "scale_factor": 1, "standard_name": "wind_exceeding_28_knots", "units": "1" }, @@ -293,7 +278,6 @@ ], "_transformation": false, "original_variable": "Wind speed >= 34 Knots", - "scale_factor": 1, "standard_name": "wind_exceeding_34_knots", "units": "1" }, @@ -324,7 +308,6 @@ "_transformation": false, "original_units": "km/h", "original_variable": "Speed of extreme gust", - "scale_factor": 1, "standard_name": "wind_speed_of_gust", "units": "m s-1" }, @@ -339,7 +322,6 @@ ], "_transformation": false, "original_variable": "UTC hour of extreme gust", - "scale_factor": 1, "standard_name": "hour_of_extreme_gust", "units": "h" }, @@ -401,18 +383,17 @@ }, "067": { "_cf_variable_name": "rf7_radiation", - "_corrected_units": "lux h", + "_corrected_units": "klx h", "_invert_sign": false, "_offset_time": false, "_table_name": [ "HLY11" ], - "_transformation": false, + "_transformation": "op / 100 klx h", "original_units": "0.01 Kilolux_hrs", "original_variable": "RF7 daylight illumination", - "scale_factor": 10, "standard_name": "solar_radiation_flux", - "units": "lux h" + "units": "klx h" }, 
"068": { "_cf_variable_name": "rf8_radiation", @@ -436,10 +417,9 @@ "_table_name": [ "HLY15" ], - "_transformation": false, + "_transformation": "op * 10 deg", "original_units": "10's of degrees", "original_variable": "Direction - 45B anemometer (8 pts)", - "scale_factor": 1, "standard_name": "wind_to_direction", "units": "deg" }, @@ -451,10 +431,9 @@ "_table_name": [ "HLY01" ], - "_transformation": false, + "_transformation": "op * 30 m", "original_units": "30's of meters", "original_variable": "Ceiling height of lowest layer of clouds", - "scale_factor": 30, "standard_name": "ceiling_cloud_height", "units": "m" }, @@ -466,10 +445,9 @@ "_table_name": [ "HLY01" ], - "_transformation": false, + "_transformation": "op / 10 km", "original_units": "0.1 km", "original_variable": "Visibility", - "scale_factor": 0.1, "standard_name": "visibility_in_air", "units": "m" }, @@ -481,10 +459,9 @@ "_table_name": [ "HLY01" ], - "_transformation": false, + "_transformation": "op / 100 kPa", "original_units": "0.01 kPa", "original_variable": "Sea Level Pressure", - "scale_factor": 10, "standard_name": "air_pressure_at_mean_sea_level", "units": "Pa" }, @@ -496,10 +473,9 @@ "_table_name": [ "HLY01" ], - "_transformation": false, + "_transformation": "op / 10 degC", "original_units": "0.1 °C", "original_variable": "Dew Point Temperature", - "scale_factor": 0.1, "standard_name": "dew_point_temperature", "units": "K" }, @@ -511,10 +487,9 @@ "_table_name": [ "HLY01" ], - "_transformation": false, + "_transformation": "op * 10 deg", "original_units": "10's of degrees", "original_variable": "Wind Direction at 2 m (U2A Anemometer) (16 pts)", - "scale_factor": 10, "standard_name": "wind_direction_u2a", "units": "deg" }, @@ -529,7 +504,6 @@ "_transformation": false, "original_units": "km/h", "original_variable": "Wind Speed - U2A (16 pts) to December 1970", - "scale_factor": 1, "standard_name": "wind_speed_u2a", "units": "m s-1" }, @@ -541,10 +515,9 @@ "_table_name": [ "HLY01" ], - 
"_transformation": false, + "_transformation": "op / 100 kPa", "original_units": "0.01 kPa", "original_variable": "Station Pressure", - "scale_factor": 10, "standard_name": "atmospheric_pressure", "units": "Pa" }, @@ -556,10 +529,9 @@ "_table_name": [ "HLY01" ], - "_transformation": false, + "_transformation": "op / 10 degC", "original_units": "0.1 °C", "original_variable": "Dry Bulb Temperature", - "scale_factor": 0.1, "standard_name": "dry_bulb_temperature", "units": "K" }, @@ -571,10 +543,9 @@ "_table_name": [ "HLY01" ], - "_transformation": false, + "_transformation": "op / 10 degC", "original_units": "0.1 °C", "original_variable": "Wet Bulb temperature", - "scale_factor": 0.1, "standard_name": "wet_bulb_temperature", "units": "K" }, @@ -589,7 +560,6 @@ "_transformation": false, "original_units": "%", "original_variable": "Relative Humidity", - "scale_factor": 1, "standard_name": "relative_humidity", "units": "1" }, @@ -601,8 +571,8 @@ "_table_name": [ "HLY01" ], - "_transformation": false, - "original_units": "%", + "_transformation": "op * 10", + "original_units": "Tenths", "original_variable": "Total Cloud Opacity", "scale_factor": 10, "standard_name": "cloud_albedo", @@ -616,8 +586,8 @@ "_table_name": [ "HLY01" ], - "_transformation": false, - "original_units": "%", + "_transformation": "op * 10", + "original_units": "Tenths", "original_variable": "Total Cloud Amount", "scale_factor": 10, "standard_name": "cloud_area_fraction", @@ -633,7 +603,6 @@ ], "_transformation": false, "original_variable": "Freezing Rain", - "scale_factor": 1, "standard_name": "freezing_rain", "units": "1" }, @@ -647,7 +616,6 @@ ], "_transformation": false, "original_variable": "Ice Pellets", - "scale_factor": 1, "standard_name": "ice_pellet_presence", "units": "1" }, @@ -659,10 +627,9 @@ "_table_name": [ "HLY01" ], - "_transformation": false, + "_transformation": "op * 10", "original_units": "Tenths", "original_variable": "Lowest cloud layer opacity", - "scale_factor": 10, 
"standard_name": "low_type_cloud_opacity_fraction", "units": "1" }, @@ -674,10 +641,9 @@ "_table_name": [ "HLY01" ], - "_transformation": false, + "_transformation": "op * 10", "original_units": "Tenths", "original_variable": "Lowest cloud layer amount or condition", - "scale_factor": 10, "standard_name": "low_type_cloud_area_fraction", "units": "1" }, @@ -691,7 +657,6 @@ ], "_transformation": false, "original_variable": "Lowest cloud layer type", - "scale_factor": 1, "standard_name": "low_type_cloud_type", "units": "1" }, @@ -703,10 +668,9 @@ "_table_name": [ "HLY01" ], - "_transformation": false, + "_transformation": "op * 30 m", "original_units": "30's of meters", "original_variable": "Lowest cloud layer height", - "scale_factor": 30, "standard_name": "low_type_cloud_height", "units": "m" }, @@ -718,10 +682,9 @@ "_table_name": [ "HLY01" ], - "_transformation": false, + "_transformation": "op * 30 m", "original_units": "Tenths", "original_variable": "Second lowest cloud layer opacity", - "scale_factor": 10, "standard_name": "low_type_cloud_opacity_fraction", "units": "1" }, @@ -733,10 +696,9 @@ "_table_name": [ "HLY01" ], - "_transformation": false, + "_transformation": "op * 10", "original_units": "Tenths", "original_variable": "Second lowest cloud layer amount or condition", - "scale_factor": 10, "standard_name": "low_type_cloud_area_fraction", "units": "1" }, @@ -751,7 +713,6 @@ "_transformation": false, "original_units": "", "original_variable": "Second lowest cloud layer type", - "scale_factor": 1, "standard_name": "low_type_cloud_type", "units": "1" }, @@ -763,10 +724,9 @@ "_table_name": [ "HLY01" ], - "_transformation": false, + "_transformation": "op * 30 m", "original_units": "30's of meters", "original_variable": "Second lowest cloud layer height", - "scale_factor": 30, "standard_name": "low_type_cloud_height", "units": "m" }, @@ -778,10 +738,9 @@ "_table_name": [ "HLY01" ], - "_transformation": false, + "_transformation": "op * 10", "original_units": 
"Tenths", "original_variable": "Thirsd lowest cloud layer opacity", - "scale_factor": 10, "standard_name": "low_type_cloud_opacity_fraction", "units": "1" }, @@ -793,10 +752,9 @@ "_table_name": [ "HLY01" ], - "_transformation": false, + "_transformation": "op * 10", "original_units": "Tenths", "original_variable": "Third lowest cloud layer amount or condition", - "scale_factor": 10, "standard_name": "low_type_cloud_area_fraction", "units": "1" }, @@ -811,7 +769,6 @@ "_transformation": false, "original_units": "", "original_variable": "Third lowest cloud layer type", - "scale_factor": 1, "standard_name": "low_type_cloud_type", "units": "1" }, @@ -823,10 +780,9 @@ "_table_name": [ "HLY01" ], - "_transformation": false, + "_transformation": "op * 30 m", "original_units": "30's of meters", "original_variable": "Third lowest cloud layer height", - "scale_factor": 30, "standard_name": "low_type_cloud_height", "units": "m" }, @@ -838,10 +794,9 @@ "_table_name": [ "HLY01" ], - "_transformation": false, + "_transformation": "op / 10 mm h-1", "original_units": "0.1 mm", "original_variable": "Total Rainfall", - "scale_factor": 0.1, "standard_name": "rainfall_flux", "units": "kg m2 s-1" }, @@ -853,10 +808,9 @@ "_table_name": [ "HLY10" ], - "_transformation": false, + "_transformation": "op / 10 h", "original_units": "0.1 hrs", "original_variable": "Sunshine", - "scale_factor": 0.1, "standard_name": "duration_of_sunshine", "units": "s" }, @@ -868,10 +822,8 @@ "_table_name": [ "HLY01" ], - "_transformation": false, - "original_units": "10's of degrees", + "_transformation": "op * 10 deg", "original_variable": "Wind Direction - U2A (36 pts) from January 1971", - "scale_factor": 10, "standard_name": "wind_direction_u2a", "units": "deg" }, @@ -884,15 +836,15 @@ "HLY01" ], "_transformation": false, + "description": "Gust (G)=1, Squall (Q)=2", "long_name": "wind_direction_u2a", "original_units": "1, 2", "original_variable": "Wind character at 10 m", - "scale_factor": 1, "units": "" 
}, "210": { - "_cf_variable_name": "wind_dir_u2a_36", - "_corrected_units": "deg", + "_cf_variable_name": "", + "_corrected_units": "km h-1", "_invert_sign": false, "_offset_time": false, "_table_name": [ @@ -900,10 +852,9 @@ ], "_transformation": false, "original_units": "km/h", - "original_variable": "Wind Direction - U2A (36 pts) from January 1971", - "scale_factor": 10, - "standard_name": "wind_direction_u2a", - "units": "deg" + "original_variable": "Wind gust speed at 10 m", + "standard_name": "wind_speed_of_gust", + "units": "m s-1" }, "262": { "_cf_variable_name": "prtot", @@ -913,10 +864,9 @@ "_table_name": [ "HLY01_RCS" ], - "_transformation": false, + "_transformation": "op / 10 mm", "original_units": "0.1 mm", "original_variable": "Total Precipitation (minutes 00-60)", - "scale_factor": 0.1, "standard_name": "precipitation_amount", "units": "kg m-2" }, @@ -928,10 +878,9 @@ "_table_name": [ "HLY01_RCS" ], - "_transformation": false, + "_transformation": "op / 10 mm", "original_units": "0.1 mm", "original_variable": "Total Precipitation (minutes 00-15)", - "scale_factor": 0.1, "standard_name": "precipitation_amount", "units": "kg m-2" }, @@ -943,10 +892,9 @@ "_table_name": [ "HLY01_RCS" ], - "_transformation": false, + "_transformation": "op / 10 mm", "original_units": "0.1 mm", "original_variable": "Total Precipitation (minutes 15-30)", - "scale_factor": 0.1, "standard_name": "precipitation_amount", "units": "kg m-2" }, @@ -958,10 +906,9 @@ "_table_name": [ "HLY01_RCS" ], - "_transformation": false, + "_transformation": "op / 10 mm", "original_units": "0.1 mm", "original_variable": "Total Precipitation (minutes 30-45)", - "scale_factor": 0.1, "standard_name": "precipitation_amount", "units": "kg m-2" }, @@ -973,10 +920,9 @@ "_table_name": [ "HLY01_RCS" ], - "_transformation": false, + "_transformation": "op / 10 mm", "original_units": "0.1 mm", "original_variable": "Total Precipitation (minutes 45-60)", - "scale_factor": 0.1, "standard_name": 
"precipitation_amount", "units": "kg m-2" }, @@ -988,10 +934,9 @@ "_table_name": [ "HLY01_RCS" ], - "_transformation": false, + "_transformation": "op / 10 kg m-2", "original_units": "0.1 kg/m²", "original_variable": "Precipitation Gauge Weight per Unit Area (at minute 15)", - "scale_factor": 0.1, "standard_name": "precipitation_amount", "units": "kg m-2" }, @@ -1003,10 +948,9 @@ "_table_name": [ "HLY01_RCS" ], - "_transformation": false, + "_transformation": "op / 10 kg m-2", "original_units": "0.1 kg/m²", "original_variable": "Precipitation Gauge Weight per Unit Area (at minute 30)", - "scale_factor": 0.1, "standard_name": "precipitation_amount", "units": "kg m-2" }, @@ -1018,10 +962,9 @@ "_table_name": [ "HLY01_RCS" ], - "_transformation": false, + "_transformation": "op / 10 kg m-2", "original_units": "0.1 kg/m²", "original_variable": "Precipitation Gauge Weight per Unit Area (at minute 45)", - "scale_factor": 0.1, "standard_name": "precipitation_amount", "units": "kg m-2" }, @@ -1033,10 +976,9 @@ "_table_name": [ "HLY01_RCS" ], - "_transformation": false, + "_transformation": "op / 10 kg m-2", "original_units": "0.1 kg/m²", "original_variable": "Precipitation Gauge Weight per Unit Area (at minute 60)", - "scale_factor": 0.1, "standard_name": "precipitation_amount", "units": "kg m-2" }, @@ -1048,10 +990,9 @@ "_table_name": [ "HLY01_RCS" ], - "_transformation": false, + "_transformation": "op / 10 km h-1", "original_units": "0.1 km/h", "original_variable": "Wind Speed at 2 m (minutes 00-15)", - "scale_factor": 0.1, "standard_name": "wind_speed", "units": "m s-1" }, @@ -1063,10 +1004,9 @@ "_table_name": [ "HLY01_RCS" ], - "_transformation": false, + "_transformation": "op / 10 km h-1", "original_units": "0.1 km/h", "original_variable": "Wind Speed at 2 m (minutes 15-30)", - "scale_factor": 0.1, "standard_name": "wind_speed", "units": "m s-1" }, @@ -1078,10 +1018,9 @@ "_table_name": [ "HLY01_RCS" ], - "_transformation": false, + "_transformation": "op / 10 km 
h-1", "original_units": "0.1 km/h", "original_variable": "Wind Speed at 2 m (minutes 30-45)", - "scale_factor": 0.1, "standard_name": "wind_speed", "units": "m s-1" }, @@ -1093,15 +1032,14 @@ "_table_name": [ "HLY01_RCS" ], - "_transformation": false, + "_transformation": "op / 10 km h-1", "original_units": "0.1 km/h", "original_variable": "Wind Speed at 2 m (minutes 45-60)", - "scale_factor": 0.1, "standard_name": "wind_speed", "units": "m s-1" }, "275": { - "_cf_variable_name": "snd_q4", + "_cf_variable_name": "snd", "_corrected_units": "cm", "_invert_sign": false, "_offset_time": false, @@ -1111,7 +1049,6 @@ "_transformation": false, "original_units": "cm", "original_variable": "Snow Depth (at minute 60)", - "scale_factor": 1, "standard_name": "surface_snow_thickness", "units": "m" }, @@ -1126,7 +1063,6 @@ "_transformation": false, "original_units": "cm", "original_variable": "Snow Depth (at minute 15)", - "scale_factor": 1, "standard_name": "surface_snow_thickness", "units": "m" }, @@ -1141,7 +1077,6 @@ "_transformation": false, "original_units": "cm", "original_variable": "Snow Depth (at minute 30)", - "scale_factor": 1, "standard_name": "surface_snow_thickness", "units": "m" }, @@ -1156,7 +1091,6 @@ "_transformation": false, "original_units": "cm", "original_variable": "Snow Depth (at minute 45)", - "scale_factor": 1, "standard_name": "surface_snow_thickness", "units": "m" }, @@ -1171,7 +1105,6 @@ "_transformation": false, "original_units": "Degrees", "original_variable": "Wind Direction at 2 m (minutes 50-60)", - "scale_factor": 1, "standard_name": "wind_direction", "units": "deg" }, @@ -1183,10 +1116,9 @@ "_table_name": [ "HLY01_RCS" ], - "_transformation": false, + "_transformation": "op / 10 km h-1", "original_units": "0.1 km/h", "original_variable": "Wind Speed at 2 m (minutes 50-60)", - "scale_factor": 0.1, "standard_name": "wind_speed", "units": "m s-1" } From 94fe8669013264a82097e6b84632beadd5dcd2ad Mon Sep 17 00:00:00 2001 From: Zeitsperre 
<10819524+Zeitsperre@users.noreply.github.com> Date: Tue, 20 Jun 2023 15:09:09 -0400 Subject: [PATCH 03/33] WIP - more refactoring - AHCCD incomplete --- miranda/convert/__init__.py | 4 +- miranda/convert/_data_definitions.py | 56 ++++- miranda/convert/_reconstruction.py | 2 +- .../{_data_corrections.py => _treatments.py} | 217 +++--------------- miranda/convert/corrections.py | 157 +++++++++++++ ...f_attrs.json => eccc-canswe_cf_attrs.json} | 15 +- .../data/eccc_homogenized_cf_attrs.json | 18 +- .../data}/eccc_obs_cf_attrs.json | 11 +- miranda/convert/data/eccc_rdrs_cf_attrs.json | 11 +- miranda/convert/data/espo-g6-e5l_attrs.json | 1 + miranda/convert/data/espo-g6-r2_attrs.json | 1 + ...sa_cf_attrs.json => nasa_ag_cf_attrs.json} | 0 .../convert/data/nex-gddp-cmip6_attrs.json | 1 + miranda/convert/{eccc.py => eccc_canswe.py} | 4 +- miranda/{eccc/_raw.py => convert/eccc_obs.py} | 49 ++-- miranda/convert/eccc_rdrs.py | 10 +- miranda/convert/melcc.py | 8 +- miranda/eccc/__init__.py | 1 - miranda/eccc/convert.py | 2 +- templates/eccc_raw_hourly_conversion.py | 6 +- templates/emdna_processing.py | 3 +- templates/era5-land_reanalysis_processing.py | 3 +- templates/espo-g6.py | 3 +- templates/nasa_nex-gddp-cmip6_processing.py | 3 +- 24 files changed, 327 insertions(+), 259 deletions(-) rename miranda/convert/{_data_corrections.py => _treatments.py} (78%) create mode 100644 miranda/convert/corrections.py rename miranda/convert/data/{eccc_cf_attrs.json => eccc-canswe_cf_attrs.json} (60%) rename miranda/{eccc => convert}/data/eccc_homogenized_cf_attrs.json (96%) rename miranda/{eccc => convert/data}/eccc_obs_cf_attrs.json (99%) rename miranda/convert/data/{nasa_cf_attrs.json => nasa_ag_cf_attrs.json} (100%) rename miranda/convert/{eccc.py => eccc_canswe.py} (95%) rename miranda/{eccc/_raw.py => convert/eccc_obs.py} (96%) diff --git a/miranda/convert/__init__.py b/miranda/convert/__init__.py index a38c34ee..179bd597 100644 --- a/miranda/convert/__init__.py +++ 
b/miranda/convert/__init__.py @@ -1,9 +1,9 @@ """Data Conversion module.""" from __future__ import annotations -from . import deh, eccc, ecmwf, hq, melcc, utils +from . import deh, eccc_canswe, ecmwf, hq, melcc, utils from ._aggregation import * -from ._data_corrections import * from ._data_definitions import * +from ._treatments import * # from ._reconstruction import * diff --git a/miranda/convert/_data_definitions.py b/miranda/convert/_data_definitions.py index 24fe4f2f..4ed4741b 100644 --- a/miranda/convert/_data_definitions.py +++ b/miranda/convert/_data_definitions.py @@ -5,6 +5,7 @@ import logging.config import os from pathlib import Path +from typing import Any from miranda.scripting import LOGGING_CONFIG from miranda.storage import report_file_size @@ -21,10 +22,11 @@ "gather_nex", "gather_nrcan_gridded_obs", "gather_raw_rdrs_by_years", - "gather_rdrs", + "gather_eccc_rdrs", "gather_sc_earth", "gather_wfdei_gem_capa", "gather_emdna", + "load_json_data_mappings", "nasa_ag_variables", "nrcan_variables", "project_institutes", @@ -35,6 +37,54 @@ _data_folder = Path(__file__).parent / "data" + +def load_json_data_mappings(project: str) -> dict[str, Any]: + """Load JSON mappings for supported dataset conversions. 
+ + Parameters + ---------- + project : str + + Returns + ------- + dict[str, Any] + """ + data_folder = Path(__file__).resolve().parent / "data" + + if project.startswith("era5"): + metadata_definition = json.load(open(data_folder / "ecmwf_cf_attrs.json")) + elif project in ["rdrs-v21"]: + metadata_definition = json.load(open(data_folder / "eccc_rdrs_cf_attrs.json")) + elif project == "eccc-obs": + metadata_definition = json.load(open(data_folder / "eccc_obs_cf_attrs.json")) + elif project in ["agcfsr", "agmerra2"]: + metadata_definition = json.load(open(data_folder / "nasa_ag_cf_attrs.json")) + elif project in ["cordex", "cmip5", "cmip6"]: + metadata_definition = json.load(open(data_folder / "cmip_ouranos_attrs.json")) + elif project == "ets-grnch": + metadata_definition = json.load(open(data_folder / "ets_grnch_cf_attrs.json")) + elif project == "nrcan-gridded-10km": + raise NotImplementedError() + elif project == "wfdei-gem-capa": + metadata_definition = json.load(open(data_folder / "usask_cf_attrs.json")) + elif project.startswith("melcc"): + metadata_definition = json.load(open(data_folder / "melcc_cf_attrs.json")) + elif project.startswith("ec"): + metadata_definition = json.load(open(data_folder / "eccc-canswe_cf_attrs.json")) + elif project in ["NEX-GDDP-CMIP6"]: + metadata_definition = json.load(open(data_folder / "nex-gddp-cmip6_attrs.json")) + elif project in ["ESPO-G6-R2"]: + metadata_definition = json.load(open(data_folder / "espo-g6-r2_attrs.json")) + elif project in ["ESPO-G6-E5L"]: + metadata_definition = json.load(open(data_folder / "espo-g6-e5l_attrs.json")) + elif project in ["EMDNA"]: + metadata_definition = json.load(open(data_folder / "emdna_cf_attrs.json")) + else: + raise NotImplementedError() + + return metadata_definition + + eccc_rdrs_variables = dict() eccc_rdrs_variables["raw"] = [ v @@ -54,7 +104,7 @@ ].keys() grnch_variables = ["T", "Tmin", "Tmax", "P"] nrcan_variables = ["tasmin", "tasmax", "pr"] -nasa_ag_variables = 
json.load(open(_data_folder / "nasa_cf_attrs.json"))[ +nasa_ag_variables = json.load(open(_data_folder / "nasa_ag_cf_attrs.json"))[ "variables" ].keys() sc_earth_variables = ["prcp", "tdew", "tmean", "trange", "wind"] @@ -236,7 +286,7 @@ def gather_sc_earth(path: str | os.PathLike) -> dict[str, list[Path]]: ) -def gather_rdrs( +def gather_eccc_rdrs( name: str, path: str | os.PathLike, suffix: str, key: str ) -> dict[str, dict[str, list[Path]]]: """Gather RDRS processed source data. diff --git a/miranda/convert/_reconstruction.py b/miranda/convert/_reconstruction.py index 6cabf25d..4963d68e 100644 --- a/miranda/convert/_reconstruction.py +++ b/miranda/convert/_reconstruction.py @@ -17,8 +17,8 @@ from miranda.utils import chunk_iterables from ._aggregation import aggregate as aggregate_func -from ._data_corrections import dataset_corrections from ._data_definitions import project_institutes, xarray_frequencies_to_cmip6like +from .corrections import dataset_corrections logging.config.dictConfig(LOGGING_CONFIG) diff --git a/miranda/convert/_data_corrections.py b/miranda/convert/_treatments.py similarity index 78% rename from miranda/convert/_data_corrections.py rename to miranda/convert/_treatments.py index 5bdbcaf9..a1241940 100644 --- a/miranda/convert/_data_corrections.py +++ b/miranda/convert/_treatments.py @@ -1,14 +1,11 @@ from __future__ import annotations import datetime -import json import logging.config import os import warnings -from collections.abc import Iterator, Sequence from functools import partial from pathlib import Path -from typing import Any, Callable import numpy as np import xarray as xr @@ -18,72 +15,33 @@ from xclim.core.calendar import parse_offset from miranda import __version__ as __miranda_version__ -from miranda.gis import subset_domain from miranda.scripting import LOGGING_CONFIG from miranda.units import get_time_frequency -from .utils import date_parser, find_version_hash +from ._data_definitions import load_json_data_mappings +from 
.utils import date_parser logging.config.dictConfig(LOGGING_CONFIG) VERSION = datetime.datetime.now().strftime("%Y.%m.%d") __all__ = [ - "dataset_corrections", - "dims_conversion", - "dataset_conversion", - "load_json_data_mappings", + "cf_units_conversion", + "clip_values", + "conservative_regrid", + "correct_unit_names", + "dimensions_compliance", + "ensure_correct_time_frequency", + "invert_value_sign", "metadata_conversion", + "offset_time_dimension", + "preprocessing_corrections", "threshold_mask", + "transform_values", "variable_conversion", ] -def load_json_data_mappings(project: str) -> dict[str, Any]: - """Load JSON mappings for supported dataset conversions. - - Parameters - ---------- - project : str - - Returns - ------- - dict[str, Any] - """ - data_folder = Path(__file__).resolve().parent / "data" - - if project.startswith("era5"): - metadata_definition = json.load(open(data_folder / "ecmwf_cf_attrs.json")) - elif project in ["rdrs-v21"]: - metadata_definition = json.load(open(data_folder / "eccc_rdrs_cf_attrs.json")) - elif project in ["agcfsr", "agmerra2"]: # This should handle the AG versions: - metadata_definition = json.load(open(data_folder / "nasa_cf_attrs.json")) - elif project in ["cordex", "cmip5", "cmip6"]: - metadata_definition = json.load(open(data_folder / "cmip_ouranos_attrs.json")) - elif project == "ets-grnch": - metadata_definition = json.load(open(data_folder / "ets_grnch_cf_attrs.json")) - elif project == "nrcan-gridded-10km": - raise NotImplementedError() - elif project == "wfdei-gem-capa": - metadata_definition = json.load(open(data_folder / "usask_cf_attrs.json")) - elif project.startswith("melcc"): - metadata_definition = json.load(open(data_folder / "melcc_cf_attrs.json")) - elif project.startswith("ec"): - metadata_definition = json.load(open(data_folder / "eccc_cf_attrs.json")) - elif project in ["NEX-GDDP-CMIP6"]: - metadata_definition = json.load(open(data_folder / "nex-gddp-cmip6_attrs.json")) - elif project in 
["ESPO-G6-R2"]: - metadata_definition = json.load(open(data_folder / "espo-g6-r2_attrs.json")) - elif project in ["ESPO-G6-E5L"]: - metadata_definition = json.load(open(data_folder / "espo-g6-e5l_attrs.json")) - elif project in ["EMDNA"]: - metadata_definition = json.load(open(data_folder / "emdna_cf_attrs.json")) - else: - raise NotImplementedError() - - return metadata_definition - - def _get_section_entry_key(meta, entry, var, key, project): var_meta = meta[entry].get(var, {}) if key in var_meta: @@ -321,7 +279,8 @@ def _preprocess_correct(d: xr.Dataset, *, ops: list[partial]) -> xr.Dataset: return ds -def _correct_units_names(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: +def correct_unit_names(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: + """Correct unit names.""" key = "_corrected_units" for var, val in _iter_entry_key(d, m, "variables", key, p): if val: @@ -336,7 +295,8 @@ def _correct_units_names(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: # for de-accumulation or conversion to flux -def _transform(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: +def transform_values(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: + """Transform dataset values according to operation listed.""" key = "_transformation" d_out = xr.Dataset(coords=d.coords, attrs=d.attrs) converted = [] @@ -430,7 +390,8 @@ def _transform(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: return d_out -def _offset_time(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: +def offset_time_dimension(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: + """Offset time dimension using listed frequency.""" key = "_offset_time" d_out = xr.Dataset(coords=d.coords, attrs=d.attrs) converted = [] @@ -479,7 +440,8 @@ def _offset_time(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: return d_out -def _invert_sign(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: +def invert_value_sign(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: + """Flip value of DataArray.""" key = "_invert_sign" d_out = xr.Dataset(coords=d.coords, 
attrs=d.attrs) converted = [] @@ -507,7 +469,8 @@ def _invert_sign(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: # For converting variable units to standard workflow units -def _units_cf_conversion(d: xr.Dataset, m: dict) -> xr.Dataset: +def cf_units_conversion(d: xr.Dataset, m: dict) -> xr.Dataset: + """Perform pint-based units-conversion.""" if "time" in m["dimensions"].keys(): if m["dimensions"]["time"].get("units"): d["time"]["units"] = m["dimensions"]["time"]["units"] @@ -524,7 +487,8 @@ def _units_cf_conversion(d: xr.Dataset, m: dict) -> xr.Dataset: # For clipping variable values to an established maximum/minimum -def _clip_values(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: +def clip_values(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: + """Clip values to an appropriate range,.""" key = "_clip_values" d_out = xr.Dataset(coords=d.coords, attrs=d.attrs) converted = [] @@ -572,7 +536,8 @@ def _clip_values(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: return d_out -def _ensure_correct_time(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: +def ensure_correct_time_frequency(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: + """Ensure that time frequency is consistent with expected frequency for project.""" key = "_ensure_correct_time" strict_time = "_strict_time" @@ -643,8 +608,8 @@ def _ensure_correct_time(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: # For renaming and reordering lat and lon dims -def dims_conversion(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: - """Rename dimensions to CF to their equivalents. +def dimensions_compliance(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: + """Rename dimensions to CF to their equivalents and reorder them if needed. 
Parameters ---------- @@ -854,129 +819,3 @@ def metadata_conversion(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: d.attrs.update(dict(history=history)) return d - - -def dataset_corrections(ds: xr.Dataset, project: str) -> xr.Dataset: - """Convert variables to CF-compliant format""" - metadata_definition = load_json_data_mappings(project) - - ds = _correct_units_names(ds, project, metadata_definition) - ds = _transform(ds, project, metadata_definition) - ds = _invert_sign(ds, project, metadata_definition) - ds = _units_cf_conversion(ds, metadata_definition) - ds = _clip_values(ds, project, metadata_definition) - - ds = dims_conversion(ds, project, metadata_definition) - ds = _ensure_correct_time(ds, project, metadata_definition) - ds = _offset_time(ds, project, metadata_definition) - - ds = variable_conversion(ds, project, metadata_definition) - - ds = metadata_conversion(ds, project, metadata_definition) - - ds.attrs["history"] = ( - f"{datetime.datetime.now()}: " - f"Variables converted from original files using miranda.convert.{dataset_corrections.__name__}. " - f"{ds.attrs.get('history')}".strip() - ) - - return ds - - -def dataset_conversion( - input_files: ( - str - | os.PathLike - | Sequence[str | os.PathLike] - | Iterator[os.PathLike] - | xr.Dataset - ), - project: str, - domain: str | None = None, - mask: xr.Dataset | xr.DataArray | None = None, - mask_cutoff: float | bool = False, - regrid: bool = False, - add_version_hashes: bool = True, - preprocess: Callable | str | None = "auto", - **xr_kwargs, -) -> xr.Dataset | xr.DataArray: - """Convert an existing Xarray-compatible dataset to another format with variable corrections applied. - - Parameters - ---------- - input_files : str or os.PathLike or Sequence[str or os.PathLike] or Iterator[os.PathLike] or xr.Dataset - Files or objects to be converted. - If sent a list or GeneratorType, will open with :py:func:`xarray.open_mfdataset` and concatenate files. 
- project : {"cordex", "cmip5", "cmip6", "ets-grnch", "isimip-ft", "pcic-candcs-u6", "converted"} - Project name for decoding/handling purposes. - domain: {"global", "nam", "can", "qc", "mtl"}, optional - Domain to perform subsetting for. Default: None. - mask : Optional[Union[xr.Dataset, xr.DataArray]] - DataArray or single data_variable dataset containing mask. - mask_cutoff : float or bool - If land_sea_mask supplied, the threshold above which to mask with land_sea_mask. Default: False. - regrid : bool - Performing regridding with xesmf. Default: False. - add_version_hashes : bool - If True, version name and sha256sum of source file(s) will be added as a field among the global attributes. - preprocess : callable or str, optional - Preprocessing functions to perform over each Dataset. - Default: "auto" - Run preprocessing fixes based on supplied fields from metadata definition. - Callable - Runs function over Dataset (single) or supplied to `preprocess` (multifile dataset). - **xr_kwargs - Arguments passed directly to xarray. 
- - Returns - ------- - xr.Dataset or xr.DataArray - """ - if isinstance(input_files, xr.Dataset): - ds = input_files - else: - if isinstance(input_files, (str, os.PathLike)): - if Path(input_files).is_dir(): - files = [] - files.extend([f for f in Path(input_files).glob("*.nc")]) - files.extend([f for f in Path(input_files).glob("*.zarr")]) - else: - files = [Path(input_files)] - elif isinstance(input_files, (Sequence, Iterator)): - files = [Path(f) for f in input_files] - else: - files = input_files - version_hashes = dict() - if add_version_hashes: - for file in files: - version_hashes[file.name] = find_version_hash(file) - - preprocess_kwargs = dict() - if preprocess: - if preprocess == "auto": - preprocess_kwargs.update( - preprocess=partial(preprocessing_corrections, project=project) - ) - elif isinstance(preprocess, Callable): - preprocess_kwargs.update(preprocess=preprocess) - - if len(files) == 1: - ds = xr.open_dataset(files[0], **xr_kwargs) - for _, process in preprocess_kwargs.items(): - ds = process(ds) - else: - ds = xr.open_mfdataset(files, **xr_kwargs, **preprocess_kwargs) - if version_hashes: - ds.attrs.update(dict(original_files=str(version_hashes))) - - ds = dataset_corrections(ds, project) - - if domain: - ds = subset_domain(ds, domain) - - if isinstance(mask, (str, Path)): - mask = xr.open_dataset(mask) - if isinstance(mask, (xr.Dataset, xr.DataArray)): - if regrid: - mask = conservative_regrid(ds, mask) - ds = threshold_mask(ds, mask=mask, mask_cutoff=mask_cutoff) - - return ds diff --git a/miranda/convert/corrections.py b/miranda/convert/corrections.py new file mode 100644 index 00000000..259d2b1b --- /dev/null +++ b/miranda/convert/corrections.py @@ -0,0 +1,157 @@ +"""Dataset corrections submodule.""" +from __future__ import annotations + +import datetime +import os +from functools import partial +from pathlib import Path +from typing import Callable, Iterator, Sequence + +import xarray as xr + +from miranda.convert import ( + 
dimensions_compliance, + metadata_conversion, + threshold_mask, + variable_conversion, +) +from miranda.convert._data_definitions import load_json_data_mappings +from miranda.convert._treatments import ( + cf_units_conversion, + clip_values, + conservative_regrid, + correct_unit_names, + ensure_correct_time_frequency, + invert_value_sign, + offset_time_dimension, + preprocessing_corrections, + transform_values, +) +from miranda.convert.utils import find_version_hash +from miranda.gis import subset_domain + + +def dataset_corrections(ds: xr.Dataset, project: str) -> xr.Dataset: + """Convert variables to CF-compliant format""" + metadata_definition = load_json_data_mappings(project) + + ds = correct_unit_names(ds, project, metadata_definition) + ds = transform_values(ds, project, metadata_definition) + ds = invert_value_sign(ds, project, metadata_definition) + ds = cf_units_conversion(ds, metadata_definition) + ds = clip_values(ds, project, metadata_definition) + + ds = dimensions_compliance(ds, project, metadata_definition) + ds = ensure_correct_time_frequency(ds, project, metadata_definition) + ds = offset_time_dimension(ds, project, metadata_definition) + + ds = variable_conversion(ds, project, metadata_definition) + + ds = metadata_conversion(ds, project, metadata_definition) + + ds.attrs["history"] = ( + f"{datetime.datetime.now()}: " + f"Variables converted from original files using miranda.convert.{dataset_corrections.__name__}. 
" + f"{ds.attrs.get('history')}".strip() + ) + + return ds + + +def dataset_conversion( + input_files: ( + str + | os.PathLike + | Sequence[str | os.PathLike] + | Iterator[os.PathLike] + | xr.Dataset + ), + project: str, + domain: str | None = None, + mask: xr.Dataset | xr.DataArray | None = None, + mask_cutoff: float | bool = False, + regrid: bool = False, + add_version_hashes: bool = True, + preprocess: Callable | str | None = "auto", + **xr_kwargs, +) -> xr.Dataset | xr.DataArray: + """Convert an existing Xarray-compatible dataset to another format with variable corrections applied. + + Parameters + ---------- + input_files : str or os.PathLike or Sequence[str or os.PathLike] or Iterator[os.PathLike] or xr.Dataset + Files or objects to be converted. + If sent a list or GeneratorType, will open with :py:func:`xarray.open_mfdataset` and concatenate files. + project : {"cordex", "cmip5", "cmip6", "ets-grnch", "isimip-ft", "pcic-candcs-u6", "converted"} + Project name for decoding/handling purposes. + domain: {"global", "nam", "can", "qc", "mtl"}, optional + Domain to perform subsetting for. Default: None. + mask : Optional[Union[xr.Dataset, xr.DataArray]] + DataArray or single data_variable dataset containing mask. + mask_cutoff : float or bool + If land_sea_mask supplied, the threshold above which to mask with land_sea_mask. Default: False. + regrid : bool + Performing regridding with xesmf. Default: False. + add_version_hashes : bool + If True, version name and sha256sum of source file(s) will be added as a field among the global attributes. + preprocess : callable or str, optional + Preprocessing functions to perform over each Dataset. + Default: "auto" - Run preprocessing fixes based on supplied fields from metadata definition. + Callable - Runs function over Dataset (single) or supplied to `preprocess` (multifile dataset). + **xr_kwargs + Arguments passed directly to xarray. 
+ + Returns + ------- + xr.Dataset or xr.DataArray + """ + if isinstance(input_files, xr.Dataset): + ds = input_files + else: + if isinstance(input_files, (str, os.PathLike)): + if Path(input_files).is_dir(): + files = [] + files.extend([f for f in Path(input_files).glob("*.nc")]) + files.extend([f for f in Path(input_files).glob("*.zarr")]) + else: + files = [Path(input_files)] + elif isinstance(input_files, (Sequence, Iterator)): + files = [Path(f) for f in input_files] + else: + files = input_files + version_hashes = dict() + if add_version_hashes: + for file in files: + version_hashes[file.name] = find_version_hash(file) + + preprocess_kwargs = dict() + if preprocess: + if preprocess == "auto": + preprocess_kwargs.update( + preprocess=partial(preprocessing_corrections, project=project) + ) + elif isinstance(preprocess, Callable): + preprocess_kwargs.update(preprocess=preprocess) + + if len(files) == 1: + ds = xr.open_dataset(files[0], **xr_kwargs) + for _, process in preprocess_kwargs.items(): + ds = process(ds) + else: + ds = xr.open_mfdataset(files, **xr_kwargs, **preprocess_kwargs) + if version_hashes: + ds.attrs.update(dict(original_files=str(version_hashes))) + + ds = dataset_corrections(ds, project) + + if domain: + ds = subset_domain(ds, domain) + + if isinstance(mask, (str, Path)): + mask = xr.open_dataset(mask) + if isinstance(mask, (xr.Dataset, xr.DataArray)): + if regrid: + mask = conservative_regrid(ds, mask) + ds = threshold_mask(ds, mask=mask, mask_cutoff=mask_cutoff) + + return ds diff --git a/miranda/convert/data/eccc_cf_attrs.json b/miranda/convert/data/eccc-canswe_cf_attrs.json similarity index 60% rename from miranda/convert/data/eccc_cf_attrs.json rename to miranda/convert/data/eccc-canswe_cf_attrs.json index 4424ae76..4b48eb98 100644 --- a/miranda/convert/data/eccc_cf_attrs.json +++ b/miranda/convert/data/eccc-canswe_cf_attrs.json @@ -2,31 +2,32 @@ "Header": { "Conventions": "CF-1.9", "_contact": { - "ec-canswe": "vincent.vionnet@canada.ca" 
+ "eccc-canswe": "vincent.vionnet@canada.ca" }, "_doi": { - "ec-canswe": "10.5281/zenodo.6638382" + "eccc-canswe": "10.5281/zenodo.6638382" }, "_license": { - "ec-canswe": "https://open.canada.ca/en/open-government-licence-canada" + "eccc-canswe": "https://open.canada.ca/en/open-government-licence-canada" }, "_miranda_version": true, "_reference": { - "ec-canswe": "https://zenodo.org/record/6638382" + "eccc-canswe": "https://zenodo.org/record/6638382" }, "_source": { - "ec-canswe": "CanSWE" + "eccc-canswe": "CanSWE" }, "_version": { - "ec-canswe": "v4" + "eccc-canswe": "v4" }, "institution": "GovCan", "license_type": { - "ec-canswe": "permissive" + "eccc-canswe": "permissive" }, "organisation": "ECCC", "processing_level": "raw", "realm": "atmos", + "source": "ECCC-CANSWE", "table_date": "2023-03-23", "table_id": "eccc", "type": "station-obs" diff --git a/miranda/eccc/data/eccc_homogenized_cf_attrs.json b/miranda/convert/data/eccc_homogenized_cf_attrs.json similarity index 96% rename from miranda/eccc/data/eccc_homogenized_cf_attrs.json rename to miranda/convert/data/eccc_homogenized_cf_attrs.json index 92c3b0f1..9eac354f 100644 --- a/miranda/eccc/data/eccc_homogenized_cf_attrs.json +++ b/miranda/convert/data/eccc_homogenized_cf_attrs.json @@ -1,24 +1,28 @@ { "Header": { "Conventions": "CF-1.8", + "_citation": { + "gen2": "Mekis, É and L.A. Vincent, 2011: An overview of the second generation adjusted daily precipitation dataset for trend analysis in Canada. Atmosphere-Ocean 49(2), 163-177 doi:10.1080/07055900.2011.583910", + "gen3": "Vincent, L.A., M.M. Hartwell and X.L. Wang, 2020: A Third Generation of Homogenized Temperature for Trend Analysis and Monitoring Changes in Canada’s Climate. Atmosphere-Ocean. 
https://doi.org/10.1080/07055900.2020.1765728" + }, + "_frequency": true, + "_miranda_version": true, + "_missing_values": [ + "-999", + "1e20" + ], "_product": { "gen2": "ECCC Adjusted and Homogenized Canadian Climate Data (AHCCD) version 2", "gen3": "ECCC Adjusted and Homogenized Canadian Climate Data (AHCCD) version 3" }, - "citation": { - "gen2": "Mekis, É and L.A. Vincent, 2011: An overview of the second generation adjusted daily precipitation dataset for trend analysis in Canada. Atmosphere-Ocean 49(2), 163-177 doi:10.1080/07055900.2011.583910", - "gen3": "Vincent, L.A., M.M. Hartwell and X.L. Wang, 2020: A Third Generation of Homogenized Temperature for Trend Analysis and Monitoring Changes in Canada’s Climate. Atmosphere-Ocean. https://doi.org/10.1080/07055900.2020.1765728" - }, "contact": "info.cccs-ccsc@canada.ca", "documentation": "https://www.canada.ca/en/environment-climate-change/services/climate-change/canadian-centre-climate-services/display-download/technical-documentation-adjusted-climate-data.html", - "float_missing_value": "1e20", - "frequency": "day", "institution": "GovCan", - "int_missing_value": "-999", "license": "https://climate.weather.gc.ca/prods_servs/attachment1_e.html", "license_type": "permissive", "organization": "ECCC", "realm": "atmos", + "source": "AHCCD", "table_date": "2023-03-23", "table_id": "ECCC" }, diff --git a/miranda/eccc/eccc_obs_cf_attrs.json b/miranda/convert/data/eccc_obs_cf_attrs.json similarity index 99% rename from miranda/eccc/eccc_obs_cf_attrs.json rename to miranda/convert/data/eccc_obs_cf_attrs.json index 570c318c..68b30487 100644 --- a/miranda/eccc/eccc_obs_cf_attrs.json +++ b/miranda/convert/data/eccc_obs_cf_attrs.json @@ -3,17 +3,22 @@ "Conventions": "CF-1.9", "_frequency": true, "_miranda_version": true, + "_missing_flags": "M", + "_missing_values": [ + "-999", + "1e20", + "-9999", + "#####" + ], "contact": "climatcentre-climatecentral@ec.gc.ca", "institution": "GovCan", - "int_missing_value": "-999", 
"license": "https://climate.weather.gc.ca/prods_servs/attachment1_e.html", "license_preamble": "The data is owned by the Government of Canada (Environment and Climate Change Canada), and fall under the licence agreement for use of Environment and Climate Change Canada data.", "license_type": "permissive", - "missing_value": "1e20", "organization": "ECCC", "processing_level": "raw", "realm": "atmos", - "source": "MSC", + "source": "ECCC-OBS", "table_date": "2023-03-23", "type": "station-obs" }, diff --git a/miranda/convert/data/eccc_rdrs_cf_attrs.json b/miranda/convert/data/eccc_rdrs_cf_attrs.json index d3d985b3..9a946194 100644 --- a/miranda/convert/data/eccc_rdrs_cf_attrs.json +++ b/miranda/convert/data/eccc_rdrs_cf_attrs.json @@ -27,6 +27,7 @@ "organisation": "ECCC", "processing_level": "raw", "realm": "atmos", + "source": "RDRS", "table_date": "2023-03-23", "table_id": "eccc", "type": "reconstruction" @@ -46,8 +47,8 @@ "RDRS_v2.1_A_PR0_SFC": { "_cf_variable_name": "pr", "_corrected_units": false, - "_invert_sign": {}, - "_offset_time": {}, + "_invert_sign": false, + "_offset_time": false, "_transformation": { "rdrs-v21": "amount2rate" }, @@ -63,9 +64,9 @@ "_corrected_units": { "rdrs-v21": "degC" }, - "_invert_sign": {}, - "_offset_time": {}, - "_transformation": {}, + "_invert_sign": false, + "_offset_time": false, + "_transformation": false, "cell_methods": "time: point", "long_name": "1.5 metre temperature", "standard_name": "air_temperature", diff --git a/miranda/convert/data/espo-g6-e5l_attrs.json b/miranda/convert/data/espo-g6-e5l_attrs.json index e4e76045..71a2c80a 100644 --- a/miranda/convert/data/espo-g6-e5l_attrs.json +++ b/miranda/convert/data/espo-g6-e5l_attrs.json @@ -14,6 +14,7 @@ "domain": "NAM", "mip_era": "CMIP6", "processing_level": "biasadjusted", + "source": "ESPO-G6-E5L", "table_date": "2023-04-24", "table_id": "ESPO-G6-E5L", "type": "simulation", diff --git a/miranda/convert/data/espo-g6-r2_attrs.json 
b/miranda/convert/data/espo-g6-r2_attrs.json index ad57313f..c0e73f03 100644 --- a/miranda/convert/data/espo-g6-r2_attrs.json +++ b/miranda/convert/data/espo-g6-r2_attrs.json @@ -14,6 +14,7 @@ "domain": "NAM", "mip_era": "CMIP6", "processing_level": "biasadjusted", + "source": "ESPO-G6-R2", "table_date": "2023-04-24", "table_id": "ESPO-G6-R2", "type": "simulation", diff --git a/miranda/convert/data/nasa_cf_attrs.json b/miranda/convert/data/nasa_ag_cf_attrs.json similarity index 100% rename from miranda/convert/data/nasa_cf_attrs.json rename to miranda/convert/data/nasa_ag_cf_attrs.json diff --git a/miranda/convert/data/nex-gddp-cmip6_attrs.json b/miranda/convert/data/nex-gddp-cmip6_attrs.json index a58f29de..2e962b6e 100644 --- a/miranda/convert/data/nex-gddp-cmip6_attrs.json +++ b/miranda/convert/data/nex-gddp-cmip6_attrs.json @@ -12,6 +12,7 @@ "domain": "QC", "mip_era": "CMIP6", "processing_level": "biasadjusted", + "source": "NASA-NEX-GDDP", "table_date": "2023-04-11", "table_id": "NEX-GDDP-CMIP6", "type": "simulation" diff --git a/miranda/convert/eccc.py b/miranda/convert/eccc_canswe.py similarity index 95% rename from miranda/convert/eccc.py rename to miranda/convert/eccc_canswe.py index d3d9ac93..9ed1237b 100644 --- a/miranda/convert/eccc.py +++ b/miranda/convert/eccc_canswe.py @@ -7,7 +7,7 @@ import pandas as pd import xarray as xr -from ._data_corrections import dataset_corrections +from .corrections import dataset_corrections __all__ = ["convert_canswe"] @@ -55,7 +55,7 @@ def parse_desc(desc): ds.snd.attrs["ancillary_variables"] = "data_flag_snd qc_flag_snd" ds.snw.attrs["ancillary_variables"] = "data_flag_snw qc_flag_snw" - ds = dataset_corrections(ds, "ec-canswe") + ds = dataset_corrections(ds, "eccc-canswe") ds.attrs["frequency"] = "day" date = "-".join(ds.indexes["time"][[0, -1]].strftime("%Y%m")) for var in ["snd", "snw"]: diff --git a/miranda/eccc/_raw.py b/miranda/convert/eccc_obs.py similarity index 96% rename from miranda/eccc/_raw.py rename to 
miranda/convert/eccc_obs.py index d4dc98ff..30c6c776 100644 --- a/miranda/eccc/_raw.py +++ b/miranda/convert/eccc_obs.py @@ -1,3 +1,4 @@ +"""Specialized conversion tools for Environment and Climate Change Canada / Meteorological Service of Canada data.""" ###################################################################### # S.Biner, Ouranos, mai 2019 # @@ -36,21 +37,22 @@ from xclim.core.units import convert_units_to from miranda.archive import group_by_length +from miranda.convert import load_json_data_mappings from miranda.scripting import LOGGING_CONFIG from miranda.storage import file_size, report_file_size -from miranda.units import GiB, MiB from miranda.utils import generic_extract_archive -from ._utils import cf_station_metadata - config.dictConfig(LOGGING_CONFIG) __all__ = [ - "aggregate_stations", + "merge_stations", "convert_flat_files", "merge_converted_variables", ] +KiB = int(pow(2, 10)) +MiB = int(pow(2, 20)) +GiB = int(pow(2, 30)) TABLE_DATE = dt.now().strftime("%d %B %Y") @@ -86,7 +88,7 @@ def _remove_duplicates(ds): def _convert_station_file( - fichier: Path, + file: Path, output_path: Path, errored_files: list[Path], mode: str, @@ -117,11 +119,11 @@ def _convert_station_file( missing_values = {-9999, "#####"} with tempfile.TemporaryDirectory() as temp_folder: - if fichier.suffix in [".gz", ".tar", ".zip", ".7z"]: - data_files = generic_extract_archive(fichier, output_dir=temp_folder) + if file.suffix in [".gz", ".tar", ".zip", ".7z"]: + data_files = generic_extract_archive(file, output_dir=temp_folder) else: - data_files = [fichier] - logging.info(f"Processing file: {fichier}.") + data_files = [file] + logging.info(f"Processing file: {file}.") size_limit = 1 * GiB @@ -325,7 +327,7 @@ def _convert_station_file( history = ( f"{dt.now().strftime('%Y-%m-%d %X')} converted from flat station file " - f"(`{fichier.name}`) to n-dimensional array." + f"(`{file.name}`) to n-dimensional array." 
) # TODO: This info should eventually be sourced from a JSON definition @@ -432,8 +434,8 @@ def convert_flat_files( for variable_code in variables: variable_code = str(variable_code).zfill(3) - metadata = cf_station_metadata(variable_code) - nc_name = metadata["nc_name"] + metadata = load_json_data_mappings("eccc-obs")[variable_code] + nc_name = metadata["cf_variable_name"] rep_nc = Path(output_folder).joinpath(nc_name) rep_nc.mkdir(parents=True, exist_ok=True) @@ -477,7 +479,7 @@ def convert_flat_files( logging.warning(f"Process completed in {time.time() - func_time:.2f} seconds") -def aggregate_stations( +def merge_stations( source_files: str | os.PathLike | None = None, output_folder: str | os.PathLike | None = None, time_step: str = None, @@ -525,6 +527,7 @@ def aggregate_stations( pass elif isinstance(variables, (str, int)): variables = [variables] + # TODO: have the variable gathered from a JSON file elif variables is None: if mode == "hourly": @@ -542,8 +545,8 @@ def aggregate_stations( raise NotImplementedError() for variable_code in variables: - info = cf_station_metadata(variable_code) - variable_name = info["nc_name"] + info = load_json_data_mappings("eccc-obs")["variables"][variable_code] + variable_name = info["cf_variable_name"] logging.info(f"Merging `{variable_name}` using `{time_step}` time step.") # Only perform aggregation on available data with corresponding metadata @@ -869,18 +872,18 @@ def merge_converted_variables( Parameters ---------- - source_files: str, Path - output_folder: str, Path - variables: str or int or list of str or int, optional - station_metadata: str or Path, optional - overwrite: bool - n_workers: int + source_files : str, Path + output_folder : str, Path + variables : str or int or list of str or int, optional + station_metadata : str or Path, optional + overwrite : bool + n_workers : int Returns ------- None """ - meta = load_station_metadata(station_metadata) + meta = load_json_data_mappings("eccc-obs") metadata_file = 
Path(tempfile.NamedTemporaryFile(suffix=".nc", delete=False).name) meta.to_netcdf(metadata_file) @@ -894,7 +897,7 @@ def merge_converted_variables( if not isinstance(variables, list): variables = [variables] for var in variables: - selected_variables.append(cf_station_metadata(var)) + selected_variables.append(meta[var]) variables_found = [x.name for x in source_files.iterdir() if x.is_dir()] if selected_variables: diff --git a/miranda/convert/eccc_rdrs.py b/miranda/convert/eccc_rdrs.py index fa206746..ceff1654 100644 --- a/miranda/convert/eccc_rdrs.py +++ b/miranda/convert/eccc_rdrs.py @@ -13,8 +13,12 @@ from miranda.units import get_time_frequency from ._aggregation import aggregate -from ._data_corrections import dataset_conversion, load_json_data_mappings -from ._data_definitions import gather_raw_rdrs_by_years, gather_rdrs +from ._data_definitions import ( + gather_eccc_rdrs, + gather_raw_rdrs_by_years, + load_json_data_mappings, +) +from .corrections import dataset_conversion logging.config.dictConfig(LOGGING_CONFIG) @@ -159,7 +163,7 @@ def rdrs_to_daily( working_folder = Path(working_folder).expanduser() # GATHER ALL RDRS FILES - gathered = gather_rdrs(project, input_folder, "zarr", "cf") + gathered = gather_eccc_rdrs(project, input_folder, "zarr", "cf") files = gathered["rdrs-v21"] # noqa if process_variables: for vv in [f for f in files.keys() if f not in process_variables]: diff --git a/miranda/convert/melcc.py b/miranda/convert/melcc.py index e7619848..dba999dc 100644 --- a/miranda/convert/melcc.py +++ b/miranda/convert/melcc.py @@ -21,13 +21,11 @@ from xclim.core.units import convert_units_to, pint_multiply, str2pint from miranda import __version__ +from miranda.convert._data_definitions import load_json_data_mappings +from miranda.convert.corrections import dataset_corrections from miranda.scripting import LOGGING_CONFIG -from ._data_corrections import ( - dataset_corrections, - load_json_data_mappings, - metadata_conversion, -) +from ._treatments 
import metadata_conversion logging.config.dictConfig(LOGGING_CONFIG) logger = logging.getLogger(__name__) diff --git a/miranda/eccc/__init__.py b/miranda/eccc/__init__.py index 3fe8fdd3..781076e0 100644 --- a/miranda/eccc/__init__.py +++ b/miranda/eccc/__init__.py @@ -2,5 +2,4 @@ from __future__ import annotations from ._homogenized import * -from ._raw import * from ._summaries import * diff --git a/miranda/eccc/convert.py b/miranda/eccc/convert.py index cc86cfe2..eb94be48 100644 --- a/miranda/eccc/convert.py +++ b/miranda/eccc/convert.py @@ -10,7 +10,7 @@ from functools import partial from pathlib import Path -from miranda.eccc._raw import _convert_station_file +from miranda.convert.eccc_obs import _convert_station_file from miranda.eccc._utils import cf_station_metadata from miranda.scripting import LOGGING_CONFIG diff --git a/templates/eccc_raw_hourly_conversion.py b/templates/eccc_raw_hourly_conversion.py index 0849e182..fcb47b2f 100644 --- a/templates/eccc_raw_hourly_conversion.py +++ b/templates/eccc_raw_hourly_conversion.py @@ -1,8 +1,8 @@ from os import getenv from pathlib import Path -from miranda.eccc import ( - aggregate_stations, +from miranda.convert.eccc_obs import ( + merge_stations, convert_flat_files, merge_converted_variables, ) @@ -76,7 +76,7 @@ n_workers=n_workers, ) - aggregate_stations( + merge_stations( source_files=merged, output_folder=final, time_step=time_step, diff --git a/templates/emdna_processing.py b/templates/emdna_processing.py index 194526fc..3ee9eb07 100644 --- a/templates/emdna_processing.py +++ b/templates/emdna_processing.py @@ -4,6 +4,7 @@ from dask.diagnostics import ProgressBar +import miranda.convert.corrections from miranda import convert, io, structure @@ -23,7 +24,7 @@ def main(): files_by_member = convert.gather_emdna(path) for member, files in files_by_member.items(): if member == "OI": - ds = convert.dataset_conversion( + ds = miranda.convert.corrections.dataset_conversion( files, project="EMDNA", 
preprocess=preprocess_dna ) diff --git a/templates/era5-land_reanalysis_processing.py b/templates/era5-land_reanalysis_processing.py index c58aa430..3fc27945 100644 --- a/templates/era5-land_reanalysis_processing.py +++ b/templates/era5-land_reanalysis_processing.py @@ -1,5 +1,6 @@ from pathlib import Path +import miranda.convert.corrections from miranda import convert, io @@ -7,7 +8,7 @@ def main(): path_era5_land_out = Path("~/Desktop").expanduser() era5_land_files = convert.gather_ecmwf("era5-land", path_era5_land_out) - ds = convert.dataset_conversion( + ds = miranda.convert.corrections.dataset_conversion( era5_land_files, project="era5-land-monthly-means", ) diff --git a/templates/espo-g6.py b/templates/espo-g6.py index ac43168f..ea9a3e4d 100644 --- a/templates/espo-g6.py +++ b/templates/espo-g6.py @@ -4,6 +4,7 @@ from dask.diagnostics import ProgressBar +import miranda.convert.corrections from miranda import convert, io, structure from miranda.decode import Decoder @@ -42,7 +43,7 @@ def main(): ) if not os.path.exists(new_path): # and path not in skip: # open as dataset - ds = convert.dataset_conversion( + ds = miranda.convert.corrections.dataset_conversion( [f], add_version_hashes=False, project=project, diff --git a/templates/nasa_nex-gddp-cmip6_processing.py b/templates/nasa_nex-gddp-cmip6_processing.py index 78f51687..3fb7572b 100644 --- a/templates/nasa_nex-gddp-cmip6_processing.py +++ b/templates/nasa_nex-gddp-cmip6_processing.py @@ -1,5 +1,6 @@ from pathlib import Path +import miranda.convert.corrections from miranda import convert, io @@ -10,7 +11,7 @@ def main(): for path, list_files in nex_files.items(): # open as dataset - ds = convert.dataset_conversion( + ds = miranda.convert.corrections.dataset_conversion( list_files, add_version_hashes=False, project="NEX-GDDP-CMIP6", From d63523bbf9353a1f6d02ae13f8d8434febd9a127 Mon Sep 17 00:00:00 2001 From: Trevor James Smith <10819524+Zeitsperre@users.noreply.github.com> Date: Wed, 21 Jun 2023 15:36:19 
-0400 Subject: [PATCH 04/33] broken - refactoring of station writer --- miranda/convert/eccc_obs.py | 522 +++++++++++++++++------------------- miranda/eccc/geomet.py | 29 ++ 2 files changed, 281 insertions(+), 270 deletions(-) create mode 100644 miranda/eccc/geomet.py diff --git a/miranda/convert/eccc_obs.py b/miranda/convert/eccc_obs.py index 30c6c776..76f9146b 100644 --- a/miranda/convert/eccc_obs.py +++ b/miranda/convert/eccc_obs.py @@ -27,7 +27,6 @@ from datetime import datetime as dt from logging import config from pathlib import Path -from urllib.error import HTTPError import dask.dataframe as dd import numpy as np @@ -56,26 +55,19 @@ TABLE_DATE = dt.now().strftime("%d %B %Y") -def load_station_metadata(meta: str | os.PathLike) -> xr.Dataset: - if meta: - df_inv = pd.read_csv(meta, header=0) - else: - try: - import geopandas as gpd +def fwf_column_definitions(time_frequency: str): + """Return the column widths for the fixed-width format.""" - station_metadata_url = "https://api.weather.gc.ca/collections/climate-stations/items?f=json&limit=15000000" - df_inv = gpd.read_file(station_metadata_url) - except HTTPError as err: - raise RuntimeError( - f"Station metadata table unable to be fetched. 
Considering downloading directly: {err}" - ) - df_inv["LONGITUDE"] = df_inv.geometry.x - df_inv["LATITUDE"] = df_inv.geometry.y - df_inv["ELEVATION"] = df_inv.ELEVATION.astype(float) - df_inv["CLIMATE_IDENTIFIER"] = df_inv["CLIMATE_IDENTIFIER"].astype(str) + if time_frequency.lower() in ["h", "hour", "hourly"]: + num_observations = 24 + column_widths = [7, 4, 2, 2, 3] + [6, 1] * num_observations + elif time_frequency.lower() in ["d", "day", "daily"]: + num_observations = 31 + column_widths = [7, 4, 2, 3] + [6, 1] * num_observations + else: + raise NotImplementedError("`mode` must be 'h'/'hourly or 'd'/'daily'.") - df_inv = df_inv.drop(["geometry"], axis=1) - return df_inv.to_xarray() + return column_widths def _remove_duplicates(ds): @@ -87,6 +79,241 @@ def _remove_duplicates(ds): return ds.sel(time=~ds.get_index("time").duplicated()) +def convert_station( + data: str | os.PathLike, mode: str, using_dask_array: bool = False, **kwargs +): + column_widths = fwf_column_definitions(mode) + + if using_dask_array: + pandas_reader = dd + chunks = dict(blocksize=200 * MiB) + else: + pandas_reader = pd + chunks = dict() + using_dask_array = False + + # Create a dataframe from the files + try: + df = pandas_reader.read_fwf( + data, + widths=column_widths, + names=column_names, + dtype={ + name: data_type for name, data_type in zip(column_names, column_dtypes) + }, + assume_missing=True, + **chunks, + ) + if using_dask_array: + df = c.persist(df) + + except FileNotFoundError: + logging.error(f"File {data} was not found.") + errored_files.append(data) + return + + except (UnicodeDecodeError, Exception) as e: + logging.error( + f"File {data.name} was unable to be read. 
" + f"This is probably an issue with the file: {e}" + ) + errored_files.append(data) + return + + # Loop through the station codes + station_codes = df["code"].unique() + for code in station_codes: + df_code = df[df["code"] == code] + + # Abort if the variable is not found + if using_dask_array: + has_variable_codes = ( + (df_code["code_var"] == variable_code).compute() + ).any() + else: + has_variable_codes = (df_code["code_var"] == variable_code).any() + if not has_variable_codes: + logging.info( + f"Variable `{nc_name}` not found for station code: {code} in file {data}. Continuing..." + ) + continue + + # Perform the data treatment + logging.info(f"Converting `{nc_name}` for station code: {code}") + + # Dump the data into a DataFrame + df_var = df_code[df_code["code_var"] == variable_code].copy() + + # Mask the data according to the missing values flag + df_var = df_var.replace(missing_values, np.nan) + + # Decode the values and flags + dfd = df_var.loc[:, [f"D{i:0n}" for i in range(1, num_observations + 1)]] + dff = df_var.loc[:, [f"F{i:0n}" for i in range(1, num_observations + 1)]] + + # Remove the "NaN" flag + dff = dff.fillna("") + + # Use the flag to mask the values + try: + val = np.asarray(dfd.values, float) + except ValueError as e: + logging.error(f"{e} raised from {dfd}, continuing...") + continue + try: + flag = np.asarray(dff.values, str) + except ValueError as e: + logging.error(f"{e} raised from {dff}, continuing...") + continue + mask = np.isin(flag, missing_flags) + val[mask] = np.nan + + # Treat according to units conversions + val = val * scale_factor + add_offset + + # Create the DataArray + date_summations = dict(time=list()) + if mode == "hourly": + for index, row in df_var.iterrows(): + period = pd.Period( + year=row.year, month=row.month, day=row.day, freq="D" + ) + dates = pd.Series( + pd.date_range( + start=period.start_time, + end=period.end_time, + freq="H", + ) + ) + date_summations["time"].extend(dates) + written_values = 
val.flatten() + written_flags = flag.flatten() + elif mode == "daily": + value_days = list() + flag_days = list() + for i, (index, row) in enumerate(df_var.iterrows()): + period = pd.Period(year=row.year, month=row.month, freq="M") + dates = pd.Series( + pd.date_range( + start=period.start_time, + end=period.end_time, + freq="D", + ) + ) + date_summations["time"].extend(dates) + + value_days.extend( + val[i][range(monthrange(int(row.year), int(row.month))[1])] + ) + flag_days.extend( + flag[i][range(monthrange(int(row.year), int(row.month))[1])] + ) + written_values = value_days + written_flags = flag_days + + ds = xr.Dataset() + da_val = xr.DataArray(written_values, coords=date_summations, dims=["time"]) + + if raw_units != units: + da_val.attrs["units"] = raw_units + da_val = convert_units_to(da_val, units) + else: + da_val.attrs["units"] = units + + da_val = da_val.rename(nc_name) + variable_attributes = dict( + variable_code=variable_code, + standard_name=standard_name, + long_name=long_name, + ) + if "original_units" in kwargs: + variable_attributes["original_units"] = kwargs["original_units"] + da_val.attrs.update(variable_attributes) + + da_flag = xr.DataArray(written_flags, coords=date_summations, dims=["time"]) + da_flag = da_flag.rename("flag") + flag_attributes = dict( + long_name="data flag", + note="See ECCC technical documentation for details", + ) + da_flag.attrs.update(flag_attributes) + + ds[nc_name] = da_val + ds["flag"] = da_flag + + # save the file in NetCDF format + start_year = ds.time.dt.year.values[0] + end_year = ds.time.dt.year.values[-1] + + station_folder = output_path.joinpath(str(code)) + station_folder.mkdir(parents=True, exist_ok=True) + + f_nc = ( + f"{code}_{variable_code}_{nc_name}_" + f"{start_year if start_year == end_year else '_'.join([str(start_year), str(end_year)])}.nc" + ) + + if station_folder.joinpath(f_nc).exists(): + logging.warning(f"File `{f_nc}` already exists. 
Continuing...") + + history = ( + f"{dt.now().strftime('%Y-%m-%d %X')} converted from flat station file " + f"(`{file.name}`) to n-dimensional array." + ) + + # TODO: This info should eventually be sourced from a JSON definition + global_attrs = dict( + Conventions="CF-1.8", + comment="Acquired on demand from data specialists at " + "ECCC Climate Services / Services Climatiques.", + contact="John Richard", + contact_email="climatcentre-climatecentral@ec.gc.ca", + domain="CAN", + ) + if mode == "hourly": + global_attrs.update(dict(frequency="1hr")) + elif mode == "daily": + global_attrs.update(dict(frequency="day")) + global_attrs.update( + dict( + history=history, + internal_comment=f"Converted by {os.environ.get('USER', os.environ.get('USERNAME'))}.", + institution="ECCC", + license="https://climate.weather.gc.ca/prods_servs/attachment1_e.html", + member=code, + processing_level="raw", + redistribution="Redistribution permitted.", + references="https://climate.weather.gc.ca/doc/Technical_Documentation.pdf", + source="historical-station-records", + table_date=TABLE_DATE, + title="Environment and Climate Change Canada (ECCC) weather station observations", + type="station-obs", + usage="The original data is owned by the Government of Canada (Environment and Climate " + "Change Canada), and falls under the licence agreement for use of Environment and " + "Climate Change Canada data", + variable=str(nc_name), + version=f"v{dt.now().strftime('%Y.%m.%V')}", # Year.Month.Week + ) + ) + ds.attrs.update(global_attrs) + + logging.info(f"Exporting to: {station_folder.joinpath(f_nc)}") + ds.to_netcdf(station_folder.joinpath(f_nc)) + del ds + del val + del mask + del flag + del da_val + del da_flag + del dfd + del dff + del written_values + del written_flags + del date_summations + + del df + + def _convert_station_file( file: Path, output_path: Path, @@ -104,16 +331,9 @@ def _convert_station_file( scale_factor: float, standard_name: str, variable_code: str, - **kwargs, + 
**dask_kwargs, ): - if mode.lower() in ["h", "hour", "hourly"]: - num_observations = 24 - column_widths = [7, 4, 2, 2, 3] + [6, 1] * num_observations - elif mode.lower() in ["d", "day", "daily"]: - num_observations = 31 - column_widths = [7, 4, 2, 3] + [6, 1] * num_observations - else: - raise NotImplementedError("`mode` must be 'h'/'hourly or 'd'/'daily'.") + column_widths = fwf_column_definitions(mode) if not missing_values: missing_values = {-9999, "#####"} @@ -132,255 +352,17 @@ def _convert_station_file( logging.info( f"File exceeds {report_file_size(size_limit)} - Using dask.dataframes." ) - pandas_reader = dd - using_dask_array = True - chunks = dict(blocksize=200 * MiB) client = ProgressBar + using_dask = True else: logging.info( f"File below {report_file_size(size_limit)} - Using pandas.dataframes." ) - pandas_reader = pd - chunks = dict() - using_dask_array = False client = contextlib.nullcontext + using_dask = False - with client() as c: - # Create a dataframe from the files - try: - df = pandas_reader.read_fwf( - data, - widths=column_widths, - names=column_names, - dtype={ - name: data_type - for name, data_type in zip(column_names, column_dtypes) - }, - assume_missing=True, - **chunks, - ) - if using_dask_array: - df = c.persist(df) - - except FileNotFoundError: - logging.error(f"File {data} was not found.") - errored_files.append(data) - return - - except (UnicodeDecodeError, Exception) as e: - logging.error( - f"File {data.name} was unable to be read. 
" - f"This is probably an issue with the file: {e}" - ) - errored_files.append(data) - return - - # Loop through the station codes - station_codes = df["code"].unique() - for code in station_codes: - df_code = df[df["code"] == code] - - # Abort if the variable is not found - if using_dask_array: - has_variable_codes = ( - (df_code["code_var"] == variable_code).compute() - ).any() - else: - has_variable_codes = ( - df_code["code_var"] == variable_code - ).any() - if not has_variable_codes: - logging.info( - f"Variable `{nc_name}` not found for station code: {code} in file {data}. Continuing..." - ) - continue - - # Perform the data treatment - logging.info(f"Converting `{nc_name}` for station code: {code}") - - # Dump the data into a DataFrame - df_var = df_code[df_code["code_var"] == variable_code].copy() - - # Mask the data according to the missing values flag - df_var = df_var.replace(missing_values, np.nan) - - # Decode the values and flags - dfd = df_var.loc[ - :, [f"D{i:0n}" for i in range(1, num_observations + 1)] - ] - dff = df_var.loc[ - :, [f"F{i:0n}" for i in range(1, num_observations + 1)] - ] - - # Remove the "NaN" flag - dff = dff.fillna("") - - # Use the flag to mask the values - try: - val = np.asarray(dfd.values, float) - except ValueError as e: - logging.error(f"{e} raised from {dfd}, continuing...") - continue - try: - flag = np.asarray(dff.values, str) - except ValueError as e: - logging.error(f"{e} raised from {dff}, continuing...") - continue - mask = np.isin(flag, missing_flags) - val[mask] = np.nan - - # Treat according to units conversions - val = val * scale_factor + add_offset - - # Create the DataArray - date_summations = dict(time=list()) - if mode == "hourly": - for index, row in df_var.iterrows(): - period = pd.Period( - year=row.year, month=row.month, day=row.day, freq="D" - ) - dates = pd.Series( - pd.date_range( - start=period.start_time, - end=period.end_time, - freq="H", - ) - ) - date_summations["time"].extend(dates) - 
written_values = val.flatten() - written_flags = flag.flatten() - elif mode == "daily": - value_days = list() - flag_days = list() - for i, (index, row) in enumerate(df_var.iterrows()): - period = pd.Period(year=row.year, month=row.month, freq="M") - dates = pd.Series( - pd.date_range( - start=period.start_time, - end=period.end_time, - freq="D", - ) - ) - date_summations["time"].extend(dates) - - value_days.extend( - val[i][ - range(monthrange(int(row.year), int(row.month))[1]) - ] - ) - flag_days.extend( - flag[i][ - range(monthrange(int(row.year), int(row.month))[1]) - ] - ) - written_values = value_days - written_flags = flag_days - - ds = xr.Dataset() - da_val = xr.DataArray( - written_values, coords=date_summations, dims=["time"] - ) - - if raw_units != units: - da_val.attrs["units"] = raw_units - da_val = convert_units_to(da_val, units) - else: - da_val.attrs["units"] = units - - da_val = da_val.rename(nc_name) - variable_attributes = dict( - variable_code=variable_code, - standard_name=standard_name, - long_name=long_name, - ) - if "original_units" in kwargs: - variable_attributes["original_units"] = kwargs["original_units"] - da_val.attrs.update(variable_attributes) - - da_flag = xr.DataArray( - written_flags, coords=date_summations, dims=["time"] - ) - da_flag = da_flag.rename("flag") - flag_attributes = dict( - long_name="data flag", - note="See ECCC technical documentation for details", - ) - da_flag.attrs.update(flag_attributes) - - ds[nc_name] = da_val - ds["flag"] = da_flag - - # save the file in NetCDF format - start_year = ds.time.dt.year.values[0] - end_year = ds.time.dt.year.values[-1] - - station_folder = output_path.joinpath(str(code)) - station_folder.mkdir(parents=True, exist_ok=True) - - f_nc = ( - f"{code}_{variable_code}_{nc_name}_" - f"{start_year if start_year == end_year else '_'.join([str(start_year), str(end_year)])}.nc" - ) - - if station_folder.joinpath(f_nc).exists(): - logging.warning(f"File `{f_nc}` already exists. 
Continuing...") - - history = ( - f"{dt.now().strftime('%Y-%m-%d %X')} converted from flat station file " - f"(`{file.name}`) to n-dimensional array." - ) - - # TODO: This info should eventually be sourced from a JSON definition - global_attrs = dict( - Conventions="CF-1.8", - comment="Acquired on demand from data specialists at " - "ECCC Climate Services / Services Climatiques.", - contact="John Richard", - contact_email="climatcentre-climatecentral@ec.gc.ca", - domain="CAN", - ) - if mode == "hourly": - global_attrs.update(dict(frequency="1hr")) - elif mode == "daily": - global_attrs.update(dict(frequency="day")) - global_attrs.update( - dict( - history=history, - internal_comment=f"Converted by {os.environ.get('USER', os.environ.get('USERNAME'))}.", - institution="ECCC", - license="https://climate.weather.gc.ca/prods_servs/attachment1_e.html", - member=code, - processing_level="raw", - redistribution="Redistribution permitted.", - references="https://climate.weather.gc.ca/doc/Technical_Documentation.pdf", - source="historical-station-records", - table_date=TABLE_DATE, - title="Environment and Climate Change Canada (ECCC) weather station observations", - type="station-obs", - usage="The original data is owned by the Government of Canada (Environment and Climate " - "Change Canada), and falls under the licence agreement for use of Environment and " - "Climate Change Canada data", - variable=str(nc_name), - version=f"v{dt.now().strftime('%Y.%m.%V')}", # Year.Month.Week - ) - ) - ds.attrs.update(global_attrs) - - logging.info(f"Exporting to: {station_folder.joinpath(f_nc)}") - ds.to_netcdf(station_folder.joinpath(f_nc)) - del ds - del val - del mask - del flag - del da_val - del da_flag - del dfd - del dff - del written_values - del written_flags - del date_summations - - del df + with client(**dask_kwargs) as c: + convert_station(data, mode, using_dask=using_dask) if os.listdir(temp_folder): for temporary_file in Path(temp_folder).glob("*"): diff --git 
a/miranda/eccc/geomet.py b/miranda/eccc/geomet.py new file mode 100644 index 00000000..3fb00d72 --- /dev/null +++ b/miranda/eccc/geomet.py @@ -0,0 +1,29 @@ +from __future__ import annotations + +import os +from urllib.error import HTTPError + +import pandas as pd +import xarray as xr + + +def load_station_metadata(meta: str | os.PathLike) -> xr.Dataset: + if meta: + df_inv = pd.read_csv(meta, header=0) + else: + try: + import geopandas as gpd + + station_metadata_url = "https://api.weather.gc.ca/collections/climate-stations/items?f=json&limit=15000000" + df_inv = gpd.read_file(station_metadata_url) + except HTTPError as err: + raise RuntimeError( + f"Station metadata table unable to be fetched. Considering downloading directly: {err}" + ) + df_inv["LONGITUDE"] = df_inv.geometry.x + df_inv["LATITUDE"] = df_inv.geometry.y + df_inv["ELEVATION"] = df_inv.ELEVATION.astype(float) + df_inv["CLIMATE_IDENTIFIER"] = df_inv["CLIMATE_IDENTIFIER"].astype(str) + + df_inv = df_inv.drop(["geometry"], axis=1) + return df_inv.to_xarray() From 500ae2ad5cf751b17a5374fee8ec68a59f9b224f Mon Sep 17 00:00:00 2001 From: Trevor James Smith <10819524+Zeitsperre@users.noreply.github.com> Date: Thu, 6 Jul 2023 14:53:53 -0400 Subject: [PATCH 05/33] broken - more refactoring --- miranda/convert/eccc_obs.py | 57 ++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 33 deletions(-) diff --git a/miranda/convert/eccc_obs.py b/miranda/convert/eccc_obs.py index 76f9146b..32e2c3e1 100644 --- a/miranda/convert/eccc_obs.py +++ b/miranda/convert/eccc_obs.py @@ -27,6 +27,7 @@ from datetime import datetime as dt from logging import config from pathlib import Path +from typing import List, Tuple, Union, Type, Any import dask.dataframe as dd import numpy as np @@ -55,19 +56,23 @@ TABLE_DATE = dt.now().strftime("%d %B %Y") -def fwf_column_definitions(time_frequency: str): - """Return the column widths for the fixed-width format.""" +def _fwf_column_definitions(time_frequency: str) -> 
Tuple[List[str], List[int], List[Type[Union[str, int]]]]: + """Return the column names, widths, and data types for the fixed-width format.""" if time_frequency.lower() in ["h", "hour", "hourly"]: num_observations = 24 + column_names = ["code", "year", "month", "day", "code_var"] column_widths = [7, 4, 2, 2, 3] + [6, 1] * num_observations + column_dtypes = [str, int, int, int, str] elif time_frequency.lower() in ["d", "day", "daily"]: num_observations = 31 + column_names = ["code", "year", "month", "code_var"] column_widths = [7, 4, 2, 3] + [6, 1] * num_observations + column_dtypes = [str, int, int, str] else: raise NotImplementedError("`mode` must be 'h'/'hourly or 'd'/'daily'.") - return column_widths + return column_names, column_widths, column_dtypes def _remove_duplicates(ds): @@ -80,9 +85,10 @@ def _remove_duplicates(ds): def convert_station( - data: str | os.PathLike, mode: str, using_dask_array: bool = False, **kwargs + data: str | os.PathLike, mode: str, using_dask_array: bool = False, *, client: Any, **kwargs ): - column_widths = fwf_column_definitions(mode) + data = Path(data) + column_names, column_widths, column_dtypes = _fwf_column_definitions(mode) if using_dask_array: pandas_reader = dd @@ -105,20 +111,17 @@ def convert_station( **chunks, ) if using_dask_array: - df = c.persist(df) + df = client.persist(df) - except FileNotFoundError: - logging.error(f"File {data} was not found.") - errored_files.append(data) - return + except FileNotFoundError as e: + msg = f"File {data} was not found: {e}" + logging.error(msg) + raise FileNotFoundError(msg) - except (UnicodeDecodeError, Exception) as e: - logging.error( - f"File {data.name} was unable to be read. " - f"This is probably an issue with the file: {e}" - ) - errored_files.append(data) - return + except UnicodeDecodeError as e: + msg = f"File {data.name} was unable to be read. 
This is probably an issue with the file: {e}" + logging.error(msg) + raise UnicodeDecodeError(msg) # Loop through the station codes station_codes = df["code"].unique() @@ -320,8 +323,6 @@ def _convert_station_file( errored_files: list[Path], mode: str, add_offset: float, - column_dtypes: list[str], - column_names: list[str], long_name: str, missing_flags: set[str], missing_values: set[str], @@ -333,8 +334,6 @@ def _convert_station_file( variable_code: str, **dask_kwargs, ): - column_widths = fwf_column_definitions(mode) - if not missing_values: missing_values = {-9999, "#####"} @@ -362,7 +361,10 @@ def _convert_station_file( using_dask = False with client(**dask_kwargs) as c: - convert_station(data, mode, using_dask=using_dask) + try: + convert_station(data, mode, using_dask=using_dask) + except FileNotFoundError: + errored_files.append(data) if os.listdir(temp_folder): for temporary_file in Path(temp_folder).glob("*"): @@ -393,17 +395,6 @@ def convert_flat_files( """ func_time = time.time() - if mode.lower() in ["h", "hour", "hourly"]: - num_observations = 24 - column_names = ["code", "year", "month", "day", "code_var"] - column_dtypes = [str, float, float, float, str] - elif mode.lower() in ["d", "day", "daily"]: - num_observations = 31 - column_names = ["code", "year", "month", "code_var"] - column_dtypes = [str, float, float, str] - else: - raise NotImplementedError("`mode` must be 'h'/'hourly or 'd'/'daily'.") - # Preparing the data column headers for i in range(1, num_observations + 1): data_entry, flag_entry = f"D{i:0n}", f"F{i:0n}" From c781287840afd3d8cc6d055fc1af95fec93fd751 Mon Sep 17 00:00:00 2001 From: Trevor James Smith <10819524+Zeitsperre@users.noreply.github.com> Date: Thu, 6 Jul 2023 17:56:45 -0400 Subject: [PATCH 06/33] broken - more refactoring --- miranda/convert/eccc_obs.py | 41 ++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/miranda/convert/eccc_obs.py b/miranda/convert/eccc_obs.py index 
32e2c3e1..fd827d2f 100644 --- a/miranda/convert/eccc_obs.py +++ b/miranda/convert/eccc_obs.py @@ -27,7 +27,7 @@ from datetime import datetime as dt from logging import config from pathlib import Path -from typing import List, Tuple, Union, Type, Any +from typing import Any, List import dask.dataframe as dd import numpy as np @@ -56,22 +56,33 @@ TABLE_DATE = dt.now().strftime("%d %B %Y") -def _fwf_column_definitions(time_frequency: str) -> Tuple[List[str], List[int], List[Type[Union[str, int]]]]: +def _fwf_column_definitions( + time_frequency: str, +) -> tuple[list[str], list[int], list[type[str | int]]]: """Return the column names, widths, and data types for the fixed-width format.""" + # Preparing the column headers if time_frequency.lower() in ["h", "hour", "hourly"]: num_observations = 24 column_names = ["code", "year", "month", "day", "code_var"] - column_widths = [7, 4, 2, 2, 3] + [6, 1] * num_observations + column_widths = [7, 4, 2, 2, 3] column_dtypes = [str, int, int, int, str] elif time_frequency.lower() in ["d", "day", "daily"]: num_observations = 31 column_names = ["code", "year", "month", "code_var"] - column_widths = [7, 4, 2, 3] + [6, 1] * num_observations + column_widths = [7, 4, 2, 3] column_dtypes = [str, int, int, str] else: raise NotImplementedError("`mode` must be 'h'/'hourly or 'd'/'daily'.") + # Add the data columns + for i in range(1, num_observations + 1): + data_entry, flag_entry = f"D{i:0n}", f"F{i:0n}" + column_names.append(data_entry) + column_names.append(flag_entry) + column_widths.extend([6, 1] * num_observations) + column_dtypes.extend([str, str]) + return column_names, column_widths, column_dtypes @@ -85,7 +96,12 @@ def _remove_duplicates(ds): def convert_station( - data: str | os.PathLike, mode: str, using_dask_array: bool = False, *, client: Any, **kwargs + data: str | os.PathLike, + mode: str, + using_dask_array: bool = False, + *, + client: Any, + **kwargs, ): data = Path(data) column_names, column_widths, column_dtypes = 
_fwf_column_definitions(mode) @@ -362,7 +378,7 @@ def _convert_station_file( with client(**dask_kwargs) as c: try: - convert_station(data, mode, using_dask=using_dask) + convert_station(data, mode, using_dask=using_dask, client=c) except FileNotFoundError: errored_files.append(data) @@ -393,15 +409,6 @@ def convert_flat_files( ------- None """ - func_time = time.time() - - # Preparing the data column headers - for i in range(1, num_observations + 1): - data_entry, flag_entry = f"D{i:0n}", f"F{i:0n}" - column_names.append(data_entry) - column_names.append(flag_entry) - column_dtypes.extend([str, str]) - if isinstance(variables, (str, int)): variables = [variables] @@ -435,8 +442,6 @@ def convert_flat_files( errored_files=errored_files, mode=mode, variable_code=variable_code, - column_names=column_names, - column_dtypes=column_dtypes, **metadata, ) with mp.Pool(processes=n_workers) as pool: @@ -449,8 +454,6 @@ def convert_flat_files( "Some files failed to be properly parsed:\n", ", ".join(errored_files) ) - logging.warning(f"Process completed in {time.time() - func_time:.2f} seconds") - def merge_stations( source_files: str | os.PathLike | None = None, From ce2f6d92985e1c0dd244d7c98a10dc0caffb995e Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Wed, 2 Aug 2023 16:58:48 -0400 Subject: [PATCH 07/33] significant refactoring - WIP * moved many eccc conversion functions to new preprocessing module * created new vocabularies module for CV support of multiple projects * json mappings are now configs * CF conversion will be handled by convert only - WIP --- miranda/__init__.py | 3 +- miranda/convert/__init__.py | 2 +- miranda/convert/_data_definitions.py | 60 +++--- .../{data => configs}/cmip_ouranos_attrs.json | 0 .../{data => configs}/deh_cf_attrs.json | 0 .../eccc-canswe_cf_attrs.json | 0 .../eccc-homogenized_cf_attrs.json} | 1 + .../eccc-obs_cf_attrs.json} | 1 - .../eccc-rdrs_cf_attrs.json} | 0 .../{data => 
configs}/ecmwf_cf_attrs.json | 0 .../{data => configs}/emdna_cf_attrs.json | 0 .../{data => configs}/espo-g6-e5l_attrs.json | 0 .../{data => configs}/espo-g6-r2_attrs.json | 0 .../{data => configs}/ets_grnch_cf_attrs.json | 0 .../{data => configs}/hq_cf_attrs.json | 0 .../{data => configs}/melcc_cf_attrs.json | 0 .../{data => configs}/nasa_ag_cf_attrs.json | 0 .../nex-gddp-cmip6_attrs.json | 0 .../{data => configs}/rvt_raven_attrs.json | 0 .../{data => configs}/usask_cf_attrs.json | 0 miranda/convert/deh.py | 5 +- miranda/convert/hq.py | 6 +- miranda/decode/_decoder.py | 4 +- miranda/eccc/__init__.py | 3 - miranda/eccc/_utils.py | 166 +---------------- miranda/eccc/eccc_homogenized_cf_attrs.json | 111 ----------- miranda/eccc/eccc_obs_summary_cf_attrs.json | 173 ------------------ miranda/eccc/geomet.py | 4 +- miranda/io/_rechunk.py | 2 +- miranda/io/utils.py | 2 +- miranda/preprocess/__init__.py | 1 + miranda/preprocess/_data_definitions.py | 37 ++++ .../_eccc_homogenized.py} | 115 +++++++++++- .../eccc_obs.py => preprocess/_eccc_obs.py} | 8 +- .../_eccc_summaries.py} | 2 +- miranda/preprocess/_treatments.py | 43 +++++ .../configs/eccc-homogenized_attrs.json | 36 ++++ .../configs/eccc-obs-summary_cf_attrs.json} | 0 .../preprocess/configs/eccc-obs_attrs.json | 29 +++ .../{eccc/convert.py => preprocess/eccc.py} | 13 +- .../ecmwf.py => preprocess/ecmwf_tigge.py} | 0 miranda/structure/_structure.py | 6 +- miranda/validators.py | 4 +- miranda/vocabularies/__init__.py | 1 + miranda/vocabularies/eccc.py | 88 +++++++++ miranda/{cv.py => vocabularies/esgf.py} | 2 +- pyproject.toml | 6 +- templates/eccc_raw_hourly_conversion.py | 2 +- templates/eccc_rdrs_processing.py | 2 +- 49 files changed, 425 insertions(+), 513 deletions(-) rename miranda/convert/{data => configs}/cmip_ouranos_attrs.json (100%) rename miranda/convert/{data => configs}/deh_cf_attrs.json (100%) rename miranda/convert/{data => configs}/eccc-canswe_cf_attrs.json (100%) rename 
miranda/convert/{data/eccc_homogenized_cf_attrs.json => configs/eccc-homogenized_cf_attrs.json} (99%) rename miranda/convert/{data/eccc_obs_cf_attrs.json => configs/eccc-obs_cf_attrs.json} (99%) rename miranda/convert/{data/eccc_rdrs_cf_attrs.json => configs/eccc-rdrs_cf_attrs.json} (100%) rename miranda/convert/{data => configs}/ecmwf_cf_attrs.json (100%) rename miranda/convert/{data => configs}/emdna_cf_attrs.json (100%) rename miranda/convert/{data => configs}/espo-g6-e5l_attrs.json (100%) rename miranda/convert/{data => configs}/espo-g6-r2_attrs.json (100%) rename miranda/convert/{data => configs}/ets_grnch_cf_attrs.json (100%) rename miranda/convert/{data => configs}/hq_cf_attrs.json (100%) rename miranda/convert/{data => configs}/melcc_cf_attrs.json (100%) rename miranda/convert/{data => configs}/nasa_ag_cf_attrs.json (100%) rename miranda/convert/{data => configs}/nex-gddp-cmip6_attrs.json (100%) rename miranda/convert/{data => configs}/rvt_raven_attrs.json (100%) rename miranda/convert/{data => configs}/usask_cf_attrs.json (100%) delete mode 100644 miranda/eccc/eccc_homogenized_cf_attrs.json delete mode 100644 miranda/eccc/eccc_obs_summary_cf_attrs.json create mode 100644 miranda/preprocess/__init__.py create mode 100644 miranda/preprocess/_data_definitions.py rename miranda/{eccc/_homogenized.py => preprocess/_eccc_homogenized.py} (74%) rename miranda/{convert/eccc_obs.py => preprocess/_eccc_obs.py} (99%) rename miranda/{eccc/_summaries.py => preprocess/_eccc_summaries.py} (99%) create mode 100644 miranda/preprocess/_treatments.py create mode 100644 miranda/preprocess/configs/eccc-homogenized_attrs.json rename miranda/{eccc/data/eccc_obs_summary_cf_attrs.json => preprocess/configs/eccc-obs-summary_cf_attrs.json} (100%) create mode 100644 miranda/preprocess/configs/eccc-obs_attrs.json rename miranda/{eccc/convert.py => preprocess/eccc.py} (90%) rename miranda/{convert/ecmwf.py => preprocess/ecmwf_tigge.py} (100%) create mode 100644 
miranda/vocabularies/__init__.py create mode 100644 miranda/vocabularies/eccc.py rename miranda/{cv.py => vocabularies/esgf.py} (99%) diff --git a/miranda/__init__.py b/miranda/__init__.py index 8439bcd3..6c9577f1 100644 --- a/miranda/__init__.py +++ b/miranda/__init__.py @@ -23,14 +23,15 @@ from . import ( archive, convert, - cv, decode, io, + preprocess, scripting, structure, units, utils, validators, + vocabularies, ) from .data import DataBase from .storage import FileMeta, StorageState diff --git a/miranda/convert/__init__.py b/miranda/convert/__init__.py index 179bd597..bfc31224 100644 --- a/miranda/convert/__init__.py +++ b/miranda/convert/__init__.py @@ -1,7 +1,7 @@ """Data Conversion module.""" from __future__ import annotations -from . import deh, eccc_canswe, ecmwf, hq, melcc, utils +from . import deh, hq, melcc, utils from ._aggregation import * from ._data_definitions import * from ._treatments import * diff --git a/miranda/convert/_data_definitions.py b/miranda/convert/_data_definitions.py index 4ed4741b..1b6dbc8a 100644 --- a/miranda/convert/_data_definitions.py +++ b/miranda/convert/_data_definitions.py @@ -35,7 +35,7 @@ "xarray_frequencies_to_cmip6like", ] -_data_folder = Path(__file__).parent / "data" +_config_folder = Path(__file__).resolve().parent / "configs" def load_json_data_mappings(project: str) -> dict[str, Any]: @@ -49,38 +49,50 @@ def load_json_data_mappings(project: str) -> dict[str, Any]: ------- dict[str, Any] """ - data_folder = Path(__file__).resolve().parent / "data" - if project.startswith("era5"): - metadata_definition = json.load(open(data_folder / "ecmwf_cf_attrs.json")) + metadata_definition = json.load(open(_config_folder / "ecmwf_cf_attrs.json")) elif project in ["rdrs-v21"]: - metadata_definition = json.load(open(data_folder / "eccc_rdrs_cf_attrs.json")) + metadata_definition = json.load( + open(_config_folder / "eccc-rdrs_cf_attrs.json") + ) elif project == "eccc-obs": - metadata_definition = json.load(open(data_folder / 
"eccc_obs_cf_attrs.json")) + metadata_definition = json.load(open(_config_folder / "eccc-obs_cf_attrs.json")) elif project in ["agcfsr", "agmerra2"]: - metadata_definition = json.load(open(data_folder / "nasa_ag_cf_attrs.json")) + metadata_definition = json.load(open(_config_folder / "nasa_ag_cf_attrs.json")) elif project in ["cordex", "cmip5", "cmip6"]: - metadata_definition = json.load(open(data_folder / "cmip_ouranos_attrs.json")) + metadata_definition = json.load( + open(_config_folder / "cmip_ouranos_attrs.json") + ) elif project == "ets-grnch": - metadata_definition = json.load(open(data_folder / "ets_grnch_cf_attrs.json")) + metadata_definition = json.load( + open(_config_folder / "ets_grnch_cf_attrs.json") + ) elif project == "nrcan-gridded-10km": raise NotImplementedError() elif project == "wfdei-gem-capa": - metadata_definition = json.load(open(data_folder / "usask_cf_attrs.json")) - elif project.startswith("melcc"): - metadata_definition = json.load(open(data_folder / "melcc_cf_attrs.json")) - elif project.startswith("ec"): - metadata_definition = json.load(open(data_folder / "eccc-canswe_cf_attrs.json")) + metadata_definition = json.load(open(_config_folder / "usask_cf_attrs.json")) + elif project == "melcc": + metadata_definition = json.load(open(_config_folder / "melcc_cf_attrs.json")) + elif project == "eccc-canswe": + metadata_definition = json.load( + open(_config_folder / "eccc-canswe_cf_attrs.json") + ) + elif project == "eccc-homogenized": + metadata_definition = json.load( + open(_config_folder / "eccc-homogenized_cf_attrs.json") + ) elif project in ["NEX-GDDP-CMIP6"]: - metadata_definition = json.load(open(data_folder / "nex-gddp-cmip6_attrs.json")) + metadata_definition = json.load( + open(_config_folder / "nex-gddp-cmip6_attrs.json") + ) elif project in ["ESPO-G6-R2"]: - metadata_definition = json.load(open(data_folder / "espo-g6-r2_attrs.json")) + metadata_definition = json.load(open(_config_folder / "espo-g6-r2_attrs.json")) elif project 
in ["ESPO-G6-E5L"]: - metadata_definition = json.load(open(data_folder / "espo-g6-e5l_attrs.json")) + metadata_definition = json.load(open(_config_folder / "espo-g6-e5l_attrs.json")) elif project in ["EMDNA"]: - metadata_definition = json.load(open(data_folder / "emdna_cf_attrs.json")) + metadata_definition = json.load(open(_config_folder / "emdna_cf_attrs.json")) else: - raise NotImplementedError() + raise NotImplementedError(f"Project not supported: {project}") return metadata_definition @@ -88,27 +100,27 @@ def load_json_data_mappings(project: str) -> dict[str, Any]: eccc_rdrs_variables = dict() eccc_rdrs_variables["raw"] = [ v - for v in json.load(open(_data_folder / "eccc_rdrs_cf_attrs.json"))[ + for v in json.load(open(_config_folder / "eccc-rdrs_cf_attrs.json"))[ "variables" ].keys() ] eccc_rdrs_variables["cf"] = [ attrs["_cf_variable_name"] - for attrs in json.load(open(_data_folder / "eccc_rdrs_cf_attrs.json"))[ + for attrs in json.load(open(_config_folder / "eccc-rdrs_cf_attrs.json"))[ "variables" ].values() ] -era5_variables = json.load(open(_data_folder / "ecmwf_cf_attrs.json"))[ +era5_variables = json.load(open(_config_folder / "ecmwf_cf_attrs.json"))[ "variables" ].keys() grnch_variables = ["T", "Tmin", "Tmax", "P"] nrcan_variables = ["tasmin", "tasmax", "pr"] -nasa_ag_variables = json.load(open(_data_folder / "nasa_ag_cf_attrs.json"))[ +nasa_ag_variables = json.load(open(_config_folder / "nasa_ag_cf_attrs.json"))[ "variables" ].keys() sc_earth_variables = ["prcp", "tdew", "tmean", "trange", "wind"] -wfdei_gem_capa_variables = json.load(open(_data_folder / "usask_cf_attrs.json"))[ +wfdei_gem_capa_variables = json.load(open(_config_folder / "usask_cf_attrs.json"))[ "variables" ].keys() diff --git a/miranda/convert/data/cmip_ouranos_attrs.json b/miranda/convert/configs/cmip_ouranos_attrs.json similarity index 100% rename from miranda/convert/data/cmip_ouranos_attrs.json rename to miranda/convert/configs/cmip_ouranos_attrs.json diff --git 
a/miranda/convert/data/deh_cf_attrs.json b/miranda/convert/configs/deh_cf_attrs.json similarity index 100% rename from miranda/convert/data/deh_cf_attrs.json rename to miranda/convert/configs/deh_cf_attrs.json diff --git a/miranda/convert/data/eccc-canswe_cf_attrs.json b/miranda/convert/configs/eccc-canswe_cf_attrs.json similarity index 100% rename from miranda/convert/data/eccc-canswe_cf_attrs.json rename to miranda/convert/configs/eccc-canswe_cf_attrs.json diff --git a/miranda/convert/data/eccc_homogenized_cf_attrs.json b/miranda/convert/configs/eccc-homogenized_cf_attrs.json similarity index 99% rename from miranda/convert/data/eccc_homogenized_cf_attrs.json rename to miranda/convert/configs/eccc-homogenized_cf_attrs.json index 9eac354f..5c777230 100644 --- a/miranda/convert/data/eccc_homogenized_cf_attrs.json +++ b/miranda/convert/configs/eccc-homogenized_cf_attrs.json @@ -6,6 +6,7 @@ "gen3": "Vincent, L.A., M.M. Hartwell and X.L. Wang, 2020: A Third Generation of Homogenized Temperature for Trend Analysis and Monitoring Changes in Canada’s Climate. Atmosphere-Ocean. 
https://doi.org/10.1080/07055900.2020.1765728" }, "_frequency": true, + "_generation": true, "_miranda_version": true, "_missing_values": [ "-999", diff --git a/miranda/convert/data/eccc_obs_cf_attrs.json b/miranda/convert/configs/eccc-obs_cf_attrs.json similarity index 99% rename from miranda/convert/data/eccc_obs_cf_attrs.json rename to miranda/convert/configs/eccc-obs_cf_attrs.json index 68b30487..c504965e 100644 --- a/miranda/convert/data/eccc_obs_cf_attrs.json +++ b/miranda/convert/configs/eccc-obs_cf_attrs.json @@ -17,7 +17,6 @@ "license_type": "permissive", "organization": "ECCC", "processing_level": "raw", - "realm": "atmos", "source": "ECCC-OBS", "table_date": "2023-03-23", "type": "station-obs" diff --git a/miranda/convert/data/eccc_rdrs_cf_attrs.json b/miranda/convert/configs/eccc-rdrs_cf_attrs.json similarity index 100% rename from miranda/convert/data/eccc_rdrs_cf_attrs.json rename to miranda/convert/configs/eccc-rdrs_cf_attrs.json diff --git a/miranda/convert/data/ecmwf_cf_attrs.json b/miranda/convert/configs/ecmwf_cf_attrs.json similarity index 100% rename from miranda/convert/data/ecmwf_cf_attrs.json rename to miranda/convert/configs/ecmwf_cf_attrs.json diff --git a/miranda/convert/data/emdna_cf_attrs.json b/miranda/convert/configs/emdna_cf_attrs.json similarity index 100% rename from miranda/convert/data/emdna_cf_attrs.json rename to miranda/convert/configs/emdna_cf_attrs.json diff --git a/miranda/convert/data/espo-g6-e5l_attrs.json b/miranda/convert/configs/espo-g6-e5l_attrs.json similarity index 100% rename from miranda/convert/data/espo-g6-e5l_attrs.json rename to miranda/convert/configs/espo-g6-e5l_attrs.json diff --git a/miranda/convert/data/espo-g6-r2_attrs.json b/miranda/convert/configs/espo-g6-r2_attrs.json similarity index 100% rename from miranda/convert/data/espo-g6-r2_attrs.json rename to miranda/convert/configs/espo-g6-r2_attrs.json diff --git a/miranda/convert/data/ets_grnch_cf_attrs.json 
b/miranda/convert/configs/ets_grnch_cf_attrs.json similarity index 100% rename from miranda/convert/data/ets_grnch_cf_attrs.json rename to miranda/convert/configs/ets_grnch_cf_attrs.json diff --git a/miranda/convert/data/hq_cf_attrs.json b/miranda/convert/configs/hq_cf_attrs.json similarity index 100% rename from miranda/convert/data/hq_cf_attrs.json rename to miranda/convert/configs/hq_cf_attrs.json diff --git a/miranda/convert/data/melcc_cf_attrs.json b/miranda/convert/configs/melcc_cf_attrs.json similarity index 100% rename from miranda/convert/data/melcc_cf_attrs.json rename to miranda/convert/configs/melcc_cf_attrs.json diff --git a/miranda/convert/data/nasa_ag_cf_attrs.json b/miranda/convert/configs/nasa_ag_cf_attrs.json similarity index 100% rename from miranda/convert/data/nasa_ag_cf_attrs.json rename to miranda/convert/configs/nasa_ag_cf_attrs.json diff --git a/miranda/convert/data/nex-gddp-cmip6_attrs.json b/miranda/convert/configs/nex-gddp-cmip6_attrs.json similarity index 100% rename from miranda/convert/data/nex-gddp-cmip6_attrs.json rename to miranda/convert/configs/nex-gddp-cmip6_attrs.json diff --git a/miranda/convert/data/rvt_raven_attrs.json b/miranda/convert/configs/rvt_raven_attrs.json similarity index 100% rename from miranda/convert/data/rvt_raven_attrs.json rename to miranda/convert/configs/rvt_raven_attrs.json diff --git a/miranda/convert/data/usask_cf_attrs.json b/miranda/convert/configs/usask_cf_attrs.json similarity index 100% rename from miranda/convert/data/usask_cf_attrs.json rename to miranda/convert/configs/usask_cf_attrs.json diff --git a/miranda/convert/deh.py b/miranda/convert/deh.py index 6179d7fd..55581f5d 100644 --- a/miranda/convert/deh.py +++ b/miranda/convert/deh.py @@ -18,11 +18,12 @@ __all__ = ["open_txt"] # CMOR-like attributes -cmor = json.load(open(Path(__file__).parent / "data" / "deh_cf_attrs.json"))[ # noqa +cmor = json.load(open(Path(__file__).parent / "configs" / "deh_cf_attrs.json"))[ "variable_entry" ] -# TODO: 
Some potentially useful attributes were skipped, because they would be complicated to include in a dataset since they vary per station +# TODO: Some potentially useful attributes were skipped +# because they would be complicated to include in a dataset since they vary per station meta_patterns = { "Station: ": "name", "Bassin versant: ": "bv", diff --git a/miranda/convert/hq.py b/miranda/convert/hq.py index 6425f0e1..338fcd92 100644 --- a/miranda/convert/hq.py +++ b/miranda/convert/hq.py @@ -21,7 +21,9 @@ __all__ = ["open_csv"] # CMOR-like attributes -cmor = json.load(open(Path(__file__).parent / "data" / "hq_cf_attrs.json"))["variables"] +cmor = json.load(open(Path(__file__).parent / "configs" / "hq_cf_attrs.json"))[ + "variables" +] fp = r"[-+]?\d*,\d+|\d+" @@ -180,6 +182,6 @@ def to_cf(meta: dict, data: pd.DataFrame, cf_table: dict | None = None) -> xr.Da def open_csv(path: str | Path, cf_table: dict | None = cmor) -> xr.DataArray: - """Extract daily HQ meteo data and convert to xr.DataArray with CF-Convention attributes.""" + """Extract daily HQ meteo configs and convert to xr.DataArray with CF-Convention attributes.""" meta, data = extract_daily(path) return to_cf(meta, data, cf_table) diff --git a/miranda/decode/_decoder.py b/miranda/decode/_decoder.py index 088e0cc5..67433f1c 100644 --- a/miranda/decode/_decoder.py +++ b/miranda/decode/_decoder.py @@ -19,15 +19,15 @@ from pandas._libs.tslibs import NaTType # noqa from miranda.convert.utils import date_parser, find_version_hash # noqa -from miranda.cv import VALIDATION_ENABLED from miranda.scripting import LOGGING_CONFIG from miranda.units import get_time_frequency +from miranda.vocabularies.esgf import VALIDATION_ENABLED from ._time import TIME_UNITS_TO_FREQUENCY, TIME_UNITS_TO_TIMEDELTA, DecoderError if VALIDATION_ENABLED: - from miranda.cv import INSTITUTIONS, PROJECT_MODELS from miranda.validators import FACETS_SCHEMA # noqa + from miranda.vocabularies.esgf import INSTITUTIONS, PROJECT_MODELS 
config.dictConfig(LOGGING_CONFIG) diff --git a/miranda/eccc/__init__.py b/miranda/eccc/__init__.py index 781076e0..c4a33869 100644 --- a/miranda/eccc/__init__.py +++ b/miranda/eccc/__init__.py @@ -1,5 +1,2 @@ """Environment and Climate Change Canada specialized conversion module.""" from __future__ import annotations - -from ._homogenized import * -from ._summaries import * diff --git a/miranda/eccc/_utils.py b/miranda/eccc/_utils.py index afb34770..a501dac6 100644 --- a/miranda/eccc/_utils.py +++ b/miranda/eccc/_utils.py @@ -6,7 +6,7 @@ from miranda.scripting import LOGGING_CONFIG -__all__ = ["cf_station_metadata", "cf_ahccd_metadata"] +__all__ = ["cf_station_metadata"] logging.config.dictConfig(LOGGING_CONFIG) @@ -836,167 +836,3 @@ def cf_station_metadata(variable_code: int | str) -> Mapping[str, int | float | logging.error(f"Hourly variable `{code}` not supported.") raise return variable - - -def cf_ahccd_metadata( - code: str, gen: int -) -> (dict[str, int | float | str], dict, list[tuple[int, int]], int): - """ - - Parameters - ---------- - code: {"dx", "dn", "dm", "dt", "ds", "dr"} - gen: {1, 2, 3} - - Returns - ------- - dict[str, int or str or float], dict, list[tuple[int, int]], int - """ - generation = {1: "First", 2: "Second", 3: "Third"}.get(gen) - - ec_ahccd_attrs = dict( - dx=dict( - variable="tasmax", - units="degC", - standard_name="air_temperature", - long_name="Near-Surface Maximum Daily Air Temperature", - comment=f"ECCC {generation} Generation of Adjusted and Homogenized Temperature Data", - ), - dn=dict( - variable="tasmin", - units="degC", - standard_name="air_temperature", - long_name="Near-Surface Minimum Daily Air Temperature", - comment=f"ECCC {generation} Generation of Adjusted and Homogenized Temperature Data", - ), - dm=dict( - variable="tas", - units="degC", - standard_name="air_temperature", - long_name="Near-Surface Daily Mean Air Temperature", - comment=f"ECCC {generation} Generation of Adjusted and Homogenized Temperature Data", - 
), - dt=dict( - variable="pr", - units="mm d-1", - standard_name="precipitation_flux", - long_name="Daily Total Precipitation", - comment=f"ECCC {generation} Generation of Adjusted and Homogenized Precipitation Data", - ), - ds=dict( - variable="prsn", - units="mm d-1", - standard_name="snowfall_flux", - long_name="Daily Snowfall", - comment=f"ECCC {generation} Generation of Adjusted and Homogenized Precipitation Data", - ), - dr=dict( - variable="prlp", - units="mm d-1", - standard_name="rainfall_flux", - long_name="Daily Rainfall", - comment=f"ECCC {generation} Generation of Adjusted and Homogenized Precipitation Data", - ), - ) - try: - variable = ec_ahccd_attrs[code] - variable["missing_flags"] = "M" - if variable["variable"].startswith("tas"): - variable["NaN_value"] = -9999.9 - column_names = [ - "No", - "StnId", - "Station name", - "Prov", - "FromYear", - "FromMonth", - "ToYear", - "ToMonth", - "%Miss", - "Lat(deg)", - "Long(deg)", - "Elev(m)", - "Joined", - "RCS", - ] - column_spaces = [(0, 5), (5, 6), (6, 8), (8, 9)] - ii = 9 - for i in range(1, 32): - column_spaces.append((ii, ii + 7)) - ii += 7 - column_spaces.append((ii, ii + 1)) - ii += 1 - header_row = 3 - - elif variable["variable"].startswith("pr"): - variable["NaN_value"] = -9999.99 - column_names = [ - "Prov", - "Station name", - "stnid", - "beg yr", - "beg mon", - "end yr", - "end mon", - "lat (deg)", - "long (deg)", - "elev (m)", - "stns joined", - ] - column_spaces = [(0, 4), (4, 5), (5, 7), (7, 8)] - ii = 8 - for i in range(1, 32): - column_spaces.append((ii, ii + 8)) - ii += 8 - column_spaces.append((ii, ii + 1)) - ii += 1 - header_row = 0 - - else: - raise KeyError - - column_names = { - col.lower() - .split("(")[0] - .replace("%", "pct_") - .strip() - .replace(" ", "_"): col - for col in list(column_names) - } - - if gen == 3: - _citation = ( - "Vincent, L.A., M.M. Hartwell and X.L. 
Wang, 2020: A Third Generation of Homogenized " - "Temperature for Trend Analysis and Monitoring Changes in Canada’s Climate. " - "Atmosphere-Ocean. https://doi.org/10.1080/07055900.2020.1765728" - ) - elif gen == 2: - _citation = ( - "Mekis, É and L.A. Vincent, 2011: An overview of the second generation adjusted daily " - "precipitation dataset for trend analysis in Canada. Atmosphere-Ocean 49(2), " - "163-177 doi:10.1080/07055900.2011.583910" - ) - else: - msg = f"Generation '{gen}' not supported." - raise NotImplementedError(msg) - - global_attrs = dict( - title=f"{generation} Generation of Homogenized Daily {variable['variable']} " - "for Canada (Updated to December 2019)", - history=f"{dt.today().strftime('%Y-%m-%d')}: Convert from original format to NetCDF", - type="station_obs", - institute="Environment and Climate Change Canada", - institute_id="ECCC", - dataset_id=f"AHCCD_gen{gen}_day_{variable['variable']}", - frequency="day", - license_type="permissive", - license="https:/open.canada.ca/en/open-government-licence-canada", - citation=_citation, - ) - - except KeyError as e: - msg = f"AHCCD variable '{code}' or generation '{gen}' not supported." - logging.error(msg) - raise NotImplementedError(msg) from e - - return variable, column_names, column_spaces, header_row, global_attrs diff --git a/miranda/eccc/eccc_homogenized_cf_attrs.json b/miranda/eccc/eccc_homogenized_cf_attrs.json deleted file mode 100644 index 92c3b0f1..00000000 --- a/miranda/eccc/eccc_homogenized_cf_attrs.json +++ /dev/null @@ -1,111 +0,0 @@ -{ - "Header": { - "Conventions": "CF-1.8", - "_product": { - "gen2": "ECCC Adjusted and Homogenized Canadian Climate Data (AHCCD) version 2", - "gen3": "ECCC Adjusted and Homogenized Canadian Climate Data (AHCCD) version 3" - }, - "citation": { - "gen2": "Mekis, É and L.A. Vincent, 2011: An overview of the second generation adjusted daily precipitation dataset for trend analysis in Canada. 
Atmosphere-Ocean 49(2), 163-177 doi:10.1080/07055900.2011.583910", - "gen3": "Vincent, L.A., M.M. Hartwell and X.L. Wang, 2020: A Third Generation of Homogenized Temperature for Trend Analysis and Monitoring Changes in Canada’s Climate. Atmosphere-Ocean. https://doi.org/10.1080/07055900.2020.1765728" - }, - "contact": "info.cccs-ccsc@canada.ca", - "documentation": "https://www.canada.ca/en/environment-climate-change/services/climate-change/canadian-centre-climate-services/display-download/technical-documentation-adjusted-climate-data.html", - "float_missing_value": "1e20", - "frequency": "day", - "institution": "GovCan", - "int_missing_value": "-999", - "license": "https://climate.weather.gc.ca/prods_servs/attachment1_e.html", - "license_type": "permissive", - "organization": "ECCC", - "realm": "atmos", - "table_date": "2023-03-23", - "table_id": "ECCC" - }, - "variable_entry": { - "dm": { - "add_offset": 273.15, - "cell_methods": "time: mean", - "comments": "Station data converted from Mean Temp (°C)", - "frequency": "day", - "grid_mapping": "regular_lon_lat", - "long_name": "Near-Surface Air Temperature", - "original_field": "Mean Temp (°C)", - "out_name": "tas", - "scale_factor": 1, - "standard_name": "air_temperature", - "type": "real", - "units": "K" - }, - "dn": { - "add_offset": 273.15, - "cell_methods": "time: minimum", - "comments": "Station data converted from Min Temp (°C)", - "frequency": "day", - "grid_mapping": "regular_lon_lat", - "long_name": "Daily Minimum Near-Surface Air Temperature", - "original_field": "Min Temp (°C)", - "out_name": "tasmin", - "scale_factor": 1, - "standard_name": "air_temperature", - "type": "real", - "units": "K" - }, - "dr": { - "add_offset": 0, - "cell_methods": "time: mean", - "comments": "Station data converted from Total Rain (mm) using a density of 1000 kg/m³", - "frequency": "day", - "grid_mapping": "regular_lon_lat", - "long_name": "Liquid Precipitation", - "original_field": "Total Rain (mm)", - "out_name": "prlp", - 
"scale_factor": 1.1574074074074073e-05, - "standard_name": "rainfall_flux", - "type": "real", - "units": "kg m-2 s-1" - }, - "ds": { - "add_offset": 0, - "cell_methods": "time: mean", - "comments": "station data converted from Total Snow (cm) using a density of 100 kg/m³", - "frequency": "day", - "grid_mapping": "regular_lon_lat", - "long_name": "Snowfall Flux", - "original_field": "Total Snow (cm)", - "out_name": "prsn", - "scale_factor": 1.1574074074074073e-05, - "standard_name": "snowfall_flux", - "type": "real", - "units": "kg m-2 s-1" - }, - "dt": { - "add_offset": 0, - "cell_methods": "time: mean", - "comments": "Station data converted from Total Precip (mm) using a density of 1000 kg/m³", - "frequency": "day", - "grid_mapping": "regular_lon_lat", - "long_name": "Precipitation", - "original_field": "Total Precip (mm)", - "out_name": "pr", - "scale_factor": 1.1574074074074073e-05, - "standard_name": "precipitation_flux", - "type": "real", - "units": "kg m-2 s-1" - }, - "dx": { - "add_offset": 273.15, - "cell_methods": "time: maximum", - "comments": "station data converted from Max Temp (°C)", - "frequency": "day", - "grid_mapping": "regular_lon_lat", - "long_name": "Daily Maximum Near-Surface Air Temperature", - "original_field": "Max Temp (°C)", - "out_name": "tasmax", - "scale_factor": 1, - "standard_name": "air_temperature", - "type": "real", - "units": "K" - } - } -} diff --git a/miranda/eccc/eccc_obs_summary_cf_attrs.json b/miranda/eccc/eccc_obs_summary_cf_attrs.json deleted file mode 100644 index b21f224e..00000000 --- a/miranda/eccc/eccc_obs_summary_cf_attrs.json +++ /dev/null @@ -1,173 +0,0 @@ -{ - "Header": { - "Conventions": "CF-1.8", - "contact": "info.cccs-ccsc@canada.ca", - "institution": "GovCan", - "int_missing_value": "-999", - "license": "https://climate.weather.gc.ca/prods_servs/attachment1_e.html", - "license_type": "permissive", - "missing_value": "1e20", - "organization": "ECCC", - "processing_level": "raw", - "realm": "atmos", - "source": 
"msc", - "table_date": "2023-03-23", - "type": "station-obs" - }, - "variable_entry": { - "cdd": { - "add_offset": 0, - "cell_methods": "time: sum", - "comments": "Station data converted from Cool Deg Days (°C)", - "frequency": "day", - "grid_mapping": "regular_lon_lat", - "long_name": "Number of Degrees Celsius Over a Mean Temperature of 18 °C", - "original_variable": "Cool Deg Days (°C)", - "out_name": "cdd", - "scale_factor": 1, - "standard_name": "cooling_degree_days", - "type": "real", - "units": "C" - }, - "hdd": { - "add_offset": 0, - "cell_methods": "time: sum", - "comments": "Station data converted from Heat Deg Days (°C)", - "frequency": "day", - "grid_mapping": "regular_lon_lat", - "long_name": "Number of Degrees Celsius Under a Mean Temperature of 18 °C", - "original_variable": "Heat Deg Days (°C)", - "out_name": "hdd", - "scale_factor": 1, - "standard_name": "heating_degree_days", - "type": "real", - "units": "C" - }, - "pr": { - "add_offset": 0, - "cell_methods": "time: mean", - "comments": "Station data converted from Total Precip (mm) using a density of 1000 kg/m³", - "frequency": "day", - "grid_mapping": "regular_lon_lat", - "long_name": "Precipitation", - "original_variable": "Total Precip (mm)", - "out_name": "pr", - "scale_factor": 1.1574074074074073e-05, - "standard_name": "precipitation_flux", - "type": "real", - "units": "kg m-2 s-1" - }, - "prlp": { - "add_offset": 0, - "cell_methods": "time: mean", - "comments": "Station data converted from Total Rain (mm) using a density of 1000 kg/m³", - "frequency": "day", - "grid_mapping": "regular_lon_lat", - "long_name": "Liquid Precipitation", - "original_variable": "Total Rain (mm)", - "out_name": "prlp", - "scale_factor": 1.1574074074074073e-05, - "standard_name": "rainfall_flux", - "type": "real", - "units": "kg m-2 s-1" - }, - "prsn": { - "add_offset": 0, - "cell_methods": "time: mean", - "comments": "station data converted from Total Snow (cm) using a density of 100 kg/m³", - "frequency": "day", 
- "grid_mapping": "regular_lon_lat", - "long_name": "Snowfall Flux", - "original_variable": "Total Snow (cm)", - "out_name": "prsn", - "scale_factor": 1.1574074074074073e-05, - "standard_name": "snowfall_flux", - "type": "real", - "units": "kg m-2 s-1" - }, - "sfcWindAz": { - "add_offset": 0, - "cell_methods": "time: mean", - "comments": "Station data converted from Dir of Max Gust (10s deg)", - "frequency": "day", - "grid_mapping": "regular_lon_lat", - "long_name": "Direction from which the Daily Maximum Near-Surface Gust Wind Speed maximum Blows", - "original_variable": "Dir of Max Gust (10s deg)", - "out_name": "sfcWindAz", - "scale_factor": 1, - "standard_name": "wind_direction", - "type": "real", - "units": "degree" - }, - "sfcWindMax": { - "add_offset": 0, - "cell_methods": "time: max", - "comments": "Station data converted from Spd of Max Gust (km/h)", - "frequency": "day", - "grid_mapping": "regular_lon_lat", - "long_name": "Daily Maximum Near-Surface Gust Wind Speed maximum", - "original_variable": "Spd of Max Gust (km/h)", - "out_name": "sfcWindMax", - "scale_factor": 0.2777777777777778, - "standard_name": "wind_speed_of_gust maximum", - "type": "real", - "units": "m s-1" - }, - "snd": { - "add_offset": 0, - "cell_methods": "time: mean", - "comments": "Station data converted from Snow on Grnd (cm)", - "frequency": "day", - "grid_mapping": "regular_lon_lat", - "long_name": "Snow Depth", - "original_variable": "Snow on Grnd (cm)", - "out_name": "snd", - "scale_factor": 0.01, - "standard_name": "surface_snow_thickness", - "type": "real", - "units": "m" - }, - "tas": { - "add_offset": 273.15, - "cell_methods": "time: mean", - "comments": "Station data converted from Mean Temp (°C)", - "frequency": "day", - "grid_mapping": "regular_lon_lat", - "long_name": "Near-Surface Air Temperature", - "original_variable": "Mean Temp (°C)", - "out_name": "tas", - "scale_factor": 1, - "standard_name": "air_temperature", - "type": "real", - "units": "K" - }, - "tasmax": { - 
"add_offset": 273.15, - "cell_methods": "time: maximum", - "comments": "station data converted from Max Temp (°C)", - "frequency": "day", - "grid_mapping": "regular_lon_lat", - "long_name": "Daily Maximum Near-Surface Air Temperature", - "original_variable": "Max Temp (°C)", - "out_name": "tasmax", - "scale_factor": 1, - "standard_name": "air_temperature", - "type": "real", - "units": "K" - }, - "tasmin": { - "add_offset": 273.15, - "cell_methods": "time: minimum", - "comments": "Station data converted from Min Temp (°C)", - "frequency": "day", - "grid_mapping": "regular_lon_lat", - "long_name": "Daily Minimum Near-Surface Air Temperature", - "original_variable": "Min Temp (°C)", - "out_name": "tasmin", - "scale_factor": 1, - "standard_name": "air_temperature", - "type": "real", - "units": "K" - } - } -} diff --git a/miranda/eccc/geomet.py b/miranda/eccc/geomet.py index 3fb00d72..c5446dd5 100644 --- a/miranda/eccc/geomet.py +++ b/miranda/eccc/geomet.py @@ -1,3 +1,4 @@ +"""ECCC Geomet Module.""" from __future__ import annotations import os @@ -7,7 +8,8 @@ import xarray as xr -def load_station_metadata(meta: str | os.PathLike) -> xr.Dataset: +def load_station_metadata(meta: str | os.PathLike | None) -> xr.Dataset: + """Method to load station metadata from a file or URL.""" if meta: df_inv = pd.read_csv(meta, header=0) else: diff --git a/miranda/io/_rechunk.py b/miranda/io/_rechunk.py index 6c25d432..7c55fee4 100644 --- a/miranda/io/_rechunk.py +++ b/miranda/io/_rechunk.py @@ -26,7 +26,7 @@ "translate_time_chunk", ] -_data_folder = Path(__file__).parent / "data" +_data_folder = Path(__file__).parent / "configs" chunk_configurations = json.load(open(_data_folder / "ouranos_chunk_config.json")) diff --git a/miranda/io/utils.py b/miranda/io/utils.py index 33da8643..42f7b5e7 100644 --- a/miranda/io/utils.py +++ b/miranda/io/utils.py @@ -28,7 +28,7 @@ "sort_variables", ] -_data_folder = Path(__file__).parent / "data" +_data_folder = Path(__file__).parent / "configs" 
name_configurations = json.load(open(_data_folder / "ouranos_name_config.json")) diff --git a/miranda/preprocess/__init__.py b/miranda/preprocess/__init__.py new file mode 100644 index 00000000..0ae1f1d6 --- /dev/null +++ b/miranda/preprocess/__init__.py @@ -0,0 +1 @@ +"""Preprocessing tools for Miranda.""" diff --git a/miranda/preprocess/_data_definitions.py b/miranda/preprocess/_data_definitions.py new file mode 100644 index 00000000..815b1048 --- /dev/null +++ b/miranda/preprocess/_data_definitions.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any + +_config_folder = Path(__file__).resolve().parent / "configs" + + +__all__ = ["load_json_data_mappings"] + + +def load_json_data_mappings(project: str) -> dict[str, Any]: + """Load JSON mappings for supported dataset conversions. + + Parameters + ---------- + project : str + + Returns + ------- + dict[str, Any] + """ + if project == "eccc-homogenized": + metadata_definition = json.load( + open(_config_folder / "eccc-homogenized_attrs.json") + ) + elif project == "eccc-obs": + metadata_definition = json.load(open(_config_folder / "eccc-obs_attrs.json")) + elif project == "eccc-obs-summary": + metadata_definition = json.load( + open(_config_folder / "eccc-obs-summary_attrs.json") + ) + else: + raise NotImplementedError(f"Project not supported: {project}") + + return metadata_definition diff --git a/miranda/eccc/_homogenized.py b/miranda/preprocess/_eccc_homogenized.py similarity index 74% rename from miranda/eccc/_homogenized.py rename to miranda/preprocess/_eccc_homogenized.py index de4b04f0..4b4e6706 100644 --- a/miranda/eccc/_homogenized.py +++ b/miranda/preprocess/_eccc_homogenized.py @@ -10,16 +10,121 @@ import xarray as xr from dask.diagnostics import ProgressBar +from miranda.preprocess._data_definitions import load_json_data_mappings +from miranda.preprocess._treatments import basic_metadata_conversion from miranda.scripting import 
LOGGING_CONFIG -from ._utils import cf_ahccd_metadata - logging.config.dictConfig(LOGGING_CONFIG) logger = logging.Logger("miranda") __all__ = ["convert_ahccd", "convert_ahccd_fwf_files"] +def _ahccd_metadata( + gen: int, +) -> (dict[str, int | float | str], dict, list[tuple[int, int]], int): + """ + + Parameters + ---------- + gen: {1, 2, 3} + + Returns + ------- + dict[str, int or str or float], dict, list[tuple[int, int]], int + """ + generation = {1: "First", 2: "Second", 3: "Third"}.get(gen) + if not generation: + raise NotImplementedError(f"Generation '{gen}' not supported") + + config = load_json_data_mappings("eccc-homogenized") + metadata = basic_metadata_conversion("eccc-homogenized", config) + header = metadata["Header"] + + # Conditional handling of global attributes based on generation + for field in [f for f in header if f.startswith("_")]: + if isinstance(header[field], dict): + attr_treatment = header[field]["generation"] + else: + raise AttributeError( + f"Attribute treatment configuration for field `{field}` is not properly configured. Verify JSON." 
+ ) + if field in ["_citation" "_product"]: + for attribute, value in attr_treatment.items(): + if attribute == generation: + header[field[1:]] = value + del header[field] + + return header + + +def _column_definitions( + variable_code: str, metadata: dict +) -> tuple[dict, list[tuple[int, int]], int]: + variable = metadata[variable_code] + variable["missing_flags"] = "M" + if variable["variable"].startswith("tas"): + variable["NaN_value"] = -9999.9 + column_names = [ + "No", + "StnId", + "Station name", + "Prov", + "FromYear", + "FromMonth", + "ToYear", + "ToMonth", + "%Miss", + "Lat(deg)", + "Long(deg)", + "Elev(m)", + "Joined", + "RCS", + ] + column_spaces = [(0, 5), (5, 6), (6, 8), (8, 9)] + ii = 9 + for i in range(1, 32): + column_spaces.append((ii, ii + 7)) + ii += 7 + column_spaces.append((ii, ii + 1)) + ii += 1 + header_row = 3 + + elif variable["variable"].startswith("pr"): + variable["NaN_value"] = -9999.99 + column_names = [ + "Prov", + "Station name", + "stnid", + "beg yr", + "beg mon", + "end yr", + "end mon", + "lat (deg)", + "long (deg)", + "elev (m)", + "stns joined", + ] + column_spaces = [(0, 4), (4, 5), (5, 7), (7, 8)] + ii = 8 + for i in range(1, 32): + column_spaces.append((ii, ii + 8)) + ii += 8 + column_spaces.append((ii, ii + 1)) + ii += 1 + header_row = 0 + + else: + raise KeyError + + column_names = { + col.lower().split("(")[0].replace("%", "pct_").strip().replace(" ", "_"): col + for col in list(column_names) + } + + return column_names, column_spaces, header_row + + def convert_ahccd( data_source: str | Path, output_dir: str | Path, @@ -45,6 +150,8 @@ def convert_ahccd( code = dict(tasmax="dx", tasmin="dn", tas="dm", pr="dt", prsn="ds", prlp="dr").get( variable ) + + attrs = _ahccd_metadata(generation) var, col_names, col_spaces, header_row, global_attrs = cf_ahccd_metadata( code, generation ) @@ -56,7 +163,7 @@ def convert_ahccd( else: raise NotImplementedError(f"Code '{code} for generation {gen}.") - metadata_source = 
Path(__file__).resolve().parent.joinpath("data").joinpath(meta) + metadata_source = Path(__file__).resolve().parent.joinpath("configs").joinpath(meta) if "tas" in variable: metadata = pd.read_csv(metadata_source, header=2) @@ -179,7 +286,7 @@ def convert_ahccd_fwf_files( ) if attrs is None: - attrs, _, _, _, _ = cf_ahccd_metadata(code, generation) + attrs = _ahccd_metadata(generation) if cols_specs is None: _, _, cols_specs, _, _ = cf_ahccd_metadata(code, generation) _, _, _, nhead, _ = cf_ahccd_metadata(code, generation) diff --git a/miranda/convert/eccc_obs.py b/miranda/preprocess/_eccc_obs.py similarity index 99% rename from miranda/convert/eccc_obs.py rename to miranda/preprocess/_eccc_obs.py index fd827d2f..7cfa9249 100644 --- a/miranda/convert/eccc_obs.py +++ b/miranda/preprocess/_eccc_obs.py @@ -45,9 +45,10 @@ config.dictConfig(LOGGING_CONFIG) __all__ = [ - "merge_stations", "convert_flat_files", + "convert_station", "merge_converted_variables", + "merge_stations", ] KiB = int(pow(2, 10)) @@ -60,8 +61,6 @@ def _fwf_column_definitions( time_frequency: str, ) -> tuple[list[str], list[int], list[type[str | int]]]: """Return the column names, widths, and data types for the fixed-width format.""" - - # Preparing the column headers if time_frequency.lower() in ["h", "hour", "hourly"]: num_observations = 24 column_names = ["code", "year", "month", "day", "code_var"] @@ -103,6 +102,7 @@ def convert_station( client: Any, **kwargs, ): + """Convert a single station's data from the fixed-width format to a netCDF file.""" data = Path(data) column_names, column_widths, column_dtypes = _fwf_column_definitions(mode) @@ -690,7 +690,7 @@ def _tmp_zarr( try: ds = xr.open_mfdataset( - nc, combine="nested", concat_dim={"station"}, preprocess=_remove_duplicates + nc, combine="nested", concat_dim="station", preprocess=_remove_duplicates ) except ValueError as e: errored_nc_files = ", ".join([Path(f).name for f in nc]) diff --git a/miranda/eccc/_summaries.py 
b/miranda/preprocess/_eccc_summaries.py similarity index 99% rename from miranda/eccc/_summaries.py rename to miranda/preprocess/_eccc_summaries.py index 8e0b42c1..3c31ba32 100755 --- a/miranda/eccc/_summaries.py +++ b/miranda/preprocess/_eccc_summaries.py @@ -30,7 +30,7 @@ __all__ = ["extract_daily_summaries", "daily_summaries_to_netcdf"] eccc_metadata = json.load( - open(Path(__file__).parent / "eccc_obs_summary_cf_attrs.json") + open(Path(__file__).resolve().parent / "configs" / "eccc-obs-summary_cf_attrs.json") )["variable_entry"] diff --git a/miranda/preprocess/_treatments.py b/miranda/preprocess/_treatments.py new file mode 100644 index 00000000..80d3fd45 --- /dev/null +++ b/miranda/preprocess/_treatments.py @@ -0,0 +1,43 @@ +from __future__ import annotations + +import logging +from typing import Any + +from miranda import __version__ as __miranda_version__ + + +def basic_metadata_conversion( + project: str, metadata: dict +) -> dict[str, dict[str, Any]]: + """Present basic metadata conversion. + + Parameters + ---------- + project : str + Dataset project name. + metadata : dict + Metadata definition dictionary for project and variable(s). + + Returns + ------- + xarray.Dataset + """ + logging.info("Converting metadata.") + header = metadata["Header"] + + # Static handling of version global attributes + miranda_version = header.get("_miranda_version") + if miranda_version: + if isinstance(miranda_version, bool): + header["miranda_version"] = __miranda_version__ + elif isinstance(miranda_version, dict): + if project in miranda_version.keys(): + header["miranda_version"] = __miranda_version__ + else: + logging.warning( + f"`_miranda_version` not set for project `{project}`. Not appending." 
+ ) + if "_miranda_version" in header: + del header["_miranda_version"] + + return metadata diff --git a/miranda/preprocess/configs/eccc-homogenized_attrs.json b/miranda/preprocess/configs/eccc-homogenized_attrs.json new file mode 100644 index 00000000..539ad40c --- /dev/null +++ b/miranda/preprocess/configs/eccc-homogenized_attrs.json @@ -0,0 +1,36 @@ +{ + "Header": { + "_citation": { + "generation": { + "Second": "Mekis, É and L.A. Vincent, 2011: An overview of the second generation adjusted daily precipitation dataset for trend analysis in Canada. Atmosphere-Ocean 49(2), 163-177 doi:10.1080/07055900.2011.583910", + "Third": "Vincent, L.A., M.M. Hartwell and X.L. Wang, 2020: A Third Generation of Homogenized Temperature for Trend Analysis and Monitoring Changes in Canada’s Climate. Atmosphere-Ocean. https://doi.org/10.1080/07055900.2020.1765728" + } + }, + "_converter": true, + "_miranda_version": true, + "_missing_values": [ + "-999", + "1e20" + ], + "_product": { + "generation": { + "Second": "ECCC Adjusted and Homogenized Canadian Climate Data (AHCCD) version 2", + "Third": "ECCC Adjusted and Homogenized Canadian Climate Data (AHCCD) version 3" + } + }, + "acknowledgement": "This data is provided by Environment and Climate Change Canada (ECCC).", + "author": "Environment and Climate Change Canada (ECCC)", + "contact": "info.cccs-ccsc@canada.ca", + "documentation": "https://www.canada.ca/en/environment-climate-change/services/climate-change/canadian-centre-climate-services/display-download/technical-documentation-adjusted-climate-data.html", + "institution": "GovCan", + "license": "https://climate.weather.gc.ca/prods_servs/attachment1_e.html", + "license_preamble": "The data is owned by the Government of Canada (Environment and Climate Change Canada), and fall under the licence agreement for use of Environment and Climate Change Canada data.", + "license_type": "permissive", + "organization": "ECCC", + "processing_level": "adjusted", + "realm": "atmos", + 
"source": "AHCCD", + "table_date": "2023-03-23", + "table_id": "ECCC" + } +} diff --git a/miranda/eccc/data/eccc_obs_summary_cf_attrs.json b/miranda/preprocess/configs/eccc-obs-summary_cf_attrs.json similarity index 100% rename from miranda/eccc/data/eccc_obs_summary_cf_attrs.json rename to miranda/preprocess/configs/eccc-obs-summary_cf_attrs.json diff --git a/miranda/preprocess/configs/eccc-obs_attrs.json b/miranda/preprocess/configs/eccc-obs_attrs.json new file mode 100644 index 00000000..7265ca71 --- /dev/null +++ b/miranda/preprocess/configs/eccc-obs_attrs.json @@ -0,0 +1,29 @@ +{ + "Header": { + "_converter": true, + "_frequency": true, + "_miranda_version": true, + "_missing_flags": "M", + "_missing_values": [ + "-999", + "1e20", + "-9999", + "#####" + ], + "acknowledgement": "This data is provided by Environment and Climate Change Canada (ECCC).", + "author": "Environment and Climate Change Canada (ECCC)", + "contact": "climatcentre-climatecentral@ec.gc.ca", + "documentation": "https://climate.weather.gc.ca/doc/Technical_Documentation.pdf", + "institution": "GovCan", + "license": "https://climate.weather.gc.ca/prods_servs/attachment1_e.html", + "license_preamble": "The data is owned by the Government of Canada (Environment and Climate Change Canada), and fall under the licence agreement for use of Environment and Climate Change Canada data.", + "license_type": "permissive", + "organization": "ECCC", + "processing_level": "raw", + "source": "ECCC-OBS", + "table_date": "2023-08-02", + "title": "Environment and Climate Change Canada (ECCC) weather station observations", + "type": "station-obs", + "usage": "The original data is owned by the Government of Canada (Environment and Climate Change Canada), and falls under the licence agreement for use of Environment and Climate Change Canada data" + } +} diff --git a/miranda/eccc/convert.py b/miranda/preprocess/eccc.py similarity index 90% rename from miranda/eccc/convert.py rename to miranda/preprocess/eccc.py index 
eb94be48..aab3595d 100644 --- a/miranda/eccc/convert.py +++ b/miranda/preprocess/eccc.py @@ -10,31 +10,28 @@ from functools import partial from pathlib import Path -from miranda.convert.eccc_obs import _convert_station_file from miranda.eccc._utils import cf_station_metadata +from miranda.preprocess._eccc_obs import _convert_station_file from miranda.scripting import LOGGING_CONFIG logging.config.dictConfig(LOGGING_CONFIG) -_data_folder = Path(__file__).parent / "data" +_data_folder = Path(__file__).parent / "configs" eccc_observation_variables = dict() eccc_observation_variables["flat"] = [ - v - for v in json.load(open(_data_folder / "eccc_obs_flat_attrs.json"))[ - "variables" - ].keys() + v for v in json.load(open(_data_folder / "eccc-obs_attrs.json"))["variables"].keys() ] eccc_observation_variables["summary"] = [ attrs["_cf_variable_name"] - for attrs in json.load(open(_data_folder / "eccc_obs_summary_cf_attrs.json"))[ + for attrs in json.load(open(_data_folder / "eccc-obs-summary_attrs.json"))[ "variables" ].values() ] eccc_observation_variables["homogenized"] = [ attrs["_cf_variable_name"] - for attrs in json.load(open(_data_folder / "eccc_homogenized_cf_attrs.json"))[ + for attrs in json.load(open(_data_folder / "eccc-homogenized_attrs.json"))[ "variables" ].values() ] diff --git a/miranda/convert/ecmwf.py b/miranda/preprocess/ecmwf_tigge.py similarity index 100% rename from miranda/convert/ecmwf.py rename to miranda/preprocess/ecmwf_tigge.py diff --git a/miranda/structure/_structure.py b/miranda/structure/_structure.py index cac8b53b..1c2b321a 100644 --- a/miranda/structure/_structure.py +++ b/miranda/structure/_structure.py @@ -12,10 +12,10 @@ import yaml from schema import SchemaError -from miranda.cv import VALIDATION_ENABLED from miranda.decode import Decoder, DecoderError, guess_project from miranda.io import discover_data from miranda.scripting import LOGGING_CONFIG +from miranda.vocabularies.esgf import VALIDATION_ENABLED if VALIDATION_ENABLED: from 
miranda.validators import validation_schemas @@ -306,7 +306,9 @@ def build_path_from_schema( Path or None """ if schema is None: - schema = Path(__file__).parent.joinpath("data").joinpath("ouranos_schema.yml") + schema = ( + Path(__file__).parent.joinpath("configs").joinpath("ouranos_schema.yml") + ) tree = parse_schema(facets, schema, top_folder) branch = tree[0] diff --git a/miranda/validators.py b/miranda/validators.py index 75551a9d..02ffc7bd 100644 --- a/miranda/validators.py +++ b/miranda/validators.py @@ -8,12 +8,12 @@ from pandas._libs.tslibs import NaTType # noqa from schema import Literal, Optional, Or, Regex, Schema -from .cv import VALIDATION_ENABLED +from miranda.vocabularies.esgf import VALIDATION_ENABLED __all__ = ["url_validate"] if VALIDATION_ENABLED: - from .cv import ( + from miranda.vocabularies.esgf import ( ACTIVITIES, BIAS_ADJUST_INSTITUTIONS, DRIVING_MODELS, diff --git a/miranda/vocabularies/__init__.py b/miranda/vocabularies/__init__.py new file mode 100644 index 00000000..74f0a223 --- /dev/null +++ b/miranda/vocabularies/__init__.py @@ -0,0 +1 @@ +"""Controlled Vocabulary module.""" diff --git a/miranda/vocabularies/eccc.py b/miranda/vocabularies/eccc.py new file mode 100644 index 00000000..f86ebb53 --- /dev/null +++ b/miranda/vocabularies/eccc.py @@ -0,0 +1,88 @@ +"""Definition lists of variables from ECCC for each type of archive.""" + +# For more information see the ECCC Technical Documentation + +__all__ = [ + "DLY", + "DLY02", + "DLY03", + "DLY04", + "DLY12", + "DLY13", + "DLY21", + "DLY44", + "HLY", + "HLY01", + "HLY01_RCS", + "HLY03", + "HLY10", + "HLY15", + "HLY21", + "MLY", + "MLY04", +] + +# Hourly Data + +HLY01 = [] +HLY01.extend(list(range(71, 123))) # Hourly variables +HLY01.extend([209, 210]) # Wind character and gust speed +HLY01.extend(list(range(219, 231))) # Cloud layers +HLY01.append(244) # Precipitation type +HLY01.append(260) # Freezing fog + +HLY01_RCS = HLY01.copy() +HLY01_RCS.extend( + list(range(262, 281)) +) # 
Reference Climate Surface (RCS) weather stations + +HLY03 = [] +HLY03.extend(list(range(123, 133))) # Hourly rainfall +HLY03.extend([160, 161]) + +HLY10 = [] +HLY10.extend(list(range(61, 69))) # Sunshine +HLY10.extend([133, 169, 170, 171, 172]) # Solar radiation + +HLY15 = [69, 70, 76, 156] # Wind + +HLY21 = [123] # Fischer/Porter precipitation + +HLY = list(set(HLY01 + HLY01_RCS + HLY03 + HLY10 + HLY15 + HLY21)) + +# Daily Data + +DLY02 = [] +DLY02.extend(list(range(1, 26))) # Daily variables +DLY02.append(157) # Direction of extreme gust +DLY02.append(179) # Daily bright sunshine + +DLY03 = [] +DLY03.extend(list(range(124, 133))) +DLY03.extend([160, 161]) + +DLY04 = DLY02.copy() + +DLY12 = [] +DLY12.extend(list(range(134, 151))) # Soil temperatures + +DLY13 = list(range(151, 156)) # Pan evaporation + +DLY21 = [12] # Precipitation +DLY21.extend(list(range(127, 133))) # Precipitation over time +DLY21.append(161) # Most precipitation in 25 hours + +DLY44 = [] +DLY44.extend([1, 2, 3]) # Temperature +DLY44.extend(list(range(10, 18))) # Precipitation + +DLY = list(set(DLY02 + DLY03 + DLY04 + DLY12 + DLY13 + DLY21 + DLY44)) + +# Monthly data + +MLY04 = [] +MLY04.extend(list(range(26, 39))) # Days with variables +MLY04.extend(list(range(39, 61))) # Means of variables +MLY04.append(158) # Direction of extreme gust + +MLY = list(set(MLY04)) diff --git a/miranda/cv.py b/miranda/vocabularies/esgf.py similarity index 99% rename from miranda/cv.py rename to miranda/vocabularies/esgf.py index 93d1f3c2..c22e0f13 100644 --- a/miranda/cv.py +++ b/miranda/vocabularies/esgf.py @@ -1,4 +1,4 @@ -"""Controlled Vocabulary module.""" +"""ESGF Vocabularies.""" from __future__ import annotations import warnings diff --git a/pyproject.toml b/pyproject.toml index 2b5c171c..ed3ca140 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -100,7 +100,11 @@ remote = [ [tool.black] target-version = [ - "py37" + "py37", + "py38", + "py39", + "py310", + "py311" ] [tool.coverage.run] diff --git 
a/templates/eccc_raw_hourly_conversion.py b/templates/eccc_raw_hourly_conversion.py index fcb47b2f..c72e3777 100644 --- a/templates/eccc_raw_hourly_conversion.py +++ b/templates/eccc_raw_hourly_conversion.py @@ -1,7 +1,7 @@ from os import getenv from pathlib import Path -from miranda.convert.eccc_obs import ( +from miranda.preprocess._eccc_obs import ( merge_stations, convert_flat_files, merge_converted_variables, diff --git a/templates/eccc_rdrs_processing.py b/templates/eccc_rdrs_processing.py index c7563157..f2313cfa 100644 --- a/templates/eccc_rdrs_processing.py +++ b/templates/eccc_rdrs_processing.py @@ -1,7 +1,7 @@ import logging from pathlib import Path -from miranda.convert.eccc_rdrs import convert_rdrs, rdrs_to_daily +from miranda.preprocess.eccc_rdrs import convert_rdrs, rdrs_to_daily from miranda.io import concat_rechunk_zarr From 9baaad505ddffe08847032771ad740b2c1c66f49 Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Thu, 3 Aug 2023 15:52:34 -0400 Subject: [PATCH 08/33] working version of ahccd conversion --- miranda/convert/corrections.py | 10 +- miranda/io/_rechunk.py | 2 +- miranda/io/utils.py | 2 +- miranda/preprocess/_eccc_homogenized.py | 347 +++++++++--------- .../configs}/ahccd_gen2_precipitation.csv | 0 .../configs}/ahccd_gen3_temperature.csv | 0 .../configs/eccc-homogenized_attrs.json | 83 ++++- 7 files changed, 259 insertions(+), 185 deletions(-) rename miranda/{eccc/data => preprocess/configs}/ahccd_gen2_precipitation.csv (100%) rename miranda/{eccc/data => preprocess/configs}/ahccd_gen3_temperature.csv (100%) diff --git a/miranda/convert/corrections.py b/miranda/convert/corrections.py index 259d2b1b..8a5ef0ee 100644 --- a/miranda/convert/corrections.py +++ b/miranda/convert/corrections.py @@ -9,23 +9,21 @@ import xarray as xr -from miranda.convert import ( - dimensions_compliance, - metadata_conversion, - threshold_mask, - variable_conversion, -) from miranda.convert._data_definitions import 
load_json_data_mappings from miranda.convert._treatments import ( cf_units_conversion, clip_values, conservative_regrid, correct_unit_names, + dimensions_compliance, ensure_correct_time_frequency, invert_value_sign, + metadata_conversion, offset_time_dimension, preprocessing_corrections, + threshold_mask, transform_values, + variable_conversion, ) from miranda.convert.utils import find_version_hash from miranda.gis import subset_domain diff --git a/miranda/io/_rechunk.py b/miranda/io/_rechunk.py index 7c55fee4..6c25d432 100644 --- a/miranda/io/_rechunk.py +++ b/miranda/io/_rechunk.py @@ -26,7 +26,7 @@ "translate_time_chunk", ] -_data_folder = Path(__file__).parent / "configs" +_data_folder = Path(__file__).parent / "data" chunk_configurations = json.load(open(_data_folder / "ouranos_chunk_config.json")) diff --git a/miranda/io/utils.py b/miranda/io/utils.py index 42f7b5e7..33da8643 100644 --- a/miranda/io/utils.py +++ b/miranda/io/utils.py @@ -28,7 +28,7 @@ "sort_variables", ] -_data_folder = Path(__file__).parent / "configs" +_data_folder = Path(__file__).parent / "data" name_configurations = json.load(open(_data_folder / "ouranos_name_config.json")) diff --git a/miranda/preprocess/_eccc_homogenized.py b/miranda/preprocess/_eccc_homogenized.py index 4b4e6706..01530156 100644 --- a/miranda/preprocess/_eccc_homogenized.py +++ b/miranda/preprocess/_eccc_homogenized.py @@ -8,7 +8,6 @@ import numpy as np import pandas as pd import xarray as xr -from dask.diagnostics import ProgressBar from miranda.preprocess._data_definitions import load_json_data_mappings from miranda.preprocess._treatments import basic_metadata_conversion @@ -20,13 +19,15 @@ __all__ = ["convert_ahccd", "convert_ahccd_fwf_files"] -def _ahccd_metadata( +def _ahccd_variable_metadata( + variable_code: str, gen: int, ) -> (dict[str, int | float | str], dict, list[tuple[int, int]], int): """ Parameters ---------- + variable_code: {"dm", "dn", "dr", "ds", "dt", "dx"} gen: {1, 2, 3} Returns @@ -39,8 +40,19 
@@ def _ahccd_metadata( config = load_json_data_mappings("eccc-homogenized") metadata = basic_metadata_conversion("eccc-homogenized", config) - header = metadata["Header"] + variable_meta = metadata["variables"].get(variable_code) + if not variable_meta: + raise NotImplementedError(f"Variable `{variable_code}` not supported.") + + variable_name = variable_meta.get("_variable_name") + if variable_name: + variable_meta = {variable_name: variable_meta} + del variable_meta[variable_name]["_variable_name"] + else: + variable_meta = {variable_code: variable_meta} + + header = metadata["Header"] # Conditional handling of global attributes based on generation for field in [f for f in header if f.startswith("_")]: if isinstance(header[field], dict): @@ -55,16 +67,21 @@ def _ahccd_metadata( header[field[1:]] = value del header[field] - return header + return variable_meta, header + +def _ahccd_station_metadata(code): + pass -def _column_definitions( - variable_code: str, metadata: dict + +def _ahccd_column_definitions( + variable_code: str, ) -> tuple[dict, list[tuple[int, int]], int]: - variable = metadata[variable_code] - variable["missing_flags"] = "M" - if variable["variable"].startswith("tas"): - variable["NaN_value"] = -9999.9 + config = load_json_data_mappings("eccc-homogenized") + metadata = basic_metadata_conversion("eccc-homogenized", config) + + variable = metadata["variables"][variable_code]["_variable_name"] + if variable.startswith("tas"): column_names = [ "No", "StnId", @@ -90,8 +107,7 @@ def _column_definitions( ii += 1 header_row = 3 - elif variable["variable"].startswith("pr"): - variable["NaN_value"] = -9999.99 + elif variable.startswith("pr"): column_names = [ "Prov", "Station name", @@ -125,146 +141,12 @@ def _column_definitions( return column_names, column_spaces, header_row -def convert_ahccd( - data_source: str | Path, - output_dir: str | Path, - variable: str, - generation: int | None = None, -) -> None: - """Convert Adjusted and Homogenized Canadian 
Climate Dataset files. - - Parameters - ---------- - data_source: str or Path - output_dir: str or Path - variable: str - generation: int, optional - - Returns - ------- - None - """ - output_dir = Path(output_dir).resolve().joinpath(variable) - output_dir.mkdir(parents=True, exist_ok=True) - - code = dict(tasmax="dx", tasmin="dn", tas="dm", pr="dt", prsn="ds", prlp="dr").get( - variable - ) - - attrs = _ahccd_metadata(generation) - var, col_names, col_spaces, header_row, global_attrs = cf_ahccd_metadata( - code, generation - ) - gen = {2: "Second", 3: "Third"}.get(generation) - if generation == 3 and code in {"dx", "dn", "dm"}: - meta = "ahccd_gen3_temperature.csv" - elif generation == 2 and code in {"dt", "ds", "dr"}: - meta = "ahccd_gen2_precipitation.csv" - - else: - raise NotImplementedError(f"Code '{code} for generation {gen}.") - metadata_source = Path(__file__).resolve().parent.joinpath("configs").joinpath(meta) - - if "tas" in variable: - metadata = pd.read_csv(metadata_source, header=2) - metadata.columns = col_names.keys() - cols_specs = col_spaces - - elif "pr" in variable: - metadata = pd.read_csv(metadata_source, header=3) - metadata.columns = col_names.keys() - cols_specs = col_spaces - for index, row in metadata.iterrows(): - if isinstance(row["stnid"], str): - metadata.loc[index, "stnid"] = metadata.loc[index, "stnid"].replace( - " ", "" - ) - else: - raise KeyError(f"{variable} does not include 'pr' or 'tas'.") - - # Convert station .txt files to netcdf - for ff in Path(data_source).glob("*d*.txt"): - outfile = output_dir.joinpath(ff.name.replace(".txt", ".nc")) - if not outfile.exists(): - logger.info(ff.name) - - stid = ff.name.replace(code, "").split(".txt")[0] - try: - metadata_st = metadata[metadata["stnid"] == int(stid)] - except ValueError: - metadata_st = metadata[metadata["stnid"] == stid] - - if len(metadata_st) == 1: - ds_out = convert_ahccd_fwf_files( - ff, metadata_st, variable, generation, cols_specs, var - ) - ds_out.attrs = 
global_attrs - - ds_out.to_netcdf(outfile, engine="h5netcdf") - else: - logger.warning( - f"metadata info for station {ff.name} not found : skipping" - ) - - # merge individual stations to single .nc file - # variable - ncfiles = list(output_dir.glob("*.nc")) - outfile = output_dir.parent.joinpath( - "merged_stations", f"ahccd_gen{generation}_{variable}.nc" - ) - - if not outfile.exists(): - logger.info("merging stations :", variable) - with ProgressBar(): - ds_ahccd = xr.open_mfdataset( - ncfiles, concat_dim="station", combine="nested" - ).load() - - for coord in ds_ahccd.coords: - # xarray object datatypes mix string and int (e.g. stnid) convert to string for merged nc files - # Do not apply to datetime object - if coord != "time" and ds_ahccd[coord].dtype == "O": - ds_ahccd[coord] = ds_ahccd[coord].astype(str) - - for v in ds_ahccd.data_vars: - # xarray object datatypes mix string and int (e.g. stnid) convert to string for merged nc files - # Do not apply to flag timeseries - if ds_ahccd[v].dtype == "O" and "flag" not in v: - logger.info(v) - ds_ahccd[v] = ds_ahccd[v].astype(str) - - ds_ahccd[f"{variable}_flag"].attrs[ - "long_name" - ] = f"{ds_ahccd[f'{variable}'].attrs['long_name']} flag" - ds_ahccd.lon.attrs["units"] = "degrees_east" - ds_ahccd.lon.attrs["long_name"] = "longitude" - ds_ahccd.lat.attrs["units"] = "degrees_north" - ds_ahccd.lat.attrs["long_name"] = "latitude" - - for clean_name, orig_name in col_names.items(): - if clean_name in ["lat", "long"]: - continue - ds_ahccd[clean_name].attrs["long_name"] = orig_name - - outfile.parent.mkdir(parents=True, exist_ok=True) - ds_ahccd.to_netcdf( - outfile, engine="h5netcdf", format="NETCDF4_CLASSIC", mode="w" - ) - - del ds_ahccd - for nc in outfile.parent.glob("*.nc"): - logger.info(nc) - ds = xr.open_dataset(nc) - logger.info(ds) - - def convert_ahccd_fwf_files( ff: Path | str, metadata: pd.DataFrame, variable: str, - generation: int = None, - cols_specs: list[tuple[int, int]] | None = None, - attrs: 
dict | None = None, + *, + generation: int, ) -> xr.Dataset: """Convert AHCCD fixed-width files. @@ -273,9 +155,7 @@ def convert_ahccd_fwf_files( ff: str or Path metadata: pandas.DataFrame variable: str - generation - cols_specs - attrs + generation: int Returns ------- @@ -285,26 +165,25 @@ def convert_ahccd_fwf_files( variable ) - if attrs is None: - attrs = _ahccd_metadata(generation) - if cols_specs is None: - _, _, cols_specs, _, _ = cf_ahccd_metadata(code, generation) - _, _, _, nhead, _ = cf_ahccd_metadata(code, generation) + variable_meta, global_attrs = _ahccd_variable_metadata(code, generation) + col_names, cols_specs, header = _ahccd_column_definitions(code) - df = pd.read_fwf(ff, header=nhead, colspecs=cols_specs) + df = pd.read_fwf(ff, header=header, colspecs=cols_specs) if "pr" in variable: cols = list(df.columns[0:3]) cols = cols[0::2] cols.extend(list(df.columns[4::2])) flags = list(df.columns[5::2]) dfflags = df[flags] - else: + elif "tas" in variable: cols = [c for c in df.columns if "Unnamed" not in c] flags = [c for c in df.columns if "Unnamed" in c] dfflags = df[flags[2:]] + else: + raise NotImplementedError(f"Variable `{variable}` not supported.") df = df[cols] - df.replace(attrs["NaN_value"], np.NaN, inplace=True) + df.replace(variable_meta[variable]["NaN_value"], np.NaN, inplace=True) for i, j in enumerate(["Year", "Month"]): df = df.rename(columns={df.columns[i]: j}) @@ -316,17 +195,19 @@ def convert_ahccd_fwf_files( index = pd.MultiIndex.from_arrays([df["Year"], df["Month"]]) df.index = index - dfflags.index = index cols = [c for c in df.columns if "Year" not in c and "Month" not in c] df = df[cols] df.columns = np.arange(1, 32) - dfflags.columns = np.arange(1, 32) ds = df.stack().to_frame() ds = ds.rename(columns={0: variable}) + ds.index.names = ["Year", "Month", "Day"] + + dfflags.index = index + dfflags.columns = np.arange(1, 32) ds_flag = dfflags.stack().to_frame() ds_flag = ds_flag.rename(columns={0: "flag"}) - ds.index.names = 
["Year", "Month", "Day"] ds_flag.index.names = ["Year", "Month", "Day"] + ds[f"{variable}_flag"] = ds_flag["flag"] del ds_flag @@ -355,15 +236,12 @@ def convert_ahccd_fwf_files( ) ds.index = pd.to_datetime(time_ds) - ds = ds.to_xarray().rename({"index": "time"}) - ds_out = xr.Dataset(coords={"time": time1}) for v in ds.data_vars: ds_out[v] = ds[v] - ds_out[variable].attrs = attrs - # ds_out + ds_out[variable].attrs = variable_meta[variable] metadata = metadata.to_xarray().rename({"index": "station"}).drop_vars("station") metadata = metadata.assign_coords( { @@ -371,17 +249,19 @@ def convert_ahccd_fwf_files( "station_name": metadata["station_name"], } ) - # ds_out = ds_out.assign_coords({'lon': metadata['long'], 'lat': metadata['lat'], 'elevation': metadata['elev']}) - # ds_out = ds_out.assign_coords(station=metadata.stnid) metadata = metadata.drop_vars(["stnid", "station_name"]) + ds_out[f"{variable}_flag"].attrs["long_name"] = variable_meta[variable]["long_name"] ds_out["lon"] = metadata["long"] - ds_out["lon"].attrs["units"] = "degrees_east" + ds_out.lon.attrs["units"] = "degrees_east" + ds_out.lon.attrs["axis"] = "X" ds_out["lat"] = metadata["lat"] - ds_out["lat"].attrs["units"] = "degrees_north" + ds_out.lat.attrs["units"] = "degrees_north" + ds_out.lat.attrs["axis"] = "Y" ds_out["elev"] = metadata["elev"] - ds_out["elev"].attrs["units"] = "m" + ds_out.elev.attrs["units"] = "m" + ds_out.elev.attrs["axis"] = "Z" metadata = metadata.drop_vars(["long", "lat", "elev"]) for vv in metadata.data_vars: @@ -390,3 +270,128 @@ def convert_ahccd_fwf_files( else: ds_out[vv] = metadata[vv] return ds_out + + +def convert_ahccd( + data_source: str | Path, + output_dir: str | Path, + variable: str, + generation: int, +) -> None: + """Convert Adjusted and Homogenized Canadian Climate Dataset files. 
+ + Parameters + ---------- + data_source: str or Path + output_dir: str or Path + variable: str + generation: int + + Returns + ------- + None + """ + output_dir = Path(output_dir).resolve().joinpath(variable) + output_dir.mkdir(parents=True, exist_ok=True) + + code = dict(tasmax="dx", tasmin="dn", tas="dm", pr="dt", prsn="ds", prlp="dr").get( + variable + ) + + var_meta, global_attrs = _ahccd_variable_metadata(code, generation) + + ( + col_names, + col_spaces, + header_row, + ) = _ahccd_column_definitions(code) + + gen = {2: "Second", 3: "Third"}.get(generation) + if generation == 3 and code in {"dx", "dn", "dm"}: + station_meta = "ahccd_gen3_temperature.csv" + elif generation == 2 and code in {"dt", "ds", "dr"}: + station_meta = "ahccd_gen2_precipitation.csv" + + else: + raise NotImplementedError(f"Code '{code}' for generation {gen}.") + metadata_source = ( + Path(__file__).resolve().parent.joinpath("configs").joinpath(station_meta) + ) + + if "tas" in variable: + metadata = pd.read_csv(metadata_source, header=2) + metadata.columns = col_names.keys() + + elif "pr" in variable: + metadata = pd.read_csv(metadata_source, header=3) + metadata.columns = col_names.keys() + for index, row in metadata.iterrows(): + if isinstance(row["stnid"], str): + metadata.loc[index, "stnid"] = metadata.loc[index, "stnid"].replace( + " ", "" + ) + else: + raise KeyError(f"{variable} does not include 'pr' or 'tas'.") + + # Convert station .txt files to netcdf + for ff in Path(data_source).glob(f"{code}*.txt"): + outfile = output_dir.joinpath(ff.name.replace(".txt", ".nc")) + if not outfile.exists(): + logger.info(ff.name) + + station_id = ff.name[2:].split(".txt")[0] + try: + metadata_st = metadata[metadata["stnid"] == int(station_id)] + except ValueError: + metadata_st = metadata[metadata["stnid"] == station_id] + + if len(metadata_st) == 1: + ds_out = convert_ahccd_fwf_files( + ff, metadata_st, variable, generation=generation + ) + ds_out.attrs = global_attrs + + 
ds_out.to_netcdf(outfile, engine="h5netcdf") + else: + logger.warning( + f"metadata info for station {ff.name} not found : skipping" + ) + + # merge individual stations to single .nc file + # variable + ncfiles = list(output_dir.glob("*.nc")) + outfile = output_dir.parent.joinpath( + "merged_stations", f"ahccd_gen{generation}_{variable}.nc" + ) + + if not outfile.exists(): + logger.info("merging stations :", variable) + ds_ahccd = xr.open_mfdataset( + ncfiles, concat_dim="station", combine="nested" + ).load() + + for coord in ds_ahccd.coords: + # xarray object datatypes mix string and int (e.g. stnid) convert to string for merged nc files + # Do not apply to datetime object + if coord != "time" and ds_ahccd[coord].dtype == "O": + ds_ahccd[coord] = ds_ahccd[coord].astype(str) + + for v in ds_ahccd.data_vars: + # xarray object datatypes mix string and int (e.g. stnid) convert to string for merged nc files + # Do not apply to flag timeseries + if ds_ahccd[v].dtype == "O" and "flag" not in v: + logger.info(v) + ds_ahccd[v] = ds_ahccd[v].astype(str) + + # for clean_name, orig_name in col_names.items(): + # if clean_name in ["lat", "long"]: + # continue + # ds_ahccd[clean_name].attrs["long_name"] = orig_name + + outfile.parent.mkdir(parents=True, exist_ok=True) + ds_ahccd.to_netcdf(outfile, engine="h5netcdf", mode="w") + del ds_ahccd + for nc in outfile.parent.glob("*.nc"): + logger.info(nc) + ds = xr.open_dataset(nc) + logger.info(ds) diff --git a/miranda/eccc/data/ahccd_gen2_precipitation.csv b/miranda/preprocess/configs/ahccd_gen2_precipitation.csv similarity index 100% rename from miranda/eccc/data/ahccd_gen2_precipitation.csv rename to miranda/preprocess/configs/ahccd_gen2_precipitation.csv diff --git a/miranda/eccc/data/ahccd_gen3_temperature.csv b/miranda/preprocess/configs/ahccd_gen3_temperature.csv similarity index 100% rename from miranda/eccc/data/ahccd_gen3_temperature.csv rename to miranda/preprocess/configs/ahccd_gen3_temperature.csv diff --git 
a/miranda/preprocess/configs/eccc-homogenized_attrs.json b/miranda/preprocess/configs/eccc-homogenized_attrs.json index 539ad40c..b1861502 100644 --- a/miranda/preprocess/configs/eccc-homogenized_attrs.json +++ b/miranda/preprocess/configs/eccc-homogenized_attrs.json @@ -6,12 +6,7 @@ "Third": "Vincent, L.A., M.M. Hartwell and X.L. Wang, 2020: A Third Generation of Homogenized Temperature for Trend Analysis and Monitoring Changes in Canada’s Climate. Atmosphere-Ocean. https://doi.org/10.1080/07055900.2020.1765728" } }, - "_converter": true, "_miranda_version": true, - "_missing_values": [ - "-999", - "1e20" - ], "_product": { "generation": { "Second": "ECCC Adjusted and Homogenized Canadian Climate Data (AHCCD) version 2", @@ -21,6 +16,7 @@ "acknowledgement": "This data is provided by Environment and Climate Change Canada (ECCC).", "author": "Environment and Climate Change Canada (ECCC)", "contact": "info.cccs-ccsc@canada.ca", + "dataset_id": "d6813de6-b20a-46cc-8990-01862ae15c5f", "documentation": "https://www.canada.ca/en/environment-climate-change/services/climate-change/canadian-centre-climate-services/display-download/technical-documentation-adjusted-climate-data.html", "institution": "GovCan", "license": "https://climate.weather.gc.ca/prods_servs/attachment1_e.html", @@ -30,7 +26,82 @@ "processing_level": "adjusted", "realm": "atmos", "source": "AHCCD", - "table_date": "2023-03-23", + "table_date": "2023-08-03", "table_id": "ECCC" + }, + "variables": { + "dm": { + "NaN_value": -9999.9, + "_variable_name": "tas", + "cell_methods": "time: mean", + "comments": "Station data converted from Mean Temp (°C)", + "frequency": "day", + "grid_mapping": "regular_lon_lat", + "long_name": "Near-Surface Air Temperature", + "missing_flags": "M", + "original_field": "Mean Temp (°C)", + "units": "degC" + }, + "dn": { + "NaN_value": -9999.9, + "_variable_name": "tasmin", + "cell_methods": "time: minimum", + "comments": "Station data converted from Min Temp (°C)", + "frequency": 
"day", + "grid_mapping": "regular_lon_lat", + "long_name": "Daily Minimum Near-Surface Air Temperature", + "missing_flags": "M", + "original_field": "Min Temp (°C)", + "units": "degC" + }, + "dr": { + "NaN_value": -9999.99, + "_variable_name": "prlp", + "cell_methods": "time: mean", + "comments": "Station data converted from Total Rain (mm) using a density of 1000 kg/m³", + "frequency": "day", + "grid_mapping": "regular_lon_lat", + "long_name": "Liquid Precipitation", + "missing_flags": "M", + "original_field": "Total Rain (mm)", + "units": "mm" + }, + "ds": { + "NaN_value": -9999.99, + "_variable_name": "prsn", + "cell_methods": "time: mean", + "comments": "station data converted from Total Snow (cm) using a density of 100 kg/m³", + "frequency": "day", + "grid_mapping": "regular_lon_lat", + "long_name": "Snowfall Flux", + "missing_flags": "M", + "original_field": "Total Snow (cm)", + "units": "mm" + }, + "dt": { + "NaN_value": -9999.99, + "_variable_name": "pr", + "cell_methods": "time: mean", + "comments": "Station data converted from Total Precip (mm) using a density of 1000 kg/m³", + "frequency": "day", + "grid_mapping": "regular_lon_lat", + "long_name": "Precipitation", + "missing_flags": "M", + "original_field": "Total Precip (mm)", + "units": "mm" + }, + "dx": { + "NaN_value": -9999.9, + "_variable_name": "tasmax", + "cell_methods": "time: maximum", + "comments": "station data converted from Max Temp (°C)", + "frequency": "day", + "grid_mapping": "regular_lon_lat", + "long_name": "Daily Maximum Near-Surface Air Temperature", + "missing_flags": "M", + "original_field": "Max Temp (°C)", + "standard_name": "air_temperature", + "units": "degC" + } } } From 53fc8f01357c69d8d6318cc51699d399af54bcef Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Mon, 7 Aug 2023 10:14:23 -0400 Subject: [PATCH 09/33] working version of ahccd conversion --- miranda/preprocess/_eccc_homogenized.py | 37 ++- 
miranda/preprocess/_treatments.py | 1 - .../configs/ahccd_gen2_precipitation.csv | 302 +++++++++--------- .../configs/eccc-homogenized_attrs.json | 5 +- 4 files changed, 175 insertions(+), 170 deletions(-) diff --git a/miranda/preprocess/_eccc_homogenized.py b/miranda/preprocess/_eccc_homogenized.py index 01530156..f400b5b1 100644 --- a/miranda/preprocess/_eccc_homogenized.py +++ b/miranda/preprocess/_eccc_homogenized.py @@ -16,7 +16,7 @@ logging.config.dictConfig(LOGGING_CONFIG) logger = logging.Logger("miranda") -__all__ = ["convert_ahccd", "convert_ahccd_fwf_files"] +__all__ = ["convert_ahccd", "convert_ahccd_fwf_file"] def _ahccd_variable_metadata( @@ -55,16 +55,22 @@ def _ahccd_variable_metadata( header = metadata["Header"] # Conditional handling of global attributes based on generation for field in [f for f in header if f.startswith("_")]: - if isinstance(header[field], dict): + if isinstance(header[field], bool): + if header[field] and field[1:] == "variable": + header[field[1:]] = variable_name + + elif isinstance(header[field], dict): attr_treatment = header[field]["generation"] + if field in ["_citation", "_product"]: + for attribute, value in attr_treatment.items(): + if attribute == generation: + header[field[1:]] = value + else: raise AttributeError( f"Attribute treatment configuration for field `{field}` is not properly configured. Verify JSON." 
) - if field in ["_citation" "_product"]: - for attribute, value in attr_treatment.items(): - if attribute == generation: - header[field[1:]] = value + del header[field] return variable_meta, header @@ -141,7 +147,7 @@ def _ahccd_column_definitions( return column_names, column_spaces, header_row -def convert_ahccd_fwf_files( +def convert_ahccd_fwf_file( ff: Path | str, metadata: pd.DataFrame, variable: str, @@ -235,7 +241,7 @@ def convert_ahccd_fwf_files( } ) - ds.index = pd.to_datetime(time_ds) + ds.index = pd.to_datetime(time_ds) # noqa ds = ds.to_xarray().rename({"index": "time"}) ds_out = xr.Dataset(coords={"time": time1}) for v in ds.data_vars: @@ -277,6 +283,7 @@ def convert_ahccd( output_dir: str | Path, variable: str, generation: int, + merge: bool = False, ) -> None: """Convert Adjusted and Homogenized Canadian Climate Dataset files. @@ -286,6 +293,7 @@ def convert_ahccd( output_dir: str or Path variable: str generation: int + merge: bool Returns ------- @@ -299,7 +307,6 @@ def convert_ahccd( ) var_meta, global_attrs = _ahccd_variable_metadata(code, generation) - ( col_names, col_spaces, @@ -346,7 +353,7 @@ def convert_ahccd( metadata_st = metadata[metadata["stnid"] == station_id] if len(metadata_st) == 1: - ds_out = convert_ahccd_fwf_files( + ds_out = convert_ahccd_fwf_file( ff, metadata_st, variable, generation=generation ) ds_out.attrs = global_attrs @@ -356,10 +363,11 @@ def convert_ahccd( logger.warning( f"metadata info for station {ff.name} not found : skipping" ) + if not merge: + return # merge individual stations to single .nc file - # variable - ncfiles = list(output_dir.glob("*.nc")) + ncfiles = list(output_dir.glob(f"{code}*.nc")) outfile = output_dir.parent.joinpath( "merged_stations", f"ahccd_gen{generation}_{variable}.nc" ) @@ -383,11 +391,6 @@ def convert_ahccd( logger.info(v) ds_ahccd[v] = ds_ahccd[v].astype(str) - # for clean_name, orig_name in col_names.items(): - # if clean_name in ["lat", "long"]: - # continue - # 
ds_ahccd[clean_name].attrs["long_name"] = orig_name - outfile.parent.mkdir(parents=True, exist_ok=True) ds_ahccd.to_netcdf(outfile, engine="h5netcdf", mode="w") del ds_ahccd diff --git a/miranda/preprocess/_treatments.py b/miranda/preprocess/_treatments.py index 80d3fd45..9e667440 100644 --- a/miranda/preprocess/_treatments.py +++ b/miranda/preprocess/_treatments.py @@ -22,7 +22,6 @@ def basic_metadata_conversion( ------- xarray.Dataset """ - logging.info("Converting metadata.") header = metadata["Header"] # Static handling of version global attributes diff --git a/miranda/preprocess/configs/ahccd_gen2_precipitation.csv b/miranda/preprocess/configs/ahccd_gen2_precipitation.csv index ce59df01..6f0c0f3a 100644 --- a/miranda/preprocess/configs/ahccd_gen2_precipitation.csv +++ b/miranda/preprocess/configs/ahccd_gen2_precipitation.csv @@ -8,23 +8,23 @@ BC,ARMSTRONG HULLCAR,1160483,1912,1,1998,12,50.5,-119.216666666667,505,Yes BC,ATLIN,1200560,1906,1,2017,12,59.5666666666667,-133.7,674,No BC,BARKERVILLE,1090660,1888,1,2015,3,53.0691666666667,-121.514722222222,1283,No BC,BEAVERDELL NORTH,1130771,1926,1,2006,9,49.4783333333333,-119.047,838,Yes -BC,BELLA COOLA ,1060841,1899,1,2017,11,52.3875,-126.595833333333,36,Yes +BC,BELLA COOLA,1060841,1899,1,2017,11,52.3875,-126.595833333333,36,Yes BC,BIG CREEK,1080870,1904,1,1998,11,51.6672236111111,-123.073056944444,1175,No -BC,BLUE RIVER ,1160899,1929,1,2017,12,52.1290277777778,-119.289527777778,683,Yes +BC,BLUE RIVER,1160899,1929,1,2017,12,52.1290277777778,-119.289527777778,683,Yes BC,BRISCO,1171020,1924,1,2004,3,50.8205555555556,-116.258055555556,823,No BC,BRITANNIA BEACH FURRY CREEK,1041050,1914,1,2000,4,49.5838888888889,-123.223611111111,9,Yes BC,BURQUITLAM VANCOUVER GOLF COURSE,1101200,1926,1,2005,12,49.2516666666667,-122.876944444444,122,Yes BC,CAPE SCOTT,1031353,1921,1,2016,6,50.7822333333333,-128.427227777778,72,Yes BC,CAPE ST JAMES,1051350,1926,1,1992,8,51.9333333333333,-131.016666666667,89,No 
BC,CASSIAR,1191440,1954,1,1996,8,59.2833333333333,-129.833333333333,1078,No -BC,CELISTA,116146F ,1924,1,2004,7,50.9555555555556,-119.379444444444,515,Yes +BC,CELISTA,116146F,1924,1,2004,7,50.9555555555556,-119.379444444444,515,Yes BC,CHATHAM POINT,1021480,1932,1,2016,2,50.3331944444444,-125.445555555556,23,Yes -BC,COMOX ,1021830,1936,1,2017,12,49.7166666666667,-124.9,26,Yes +BC,COMOX,1021830,1936,1,2017,12,49.7166666666667,-124.9,26,Yes BC,CORTES ISLAND TIBER BAY,1021960,1919,1,2017,12,50.0713888888889,-124.949444444444,15,Yes -BC,CRANBROOK ,1152102,1909,1,2012,11,49.6122222222222,-115.781944444444,939,Yes +BC,CRANBROOK,1152102,1909,1,2012,11,49.6122222222222,-115.781944444444,939,Yes BC,CRESTON,1142160,1912,1,2015,6,49.0970555555556,-116.517833333333,597,No BC,DARFIELD,1162265,1914,1,2017,11,51.2973333333333,-120.182666666667,412,Yes -BC,DAWSON CREEK ,1182285,1952,1,2007,2,55.7416666666667,-120.181944444444,655,Yes +BC,DAWSON CREEK,1182285,1952,1,2007,2,55.7416666666667,-120.181944444444,655,Yes BC,DEASE LAKE,1192340,1945,1,2008,7,58.428335,-130.010556666667,807,No BC,DEER PARK,1142400,1924,1,1995,9,49.4166666666667,-118.05,485,No BC,DRYAD POINT,1062544,1933,1,2017,12,52.1850005555556,-128.112224444444,4,Yes @@ -33,12 +33,12 @@ BC,ESTEVAN POINT,1032730,1924,1,2017,12,49.3835,-126.550833333333,7,No BC,FALLS RIVER,1062790,1932,1,1992,10,53.9833333333333,-129.733333333333,18,No BC,FAUQUIER,1142820,1913,1,2015,6,49.8719444444444,-118.0675,490,No BC,FERNIE,1152850,1914,1,2017,12,49.4888888888889,-115.072222222222,1001,No -BC,FORT NELSON ,1192940,1938,1,2012,11,58.8363888888889,-122.597222222222,382,No +BC,FORT NELSON,1192940,1938,1,2012,11,58.8363888888889,-122.597222222222,382,No BC,FORT ST JAMES,1092970,1895,1,2017,12,54.4552802777778,-124.285556111111,686,No -BC,FORT ST JOHN ,1183000,1931,1,2012,12,56.2380555555556,-120.740277777778,695,Yes +BC,FORT ST JOHN,1183000,1931,1,2012,12,56.2380555555556,-120.740277777778,695,Yes BC,GERMANSEN 
LANDING,1183090,1952,1,2013,11,55.7855277777778,-124.701444444444,766,No BC,GLACIER NP ROGERS PASS,1173191,1909,1,2014,7,51.3009166666667,-117.516388888889,1323,Yes -BC,GOLDEN ,1173210,1908,1,2017,12,51.2983333333333,-116.981666666667,785,No +BC,GOLDEN,1173210,1908,1,2017,12,51.2983333333333,-116.981666666667,785,No BC,GRAND FORKS,1133270,1910,1,2008,3,49.0261666666667,-118.465666666667,532,Yes BC,GRASMERE,1153282,1896,1,1993,11,49.0833333333333,-115.066666666667,869,Yes BC,HAZELTON TEMLEHAN,1073347,1915,1,1997,4,55.2,-127.733333333333,122,Yes @@ -58,7 +58,7 @@ BC,MASSET AIRPORT,1054920,1900,1,2008,6,54.0226111111111,-132.117472222222,7,Yes BC,MCINNES ISLAND,1065010,1954,1,2017,12,52.2616666666667,-128.719444444444,26,No BC,MERRITT STP,1125079,1919,1,2017,12,50.1141677777778,-120.800834722222,609,Yes BC,MICA DAM,1175122,1962,1,2017,12,52.0530555555556,-118.585277777778,579,No -BC,NANAIMO CITY YARD,10253G0 ,1913,1,2017,12,49.1988888888889,-123.987777777778,114,Yes +BC,NANAIMO CITY YARD,10253G0,1913,1,2017,12,49.1988888888889,-123.987777777778,114,Yes BC,NASS CAMP,1075384,1924,1,2015,2,55.2375,-129.029444444444,290,Yes BC,NELSON NE,1145442,1904,1,2017,12,49.5861111111111,-117.206388888889,570,Yes BC,NEW DENVER,1145460,1924,1,2017,12,49.995835,-117.370285,570,No @@ -68,186 +68,186 @@ BC,OOTSA L SKINS L SPILLWAY,1085835,1926,1,2017,7,53.7721666666667,-125.99655555 BC,OSOYOOS WEST,1125865,1954,1,2009,9,49.0319444444444,-119.442777777778,297,Yes BC,PACHENA POINT,1035940,1925,1,2017,12,48.7227777777778,-125.097222222222,37,No BC,PEMBERTON AIRPORT,1086082,1913,1,1991,6,50.3056461111111,-122.734088888889,204,Yes -BC,PENTICTON ,1126150,1907,1,2012,5,49.4630555555556,-119.602222222222,344,Yes +BC,PENTICTON,1126150,1907,1,2012,5,49.4630555555556,-119.602222222222,344,Yes BC,PORT ALICE,1036240,1924,1,2016,4,50.3858361111111,-127.455286111111,21,No -BC,PORT HARDY ,1026270,1944,1,2013,6,50.6802777777778,-127.366111111111,22,No +BC,PORT 
HARDY,1026270,1944,1,2013,6,50.6802777777778,-127.366111111111,22,No BC,POWELL RIVER,1046390,1924,1,2007,7,49.8761111111111,-124.554166666667,52,No -BC,PRINCE GEORGE ,1096450,1913,1,2009,10,53.8908333333333,-122.678888888889,691,Yes -BC,PRINCE RUPERT ,1066481,1909,1,2006,3,54.2925,-130.444722222222,35,Yes -BC,PRINCETON ,1126510,1901,1,2017,12,49.4677777777778,-120.5125,700,Yes +BC,PRINCE GEORGE,1096450,1913,1,2009,10,53.8908333333333,-122.678888888889,691,Yes +BC,PRINCE RUPERT,1066481,1909,1,2006,3,54.2925,-130.444722222222,35,Yes +BC,PRINCETON,1126510,1901,1,2017,12,49.4677777777778,-120.5125,700,Yes BC,QUATSINO,1036570,1895,1,2017,12,50.5336138888889,-127.653335833333,8,No -BC,QUESNEL ,1096630,1900,1,2007,3,53.0261111111111,-122.51,545,Yes +BC,QUESNEL,1096630,1900,1,2007,3,53.0261111111111,-122.51,545,Yes BC,QUINSAM RIVER HATCHERY,1026639,1936,1,2017,12,50.0161111111111,-125.303888888889,46,Yes BC,REVELSTOKE,1176751,1898,1,1999,8,50.9533333333333,-118.166388888889,450,Yes BC,SAANICHTON,1016940,1914,1,2017,12,48.6216666666667,-123.418888888889,61,No -BC,SALMON RM ,1166R45 ,1911,1,2013,2,50.6855577777778,-119.233613611111,527,Yes -BC,SANDSPIT ,1057050,1949,1,2017,3,53.2538888888889,-131.813055555556,6,No +BC,SALMON RM,1166R45,1911,1,2013,2,50.6855577777778,-119.233613611111,527,Yes +BC,SANDSPIT,1057050,1949,1,2017,3,53.2538888888889,-131.813055555556,6,No BC,SEYMOUR FALLS,1107200,1928,1,2003,9,49.4402777777778,-122.971111111111,244,No BC,SHALALTH,1117215,1935,1,2004,4,50.7283333333333,-122.240555555556,244,Yes BC,SHAWNIGAN LAKE,1017230,1911,1,2017,12,48.6469472222222,-123.626408333333,138,No -BC,SMITHERS ,1077500,1922,1,2017,12,54.8247222222222,-127.182777777778,522,Yes +BC,SMITHERS,1077500,1922,1,2017,12,54.8247222222222,-127.182777777778,522,Yes BC,STAVE FALLS,1107680,1910,1,2004,8,49.2333333333333,-122.366666666667,110,No -BC,STEWART ,1067742,1911,1,2016,6,55.9361111111111,-129.985,7,Yes +BC,STEWART,1067742,1911,1,2016,6,55.9361111111111,-129.985,7,Yes 
BC,STILLWATER POWER HOUSE,1047770,1931,1,2007,7,49.7666666666667,-124.316666666667,7,No BC,TATLAYOKO LAKE,1088010,1928,1,2005,4,51.6747222222222,-124.405,870,No -BC,TERRACE ,1068130,1913,1,2013,1,54.4663888888889,-128.5775,217,Yes +BC,TERRACE,1068130,1913,1,2013,1,54.4663888888889,-128.5775,217,Yes BC,TLELL,1058190,1950,1,1999,1,53.5,-131.95,5,No -BC,TOFINO ,1038205,1942,1,2017,12,49.0822222222222,-125.772505555556,24,No +BC,TOFINO,1038205,1942,1,2017,12,49.0822222222222,-125.772505555556,24,No BC,UCLUELET KENNEDY CAMP,1038332,1958,1,2017,12,48.9452833333333,-125.527236111111,30,Yes -BC,VANCOUVER ,1108447,1896,1,2013,6,49.195,-123.181944444444,4,Yes +BC,VANCOUVER,1108447,1896,1,2013,6,49.195,-123.181944444444,4,Yes BC,VAVENBY,1168520,1913,1,2017,12,51.5761111111111,-119.778055555556,445,No BC,VERNON BELLA VISTA,1128553,1900,1,2015,6,50.2643611111111,-119.308861111111,427,Yes -BC,VICTORIA ,1018620,1899,1,2013,7,48.647225,-123.425833333333,19,Yes +BC,VICTORIA,1018620,1899,1,2013,7,48.647225,-123.425833333333,19,Yes BC,WARFIELD,1148700,1928,1,2002,12,49.1,-117.75,606,No BC,WASA,1158730,1924,1,2017,12,49.8239722222222,-115.630777777778,930,No BC,WESTWOLD,1168880,1921,1,2013,5,50.4688911111111,-119.750556388889,609,No -BC,WILLIAMS LAKE ,1098940,1936,1,2012,12,52.1830555555556,-122.054166666667,940,Yes +BC,WILLIAMS LAKE,1098940,1936,1,2012,12,52.1830555555556,-122.054166666667,940,Yes NU,ALERT,2400306,1950,1,2017,12,82.5,-62.3333333333333,65,Yes -NU,BAKER LAKE ,2300500,1949,1,2013,11,64.2988888888889,-96.0777777777778,18,No -YK,BURWASH ,2100182,1967,1,2015,2,61.3666666666667,-139.05,807,No -NU,BYRON BAY ,2400595,1957,1,1993,6,68.75,-109.066666666667,92,No -NU,CAMBRIDGE BAY ,2400600,1940,1,2015,2,69.1080555555556,-105.138333333333,27,No -NU,CAPE DORSET ,2400635,1932,1,2014,11,64.2302777777778,-76.525,50,Yes -NU,CAPE DYER ,2400654,1960,1,1993,3,66.5833333333333,-61.6166666666667,393,No +NU,BAKER LAKE,2300500,1949,1,2013,11,64.2988888888889,-96.0777777777778,18,No 
+YK,BURWASH,2100182,1967,1,2015,2,61.3666666666667,-139.05,807,No +NU,BYRON BAY,2400595,1957,1,1993,6,68.75,-109.066666666667,92,No +NU,CAMBRIDGE BAY,2400600,1940,1,2015,2,69.1080555555556,-105.138333333333,27,No +NU,CAPE DORSET,2400635,1932,1,2014,11,64.2302777777778,-76.525,50,Yes +NU,CAPE DYER,2400654,1960,1,1993,3,66.5833333333333,-61.6166666666667,393,No NU,CAPE HOOPER,2400660,1958,1,2007,9,68.4725,-66.8152777777778,390,No -NT,CAPE PARRY ,2200675,1960,1,1993,3,70.1666666666667,-124.716666666667,87,No +NT,CAPE PARRY,2200675,1960,1,1993,3,70.1666666666667,-124.716666666667,87,No YK,CARMACKS,2100300,1964,1,2008,2,62.1,-136.3,525,No -NU,CHESTERFIELD INLET ,2300707,1931,1,2014,11,63.3469444444444,-90.7311111111111,10,Yes +NU,CHESTERFIELD INLET,2300707,1931,1,2014,11,63.3469444444444,-90.7311111111111,10,Yes NU,CLINTON POINT,2300750,1957,1,1993,6,69.5833333333333,-120.8,101,No -NU,CLYDE ,2400800,1946,1,2002,6,70.4861111111111,-68.5166666666667,27,No -NU,CORAL HARBOUR ,2301000,1945,1,2015,5,64.1933333333333,-83.3594444444445,64,No -YK,DAWSON ,2100402,1901,1,2015,2,64.0430555555556,-139.127777777778,370,Yes +NU,CLYDE,2400800,1946,1,2002,6,70.4861111111111,-68.5166666666667,27,No +NU,CORAL HARBOUR,2301000,1945,1,2015,5,64.1933333333333,-83.3594444444445,64,No +YK,DAWSON,2100402,1901,1,2015,2,64.0430555555556,-139.127777777778,370,Yes NU,DEWAR LAKES,2401030,1958,1,1993,3,68.65,-71.1666666666667,527,No YK,DRURY CREEK,2100460,1970,1,2009,4,62.2019444444444,-134.39,609,No NU,EUREKA,2401200,1948,1,2016,2,79.9833333333333,-85.9333333333333,10,No -NT,FORT GOOD HOPE ,2201400,1945,1,2014,11,66.2408333333333,-128.650833333333,82,No -NT,FORT MCPHERSON ,2201601,1932,1,2014,11,67.4077777777778,-134.860277777778,35,Yes +NT,FORT GOOD HOPE,2201400,1945,1,2014,11,66.2408333333333,-128.650833333333,82,No +NT,FORT MCPHERSON,2201601,1932,1,2014,11,67.4077777777778,-134.860277777778,35,Yes NT,FORT RELIANCE,2201903,1949,1,2007,8,62.7113888888889,-109.168333333333,168,Yes -NT,FORT RESOLUTION 
,2202000,1931,1,2014,11,61.1808333333333,-113.689722222222,160,No -NT,FORT SIMPSON ,2202101,1898,1,2014,10,61.7602777777778,-121.236666666667,169,Yes -NT,FORT SMITH ,2202200,1915,1,2014,11,60.0202777777778,-111.961944444444,205,Yes +NT,FORT RESOLUTION,2202000,1931,1,2014,11,61.1808333333333,-113.689722222222,160,No +NT,FORT SIMPSON,2202101,1898,1,2014,10,61.7602777777778,-121.236666666667,169,Yes +NT,FORT SMITH,2202200,1915,1,2014,11,60.0202777777778,-111.961944444444,205,Yes NU,FOX FIVE,2400570,1959,1,2007,9,67.5355555555556,-63.7888888888889,584,No -NU,GLADMAN POINT ,2402340,1957,1,1992,7,68.6666666666667,-97.8,14,No +NU,GLADMAN POINT,2402340,1957,1,1992,7,68.6666666666667,-97.8,14,No YK,HAINES JUNCTION,2100631,1945,1,2008,9,60.7495444444445,-137.50525,596,Yes -NU,HALL BEACH ,2402350,1957,1,2014,12,68.7758333333333,-81.2425,8,No -NT,HAY RIVER ,2202400,1909,1,2014,9,60.8397222222222,-115.782777777778,166,Yes +NU,HALL BEACH,2402350,1957,1,2014,12,68.7758333333333,-81.2425,8,No +NT,HAY RIVER,2202400,1909,1,2014,9,60.8397222222222,-115.782777777778,166,Yes NT,INUVIK,2202578,1957,1,2007,11,68.3166666666667,-133.516666666667,103,Yes NU,IQALUIT,2402592,1946,1,2007,11,63.7472222222222,-68.5444444444445,34,Yes -NU,JENNY LIND ISLAND ,2302650,1958,1,1992,7,68.65,-101.733333333333,18,No -YK,KOMAKUK BEACH ,2100685,1959,1,1993,6,69.5833333333333,-140.183333333333,7,No -NU,KUGAARUK ,2303092,1957,1,2012,8,68.5405555555556,-89.7972222222222,17,Yes -NU,KUGLUKTUK ,2300902,1931,1,2014,12,67.8166666666667,-115.143888888889,23,Yes -NU,LADY FRANKLIN POINT ,2302680,1958,1,1993,3,68.5,-113.216666666667,16,No +NU,JENNY LIND ISLAND,2302650,1958,1,1992,7,68.65,-101.733333333333,18,No +YK,KOMAKUK BEACH,2100685,1959,1,1993,6,69.5833333333333,-140.183333333333,7,No +NU,KUGAARUK,2303092,1957,1,2012,8,68.5405555555556,-89.7972222222222,17,Yes +NU,KUGLUKTUK,2300902,1931,1,2014,12,67.8166666666667,-115.143888888889,23,Yes +NU,LADY FRANKLIN POINT,2302680,1958,1,1993,3,68.5,-113.216666666667,16,No 
NU,LONGSTAFF BLUFF,2402684,1958,1,1991,6,68.8986111111111,-75.1408333333333,161,No -NU,LUPIN,230N002 ,1959,1,2007,7,65.7552916666667,-111.245841666667,488,Yes +NU,LUPIN,230N002,1959,1,2007,7,65.7552916666667,-111.245841666667,488,Yes NU,MACKAR INLET,2402686,1958,1,1992,5,68.3,-85.6666666666667,395,No -YK,MAYO ,2100700,1925,1,2013,11,63.6166666666667,-135.866666666667,504,No -NT,MOULD BAY,250M001 ,1948,1,2007,11,76.2375166666667,-119.347233333333,2,Yes -NU,NANISIVIK ,2402730,1938,1,2010,12,72.9833333333333,-84.6166666666667,642,Yes +YK,MAYO,2100700,1925,1,2013,11,63.6166666666667,-135.866666666667,504,No +NT,MOULD BAY,250M001,1948,1,2007,11,76.2375166666667,-119.347233333333,2,Yes +NU,NANISIVIK,2402730,1938,1,2010,12,72.9833333333333,-84.6166666666667,642,Yes NT,NICHOLSON PENINSULA,2202750,1958,1,1993,6,69.9333333333333,-128.966666666667,89,No -NT,NORMAN WELLS ,2202800,1943,1,2012,10,65.2825,-126.800277777778,73,No -YK,OLD CROW ,2100800,1952,1,2015,2,67.5705555555556,-139.839166666667,251,No +NT,NORMAN WELLS,2202800,1943,1,2012,10,65.2825,-126.800277777778,73,No +YK,OLD CROW,2100800,1952,1,2015,2,67.5705555555556,-139.839166666667,251,No YK,PELLY RANCH,2100880,1952,1,2015,3,62.8166666666667,-137.366666666667,454,No NU,RESOLUTE CARS,2403500,1948,1,2014,11,74.7169444444445,-94.9694444444445,67,No YK,ROSS RIVER YTG,2100941,1967,1,2008,2,61.9833333333333,-132.45,698,Yes -NT,SACHS HARBOUR ,2503650,1956,1,2013,2,72,-125.266666666667,86,No -NU,SHEPHERD BAY ,2303685,1957,1,1993,3,68.8166666666667,-93.4333333333333,43,No -YK,SHINGLE POINT ,2100950,1957,1,1993,3,68.95,-137.216666666667,49,No +NT,SACHS HARBOUR,2503650,1956,1,2013,2,72,-125.266666666667,86,No +NU,SHEPHERD BAY,2303685,1957,1,1993,3,68.8166666666667,-93.4333333333333,43,No +YK,SHINGLE POINT,2100950,1957,1,1993,3,68.95,-137.216666666667,49,No YK,SWIFT RIVER,2101081,1967,1,2008,2,60,-131.183333333333,891,No -YK,TESLIN ,2101100,1944,1,2013,12,60.1741388888889,-132.735888888889,705,No 
+YK,TESLIN,2101100,1944,1,2013,12,60.1741388888889,-132.735888888889,705,No YK,TUCHITUA,2101135,1967,1,2014,9,60.9333333333333,-129.216666666667,724,No NT,TUKTOYAKTUK,2203910,1957,1,1993,6,69.45,-133,18,No -NT,TULITA ,2201700,1904,1,2014,12,64.9086111111111,-125.568333333333,101,No -NT,ULUKHAKTOK ,2502501,1941,1,2010,6,70.7627777777778,-117.806111111111,36,Yes -YK,WATSON LAKE ,2101200,1939,1,2014,12,60.1165,-128.822333333333,687,No -YK,WHITEHORSE ,2101300,1942,1,2012,12,60.7095,-135.068833333333,706,No +NT,TULITA,2201700,1904,1,2014,12,64.9086111111111,-125.568333333333,101,No +NT,ULUKHAKTOK,2502501,1941,1,2010,6,70.7627777777778,-117.806111111111,36,Yes +YK,WATSON LAKE,2101200,1939,1,2014,12,60.1165,-128.822333333333,687,No +YK,WHITEHORSE,2101300,1942,1,2012,12,60.7095,-135.068833333333,706,No NT,WRIGLEY ,2204000,1944,1,2014,10,63.2094444444445,-123.436666666667,149,No NT,YELLOWKNIFE ,2204100,1943,1,2013,1,62.4627777777778,-114.440277777778,206,No NT,YOHIN,2204300,1957,1,2007,9,61.2419444444444,-123.741666666667,204,No -AB,ATHABASCA,3060L20 ,1918,1,2017,12,54.7222230555556,-113.2880575,515,Yes +AB,ATHABASCA,3060L20,1918,1,2017,12,54.7222230555556,-113.2880575,515,Yes AB,BANFF,3050519,1894,1,2007,11,51.1933583333333,-115.552236111111,1397,Yes AB,BEAVER MINES,3050600,1913,1,2012,3,49.4672277777778,-114.176955555556,1257,No AB,BEAVERLODGE,3070600,1916,1,2007,11,55.1966672222222,-119.396413888889,745,Yes -AB,CALGARY ,3031093,1885,1,2012,7,51.1138888888889,-114.020277777778,1084,No +AB,CALGARY,3031093,1885,1,2012,7,51.1138888888889,-114.020277777778,1084,No AB,CALMAR,3011120,1915,1,2016,12,53.2897241666667,-113.863057777778,720,No AB,CAMPSIE,3061200,1910,1,2013,10,54.1322227777778,-114.677778888889,671,No AB,CAMROSE,3011240,1946,1,2007,11,53.0347222222222,-112.814166666667,739,No AB,CARWAY,3031400,1915,1,2011,11,48.999725,-113.376111111111,1354,No -AB,CLARESHOLM MEADOW CREEK,3031F5F ,1913,1,2005,3,49.9375222222222,-113.737519444444,1035,No -AB,COLD LAKE 
,3081680,1926,1,2017,12,54.4166666666667,-110.283333333333,541,Yes +AB,CLARESHOLM MEADOW CREEK,3031F5F,1913,1,2005,3,49.9375222222222,-113.737519444444,1035,No +AB,COLD LAKE,3081680,1926,1,2017,12,54.4166666666667,-110.283333333333,541,Yes AB,CORONATION,3011887,1928,1,2007,11,52.0741666666667,-111.449444444444,791,Yes -AB,CROWSNEST,3051R4R ,1913,1,2007,11,49.627525,-114.48195,1303,Yes +AB,CROWSNEST,3051R4R,1913,1,2007,11,49.627525,-114.48195,1303,Yes AB,DRUMHELLER ANDREW,3022136,1954,1,2008,3,51.4666666666667,-112.866666666667,719,No -AB,EDMONTON ,3012205,1883,1,2012,4,53.3166666666667,-113.583333333333,723,Yes +AB,EDMONTON,3012205,1883,1,2012,4,53.3166666666667,-113.583333333333,723,Yes AB,EDSON,3062246,1920,1,2007,11,53.5802797222222,-116.453335277778,927,Yes AB,ELK POINT,3012280,1913,1,1997,6,53.8833333333333,-111.066666666667,605,No AB,ENILDA-BERG,3062427,1932,1,2005,4,55.4166666666667,-116.3,591,Yes AB,FAIRVIEW THREE FOX FARM,3072539,1932,1,1999,12,56.0833333333333,-118.533333333333,604,Yes -AB,FORT CHIPEWYAN ,3072658,1884,1,2007,8,58.7666666666667,-111.116666666667,232,Yes -AB,FORT MCMURRAY ,3062693,1920,1,2007,11,56.65,-111.216666666667,369,Yes +AB,FORT CHIPEWYAN,3072658,1884,1,2007,8,58.7666666666667,-111.116666666667,232,Yes +AB,FORT MCMURRAY,3062693,1920,1,2007,11,56.65,-111.216666666667,369,Yes AB,FORT VERMILION,3072723,1909,1,2007,11,58.3823055555556,-116.040166666667,289,Yes AB,GLEICHEN,3032800,1903,1,2006,3,50.8833333333333,-113.05,905,No -AB,GRANDE PRAIRIE ,3072920,1931,1,2013,9,55.1797222222222,-118.885,669,Yes +AB,GRANDE PRAIRIE,3072920,1931,1,2013,9,55.1797222222222,-118.885,669,Yes AB,HIGHWOOD AU,3053250,1903,1,2011,9,50.5511111111111,-114.370555555556,1580,Yes -AB,HINTON VALLEY,306A009 ,1917,1,2017,12,53.40381,-117.537620277778,1011,Yes +AB,HINTON VALLEY,306A009,1917,1,2017,12,53.40381,-117.537620277778,1011,Yes AB,JASPER WARDEN,3053536,1936,1,2007,11,52.9263888888889,-118.029722222222,1020,Yes 
AB,JENNER,3023560,1916,1,2008,1,50.7222277777778,-111.195852777778,755,No AB,KEG RIVER,3073641,1936,1,2009,1,57.75,-117.616666666667,405,Yes AB,LACOMBE,3023722,1908,1,2007,11,52.4488905555556,-113.755834722222,860,Yes -AB,LETHBRIDGE ,3033880,1902,1,2007,8,49.6302777777778,-112.799722222222,929,Yes -AB,MEDICINE HAT ,3034480,1886,1,2006,5,50.0188888888889,-110.720833333333,717,No +AB,LETHBRIDGE,3033880,1902,1,2007,8,49.6302777777778,-112.799722222222,929,Yes +AB,MEDICINE HAT,3034480,1886,1,2006,5,50.0188888888889,-110.720833333333,717,No AB,MOUNTAIN VIEW,3034720,1913,1,2006,3,49.1269555555556,-113.630016666667,1339,No AB,OLDS,3024920,1914,1,2015,6,51.7833333333333,-114.1,1040,No AB,ONEFOUR,3044923,1928,1,2007,10,49.1166666666667,-110.466666666667,935,Yes -AB,PEACE RIVER ,3075040,1908,1,2014,5,56.2269444444444,-117.447222222222,571,Yes +AB,PEACE RIVER,3075040,1908,1,2014,5,56.2269444444444,-117.447222222222,571,Yes AB,PINCHER CREEK,3035206,1915,1,2007,11,49.5205555555556,-113.997222222222,1190,Yes AB,RANFURLY 2NW,3015405,1905,1,2014,11,53.4166666666667,-111.733333333333,673,Yes AB,ROCKY MTN HOUSE,3015523,1917,1,2007,11,52.4213905555556,-114.912223055556,988,Yes AB,SCOTFIELD,3025770,1913,1,2007,10,51.5833555555556,-111.363611666667,762,Yes AB,SION,3015960,1906,1,2004,12,53.8833333333333,-114.116666666667,701,No -AB,SLAVE LAKE ,3065999,1925,1,2007,8,55.2833333333333,-114.783333333333,583,Yes +AB,SLAVE LAKE,3065999,1925,1,2007,8,55.2833333333333,-114.783333333333,583,Yes AB,STETTLER NORTH,3016119,1919,1,2001,8,52.3333333333333,-112.716666666667,821,Yes AB,VAUXHALL,3036682,1914,1,2007,11,50.05,-112.133333333333,779,Yes AB,WABASCA,3076908,1915,1,2009,1,55.9666666666667,-113.833333333333,545,Yes -AB,WHITECOURT ,3067372,1943,1,2009,5,54.1438888888889,-115.786666666667,782,Yes +AB,WHITECOURT,3067372,1943,1,2009,5,54.1438888888889,-115.786666666667,782,Yes SK,ANEROID,4020160,1922,1,2005,4,49.7166666666667,-107.3,754,No 
SK,BANGOR,4010400,1951,1,2005,2,50.9,-102.283333333333,526,No -SK,BUFFALO NARROWS ,4060982,1962,1,2012,11,55.8333333333333,-108.433333333333,440,Yes +SK,BUFFALO NARROWS,4060982,1962,1,2012,11,55.8333333333333,-108.433333333333,440,Yes SK,CEYLON,4011441,1922,1,2002,12,49.3833333333333,-104.65,753,Yes SK,CHAPLIN,4021520,1904,1,1995,9,50.4666666666667,-106.65,672,No SK,COLLINS BAY CAMECO,4061632,1965,1,2017,12,58.1833333333333,-103.7,490,Yes SK,COTE,4011846,1913,1,2006,3,51.5166666666667,-101.783333333333,450,Yes SK,CREE LAKE,4061861,1962,1,1993,8,57.35,-107.133333333333,495,Yes SK,DAVIDSON,4012120,1922,1,2005,10,51.2666666666667,-105.983333333333,619,No -SK,ESTEVAN ,4012400,1902,1,2015,2,49.2166666666667,-102.966666666667,581,Yes +SK,ESTEVAN,4012400,1902,1,2015,2,49.2166666666667,-102.966666666667,581,Yes SK,HIGH POINT,4023240,1929,1,2017,7,50.9786127777778,-107.935278611111,645,No SK,HUDSON BAY,4083323,1943,1,2013,12,52.8833333333333,-102.583333333333,422,Yes SK,INDIAN HEAD,4013480,1895,1,2007,11,50.55,-103.65,579,No SK,ISLAND FALLS,4063560,1931,1,2004,9,55.5333333333333,-102.35,299,No SK,KELLIHER,4013660,1908,1,2017,12,51.2574166666667,-103.753027777778,676,Yes SK,KEY LAKE,4063755,1977,1,2017,12,57.25,-105.616666666667,509,No -SK,KINDERSLEY ,4043900,1942,1,2013,11,51.5166666666667,-109.183333333333,694,Yes +SK,KINDERSLEY,4043900,1942,1,2013,11,51.5166666666667,-109.183333333333,694,Yes SK,KLINTONEL,4024080,1911,1,1994,1,49.6833333333333,-108.916666666667,1074,No -SK,LA RONGE ,4064150,1923,1,2013,10,55.15,-105.266666666667,379,Yes -SK,LEADER AIRPORT,402DAF0 ,1923,1,2007,11,50.9094638888889,-109.501391666667,676,Yes +SK,LA RONGE,4064150,1923,1,2013,10,55.15,-105.266666666667,379,Yes +SK,LEADER AIRPORT,402DAF0,1923,1,2007,11,50.9094638888889,-109.501391666667,676,Yes SK,LOON LAKE EPF,4064600,1930,1,2005,10,54.05,-109.1,543,Yes SK,MANOR,4014913,1922,1,2004,7,49.6166666666667,-102.1,633,Yes SK,MELFORT,4055079,1910,1,2007,11,52.8166666666667,-104.6,490,Yes SK,MOOSE 
JAW,4015322,1895,1,2007,11,50.3316805555556,-105.537508333333,577,Yes SK,MOOSOMIN,4015360,1900,1,2000,9,50.1333333333333,-101.666666666667,576,No -SK,NIPAWIN ,4075518,1911,1,2005,9,53.3333333333333,-104,372,Yes +SK,NIPAWIN,4075518,1911,1,2005,9,53.3333333333333,-104,372,Yes SK,NORTH BATTLEFORD,4045605,1894,1,2007,11,52.7666666666667,-108.25,548,Yes SK,OUTLOOK,4055736,1915,1,2007,11,51.4833333333333,-107.05,541,Yes SK,PASWEGIN,4015960,1951,1,2003,9,51.9833333333333,-103.916666666667,533,No SK,PELLY,4086000,1952,1,2016,3,52.0833333333333,-101.866666666667,509,No SK,PILGER,4056120,1913,1,2011,9,52.4166666666667,-105.15,552,No -SK,PRINCE LBERT ,4056240,1889,1,2013,11,53.2166666666667,-105.666666666667,428,Yes -SK,REGINA ,4016560,1898,1,2007,11,50.4333333333333,-104.666666666667,577,No +SK,PRINCE LBERT,4056240,1889,1,2013,11,53.2166666666667,-105.666666666667,428,Yes +SK,REGINA,4016560,1898,1,2007,11,50.4333333333333,-104.666666666667,577,No SK,SASKATOON DIEFENBAKER ,4057120,1900,1,2007,11,52.1666666666667,-106.716666666667,504,No SK,SCOTT,4047241,1911,1,2007,11,52.35974,-108.834723333333,660,Yes SK,SWIFT CURRENT,4028060,1886,1,2007,11,50.2666666666667,-107.733333333333,825,Yes SK,TONKIN,4019082,1941,1,2016,1,51.2,-102.233333333333,527,Yes -SK,URANIUM CITY,406QLD0 ,1953,1,2007,10,59.5666666666667,-108.483333333333,318,Yes +SK,URANIUM CITY,406QLD0,1953,1,2007,10,59.5666666666667,-108.483333333333,318,Yes SK,VAL-MARIE,4038400,1937,1,2010,5,49.3700138888889,-107.847525,808,No SK,WASECA,4048520,1908,1,2014,12,53.1308555555556,-109.403902777778,638,No SK,WASKESIU LAKE,4068559,1966,1,2007,11,53.9166666666667,-106.066666666667,569,Yes @@ -258,13 +258,13 @@ MB,ARBORG,5030080,1951,1,2016,6,50.9333333333333,-97.0833333333333,224,No MB,BERENS RIVER,5030203,1905,1,2013,11,52.3597366666667,-97.0219533333333,222,Yes MB,BIRTLE,5010240,1917,1,2000,11,50.4333333333333,-101.05,522,No MB,BISSETT,5030282,1933,1,1997,6,51.0333333333333,-95.7,259,Yes -MB,BRANDON 
,5010480,1890,1,2012,12,49.91,-99.9519444444445,409,Yes +MB,BRANDON,5010480,1890,1,2012,12,49.91,-99.9519444444445,409,Yes MB,CHURCHILL,5060606,1932,1,2015,12,58.7333333333333,-94.0666666666667,29,Yes MB,CYPRESS RIVER,5010640,1948,1,2012,3,49.55,-99.0833333333333,374,No MB,DAUPHIN,5040681,1911,1,2007,10,51.1003888888889,-100.056888888889,305,Yes MB,EMERSON,5020882,1942,1,2003,1,49,-97.2375,242,Yes MB,FLIN FLON,5050920,1927,1,2017,12,54.7666666666667,-101.883333333333,320,No -MB,GILLAM ,5061001,1943,1,2014,10,56.3575,-94.7105555555556,145,Yes +MB,GILLAM,5061001,1943,1,2014,10,56.3575,-94.7105555555556,145,Yes MB,GIMLI,5031039,1944,1,2008,3,50.6333333333333,-97.0166666666667,223,Yes MB,GRAND RAPIDS HYDRO,5031111,1962,1,2017,12,53.1580558333333,-99.2833444444444,223,Yes MB,GREAT FALLS,5031200,1923,1,2002,12,50.4666666666667,-96,249,No @@ -273,17 +273,17 @@ MB,LANGRUTH WEST,5041535,1958,1,2005,2,50.4138888888889,-98.8027777777778,264,Ye MB,LYNN LAKE,5061648,1952,1,2007,11,56.8638888888889,-101.076111111111,357,Yes MB,MORDEN,5021849,1888,1,2007,11,49.1876388888889,-98.0839444444444,298,Yes MB,NEEPAWA MURRAY 6 SOUTHWEST,5042004,1881,1,2008,11,50.15,-99.5666666666667,412,Yes -MB,NINETTE,50220M0 ,1916,1,1996,5,49.4166666666667,-99.65,419,Yes +MB,NINETTE,50220M0,1916,1,1996,5,49.4166666666667,-99.65,419,Yes MB,NORWAY HOUSE,5062045,1896,1,2007,11,53.9666666666667,-97.85,224,Yes MB,PIERSON,5012080,1933,1,2007,3,49.1833333333333,-101.266666666667,469,No MB,PINAWA WNRE,5032162,1915,1,2017,3,50.1805555555556,-96.0583333333333,267,Yes MB,PORTAGE LA PRAIRIE,5012321,1942,1,2017,12,49.95,-98.2666666666667,259,Yes MB,SPRAGUE,5022759,1916,1,2007,11,49.0236111111111,-95.5983358333333,329,Yes MB,STEINBACH,5022780,1956,1,2005,3,49.5333333333333,-96.7666666666667,254,No -MB,SWAN RIVER,504K80K ,1960,1,2007,10,52.1149722222222,-101.232916666667,335,Yes -MB,THE PAS ,5052880,1910,1,2014,11,53.9666666666667,-101.1,270,Yes +MB,SWAN 
RIVER,504K80K,1960,1,2007,10,52.1149722222222,-101.232916666667,335,Yes +MB,THE PAS,5052880,1910,1,2014,11,53.9666666666667,-101.1,270,Yes MB,THOMPSON ,5062922,1967,1,2014,11,55.8033333333333,-97.8625,222,No -MB,WINNIPEG RICHARDSON ,5023222,1872,1,2007,11,49.9166666666667,-97.2333333333333,239,Yes +MB,WINNIPEG RICHARDSON,5023222,1872,1,2007,11,49.9166666666667,-97.2333333333333,239,Yes ON,AMHERSTBURG,6130257,1917,1,2017,12,42.1033583333333,-83.0944633333333,182,Yes ON,ARMSTRONG JELLIEN,6040330,1939,1,1992,10,50.25,-89.1,341,Yes ON,ATIKOKAN MARMION,6020384,1919,1,2007,7,48.8,-91.5833333333333,442,Yes @@ -293,20 +293,20 @@ ON,BIG TROUT LAKE,6010738,1939,1,1992,10,53.8333333333333,-89.8666666666667,224, ON,BISCOTASING,6060773,1914,1,2000,10,47.3,-82.1,407,No ON,BROCKVILLE PCC,6100971,1915,1,2017,12,44.6,-75.6666666666667,96,Yes ON,CAMERON FALLS,6041109,1924,1,1998,8,49.15,-88.35,229,No -ON,CHAPLEAU ,6061361,1914,1,2015,3,47.82,-83.3466666666667,447,Yes +ON,CHAPLEAU,6061361,1914,1,2015,3,47.82,-83.3466666666667,447,Yes ON,CORNWALL,6101874,1951,1,2017,12,45.0155783333333,-74.7489,64,No -ON,DRYDEN ,6032119,1914,1,2005,1,49.8333333333333,-92.75,413,Yes -ON,EARLTON ,6072225,1939,1,2005,1,47.7,-79.85,243,No -ON,FORT FRANCES ,6022476,1912,1,2011,5,48.65,-93.4333333333333,342,Yes -ON,GERALDTON ,6042716,1950,1,2015,2,49.7828027777778,-86.9305694444445,349,Yes +ON,DRYDEN,6032119,1914,1,2005,1,49.8333333333333,-92.75,413,Yes +ON,EARLTON,6072225,1939,1,2005,1,47.7,-79.85,243,No +ON,FORT FRANCES,6022476,1912,1,2011,5,48.65,-93.4333333333333,342,Yes +ON,GERALDTON,6042716,1950,1,2015,2,49.7828027777778,-86.9305694444445,349,Yes ON,GODFREY,6102857,1924,1,2003,5,44.5666666666667,-76.6333333333333,160,Yes -ON,GORE BAY ,6092925,1916,1,1994,1,45.8833333333333,-82.5666666666667,194,Yes +ON,GORE BAY,6092925,1916,1,1994,1,45.8833333333333,-82.5666666666667,194,Yes ON,HALIBURTON,6163171,1883,1,2017,12,45.0322483333333,-78.531115,330,Yes -ON,HAMILTON 
,6153194,1866,1,2011,12,43.1716866666667,-79.9341766666667,238,Yes -ON,HORNEPAYNE ,6053575,1917,1,1995,7,49.2,-84.7666666666667,335,Yes +ON,HAMILTON,6153194,1866,1,2011,12,43.1716866666667,-79.9341766666667,238,Yes +ON,HORNEPAYNE,6053575,1917,1,1995,7,49.2,-84.7666666666667,335,Yes ON,IROQUOIS FALLS,6073810,1913,1,1998,12,48.75,-80.6666666666667,259,No -ON,KAPUSKASING ,6073975,1918,1,2014,9,49.4138888888889,-82.4675,227,Yes -ON,KENORA ,6034075,1900,1,2013,2,49.7902791666667,-94.3652786111111,406,Yes +ON,KAPUSKASING,6073975,1918,1,2014,9,49.4138888888889,-82.4675,227,Yes +ON,KENORA,6034075,1900,1,2013,2,49.7902791666667,-94.3652786111111,406,Yes ON,KINGSTON PUMPING STATION,6104175,1872,1,2007,12,44.2439033333333,-76.4805666666667,77,Yes ON,LANSDOWNE HOUSE,6014350,1941,1,1989,6,52.2333333333333,-87.8833333333333,255,No ON,LONDON AIRPORT,6144475,1883,1,2017,4,43.0330555555556,-81.1511111111111,278,Yes @@ -315,68 +315,68 @@ ON,MADAWASKA,6084770,1916,1,2000,11,45.5,-77.9833333333333,316,No ON,MINE CENTRE SOUTHWEST,6025205,1914,1,2017,12,48.7597388888889,-92.6227777777778,361,Yes ON,MOOSONEE,6075425,1892,1,2017,12,51.2666666666667,-80.65,10,Yes ON,MORRISBURG,6105460,1913,1,2008,12,44.9236183333333,-75.1883433333333,82,No -ON,NORTH BAY ,6085700,1915,1,2013,1,46.3636111111111,-79.4227777777778,370,Yes +ON,NORTH BAY,6085700,1915,1,2013,1,46.3636111111111,-79.4227777777778,370,Yes ON,ORANGEVILLE MOE,6155790,1887,1,2015,12,43.9183516666667,-80.0864066666667,412,Yes ON,ORILLIA BRAIN,6115811,1871,1,2017,12,44.6027777777778,-79.4388888888889,250,Yes ON,OTTAWA,6105976,1890,1,2017,12,45.3833333333333,-75.7166666666667,79,No ON,OWEN SOUND MOE,6116132,1879,1,2007,12,44.5833333333333,-80.9333333333333,179,Yes -ON,PELEE ISLAND ,6136336,1888,1,1994,9,41.7833333333333,-82.6833333333333,174,Yes -ON,PETERBOROUGH ,6166418,1866,1,2007,5,44.2333333333333,-78.3666666666667,191,Yes -ON,PICKLE LAKE ,6016527,1933,1,2012,7,51.4463888888889,-90.2141666666667,386,Yes -ON,RED LAKE 
,6016975,1939,1,2012,5,51.0669444444445,-93.7930555555556,386,No +ON,PELEE ISLAND,6136336,1888,1,1994,9,41.7833333333333,-82.6833333333333,174,Yes +ON,PETERBOROUGH,6166418,1866,1,2007,5,44.2333333333333,-78.3666666666667,191,Yes +ON,PICKLE LAKE,6016527,1933,1,2012,7,51.4463888888889,-90.2141666666667,386,Yes +ON,RED LAKE,6016975,1939,1,2012,5,51.0669444444445,-93.7930555555556,386,No ON,RIDGETOWN,6137149,1883,1,1997,4,42.45,-81.8833333333333,206,Yes -ON,SAULT STE MARIE ,6057592,1945,1,2012,3,46.4833333333333,-84.5094444444444,192,Yes -ON,SIOUX LOOKOUT ,6037775,1914,1,2013,2,50.1166666666667,-91.9,383,Yes +ON,SAULT STE MARIE,6057592,1945,1,2012,3,46.4833333333333,-84.5094444444444,192,Yes +ON,SIOUX LOOKOUT,6037775,1914,1,2013,2,50.1166666666667,-91.9,383,Yes ON,SMOKY FALLS,6077845,1934,1,1997,4,50.0666666666667,-82.1666666666667,183,No -ON,SUDBURY ,6068150,1921,1,2013,3,46.6255555555556,-80.7977777777778,348,Yes -ON,TERRACE BAY ,6048231,1910,1,2007,9,48.8166666666667,-87.1,290,Yes -ON,TIMMINS VICTOR POWER ,6078285,1955,1,2011,2,48.5697222222222,-81.3766666666667,295,No +ON,SUDBURY,6068150,1921,1,2013,3,46.6255555555556,-80.7977777777778,348,Yes +ON,TERRACE BAY,6048231,1910,1,2007,9,48.8166666666667,-87.1,290,Yes +ON,TIMMINS VICTOR POWER,6078285,1955,1,2011,2,48.5697222222222,-81.3766666666667,295,No ON,TOBERMORY CYPRUS LAKE,6128323,1915,1,1994,12,45.2333333333333,-81.5333333333333,190,Yes ON,TORONTO,6158350,1840,1,2017,4,43.6666666666667,-79.4,113,No -ON,TORONTO LESTER B. PEARSON ,6158733,1938,1,2013,6,43.6772222222222,-79.6305555555556,173,No +ON,TORONTO LESTER B. 
PEARSON,6158733,1938,1,2013,6,43.6772222222222,-79.6305555555556,173,No ON,TRANQUILLO RIDGE,6048864,1877,1,2007,12,48.2333333333333,-89.5166666666667,317,Yes ON,VINELAND,6139141,1919,1,2013,12,43.15,-79.4166666666667,110,Yes ON,WALLACEBURG,6139265,1906,1,1997,4,42.5833333333333,-82.4,177,No -ON,WAWA ,6059D09 ,1940,1,2014,9,47.9666666666667,-84.7833333333333,287,Yes +ON,WAWA ,6059D09,1940,1,2014,9,47.9666666666667,-84.7833333333333,287,Yes ON,WELLAND,6139445,1873,1,2014,8,42.9925266666667,-79.2611383333333,175,No -ON,WIARTON ,6119500,1948,1,2014,11,44.7458333333333,-81.1072222222222,222,No -ON,WINDSOR ,6139525,1866,1,2014,10,42.2755555555556,-82.9555555555556,190,Yes +ON,WIARTON,6119500,1948,1,2014,11,44.7458333333333,-81.1072222222222,222,No +ON,WINDSOR,6139525,1866,1,2014,10,42.2755555555556,-82.9555555555556,190,Yes ON,WOODSTOCK,6149625,1870,1,2017,12,43.1361233333333,-80.7705666666667,282,No QC,ARMAGH,7050240,1916,1,1994,5,46.75,-70.5333333333333,358,Yes QC,ARUNDEL,7030310,1914,1,2017,5,45.95,-74.6166666666667,191,Yes -QC,BAGOTVILLE ,7060400,1876,1,2017,12,48.3333333333333,-71,159,Yes +QC,BAGOTVILLE,7060400,1876,1,2017,12,48.3333333333333,-71,159,Yes QC,BARRAGE ANGLIERS,7080452,1911,1,1996,5,47.5519444444444,-79.2358333333333,267,No QC,BARRAGE TEMISCAMINGUE,7080468,1910,1,1995,10,46.7097222222222,-79.1011111111111,181,No QC,BELLETERRE,7080600,1952,1,2004,4,47.3833333333333,-78.7,322,No QC,BROME,7020840,1877,1,2014,7,45.1833333333333,-72.5666666666667,206,No QC,CAUSAPSCAL,7051200,1921,1,2017,8,48.3666666666667,-67.2333333333333,168,No -QC,CHIBOUGAMAU CHAPAIS ,7091404,1937,1,2016,11,49.7666666666667,-74.5333333333333,387,Yes +QC,CHIBOUGAMAU CHAPAIS,7091404,1937,1,2016,11,49.7666666666667,-74.5333333333333,387,Yes QC,CHELSEA,7031360,1928,1,2017,8,45.5166666666667,-75.7833333333333,113,No QC,DONNACONA,7012071,1919,1,2008,11,46.6833333333333,-71.7333333333333,46,Yes QC,DRUMMONDVILLE,7022160,1914,1,2017,8,45.8833333333333,-72.4833333333333,82,No QC,GASPE 
,7052605,1916,1,2013,3,48.7769444444445,-64.4780555555556,33,Yes QC,GRANDE VALLEE,7052865,1883,1,2004,4,49.2,-65.15,8,Yes -QC,ILES DE LA MADELEINE ,705C2G9 ,1934,1,2002,11,47.4166666666667,-61.7833333333333,11,Yes +QC,ILES DE LA MADELEINE,705C2G9,1934,1,2002,11,47.4166666666667,-61.7833333333333,11,Yes QC,INUKJUAK,7103282,1938,1,1994,2,58.4666666666667,-78.0833333333333,24,No QC,JOLIETTE VILLE,7013362,1914,1,2011,4,46.0166666666667,-73.4333333333333,56,Yes -QC,KUUJJUAQ ,7113534,1947,1,2014,3,58.1,-68.4166666666667,39,No -QC,KUUJJUARAPIK ,7103536,1934,1,2014,4,55.2833333333333,-77.75,10,No +QC,KUUJJUAQ,7113534,1947,1,2014,3,58.1,-68.4166666666667,39,No +QC,KUUJJUARAPIK,7103536,1934,1,2014,4,55.2833333333333,-77.75,10,No QC,LA MALBAIE,7043960,1914,1,2004,4,47.6666666666667,-70.15,23,No QC,LA POCATIERE,7054095,1913,1,1996,3,47.35,-70.0333333333333,31,No QC,LA SARRE,7094120,1952,1,2004,4,48.7833333333333,-79.2166666666667,244,No QC,LA TUQUE,7074240,1912,1,2004,4,47.4,-72.7833333333333,152,No QC,LABRIEVILLE,7043540,1955,1,1994,12,49.3,-69.55,152,No -QC,LAC BERRY,709CEE9 ,1914,1,2017,8,48.8,-78.2833333333333,305,Yes +QC,LAC BERRY,709CEE9,1914,1,2017,8,48.8,-78.2833333333333,305,Yes QC,LAUZON,7024254,1872,1,2017,8,46.8166666666667,-71.1,69,Yes QC,LEBEL SUR QUEVILLON,7094275,1967,1,2004,4,49.05,-76.9666666666667,305,No QC,LENNOXVILLE,7024280,1915,1,1995,10,45.3688888888889,-71.8236111111111,181,No QC,LES BUISSONS,7044288,1947,1,2017,8,49.1166666666667,-68.3833333333333,15,Yes QC,LES CEDRES,7014290,1913,1,2017,8,45.3,-74.05,47,No -QC,MATAGAMI ,7094639,1964,1,1991,6,49.7666666666667,-77.8166666666667,281,Yes +QC,MATAGAMI,7094639,1964,1,1991,6,49.7666666666667,-77.8166666666667,281,Yes QC,MONT LAURIER,7035160,1920,1,2014,6,46.5666666666667,-75.55,244,Yes -QC,MONT-JOLI ,7055120,1943,1,2013,3,48.6,-68.2166666666667,52,No -QC,MONTREAL/PIERRE ELLIOTT TRUDEAU ,7025250,1872,1,2016,9,45.4666666666667,-73.75,36,Yes -QC,NATASHQUAN 
,7045400,1915,1,2003,3,50.1833333333333,-61.8166666666667,11,No +QC,MONT-JOLI,7055120,1943,1,2013,3,48.6,-68.2166666666667,52,No +QC,MONTREAL/PIERRE ELLIOTT TRUDEAU,7025250,1872,1,2016,9,45.4666666666667,-73.75,36,Yes +QC,NATASHQUAN,7045400,1915,1,2003,3,50.1833333333333,-61.8166666666667,11,No QC,NICOLET,7025440,1914,1,2017,8,46.2,-72.6166666666667,30,No QC,NOMININGUE,7035520,1914,1,2013,11,46.4,-75.0833333333333,274,No QC,NORMANDIN,7065640,1936,1,1992,8,48.85,-72.5333333333333,137,No @@ -384,8 +384,8 @@ QC,PARENT S,7075799,1943,1,2004,4,47.9166666666667,-74.6166666666667,410,Yes QC,POINTE AU CHENE,7036063,1919,1,2009,6,45.65,-74.8,51,Yes QC,QUAQTAQ,7116270,1930,1,1988,5,61.05,-69.6333333333333,30,Yes QC,RIMOUSKI,7056480,1877,1,2017,8,48.45,-68.5166666666667,36,Yes -QC,ROBERVAL ,7066685,1914,1,2014,3,48.5166666666667,-72.2666666666667,179,Yes -QC,SCHEFFERVILLE ,7117825,1949,1,1993,9,54.8,-66.8166666666667,522,No +QC,ROBERVAL,7066685,1914,1,2014,3,48.5166666666667,-72.2666666666667,179,Yes +QC,SCHEFFERVILLE,7117825,1949,1,1993,9,54.8,-66.8166666666667,522,No QC,SENNETERRE,7097900,1940,1,1994,5,48.3333333333333,-77.2666666666667,310,Yes QC,SEPT-ILES,7047912,1945,1,2017,5,50.2166666666667,-66.25,53,Yes QC,SHAWINIGAN,7018000,1902,1,2004,4,46.5666666666667,-72.75,122,No @@ -402,43 +402,43 @@ QC,TADOUSSAC,7048320,1914,1,2004,4,48.15,-69.7,70,No QC,TETE A LA BALEINE,7048421,1912,1,1995,3,50.7,-59.3166666666667,9,Yes QC,THETFORD MINES,7028441,1922,1,2016,7,46.1,-71.35,381,Yes QC,TRINITE DES MONTS,7058520,1951,1,2004,4,48.1333333333333,-68.4833333333333,262,No -QC,VAL-D'OR ,7098600,1952,1,2017,12,48.0563888888889,-77.7866666666667,337,No +QC,VAL-D'OR,7098600,1952,1,2017,12,48.0563888888889,-77.7866666666667,337,No QC,VILLE MARIE,7088760,1914,1,2004,4,47.35,-79.4333333333333,213,No QC,WRIGHT,7038975,1914,1,2017,8,46.0666666666667,-76.05,142,Yes NS,ANNAPOLIS ROYAL,8200100,1915,1,2007,12,44.75,-65.5166666666667,8,No 
NB,AROOSTOOK,8100300,1920,1,2017,12,46.7122222222222,-67.7155555555556,91,Yes -NB,BATHURST ,8100503,1884,1,2013,10,47.6291805555556,-65.7483388888889,59,Yes +NB,BATHURST,8100503,1884,1,2013,10,47.6291805555556,-65.7483388888889,59,Yes NL,BAY D'ESPOIR,8400413,1968,1,2017,12,47.9833333333333,-55.8,23,No NS,BEAR RIVER,8200500,1915,1,2006,2,44.5666666666667,-65.6333333333333,8,Yes NL,BURGEO,8400798,1939,1,1995,7,47.6166666666667,-57.6166666666667,11,Yes NL,CARTWRIGHT,8501100,1936,1,2015,3,53.7083333333333,-57.035,14,No -NB,CHARLO ,8100880,1934,1,2002,10,47.9833333333333,-66.3333333333333,40,Yes +NB,CHARLO,8100880,1934,1,2002,10,47.9833333333333,-66.3333333333333,40,Yes PE,CHARLOTTETOWN ,8300300,1872,1,2012,9,46.2886166666667,-63.1286305555556,49,Yes -NL,CHURCHILL FALLS,850A131 ,1969,1,1998,4,53.5333333333333,-63.9666666666667,489,Yes +NL,CHURCHILL FALLS,850A131,1969,1,1998,4,53.5333333333333,-63.9666666666667,489,Yes NS,COLLEGEVILLE,8201000,1916,1,2014,6,45.4833333333333,-62.0166666666667,76,No NL,CORNER BROOK,8401300,1933,1,2017,12,48.95,-57.95,5,No NL,DANIELS HARBOUR,8401400,1947,1,1998,1,50.2363888888889,-57.5811111111111,19,No NL,DEER LAKE ,8401501,1933,1,2012,3,49.2166666666667,-57.4,22,Yes NS,DEMING,8201410,1884,1,2011,12,45.2163908333333,-61.1778027777778,16,Yes NB,DOAKTOWN,8101200,1944,1,2009,6,46.5525138888889,-66.1402916666667,38,No -NB,EDMUNDSTON,810AL00 ,1916,1,2009,7,47.3463888888889,-68.1877777777778,163,Yes +NB,EDMUNDSTON,810AL00,1916,1,2009,7,47.3463888888889,-68.1877777777778,163,Yes NL,EXPLOITS DAM,8401550,1956,1,2009,2,48.7666666666667,-56.6,154,No NB,FREDERICTON ,8101500,1874,1,2010,4,45.8721305555556,-66.5278916666667,21,Yes -NL,GANDER ,8401700,1937,1,2012,3,48.9463888888889,-54.5769444444444,151,No -NL,GOOSE ,8501900,1942,1,2017,12,53.3166666666667,-60.4166666666667,49,No +NL,GANDER,8401700,1937,1,2012,3,48.9463888888889,-54.5769444444444,151,No +NL,GOOSE,8501900,1942,1,2017,12,53.3166666666667,-60.4166666666667,49,No NL,GRAND 
FALLS,8402050,1937,1,2009,1,48.9333333333333,-55.6666666666667,60,No -NS,GREENWOOD ,8202000,1943,1,2017,12,44.9833333333333,-64.9166666666667,28,No -NS,HALIFAX STANFIELD ,8202250,1872,1,2012,9,44.8800166666667,-63.5000138888889,145,Yes +NS,GREENWOOD,8202000,1943,1,2017,12,44.9833333333333,-64.9166666666667,28,No +NS,HALIFAX STANFIELD,8202250,1872,1,2012,9,44.8800166666667,-63.5000138888889,145,Yes NL,ISLE UX MORTS,8402450,1909,1,2004,10,47.5833333333333,-58.9666666666667,5,Yes NB,KEDGWICK,8102300,1932,1,1994,9,47.65,-67.35,274,No NS,LIVERPOOL BIG FALLS,8203100,1940,1,2012,10,44.1333333333333,-64.9333333333333,50,No -NL,MAKKOVIK ,8502NHR ,1942,1,2014,11,55.0822222222222,-59.1886111111111,71,Yes -NL,MARY'S HARBOUR ,8502591,1881,1,1998,1,52.3036111111111,-55.8336111111111,12,Yes -NB,MIRAMICHI ,8101000,1873,1,2005,8,47.0094694444444,-65.4677888888889,33,Yes -NB,MONCTON ,8103200,1898,1,2012,6,46.1053055555556,-64.6838055555556,71,Yes +NL,MAKKOVIK,8502NHR,1942,1,2014,11,55.0822222222222,-59.1886111111111,71,Yes +NL,MARY'S HARBOUR,8502591,1881,1,1998,1,52.3036111111111,-55.8336111111111,12,Yes +NB,MIRAMICHI,8101000,1873,1,2005,8,47.0094694444444,-65.4677888888889,33,Yes +NB,MONCTON,8103200,1898,1,2012,6,46.1053055555556,-64.6838055555556,71,Yes PE,MONTICELLO,8300447,1960,1,2003,12,46.4666666666667,-62.4666666666667,32,No NS,MOUNT UNIACKE,8203600,1920,1,2003,7,44.9,-63.8333333333333,159,No -NL,NAIN ,8502800,1939,1,2013,3,56.55,-61.6833333333333,7,No +NL,NAIN,8502800,1939,1,2013,3,56.55,-61.6833333333333,7,No NS,NAPPAN,8203700,1913,1,2003,7,45.7666666666667,-64.25,20,No NB,NEPISIGUIT FALLS,8103500,1922,1,2006,2,47.4,-65.7833333333333,106,No NL,NORTH HARBOUR,8402874,1939,1,2007,11,47.1333333333333,-53.6666666666667,11,Yes @@ -446,22 +446,22 @@ NS,PARRSBORO,8204400,1897,1,2002,9,45.4,-64.3333333333333,24,No NL,PLUM POINT,8402958,1972,1,2016,6,51.0666666666667,-56.8833333333333,6,No NB,REXTON,8104400,1923,1,2009,12,46.6666666666667,-64.8666666666667,5,No NS,SABLE 
ISLAND,8204700,1891,1,2001,12,43.9322222222222,-60.0094444444444,5,No -NB,SAINT JOHN ,8104900,1871,1,2012,6,45.3180555555556,-65.8855694444444,109,Yes +NB,SAINT JOHN,8104900,1871,1,2012,6,45.3180555555556,-65.8855694444444,109,Yes NL,SPRINGDALE,8403700,1956,1,1993,6,49.5,-56.0833333333333,23,No NS,SPRINGFIELD,8205200,1920,1,2003,8,44.6666666666667,-64.85,167,No -NL,ST ANTHONY ,840C401 ,1883,1,2008,1,51.3833333333333,-56.1,33,Yes -NL,ST JOHN'S ,8403506,1874,1,2012,3,47.6222222222222,-52.7427777777778,141,Yes +NL,ST ANTHONY,840C401,1883,1,2008,1,51.3833333333333,-56.1,33,Yes +NL,ST JOHN'S,8403506,1874,1,2012,3,47.6222222222222,-52.7427777777778,141,Yes NS,ST MARGARET'S BAY,8204800,1922,1,2017,12,44.7,-63.9,17,No -NL,STEPHENVILLE ,8403800,1935,1,2014,10,48.5333333333333,-58.55,26,Yes -PE,SUMMERSIDE ,8300700,1936,1,2002,6,46.4388888888889,-63.8316666666667,20,Yes +NL,STEPHENVILLE,8403800,1935,1,2014,10,48.5333333333333,-58.55,26,Yes +PE,SUMMERSIDE,8300700,1936,1,2002,6,46.4388888888889,-63.8316666666667,20,Yes NB,SUSSEX,8105200,1898,1,2009,5,45.7166666666667,-65.5333333333333,21,No -NS,SYDNEY ,8205700,1870,1,2014,8,46.1666666666667,-60.0481388888889,62,Yes +NS,SYDNEY,8205700,1870,1,2014,8,46.1666666666667,-60.0481388888889,62,Yes NS,TRURO,8205990,1910,1,2002,10,45.3666666666667,-63.2666666666667,40,Yes NS,UPPER STEWIACKE,8206200,1916,1,2008,4,45.2166666666667,-63,23,No -NL,WABUSH LAKE ,8504175,1961,1,2013,2,52.9272222222222,-66.8741666666667,551,No +NL,WABUSH LAKE,8504175,1961,1,2013,2,52.9272222222222,-66.8741666666667,551,No NL,WESTBROOK ST LAWRENCE,8404201,1957,1,1995,7,46.95,-55.3833333333333,31,No NS,WESTPORT,8206260,1937,1,1993,6,44.25,-66.3666666666667,18,Yes NS,WHITE ROCK,8206316,1913,1,2017,6,45.05,-64.3833333333333,38,Yes NB,WOODSTOCK,8105600,1914,1,2017,12,46.1702777777778,-67.5536111111111,153,No NS,WRECK COVE BROOK,8206450,1951,1,2012,12,46.5333333333333,-60.45,76,Yes -NS,YARMOUTH ,8206500,1880,1,2012,4,43.8308333333333,-66.0886111111111,43,Yes 
+NS,YARMOUTH,8206500,1880,1,2012,4,43.8308333333333,-66.0886111111111,43,Yes diff --git a/miranda/preprocess/configs/eccc-homogenized_attrs.json b/miranda/preprocess/configs/eccc-homogenized_attrs.json index b1861502..e35bba41 100644 --- a/miranda/preprocess/configs/eccc-homogenized_attrs.json +++ b/miranda/preprocess/configs/eccc-homogenized_attrs.json @@ -13,11 +13,13 @@ "Third": "ECCC Adjusted and Homogenized Canadian Climate Data (AHCCD) version 3" } }, + "_variable": true, "acknowledgement": "This data is provided by Environment and Climate Change Canada (ECCC).", "author": "Environment and Climate Change Canada (ECCC)", "contact": "info.cccs-ccsc@canada.ca", "dataset_id": "d6813de6-b20a-46cc-8990-01862ae15c5f", "documentation": "https://www.canada.ca/en/environment-climate-change/services/climate-change/canadian-centre-climate-services/display-download/technical-documentation-adjusted-climate-data.html", + "domain": "AMNO", "institution": "GovCan", "license": "https://climate.weather.gc.ca/prods_servs/attachment1_e.html", "license_preamble": "The data is owned by the Government of Canada (Environment and Climate Change Canada), and fall under the licence agreement for use of Environment and Climate Change Canada data.", @@ -27,7 +29,8 @@ "realm": "atmos", "source": "AHCCD", "table_date": "2023-08-03", - "table_id": "ECCC" + "table_id": "ECCC", + "type": "station-obs" }, "variables": { "dm": { From f454cb61ec9ec62f8f14b504e87d86d931a09db4 Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Mon, 7 Aug 2023 14:03:27 -0400 Subject: [PATCH 10/33] naming and more dynamic handling of variables --- miranda/io/_output.py | 15 +- miranda/io/utils.py | 12 +- miranda/preprocess/__init__.py | 5 + miranda/preprocess/_eccc_homogenized.py | 154 ++++++++++++------ .../configs/ahccd_gen3_temperature.csv | 14 +- .../configs/eccc-homogenized_attrs.json | 2 + templates/ahccd_preprocess.py | 10 ++ 7 files changed, 146 insertions(+), 66 
deletions(-) create mode 100644 templates/ahccd_preprocess.py diff --git a/miranda/io/_output.py b/miranda/io/_output.py index 015a5c88..8d0667dc 100644 --- a/miranda/io/_output.py +++ b/miranda/io/_output.py @@ -34,6 +34,7 @@ def write_dataset( ds: xr.DataArray | xr.Dataset, output_path: str | os.PathLike, output_format: str, + output_name: str | None = None, chunks: dict | None = None, overwrite: bool = False, compute: bool = True, @@ -48,6 +49,8 @@ def write_dataset( Output folder path. output_format: {"netcdf", "zarr"} Output data container type. + output_name: str, optional + Output file name. chunks : dict, optional Chunking layout to be written to new files. If None, chunking will be left to the relevant backend engine. overwrite : bool @@ -64,11 +67,15 @@ def write_dataset( if isinstance(output_path, str): output_path = Path(output_path) - outfile = name_output_file(ds, output_format) - outfile_path = output_path.joinpath(outfile) + if not output_name: + output_name = name_output_file(ds, output_format) + else: + output_name = str(output_name) + + outfile_path = output_path.joinpath(output_name) if overwrite and outfile_path.exists(): - logging.warning(f"Removing existing {output_format} files for {outfile}.") + logging.warning(f"Removing existing {output_format} files for {output_name}.") if outfile_path.is_dir(): shutil.rmtree(outfile_path) if outfile_path.is_file(): @@ -78,7 +85,7 @@ def write_dataset( freq = ds.attrs["frequency"] # TOD0: check that this is really there chunks = fetch_chunk_config(priority="time", freq=freq, dims=ds.dims) - logging.info(f"Writing {outfile}.") + logging.info(f"Writing {output_name}.") write_object = delayed_write( ds, outfile_path, diff --git a/miranda/io/utils.py b/miranda/io/utils.py index 33da8643..15fc771c 100644 --- a/miranda/io/utils.py +++ b/miranda/io/utils.py @@ -33,7 +33,9 @@ def name_output_file( - ds_or_dict: xr.Dataset | dict[str, str], output_format: str + ds_or_dict: xr.Dataset | dict[str, str], + 
output_format: str, + data_vars: str | None = None, ) -> str: """Name an output file based on facets within a Dataset or a dictionary. @@ -43,6 +45,8 @@ def name_output_file( A miranda-converted Dataset or a dictionary containing the appropriate facets. output_format : {"netcdf", "zarr"} Output filetype to be used for generating filename suffix. + data_vars : str, optional + If using a Dataset, the name of the data variable to be used for naming the file. Returns ------- @@ -62,7 +66,9 @@ def name_output_file( facets["suffix"] = suffix if isinstance(ds_or_dict, xr.Dataset): - if len(ds_or_dict.data_vars) == 1: + if data_vars is not None: + facets["variable"] = data_vars + elif len(ds_or_dict.data_vars) == 1: facets["variable"] = list(ds_or_dict.data_vars.keys())[0] elif ( len(ds_or_dict.data_vars) == 2 @@ -73,7 +79,7 @@ def name_output_file( ][0] else: raise NotImplementedError( - f"Too many `data_vars` in Dataset: {' ,'.join(ds_or_dict.data_vars.keys())}." + f"Too many `data_vars` in Dataset: {', '.join(ds_or_dict.data_vars.keys())}." 
) for f in [ "bias_adjust_project", diff --git a/miranda/preprocess/__init__.py b/miranda/preprocess/__init__.py index 0ae1f1d6..4601a7cc 100644 --- a/miranda/preprocess/__init__.py +++ b/miranda/preprocess/__init__.py @@ -1 +1,6 @@ """Preprocessing tools for Miranda.""" +from __future__ import annotations + +from ._eccc_homogenized import * +from ._eccc_obs import * +from ._eccc_summaries import * diff --git a/miranda/preprocess/_eccc_homogenized.py b/miranda/preprocess/_eccc_homogenized.py index f400b5b1..33f7c9ca 100644 --- a/miranda/preprocess/_eccc_homogenized.py +++ b/miranda/preprocess/_eccc_homogenized.py @@ -9,6 +9,8 @@ import pandas as pd import xarray as xr +from miranda.io import write_dataset +from miranda.io.utils import name_output_file from miranda.preprocess._data_definitions import load_json_data_mappings from miranda.preprocess._treatments import basic_metadata_conversion from miranda.scripting import LOGGING_CONFIG @@ -16,7 +18,29 @@ logging.config.dictConfig(LOGGING_CONFIG) logger = logging.Logger("miranda") -__all__ = ["convert_ahccd", "convert_ahccd_fwf_file"] +__all__ = ["convert_ahccd", "convert_ahccd_fwf_file", "merge_ahccd"] + + +def _ahccd_variable_code(code: str): + config = load_json_data_mappings("eccc-homogenized") + variable_codes = {} + for variable_code in config["variables"]: + variable_name = config["variables"][variable_code].get("_variable_name") + if variable_name: + variable_codes[variable_name] = variable_code + else: + raise AttributeError( + f"Variable `{variable_code}` is not properly configured. Verify JSON." 
+ ) + + if code in variable_codes.values(): + variable = code + else: + variable = variable_codes.get(code) + if not variable: + raise NotImplementedError(f"Variable `{code}` not supported.") + + return variable def _ahccd_variable_metadata( @@ -27,7 +51,7 @@ def _ahccd_variable_metadata( Parameters ---------- - variable_code: {"dm", "dn", "dr", "ds", "dt", "dx"} + variable_code gen: {1, 2, 3} Returns @@ -40,11 +64,9 @@ def _ahccd_variable_metadata( config = load_json_data_mappings("eccc-homogenized") metadata = basic_metadata_conversion("eccc-homogenized", config) + code = _ahccd_variable_code(variable_code) - variable_meta = metadata["variables"].get(variable_code) - if not variable_meta: - raise NotImplementedError(f"Variable `{variable_code}` not supported.") - + variable_meta = metadata["variables"].get(code) variable_name = variable_meta.get("_variable_name") if variable_name: variable_meta = {variable_name: variable_meta} @@ -53,24 +75,25 @@ def _ahccd_variable_metadata( variable_meta = {variable_code: variable_meta} header = metadata["Header"] + to_delete = [] # Conditional handling of global attributes based on generation for field in [f for f in header if f.startswith("_")]: if isinstance(header[field], bool): - if header[field] and field[1:] == "variable": + if header[field] and field == "_variable": header[field[1:]] = variable_name - elif isinstance(header[field], dict): attr_treatment = header[field]["generation"] if field in ["_citation" "_product"]: for attribute, value in attr_treatment.items(): if attribute == generation: header[field[1:]] = value - else: raise AttributeError( f"Attribute treatment configuration for field `{field}` is not properly configured. Verify JSON." 
) + to_delete.append(field) + for field in to_delete: del header[field] return variable_meta, header @@ -167,9 +190,7 @@ def convert_ahccd_fwf_file( ------- xarray.Dataset """ - code = dict(tasmax="dx", tasmin="dn", tas="dm", pr="dt", prsn="ds", prlp="dr").get( - variable - ) + code = _ahccd_variable_code(variable) variable_meta, global_attrs = _ahccd_variable_metadata(code, generation) col_names, cols_specs, header = _ahccd_column_definitions(code) @@ -282,8 +303,10 @@ def convert_ahccd( data_source: str | Path, output_dir: str | Path, variable: str, + *, generation: int, merge: bool = False, + overwrite: bool = False, ) -> None: """Convert Adjusted and Homogenized Canadian Climate Dataset files. @@ -294,6 +317,7 @@ def convert_ahccd( variable: str generation: int merge: bool + overwrite: bool Returns ------- @@ -302,10 +326,7 @@ def convert_ahccd( output_dir = Path(output_dir).resolve().joinpath(variable) output_dir.mkdir(parents=True, exist_ok=True) - code = dict(tasmax="dx", tasmin="dn", tas="dm", pr="dt", prsn="ds", prlp="dr").get( - variable - ) - + code = _ahccd_variable_code(variable) var_meta, global_attrs = _ahccd_variable_metadata(code, generation) ( col_names, @@ -343,14 +364,11 @@ def convert_ahccd( # Convert station .txt files to netcdf for ff in Path(data_source).glob(f"{code}*.txt"): outfile = output_dir.joinpath(ff.name.replace(".txt", ".nc")) - if not outfile.exists(): + if not outfile.exists() or overwrite: logger.info(ff.name) station_id = ff.name[2:].split(".txt")[0] - try: - metadata_st = metadata[metadata["stnid"] == int(station_id)] - except ValueError: - metadata_st = metadata[metadata["stnid"] == station_id] + metadata_st = metadata[metadata["stnid"] == station_id] if len(metadata_st) == 1: ds_out = convert_ahccd_fwf_file( @@ -363,38 +381,70 @@ def convert_ahccd( logger.warning( f"metadata info for station {ff.name} not found : skipping" ) - if not merge: - return + else: + logger.info(f"{outfile.name} already exists: Skipping...") + if 
merge: + merge_ahccd(data_source, output_dir, variable) + return + + +def merge_ahccd( + data_source: str | Path, + output_dir: str | Path | None = None, + variable: str | None = None, + overwrite: bool = False, +) -> None: + """Merge Adjusted and Homogenized Canadian Climate Dataset files.""" + if variable: + code = _ahccd_variable_code(variable) + glob_pattern = f"{code}*.nc" + output_dir = Path(output_dir).resolve().joinpath(variable) + else: + glob_pattern = "*.nc" + output_dir = Path(output_dir).resolve() + output_dir.mkdir(parents=True, exist_ok=True) + + # Merge individual stations to single .nc file + ds_ahccd = xr.open_mfdataset( + list(data_source.glob(glob_pattern)), concat_dim="station", combine="nested" + ) - # merge individual stations to single .nc file - ncfiles = list(output_dir.glob(f"{code}*.nc")) - outfile = output_dir.parent.joinpath( - "merged_stations", f"ahccd_gen{generation}_{variable}.nc" + for coord in ds_ahccd.coords: + # xarray object datatypes mix string and int (e.g. stnid) convert to string for merged nc files + # Do not apply to datetime object + if coord != "time" and ds_ahccd[coord].dtype == "O": + ds_ahccd[coord] = ds_ahccd[coord].astype(str) + + variables_found = set() + for v in ds_ahccd.data_vars: + # xarray object datatypes mix string and int (e.g. stnid) convert to string for merged nc files + # Do not apply to flag timeseries + if ds_ahccd[v].dtype == "O" and "flag" not in v: + ds_ahccd[v] = ds_ahccd[v].astype(str) + try: + variables_found.add(_ahccd_variable_code(str(v))) + except NotImplementedError: + pass + + # Name output file + ds_ahccd.attrs["variable"] = ", ".join(variables_found) + variables = "-".join(variables_found) + output_name = name_output_file(ds_ahccd, "netcdf", variables) + logger.info( + f"Many variables found. Merging station files in {data_source} as `{output_name}`." 
) - if not outfile.exists(): - logger.info("merging stations :", variable) - ds_ahccd = xr.open_mfdataset( - ncfiles, concat_dim="station", combine="nested" - ).load() - - for coord in ds_ahccd.coords: - # xarray object datatypes mix string and int (e.g. stnid) convert to string for merged nc files - # Do not apply to datetime object - if coord != "time" and ds_ahccd[coord].dtype == "O": - ds_ahccd[coord] = ds_ahccd[coord].astype(str) - - for v in ds_ahccd.data_vars: - # xarray object datatypes mix string and int (e.g. stnid) convert to string for merged nc files - # Do not apply to flag timeseries - if ds_ahccd[v].dtype == "O" and "flag" not in v: - logger.info(v) - ds_ahccd[v] = ds_ahccd[v].astype(str) - - outfile.parent.mkdir(parents=True, exist_ok=True) - ds_ahccd.to_netcdf(outfile, engine="h5netcdf", mode="w") + try: + logger.info(f"Writing merged file to: {output_dir}.") + write_dataset( + ds_ahccd, + output_dir, + output_format="netcdf", + output_name=output_name, + chunks={"time": 365}, + overwrite=overwrite, + compute=True, + ) del ds_ahccd - for nc in outfile.parent.glob("*.nc"): - logger.info(nc) - ds = xr.open_dataset(nc) - logger.info(ds) + except FileExistsError: + logger.info("Merged file already exists. 
Use overwrite=`True` to overwrite.") diff --git a/miranda/preprocess/configs/ahccd_gen3_temperature.csv b/miranda/preprocess/configs/ahccd_gen3_temperature.csv index 8c56a6b5..4a65dc15 100644 --- a/miranda/preprocess/configs/ahccd_gen3_temperature.csv +++ b/miranda/preprocess/configs/ahccd_gen3_temperature.csv @@ -24,7 +24,7 @@ No,StnId,Station name,Prov,FromYear,FromMonth,ToYear,ToMonth,%Miss,Lat(deg),Long 21,1161663,CLINTON_AUT,BC,1993,1,2019,12,4.6,51.1,-121.5,105,y,y 22,1021830,COMOX,BC,1935,11,2019,12,1.2,49.7,-124.9,2,y,n 23,1021960,CORTES_ISLAND,BC,1947,3,2019,2,9.9,50,-124.9,1,y,n -24,1012010,COWICHAN_BAY_CHERRY_,BC,1913,10,1984,3,7.7,48.7,-123.5,0,n,n +24,1012010,COWICHAN_BAY_CHERRY,BC,1913,10,1984,3,7.7,48.7,-123.5,0,n,n 25,1152106,CRANBROOK,BC,1901,1,2019,12,6.6,49.6,-115.7,92,y,y 26,114B1F0,CRESTON,BC,1912,6,2019,12,0.5,49,-116.5,64,y,y 27,1022250,CUMBERLAND,BC,1922,5,1977,6,4.7,49.6,-125,15,n,n @@ -102,7 +102,7 @@ No,StnId,Station name,Prov,FromYear,FromMonth,ToYear,ToMonth,%Miss,Lat(deg),Long 99,1176755,REVELSTOKE,BC,1898,5,2019,12,7.3,50.9,-118.1,44,y,y 100,1016940,SAANICHTON_CDA,BC,1914,3,2019,7,0.6,48.6,-123.4,6,n,n 101,1167337,SALMON_ARM,BC,1911,7,2019,12,1.1,50.5,-119.3,41,y,n -102,1016995,SALTSPRING_,BC,1909,11,2019,12,1,48.8,-123.5,4,y,n +102,1016995,SALTSPRING,BC,1909,11,2019,12,1,48.8,-123.5,4,y,n 103,1057051,SANDSPIT,BC,1945,9,2019,12,4.2,53.2,-131.8,0,y,y 104,1017099,SATURNA_CAPMON,BC,1989,6,2019,12,3,48.7,-123.1,17,y,y 105,1017230,SHAWNIGAN_LAKE,BC,1913,4,2019,12,0.6,48.6,-123.6,15,n,n @@ -620,7 +620,7 @@ No,StnId,Station name,Prov,FromYear,FromMonth,ToYear,ToMonth,%Miss,Lat(deg),Long 617,7055122,MONT_JOLI,QUE,1875,10,2019,12,0.6,48.6,-68.2,5,y,y 618,7035160,MONT_LAURIER,QUE,1920,7,2014,6,7.2,46.5,-75.5,24,y,n 619,7024745,MONTREAL_TAVISH,QUE,1871,7,2019,12,2.8,45.5,-73.5,7,y,n -620,702S006,MONTREAL__TRUDEAU_IN,QUE,1953,1,2019,12,0.5,45.4,-73.7,3,y,y +620,702S006,MONTREAL_TRUDEAU_INTERNATIONAL,QUE,1953,1,2019,12,0.5,45.4,-73.7,3,y,y 
621,7045401,NATASHQUAN,QUE,1914,10,2019,12,4.1,50.1,-61.8,1,y,y 622,7055422,NEW_CARLISLE,QUE,1963,1,2019,12,17.8,48,-65.3,4,y,n 623,7025442,NICOLET,QUE,1913,11,2019,12,2.9,46.2,-72.6,0,y,n @@ -657,8 +657,8 @@ No,StnId,Station name,Prov,FromYear,FromMonth,ToYear,ToMonth,%Miss,Lat(deg),Long 654,7016800,ST_ALBAN,QUE,1949,9,2019,10,2.3,46.7,-72,7,n,n 655,7066820,ST_AMBROISE,QUE,1954,9,2019,10,4.5,48.5,-71.3,12,n,n 656,702FQLF,ST_ANICET,QUE,1960,11,2019,12,2,45.1,-74.2,4,y,y -657,7056930,ST_CAMILLE_,QUE,1963,7,2019,10,2,46.4,-70.2,39,n,n -658,7016960,ST_CHARLES_DE_MANDE_,QUE,1976,6,2019,10,21.4,46.3,-73.3,16,n,n +657,7056930,ST_CAMILLE,QUE,1963,7,2019,10,2,46.4,-70.2,39,n,n +658,7016960,ST_CHARLES_DE_MANDE,QUE,1976,6,2019,10,21.4,46.3,-73.3,16,n,n 659,7017080,ST_COME,QUE,1950,12,2018,11,4.6,46.2,-73.7,24,n,n 660,7027083,ST_COME_DE_LINIERE,QUE,1965,9,2019,10,3.7,46,-70.5,24,n,n 661,7027200,ST_EPHREM,QUE,1929,2,2019,10,18.1,46,-70.9,31,n,n @@ -666,7 +666,7 @@ No,StnId,Station name,Prov,FromYear,FromMonth,ToYear,ToMonth,%Miss,Lat(deg),Long 663,7027259,ST_FLAVIEN,QUE,1963,1,2016,8,2.1,46.4,-71.5,13,n,n 664,7027302,ST_GUILLAUME,QUE,1963,1,2015,10,7.6,45.8,-72.7,4,n,n 665,7037310,ST_HIPPOLYTE,QUE,1961,2,2019,10,4.9,45.9,-74,36,n,n -666,7027329,ST_HUBERT_MONT_,QUE,1953,1,2019,12,0.8,45.5,-73.4,2,y,n +666,7027329,ST_HUBERT_MONT,QUE,1953,1,2019,12,0.8,45.5,-73.4,2,y,n 667,7027361,ST_HYACINTHE,QUE,1935,1,2019,10,8.4,45.5,-72.9,3,y,n 668,7037400,ST_JEROME,QUE,1932,5,2019,10,4.3,45.8,-74,17,n,n 669,7027516,ST_LUDGER,QUE,1964,10,2019,10,3.1,45.7,-70.6,33,n,n @@ -778,6 +778,6 @@ No,StnId,Station name,Prov,FromYear,FromMonth,ToYear,ToMonth,%Miss,Lat(deg),Long 775,8403603,ST_JOHN_WEST,NFLD,1950,11,2019,12,6.6,47.5,-52.7,11,y,y 776,8403619,ST_LAWRENCE,NFLD,1989,11,2019,12,14.6,46.9,-55.3,4,y,y 777,8403820,STEPHENVILLE,NFLD,1895,6,2019,12,6.6,48.5,-58.5,5,y,y -778,8403851,TERRA_NOVA_NAT_PARK_,NFLD,1962,3,2019,12,7.1,48.5,-53.9,10,y,y 
+778,8403851,TERRA_NOVA_NAT_PARK,NFLD,1962,3,2019,12,7.1,48.5,-53.9,10,y,y 779,8504177,WABUSH_LAKE,NFLD,1960,11,2019,12,0.8,52.9,-66.8,55,y,y 780,8404343,WRECKHOUSE,NFLD,1981,6,2019,12,1.5,47.7,-59.3,3,y,y diff --git a/miranda/preprocess/configs/eccc-homogenized_attrs.json b/miranda/preprocess/configs/eccc-homogenized_attrs.json index e35bba41..fc3efb6e 100644 --- a/miranda/preprocess/configs/eccc-homogenized_attrs.json +++ b/miranda/preprocess/configs/eccc-homogenized_attrs.json @@ -20,12 +20,14 @@ "dataset_id": "d6813de6-b20a-46cc-8990-01862ae15c5f", "documentation": "https://www.canada.ca/en/environment-climate-change/services/climate-change/canadian-centre-climate-services/display-download/technical-documentation-adjusted-climate-data.html", "domain": "AMNO", + "frequency": "day", "institution": "GovCan", "license": "https://climate.weather.gc.ca/prods_servs/attachment1_e.html", "license_preamble": "The data is owned by the Government of Canada (Environment and Climate Change Canada), and fall under the licence agreement for use of Environment and Climate Change Canada data.", "license_type": "permissive", "organization": "ECCC", "processing_level": "adjusted", + "project": "AHCCD", "realm": "atmos", "source": "AHCCD", "table_date": "2023-08-03", diff --git a/templates/ahccd_preprocess.py b/templates/ahccd_preprocess.py new file mode 100644 index 00000000..2f413a5f --- /dev/null +++ b/templates/ahccd_preprocess.py @@ -0,0 +1,10 @@ +from pathlib import Path + +from miranda.preprocess import convert_ahccd, merge_ahccd + +in_files = Path("~/Desktop/ec_data/ahccd").expanduser() +output = Path().cwd().parent / "test" +variable = "tas" + +convert_ahccd(in_files, output, variable, generation=3) +merge_ahccd(output.joinpath("tas"), output, variable, overwrite=True) From 9339f30ace1c23c7d9f2ce4bf853463c62e11e6b Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Mon, 7 Aug 2023 15:36:35 -0400 Subject: [PATCH 11/33] working 
version --- miranda/io/_output.py | 11 +++++-- miranda/io/data/ouranos_chunk_config.json | 14 ++++++++ miranda/preprocess/_eccc_homogenized.py | 39 +++++++++++------------ templates/ahccd_preprocess.py | 2 +- 4 files changed, 42 insertions(+), 24 deletions(-) diff --git a/miranda/io/_output.py b/miranda/io/_output.py index 8d0667dc..4027ffbd 100644 --- a/miranda/io/_output.py +++ b/miranda/io/_output.py @@ -82,8 +82,15 @@ def write_dataset( outfile_path.unlink() if chunks is None and "frequency" in ds.attrs: - freq = ds.attrs["frequency"] # TOD0: check that this is really there - chunks = fetch_chunk_config(priority="time", freq=freq, dims=ds.dims) + freq = ds.attrs.get("frequency") + if not freq: + raise ValueError( + "If 'chunks' are not provided, the 'frequency' attribute must be set." + ) + if "lat" in ds.dims and "lon" in ds.dims: + chunks = fetch_chunk_config(priority="time", freq=freq, dims=ds.dims) + elif "lat" not in ds.dims and "lon" not in ds.dims: + chunks = fetch_chunk_config(priority="stations", freq=freq, dims=ds.dims) logging.info(f"Writing {output_name}.") write_object = delayed_write( diff --git a/miranda/io/data/ouranos_chunk_config.json b/miranda/io/data/ouranos_chunk_config.json index 2ac759b7..0f18928d 100644 --- a/miranda/io/data/ouranos_chunk_config.json +++ b/miranda/io/data/ouranos_chunk_config.json @@ -37,6 +37,20 @@ } } }, + "stations": { + "1hr": { + "default": { + "station": 50, + "time": "5 years" + } + }, + "day": { + "default": { + "station": 200, + "time": "10 years" + } + } + }, "time": { "1hr": { "default": { diff --git a/miranda/preprocess/_eccc_homogenized.py b/miranda/preprocess/_eccc_homogenized.py index 33f7c9ca..ae08af01 100644 --- a/miranda/preprocess/_eccc_homogenized.py +++ b/miranda/preprocess/_eccc_homogenized.py @@ -3,6 +3,7 @@ import calendar import logging.config +import warnings from pathlib import Path import numpy as np @@ -29,9 +30,11 @@ def _ahccd_variable_code(code: str): if variable_name: 
variable_codes[variable_name] = variable_code else: - raise AttributeError( - f"Variable `{variable_code}` is not properly configured. Verify JSON." + warnings.warn( + f"Variable `{variable_code}` does not have accompanying `variable_name`. " + f"Verify JSON. Continuing with `{variable_code}` as `variable_name`." ) + variable_codes[variable_code] = variable_code if code in variable_codes.values(): variable = code @@ -69,6 +72,7 @@ def _ahccd_variable_metadata( variable_meta = metadata["variables"].get(code) variable_name = variable_meta.get("_variable_name") if variable_name: + variable_meta["original_variable_name"] = variable_code variable_meta = {variable_name: variable_meta} del variable_meta[variable_name]["_variable_name"] else: @@ -99,10 +103,6 @@ def _ahccd_variable_metadata( return variable_meta, header -def _ahccd_station_metadata(code): - pass - - def _ahccd_column_definitions( variable_code: str, ) -> tuple[dict, list[tuple[int, int]], int]: @@ -270,13 +270,8 @@ def convert_ahccd_fwf_file( ds_out[variable].attrs = variable_meta[variable] metadata = metadata.to_xarray().rename({"index": "station"}).drop_vars("station") - metadata = metadata.assign_coords( - { - "stnid": metadata["stnid"].astype(str), - "station_name": metadata["station_name"], - } - ) - ds_out = ds_out.assign_coords(station=metadata.stnid) + metadata = metadata.assign_coords(dict(station_name=metadata["station_name"])) + ds_out = ds_out.assign_coords(station=metadata.stnid.astype(str)) metadata = metadata.drop_vars(["stnid", "station_name"]) ds_out[f"{variable}_flag"].attrs["long_name"] = variable_meta[variable]["long_name"] @@ -367,7 +362,7 @@ def convert_ahccd( if not outfile.exists() or overwrite: logger.info(ff.name) - station_id = ff.name[2:].split(".txt")[0] + station_id = ff.stem[2:] metadata_st = metadata[metadata["stnid"] == station_id] if len(metadata_st) == 1: @@ -410,14 +405,14 @@ def merge_ahccd( ) for coord in ds_ahccd.coords: - # xarray object datatypes mix string and int 
(e.g. stnid) convert to string for merged nc files + # xarray object datatypes mix string and int (e.g. station) convert to string for merged nc files # Do not apply to datetime object if coord != "time" and ds_ahccd[coord].dtype == "O": ds_ahccd[coord] = ds_ahccd[coord].astype(str) variables_found = set() for v in ds_ahccd.data_vars: - # xarray object datatypes mix string and int (e.g. stnid) convert to string for merged nc files + # xarray object datatypes mix string and int (e.g. station) convert to string for merged nc files # Do not apply to flag timeseries if ds_ahccd[v].dtype == "O" and "flag" not in v: ds_ahccd[v] = ds_ahccd[v].astype(str) @@ -428,11 +423,14 @@ def merge_ahccd( # Name output file ds_ahccd.attrs["variable"] = ", ".join(variables_found) - variables = "-".join(variables_found) + if len(variables_found) > 1: + variables = "-".join(variables_found) + logger.info( + f"Many variables found. Merging station and variables files in {data_source}." + ) + else: + variables = variables_found.pop() output_name = name_output_file(ds_ahccd, "netcdf", variables) - logger.info( - f"Many variables found. Merging station files in {data_source} as `{output_name}`." 
- ) try: logger.info(f"Writing merged file to: {output_dir}.") @@ -441,7 +439,6 @@ def merge_ahccd( output_dir, output_format="netcdf", output_name=output_name, - chunks={"time": 365}, overwrite=overwrite, compute=True, ) diff --git a/templates/ahccd_preprocess.py b/templates/ahccd_preprocess.py index 2f413a5f..27a88072 100644 --- a/templates/ahccd_preprocess.py +++ b/templates/ahccd_preprocess.py @@ -7,4 +7,4 @@ variable = "tas" convert_ahccd(in_files, output, variable, generation=3) -merge_ahccd(output.joinpath("tas"), output, variable, overwrite=True) +merge_ahccd(output.joinpath("tas"), output.joinpath("merged"), variable, overwrite=True) From e096824729c214bd7b4cd78475e7ae54afe81c5b Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Mon, 7 Aug 2023 16:54:14 -0400 Subject: [PATCH 12/33] begin work on obs-summaries --- miranda/preprocess/_data_definitions.py | 40 ++++- miranda/preprocess/_eccc_homogenized.py | 40 ++--- miranda/preprocess/_eccc_obs.py | 2 +- miranda/preprocess/_eccc_summaries.py | 2 +- .../configs/eccc-homogenized_attrs.json | 4 +- ...attrs.json => eccc-obs-summary_attrs.json} | 147 ++++++++---------- .../preprocess/configs/eccc-obs_attrs.json | 4 +- ...preprocess.py => eccc-ahccd_preprocess.py} | 0 templates/eccc-obs_preprocess.py | 0 templates/eccc_ahccd_conversion.py | 28 ---- 10 files changed, 121 insertions(+), 146 deletions(-) rename miranda/preprocess/configs/{eccc-obs-summary_cf_attrs.json => eccc-obs-summary_attrs.json} (54%) rename templates/{ahccd_preprocess.py => eccc-ahccd_preprocess.py} (100%) create mode 100644 templates/eccc-obs_preprocess.py delete mode 100644 templates/eccc_ahccd_conversion.py diff --git a/miranda/preprocess/_data_definitions.py b/miranda/preprocess/_data_definitions.py index 815b1048..f73251e9 100644 --- a/miranda/preprocess/_data_definitions.py +++ b/miranda/preprocess/_data_definitions.py @@ -1,13 +1,14 @@ from __future__ import annotations import json +import 
warnings from pathlib import Path from typing import Any _config_folder = Path(__file__).resolve().parent / "configs" -__all__ = ["load_json_data_mappings"] +__all__ = ["load_json_data_mappings", "find_project_variable_codes"] def load_json_data_mappings(project: str) -> dict[str, Any]: @@ -35,3 +36,40 @@ def load_json_data_mappings(project: str) -> dict[str, Any]: raise NotImplementedError(f"Project not supported: {project}") return metadata_definition + + +def find_project_variable_codes(code: str, table: str) -> str: + """Find the variable code for a given variable name and project. + + Parameters + ---------- + code : str + Variable name. + table : str + Project name. + + Returns + ------- + str + """ + config = load_json_data_mappings(table) + variable_codes = {} + for variable_code in config["variables"]: + variable_name = config["variables"][variable_code].get("_variable_name") + if variable_name: + variable_codes[variable_name] = variable_code + else: + warnings.warn( + f"Variable `{variable_code}` does not have accompanying `variable_name`. " + f"Verify JSON. Continuing with `{variable_code}` as `variable_name`." 
+ ) + variable_codes[variable_code] = variable_code + + if code in variable_codes.values(): + variable = code + else: + variable = variable_codes.get(code) + if not variable: + raise NotImplementedError(f"Variable `{code}` not supported.") + + return variable diff --git a/miranda/preprocess/_eccc_homogenized.py b/miranda/preprocess/_eccc_homogenized.py index ae08af01..7e4a7559 100644 --- a/miranda/preprocess/_eccc_homogenized.py +++ b/miranda/preprocess/_eccc_homogenized.py @@ -3,7 +3,6 @@ import calendar import logging.config -import warnings from pathlib import Path import numpy as np @@ -12,7 +11,10 @@ from miranda.io import write_dataset from miranda.io.utils import name_output_file -from miranda.preprocess._data_definitions import load_json_data_mappings +from miranda.preprocess._data_definitions import ( + find_project_variable_codes, + load_json_data_mappings, +) from miranda.preprocess._treatments import basic_metadata_conversion from miranda.scripting import LOGGING_CONFIG @@ -22,30 +24,6 @@ __all__ = ["convert_ahccd", "convert_ahccd_fwf_file", "merge_ahccd"] -def _ahccd_variable_code(code: str): - config = load_json_data_mappings("eccc-homogenized") - variable_codes = {} - for variable_code in config["variables"]: - variable_name = config["variables"][variable_code].get("_variable_name") - if variable_name: - variable_codes[variable_name] = variable_code - else: - warnings.warn( - f"Variable `{variable_code}` does not have accompanying `variable_name`. " - f"Verify JSON. Continuing with `{variable_code}` as `variable_name`." 
- ) - variable_codes[variable_code] = variable_code - - if code in variable_codes.values(): - variable = code - else: - variable = variable_codes.get(code) - if not variable: - raise NotImplementedError(f"Variable `{code}` not supported.") - - return variable - - def _ahccd_variable_metadata( variable_code: str, gen: int, @@ -67,7 +45,7 @@ def _ahccd_variable_metadata( config = load_json_data_mappings("eccc-homogenized") metadata = basic_metadata_conversion("eccc-homogenized", config) - code = _ahccd_variable_code(variable_code) + code = find_project_variable_codes(variable_code, "eccc-homogenized") variable_meta = metadata["variables"].get(code) variable_name = variable_meta.get("_variable_name") @@ -190,7 +168,7 @@ def convert_ahccd_fwf_file( ------- xarray.Dataset """ - code = _ahccd_variable_code(variable) + code = find_project_variable_codes(variable, "eccc-homogenized") variable_meta, global_attrs = _ahccd_variable_metadata(code, generation) col_names, cols_specs, header = _ahccd_column_definitions(code) @@ -321,7 +299,7 @@ def convert_ahccd( output_dir = Path(output_dir).resolve().joinpath(variable) output_dir.mkdir(parents=True, exist_ok=True) - code = _ahccd_variable_code(variable) + code = find_project_variable_codes(variable, "eccc-homogenized") var_meta, global_attrs = _ahccd_variable_metadata(code, generation) ( col_names, @@ -391,7 +369,7 @@ def merge_ahccd( ) -> None: """Merge Adjusted and Homogenized Canadian Climate Dataset files.""" if variable: - code = _ahccd_variable_code(variable) + code = find_project_variable_codes(variable, "eccc-homogenized") glob_pattern = f"{code}*.nc" output_dir = Path(output_dir).resolve().joinpath(variable) else: @@ -417,7 +395,7 @@ def merge_ahccd( if ds_ahccd[v].dtype == "O" and "flag" not in v: ds_ahccd[v] = ds_ahccd[v].astype(str) try: - variables_found.add(_ahccd_variable_code(str(v))) + variables_found.add(find_project_variable_codes(str(v), "eccc-homogenized")) except NotImplementedError: pass diff --git 
a/miranda/preprocess/_eccc_obs.py b/miranda/preprocess/_eccc_obs.py index 7cfa9249..5691ee81 100644 --- a/miranda/preprocess/_eccc_obs.py +++ b/miranda/preprocess/_eccc_obs.py @@ -37,7 +37,7 @@ from xclim.core.units import convert_units_to from miranda.archive import group_by_length -from miranda.convert import load_json_data_mappings +from miranda.preprocess._data_definitions import load_json_data_mappings from miranda.scripting import LOGGING_CONFIG from miranda.storage import file_size, report_file_size from miranda.utils import generic_extract_archive diff --git a/miranda/preprocess/_eccc_summaries.py b/miranda/preprocess/_eccc_summaries.py index 3c31ba32..118d1c31 100755 --- a/miranda/preprocess/_eccc_summaries.py +++ b/miranda/preprocess/_eccc_summaries.py @@ -30,7 +30,7 @@ __all__ = ["extract_daily_summaries", "daily_summaries_to_netcdf"] eccc_metadata = json.load( - open(Path(__file__).resolve().parent / "configs" / "eccc-obs-summary_cf_attrs.json") + open(Path(__file__).resolve().parent / "configs" / "eccc-obs-summary_attrs.json") )["variable_entry"] diff --git a/miranda/preprocess/configs/eccc-homogenized_attrs.json b/miranda/preprocess/configs/eccc-homogenized_attrs.json index fc3efb6e..a56c5b51 100644 --- a/miranda/preprocess/configs/eccc-homogenized_attrs.json +++ b/miranda/preprocess/configs/eccc-homogenized_attrs.json @@ -19,7 +19,7 @@ "contact": "info.cccs-ccsc@canada.ca", "dataset_id": "d6813de6-b20a-46cc-8990-01862ae15c5f", "documentation": "https://www.canada.ca/en/environment-climate-change/services/climate-change/canadian-centre-climate-services/display-download/technical-documentation-adjusted-climate-data.html", - "domain": "AMNO", + "domain": "CAN", "frequency": "day", "institution": "GovCan", "license": "https://climate.weather.gc.ca/prods_servs/attachment1_e.html", @@ -29,7 +29,7 @@ "processing_level": "adjusted", "project": "AHCCD", "realm": "atmos", - "source": "AHCCD", + "source": "msc", "table_date": "2023-08-03", "table_id": "ECCC", 
"type": "station-obs" diff --git a/miranda/preprocess/configs/eccc-obs-summary_cf_attrs.json b/miranda/preprocess/configs/eccc-obs-summary_attrs.json similarity index 54% rename from miranda/preprocess/configs/eccc-obs-summary_cf_attrs.json rename to miranda/preprocess/configs/eccc-obs-summary_attrs.json index b21f224e..11b3dc51 100644 --- a/miranda/preprocess/configs/eccc-obs-summary_cf_attrs.json +++ b/miranda/preprocess/configs/eccc-obs-summary_attrs.json @@ -1,173 +1,160 @@ { "Header": { - "Conventions": "CF-1.8", + "_miranda_version": true, + "_variable": true, + "acknowledgement": "This data is provided by Environment and Climate Change Canada (ECCC).", + "author": "Environment and Climate Change Canada (ECCC)", "contact": "info.cccs-ccsc@canada.ca", + "dataset_id": "b24efb37-11b6-5d03-ab19-5759f83db546", + "documentation": "https://climate.weather.gc.ca/doc/Technical_Documentation.pdf", + "domain": "CAN", + "frequency": "mon", "institution": "GovCan", - "int_missing_value": "-999", "license": "https://climate.weather.gc.ca/prods_servs/attachment1_e.html", + "license_preamble": "The data is owned by the Government of Canada (Environment and Climate Change Canada), and fall under the licence agreement for use of Environment and Climate Change Canada data.", "license_type": "permissive", - "missing_value": "1e20", "organization": "ECCC", "processing_level": "raw", + "product": "A cross-country summary of the averages and extremes for the month, including precipitation totals, max-min temperatures, and degree days.", + "project": "ECCC-SUMMARIES", "realm": "atmos", "source": "msc", - "table_date": "2023-03-23", + "table_date": "2023-08-07", + "table_id": "ECCC", "type": "station-obs" }, "variable_entry": { "cdd": { - "add_offset": 0, + "_variable_name": "cdd", "cell_methods": "time: sum", "comments": "Station data converted from Cool Deg Days (°C)", "frequency": "day", "grid_mapping": "regular_lon_lat", "long_name": "Number of Degrees Celsius Over a Mean 
Temperature of 18 °C", - "original_variable": "Cool Deg Days (°C)", - "out_name": "cdd", - "scale_factor": 1, - "standard_name": "cooling_degree_days", + "original_field": "Cool Deg Days (°C)", "type": "real", - "units": "C" + "units": "degC" }, "hdd": { - "add_offset": 0, + "_variable_name": "hdd", "cell_methods": "time: sum", "comments": "Station data converted from Heat Deg Days (°C)", - "frequency": "day", + "frequency": "mon", "grid_mapping": "regular_lon_lat", "long_name": "Number of Degrees Celsius Under a Mean Temperature of 18 °C", - "original_variable": "Heat Deg Days (°C)", - "out_name": "hdd", - "scale_factor": 1, - "standard_name": "heating_degree_days", + "original_field": "Heat Deg Days (°C)", "type": "real", - "units": "C" + "units": "degC" }, "pr": { - "add_offset": 0, + "_variable_name": "pr", "cell_methods": "time: mean", "comments": "Station data converted from Total Precip (mm) using a density of 1000 kg/m³", - "frequency": "day", + "frequency": "mon", "grid_mapping": "regular_lon_lat", "long_name": "Precipitation", - "original_variable": "Total Precip (mm)", - "out_name": "pr", - "scale_factor": 1.1574074074074073e-05, - "standard_name": "precipitation_flux", + "original_field": "Total Precip (mm)", "type": "real", - "units": "kg m-2 s-1" + "units": "mm" }, "prlp": { - "add_offset": 0, + "_variable_name": "prlp", "cell_methods": "time: mean", "comments": "Station data converted from Total Rain (mm) using a density of 1000 kg/m³", - "frequency": "day", + "frequency": "mon", "grid_mapping": "regular_lon_lat", "long_name": "Liquid Precipitation", - "original_variable": "Total Rain (mm)", - "out_name": "prlp", - "scale_factor": 1.1574074074074073e-05, - "standard_name": "rainfall_flux", + "original_field": "Total Rain (mm)", "type": "real", - "units": "kg m-2 s-1" + "units": "mm" }, "prsn": { - "add_offset": 0, + "_variable_name": "prsn", "cell_methods": "time: mean", "comments": "station data converted from Total Snow (cm) using a density of 100 
kg/m³", - "frequency": "day", + "frequency": "mon", "grid_mapping": "regular_lon_lat", "long_name": "Snowfall Flux", - "original_variable": "Total Snow (cm)", - "out_name": "prsn", - "scale_factor": 1.1574074074074073e-05, - "standard_name": "snowfall_flux", + "original_field": "Total Snow (cm)", "type": "real", - "units": "kg m-2 s-1" + "units": "cm" }, "sfcWindAz": { - "add_offset": 0, + "_variable_name": "sfcWindAz", "cell_methods": "time: mean", "comments": "Station data converted from Dir of Max Gust (10s deg)", - "frequency": "day", + "frequency": "mon", "grid_mapping": "regular_lon_lat", "long_name": "Direction from which the Daily Maximum Near-Surface Gust Wind Speed maximum Blows", - "original_variable": "Dir of Max Gust (10s deg)", - "out_name": "sfcWindAz", - "scale_factor": 1, - "standard_name": "wind_direction", + "original_field": "Dir of Max Gust (10s deg)", "type": "real", "units": "degree" }, "sfcWindMax": { - "add_offset": 0, + "_variable_name": "sfcWindMax", "cell_methods": "time: max", "comments": "Station data converted from Spd of Max Gust (km/h)", - "frequency": "day", + "frequency": "mon", "grid_mapping": "regular_lon_lat", "long_name": "Daily Maximum Near-Surface Gust Wind Speed maximum", - "original_variable": "Spd of Max Gust (km/h)", - "out_name": "sfcWindMax", - "scale_factor": 0.2777777777777778, - "standard_name": "wind_speed_of_gust maximum", + "original_field": "Spd of Max Gust (km/h)", "type": "real", - "units": "m s-1" + "units": "km h-1" }, "snd": { - "add_offset": 0, + "_variable_name": "snd", "cell_methods": "time: mean", "comments": "Station data converted from Snow on Grnd (cm)", - "frequency": "day", + "frequency": "mon", "grid_mapping": "regular_lon_lat", "long_name": "Snow Depth", - "original_variable": "Snow on Grnd (cm)", - "out_name": "snd", - "scale_factor": 0.01, - "standard_name": "surface_snow_thickness", + "original_field": "Snow on Grnd (cm)", "type": "real", - "units": "m" + "units": "cm" }, "tas": { - 
"add_offset": 273.15, + "_variable_name": "tas", "cell_methods": "time: mean", - "comments": "Station data converted from Mean Temp (°C)", - "frequency": "day", + "comments": "Station data converted from Mean Temperature (°C)", + "frequency": "mon", "grid_mapping": "regular_lon_lat", "long_name": "Near-Surface Air Temperature", - "original_variable": "Mean Temp (°C)", - "out_name": "tas", - "scale_factor": 1, - "standard_name": "air_temperature", + "original_field": "Mean Temperature", + "type": "real", + "units": "degC" + }, + "tas_days": { + "_variable_name": "tas_days", + "cell_methods": "time: count", + "comments": "Station data converted from Days With Valid Mean Temperature", + "frequency": "mon", + "grid_mapping": "regular_lon_lat", + "long_name": "Number of Days With Valid Near-Surface Air Temperature", + "original_field": "Days With Valid Mean Temp", "type": "real", - "units": "K" + "units": "1" }, "tasmax": { - "add_offset": 273.15, + "_variable_name": "tasmax", "cell_methods": "time: maximum", "comments": "station data converted from Max Temp (°C)", - "frequency": "day", + "frequency": "mon", "grid_mapping": "regular_lon_lat", "long_name": "Daily Maximum Near-Surface Air Temperature", - "original_variable": "Max Temp (°C)", - "out_name": "tasmax", - "scale_factor": 1, - "standard_name": "air_temperature", + "original_field": "Max Temp (°C)", "type": "real", - "units": "K" + "units": "degC" }, "tasmin": { - "add_offset": 273.15, + "_variable_name": "tasmin", "cell_methods": "time: minimum", "comments": "Station data converted from Min Temp (°C)", - "frequency": "day", + "frequency": "mon", "grid_mapping": "regular_lon_lat", "long_name": "Daily Minimum Near-Surface Air Temperature", - "original_variable": "Min Temp (°C)", - "out_name": "tasmin", - "scale_factor": 1, - "standard_name": "air_temperature", + "original_field": "Min Temp (°C)", "type": "real", - "units": "K" + "units": "degC" } } } diff --git a/miranda/preprocess/configs/eccc-obs_attrs.json 
b/miranda/preprocess/configs/eccc-obs_attrs.json index 7265ca71..82d93e55 100644 --- a/miranda/preprocess/configs/eccc-obs_attrs.json +++ b/miranda/preprocess/configs/eccc-obs_attrs.json @@ -12,7 +12,7 @@ ], "acknowledgement": "This data is provided by Environment and Climate Change Canada (ECCC).", "author": "Environment and Climate Change Canada (ECCC)", - "contact": "climatcentre-climatecentral@ec.gc.ca", + "contact": "ccsc-cccs@ec.gc.ca", "documentation": "https://climate.weather.gc.ca/doc/Technical_Documentation.pdf", "institution": "GovCan", "license": "https://climate.weather.gc.ca/prods_servs/attachment1_e.html", @@ -20,7 +20,7 @@ "license_type": "permissive", "organization": "ECCC", "processing_level": "raw", - "source": "ECCC-OBS", + "source": "msc", "table_date": "2023-08-02", "title": "Environment and Climate Change Canada (ECCC) weather station observations", "type": "station-obs", diff --git a/templates/ahccd_preprocess.py b/templates/eccc-ahccd_preprocess.py similarity index 100% rename from templates/ahccd_preprocess.py rename to templates/eccc-ahccd_preprocess.py diff --git a/templates/eccc-obs_preprocess.py b/templates/eccc-obs_preprocess.py new file mode 100644 index 00000000..e69de29b diff --git a/templates/eccc_ahccd_conversion.py b/templates/eccc_ahccd_conversion.py deleted file mode 100644 index e29dd643..00000000 --- a/templates/eccc_ahccd_conversion.py +++ /dev/null @@ -1,28 +0,0 @@ -from os import getenv -from pathlib import Path - -from miranda.eccc import convert_ahccd - -if __name__ == "__main__": - in_files = getenv("in") - out_files = getenv("out") - - source_files = Path(in_files) - output_path = Path(out_files) - - source_var_gens = { - "Generation3/Homog_daily_mean_temp_v2019/": ("tas", 3), - "Generation3/Homog_daily_max_temp_v2019/": ("tasmax", 3), - "Generation3/Homog_daily_min_temp_v2019/": ("tasmin", 3), - "Generation2/Adj_Daily_Total_v2017/": ("pr", 2), - "Generation2/Adj_Daily_Snow_v2017/": ("prsn", 2), - 
"Generation2/Adj_Daily_Rain_v2017/": ("prlp", 2), - } - - for folder, (variable, generation) in source_var_gens.items(): - convert_ahccd( - source_files.expanduser().joinpath(folder), - output_path, - variable, - generation, - ) From 268cca198ddbe78b73431b93332fccbdcb0c8f55 Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Wed, 9 Aug 2023 12:26:17 -0400 Subject: [PATCH 13/33] finishing touches on ahccd --- miranda/preprocess/_eccc_homogenized.py | 56 ++++++++++++++---- miranda/preprocess/_eccc_obs.py | 79 ++----------------------- miranda/preprocess/eccc.py | 5 -- 3 files changed, 48 insertions(+), 92 deletions(-) diff --git a/miranda/preprocess/_eccc_homogenized.py b/miranda/preprocess/_eccc_homogenized.py index 7e4a7559..00a417da 100644 --- a/miranda/preprocess/_eccc_homogenized.py +++ b/miranda/preprocess/_eccc_homogenized.py @@ -4,6 +4,7 @@ import calendar import logging.config from pathlib import Path +from typing import Any, Dict, List, Tuple, Type import numpy as np import pandas as pd @@ -83,7 +84,7 @@ def _ahccd_variable_metadata( def _ahccd_column_definitions( variable_code: str, -) -> tuple[dict, list[tuple[int, int]], int]: +) -> tuple[dict, list[tuple[int, int]], dict[str, type[str | int | float] | Any], int]: config = load_json_data_mappings("eccc-homogenized") metadata = basic_metadata_conversion("eccc-homogenized", config) @@ -105,8 +106,25 @@ def _ahccd_column_definitions( "Joined", "RCS", ] + dtypes = [ + str, + str, + str, + str, + int, + int, + int, + int, + float, + float, + float, + int, + str, + str, + ] column_spaces = [(0, 5), (5, 6), (6, 8), (8, 9)] ii = 9 + # 31 days in a month for i in range(1, 32): column_spaces.append((ii, ii + 7)) ii += 7 @@ -128,8 +146,10 @@ def _ahccd_column_definitions( "elev (m)", "stns joined", ] + dtypes = [str, str, str, int, int, int, int, float, float, int, str] column_spaces = [(0, 4), (4, 5), (5, 7), (7, 8)] ii = 8 + # 31 days in a month for i in range(1, 32): 
column_spaces.append((ii, ii + 8)) ii += 8 @@ -144,8 +164,12 @@ def _ahccd_column_definitions( col.lower().split("(")[0].replace("%", "pct_").strip().replace(" ", "_"): col for col in list(column_names) } + # + column_dtypes = {} + for col in column_names.keys(): + column_dtypes[col] = dtypes[list(column_names.keys()).index(col)] - return column_names, column_spaces, header_row + return column_names, column_spaces, column_dtypes, header_row def convert_ahccd_fwf_file( @@ -171,9 +195,9 @@ def convert_ahccd_fwf_file( code = find_project_variable_codes(variable, "eccc-homogenized") variable_meta, global_attrs = _ahccd_variable_metadata(code, generation) - col_names, cols_specs, header = _ahccd_column_definitions(code) + column_names, column_spaces, column_dtypes, header = _ahccd_column_definitions(code) - df = pd.read_fwf(ff, header=header, colspecs=cols_specs) + df = pd.read_fwf(ff, header=header, colspecs=column_spaces, dtype=column_dtypes) if "pr" in variable: cols = list(df.columns[0:3]) cols = cols[0::2] @@ -302,8 +326,9 @@ def convert_ahccd( code = find_project_variable_codes(variable, "eccc-homogenized") var_meta, global_attrs = _ahccd_variable_metadata(code, generation) ( - col_names, - col_spaces, + column_names, + column_spaces, + column_dtypes, header_row, ) = _ahccd_column_definitions(code) @@ -321,11 +346,11 @@ def convert_ahccd( if "tas" in variable: metadata = pd.read_csv(metadata_source, header=2) - metadata.columns = col_names.keys() + metadata.columns = column_names.keys() elif "pr" in variable: metadata = pd.read_csv(metadata_source, header=3) - metadata.columns = col_names.keys() + metadata.columns = column_names.keys() for index, row in metadata.iterrows(): if isinstance(row["stnid"], str): metadata.loc[index, "stnid"] = metadata.loc[index, "stnid"].replace( @@ -336,8 +361,8 @@ def convert_ahccd( # Convert station .txt files to netcdf for ff in Path(data_source).glob(f"{code}*.txt"): - outfile = output_dir.joinpath(ff.name.replace(".txt", ".nc")) 
- if not outfile.exists() or overwrite: + output_name = ff.name.replace(".txt", ".nc") + if not output_dir.joinpath(output_name).exists() or overwrite: logger.info(ff.name) station_id = ff.stem[2:] @@ -349,13 +374,20 @@ def convert_ahccd( ) ds_out.attrs = global_attrs - ds_out.to_netcdf(outfile, engine="h5netcdf") + write_dataset( + ds_out, + output_dir, + output_format="netcdf", + output_name=output_name, + overwrite=overwrite, + compute=True, + ) else: logger.warning( f"metadata info for station {ff.name} not found : skipping" ) else: - logger.info(f"{outfile.name} already exists: Skipping...") + logger.info(f"{output_name} already exists: Skipping...") if merge: merge_ahccd(data_source, output_dir, variable) return diff --git a/miranda/preprocess/_eccc_obs.py b/miranda/preprocess/_eccc_obs.py index 5691ee81..f4601058 100644 --- a/miranda/preprocess/_eccc_obs.py +++ b/miranda/preprocess/_eccc_obs.py @@ -45,19 +45,14 @@ config.dictConfig(LOGGING_CONFIG) __all__ = [ - "convert_flat_files", "convert_station", "merge_converted_variables", "merge_stations", ] - -KiB = int(pow(2, 10)) -MiB = int(pow(2, 20)) -GiB = int(pow(2, 30)) TABLE_DATE = dt.now().strftime("%d %B %Y") -def _fwf_column_definitions( +def _obs_fwf_column_definitions( time_frequency: str, ) -> tuple[list[str], list[int], list[type[str | int]]]: """Return the column names, widths, and data types for the fixed-width format.""" @@ -104,7 +99,7 @@ def convert_station( ): """Convert a single station's data from the fixed-width format to a netCDF file.""" data = Path(data) - column_names, column_widths, column_dtypes = _fwf_column_definitions(mode) + column_names, column_widths, column_dtypes = _obs_fwf_column_definitions(mode) if using_dask_array: pandas_reader = dd @@ -360,7 +355,8 @@ def _convert_station_file( data_files = [file] logging.info(f"Processing file: {file}.") - size_limit = 1 * GiB + # 1 GiB + size_limit = 2**30 for data in data_files: if file_size(data) > size_limit and "dask" in sys.modules: 
@@ -388,73 +384,6 @@ def _convert_station_file( temporary_file.unlink() -def convert_flat_files( - source_files: str | os.PathLike, - output_folder: str | os.PathLike | list[str | int], - variables: str | int | list[str | int], - mode: str = "hourly", - n_workers: int = 4, -) -> None: - """ - - Parameters - ---------- - source_files: str or Path - output_folder: str or Path - variables: str or List[str] - mode: {"hourly", "daily"} - n_workers: int - - Returns - ------- - None - """ - if isinstance(variables, (str, int)): - variables = [variables] - - for variable_code in variables: - variable_code = str(variable_code).zfill(3) - metadata = load_json_data_mappings("eccc-obs")[variable_code] - nc_name = metadata["cf_variable_name"] - - rep_nc = Path(output_folder).joinpath(nc_name) - rep_nc.mkdir(parents=True, exist_ok=True) - - # Loop on the files - logging.info( - f"Collecting files for variable '{metadata['standard_name']}' " - f"(filenames containing '{metadata['_table_name']}')." - ) - list_files = list() - if isinstance(source_files, list) or Path(source_files).is_file(): - list_files.append(source_files) - else: - glob_patterns = [g for g in metadata["_table_name"]] - for pattern in glob_patterns: - list_files.extend( - [f for f in Path(source_files).rglob(f"{pattern}*") if f.is_file()] - ) - manager = mp.Manager() - errored_files = manager.list() - converter_func = functools.partial( - _convert_station_file, - output_path=rep_nc, - errored_files=errored_files, - mode=mode, - variable_code=variable_code, - **metadata, - ) - with mp.Pool(processes=n_workers) as pool: - pool.map(converter_func, list_files) - pool.close() - pool.join() - - if errored_files: - logging.warning( - "Some files failed to be properly parsed:\n", ", ".join(errored_files) - ) - - def merge_stations( source_files: str | os.PathLike | None = None, output_folder: str | os.PathLike | None = None, diff --git a/miranda/preprocess/eccc.py b/miranda/preprocess/eccc.py index aab3595d..688ed196 
100644 --- a/miranda/preprocess/eccc.py +++ b/miranda/preprocess/eccc.py @@ -6,7 +6,6 @@ import logging.config import multiprocessing as mp import os -import time from functools import partial from pathlib import Path @@ -58,8 +57,6 @@ def convert_flat_files( ------- None """ - func_time = time.time() - if mode.lower() in ["h", "hour", "hourly"]: num_observations = 24 column_names = ["code", "year", "month", "day", "code_var"] @@ -124,5 +121,3 @@ def convert_flat_files( logging.warning( "Some files failed to be properly parsed:\n", ", ".join(errored_files) ) - - logging.warning(f"Process completed in {time.time() - func_time:.2f} seconds") From 8728c66b1747abe8669e054be1351651f3db9b65 Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Wed, 9 Aug 2023 17:24:26 -0400 Subject: [PATCH 14/33] significant refactoring --- miranda/eccc/_utils.py | 838 ------------------ miranda/preprocess/_eccc_homogenized.py | 171 +--- miranda/preprocess/_eccc_obs.py | 165 ++-- miranda/preprocess/_metadata.py | 190 ++++ miranda/preprocess/_treatments.py | 42 - .../preprocess/configs/eccc-obs_attrs.json | 730 ++++++++++++++- miranda/preprocess/eccc.py | 233 +++-- miranda/vocabularies/eccc.py | 125 +-- 8 files changed, 1198 insertions(+), 1296 deletions(-) delete mode 100644 miranda/eccc/_utils.py create mode 100644 miranda/preprocess/_metadata.py delete mode 100644 miranda/preprocess/_treatments.py diff --git a/miranda/eccc/_utils.py b/miranda/eccc/_utils.py deleted file mode 100644 index a501dac6..00000000 --- a/miranda/eccc/_utils.py +++ /dev/null @@ -1,838 +0,0 @@ -from __future__ import annotations - -import logging.config -from collections.abc import Mapping -from datetime import datetime as dt - -from miranda.scripting import LOGGING_CONFIG - -__all__ = ["cf_station_metadata"] - -logging.config.dictConfig(LOGGING_CONFIG) - - -def cf_station_metadata(variable_code: int | str) -> Mapping[str, int | float | str]: - """ - - Parameters - 
---------- - variable_code: int or str - - Returns - ------- - dict - """ - ec_hourly_variables = { - "001": { - "_table_name": {"DLY02", "DLY04", "DLY44"}, - "original_units": "0.1 °C", - "raw_units": "degC", - "units": "K", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Daily Maximum Temperature", - "standard_name": "air_temperature_maximum", - "nc_name": "tasmax", - }, - "002": { - "_table_name": {"DLY02", "DLY04", "DLY44"}, - "original_units": "0.1 °C", - "raw_units": "degC", - "units": "K", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Daily Minimum Temperature", - "standard_name": "air_temperature_minimum", - "nc_name": "tasmin", - }, - "003": { - "_table_name": {"DLY02", "DLY04", "DLY44"}, - "original_units": "0.1 °C", - "raw_units": "degC", - "units": "K", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Daily Mean Temperature", - "standard_name": "air_temperature", - "nc_name": "tas", - }, - "010": { - "_table_name": {"DLY02", "DLY04", "DLY44"}, - "original_units": "0.1 mm day-1", - "raw_units": "mm", - "units": "m", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Daily Total Rainfall", - "standard_name": "liquid_precipitation_amount", - "nc_name": "prlptot", - }, - "011": { - "_table_name": {"DLY02", "DLY04", "DLY44"}, - "original_units": "0.1 cm day-1", - "raw_units": "cm", - "units": "m", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Daily Total Snowfall", - "standard_name": "solid_precipitation_amount", - "nc_name": "prsntot", - }, - "012": { - "_table_name": {"DLY02", "DLY04", "DLY44"}, - "original_units": "0.1 mm day-1", - "raw_units": "mm", - "units": "m", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Daily Total Precipitation", - "standard_name": "precipitation_amount", - "nc_name": "prcptot", - }, - "013": { - "_table_name": {"DLY02", "DLY04", "DLY44"}, - "original_units": "cm", - "raw_units": "cm", - "units": "m", - "scale_factor": 1, - "add_offset": 0, - "long_name": "Snow on the 
Ground", - "standard_name": "surface_snow_thickness", - "nc_name": "sndtot", - }, - "014": { - "_table_name": {"DLY02", "DLY04", "DLY44"}, - "raw_units": "1", - "units": "1", - "scale_factor": 1, - "add_offset": 0, - "long_name": "Thunderstorms", - "standard_name": "thunderstorm_presence", - "nc_name": "thunder", - }, - "015": { - "_table_name": {"DLY02", "DLY04", "DLY44"}, - "raw_units": "1", - "units": "1", - "scale_factor": 1, - "add_offset": 0, - "long_name": "Freezing rain or drizzle", - "standard_name": "freeze_rain_drizzle_presence", - "nc_name": "freezing_rain_drizzle", - }, - "016": { - "_table_name": {"DLY02", "DLY04", "DLY44"}, - "raw_units": "1", - "units": "1", - "scale_factor": 1, - "add_offset": 0, - "long_name": "Hail", - "standard_name": "hail_presence", - "nc_name": "hail", - }, - "017": { - "_table_name": {"DLY02", "DLY04", "DLY44"}, - "raw_units": "1", - "units": "1", - "scale_factor": 1, - "add_offset": 0, - "long_name": "Fog or Ice Fog", - "standard_name": "fog_ice_fog_presence", - "nc_name": "fog_ice_fog", - }, - "018": { - "_table_name": {"DLY02", "DLY04"}, - "raw_units": "1", - "units": "1", - "scale_factor": 1, - "add_offset": 0, - "long_name": "Smoke or Haze", - "standard_name": "smoke_haze_presence", - "nc_name": "smoke_haze", - }, - "019": { - "_table_name": {"DLY02", "DLY04"}, - "raw_units": "1", - "units": "1", - "scale_factor": 1, - "add_offset": 0, - "long_name": "Blowing Dust or Sand", - "standard_name": "blowing_dust_sand_presence", - "nc_name": "blowing_dust_sand", - }, - "020": { - "_table_name": {"DLY02", "DLY04"}, - "raw_units": "1", - "units": "1", - "scale_factor": 1, - "add_offset": 0, - "long_name": "Blowing snow", - "standard_name": "blowing_snow_presence", - "nc_name": "blow_snow", - }, - "021": { - "_table_name": {"DLY02", "DLY04"}, - "raw_units": "1", - "units": "1", - "scale_factor": 1, - "add_offset": 0, - "long_name": "Wind speed >= 28 Knots", - "standard_name": "wind_exceeding_28_knots", - "nc_name": 
"wind_gt_28kt", - }, - "022": { - "_table_name": {"DLY02", "DLY04"}, - "raw_units": "1", - "units": "1", - "scale_factor": 1, - "add_offset": 0, - "long_name": "Wind speed >= 34 Knots", - "standard_name": "wind_exceeding_34_knots", - "nc_name": "wind_gt_34kt", - }, - "023": { - "_table_name": {"DLY02", "DLY04"}, - "original_units": "10's of degrees", - "raw_units": "deg", - "units": "deg", - "scale_factor": 10, - "add_offset": 0, - "long_name": "Direction of extreme gust (16 pts) to December 1976", - "standard_name": "gust_to_direction", - "nc_name": "gust_dir_16pts", - }, - "024": { - "_table_name": {"DLY02", "DLY04"}, - "original_units": "km/h", - "raw_units": "km h-1", - "units": "m s-1", - "scale_factor": 1, - "add_offset": 0, - "long_name": "Speed of extreme gust", - "standard_name": "wind_speed_of_gust", - "nc_name": "gust_speed", - }, - "025": { - "_table_name": {"DLY02", "DLY04"}, - "raw_units": "h", - "units": "h", - "scale_factor": 1, - "add_offset": 0, - "long_name": "UTC hour of extreme gust", - "standard_name": "hour_of_extreme_gust", - "nc_name": "gust_hour", - }, - "061": { - "_table_name": {"HLY11"}, - "original_units": "0.001 MJ/m", - "raw_units": "W m-2 h-1", - "units": "W m-2 h-1", - "scale_factor": 1e6 / (60 * 60), - "add_offset": 0, - "long_name": "RF1 global solar radiation", - "standard_name": "solar_radiation_flux", - "nc_name": "rf1_radiation", - }, - "062": { - "_table_name": {"HLY11"}, - "original_units": "0.001 MJ/m", - "raw_units": "W m-2 h-1", - "units": "W m-2 h-1", - "scale_factor": 1e6 / (60 * 60), - "add_offset": 0, - "long_name": "RF2 sky (diffuse) radiation", - "standard_name": "solar_radiation_flux", - "nc_name": "rf2_radiation", - }, - "063": { - "_table_name": {"HLY11"}, - "original_units": "0.001 MJ/m", - "raw_units": "W m-2 h-1", - "units": "W m-2 h-1", - "scale_factor": 1e6 / (60 * 60), - "add_offset": 0, - "long_name": "RF3 reflected solar radiation", - "standard_name": "solar_radiation_flux", - "nc_name": "rf3_radiation", 
- }, - "064": { - "_table_name": {"HLY11"}, - "original_units": "0.001 MJ/m", - "raw_units": "W m-2 h-1", - "units": "W m-2 h-1", - "scale_factor": 1e6 / (60 * 60), - "add_offset": 0, - "long_name": "RF4 net all wave radiation", - "standard_name": "solar_radiation_flux", - "nc_name": "rf4_radiation", - }, - "067": { - "_table_name": {"HLY11"}, - "original_units": "0.01 Kilolux_hrs", - "raw_units": "lux h", - "units": "lux h", - "scale_factor": 10, - "add_offset": 0, - "long_name": "RF7 daylight illumination", - "standard_name": "solar_radiation_flux", - "nc_name": "rf7_radiation", - }, - "068": { - "_table_name": {"HLY11"}, - "original_units": "0.001 MJ/m", - "raw_units": "W m-2 h-1", - "units": "W m-2 h-1", - "scale_factor": 1e6 / (60 * 60), - "add_offset": 0, - "long_name": "RF8 direct solar radiation", - "standard_name": "solar_radiation_flux", - "nc_name": "rf8_radiation", - }, - "069": { - "_table_name": {"HLY15"}, - "original_units": "10's of degrees", - "raw_units": "deg", - "units": "deg", - "scale_factor": 1, - "add_offset": 0, - "long_name": "Direction - 45B anemometer (8 pts)", - "standard_name": "wind_to_direction", - "nc_name": "wind_dir_45B", - }, - "071": { - "_table_name": {"HLY01"}, - "original_units": "30's of meters", - "raw_units": "m", - "units": "m", - "scale_factor": 30, - "add_offset": 0, - "long_name": "Ceiling height of lowest layer of clouds", - "standard_name": "ceiling_cloud_height", - "nc_name": "ceiling_hgt", - }, - "072": { - "_table_name": {"HLY01"}, - "original_units": "0.1 km", - "raw_units": "km", - "units": "m", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Visibility", - "standard_name": "visibility_in_air", - "nc_name": "visibility", - }, - "073": { - "_table_name": {"HLY01"}, - "original_units": "0.01 kPa", - "raw_units": "Pa", - "units": "Pa", - "scale_factor": 10, - "add_offset": 0, - "long_name": "Sea Level Pressure", - "standard_name": "air_pressure_at_mean_sea_level", - "nc_name": "psl", - }, - "074": { - 
"_table_name": {"HLY01"}, - "original_units": "0.1 °C", - "raw_units": "degC", - "units": "K", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Dew Point Temperature", - "standard_name": "dew_point_temperature", - "nc_name": "tds", - }, - "075": { - "_table_name": {"HLY01"}, - "original_units": "10's of degrees", - "raw_units": "deg", - "units": "deg", - "scale_factor": 10, - "add_offset": 0, - "long_name": "Wind Direction at 2 m (U2A Anemometer) (16 pts)", - "standard_name": "wind_direction_u2a", - "nc_name": "wind_dir_u2a_16", - }, - "076": { - "_table_name": {"HLY01"}, - "original_units": "km/h", - "raw_units": "km h-1", - "units": "m s-1", - "scale_factor": 1, - "add_offset": 0, - "long_name": "Wind Speed - U2A (16 pts) to December 1970", - "standard_name": "wind_speed_u2a", - "nc_name": "wind_speed_u2a", - }, - "077": { - "_table_name": {"HLY01"}, - "original_units": "0.01 kPa", - "raw_units": "Pa", - "units": "Pa", - "scale_factor": 10, - "add_offset": 0, - "long_name": "Station Pressure", - "standard_name": "atmospheric_pressure", - "nc_name": "pressure", - }, - "078": { - "_table_name": {"HLY01"}, - "original_units": "0.1 °C", - "raw_units": "degC", - "units": "K", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Dry Bulb Temperature", - "standard_name": "dry_bulb_temperature", - "nc_name": "tas_dry", - }, - "079": { - "_table_name": {"HLY01"}, - "original_units": "0.1 °C", - "raw_units": "degC", - "units": "K", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Wet Bulb temperature", - "standard_name": "wet_bulb_temperature", - "nc_name": "tas_wet", - }, - "080": { - "_table_name": {"HLY01"}, - "original_units": "%", - "raw_units": "1", - "units": "1", - "scale_factor": 1, - "add_offset": 0, - "long_name": "Relative Humidity", - "standard_name": "relative_humidity", - "nc_name": "hur", - }, - "081": { - "_table_name": {"HLY01"}, - "original_units": "%", - "raw_units": "1", - "units": "1", - "scale_factor": 10, - "add_offset": 0, - 
"long_name": "Total Cloud Opacity", - "standard_name": "cloud_albedo", - "nc_name": "clo", - }, - "082": { - "_table_name": {"HLY01"}, - "original_units": "%", - "raw_units": "1", - "units": "1", - "scale_factor": 10, - "add_offset": 0, - "long_name": "Total Cloud Amount", - "standard_name": "cloud_area_fraction", - "nc_name": "clt", - }, - "089": { - "_table_name": {"HLY01"}, - "raw_units": "1", - "units": "1", - "scale_factor": 1, - "add_offset": 0, - "long_name": "Freezing Rain", - "standard_name": "freezing_rain", - "nc_name": "freeze_rain", - }, - "094": { - "_table_name": {"HLY01"}, - "raw_units": "1", - "units": "1", - "scale_factor": 1, - "add_offset": 0, - "long_name": "Ice Pellets", - "standard_name": "ice_pellet_presence", - "nc_name": "ice_pellets", - }, - "107": { - "_table_name": {"HLY01"}, - "original_units": "Tenths", - "raw_units": "1", - "units": "1", - "scale_factor": 10, - "add_offset": 0, - "long_name": "Lowest cloud layer opacity", - "standard_name": "low_type_cloud_opacity_fraction", - "nc_name": "1low_cloud_opac", - }, - "108": { - "_table_name": {"HLY01"}, - "original_units": "Tenths", - "raw_units": "1", - "units": "1", - "scale_factor": 10, - "add_offset": 0, - "long_name": "Lowest cloud layer amount or condition", - "standard_name": "low_type_cloud_area_fraction", - "nc_name": "1low_cloud_frac", - }, - "109": { - "_table_name": {"HLY01"}, - "raw_units": "1", - "units": "1", - "scale_factor": 1, - "add_offset": 0, - "long_name": "Lowest cloud layer type", - "standard_name": "low_type_cloud_type", - "nc_name": "1low_cloud_type", - }, - "110": { - "_table_name": {"HLY01"}, - "original_units": "30's of meters", - "raw_units": "m", - "units": "m", - "scale_factor": 30, - "add_offset": 0, - "long_name": "Lowest cloud layer height", - "standard_name": "low_type_cloud_height", - "nc_name": "1low_cloud_hgt", - }, - "111": { - "_table_name": {"HLY01"}, - "original_units": "Tenths", - "raw_units": "1", - "units": "1", - "scale_factor": 10, - 
"add_offset": 0, - "long_name": "Second lowest cloud layer opacity", - "standard_name": "low_type_cloud_opacity_fraction", - "nc_name": "2low_cloud_opac", - }, - "112": { - "_table_name": {"HLY01"}, - "original_units": "Tenths", - "raw_units": "1", - "units": "1", - "scale_factor": 10, - "add_offset": 0, - "long_name": "Second lowest cloud layer amount or condition", - "standard_name": "low_type_cloud_area_fraction", - "nc_name": "2low_cloud_frac", - }, - "113": { - "_table_name": {"HLY01"}, - "original_units": "", - "raw_units": "1", - "units": "1", - "scale_factor": 1, - "add_offset": 0, - "long_name": "Second lowest cloud layer type", - "standard_name": "low_type_cloud_type", - "nc_name": "2low_cloud_type", - }, - "114": { - "_table_name": {"HLY01"}, - "original_units": "30's of meters", - "raw_units": "m", - "units": "m", - "scale_factor": 30, - "add_offset": 0, - "long_name": "Second lowest cloud layer height", - "standard_name": "low_type_cloud_height", - "nc_name": "2low_cloud_hgt", - }, - "115": { - "_table_name": {"HLY01"}, - "original_units": "Tenths", - "raw_units": "1", - "units": "1", - "scale_factor": 10, - "add_offset": 0, - "long_name": "Thirsd lowest cloud layer opacity", - "standard_name": "low_type_cloud_opacity_fraction", - "nc_name": "3low_cloud_opac", - }, - "116": { - "_table_name": {"HLY01"}, - "original_units": "Tenths", - "raw_units": "1", - "units": "1", - "scale_factor": 10, - "add_offset": 0, - "long_name": "Third lowest cloud layer amount or condition", - "standard_name": "low_type_cloud_area_fraction", - "nc_name": "3low_cloud_frac", - }, - "117": { - "_table_name": {"HLY01"}, - "original_units": "", - "raw_units": "1", - "units": "1", - "scale_factor": 1, - "add_offset": 0, - "long_name": "Third lowest cloud layer type", - "standard_name": "low_type_cloud_type", - "nc_name": "3low_cloud_type", - }, - "118": { - "_table_name": {"HLY01"}, - "original_units": "30's of meters", - "raw_units": "m", - "units": "m", - "scale_factor": 30, - 
"add_offset": 0, - "long_name": "Third lowest cloud layer height", - "standard_name": "low_type_cloud_height", - "nc_name": "3low_cloud_hgt", - }, - "123": { - "_table_name": {"HLY01"}, - "original_units": "0.1 mm", - "raw_units": "mm h-1", - "units": "kg m2 s-1", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Total Rainfall", - "standard_name": "rainfall_flux", - "nc_name": "rainfall", - }, - "133": { - "_table_name": {"HLY10"}, - "original_units": "0.1 hrs", - "raw_units": "h", - "units": "s", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Sunshine", - "standard_name": "duration_of_sunshine", - "nc_name": "sun", - }, - "156": { - "_table_name": {"HLY01"}, - "original_units": "10's of degrees", - "raw_units": "deg", - "units": "deg", - "scale_factor": 10, - "long_name": "Wind Direction - U2A (36 pts) from January 1971", - "standard_name": "wind_direction_u2a", - "nc_name": "wind_dir_u2a_36", - }, - "262": { - "_table_name": {"HLY01_RCS"}, - "original_units": "0.1 mm", - "raw_units": "mm", - "units": "kg m-2", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Total Precipitation (minutes 00-60)", - "standard_name": "precipitation_amount", - "nc_name": "prtot", - }, - "263": { - "_table_name": {"HLY01_RCS"}, - "original_units": "0.1 mm", - "raw_units": "mm", - "units": "kg m-2", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Total Precipitation (minutes 00-15)", - "standard_name": "precipitation_amount", - "nc_name": "prtot_q1", - }, - "264": { - "_table_name": {"HLY01_RCS"}, - "original_units": "0.1 mm", - "raw_units": "mm", - "units": "kg m-2", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Total Precipitation (minutes 15-30)", - "standard_name": "precipitation_amount", - "nc_name": "prtot_q2", - }, - "265": { - "_table_name": {"HLY01_RCS"}, - "original_units": "0.1 mm", - "raw_units": "mm", - "units": "kg m-2", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Total Precipitation (minutes 30-45)", - 
"standard_name": "precipitation_amount", - "nc_name": "prtot_q3", - }, - "266": { - "_table_name": {"HLY01_RCS"}, - "original_units": "0.1 mm", - "raw_units": "mm", - "units": "kg m-2", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Total Precipitation (minutes 45-60)", - "standard_name": "precipitation_amount", - "nc_name": "prtot_q4", - }, - "267": { - "_table_name": {"HLY01_RCS"}, - "original_units": "0.1 kg/m²", - "raw_units": "kg m-2", - "units": "kg m-2", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Precipitation Gauge Weight per Unit Area (at minute 15)", - "standard_name": "precipitation_amount", - "nc_name": "precipitation_weight_q1", - }, - "268": { - "_table_name": {"HLY01_RCS"}, - "original_units": "0.1 kg/m²", - "raw_units": "kg m-2", - "units": "kg m-2", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Precipitation Gauge Weight per Unit Area (at minute 30)", - "standard_name": "precipitation_amount", - "nc_name": "precipitation_weight_q2", - }, - "269": { - "_table_name": {"HLY01_RCS"}, - "original_units": "0.1 kg/m²", - "raw_units": "kg m-2", - "units": "kg m-2", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Precipitation Gauge Weight per Unit Area (at minute 45)", - "standard_name": "precipitation_amount", - "nc_name": "precipitation_weight_q3", - }, - "270": { - "_table_name": {"HLY01_RCS"}, - "original_units": "0.1 kg/m²", - "raw_units": "kg m-2", - "units": "kg m-2", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Precipitation Gauge Weight per Unit Area (at minute 60)", - "standard_name": "precipitation_amount", - "nc_name": "precipitation_weight_q4", - }, - "271": { - "_table_name": {"HLY01_RCS"}, - "original_units": "0.1 km/h", - "raw_units": "km h-1", - "nc_units": "m s-1", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Wind Speed at 2 m (minutes 00-15)", - "standard_name": "wind_speed", - "nc_name": "wind_speed_q1", - }, - "272": { - "_table_name": {"HLY01_RCS"}, - 
"original_units": "0.1 km/h", - "raw_units": "km h-1", - "nc_units": "m s-1", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Wind Speed at 2 m (minutes 15-30)", - "standard_name": "wind_speed", - "nc_name": "wind_speed_q2", - }, - "273": { - "_table_name": {"HLY01_RCS"}, - "original_units": "0.1 km/h", - "raw_units": "km h-1", - "nc_units": "m s-1", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Wind Speed at 2 m (minutes 30-45)", - "standard_name": "wind_speed", - "nc_name": "wind_speed_q3", - }, - "274": { - "_table_name": {"HLY01_RCS"}, - "original_units": "0.1 km/h", - "raw_units": "km h-1", - "nc_units": "m s-1", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Wind Speed at 2 m (minutes 45-60)", - "standard_name": "wind_speed", - "nc_name": "wind_speed_q4", - }, - "275": { - "_table_name": {"HLY01_RCS"}, - "original_units": "cm", - "raw_units": "cm", - "units": "m", - "scale_factor": 1, - "add_offset": 0, - "long_name": "Snow Depth (at minute 60)", - "standard_name": "surface_snow_thickness", - "nc_name": "snd_q4", - }, - "276": { - "_table_name": {"HLY01_RCS"}, - "original_units": "cm", - "raw_units": "cm", - "units": "m", - "scale_factor": 1, - "add_offset": 0, - "long_name": "Snow Depth (at minute 15)", - "standard_name": "surface_snow_thickness", - "nc_name": "snd_q1", - }, - "277": { - "_table_name": {"HLY01_RCS"}, - "original_units": "cm", - "raw_units": "cm", - "units": "m", - "scale_factor": 1, - "add_offset": 0, - "long_name": "Snow Depth (at minute 30)", - "standard_name": "surface_snow_thickness", - "nc_name": "snd_q2", - }, - "278": { - "_table_name": {"HLY01_RCS"}, - "original_units": "cm", - "raw_units": "cm", - "units": "m", - "scale_factor": 1, - "add_offset": 0, - "long_name": "Snow Depth (at minute 45)", - "standard_name": "surface_snow_thickness", - "nc_name": "snd_q3", - }, - "279": { - "_table_name": {"HLY01_RCS"}, - "original_units": "Degrees", - "raw_units": "deg", - "nc_units": "deg", - "scale_factor": 1, 
- "add_offset": 0, - "long_name": "Wind Direction at 2 m (minutes 50-60)", - "standard_name": "wind_direction", - "nc_name": "wind_dir", - }, - "280": { - "_table_name": {"HLY01_RCS"}, - "original_units": "0.1 km/h", - "raw_units": "km h-1", - "units": "m s-1", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Wind Speed at 2 m (minutes 50-60)", - "standard_name": "wind_speed", - "nc_name": "wind_speed", - }, - } - code = str(variable_code).zfill(3) - if code in ["061"]: - raise NotImplementedError() - try: - variable = ec_hourly_variables[code] - variable["missing_flags"] = "M" - variable["missing_values"] = {-9999, "#####"} - variable["least_significant_digit"] = "" - except KeyError: - logging.error(f"Hourly variable `{code}` not supported.") - raise - return variable diff --git a/miranda/preprocess/_eccc_homogenized.py b/miranda/preprocess/_eccc_homogenized.py index 00a417da..a16b3e14 100644 --- a/miranda/preprocess/_eccc_homogenized.py +++ b/miranda/preprocess/_eccc_homogenized.py @@ -4,7 +4,6 @@ import calendar import logging.config from pathlib import Path -from typing import Any, Dict, List, Tuple, Type import numpy as np import pandas as pd @@ -12,11 +11,11 @@ from miranda.io import write_dataset from miranda.io.utils import name_output_file -from miranda.preprocess._data_definitions import ( - find_project_variable_codes, - load_json_data_mappings, +from miranda.preprocess._data_definitions import find_project_variable_codes +from miranda.preprocess._metadata import ( + eccc_variable_metadata, + homogenized_column_definitions, ) -from miranda.preprocess._treatments import basic_metadata_conversion from miranda.scripting import LOGGING_CONFIG logging.config.dictConfig(LOGGING_CONFIG) @@ -25,153 +24,6 @@ __all__ = ["convert_ahccd", "convert_ahccd_fwf_file", "merge_ahccd"] -def _ahccd_variable_metadata( - variable_code: str, - gen: int, -) -> (dict[str, int | float | str], dict, list[tuple[int, int]], int): - """ - - Parameters - ---------- - 
variable_code - gen: {1, 2, 3} - - Returns - ------- - dict[str, int or str or float], dict, list[tuple[int, int]], int - """ - generation = {1: "First", 2: "Second", 3: "Third"}.get(gen) - if not generation: - raise NotImplementedError(f"Generation '{gen}' not supported") - - config = load_json_data_mappings("eccc-homogenized") - metadata = basic_metadata_conversion("eccc-homogenized", config) - code = find_project_variable_codes(variable_code, "eccc-homogenized") - - variable_meta = metadata["variables"].get(code) - variable_name = variable_meta.get("_variable_name") - if variable_name: - variable_meta["original_variable_name"] = variable_code - variable_meta = {variable_name: variable_meta} - del variable_meta[variable_name]["_variable_name"] - else: - variable_meta = {variable_code: variable_meta} - - header = metadata["Header"] - to_delete = [] - # Conditional handling of global attributes based on generation - for field in [f for f in header if f.startswith("_")]: - if isinstance(header[field], bool): - if header[field] and field == "_variable": - header[field[1:]] = variable_name - elif isinstance(header[field], dict): - attr_treatment = header[field]["generation"] - if field in ["_citation" "_product"]: - for attribute, value in attr_treatment.items(): - if attribute == generation: - header[field[1:]] = value - else: - raise AttributeError( - f"Attribute treatment configuration for field `{field}` is not properly configured. Verify JSON." 
- ) - to_delete.append(field) - - for field in to_delete: - del header[field] - - return variable_meta, header - - -def _ahccd_column_definitions( - variable_code: str, -) -> tuple[dict, list[tuple[int, int]], dict[str, type[str | int | float] | Any], int]: - config = load_json_data_mappings("eccc-homogenized") - metadata = basic_metadata_conversion("eccc-homogenized", config) - - variable = metadata["variables"][variable_code]["_variable_name"] - if variable.startswith("tas"): - column_names = [ - "No", - "StnId", - "Station name", - "Prov", - "FromYear", - "FromMonth", - "ToYear", - "ToMonth", - "%Miss", - "Lat(deg)", - "Long(deg)", - "Elev(m)", - "Joined", - "RCS", - ] - dtypes = [ - str, - str, - str, - str, - int, - int, - int, - int, - float, - float, - float, - int, - str, - str, - ] - column_spaces = [(0, 5), (5, 6), (6, 8), (8, 9)] - ii = 9 - # 31 days in a month - for i in range(1, 32): - column_spaces.append((ii, ii + 7)) - ii += 7 - column_spaces.append((ii, ii + 1)) - ii += 1 - header_row = 3 - - elif variable.startswith("pr"): - column_names = [ - "Prov", - "Station name", - "stnid", - "beg yr", - "beg mon", - "end yr", - "end mon", - "lat (deg)", - "long (deg)", - "elev (m)", - "stns joined", - ] - dtypes = [str, str, str, int, int, int, int, float, float, int, str] - column_spaces = [(0, 4), (4, 5), (5, 7), (7, 8)] - ii = 8 - # 31 days in a month - for i in range(1, 32): - column_spaces.append((ii, ii + 8)) - ii += 8 - column_spaces.append((ii, ii + 1)) - ii += 1 - header_row = 0 - - else: - raise KeyError - - column_names = { - col.lower().split("(")[0].replace("%", "pct_").strip().replace(" ", "_"): col - for col in list(column_names) - } - # - column_dtypes = {} - for col in column_names.keys(): - column_dtypes[col] = dtypes[list(column_names.keys()).index(col)] - - return column_names, column_spaces, column_dtypes, header_row - - def convert_ahccd_fwf_file( ff: Path | str, metadata: pd.DataFrame, @@ -194,8 +46,12 @@ def convert_ahccd_fwf_file( 
""" code = find_project_variable_codes(variable, "eccc-homogenized") - variable_meta, global_attrs = _ahccd_variable_metadata(code, generation) - column_names, column_spaces, column_dtypes, header = _ahccd_column_definitions(code) + variable_meta, global_attrs = eccc_variable_metadata( + code, "eccc-homogenized", generation + ) + column_names, column_spaces, column_dtypes, header = homogenized_column_definitions( + code + ) df = pd.read_fwf(ff, header=header, colspecs=column_spaces, dtype=column_dtypes) if "pr" in variable: @@ -324,20 +180,21 @@ def convert_ahccd( output_dir.mkdir(parents=True, exist_ok=True) code = find_project_variable_codes(variable, "eccc-homogenized") - var_meta, global_attrs = _ahccd_variable_metadata(code, generation) + var_meta, global_attrs = eccc_variable_metadata( + code, "eccc-homogenized", generation + ) ( column_names, column_spaces, column_dtypes, header_row, - ) = _ahccd_column_definitions(code) + ) = homogenized_column_definitions(code) gen = {2: "Second", 3: "Third"}.get(generation) if generation == 3 and code in {"dx", "dn", "dm"}: station_meta = "ahccd_gen3_temperature.csv" elif generation == 2 and code in {"dt", "ds", "dr"}: station_meta = "ahccd_gen2_precipitation.csv" - else: raise NotImplementedError(f"Code '{code} for generation {gen}.") metadata_source = ( diff --git a/miranda/preprocess/_eccc_obs.py b/miranda/preprocess/_eccc_obs.py index f4601058..6ff64d8f 100644 --- a/miranda/preprocess/_eccc_obs.py +++ b/miranda/preprocess/_eccc_obs.py @@ -1,33 +1,18 @@ """Specialized conversion tools for Environment and Climate Change Canada / Meteorological Service of Canada data.""" -###################################################################### -# S.Biner, Ouranos, mai 2019 -# -# methodologie -# -# 1) on rassemble les fichiers netcdf des differentes eccc en un seul fichier netCDF. -# -# 2) on scan les fichiers sources annuels en cherchant une variable et on sauve -# ce qu'on trouve dans des fichiers netcdf. 
On applique aussi les flags -# et on fait les changements d'unites -# -# obtenu via http://climate.weather.gc.ca/index_e.html en cliquant sur 'about the data' -####################################################################### from __future__ import annotations -import contextlib import functools import logging import multiprocessing as mp import os import re -import sys import tempfile import time from calendar import monthrange from datetime import datetime as dt from logging import config from pathlib import Path -from typing import Any, List +from typing import Any import dask.dataframe as dd import numpy as np @@ -37,10 +22,13 @@ from xclim.core.units import convert_units_to from miranda.archive import group_by_length -from miranda.preprocess._data_definitions import load_json_data_mappings +from miranda.preprocess._data_definitions import ( + find_project_variable_codes, + load_json_data_mappings, +) +from miranda.preprocess._metadata import eccc_variable_metadata, obs_column_definitions from miranda.scripting import LOGGING_CONFIG -from miranda.storage import file_size, report_file_size -from miranda.utils import generic_extract_archive +from miranda.vocabularies.eccc import obs_vocabularies config.dictConfig(LOGGING_CONFIG) @@ -52,32 +40,58 @@ TABLE_DATE = dt.now().strftime("%d %B %Y") -def _obs_fwf_column_definitions( - time_frequency: str, -) -> tuple[list[str], list[int], list[type[str | int]]]: - """Return the column names, widths, and data types for the fixed-width format.""" - if time_frequency.lower() in ["h", "hour", "hourly"]: - num_observations = 24 - column_names = ["code", "year", "month", "day", "code_var"] - column_widths = [7, 4, 2, 2, 3] - column_dtypes = [str, int, int, int, str] - elif time_frequency.lower() in ["d", "day", "daily"]: - num_observations = 31 - column_names = ["code", "year", "month", "code_var"] - column_widths = [7, 4, 2, 3] - column_dtypes = [str, int, int, str] +def convert_observation( + data_source: str | Path | 
list[str | Path], + output_dir: str | Path, + variable: str, + *, + generation: int | None = None, + merge: bool = False, + overwrite: bool = False, +): + """Convert a single station's data from the fixed-width format to a netCDF file.""" + + output_dir = Path(output_dir).resolve().joinpath(variable) + output_dir.mkdir(parents=True, exist_ok=True) + + code = find_project_variable_codes(variable, "eccc-obs") + var_meta, global_attrs = eccc_variable_metadata(code, "eccc-obs", generation) + ( + column_names, + column_spaces, + column_dtypes, + header_row, + ) = obs_column_definitions(code) + + archives = list() + if isinstance(data_source, list) or Path(data_source).is_file(): + archives.append(data_source) else: - raise NotImplementedError("`mode` must be 'h'/'hourly or 'd'/'daily'.") + tables = [] + for repository in obs_vocabularies: + if code in repository.values(): + tables.append(str(repository.keys())) + logging.info( + f"Collecting files for variable '{variable}'. " + f"Filename patterns containing variable code '{code}: {', '.join(tables)}'." 
+ ) + for table in tables: + archives.extend([f for f in Path(data_source).rglob(f"{table}*.gz")]) + + # Create the output directory + output_variable_dir = Path(output_dir).joinpath(variable) + output_variable_dir.mkdir(parents=True, exist_ok=True) - # Add the data columns - for i in range(1, num_observations + 1): - data_entry, flag_entry = f"D{i:0n}", f"F{i:0n}" - column_names.append(data_entry) - column_names.append(flag_entry) - column_widths.extend([6, 1] * num_observations) - column_dtypes.extend([str, str]) + # Loop on the files + errored_files = [] + for file in archives: + # FIXME: convert the file using the appropriate function + pass - return column_names, column_widths, column_dtypes + if errored_files: + logging.warning( + "Some files failed to be properly parsed:\n", ", ".join(errored_files) + ) def _remove_duplicates(ds): @@ -91,6 +105,7 @@ def _remove_duplicates(ds): def convert_station( data: str | os.PathLike, + variable: str, mode: str, using_dask_array: bool = False, *, @@ -99,11 +114,13 @@ def convert_station( ): """Convert a single station's data from the fixed-width format to a netCDF file.""" data = Path(data) - column_names, column_widths, column_dtypes = _obs_fwf_column_definitions(mode) + variable_code = find_project_variable_codes(variable, "eccc-obs") + column_names, column_widths, column_dtypes, header = obs_column_definitions(mode) if using_dask_array: pandas_reader = dd - chunks = dict(blocksize=200 * MiB) + # set the blocksize to 200 MB + chunks = dict(blocksize=200 * 2**20) else: pandas_reader = pd chunks = dict() @@ -148,12 +165,12 @@ def convert_station( has_variable_codes = (df_code["code_var"] == variable_code).any() if not has_variable_codes: logging.info( - f"Variable `{nc_name}` not found for station code: {code} in file {data}. Continuing..." + f"Variable `{variable}` not found for station code: {code} in file {data}. Continuing..." 
) continue # Perform the data treatment - logging.info(f"Converting `{nc_name}` for station code: {code}") + logging.info(f"Converting `{variable}` for station code: {code}") # Dump the data into a DataFrame df_var = df_code[df_code["code_var"] == variable_code].copy() @@ -328,62 +345,6 @@ def convert_station( del df -def _convert_station_file( - file: Path, - output_path: Path, - errored_files: list[Path], - mode: str, - add_offset: float, - long_name: str, - missing_flags: set[str], - missing_values: set[str], - nc_name: str, - raw_units: str, - units: str, - scale_factor: float, - standard_name: str, - variable_code: str, - **dask_kwargs, -): - if not missing_values: - missing_values = {-9999, "#####"} - - with tempfile.TemporaryDirectory() as temp_folder: - if file.suffix in [".gz", ".tar", ".zip", ".7z"]: - data_files = generic_extract_archive(file, output_dir=temp_folder) - else: - data_files = [file] - logging.info(f"Processing file: {file}.") - - # 1 GiB - size_limit = 2**30 - - for data in data_files: - if file_size(data) > size_limit and "dask" in sys.modules: - logging.info( - f"File exceeds {report_file_size(size_limit)} - Using dask.dataframes." - ) - client = ProgressBar - using_dask = True - else: - logging.info( - f"File below {report_file_size(size_limit)} - Using pandas.dataframes." 
- ) - client = contextlib.nullcontext - using_dask = False - - with client(**dask_kwargs) as c: - try: - convert_station(data, mode, using_dask=using_dask, client=c) - except FileNotFoundError: - errored_files.append(data) - - if os.listdir(temp_folder): - for temporary_file in Path(temp_folder).glob("*"): - if temporary_file in data_files: - temporary_file.unlink() - - def merge_stations( source_files: str | os.PathLike | None = None, output_folder: str | os.PathLike | None = None, diff --git a/miranda/preprocess/_metadata.py b/miranda/preprocess/_metadata.py new file mode 100644 index 00000000..a663c25a --- /dev/null +++ b/miranda/preprocess/_metadata.py @@ -0,0 +1,190 @@ +from __future__ import annotations + +import logging +from typing import Any + +from miranda import __version__ as __miranda_version__ +from miranda.preprocess._data_definitions import ( + find_project_variable_codes, + load_json_data_mappings, +) + +__all__ = [ + "eccc_variable_metadata", + "homogenized_column_definitions", + "obs_column_definitions", +] + + +def eccc_variable_metadata( + variable_code: str, project: str, gen: int | None = None +) -> (dict[str, int | float | str], dict, list[tuple[int, int]], int): + """ + + Parameters + ---------- + variable_code: str + project: {"eccc-homogenized", "eccc-obs", "eccc-obs-summary"} + gen: {1, 2, 3}, optional + + Returns + ------- + dict[str, int or str or float], dict, list[tuple[int, int]], int + """ + if project == "eccc-homogenized": + generation = {1: "First", 2: "Second", 3: "Third"}.get(gen) + if not generation: + raise NotImplementedError(f"Generation '{gen}' not supported") + else: + generation = None + + metadata = load_json_data_mappings(project) + code = find_project_variable_codes(variable_code, project) + + # Variable metadata + variable_meta = metadata["variables"].get(code) + variable_name = variable_meta.get("_variable_name") + if variable_name: + variable_meta["original_variable_name"] = variable_code + variable_meta = 
{variable_name: variable_meta}
+        del variable_meta[variable_name]["_variable_name"]
+    else:
+        variable_meta = {variable_code: variable_meta}
+
+    # Dataset metadata
+    header = metadata.get("Header")
+    # Static handling of version global attributes
+    miranda_version = header.get("_miranda_version")
+    if miranda_version:
+        if isinstance(miranda_version, bool):
+            header["miranda_version"] = __miranda_version__
+        elif isinstance(miranda_version, dict):
+            if project in miranda_version.keys():
+                header["miranda_version"] = __miranda_version__
+            else:
+                logging.warning(
+                    f"`_miranda_version` not set for project `{project}`. Not appending."
+                )
+    if "_miranda_version" in header:
+        del header["_miranda_version"]
+
+    to_delete = []
+    # Conditional handling of global attributes based on fields
+    for field in [f for f in header if f.startswith("_")]:
+        if isinstance(header[field], bool):
+            if header[field] and field == "_variable":
+                header[field[1:]] = variable_name
+        elif isinstance(header[field], dict) and generation:
+            attr_treatment = header[field]["generation"]
+            if field in ["_citation", "_product"]:
+                for attribute, value in attr_treatment.items():
+                    if attribute == generation:
+                        header[field[1:]] = value
+            else:
+                raise AttributeError(
+                    f"Attribute treatment configuration for field `{field}` is not properly configured. Verify JSON."
+ ) + to_delete.append(field) + + for field in to_delete: + del header[field] + + return variable_meta, header + + +def homogenized_column_definitions( + variable_code: str, +) -> tuple[dict, list[tuple[int, int]], dict[str, type[str | int | float] | Any], int]: + metadata = load_json_data_mappings("eccc-homogenized") + + variable = metadata["variables"][variable_code]["_variable_name"] + if variable.startswith("tas"): + column_dtypes = { + "No": str, + "StnId": str, + "Station name": str, + "Prov": str, + "FromYear": int, + "FromMonth": int, + "ToYear": int, + "ToMonth": int, + "%Miss": float, + "Lat(deg)": float, + "Long(deg)": float, + "Elev(m)": int, + "Joined": str, + "RCS": str, + } + column_spaces = [(0, 5), (5, 6), (6, 8), (8, 9)] + ii = 9 + # 31 days in a month + for i in range(1, 32): + column_spaces.append((ii, ii + 7)) + ii += 7 + column_spaces.append((ii, ii + 1)) + ii += 1 + header_row = 3 + + elif variable.startswith("pr"): + column_dtypes = { + "Prov": str, + "Station name": str, + "stnid": str, + "beg yr": int, + "beg mon": int, + "end yr": int, + "end mon": int, + "lat (deg)": float, + "long (deg)": float, + "elev (m)": int, + "stns joined": str, + } + column_spaces = [(0, 4), (4, 5), (5, 7), (7, 8)] + ii = 8 + # 31 days in a month + for i in range(1, 32): + column_spaces.append((ii, ii + 8)) + ii += 8 + column_spaces.append((ii, ii + 1)) + ii += 1 + header_row = 0 + + else: + raise KeyError + + column_names = { + col.lower().split("(")[0].replace("%", "pct_").strip().replace(" ", "_"): col + for col in list(column_dtypes.keys()) + } + + return column_names, column_spaces, column_dtypes, header_row + + +def obs_column_definitions( + time_frequency: str, +) -> tuple[list[str], list[int], list[type[str | int]], int]: + """Return the column names, widths, and data types for the fixed-width format.""" + if time_frequency.lower() in ["h", "hour", "hourly"]: + num_observations = 24 + column_names = ["code", "year", "month", "day", "code_var"] + 
column_widths = [7, 4, 2, 2, 3]
+        column_dtypes = [str, int, int, int, str]
+    elif time_frequency.lower() in ["d", "day", "daily"]:
+        num_observations = 31
+        column_names = ["code", "year", "month", "code_var"]
+        column_widths = [7, 4, 2, 3]
+        column_dtypes = [str, int, int, str]
+    else:
+        raise NotImplementedError("`mode` must be 'h'/'hourly' or 'd'/'daily'.")
+
+    header = 0
+
+    # Add the data columns
+    for i in range(1, num_observations + 1):
+        data_entry, flag_entry = f"D{i:0n}", f"F{i:0n}"
+        column_names.append(data_entry)
+        column_names.append(flag_entry)
+        column_widths.extend([6, 1] * num_observations)
+        column_dtypes.extend([str, str])
+
+    return column_names, column_widths, column_dtypes, header
diff --git a/miranda/preprocess/_treatments.py b/miranda/preprocess/_treatments.py
deleted file mode 100644
index 9e667440..00000000
--- a/miranda/preprocess/_treatments.py
+++ /dev/null
@@ -1,42 +0,0 @@
-from __future__ import annotations
-
-import logging
-from typing import Any
-
-from miranda import __version__ as __miranda_version__
-
-
-def basic_metadata_conversion(
-    project: str, metadata: dict
-) -> dict[str, dict[str, Any]]:
-    """Present basic metadata conversion.
-
-    Parameters
-    ----------
-    project : str
-        Dataset project name.
-    metadata : dict
-        Metadata definition dictionary for project and variable(s).
-
-    Returns
-    -------
-    xarray.Dataset
-    """
-    header = metadata["Header"]
-
-    # Static handling of version global attributes
-    miranda_version = header.get("_miranda_version")
-    if miranda_version:
-        if isinstance(miranda_version, bool):
-            header["miranda_version"] = __miranda_version__
-        elif isinstance(miranda_version, dict):
-            if project in miranda_version.keys():
-                header["miranda_version"] = __miranda_version__
-            else:
-                logging.warning(
-                    f"`_miranda_version` not set for project `{project}`. Not appending."
- ) - if "_miranda_version" in header: - del header["_miranda_version"] - - return metadata diff --git a/miranda/preprocess/configs/eccc-obs_attrs.json b/miranda/preprocess/configs/eccc-obs_attrs.json index 82d93e55..7eb3b7b6 100644 --- a/miranda/preprocess/configs/eccc-obs_attrs.json +++ b/miranda/preprocess/configs/eccc-obs_attrs.json @@ -1,6 +1,5 @@ { "Header": { - "_converter": true, "_frequency": true, "_miranda_version": true, "_missing_flags": "M", @@ -25,5 +24,734 @@ "title": "Environment and Climate Change Canada (ECCC) weather station observations", "type": "station-obs", "usage": "The original data is owned by the Government of Canada (Environment and Climate Change Canada), and falls under the licence agreement for use of Environment and Climate Change Canada data" + }, + "variables": { + "001": { + "_variable_name": "tasmax", + "least_significant_digit": "", + "long_name": "Daily Maximum Temperature", + "original_units": "0.1 °C", + "raw_units": "degC", + "scale_factor": 0.1, + "standard_name": "air_temperature_maximum", + "units": "K" + }, + "002": { + "_variable_name": "tasmin", + "least_significant_digit": "", + "long_name": "Daily Minimum Temperature", + "original_units": "0.1 °C", + "raw_units": "degC", + "scale_factor": 0.1, + "standard_name": "air_temperature_minimum", + "units": "K" + }, + "003": { + "_variable_name": "tas", + "least_significant_digit": "", + "long_name": "Daily Mean Temperature", + "original_units": "0.1 °C", + "raw_units": "degC", + "scale_factor": 0.1, + "standard_name": "air_temperature", + "units": "K" + }, + "010": { + "_variable_name": "prlptot", + "least_significant_digit": "", + "long_name": "Daily Total Rainfall", + "original_units": "0.1 mm day-1", + "raw_units": "mm", + "scale_factor": 0.1, + "standard_name": "liquid_precipitation_amount", + "units": "m" + }, + "011": { + "_variable_name": "prsntot", + "least_significant_digit": "", + "long_name": "Daily Total Snowfall", + "original_units": "0.1 cm day-1", + 
"raw_units": "cm", + "scale_factor": 0.1, + "standard_name": "solid_precipitation_amount", + "units": "m" + }, + "012": { + "_variable_name": "prcptot", + "least_significant_digit": "", + "long_name": "Daily Total Precipitation", + "original_units": "0.1 mm day-1", + "raw_units": "mm", + "scale_factor": 0.1, + "standard_name": "precipitation_amount", + "units": "m" + }, + "013": { + "_variable_name": "sndtot", + "least_significant_digit": "", + "long_name": "Snow on the Ground", + "original_units": "cm", + "raw_units": "cm", + "scale_factor": 1, + "standard_name": "surface_snow_thickness", + "units": "m" + }, + "014": { + "_variable_name": "thunder", + "least_significant_digit": "", + "long_name": "Thunderstorms", + "raw_units": "1", + "scale_factor": 1, + "standard_name": "thunderstorm_presence", + "units": "1" + }, + "015": { + "_variable_name": "freezing_rain_drizzle", + "least_significant_digit": "", + "long_name": "Freezing rain or drizzle", + "raw_units": "1", + "scale_factor": 1, + "standard_name": "freeze_rain_drizzle_presence", + "units": "1" + }, + "016": { + "_variable_name": "hail", + "least_significant_digit": "", + "long_name": "Hail", + "raw_units": "1", + "scale_factor": 1, + "standard_name": "hail_presence", + "units": "1" + }, + "017": { + "_variable_name": "fog_ice_fog", + "least_significant_digit": "", + "long_name": "Fog or Ice Fog", + "raw_units": "1", + "scale_factor": 1, + "standard_name": "fog_ice_fog_presence", + "units": "1" + }, + "018": { + "_variable_name": "smoke_haze", + "least_significant_digit": "", + "long_name": "Smoke or Haze", + "raw_units": "1", + "scale_factor": 1, + "standard_name": "smoke_haze_presence", + "units": "1" + }, + "019": { + "_variable_name": "blowing_dust_sand", + "least_significant_digit": "", + "long_name": "Blowing Dust or Sand", + "raw_units": "1", + "scale_factor": 1, + "standard_name": "blowing_dust_sand_presence", + "units": "1" + }, + "020": { + "_variable_name": "blow_snow", + 
"least_significant_digit": "", + "long_name": "Blowing snow", + "raw_units": "1", + "scale_factor": 1, + "standard_name": "blowing_snow_presence", + "units": "1" + }, + "021": { + "_variable_name": "wind_gt_28kt", + "least_significant_digit": "", + "long_name": "Wind speed >= 28 Knots", + "raw_units": "1", + "scale_factor": 1, + "standard_name": "wind_exceeding_28_knots", + "units": "1" + }, + "022": { + "_variable_name": "wind_gt_34kt", + "least_significant_digit": "", + "long_name": "Wind speed >= 34 Knots", + "raw_units": "1", + "scale_factor": 1, + "standard_name": "wind_exceeding_34_knots", + "units": "1" + }, + "023": { + "_variable_name": "gust_dir_16pts", + "least_significant_digit": "", + "long_name": "Direction of extreme gust (16 pts) to December 1976", + "original_units": "10's of degrees", + "raw_units": "deg", + "scale_factor": 10, + "standard_name": "gust_to_direction", + "units": "deg" + }, + "024": { + "_variable_name": "gust_speed", + "least_significant_digit": "", + "long_name": "Speed of extreme gust", + "original_units": "km/h", + "raw_units": "km h-1", + "scale_factor": 1, + "standard_name": "wind_speed_of_gust", + "units": "m s-1" + }, + "025": { + "_variable_name": "gust_hour", + "least_significant_digit": "", + "long_name": "UTC hour of extreme gust", + "raw_units": "h", + "scale_factor": 1, + "standard_name": "hour_of_extreme_gust", + "units": "h" + }, + "061": { + "_variable_name": "rf1_radiation", + "least_significant_digit": "", + "long_name": "RF1 global solar radiation", + "original_units": "0.001 MJ/m", + "raw_units": "W m-2 h-1", + "scale_factor": 277.77777777777777, + "standard_name": "solar_radiation_flux", + "units": "W m-2 h-1" + }, + "062": { + "_variable_name": "rf2_radiation", + "least_significant_digit": "", + "long_name": "RF2 sky (diffuse) radiation", + "original_units": "0.001 MJ/m", + "raw_units": "W m-2 h-1", + "scale_factor": 277.77777777777777, + "standard_name": "solar_radiation_flux", + "units": "W m-2 h-1" + }, + 
"063": { + "_variable_name": "rf3_radiation", + "least_significant_digit": "", + "long_name": "RF3 reflected solar radiation", + "original_units": "0.001 MJ/m", + "raw_units": "W m-2 h-1", + "scale_factor": 277.77777777777777, + "standard_name": "solar_radiation_flux", + "units": "W m-2 h-1" + }, + "064": { + "_variable_name": "rf4_radiation", + "least_significant_digit": "", + "long_name": "RF4 net all wave radiation", + "original_units": "0.001 MJ/m", + "raw_units": "W m-2 h-1", + "scale_factor": 277.77777777777777, + "standard_name": "solar_radiation_flux", + "units": "W m-2 h-1" + }, + "067": { + "_variable_name": "rf7_radiation", + "least_significant_digit": "", + "long_name": "RF7 daylight illumination", + "original_units": "0.01 Kilolux_hrs", + "raw_units": "lux h", + "scale_factor": 10, + "standard_name": "solar_radiation_flux", + "units": "lux h" + }, + "068": { + "_variable_name": "rf8_radiation", + "least_significant_digit": "", + "long_name": "RF8 direct solar radiation", + "original_units": "0.001 MJ/m", + "raw_units": "W m-2 h-1", + "scale_factor": 277.77777777777777, + "standard_name": "solar_radiation_flux", + "units": "W m-2 h-1" + }, + "069": { + "_variable_name": "wind_dir_45B", + "least_significant_digit": "", + "long_name": "Direction - 45B anemometer (8 pts)", + "original_units": "10's of degrees", + "raw_units": "deg", + "scale_factor": 1, + "standard_name": "wind_to_direction", + "units": "deg" + }, + "071": { + "_variable_name": "ceiling_hgt", + "least_significant_digit": "", + "long_name": "Ceiling height of lowest layer of clouds", + "original_units": "30's of meters", + "raw_units": "m", + "scale_factor": 30, + "standard_name": "ceiling_cloud_height", + "units": "m" + }, + "072": { + "_variable_name": "visibility", + "least_significant_digit": "", + "long_name": "Visibility", + "original_units": "0.1 km", + "raw_units": "km", + "scale_factor": 0.1, + "standard_name": "visibility_in_air", + "units": "m" + }, + "073": { + "_variable_name": 
"psl", + "least_significant_digit": "", + "long_name": "Sea Level Pressure", + "original_units": "0.01 kPa", + "raw_units": "Pa", + "scale_factor": 10, + "standard_name": "air_pressure_at_mean_sea_level", + "units": "Pa" + }, + "074": { + "_variable_name": "tds", + "least_significant_digit": "", + "long_name": "Dew Point Temperature", + "original_units": "0.1 °C", + "raw_units": "degC", + "scale_factor": 0.1, + "standard_name": "dew_point_temperature", + "units": "K" + }, + "075": { + "_variable_name": "wind_dir_u2a_16", + "least_significant_digit": "", + "long_name": "Wind Direction at 2 m (U2A Anemometer) (16 pts)", + "original_units": "10's of degrees", + "raw_units": "deg", + "scale_factor": 10, + "standard_name": "wind_direction_u2a", + "units": "deg" + }, + "076": { + "_variable_name": "wind_speed_u2a", + "least_significant_digit": "", + "long_name": "Wind Speed - U2A (16 pts) to December 1970", + "original_units": "km/h", + "raw_units": "km h-1", + "scale_factor": 1, + "standard_name": "wind_speed_u2a", + "units": "m s-1" + }, + "077": { + "_variable_name": "pressure", + "least_significant_digit": "", + "long_name": "Station Pressure", + "original_units": "0.01 kPa", + "raw_units": "Pa", + "scale_factor": 10, + "standard_name": "atmospheric_pressure", + "units": "Pa" + }, + "078": { + "_variable_name": "tas_dry", + "least_significant_digit": "", + "long_name": "Dry Bulb Temperature", + "original_units": "0.1 °C", + "raw_units": "degC", + "scale_factor": 0.1, + "standard_name": "dry_bulb_temperature", + "units": "K" + }, + "079": { + "_variable_name": "tas_wet", + "least_significant_digit": "", + "long_name": "Wet Bulb temperature", + "original_units": "0.1 °C", + "raw_units": "degC", + "scale_factor": 0.1, + "standard_name": "wet_bulb_temperature", + "units": "K" + }, + "080": { + "_variable_name": "hur", + "least_significant_digit": "", + "long_name": "Relative Humidity", + "original_units": "%", + "raw_units": "1", + "scale_factor": 1, + "standard_name": 
"relative_humidity", + "units": "1" + }, + "081": { + "_variable_name": "clo", + "least_significant_digit": "", + "long_name": "Total Cloud Opacity", + "original_units": "%", + "raw_units": "1", + "scale_factor": 10, + "standard_name": "cloud_albedo", + "units": "1" + }, + "082": { + "_variable_name": "clt", + "least_significant_digit": "", + "long_name": "Total Cloud Amount", + "original_units": "%", + "raw_units": "1", + "scale_factor": 10, + "standard_name": "cloud_area_fraction", + "units": "1" + }, + "089": { + "_variable_name": "freeze_rain", + "least_significant_digit": "", + "long_name": "Freezing Rain", + "raw_units": "1", + "scale_factor": 1, + "standard_name": "freezing_rain", + "units": "1" + }, + "094": { + "_variable_name": "ice_pellets", + "least_significant_digit": "", + "long_name": "Ice Pellets", + "raw_units": "1", + "scale_factor": 1, + "standard_name": "ice_pellet_presence", + "units": "1" + }, + "107": { + "_variable_name": "1low_cloud_opac", + "least_significant_digit": "", + "long_name": "Lowest cloud layer opacity", + "original_units": "Tenths", + "raw_units": "1", + "scale_factor": 10, + "standard_name": "low_type_cloud_opacity_fraction", + "units": "1" + }, + "108": { + "_variable_name": "1low_cloud_frac", + "least_significant_digit": "", + "long_name": "Lowest cloud layer amount or condition", + "original_units": "Tenths", + "raw_units": "1", + "scale_factor": 10, + "standard_name": "low_type_cloud_area_fraction", + "units": "1" + }, + "109": { + "_variable_name": "1low_cloud_type", + "least_significant_digit": "", + "long_name": "Lowest cloud layer type", + "raw_units": "1", + "scale_factor": 1, + "standard_name": "low_type_cloud_type", + "units": "1" + }, + "110": { + "_variable_name": "1low_cloud_hgt", + "least_significant_digit": "", + "long_name": "Lowest cloud layer height", + "original_units": "30's of meters", + "raw_units": "m", + "scale_factor": 30, + "standard_name": "low_type_cloud_height", + "units": "m" + }, + "111": { + 
"_variable_name": "2low_cloud_opac",
+      "least_significant_digit": "",
+      "long_name": "Second lowest cloud layer opacity",
+      "original_units": "Tenths",
+      "raw_units": "1",
+      "scale_factor": 10,
+      "standard_name": "low_type_cloud_opacity_fraction",
+      "units": "1"
+    },
+    "112": {
+      "_variable_name": "2low_cloud_frac",
+      "least_significant_digit": "",
+      "long_name": "Second lowest cloud layer amount or condition",
+      "original_units": "Tenths",
+      "raw_units": "1",
+      "scale_factor": 10,
+      "standard_name": "low_type_cloud_area_fraction",
+      "units": "1"
+    },
+    "113": {
+      "_variable_name": "2low_cloud_type",
+      "least_significant_digit": "",
+      "long_name": "Second lowest cloud layer type",
+      "original_units": "",
+      "raw_units": "1",
+      "scale_factor": 1,
+      "standard_name": "low_type_cloud_type",
+      "units": "1"
+    },
+    "114": {
+      "_variable_name": "2low_cloud_hgt",
+      "least_significant_digit": "",
+      "long_name": "Second lowest cloud layer height",
+      "original_units": "30's of meters",
+      "raw_units": "m",
+      "scale_factor": 30,
+      "standard_name": "low_type_cloud_height",
+      "units": "m"
+    },
+    "115": {
+      "_variable_name": "3low_cloud_opac",
+      "least_significant_digit": "",
+      "long_name": "Third lowest cloud layer opacity",
+      "original_units": "Tenths",
+      "raw_units": "1",
+      "scale_factor": 10,
+      "standard_name": "low_type_cloud_opacity_fraction",
+      "units": "1"
+    },
+    "116": {
+      "_variable_name": "3low_cloud_frac",
+      "least_significant_digit": "",
+      "long_name": "Third lowest cloud layer amount or condition",
+      "original_units": "Tenths",
+      "raw_units": "1",
+      "scale_factor": 10,
+      "standard_name": "low_type_cloud_area_fraction",
+      "units": "1"
+    },
+    "117": {
+      "_variable_name": "3low_cloud_type",
+      "least_significant_digit": "",
+      "long_name": "Third lowest cloud layer type",
+      "original_units": "",
+      "raw_units": "1",
+      "scale_factor": 1,
+      "standard_name": "low_type_cloud_type",
+      "units": "1"
+    },
+    "118": {
+      "_variable_name": "3low_cloud_hgt",
+      "least_significant_digit": 
"", + "long_name": "Third lowest cloud layer height", + "original_units": "30's of meters", + "raw_units": "m", + "scale_factor": 30, + "standard_name": "low_type_cloud_height", + "units": "m" + }, + "123": { + "_variable_name": "rainfall", + "least_significant_digit": "", + "long_name": "Total Rainfall", + "original_units": "0.1 mm", + "raw_units": "mm h-1", + "scale_factor": 0.1, + "standard_name": "rainfall_flux", + "units": "kg m2 s-1" + }, + "133": { + "_variable_name": "sun", + "least_significant_digit": "", + "long_name": "Sunshine", + "original_units": "0.1 hrs", + "raw_units": "h", + "scale_factor": 0.1, + "standard_name": "duration_of_sunshine", + "units": "s" + }, + "156": { + "_variable_name": "wind_dir_u2a_36", + "least_significant_digit": "", + "long_name": "Wind Direction - U2A (36 pts) from January 1971", + "original_units": "10's of degrees", + "raw_units": "deg", + "scale_factor": 10, + "standard_name": "wind_direction_u2a", + "units": "deg" + }, + "262": { + "_variable_name": "prtot", + "least_significant_digit": "", + "long_name": "Total Precipitation (minutes 00-60)", + "original_units": "0.1 mm", + "raw_units": "mm", + "scale_factor": 0.1, + "standard_name": "precipitation_amount", + "units": "kg m-2" + }, + "263": { + "_variable_name": "prtot_q1", + "least_significant_digit": "", + "long_name": "Total Precipitation (minutes 00-15)", + "original_units": "0.1 mm", + "raw_units": "mm", + "scale_factor": 0.1, + "standard_name": "precipitation_amount", + "units": "kg m-2" + }, + "264": { + "_variable_name": "prtot_q2", + "least_significant_digit": "", + "long_name": "Total Precipitation (minutes 15-30)", + "original_units": "0.1 mm", + "raw_units": "mm", + "scale_factor": 0.1, + "standard_name": "precipitation_amount", + "units": "kg m-2" + }, + "265": { + "_variable_name": "prtot_q3", + "least_significant_digit": "", + "long_name": "Total Precipitation (minutes 30-45)", + "original_units": "0.1 mm", + "raw_units": "mm", + "scale_factor": 0.1, + 
"standard_name": "precipitation_amount", + "units": "kg m-2" + }, + "266": { + "_variable_name": "prtot_q4", + "least_significant_digit": "", + "long_name": "Total Precipitation (minutes 45-60)", + "original_units": "0.1 mm", + "raw_units": "mm", + "scale_factor": 0.1, + "standard_name": "precipitation_amount", + "units": "kg m-2" + }, + "267": { + "_variable_name": "precipitation_weight_q1", + "least_significant_digit": "", + "long_name": "Precipitation Gauge Weight per Unit Area (at minute 15)", + "original_units": "0.1 kg/m²", + "raw_units": "kg m-2", + "scale_factor": 0.1, + "standard_name": "precipitation_amount", + "units": "kg m-2" + }, + "268": { + "_variable_name": "precipitation_weight_q2", + "least_significant_digit": "", + "long_name": "Precipitation Gauge Weight per Unit Area (at minute 30)", + "original_units": "0.1 kg/m²", + "raw_units": "kg m-2", + "scale_factor": 0.1, + "standard_name": "precipitation_amount", + "units": "kg m-2" + }, + "269": { + "_variable_name": "precipitation_weight_q3", + "least_significant_digit": "", + "long_name": "Precipitation Gauge Weight per Unit Area (at minute 45)", + "original_units": "0.1 kg/m²", + "raw_units": "kg m-2", + "scale_factor": 0.1, + "standard_name": "precipitation_amount", + "units": "kg m-2" + }, + "270": { + "_variable_name": "precipitation_weight_q4", + "least_significant_digit": "", + "long_name": "Precipitation Gauge Weight per Unit Area (at minute 60)", + "original_units": "0.1 kg/m²", + "raw_units": "kg m-2", + "scale_factor": 0.1, + "standard_name": "precipitation_amount", + "units": "kg m-2" + }, + "271": { + "_variable_name": "wind_speed_q1", + "least_significant_digit": "", + "long_name": "Wind Speed at 2 m (minutes 00-15)", + "nc_units": "m s-1", + "original_units": "0.1 km/h", + "raw_units": "km h-1", + "scale_factor": 0.1, + "standard_name": "wind_speed" + }, + "272": { + "_variable_name": "wind_speed_q2", + "least_significant_digit": "", + "long_name": "Wind Speed at 2 m (minutes 15-30)", 
+ "nc_units": "m s-1", + "original_units": "0.1 km/h", + "raw_units": "km h-1", + "scale_factor": 0.1, + "standard_name": "wind_speed" + }, + "273": { + "_variable_name": "wind_speed_q3", + "least_significant_digit": "", + "long_name": "Wind Speed at 2 m (minutes 30-45)", + "nc_units": "m s-1", + "original_units": "0.1 km/h", + "raw_units": "km h-1", + "scale_factor": 0.1, + "standard_name": "wind_speed" + }, + "274": { + "_variable_name": "wind_speed_q4", + "least_significant_digit": "", + "long_name": "Wind Speed at 2 m (minutes 45-60)", + "nc_units": "m s-1", + "original_units": "0.1 km/h", + "raw_units": "km h-1", + "scale_factor": 0.1, + "standard_name": "wind_speed" + }, + "275": { + "_variable_name": "snd_q4", + "least_significant_digit": "", + "long_name": "Snow Depth (at minute 60)", + "original_units": "cm", + "raw_units": "cm", + "scale_factor": 1, + "standard_name": "surface_snow_thickness", + "units": "m" + }, + "276": { + "_variable_name": "snd_q1", + "least_significant_digit": "", + "long_name": "Snow Depth (at minute 15)", + "original_units": "cm", + "raw_units": "cm", + "scale_factor": 1, + "standard_name": "surface_snow_thickness", + "units": "m" + }, + "277": { + "_variable_name": "snd_q2", + "least_significant_digit": "", + "long_name": "Snow Depth (at minute 30)", + "original_units": "cm", + "raw_units": "cm", + "scale_factor": 1, + "standard_name": "surface_snow_thickness", + "units": "m" + }, + "278": { + "_variable_name": "snd_q3", + "least_significant_digit": "", + "long_name": "Snow Depth (at minute 45)", + "original_units": "cm", + "raw_units": "cm", + "scale_factor": 1, + "standard_name": "surface_snow_thickness", + "units": "m" + }, + "279": { + "_variable_name": "wind_dir", + "least_significant_digit": "", + "long_name": "Wind Direction at 2 m (minutes 50-60)", + "nc_units": "deg", + "original_units": "Degrees", + "raw_units": "deg", + "scale_factor": 1, + "standard_name": "wind_direction" + }, + "280": { + "_variable_name": 
"wind_speed", + "least_significant_digit": "", + "long_name": "Wind Speed at 2 m (minutes 50-60)", + "original_units": "0.1 km/h", + "raw_units": "km h-1", + "scale_factor": 0.1, + "standard_name": "wind_speed", + "units": "m s-1" + } } } diff --git a/miranda/preprocess/eccc.py b/miranda/preprocess/eccc.py index 688ed196..be087085 100644 --- a/miranda/preprocess/eccc.py +++ b/miranda/preprocess/eccc.py @@ -2,122 +2,163 @@ from __future__ import annotations +import contextlib import json import logging.config import multiprocessing as mp import os +import tempfile from functools import partial from pathlib import Path +from typing import Callable + +from dask.distributed import ProgressBar -from miranda.eccc._utils import cf_station_metadata -from miranda.preprocess._eccc_obs import _convert_station_file from miranda.scripting import LOGGING_CONFIG +from miranda.storage import file_size, report_file_size +from miranda.utils import generic_extract_archive logging.config.dictConfig(LOGGING_CONFIG) _data_folder = Path(__file__).parent / "configs" -eccc_observation_variables = dict() -eccc_observation_variables["flat"] = [ - v for v in json.load(open(_data_folder / "eccc-obs_attrs.json"))["variables"].keys() -] -eccc_observation_variables["summary"] = [ - attrs["_cf_variable_name"] - for attrs in json.load(open(_data_folder / "eccc-obs-summary_attrs.json"))[ - "variables" - ].values() -] -eccc_observation_variables["homogenized"] = [ - attrs["_cf_variable_name"] - for attrs in json.load(open(_data_folder / "eccc-homogenized_attrs.json"))[ - "variables" - ].values() -] - - -def convert_flat_files( - source_files: str | os.PathLike, - output_folder: str | os.PathLike | list[str | int], - variables: str | int | list[str | int], - mode: str = "hourly", - n_workers: int = 4, -) -> None: - """ + +def _run_func_on_archive_with_optional_dask( + file: Path, + function: Callable, + errored_files: list[Path], + **dask_kwargs, +): + r"""Run a function on a file archive, extracting 
it if necessary. + + Notes + ----- + If the file is larger than 1 GiB or dask_kwargs are passed, dask.dataframes will be used. + Partial function requires the function to accept the following parameters: + - file: Path + - using_dask: bool + - client: dask.distributed.Client Parameters ---------- - source_files: str or Path - output_folder: str or Path - variables: str or List[str] - mode: {"hourly", "daily"} - n_workers: int + file: Path + File archive to process. + function: Callable + Function to run on the file. + errored_files: list[Path] + List of files that errored during processing. + \*\*dask_kwargs + Keyword arguments to pass to dask.distributed.Client. Returns ------- - None + """ - if mode.lower() in ["h", "hour", "hourly"]: - num_observations = 24 - column_names = ["code", "year", "month", "day", "code_var"] - column_dtypes = [str, float, float, float, str] - elif mode.lower() in ["d", "day", "daily"]: - num_observations = 31 - column_names = ["code", "year", "month", "code_var"] - column_dtypes = [str, float, float, str] - else: - raise NotImplementedError("`mode` must be 'h'/'hourly or 'd'/'daily'.") - - # Preparing the data column headers - for i in range(1, num_observations + 1): - data_entry, flag_entry = f"D{i:0n}", f"F{i:0n}" - column_names.append(data_entry) - column_names.append(flag_entry) - column_dtypes.extend([str, str]) - - if isinstance(variables, (str, int)): - variables = [variables] - - for variable_code in variables: - variable_code = str(variable_code).zfill(3) - metadata = cf_station_metadata(variable_code) - nc_name = metadata["nc_name"] - - rep_nc = Path(output_folder).joinpath(nc_name) - rep_nc.mkdir(parents=True, exist_ok=True) - - # Loop on the files - logging.info( - f"Collecting files for variable '{metadata['standard_name']}' " - f"(filenames containing '{metadata['_table_name']}')." 
- ) - list_files = list() - if isinstance(source_files, list) or Path(source_files).is_file(): - list_files.append(source_files) + + with tempfile.TemporaryDirectory() as temp_folder: + if file.suffix in [".gz", ".tar", ".zip", ".7z"]: + data_files = generic_extract_archive(file, output_dir=temp_folder) else: - glob_patterns = [g for g in metadata["_table_name"]] - for pattern in glob_patterns: - list_files.extend( - [f for f in Path(source_files).rglob(f"{pattern}*") if f.is_file()] + data_files = [file] + logging.info(f"Processing file: {file}.") + + # 1 GiB + size_limit = 2**30 + + for data in data_files: + size = file_size(data) + if size > size_limit or dask_kwargs: + if size > size_limit: + logging.info( + f"File exceeds {report_file_size(size_limit)} - Using dask.dataframes." + ) + client = ProgressBar + using_dask = True + else: + logging.info( + f"File below {report_file_size(size_limit)} - Using pandas.dataframes." ) - manager = mp.Manager() - errored_files = manager.list() - converter_func = partial( - _convert_station_file, - output_path=rep_nc, - errored_files=errored_files, - mode=mode, - variable_code=variable_code, - column_names=column_names, - column_dtypes=column_dtypes, - **metadata, - ) - with mp.Pool(processes=n_workers) as pool: - pool.map(converter_func, list_files) - pool.close() - pool.join() - - if errored_files: - logging.warning( - "Some files failed to be properly parsed:\n", ", ".join(errored_files) - ) + client = contextlib.nullcontext + using_dask = False + + with client(**dask_kwargs) as c: + try: + function(data, using_dask=using_dask, client=c) + except FileNotFoundError: + errored_files.append(data) + + if os.listdir(temp_folder): + for temporary_file in Path(temp_folder).glob("*"): + if temporary_file in data_files: + temporary_file.unlink() + + +# def convert_flat_files( +# source_files: str | os.PathLike, +# output_folder: str | os.PathLike | list[str | int], +# variables: str | int | list[str | int], +# project: str = 
"eccc-obs", +# mode: str = "hourly", +# **dask_kwargs, +# ) -> None: +# """ +# +# Parameters +# ---------- +# source_files: str or Path +# output_folder: str or Path +# variables: str or List[str] +# project: {"eccc-obs", "eccc-obs-summary", "eccc-homogenized"} +# mode: {"hourly", "daily"} +# +# Returns +# ------- +# None +# """ +# +# if isinstance(variables, (str, int)): +# variables = [variables] +# +# for variable_code in variables: +# variable_code = str(variable_code).zfill(3) +# metadata = load_json_data_mappings("eccc-obs").get(variable_code) +# +# +# +# # Loop on the files +# logging.info( +# f"Collecting files for variable '{metadata['standard_name']}' " +# f"(filenames containing '{metadata['_table_name']}')." +# ) +# list_files = list() +# if isinstance(source_files, list) or Path(source_files).is_file(): +# list_files.append(source_files) +# else: +# glob_patterns = [g for g in metadata["_table_name"]] +# for pattern in glob_patterns: +# list_files.extend( +# [f for f in Path(source_files).rglob(f"{pattern}*") if f.is_file()] +# ) +# +# +# +# +# manager = mp.Manager() +# errored_files = manager.list() +# converter_func = partial( +# _convert_station_file, +# output_path=rep_nc, +# errored_files=errored_files, +# mode=mode, +# variable_code=variable_code, +# column_names=column_names, +# column_dtypes=column_dtypes, +# **metadata, +# ) +# with mp.Pool(processes=n_workers) as pool: +# pool.map(converter_func, list_files) +# pool.close() +# pool.join() +# +# diff --git a/miranda/vocabularies/eccc.py b/miranda/vocabularies/eccc.py index f86ebb53..bd739fed 100644 --- a/miranda/vocabularies/eccc.py +++ b/miranda/vocabularies/eccc.py @@ -3,86 +3,91 @@ # For more information see the ECCC Technical Documentation __all__ = [ - "DLY", - "DLY02", - "DLY03", - "DLY04", - "DLY12", - "DLY13", - "DLY21", - "DLY44", - "HLY", - "HLY01", - "HLY01_RCS", - "HLY03", - "HLY10", - "HLY15", - "HLY21", - "MLY", - "MLY04", + "obs_groupings", + "obs_vocabularies", ] 
+obs_vocabularies = dict() + # Hourly Data -HLY01 = [] -HLY01.extend(list(range(71, 123))) # Hourly variables -HLY01.extend([209, 210]) # Wind character and gust speed -HLY01.extend(list(range(219, 231))) # Cloud layers -HLY01.append(244) # Precipitation type -HLY01.append(260) # Freezing fog +obs_vocabularies["HLY01"] = [] +obs_vocabularies["HLY01"].extend(list(range(71, 123))) # Hourly variables +obs_vocabularies["HLY01"].extend([209, 210]) # Wind character and gust speed +obs_vocabularies["HLY01"].extend(list(range(219, 231))) # Cloud layers +obs_vocabularies["HLY01"].append(244) # Precipitation type +obs_vocabularies["HLY01"].append(260) # Freezing fog -HLY01_RCS = HLY01.copy() -HLY01_RCS.extend( +obs_vocabularies["HLY01_RCS"] = obs_vocabularies["HLY01"].copy() +obs_vocabularies["HLY01_RCS"].extend( list(range(262, 281)) ) # Reference Climate Surface (RCS) weather stations -HLY03 = [] -HLY03.extend(list(range(123, 133))) # Hourly rainfall -HLY03.extend([160, 161]) - -HLY10 = [] -HLY10.extend(list(range(61, 69))) # Sunshine -HLY10.extend([133, 169, 170, 171, 172]) # Solar radiation +obs_vocabularies["HLY03"] = [] +obs_vocabularies["HLY03"].extend(list(range(123, 133))) # Hourly rainfall +obs_vocabularies["HLY03"].extend([160, 161]) -HLY15 = [69, 70, 76, 156] # Wind +obs_vocabularies["HLY10"] = [] +obs_vocabularies["HLY10"].extend(list(range(61, 69))) # Sunshine +obs_vocabularies["HLY10"].extend([133, 169, 170, 171, 172]) # Solar radiation -HLY21 = [123] # Fischer/Porter precipitation +obs_vocabularies["HLY15"] = [69, 70, 76, 156] # Wind -HLY = list(set(HLY01 + HLY01_RCS + HLY03 + HLY10 + HLY15 + HLY21)) +obs_vocabularies["HLY21"] = [123] # Fischer/Porter precipitation # Daily Data -DLY02 = [] -DLY02.extend(list(range(1, 26))) # Daily variables -DLY02.append(157) # Direction of extreme gust -DLY02.append(179) # Daily bright sunshine +obs_vocabularies["DLY02"] = [] +obs_vocabularies["DLY02"].extend(list(range(1, 26))) # Daily variables 
+obs_vocabularies["DLY02"].append(157) # Direction of extreme gust +obs_vocabularies["DLY02"].append(179) # Daily bright sunshine -DLY03 = [] -DLY03.extend(list(range(124, 133))) -DLY03.extend([160, 161]) +obs_vocabularies["DLY03"] = [] +obs_vocabularies["DLY03"].extend(list(range(124, 133))) +obs_vocabularies["DLY03"].extend([160, 161]) -DLY04 = DLY02.copy() +obs_vocabularies["DLY04"] = obs_vocabularies["DLY02"].copy() -DLY12 = [] -DLY12.extend(list(range(134, 151))) # Soil temperatures +obs_vocabularies["DLY12"] = [] +obs_vocabularies["DLY12"].extend(list(range(134, 151))) # Soil temperatures -DLY13 = list(range(151, 156)) # Pan evaporation +obs_vocabularies["DLY13"] = list(range(151, 156)) # Pan evaporation -DLY21 = [12] # Precipitation -DLY21.extend(list(range(127, 133))) # Precipitation over time -DLY21.append(161) # Most precipitation in 25 hours +obs_vocabularies["DLY21"] = [12] # Precipitation +obs_vocabularies["DLY21"].extend(list(range(127, 133))) # Precipitation over time +obs_vocabularies["DLY21"].append(161) # Most precipitation in 25 hours -DLY44 = [] -DLY44.extend([1, 2, 3]) # Temperature -DLY44.extend(list(range(10, 18))) # Precipitation - -DLY = list(set(DLY02 + DLY03 + DLY04 + DLY12 + DLY13 + DLY21 + DLY44)) +obs_vocabularies["DLY44"] = [] +obs_vocabularies["DLY44"].extend([1, 2, 3]) # Temperature +obs_vocabularies["DLY44"].extend(list(range(10, 18))) # Precipitation # Monthly data -MLY04 = [] -MLY04.extend(list(range(26, 39))) # Days with variables -MLY04.extend(list(range(39, 61))) # Means of variables -MLY04.append(158) # Direction of extreme gust - -MLY = list(set(MLY04)) +obs_vocabularies["MLY04"] = [] +obs_vocabularies["MLY04"].extend(list(range(26, 39))) # Days with variables +obs_vocabularies["MLY04"].extend(list(range(39, 61))) # Means of variables +obs_vocabularies["MLY04"].append(158) # Direction of extreme gust + +# Groupings + +obs_groupings = dict() +obs_groupings["HLY"] = list( + obs_vocabularies["HLY01"] + + 
obs_vocabularies["HLY01_RCS"] + + obs_vocabularies["HLY03"] + + obs_vocabularies["HLY10"] + + obs_vocabularies["HLY15"] + + obs_vocabularies["HLY21"] +) +obs_groupings["DLY"] = list( + set( + obs_vocabularies["DLY02"] + + obs_vocabularies["DLY03"] + + obs_vocabularies["DLY04"] + + obs_vocabularies["DLY12"] + + obs_vocabularies["DLY13"] + + obs_vocabularies["DLY21"] + + obs_vocabularies["DLY44"] + ) +) +obs_groupings["MLY"] = list(set(obs_vocabularies["MLY04"])) From 010d3ea7d4180fffba4113d3a203bd46537345b5 Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Thu, 10 Aug 2023 12:36:13 -0400 Subject: [PATCH 15/33] reduce amount of unit conversions --- miranda/preprocess/_eccc_obs.py | 26 +- .../preprocess/configs/eccc-obs_attrs.json | 243 ++++-------------- miranda/preprocess/eccc.py | 7 +- 3 files changed, 60 insertions(+), 216 deletions(-) diff --git a/miranda/preprocess/_eccc_obs.py b/miranda/preprocess/_eccc_obs.py index 6ff64d8f..a3c55dfc 100644 --- a/miranda/preprocess/_eccc_obs.py +++ b/miranda/preprocess/_eccc_obs.py @@ -40,6 +40,15 @@ TABLE_DATE = dt.now().strftime("%d %B %Y") +def _remove_duplicates(ds): + if any(ds.get_index("time").duplicated()): + logging.info( + f"Found {ds.get_index('time').duplicated().sum()} duplicated time coordinates " + f"for station {ds.station_id.values}. Assuming first value." + ) + return ds.sel(time=~ds.get_index("time").duplicated()) + + def convert_observation( data_source: str | Path | list[str | Path], output_dir: str | Path, @@ -94,15 +103,6 @@ def convert_observation( ) -def _remove_duplicates(ds): - if any(ds.get_index("time").duplicated()): - logging.info( - f"Found {ds.get_index('time').duplicated().sum()} duplicated time coordinates " - f"for station {ds.station_id.values}. Assuming first value." 
- ) - return ds.sel(time=~ds.get_index("time").duplicated()) - - def convert_station( data: str | os.PathLike, variable: str, @@ -665,7 +665,7 @@ def _combine_years( if _verbose: logging.info(f"Opening: {', '.join([p.name for p in nc_files])}") - ds = xr.open_mfdataset(nc_files, combine="nested", concat_dim={"time"}) + ds = xr.open_mfdataset(nc_files, combine="nested", concat_dim="time") outfile = Path(out_folder).joinpath( f'{nc_files[0].name.split(f"_{varia}_")[0]}_{varia}_' f"{ds.time.dt.year.min().values}-{ds.time.dt.year.max().values}.nc" @@ -702,9 +702,9 @@ def _combine_years( "elevation", ] for vv in meta.data_vars: - if vv.lower() not in keep_coords: + if str(vv).lower() not in keep_coords: continue - ds = ds.assign_coords({vv.lower(): meta[vv]}) + ds = ds.assign_coords({str(vv).lower(): meta[vv]}) for vv in ds.data_vars: if ds[vv].dtype == "O": @@ -730,7 +730,6 @@ def merge_converted_variables( source_files: str | os.PathLike, output_folder: str | os.PathLike, variables: str | int | list[str | int] | None = None, - station_metadata: str | os.PathLike | None = None, overwrite: bool = False, n_workers: int = 1, ) -> None: @@ -741,7 +740,6 @@ def merge_converted_variables( source_files : str, Path output_folder : str, Path variables : str or int or list of str or int, optional - station_metadata : str or Path, optional overwrite : bool n_workers : int diff --git a/miranda/preprocess/configs/eccc-obs_attrs.json b/miranda/preprocess/configs/eccc-obs_attrs.json index 7eb3b7b6..300b559a 100644 --- a/miranda/preprocess/configs/eccc-obs_attrs.json +++ b/miranda/preprocess/configs/eccc-obs_attrs.json @@ -28,730 +28,577 @@ "variables": { "001": { "_variable_name": "tasmax", - "least_significant_digit": "", "long_name": "Daily Maximum Temperature", "original_units": "0.1 °C", - "raw_units": "degC", "scale_factor": 0.1, "standard_name": "air_temperature_maximum", - "units": "K" + "units": "degC" }, "002": { "_variable_name": "tasmin", - "least_significant_digit": "", 
"long_name": "Daily Minimum Temperature", "original_units": "0.1 °C", - "raw_units": "degC", "scale_factor": 0.1, "standard_name": "air_temperature_minimum", - "units": "K" + "units": "degC" }, "003": { "_variable_name": "tas", - "least_significant_digit": "", "long_name": "Daily Mean Temperature", "original_units": "0.1 °C", - "raw_units": "degC", "scale_factor": 0.1, "standard_name": "air_temperature", - "units": "K" + "units": "degC" }, "010": { "_variable_name": "prlptot", - "least_significant_digit": "", "long_name": "Daily Total Rainfall", "original_units": "0.1 mm day-1", - "raw_units": "mm", "scale_factor": 0.1, "standard_name": "liquid_precipitation_amount", - "units": "m" + "units": "mmn day-1" }, "011": { "_variable_name": "prsntot", - "least_significant_digit": "", "long_name": "Daily Total Snowfall", "original_units": "0.1 cm day-1", - "raw_units": "cm", "scale_factor": 0.1, "standard_name": "solid_precipitation_amount", - "units": "m" + "units": "cm day-1" }, "012": { "_variable_name": "prcptot", - "least_significant_digit": "", "long_name": "Daily Total Precipitation", "original_units": "0.1 mm day-1", - "raw_units": "mm", "scale_factor": 0.1, "standard_name": "precipitation_amount", - "units": "m" + "units": "mm day-1" }, "013": { "_variable_name": "sndtot", - "least_significant_digit": "", "long_name": "Snow on the Ground", "original_units": "cm", - "raw_units": "cm", "scale_factor": 1, "standard_name": "surface_snow_thickness", - "units": "m" + "units": "cm" }, "014": { "_variable_name": "thunder", - "least_significant_digit": "", "long_name": "Thunderstorms", - "raw_units": "1", "scale_factor": 1, "standard_name": "thunderstorm_presence", "units": "1" }, "015": { "_variable_name": "freezing_rain_drizzle", - "least_significant_digit": "", "long_name": "Freezing rain or drizzle", - "raw_units": "1", "scale_factor": 1, "standard_name": "freeze_rain_drizzle_presence", "units": "1" }, "016": { "_variable_name": "hail", - "least_significant_digit": "", 
"long_name": "Hail", - "raw_units": "1", "scale_factor": 1, "standard_name": "hail_presence", "units": "1" }, "017": { "_variable_name": "fog_ice_fog", - "least_significant_digit": "", "long_name": "Fog or Ice Fog", - "raw_units": "1", "scale_factor": 1, "standard_name": "fog_ice_fog_presence", "units": "1" }, "018": { "_variable_name": "smoke_haze", - "least_significant_digit": "", "long_name": "Smoke or Haze", - "raw_units": "1", "scale_factor": 1, "standard_name": "smoke_haze_presence", "units": "1" }, "019": { "_variable_name": "blowing_dust_sand", - "least_significant_digit": "", "long_name": "Blowing Dust or Sand", - "raw_units": "1", "scale_factor": 1, "standard_name": "blowing_dust_sand_presence", "units": "1" }, "020": { "_variable_name": "blow_snow", - "least_significant_digit": "", "long_name": "Blowing snow", - "raw_units": "1", "scale_factor": 1, "standard_name": "blowing_snow_presence", "units": "1" }, "021": { "_variable_name": "wind_gt_28kt", - "least_significant_digit": "", "long_name": "Wind speed >= 28 Knots", - "raw_units": "1", "scale_factor": 1, "standard_name": "wind_exceeding_28_knots", "units": "1" }, "022": { "_variable_name": "wind_gt_34kt", - "least_significant_digit": "", "long_name": "Wind speed >= 34 Knots", - "raw_units": "1", "scale_factor": 1, "standard_name": "wind_exceeding_34_knots", "units": "1" }, "023": { "_variable_name": "gust_dir_16pts", - "least_significant_digit": "", "long_name": "Direction of extreme gust (16 pts) to December 1976", "original_units": "10's of degrees", - "raw_units": "deg", "scale_factor": 10, "standard_name": "gust_to_direction", "units": "deg" }, "024": { "_variable_name": "gust_speed", - "least_significant_digit": "", "long_name": "Speed of extreme gust", "original_units": "km/h", - "raw_units": "km h-1", - "scale_factor": 1, "standard_name": "wind_speed_of_gust", - "units": "m s-1" + "units": "km h-1" }, "025": { "_variable_name": "gust_hour", - "least_significant_digit": "", "long_name": "UTC hour 
of extreme gust", - "raw_units": "h", - "scale_factor": 1, "standard_name": "hour_of_extreme_gust", "units": "h" }, "061": { "_variable_name": "rf1_radiation", - "least_significant_digit": "", "long_name": "RF1 global solar radiation", "original_units": "0.001 MJ/m", - "raw_units": "W m-2 h-1", - "scale_factor": 277.77777777777777, + "scale_factor": 0.001, "standard_name": "solar_radiation_flux", - "units": "W m-2 h-1" + "units": "MJ m-1" }, "062": { "_variable_name": "rf2_radiation", - "least_significant_digit": "", "long_name": "RF2 sky (diffuse) radiation", "original_units": "0.001 MJ/m", - "raw_units": "W m-2 h-1", "scale_factor": 277.77777777777777, "standard_name": "solar_radiation_flux", - "units": "W m-2 h-1" + "units": "MJ m-1" }, "063": { "_variable_name": "rf3_radiation", - "least_significant_digit": "", "long_name": "RF3 reflected solar radiation", "original_units": "0.001 MJ/m", - "raw_units": "W m-2 h-1", "scale_factor": 277.77777777777777, "standard_name": "solar_radiation_flux", - "units": "W m-2 h-1" + "units": "MJ m-1" }, "064": { "_variable_name": "rf4_radiation", - "least_significant_digit": "", "long_name": "RF4 net all wave radiation", "original_units": "0.001 MJ/m", - "raw_units": "W m-2 h-1", "scale_factor": 277.77777777777777, "standard_name": "solar_radiation_flux", - "units": "W m-2 h-1" + "units": "MJ m-1" }, "067": { "_variable_name": "rf7_radiation", - "least_significant_digit": "", "long_name": "RF7 daylight illumination", "original_units": "0.01 Kilolux_hrs", - "raw_units": "lux h", - "scale_factor": 10, + "scale_factor": 0.01, "standard_name": "solar_radiation_flux", - "units": "lux h" + "units": "klux h" }, "068": { "_variable_name": "rf8_radiation", - "least_significant_digit": "", "long_name": "RF8 direct solar radiation", "original_units": "0.001 MJ/m", - "raw_units": "W m-2 h-1", "scale_factor": 277.77777777777777, "standard_name": "solar_radiation_flux", "units": "W m-2 h-1" }, "069": { "_variable_name": "wind_dir_45B", - 
"least_significant_digit": "", "long_name": "Direction - 45B anemometer (8 pts)", "original_units": "10's of degrees", - "raw_units": "deg", - "scale_factor": 1, + "scale_factor": 10, "standard_name": "wind_to_direction", "units": "deg" }, "071": { "_variable_name": "ceiling_hgt", - "least_significant_digit": "", "long_name": "Ceiling height of lowest layer of clouds", "original_units": "30's of meters", - "raw_units": "m", "scale_factor": 30, "standard_name": "ceiling_cloud_height", "units": "m" }, "072": { "_variable_name": "visibility", - "least_significant_digit": "", "long_name": "Visibility", "original_units": "0.1 km", - "raw_units": "km", "scale_factor": 0.1, "standard_name": "visibility_in_air", - "units": "m" + "units": "km" }, "073": { "_variable_name": "psl", - "least_significant_digit": "", "long_name": "Sea Level Pressure", "original_units": "0.01 kPa", - "raw_units": "Pa", - "scale_factor": 10, + "scale_factor": 0.01, "standard_name": "air_pressure_at_mean_sea_level", - "units": "Pa" + "units": "kPa" }, "074": { "_variable_name": "tds", - "least_significant_digit": "", "long_name": "Dew Point Temperature", "original_units": "0.1 °C", - "raw_units": "degC", "scale_factor": 0.1, "standard_name": "dew_point_temperature", - "units": "K" + "units": "degC" }, "075": { "_variable_name": "wind_dir_u2a_16", - "least_significant_digit": "", "long_name": "Wind Direction at 2 m (U2A Anemometer) (16 pts)", "original_units": "10's of degrees", - "raw_units": "deg", "scale_factor": 10, "standard_name": "wind_direction_u2a", "units": "deg" }, "076": { "_variable_name": "wind_speed_u2a", - "least_significant_digit": "", "long_name": "Wind Speed - U2A (16 pts) to December 1970", "original_units": "km/h", - "raw_units": "km h-1", "scale_factor": 1, "standard_name": "wind_speed_u2a", - "units": "m s-1" + "units": "km h-1" }, "077": { "_variable_name": "pressure", - "least_significant_digit": "", "long_name": "Station Pressure", "original_units": "0.01 kPa", - 
"raw_units": "Pa", - "scale_factor": 10, + "scale_factor": 0.01, "standard_name": "atmospheric_pressure", - "units": "Pa" + "units": "kPa" }, "078": { "_variable_name": "tas_dry", - "least_significant_digit": "", "long_name": "Dry Bulb Temperature", "original_units": "0.1 °C", - "raw_units": "degC", "scale_factor": 0.1, "standard_name": "dry_bulb_temperature", - "units": "K" + "units": "degC" }, "079": { "_variable_name": "tas_wet", - "least_significant_digit": "", "long_name": "Wet Bulb temperature", "original_units": "0.1 °C", - "raw_units": "degC", "scale_factor": 0.1, "standard_name": "wet_bulb_temperature", - "units": "K" + "units": "degC" }, "080": { "_variable_name": "hur", - "least_significant_digit": "", "long_name": "Relative Humidity", "original_units": "%", - "raw_units": "1", "scale_factor": 1, "standard_name": "relative_humidity", "units": "1" }, "081": { "_variable_name": "clo", - "least_significant_digit": "", "long_name": "Total Cloud Opacity", "original_units": "%", - "raw_units": "1", "scale_factor": 10, "standard_name": "cloud_albedo", "units": "1" }, "082": { "_variable_name": "clt", - "least_significant_digit": "", "long_name": "Total Cloud Amount", "original_units": "%", - "raw_units": "1", "scale_factor": 10, "standard_name": "cloud_area_fraction", "units": "1" }, "089": { "_variable_name": "freeze_rain", - "least_significant_digit": "", "long_name": "Freezing Rain", - "raw_units": "1", "scale_factor": 1, "standard_name": "freezing_rain", "units": "1" }, "094": { "_variable_name": "ice_pellets", - "least_significant_digit": "", "long_name": "Ice Pellets", - "raw_units": "1", "scale_factor": 1, "standard_name": "ice_pellet_presence", "units": "1" }, "107": { "_variable_name": "1low_cloud_opac", - "least_significant_digit": "", "long_name": "Lowest cloud layer opacity", "original_units": "Tenths", - "raw_units": "1", "scale_factor": 10, "standard_name": "low_type_cloud_opacity_fraction", "units": "1" }, "108": { "_variable_name": 
"1low_cloud_frac", - "least_significant_digit": "", "long_name": "Lowest cloud layer amount or condition", "original_units": "Tenths", - "raw_units": "1", "scale_factor": 10, "standard_name": "low_type_cloud_area_fraction", "units": "1" }, "109": { "_variable_name": "1low_cloud_type", - "least_significant_digit": "", "long_name": "Lowest cloud layer type", - "raw_units": "1", - "scale_factor": 1, "standard_name": "low_type_cloud_type", "units": "1" }, "110": { "_variable_name": "1low_cloud_hgt", - "least_significant_digit": "", "long_name": "Lowest cloud layer height", "original_units": "30's of meters", - "raw_units": "m", "scale_factor": 30, "standard_name": "low_type_cloud_height", "units": "m" }, "111": { "_variable_name": "2low_cloud_opac", - "least_significant_digit": "", "long_name": "Second lowest cloud layer opacity", "original_units": "Tenths", - "raw_units": "1", "scale_factor": 10, "standard_name": "low_type_cloud_opacity_fraction", "units": "1" }, "112": { "_variable_name": "2low_cloud_frac", - "least_significant_digit": "", "long_name": "Second lowest cloud layer amount or condition", "original_units": "Tenths", - "raw_units": "1", "scale_factor": 10, "standard_name": "low_type_cloud_area_fraction", "units": "1" }, "113": { "_variable_name": "2low_cloud_type", - "least_significant_digit": "", "long_name": "Second lowest cloud layer type", "original_units": "", - "raw_units": "1", "scale_factor": 1, "standard_name": "low_type_cloud_type", "units": "1" }, "114": { "_variable_name": "2low_cloud_hgt", - "least_significant_digit": "", "long_name": "Second lowest cloud layer height", "original_units": "30's of meters", - "raw_units": "m", "scale_factor": 30, "standard_name": "low_type_cloud_height", "units": "m" }, "115": { "_variable_name": "3low_cloud_opac", - "least_significant_digit": "", "long_name": "Thirsd lowest cloud layer opacity", "original_units": "Tenths", - "raw_units": "1", "scale_factor": 10, "standard_name": 
"low_type_cloud_opacity_fraction", "units": "1" }, "116": { "_variable_name": "3low_cloud_frac", - "least_significant_digit": "", "long_name": "Third lowest cloud layer amount or condition", "original_units": "Tenths", - "raw_units": "1", "scale_factor": 10, "standard_name": "low_type_cloud_area_fraction", "units": "1" }, "117": { "_variable_name": "3low_cloud_type", - "least_significant_digit": "", "long_name": "Third lowest cloud layer type", "original_units": "", - "raw_units": "1", "scale_factor": 1, "standard_name": "low_type_cloud_type", "units": "1" }, "118": { "_variable_name": "3low_cloud_hgt", - "least_significant_digit": "", "long_name": "Third lowest cloud layer height", "original_units": "30's of meters", - "raw_units": "m", "scale_factor": 30, "standard_name": "low_type_cloud_height", "units": "m" }, "123": { "_variable_name": "rainfall", - "least_significant_digit": "", "long_name": "Total Rainfall", "original_units": "0.1 mm", - "raw_units": "mm h-1", "scale_factor": 0.1, "standard_name": "rainfall_flux", - "units": "kg m2 s-1" + "units": "mm h-1" }, "133": { "_variable_name": "sun", - "least_significant_digit": "", "long_name": "Sunshine", "original_units": "0.1 hrs", - "raw_units": "h", "scale_factor": 0.1, "standard_name": "duration_of_sunshine", - "units": "s" + "units": "h" }, "156": { "_variable_name": "wind_dir_u2a_36", - "least_significant_digit": "", "long_name": "Wind Direction - U2A (36 pts) from January 1971", "original_units": "10's of degrees", - "raw_units": "deg", "scale_factor": 10, "standard_name": "wind_direction_u2a", "units": "deg" }, "262": { "_variable_name": "prtot", - "least_significant_digit": "", "long_name": "Total Precipitation (minutes 00-60)", "original_units": "0.1 mm", - "raw_units": "mm", "scale_factor": 0.1, "standard_name": "precipitation_amount", - "units": "kg m-2" + "units": "mm" }, "263": { "_variable_name": "prtot_q1", - "least_significant_digit": "", "long_name": "Total Precipitation (minutes 00-15)", 
"original_units": "0.1 mm", - "raw_units": "mm", "scale_factor": 0.1, "standard_name": "precipitation_amount", - "units": "kg m-2" + "units": "mm" }, "264": { "_variable_name": "prtot_q2", - "least_significant_digit": "", "long_name": "Total Precipitation (minutes 15-30)", "original_units": "0.1 mm", - "raw_units": "mm", "scale_factor": 0.1, "standard_name": "precipitation_amount", - "units": "kg m-2" + "units": "mm" }, "265": { "_variable_name": "prtot_q3", - "least_significant_digit": "", "long_name": "Total Precipitation (minutes 30-45)", "original_units": "0.1 mm", - "raw_units": "mm", "scale_factor": 0.1, "standard_name": "precipitation_amount", - "units": "kg m-2" + "units": "mm" }, "266": { "_variable_name": "prtot_q4", - "least_significant_digit": "", "long_name": "Total Precipitation (minutes 45-60)", "original_units": "0.1 mm", - "raw_units": "mm", "scale_factor": 0.1, "standard_name": "precipitation_amount", - "units": "kg m-2" + "units": "mm" }, "267": { "_variable_name": "precipitation_weight_q1", - "least_significant_digit": "", "long_name": "Precipitation Gauge Weight per Unit Area (at minute 15)", "original_units": "0.1 kg/m²", - "raw_units": "kg m-2", "scale_factor": 0.1, "standard_name": "precipitation_amount", "units": "kg m-2" }, "268": { "_variable_name": "precipitation_weight_q2", - "least_significant_digit": "", "long_name": "Precipitation Gauge Weight per Unit Area (at minute 30)", "original_units": "0.1 kg/m²", - "raw_units": "kg m-2", "scale_factor": 0.1, "standard_name": "precipitation_amount", "units": "kg m-2" }, "269": { "_variable_name": "precipitation_weight_q3", - "least_significant_digit": "", "long_name": "Precipitation Gauge Weight per Unit Area (at minute 45)", "original_units": "0.1 kg/m²", - "raw_units": "kg m-2", "scale_factor": 0.1, "standard_name": "precipitation_amount", "units": "kg m-2" }, "270": { "_variable_name": "precipitation_weight_q4", - "least_significant_digit": "", "long_name": "Precipitation Gauge Weight per 
Unit Area (at minute 60)", "original_units": "0.1 kg/m²", - "raw_units": "kg m-2", "scale_factor": 0.1, "standard_name": "precipitation_amount", "units": "kg m-2" }, "271": { "_variable_name": "wind_speed_q1", - "least_significant_digit": "", "long_name": "Wind Speed at 2 m (minutes 00-15)", - "nc_units": "m s-1", "original_units": "0.1 km/h", - "raw_units": "km h-1", "scale_factor": 0.1, - "standard_name": "wind_speed" + "standard_name": "wind_speed", + "units": "km h-1" }, "272": { "_variable_name": "wind_speed_q2", - "least_significant_digit": "", "long_name": "Wind Speed at 2 m (minutes 15-30)", - "nc_units": "m s-1", "original_units": "0.1 km/h", - "raw_units": "km h-1", "scale_factor": 0.1, - "standard_name": "wind_speed" + "standard_name": "wind_speed", + "units": "km h-1" }, "273": { "_variable_name": "wind_speed_q3", - "least_significant_digit": "", "long_name": "Wind Speed at 2 m (minutes 30-45)", - "nc_units": "m s-1", "original_units": "0.1 km/h", - "raw_units": "km h-1", "scale_factor": 0.1, - "standard_name": "wind_speed" + "standard_name": "wind_speed", + "units": "km h-1" }, "274": { "_variable_name": "wind_speed_q4", - "least_significant_digit": "", "long_name": "Wind Speed at 2 m (minutes 45-60)", - "nc_units": "m s-1", "original_units": "0.1 km/h", - "raw_units": "km h-1", "scale_factor": 0.1, - "standard_name": "wind_speed" + "standard_name": "wind_speed", + "units": "km h-1" }, "275": { "_variable_name": "snd_q4", - "least_significant_digit": "", "long_name": "Snow Depth (at minute 60)", "original_units": "cm", - "raw_units": "cm", - "scale_factor": 1, "standard_name": "surface_snow_thickness", - "units": "m" + "units": "cm" }, "276": { "_variable_name": "snd_q1", - "least_significant_digit": "", "long_name": "Snow Depth (at minute 15)", "original_units": "cm", - "raw_units": "cm", "scale_factor": 1, "standard_name": "surface_snow_thickness", - "units": "m" + "units": "cm" }, "277": { "_variable_name": "snd_q2", - "least_significant_digit": "", 
"long_name": "Snow Depth (at minute 30)", "original_units": "cm", - "raw_units": "cm", "scale_factor": 1, "standard_name": "surface_snow_thickness", - "units": "m" + "units": "cm" }, "278": { "_variable_name": "snd_q3", - "least_significant_digit": "", "long_name": "Snow Depth (at minute 45)", "original_units": "cm", - "raw_units": "cm", "scale_factor": 1, "standard_name": "surface_snow_thickness", - "units": "m" + "units": "cm" }, "279": { "_variable_name": "wind_dir", - "least_significant_digit": "", "long_name": "Wind Direction at 2 m (minutes 50-60)", "nc_units": "deg", "original_units": "Degrees", - "raw_units": "deg", - "scale_factor": 1, "standard_name": "wind_direction" }, "280": { "_variable_name": "wind_speed", - "least_significant_digit": "", "long_name": "Wind Speed at 2 m (minutes 50-60)", "original_units": "0.1 km/h", - "raw_units": "km h-1", "scale_factor": 0.1, "standard_name": "wind_speed", - "units": "m s-1" + "units": "km h-1" } } } diff --git a/miranda/preprocess/eccc.py b/miranda/preprocess/eccc.py index be087085..9631090c 100644 --- a/miranda/preprocess/eccc.py +++ b/miranda/preprocess/eccc.py @@ -3,12 +3,9 @@ from __future__ import annotations import contextlib -import json import logging.config -import multiprocessing as mp import os import tempfile -from functools import partial from pathlib import Path from typing import Callable @@ -69,7 +66,9 @@ def _run_func_on_archive_with_optional_dask( for data in data_files: size = file_size(data) if size > size_limit or dask_kwargs: - if size > size_limit: + if dask_kwargs: + logging.info(f"`dask_kwargs` provided - Using dask.dataframes.") + elif size > size_limit: logging.info( f"File exceeds {report_file_size(size_limit)} - Using dask.dataframes." 
) From 7d8fdf9734adea87b110aef9a53f641304695ce9 Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Mon, 14 Aug 2023 14:26:01 -0400 Subject: [PATCH 16/33] refactoring - move treatments to new module, load_json_data_mappings as a dynamic and shared utility, renaming of configuration files for better coupling --- miranda/convert/__init__.py | 1 - miranda/convert/_data_definitions.py | 71 ++----------------- ...trs.json => agcfsr|agmerra2_cf_attrs.json} | 0 ... => cmip5|cmip6|cordex_ouranos_attrs.json} | 0 ...cf_attrs.json => eccc-ahccd_cf_attrs.json} | 0 ...ttrs.json => era5|era5-land_cf_attrs.json} | 0 ..._cf_attrs.json => ets-grnch_cf_attrs.json} | 0 ...ttrs.json => wfdei-gem-capa_cf_attrs.json} | 0 miranda/convert/corrections.py | 41 +++++++++-- miranda/convert/melcc.py | 6 +- miranda/preprocess/_data_definitions.py | 33 +-------- miranda/preprocess/_eccc_obs.py | 4 +- ...nized_attrs.json => eccc-ahccd_attrs.json} | 0 miranda/preprocess/eccc.py | 2 +- miranda/treatments/__init__.py | 5 ++ .../{convert => treatments}/_treatments.py | 43 ++++++++++- 16 files changed, 94 insertions(+), 112 deletions(-) rename miranda/convert/configs/{nasa_ag_cf_attrs.json => agcfsr|agmerra2_cf_attrs.json} (100%) rename miranda/convert/configs/{cmip_ouranos_attrs.json => cmip5|cmip6|cordex_ouranos_attrs.json} (100%) rename miranda/convert/configs/{eccc-homogenized_cf_attrs.json => eccc-ahccd_cf_attrs.json} (100%) rename miranda/convert/configs/{ecmwf_cf_attrs.json => era5|era5-land_cf_attrs.json} (100%) rename miranda/convert/configs/{ets_grnch_cf_attrs.json => ets-grnch_cf_attrs.json} (100%) rename miranda/convert/configs/{usask_cf_attrs.json => wfdei-gem-capa_cf_attrs.json} (100%) rename miranda/preprocess/configs/{eccc-homogenized_attrs.json => eccc-ahccd_attrs.json} (100%) create mode 100644 miranda/treatments/__init__.py rename miranda/{convert => treatments}/_treatments.py (95%) diff --git a/miranda/convert/__init__.py 
b/miranda/convert/__init__.py index bfc31224..3533bcac 100644 --- a/miranda/convert/__init__.py +++ b/miranda/convert/__init__.py @@ -4,6 +4,5 @@ from . import deh, hq, melcc, utils from ._aggregation import * from ._data_definitions import * -from ._treatments import * # from ._reconstruction import * diff --git a/miranda/convert/_data_definitions.py b/miranda/convert/_data_definitions.py index 1b6dbc8a..e592da65 100644 --- a/miranda/convert/_data_definitions.py +++ b/miranda/convert/_data_definitions.py @@ -5,7 +5,6 @@ import logging.config import os from pathlib import Path -from typing import Any from miranda.scripting import LOGGING_CONFIG from miranda.storage import report_file_size @@ -26,7 +25,6 @@ "gather_sc_earth", "gather_wfdei_gem_capa", "gather_emdna", - "load_json_data_mappings", "nasa_ag_variables", "nrcan_variables", "project_institutes", @@ -38,65 +36,6 @@ _config_folder = Path(__file__).resolve().parent / "configs" -def load_json_data_mappings(project: str) -> dict[str, Any]: - """Load JSON mappings for supported dataset conversions. 
- - Parameters - ---------- - project : str - - Returns - ------- - dict[str, Any] - """ - if project.startswith("era5"): - metadata_definition = json.load(open(_config_folder / "ecmwf_cf_attrs.json")) - elif project in ["rdrs-v21"]: - metadata_definition = json.load( - open(_config_folder / "eccc-rdrs_cf_attrs.json") - ) - elif project == "eccc-obs": - metadata_definition = json.load(open(_config_folder / "eccc-obs_cf_attrs.json")) - elif project in ["agcfsr", "agmerra2"]: - metadata_definition = json.load(open(_config_folder / "nasa_ag_cf_attrs.json")) - elif project in ["cordex", "cmip5", "cmip6"]: - metadata_definition = json.load( - open(_config_folder / "cmip_ouranos_attrs.json") - ) - elif project == "ets-grnch": - metadata_definition = json.load( - open(_config_folder / "ets_grnch_cf_attrs.json") - ) - elif project == "nrcan-gridded-10km": - raise NotImplementedError() - elif project == "wfdei-gem-capa": - metadata_definition = json.load(open(_config_folder / "usask_cf_attrs.json")) - elif project == "melcc": - metadata_definition = json.load(open(_config_folder / "melcc_cf_attrs.json")) - elif project == "eccc-canswe": - metadata_definition = json.load( - open(_config_folder / "eccc-canswe_cf_attrs.json") - ) - elif project == "eccc-homogenized": - metadata_definition = json.load( - open(_config_folder / "eccc-homogenized_cf_attrs.json") - ) - elif project in ["NEX-GDDP-CMIP6"]: - metadata_definition = json.load( - open(_config_folder / "nex-gddp-cmip6_attrs.json") - ) - elif project in ["ESPO-G6-R2"]: - metadata_definition = json.load(open(_config_folder / "espo-g6-r2_attrs.json")) - elif project in ["ESPO-G6-E5L"]: - metadata_definition = json.load(open(_config_folder / "espo-g6-e5l_attrs.json")) - elif project in ["EMDNA"]: - metadata_definition = json.load(open(_config_folder / "emdna_cf_attrs.json")) - else: - raise NotImplementedError(f"Project not supported: {project}") - - return metadata_definition - - eccc_rdrs_variables = dict() 
eccc_rdrs_variables["raw"] = [ v @@ -111,18 +50,18 @@ def load_json_data_mappings(project: str) -> dict[str, Any]: ].values() ] -era5_variables = json.load(open(_config_folder / "ecmwf_cf_attrs.json"))[ +era5_variables = json.load(open(_config_folder / "era5|era5-land_cf_attrs.json"))[ "variables" ].keys() grnch_variables = ["T", "Tmin", "Tmax", "P"] nrcan_variables = ["tasmin", "tasmax", "pr"] -nasa_ag_variables = json.load(open(_config_folder / "nasa_ag_cf_attrs.json"))[ +nasa_ag_variables = json.load(open(_config_folder / "agcfsr|agmerra2_cf_attrs.json"))[ "variables" ].keys() sc_earth_variables = ["prcp", "tdew", "tmean", "trange", "wind"] -wfdei_gem_capa_variables = json.load(open(_config_folder / "usask_cf_attrs.json"))[ - "variables" -].keys() +wfdei_gem_capa_variables = json.load( + open(_config_folder / "wfdei-gem-capa_cf_attrs.json") +)["variables"].keys() project_institutes = { "cfsr": "ncar", diff --git a/miranda/convert/configs/nasa_ag_cf_attrs.json b/miranda/convert/configs/agcfsr|agmerra2_cf_attrs.json similarity index 100% rename from miranda/convert/configs/nasa_ag_cf_attrs.json rename to miranda/convert/configs/agcfsr|agmerra2_cf_attrs.json diff --git a/miranda/convert/configs/cmip_ouranos_attrs.json b/miranda/convert/configs/cmip5|cmip6|cordex_ouranos_attrs.json similarity index 100% rename from miranda/convert/configs/cmip_ouranos_attrs.json rename to miranda/convert/configs/cmip5|cmip6|cordex_ouranos_attrs.json diff --git a/miranda/convert/configs/eccc-homogenized_cf_attrs.json b/miranda/convert/configs/eccc-ahccd_cf_attrs.json similarity index 100% rename from miranda/convert/configs/eccc-homogenized_cf_attrs.json rename to miranda/convert/configs/eccc-ahccd_cf_attrs.json diff --git a/miranda/convert/configs/ecmwf_cf_attrs.json b/miranda/convert/configs/era5|era5-land_cf_attrs.json similarity index 100% rename from miranda/convert/configs/ecmwf_cf_attrs.json rename to miranda/convert/configs/era5|era5-land_cf_attrs.json diff --git 
a/miranda/convert/configs/ets_grnch_cf_attrs.json b/miranda/convert/configs/ets-grnch_cf_attrs.json similarity index 100% rename from miranda/convert/configs/ets_grnch_cf_attrs.json rename to miranda/convert/configs/ets-grnch_cf_attrs.json diff --git a/miranda/convert/configs/usask_cf_attrs.json b/miranda/convert/configs/wfdei-gem-capa_cf_attrs.json similarity index 100% rename from miranda/convert/configs/usask_cf_attrs.json rename to miranda/convert/configs/wfdei-gem-capa_cf_attrs.json diff --git a/miranda/convert/corrections.py b/miranda/convert/corrections.py index 8a5ef0ee..e2157de2 100644 --- a/miranda/convert/corrections.py +++ b/miranda/convert/corrections.py @@ -9,8 +9,9 @@ import xarray as xr -from miranda.convert._data_definitions import load_json_data_mappings -from miranda.convert._treatments import ( +from miranda.convert.utils import find_version_hash +from miranda.gis import subset_domain +from miranda.treatments import ( cf_units_conversion, clip_values, conservative_regrid, @@ -18,6 +19,7 @@ dimensions_compliance, ensure_correct_time_frequency, invert_value_sign, + load_json_data_mappings, metadata_conversion, offset_time_dimension, preprocessing_corrections, @@ -25,13 +27,42 @@ transform_values, variable_conversion, ) -from miranda.convert.utils import find_version_hash -from miranda.gis import subset_domain + +CONFIG_FOLDER = Path(__file__).parent / "data" +CONFIG_FILES = { + "EMDNA": "emdna_cf_attrs.json", + "ESPO-G6-E5L": "espo-g6-e5l_attrs.json", + "ESPO-G6-R2": "espo-g6-r2_attrs.json", + "NEX-GDDP-CMIP6": "nex-gddp-cmip6_attrs.json", + "agcfsr": "agcfsr|agmerra2_cf_attrs.json", + "agmerra2": "agcfsr|agmerra2_cf_attrs.json", + "cmip": "cmip5|cmip6|cordex_ouranos_attrs.json", + "cordex": "cmip5|cmip6|cordex_ouranos_attrs.json", + "eccc-canswe": "eccc-canswe_cf_attrs.json", + "eccc-ahccd": "eccc-ahccd_cf_attrs.json", + "eccc-obs": "eccc-obs_cf_attrs.json", + "era5-land": "era5|era5-land_cf_attrs.json", + "era5-land-monthly-means": 
"era5|era5-land_cf_attrs.json", + "era5-pressure-levels": "era5|era5-land_cf_attrs.json", + "era5-pressure-levels-monthly-means": "era5|era5-land_cf_attrs.json", + "era5-pressure-levels-monthly-means-preliminary-back-extension": "era5|era5-land_cf_attrs.json", + "era5-pressure-levels-preliminary-back-extension": "era5|era5-land_cf_attrs.json", + "era5-single-levels": "era5|era5-land_cf_attrs.json", + "era5-single-levels-monthly-means": "era5|era5-land_cf_attrs.json", + "era5-single-levels-monthly-means-preliminary-back-extension": "era5|era5-land_cf_attrs.json", + "era5-single-levels-preliminary-back-extension": "era5|era5-land_cf_attrs.json", + "ets-grnch": "ets-grnch_cf_attrs.json", + "melcc": "melcc_cf_attrs.json", + "rdrs-v21": "eccc-rdrs_cf_attrs.json", + "wfdei-gem-capa": "wfdei-gem-capa_cf_attrs.json", +} +for k, v in CONFIG_FILES.items(): + CONFIG_FILES[k] = CONFIG_FOLDER / v def dataset_corrections(ds: xr.Dataset, project: str) -> xr.Dataset: """Convert variables to CF-compliant format""" - metadata_definition = load_json_data_mappings(project) + metadata_definition = load_json_data_mappings(project, CONFIG_FILES) ds = correct_unit_names(ds, project, metadata_definition) ds = transform_values(ds, project, metadata_definition) diff --git a/miranda/convert/melcc.py b/miranda/convert/melcc.py index dba999dc..5dd034ec 100644 --- a/miranda/convert/melcc.py +++ b/miranda/convert/melcc.py @@ -21,11 +21,9 @@ from xclim.core.units import convert_units_to, pint_multiply, str2pint from miranda import __version__ -from miranda.convert._data_definitions import load_json_data_mappings from miranda.convert.corrections import dataset_corrections from miranda.scripting import LOGGING_CONFIG - -from ._treatments import metadata_conversion +from miranda.treatments import load_json_data_mappings, metadata_conversion logging.config.dictConfig(LOGGING_CONFIG) logger = logging.getLogger(__name__) @@ -562,7 +560,7 @@ def convert_snow_table(file: str | Path, output: str | Path): ) 
ds.attrs.update(frequency="2sem") - meta = load_json_data_mappings("melcc-snow") + meta = load_json_data_mappings("melcc") ds = metadata_conversion(ds, "melcc-snow", meta) date = "-".join(ds.indexes["time"][[0, -1]].strftime("%Y%m")) # Save diff --git a/miranda/preprocess/_data_definitions.py b/miranda/preprocess/_data_definitions.py index f73251e9..e99d5458 100644 --- a/miranda/preprocess/_data_definitions.py +++ b/miranda/preprocess/_data_definitions.py @@ -1,41 +1,14 @@ from __future__ import annotations -import json import warnings from pathlib import Path -from typing import Any -_config_folder = Path(__file__).resolve().parent / "configs" - - -__all__ = ["load_json_data_mappings", "find_project_variable_codes"] +from miranda.treatments import load_json_data_mappings +_config_folder = Path(__file__).resolve().parent / "configs" -def load_json_data_mappings(project: str) -> dict[str, Any]: - """Load JSON mappings for supported dataset conversions. - - Parameters - ---------- - project : str - - Returns - ------- - dict[str, Any] - """ - if project == "eccc-homogenized": - metadata_definition = json.load( - open(_config_folder / "eccc-homogenized_attrs.json") - ) - elif project == "eccc-obs": - metadata_definition = json.load(open(_config_folder / "eccc-obs_attrs.json")) - elif project == "eccc-obs-summary": - metadata_definition = json.load( - open(_config_folder / "eccc-obs-summary_attrs.json") - ) - else: - raise NotImplementedError(f"Project not supported: {project}") - return metadata_definition +__all__ = ["find_project_variable_codes"] def find_project_variable_codes(code: str, table: str) -> str: diff --git a/miranda/preprocess/_eccc_obs.py b/miranda/preprocess/_eccc_obs.py index a3c55dfc..5ca34dc4 100644 --- a/miranda/preprocess/_eccc_obs.py +++ b/miranda/preprocess/_eccc_obs.py @@ -119,7 +119,7 @@ def convert_station( if using_dask_array: pandas_reader = dd - # set the blocksize to 200 MB + # set the block size to 200 MB chunks = dict(blocksize=200 * 
2**20) else: pandas_reader = pd @@ -149,7 +149,7 @@ def convert_station( except UnicodeDecodeError as e: msg = f"File {data.name} was unable to be read. This is probably an issue with the file: {e}" logging.error(msg) - raise UnicodeDecodeError(msg) + raise # Loop through the station codes station_codes = df["code"].unique() diff --git a/miranda/preprocess/configs/eccc-homogenized_attrs.json b/miranda/preprocess/configs/eccc-ahccd_attrs.json similarity index 100% rename from miranda/preprocess/configs/eccc-homogenized_attrs.json rename to miranda/preprocess/configs/eccc-ahccd_attrs.json diff --git a/miranda/preprocess/eccc.py b/miranda/preprocess/eccc.py index 9631090c..8b40b848 100644 --- a/miranda/preprocess/eccc.py +++ b/miranda/preprocess/eccc.py @@ -67,7 +67,7 @@ def _run_func_on_archive_with_optional_dask( size = file_size(data) if size > size_limit or dask_kwargs: if dask_kwargs: - logging.info(f"`dask_kwargs` provided - Using dask.dataframes.") + logging.info("`dask_kwargs` provided - Using dask.dataframes.") elif size > size_limit: logging.info( f"File exceeds {report_file_size(size_limit)} - Using dask.dataframes." 
diff --git a/miranda/treatments/__init__.py b/miranda/treatments/__init__.py new file mode 100644 index 00000000..57319980 --- /dev/null +++ b/miranda/treatments/__init__.py @@ -0,0 +1,5 @@ +"""Treatments module.""" + +from __future__ import annotations + +from miranda.treatments._treatments import * diff --git a/miranda/convert/_treatments.py b/miranda/treatments/_treatments.py similarity index 95% rename from miranda/convert/_treatments.py rename to miranda/treatments/_treatments.py index e53e0051..d7e1cd99 100644 --- a/miranda/convert/_treatments.py +++ b/miranda/treatments/_treatments.py @@ -1,11 +1,14 @@ from __future__ import annotations import datetime +import inspect +import json import logging.config import os import warnings from functools import partial from pathlib import Path +from typing import Any, Dict import numpy as np import xarray as xr @@ -15,12 +18,10 @@ from xclim.core.calendar import parse_offset from miranda import __version__ as __miranda_version__ +from miranda.convert.utils import date_parser from miranda.scripting import LOGGING_CONFIG from miranda.units import get_time_frequency -from ._data_definitions import load_json_data_mappings -from .utils import date_parser - logging.config.dictConfig(LOGGING_CONFIG) VERSION = datetime.datetime.now().strftime("%Y.%m.%d") @@ -33,6 +34,7 @@ "dimensions_compliance", "ensure_correct_time_frequency", "invert_value_sign", + "load_json_data_mappings", "metadata_conversion", "offset_time_dimension", "preprocessing_corrections", @@ -42,6 +44,41 @@ ] +def load_json_data_mappings( + project: str, configurations: dict[str, Path] | None = None +) -> dict[str, Any]: + """Load JSON mappings for supported dataset conversions. 
+ + Parameters + ---------- + project : str + configurations: dict, optional + + Returns + ------- + dict[str, Any] + """ + if configurations is None: + calling_frame = inspect.currentframe().f_back + calling_file_path = calling_frame.f_globals["__file__"] + config_folder = Path(calling_file_path).parent / "configs" + + configurations = {} + for configuration in config_folder.glob("*attrs.json"): + file_project = str(configuration.stem).split("_")[0] + if "|" in file_project: + for p in file_project.split("|"): + configurations[p] = configuration + configurations[file_project] = configuration + + if project in configurations.keys(): + config_file = configurations[project] + metadata_definition = json.load(config_file.open()) + return metadata_definition + else: + raise NotImplementedError(f"Project not supported: {project}") + + def _get_section_entry_key(meta, entry, var, key, project): var_meta = meta[entry].get(var, {}) if key in var_meta: From 318957e47b8aa5f33452f7dc3e209eb7810beea8 Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Mon, 14 Aug 2023 17:38:38 -0400 Subject: [PATCH 17/33] more refactoring --- miranda/convert/__init__.py | 2 - .../convert/configs/eccc-ahccd_cf_attrs.json | 43 +- .../configs/era5|era5-land_cf_attrs.json | 4 +- miranda/convert/corrections.py | 6 +- miranda/convert/melcc.py | 3 +- miranda/gis/__init__.py | 1 + miranda/gis/utils.py | 149 +++ miranda/preprocess/__init__.py | 2 +- miranda/preprocess/_data_definitions.py | 48 - .../{_eccc_homogenized.py => _eccc_ahccd.py} | 29 +- miranda/preprocess/_eccc_obs.py | 6 +- miranda/preprocess/_metadata.py | 37 +- .../preprocess/configs/eccc-ahccd_attrs.json | 19 + miranda/preprocess/eccc.py | 5 +- miranda/treatments/__init__.py | 115 ++- miranda/treatments/_dimensions.py | 243 +++++ miranda/treatments/_preprocessing.py | 111 +++ miranda/treatments/_treatments.py | 854 ------------------ miranda/treatments/_variables.py | 273 ++++++ miranda/treatments/utils.py | 64 ++ 
miranda/vocabularies/__init__.py | 3 + miranda/vocabularies/eccc.py | 14 +- 22 files changed, 1069 insertions(+), 962 deletions(-) create mode 100644 miranda/gis/utils.py delete mode 100644 miranda/preprocess/_data_definitions.py rename miranda/preprocess/{_eccc_homogenized.py => _eccc_ahccd.py} (92%) create mode 100644 miranda/treatments/_dimensions.py create mode 100644 miranda/treatments/_preprocessing.py delete mode 100644 miranda/treatments/_treatments.py create mode 100644 miranda/treatments/_variables.py create mode 100644 miranda/treatments/utils.py diff --git a/miranda/convert/__init__.py b/miranda/convert/__init__.py index 3533bcac..d57a32ec 100644 --- a/miranda/convert/__init__.py +++ b/miranda/convert/__init__.py @@ -4,5 +4,3 @@ from . import deh, hq, melcc, utils from ._aggregation import * from ._data_definitions import * - -# from ._reconstruction import * diff --git a/miranda/convert/configs/eccc-ahccd_cf_attrs.json b/miranda/convert/configs/eccc-ahccd_cf_attrs.json index 5c777230..594de4e2 100644 --- a/miranda/convert/configs/eccc-ahccd_cf_attrs.json +++ b/miranda/convert/configs/eccc-ahccd_cf_attrs.json @@ -1,6 +1,6 @@ { "Header": { - "Conventions": "CF-1.8", + "Conventions": "CF-1.9", "_citation": { "gen2": "Mekis, É and L.A. Vincent, 2011: An overview of the second generation adjusted daily precipitation dataset for trend analysis in Canada. Atmosphere-Ocean 49(2), 163-177 doi:10.1080/07055900.2011.583910", "gen3": "Vincent, L.A., M.M. Hartwell and X.L. Wang, 2020: A Third Generation of Homogenized Temperature for Trend Analysis and Monitoring Changes in Canada’s Climate. Atmosphere-Ocean. 
https://doi.org/10.1080/07055900.2020.1765728" @@ -27,8 +27,30 @@ "table_date": "2023-03-23", "table_id": "ECCC" }, - "variable_entry": { + "dimensions": { + "lat": { + "axis": "Y", + "long_name": "Latitude", + "standard_name": "latitude", + "units": "degrees_north" + }, + "long": { + "_cf_dimension_name": "lon", + "axis": "X", + "long_name": "Longitude", + "standard_name": "longitude", + "units": "degrees_east" + }, + "time": { + "axis": "T", + "calendar": "gregorian", + "long_name": "Time", + "standard_name": "time" + } + }, + "variables": { "dm": { + "_cf_variable_name": "tas", "add_offset": 273.15, "cell_methods": "time: mean", "comments": "Station data converted from Mean Temp (°C)", @@ -36,13 +58,12 @@ "grid_mapping": "regular_lon_lat", "long_name": "Near-Surface Air Temperature", "original_field": "Mean Temp (°C)", - "out_name": "tas", - "scale_factor": 1, "standard_name": "air_temperature", "type": "real", "units": "K" }, "dn": { + "_cf_variable_name": "tasmin", "add_offset": 273.15, "cell_methods": "time: minimum", "comments": "Station data converted from Min Temp (°C)", @@ -50,55 +71,51 @@ "grid_mapping": "regular_lon_lat", "long_name": "Daily Minimum Near-Surface Air Temperature", "original_field": "Min Temp (°C)", - "out_name": "tasmin", - "scale_factor": 1, "standard_name": "air_temperature", "type": "real", "units": "K" }, "dr": { - "add_offset": 0, + "_cf_variable_name": "prlp", "cell_methods": "time: mean", "comments": "Station data converted from Total Rain (mm) using a density of 1000 kg/m³", "frequency": "day", "grid_mapping": "regular_lon_lat", "long_name": "Liquid Precipitation", "original_field": "Total Rain (mm)", - "out_name": "prlp", "scale_factor": 1.1574074074074073e-05, "standard_name": "rainfall_flux", "type": "real", "units": "kg m-2 s-1" }, "ds": { - "add_offset": 0, + "_cf_variable_name": "prsn", "cell_methods": "time: mean", "comments": "station data converted from Total Snow (cm) using a density of 100 kg/m³", "frequency": "day", 
"grid_mapping": "regular_lon_lat", "long_name": "Snowfall Flux", "original_field": "Total Snow (cm)", - "out_name": "prsn", "scale_factor": 1.1574074074074073e-05, "standard_name": "snowfall_flux", "type": "real", "units": "kg m-2 s-1" }, "dt": { - "add_offset": 0, + "_cf_variable_name": "pr", "cell_methods": "time: mean", "comments": "Station data converted from Total Precip (mm) using a density of 1000 kg/m³", "frequency": "day", "grid_mapping": "regular_lon_lat", "long_name": "Precipitation", "original_field": "Total Precip (mm)", - "out_name": "pr", "scale_factor": 1.1574074074074073e-05, "standard_name": "precipitation_flux", "type": "real", "units": "kg m-2 s-1" }, "dx": { + "_cf_variable_name": "tasmax", "add_offset": 273.15, "cell_methods": "time: maximum", "comments": "station data converted from Max Temp (°C)", @@ -106,8 +123,6 @@ "grid_mapping": "regular_lon_lat", "long_name": "Daily Maximum Near-Surface Air Temperature", "original_field": "Max Temp (°C)", - "out_name": "tasmax", - "scale_factor": 1, "standard_name": "air_temperature", "type": "real", "units": "K" diff --git a/miranda/convert/configs/era5|era5-land_cf_attrs.json b/miranda/convert/configs/era5|era5-land_cf_attrs.json index 1a0afa83..1cf1257c 100644 --- a/miranda/convert/configs/era5|era5-land_cf_attrs.json +++ b/miranda/convert/configs/era5|era5-land_cf_attrs.json @@ -45,6 +45,7 @@ "era5-land-monthly-means": 4 }, "axis": "Y", + "long_name": "Latitude", "standard_name": "latitude" }, "longitude": { @@ -54,6 +55,7 @@ "era5-land-monthly-means": 4 }, "axis": "X", + "long_name": "Longitude", "standard_name": "longitude" }, "time": { @@ -71,7 +73,7 @@ }, "_strict_time": false, "axis": "T", - "long_name": "time", + "long_name": "Time", "standard_name": "time" } }, diff --git a/miranda/convert/corrections.py b/miranda/convert/corrections.py index e2157de2..44e1247d 100644 --- a/miranda/convert/corrections.py +++ b/miranda/convert/corrections.py @@ -10,23 +10,21 @@ import xarray as xr from 
miranda.convert.utils import find_version_hash -from miranda.gis import subset_domain +from miranda.gis import conservative_regrid, subset_domain, threshold_mask from miranda.treatments import ( cf_units_conversion, clip_values, - conservative_regrid, correct_unit_names, dimensions_compliance, ensure_correct_time_frequency, invert_value_sign, - load_json_data_mappings, metadata_conversion, offset_time_dimension, preprocessing_corrections, - threshold_mask, transform_values, variable_conversion, ) +from miranda.treatments.utils import load_json_data_mappings CONFIG_FOLDER = Path(__file__).parent / "data" CONFIG_FILES = { diff --git a/miranda/convert/melcc.py b/miranda/convert/melcc.py index 5dd034ec..d5bb084d 100644 --- a/miranda/convert/melcc.py +++ b/miranda/convert/melcc.py @@ -23,7 +23,8 @@ from miranda import __version__ from miranda.convert.corrections import dataset_corrections from miranda.scripting import LOGGING_CONFIG -from miranda.treatments import load_json_data_mappings, metadata_conversion +from miranda.treatments import metadata_conversion +from miranda.treatments.utils import load_json_data_mappings logging.config.dictConfig(LOGGING_CONFIG) logger = logging.getLogger(__name__) diff --git a/miranda/gis/__init__.py b/miranda/gis/__init__.py index fe49f1c9..288cf522 100644 --- a/miranda/gis/__init__.py +++ b/miranda/gis/__init__.py @@ -2,3 +2,4 @@ from __future__ import annotations from ._domains import * +from .utils import * diff --git a/miranda/gis/utils.py b/miranda/gis/utils.py new file mode 100644 index 00000000..1d5dddbd --- /dev/null +++ b/miranda/gis/utils.py @@ -0,0 +1,149 @@ +"""Utility functions for GIS operations.""" +from __future__ import annotations + +import datetime +import logging +import warnings + +import numpy as np +import xarray as xr + +__all__ = [ + "conservative_regrid", + "threshold_mask", +] + + +def _simple_fix_dims(d: xr.Dataset | xr.DataArray) -> xr.Dataset | xr.DataArray: + """Adjust dimensions found in a file so that 
it can be used for regridding purposes.""" + if "lon" not in d.dims or "lat" not in d.dims: + dim_rename = dict() + for dim in d.dims: + if str(dim).lower().startswith("lon"): + dim_rename[str(dim)] = "lon" + if str(dim).lower().startswith("lat"): + dim_rename[str(dim)] = "lat" + d = d.rename(dim_rename) + if np.any(d.lon > 180): + lon_wrapped = d.lon.where(d.lon <= 180.0, d.lon - 360.0) + d["lon"] = lon_wrapped + d = d.sortby(["lon"]) + + if "time" in d.dims: + d = d.isel(time=0, drop=True) + + return d + + +def conservative_regrid( + ds: xr.DataArray | xr.Dataset, ref_grid: xr.DataArray | xr.Dataset +) -> xr.DataArray | xr.Dataset: + """Perform a conservative_normed regridding""" + try: + import xesmf as xe # noqa + except ModuleNotFoundError: + raise ModuleNotFoundError( + "This function requires the `xesmf` library which is not installed. " + "Regridding step will be skipped." + ) + + ref_grid = _simple_fix_dims(ref_grid) + method = "conservative_normed" + + logging.info( + f"Performing regridding and masking with `xesmf` using method: {method}." + ) + + regridder = xe.Regridder(ds, ref_grid, method, periodic=False) + ds = regridder(ds) + + ds.attrs["history"] = ( + f"{datetime.datetime.now()}:" + f"Regridded dataset using xesmf with method: {method}. " + f"{ds.attrs.get('history')}".strip() + ) + return ds + + +def threshold_mask( + ds: xr.Dataset | xr.DataArray, + *, + mask: xr.Dataset | xr.DataArray, + mask_cutoff: float | bool = False, +) -> xr.Dataset | xr.DataArray: + """Land-Sea mask operations. + + Parameters + ---------- + ds : xr.Dataset or str or os.PathLike + mask : xr.Dataset or xr.DataArray + mask_cutoff : float or bool + + Returns + ------- + xr.Dataset or xr.DataArray + """ + mask = _simple_fix_dims(mask) + + if isinstance(mask, xr.Dataset): + if len(mask.data_vars) == 1: + mask_variable = list(mask.data_vars)[0] + mask = mask[mask_variable] + else: + raise ValueError( + "More than one data variable found in land-sea mask. 
Supply a DataArray instead." + ) + else: + mask_variable = mask.name + + try: + from clisops.core import subset_bbox # noqa + + log_msg = f"Masking dataset with {mask_variable}." + if mask_cutoff: + log_msg = f"{log_msg.strip('.')} at `{mask_cutoff}` cutoff value." + logging.info(log_msg) + + lon_bounds = np.array([ds.lon.min(), ds.lon.max()]) + lat_bounds = np.array([ds.lat.min(), ds.lat.max()]) + + mask_subset = subset_bbox( + mask, + lon_bnds=lon_bounds, + lat_bnds=lat_bounds, + ).load() + except ModuleNotFoundError: + log_msg = ( + "This function requires the `clisops` library which is not installed. " + "subsetting step will be skipped." + ) + warnings.warn(log_msg) + mask_subset = mask.load() + + if mask_subset.dtype == bool: + if mask_cutoff: + logging.warning("Mask value cutoff set for boolean mask. Ignoring.") + mask_subset = mask_subset.where(mask) + else: + mask_subset = mask_subset.where(mask >= mask_cutoff) + ds = ds.where(mask_subset.notnull()) + + if mask_subset.min() >= 0: + if mask_subset.max() <= 1.00000001: + cutoff_info = f"{mask_cutoff * 100} %" + elif mask_subset.max() <= 100.00000001: + cutoff_info = f"{mask_cutoff} %" + else: + cutoff_info = f"{mask_cutoff}" + else: + cutoff_info = f"{mask_cutoff}" + ds.attrs["mask_cutoff"] = cutoff_info + + prev_history = ds.attrs.get("history", "") + history_msg = f"Mask calculated using `{mask_variable}`." + if mask_cutoff: + history_msg = f"{history_msg.strip('.')} with cutoff value `{cutoff_info}`." 
+ history = f"{history_msg} {prev_history}".strip() + ds.attrs.update(dict(history=history)) + + return ds diff --git a/miranda/preprocess/__init__.py b/miranda/preprocess/__init__.py index 4601a7cc..03673b8d 100644 --- a/miranda/preprocess/__init__.py +++ b/miranda/preprocess/__init__.py @@ -1,6 +1,6 @@ """Preprocessing tools for Miranda.""" from __future__ import annotations -from ._eccc_homogenized import * +from ._eccc_ahccd import * from ._eccc_obs import * from ._eccc_summaries import * diff --git a/miranda/preprocess/_data_definitions.py b/miranda/preprocess/_data_definitions.py deleted file mode 100644 index e99d5458..00000000 --- a/miranda/preprocess/_data_definitions.py +++ /dev/null @@ -1,48 +0,0 @@ -from __future__ import annotations - -import warnings -from pathlib import Path - -from miranda.treatments import load_json_data_mappings - -_config_folder = Path(__file__).resolve().parent / "configs" - - -__all__ = ["find_project_variable_codes"] - - -def find_project_variable_codes(code: str, table: str) -> str: - """Find the variable code for a given variable name and project. - - Parameters - ---------- - code : str - Variable name. - table : str - Project name. - - Returns - ------- - str - """ - config = load_json_data_mappings(table) - variable_codes = {} - for variable_code in config["variables"]: - variable_name = config["variables"][variable_code].get("_variable_name") - if variable_name: - variable_codes[variable_name] = variable_code - else: - warnings.warn( - f"Variable `{variable_code}` does not have accompanying `variable_name`. " - f"Verify JSON. Continuing with `{variable_code}` as `variable_name`." 
- ) - variable_codes[variable_code] = variable_code - - if code in variable_codes.values(): - variable = code - else: - variable = variable_codes.get(code) - if not variable: - raise NotImplementedError(f"Variable `{code}` not supported.") - - return variable diff --git a/miranda/preprocess/_eccc_homogenized.py b/miranda/preprocess/_eccc_ahccd.py similarity index 92% rename from miranda/preprocess/_eccc_homogenized.py rename to miranda/preprocess/_eccc_ahccd.py index a16b3e14..8d76b05c 100644 --- a/miranda/preprocess/_eccc_homogenized.py +++ b/miranda/preprocess/_eccc_ahccd.py @@ -11,12 +11,12 @@ from miranda.io import write_dataset from miranda.io.utils import name_output_file -from miranda.preprocess._data_definitions import find_project_variable_codes from miranda.preprocess._metadata import ( eccc_variable_metadata, homogenized_column_definitions, ) from miranda.scripting import LOGGING_CONFIG +from miranda.treatments import find_project_variable_codes, load_json_data_mappings logging.config.dictConfig(LOGGING_CONFIG) logger = logging.Logger("miranda") @@ -44,16 +44,19 @@ def convert_ahccd_fwf_file( ------- xarray.Dataset """ - code = find_project_variable_codes(variable, "eccc-homogenized") + configuration = load_json_data_mappings("eccc-ahccd") + code = find_project_variable_codes(variable, configuration) variable_meta, global_attrs = eccc_variable_metadata( - code, "eccc-homogenized", generation + code, "eccc-ahccd", generation, configuration ) column_names, column_spaces, column_dtypes, header = homogenized_column_definitions( code ) df = pd.read_fwf(ff, header=header, colspecs=column_spaces, dtype=column_dtypes) + + # Handle different variable types if "pr" in variable: cols = list(df.columns[0:3]) cols = cols[0::2] @@ -67,6 +70,7 @@ def convert_ahccd_fwf_file( else: raise NotImplementedError(f"Variable `{variable}` not supported.") + # Extract relevant columns df = df[cols] df.replace(variable_meta[variable]["NaN_value"], np.NaN, inplace=True) @@ -133,6 
+137,7 @@ def convert_ahccd_fwf_file( metadata = metadata.drop_vars(["stnid", "station_name"]) ds_out[f"{variable}_flag"].attrs["long_name"] = variable_meta[variable]["long_name"] + ds_out["lon"] = metadata["long"] ds_out.lon.attrs["units"] = "degrees_east" ds_out.lon.attrs["axis"] = "X" @@ -140,9 +145,9 @@ def convert_ahccd_fwf_file( ds_out.lat.attrs["units"] = "degrees_north" ds_out.lat.attrs["axis"] = "Y" ds_out["elev"] = metadata["elev"] - ds_out.elev.attrs["units"] = "m" + ds_out.elev.attrs["units"] = "meters" + ds_out.elev.attrs["positive"] = "up" ds_out.elev.attrs["axis"] = "Z" - metadata = metadata.drop_vars(["long", "lat", "elev"]) for vv in metadata.data_vars: if metadata[vv].dtype == "O" and (variable not in vv): @@ -176,12 +181,14 @@ def convert_ahccd( ------- None """ + configuration = load_json_data_mappings("eccc-ahccd") + output_dir = Path(output_dir).resolve().joinpath(variable) output_dir.mkdir(parents=True, exist_ok=True) - code = find_project_variable_codes(variable, "eccc-homogenized") - var_meta, global_attrs = eccc_variable_metadata( - code, "eccc-homogenized", generation + code = find_project_variable_codes(variable, configuration) + variable_meta, global_attrs = eccc_variable_metadata( + code, "eccc-ahccd", generation, configuration ) ( column_names, @@ -257,8 +264,10 @@ def merge_ahccd( overwrite: bool = False, ) -> None: """Merge Adjusted and Homogenized Canadian Climate Dataset files.""" + configuration = load_json_data_mappings("eccc-ahccd") + if variable: - code = find_project_variable_codes(variable, "eccc-homogenized") + code = find_project_variable_codes(variable, configuration) glob_pattern = f"{code}*.nc" output_dir = Path(output_dir).resolve().joinpath(variable) else: @@ -284,7 +293,7 @@ def merge_ahccd( if ds_ahccd[v].dtype == "O" and "flag" not in v: ds_ahccd[v] = ds_ahccd[v].astype(str) try: - variables_found.add(find_project_variable_codes(str(v), "eccc-homogenized")) + variables_found.add(find_project_variable_codes(str(v), 
configuration)) except NotImplementedError: pass diff --git a/miranda/preprocess/_eccc_obs.py b/miranda/preprocess/_eccc_obs.py index 5ca34dc4..e48cc756 100644 --- a/miranda/preprocess/_eccc_obs.py +++ b/miranda/preprocess/_eccc_obs.py @@ -22,12 +22,9 @@ from xclim.core.units import convert_units_to from miranda.archive import group_by_length -from miranda.preprocess._data_definitions import ( - find_project_variable_codes, - load_json_data_mappings, -) from miranda.preprocess._metadata import eccc_variable_metadata, obs_column_definitions from miranda.scripting import LOGGING_CONFIG +from miranda.treatments import find_project_variable_codes, load_json_data_mappings from miranda.vocabularies.eccc import obs_vocabularies config.dictConfig(LOGGING_CONFIG) @@ -59,7 +56,6 @@ def convert_observation( overwrite: bool = False, ): """Convert a single station's data from the fixed-width format to a netCDF file.""" - output_dir = Path(output_dir).resolve().joinpath(variable) output_dir.mkdir(parents=True, exist_ok=True) diff --git a/miranda/preprocess/_metadata.py b/miranda/preprocess/_metadata.py index a663c25a..9fb53af3 100644 --- a/miranda/preprocess/_metadata.py +++ b/miranda/preprocess/_metadata.py @@ -4,10 +4,8 @@ from typing import Any from miranda import __version__ as __miranda_version__ -from miranda.preprocess._data_definitions import ( - find_project_variable_codes, - load_json_data_mappings, -) +from miranda.treatments import find_project_variable_codes +from miranda.treatments.utils import load_json_data_mappings __all__ = [ "eccc_variable_metadata", @@ -17,29 +15,34 @@ def eccc_variable_metadata( - variable_code: str, project: str, gen: int | None = None + variable_code: str, + project: str, + generation: int | None = None, + metadata: dict | None = None, ) -> (dict[str, int | float | str], dict, list[tuple[int, int]], int): """ Parameters ---------- variable_code: str - project: {"eccc-homogenized", "eccc-obs", "eccc-obs-summary"} - gen: {1, 2, 3}, optional 
+ project: {"eccc-ahccd", "eccc-obs", "eccc-obs-summary"} + generation: {1, 2, 3}, optional + metadata: dict, optional Returns ------- dict[str, int or str or float], dict, list[tuple[int, int]], int """ - if project == "eccc-homogenized": - generation = {1: "First", 2: "Second", 3: "Third"}.get(gen) + if project == "eccc-ahccd": + generation = {1: "First", 2: "Second", 3: "Third"}.get(generation) if not generation: - raise NotImplementedError(f"Generation '{gen}' not supported") + raise NotImplementedError(f"Generation '{generation}' not supported") else: generation = None - metadata = load_json_data_mappings(project) - code = find_project_variable_codes(variable_code, project) + if not metadata: + metadata = load_json_data_mappings(project) + code = find_project_variable_codes(variable_code, metadata) # Variable metadata variable_meta = metadata["variables"].get(code) @@ -95,6 +98,16 @@ def eccc_variable_metadata( def homogenized_column_definitions( variable_code: str, ) -> tuple[dict, list[tuple[int, int]], dict[str, type[str | int | float] | Any], int]: + """Return the column names, widths, and data types for the AHCCD fixed-width format data. 
+ + Parameters + ---------- + variable_code : str + + Returns + ------- + tuple[dict, list[tuple[int, int]], dict[str, type[str | int | float] | Any], int] + """ metadata = load_json_data_mappings("eccc-homogenized") variable = metadata["variables"][variable_code]["_variable_name"] diff --git a/miranda/preprocess/configs/eccc-ahccd_attrs.json b/miranda/preprocess/configs/eccc-ahccd_attrs.json index a56c5b51..3de37b07 100644 --- a/miranda/preprocess/configs/eccc-ahccd_attrs.json +++ b/miranda/preprocess/configs/eccc-ahccd_attrs.json @@ -34,6 +34,25 @@ "table_id": "ECCC", "type": "station-obs" }, + "dimensions:": { + "lat": { + "axis": "Y", + "long_name": "Latitude", + "standard_name": "latitude", + "units": "degrees_north" + }, + "long": { + "axis": "X", + "long_name": "Longitude", + "standard_name": "longitude", + "units": "degrees_east" + }, + "time": { + "axis": "T", + "long_name": "Time", + "standard_name": "time" + } + }, "variables": { "dm": { "NaN_value": -9999.9, diff --git a/miranda/preprocess/eccc.py b/miranda/preprocess/eccc.py index 8b40b848..30648057 100644 --- a/miranda/preprocess/eccc.py +++ b/miranda/preprocess/eccc.py @@ -26,7 +26,7 @@ def _run_func_on_archive_with_optional_dask( function: Callable, errored_files: list[Path], **dask_kwargs, -): +) -> None: r"""Run a function on a file archive, extracting it if necessary. 
Notes @@ -50,9 +50,8 @@ def _run_func_on_archive_with_optional_dask( Returns ------- - + None """ - with tempfile.TemporaryDirectory() as temp_folder: if file.suffix in [".gz", ".tar", ".zip", ".7z"]: data_files = generic_extract_archive(file, output_dir=temp_folder) diff --git a/miranda/treatments/__init__.py b/miranda/treatments/__init__.py index 57319980..11e62fd2 100644 --- a/miranda/treatments/__init__.py +++ b/miranda/treatments/__init__.py @@ -2,4 +2,117 @@ from __future__ import annotations -from miranda.treatments._treatments import * +import datetime +import logging.config + +import xarray as xr + +from miranda import __version__ as __miranda_version__ +from miranda.scripting import LOGGING_CONFIG +from miranda.treatments._dimensions import * +from miranda.treatments._preprocessing import * +from miranda.treatments._variables import * +from miranda.treatments.utils import * +from miranda.units import get_time_frequency + +logging.config.dictConfig(LOGGING_CONFIG) +VERSION = datetime.datetime.now().strftime("%Y.%m.%d") + + +def metadata_conversion(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: + """Update xarray dataset and data_vars with project-specific metadata fields. + + Parameters + ---------- + d : xarray.Dataset + Dataset with metadata to be updated. + p : str + Dataset project name. + m : dict + Metadata definition dictionary for project and variable(s). + + Returns + ------- + xarray.Dataset + """ + logging.info("Converting metadata to CF-like conventions.") + + header = m["Header"] + + # Static handling of version global attributes + miranda_version = header.get("_miranda_version") + if miranda_version: + if isinstance(miranda_version, bool): + header["miranda_version"] = __miranda_version__ + elif isinstance(miranda_version, dict): + if p in miranda_version.keys(): + header["miranda_version"] = __miranda_version__ + else: + logging.warning( + f"`_miranda_version` not set for project `{p}`. Not appending." 
+ ) + if "_miranda_version" in header: + del header["_miranda_version"] + + frequency = m["Header"].get("_frequency") + if frequency: + if isinstance(frequency, bool): + _, m["Header"]["frequency"] = get_time_frequency(d) + elif isinstance(frequency, dict): + if p in frequency.keys(): + m["Header"]["frequency"] = get_time_frequency(d) + else: + logging.warning("`frequency` not set for project. Not appending.") + if "_frequency" in m["Header"]: + del m["Header"]["_frequency"] + + # Conditional handling of global attributes based on project name + for field in [f for f in header if f.startswith("_")]: + if isinstance(header[field], list): + if p in header[field]: + attr_treatment = header[field][p] + else: + logging.warning( + f"Attribute handling (`{field}`) not set for project `{p}`. Continuing..." + ) + continue + elif isinstance(header[field], dict): + attr_treatment = header[field] + else: + raise AttributeError( + f"Attribute treatment configuration for field `{field}` is not properly configured. Verify JSON." + ) + + if field == "_map_attrs": + for attribute, mapping in attr_treatment.items(): + header[mapping] = d.attrs[attribute] + del d.attrs[attribute] + elif field == "_remove_attrs": + for ff in attr_treatment: + del d.attrs[ff] + else: + if field[1:] in d.attrs: + logging.warning( + f"Overwriting `{field[1:]}` based on JSON configuration." 
+ ) + header[field[1:]] = attr_treatment + + del header[field] + + # Add global attributes + d.attrs.update(header) + d.attrs.update(dict(project=p)) + + # Date-based versioning + if not d.attrs.get("version"): + d.attrs.update(dict(version=f"v{VERSION}")) + + prev_history = d.attrs.get("history", "") + history = ( + f"[{datetime.datetime.now()}] " + "Converted variables and modified metadata for CF-like compliance: " + f"{prev_history}".strip() + ) + d.attrs.update(dict(history=history)) + + return d diff --git a/miranda/treatments/_dimensions.py b/miranda/treatments/_dimensions.py new file mode 100644 index 00000000..cd56e243 --- /dev/null +++ b/miranda/treatments/_dimensions.py @@ -0,0 +1,243 @@ +from __future__ import annotations + +import logging +import warnings +from typing import Any + +import numpy as np +import xarray as xr +from xclim.core.calendar import parse_offset + +from miranda.treatments.utils import _get_section_entry_key, _iter_entry_key # noqa +from miranda.units import get_time_frequency + + +def find_project_variable_codes(code: str, configuration: dict[str, Any]) -> str: + """Find the variable code for a given variable name and project. + + Parameters + ---------- + code : str + Variable name. + configuration : dict + Configuration dictionary. + + Returns + ------- + str + """ + variable_codes = {} + for variable_code in configuration["variables"]: + variable_name = configuration["variables"][variable_code].get("_variable_name") + if variable_name: + variable_codes[variable_name] = variable_code + else: + warnings.warn( + f"Variable `{variable_code}` does not have accompanying `variable_name`. " + f"Verify JSON. Continuing with `{variable_code}` as `variable_name`." 
+ ) + variable_codes[variable_code] = variable_code + + if code in variable_codes.values(): + variable = code + else: + variable = variable_codes.get(code) + if not variable: + raise NotImplementedError(f"Variable `{code}` not supported.") + + return variable + + +def dimensions_compliance(ds: xr.Dataset, project: str, metadata: dict) -> xr.Dataset: + """Rename dimensions to CF to their equivalents and reorder them if needed. + + Parameters + ---------- + ds : xarray.Dataset + Dataset with dimensions to be updated. + project : str + Dataset project name. + metadata : dict + Metadata definition dictionary for project and variable(s). + + Returns + ------- + xarray.Dataset + """ + rename_dims = dict() + for dim in ds.dims: + if dim in metadata["dimensions"].keys(): + cf_name = _get_section_entry_key( + metadata, "dimensions", dim, "_cf_dimension_name", project + ) + if cf_name: + rename_dims[dim] = cf_name + + # Rename dimensions + logging.info(f"Renaming dimensions: {', '.join(rename_dims.keys())}.") + ds = ds.rename(rename_dims) + for new in ["lon", "lat"]: + if new == "lon" and "lon" in ds.coords: + if np.any(ds.lon > 180): + lon1 = ds.lon.where(ds.lon <= 180.0, ds.lon - 360.0) + ds[new] = lon1 + + coord_precision = _get_section_entry_key( + metadata, "dimensions", new, "_precision", project + ) + if coord_precision is not None: + ds[new] = ds[new].round(coord_precision) + + # Ensure that lon and lat are written in proper order for plotting purposes + logging.info("Reordering dimensions.") + transpose_order = [] + if "lat" in ds.dims and "lon" in ds.dims: + transpose_order = ["lat", "lon"] + elif "rlat" in ds.dims and "rlon" in ds.dims: + transpose_order = ["rlat", "rlon"] + if "time" in ds.dims and transpose_order: + transpose_order.insert(0, "time") + transpose_order.extend(list(set(ds.dims) - set(transpose_order))) + ds = ds.transpose(*transpose_order) + ds = ds.sortby(transpose_order) + + # Add dimension original name and update attrs + logging.info("Updating 
dimension attributes.") + dim_descriptions = metadata["dimensions"] + for dim in metadata["dimensions"].keys(): + cf_name = dim_descriptions[dim].get("_cf_dimension_name") + if cf_name is not None and cf_name in ds.dims: + ds[cf_name].attrs.update(dict(original_variable=dim)) + else: + # variable name already follows CF standards + cf_name = dim + for field in dim_descriptions[dim].keys(): + if not field.startswith("_"): + ds[cf_name].attrs.update({field: dim_descriptions[dim][field]}) + + prev_history = ds.attrs.get("history", "") + history = f"Transposed and renamed dimensions. {prev_history}" + ds.attrs.update(dict(history=history)) + + return ds + + +def ensure_correct_time_frequency(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: + """Ensure that time frequency is consistent with expected frequency for project.""" + key = "_ensure_correct_time" + strict_time = "_strict_time" + + if "time" not in m["dimensions"].keys(): + warnings.warn(f"No time corrections listed for project `{p}`. Continuing...") + return d + + if "time" not in list(d.variables.keys()): + logging.info( + "No time dimension among data variables: " + f"{' ,'.join([str(v) for v in d.variables.keys()])}. " + "Continuing..." + ) + return d + + if key in m["dimensions"]["time"].keys(): + freq_found = xr.infer_freq(d.time) + if strict_time in m["dimensions"]["time"].keys(): + if not freq_found: + msg = ( + "Time frequency could not be found. There may be missing timesteps." 
+ ) + if m["dimensions"]["time"].get(strict_time): + raise ValueError(msg) + else: + warnings.warn(f"{msg} Continuing...") + return d + + correct_time_entry = m["dimensions"]["time"][key] + if isinstance(correct_time_entry, str): + correct_times = [parse_offset(correct_time_entry)[1]] + elif isinstance(correct_time_entry, dict): + correct_times = correct_time_entry.get(p) + if isinstance(correct_times, list): + correct_times = [parse_offset(t)[1] for t in correct_times] + if correct_times is None: + warnings.warn(f"No expected times set for specified project `{p}`.") + elif isinstance(correct_time_entry, list): + correct_times = correct_time_entry + else: + warnings.warn("No expected times set for family of projects.") + return d + + if freq_found not in correct_times: + error_msg = ( + f"Time frequency {freq_found} not among allowed frequencies: " + f"{', '.join(correct_times) if isinstance(correct_times, list) else correct_times}" + ) + if isinstance(correct_time_entry, dict): + error_msg = f"{error_msg} for project `{p}`." + else: + error_msg = f"{error_msg}." + raise ValueError(error_msg) + + logging.info(f"Resampling dataset with time frequency: {freq_found}.") + with xr.set_options(keep_attrs=True): + d_out = d.assign_coords( + time=d.time.resample(time=freq_found).mean(dim="time").time + ) + d_out.time.attrs.update(d.time.attrs) + + prev_history = d.attrs.get("history", "") + history = f"Resampled time with `freq={freq_found}`. 
{prev_history}" + d_out.attrs.update(dict(history=history)) + return d_out + + return d + + +def offset_time_dimension(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: + """Offset time dimension using listed frequency.""" + key = "_offset_time" + d_out = xr.Dataset(coords=d.coords, attrs=d.attrs) + converted = [] + offset, offset_meaning = None, None + + time_freq = dict() + expected_period = _get_section_entry_key( + m, "dimensions", "time", "_ensure_correct_time", p + ) + if isinstance(expected_period, str): + time_freq["expected_period"] = expected_period + + for vv, offs in _iter_entry_key(d, m, "dimensions", key, p): + if offs: + # Offset time by value of one time-step + if offset is None and offset_meaning is None: + try: + offset, offset_meaning = get_time_frequency(d, **time_freq) + except TypeError: + logging.error( + "Unable to parse the time frequency. Verify data integrity before retrying." + ) + raise + + logging.info( + f"Offsetting data for `{vv}` by `{offset[0]} {offset_meaning}(s)`." + ) + with xr.set_options(keep_attrs=True): + out = d[vv] + out["time"] = out.time - np.timedelta64(offset[0], offset[1]) + d_out[vv] = out + converted.append(vv) + elif offs is False: + logging.info( + f"No time offsetting needed for `{vv}` in `{p}` (Explicitly set to False)." + ) + continue + prev_history = d.attrs.get("history", "") + history = f"Offset variable `{vv}` values by `{offset[0]} {offset_meaning}(s). 
{prev_history}" + d_out.attrs.update(dict(history=history)) + + # Copy unconverted variables + for vv in d.data_vars: + if vv not in converted: + d_out[vv] = d[vv] + return d_out diff --git a/miranda/treatments/_preprocessing.py b/miranda/treatments/_preprocessing.py new file mode 100644 index 00000000..7d411d6b --- /dev/null +++ b/miranda/treatments/_preprocessing.py @@ -0,0 +1,111 @@ +from __future__ import annotations + +from functools import partial +from pathlib import Path +from typing import Any, Dict + +import numpy as np +import xarray as xr + +from miranda.convert.utils import date_parser + + +def correct_time_entries( + ds: xr.Dataset, + split: str = "_", + location: int = -1, + field: str = "time", +) -> xr.Dataset: + """Correct time entries in dataset. + + Parameters + ---------- + ds : xarray.Dataset + split : str + location : int + field : str + + Returns + ------- + xarray.Dataset + """ + filename = ds.encoding["source"] + date = date_parser(Path(filename).stem.split(split)[location]) + vals = np.arange(len(ds[field])) + days_since = f"days since {date}" + time = xr.coding.times.decode_cf_datetime( + vals, units=days_since, calendar="standard" + ) + ds = ds.assign_coords({field: time}) + + prev_history = ds.attrs.get("history", "") + history = ( + f"Time index recalculated in preprocessing step ({days_since}). {prev_history}" + ) + ds.attrs.update(dict(history=history)) + + return ds + + +def correct_var_names( + ds: xr.Dataset, split: str = "_", location: int = 0 +) -> xr.Dataset: + """Correct variable names in dataset. + + Parameters + ---------- + ds : xarray.Dataset + split : str + location : int + + Returns + ------- + xarray.Dataset + """ + filename = ds.encoding["source"] + new_name = Path(filename).stem.split(split)[location] + old_name = list(ds.data_vars.keys())[0] + + prev_history = ds.attrs.get("history", "") + history = f"Variable renamed in preprocessing step ({old_name}: {new_name}). 
{prev_history}" + ds.attrs.update(dict(history=history)) + + return ds.rename({old_name: new_name}) + + +def preprocessing_corrections( + ds: xr.Dataset, configuration: dict[str, Any] +) -> xr.Dataset: + """Corrections function dispatcher to ensure minimal dataset validity on open. + + Parameters + ---------- + ds : xarray.Dataset + configuration : dict + + Returns + ------- + xarray.Dataset + """ + + def _preprocess_correct(d: xr.Dataset, *, ops: list[partial]) -> xr.Dataset: + for correction in ops: + d = correction(d) + return d + + correction_fields = configuration.get("_preprocess") + if correction_fields: + preprocess_ops = [] + for field in correction_fields: + if field == "_variable_name": + preprocess_ops.append( + partial(correct_var_names, **correction_fields[field]) + ) + if field == "_time": + preprocess_ops.append( + partial(correct_time_entries, **correction_fields[field]) + ) + if preprocess_ops: + corrector = partial(_preprocess_correct, ops=preprocess_ops) + return corrector(ds) + return ds diff --git a/miranda/treatments/_treatments.py b/miranda/treatments/_treatments.py deleted file mode 100644 index d7e1cd99..00000000 --- a/miranda/treatments/_treatments.py +++ /dev/null @@ -1,854 +0,0 @@ -from __future__ import annotations - -import datetime -import inspect -import json -import logging.config -import os -import warnings -from functools import partial -from pathlib import Path -from typing import Any, Dict - -import numpy as np -import xarray as xr -import xclim.core.units -from xarray.coding import times -from xclim.core import units -from xclim.core.calendar import parse_offset - -from miranda import __version__ as __miranda_version__ -from miranda.convert.utils import date_parser -from miranda.scripting import LOGGING_CONFIG -from miranda.units import get_time_frequency - -logging.config.dictConfig(LOGGING_CONFIG) - -VERSION = datetime.datetime.now().strftime("%Y.%m.%d") - -__all__ = [ - "cf_units_conversion", - "clip_values", - 
"conservative_regrid", - "correct_unit_names", - "dimensions_compliance", - "ensure_correct_time_frequency", - "invert_value_sign", - "load_json_data_mappings", - "metadata_conversion", - "offset_time_dimension", - "preprocessing_corrections", - "threshold_mask", - "transform_values", - "variable_conversion", -] - - -def load_json_data_mappings( - project: str, configurations: dict[str, Path] | None = None -) -> dict[str, Any]: - """Load JSON mappings for supported dataset conversions. - - Parameters - ---------- - project : str - configurations: dict, optional - - Returns - ------- - dict[str, Any] - """ - if configurations is None: - calling_frame = inspect.currentframe().f_back - calling_file_path = calling_frame.f_globals["__file__"] - config_folder = Path(calling_file_path).parent / "configs" - - configurations = {} - for configuration in config_folder.glob("*attrs.json"): - project = str(configuration.stem).split("_")[0] - if "|" in project: - for p in project.split("|"): - configurations[p] = configuration - configurations[project] = configuration - - if project in configurations.keys(): - config_file = configurations[project] - metadata_definition = json.load(config_file.open()) - return metadata_definition - else: - raise NotImplementedError(f"Project not supported: {project}") - - -def _get_section_entry_key(meta, entry, var, key, project): - var_meta = meta[entry].get(var, {}) - if key in var_meta: - if isinstance(var_meta[key], dict): - config = var_meta[key].get(project) - if config is None and "all" in var_meta[key].keys(): - config = var_meta[key].get("all") - return config - return var_meta[key] - return None - - -def _iter_entry_key(ds, meta, entry, key, project): - for vv in set(ds.data_vars).intersection(meta[entry]): - val = _get_section_entry_key(meta, entry, vv, key, project) - yield vv, val - - -def _simple_fix_dims(d: xr.Dataset | xr.DataArray) -> xr.Dataset | xr.DataArray: - """Adjust dimensions found in a file so that it can be used for 
regridding purposes.""" - if "lon" not in d.dims or "lat" not in d.dims: - dim_rename = dict() - for dim in d.dims: - if str(dim).lower().startswith("lon"): - dim_rename[str(dim)] = "lon" - if str(dim).lower().startswith("lat"): - dim_rename[str(dim)] = "lat" - d = d.rename(dim_rename) - if np.any(d.lon > 180): - lon_wrapped = d.lon.where(d.lon <= 180.0, d.lon - 360.0) - d["lon"] = lon_wrapped - d = d.sortby(["lon"]) - - if "time" in d.dims: - d = d.isel(time=0, drop=True) - - return d - - -def conservative_regrid( - ds: xr.DataArray | xr.Dataset, ref_grid: xr.DataArray | xr.Dataset -) -> xr.DataArray | xr.Dataset: - """Perform a conservative_normed regridding""" - try: - import xesmf as xe # noqa - except ModuleNotFoundError: - raise ModuleNotFoundError( - "This function requires the `xesmf` library which is not installed. " - "Regridding step will be skipped." - ) - - ref_grid = _simple_fix_dims(ref_grid) - method = "conservative_normed" - - logging.info( - f"Performing regridding and masking with `xesmf` using method: {method}." - ) - - regridder = xe.Regridder(ds, ref_grid, method, periodic=False) - ds = regridder(ds) - - ds.attrs["history"] = ( - f"{datetime.datetime.now()}:" - f"Regridded dataset using xesmf with method: {method}. " - f"{ds.attrs.get('history')}".strip() - ) - return ds - - -def threshold_mask( - ds: xr.Dataset | xr.DataArray, - *, - mask: xr.Dataset | xr.DataArray, - mask_cutoff: float | bool = False, -) -> xr.Dataset | xr.DataArray: - """Land-Sea mask operations. - - Parameters - ---------- - ds : xr.Dataset or str or os.PathLike - mask : xr.Dataset or xr.DataArray - mask_cutoff : float or bool - - Returns - ------- - xr.Dataset or xr.DataArray - """ - mask = _simple_fix_dims(mask) - - if isinstance(mask, xr.Dataset): - if len(mask.data_vars) == 1: - mask_variable = list(mask.data_vars)[0] - mask = mask[mask_variable] - else: - raise ValueError( - "More than one data variable found in land-sea mask. Supply a DataArray instead." 
- ) - else: - mask_variable = mask.name - - try: - from clisops.core import subset_bbox # noqa - - log_msg = f"Masking dataset with {mask_variable}." - if mask_cutoff: - log_msg = f"{log_msg.strip('.')} at `{mask_cutoff}` cutoff value." - logging.info(log_msg) - - lon_bounds = np.array([ds.lon.min(), ds.lon.max()]) - lat_bounds = np.array([ds.lat.min(), ds.lat.max()]) - - mask_subset = subset_bbox( - mask, - lon_bnds=lon_bounds, - lat_bnds=lat_bounds, - ).load() - except ModuleNotFoundError: - log_msg = ( - "This function requires the `clisops` library which is not installed. " - "subsetting step will be skipped." - ) - warnings.warn(log_msg) - mask_subset = mask.load() - - if mask_subset.dtype == bool: - if mask_cutoff: - logging.warning("Mask value cutoff set for boolean mask. Ignoring.") - mask_subset = mask_subset.where(mask) - else: - mask_subset = mask_subset.where(mask >= mask_cutoff) - ds = ds.where(mask_subset.notnull()) - - if mask_subset.min() >= 0: - if mask_subset.max() <= 1.00000001: - cutoff_info = f"{mask_cutoff * 100} %" - elif mask_subset.max() <= 100.00000001: - cutoff_info = f"{mask_cutoff} %" - else: - cutoff_info = f"{mask_cutoff}" - else: - cutoff_info = f"{mask_cutoff}" - ds.attrs["mask_cutoff"] = cutoff_info - - prev_history = ds.attrs.get("history", "") - history_msg = f"Mask calculated using `{mask_variable}`." - if mask_cutoff: - history_msg = f"{history_msg.strip('.')} with cutoff value `{cutoff_info}`." 
- history = f"{history_msg} {prev_history}".strip() - ds.attrs.update(dict(history=history)) - - return ds - - -def correct_time_entries( - d: xr.Dataset, - split: str = "_", - location: int = -1, - field: str = "time", -) -> xr.Dataset: - filename = d.encoding["source"] - date = date_parser(Path(filename).stem.split(split)[location]) - vals = np.arange(len(d[field])) - days_since = f"days since {date}" - time = xr.coding.times.decode_cf_datetime( - vals, units=days_since, calendar="standard" - ) - d = d.assign_coords({field: time}) - - prev_history = d.attrs.get("history", "") - history = ( - f"Time index recalculated in preprocessing step ({days_since}). {prev_history}" - ) - d.attrs.update(dict(history=history)) - - return d - - -def correct_var_names(d: xr.Dataset, split: str = "_", location: int = 0) -> xr.Dataset: - """ - - Parameters - ---------- - d : xarray.Dataset - split : str - location : int - - Returns - ------- - xarray.Dataset - """ - filename = d.encoding["source"] - new_name = Path(filename).stem.split(split)[location] - old_name = list(d.data_vars.keys())[0] - - prev_history = d.attrs.get("history", "") - history = f"Variable renamed in preprocessing step ({old_name}: {new_name}). {prev_history}" - d.attrs.update(dict(history=history)) - - return d.rename({old_name: new_name}) - - -def preprocessing_corrections(ds: xr.Dataset, project: str) -> xr.Dataset: - """Corrections function dispatcher to ensure minimal dataset validity on open. 
- - Parameters - ---------- - ds : xarray.Dataset - project : str - - Returns - ------- - xarray.Dataset - """ - - def _preprocess_correct(d: xr.Dataset, *, ops: list[partial]) -> xr.Dataset: - for correction in ops: - d = correction(d) - return d - - correction_fields = load_json_data_mappings(project).get("_preprocess") - if correction_fields: - preprocess_ops = [] - for field in correction_fields: - if field == "_variable_name": - preprocess_ops.append( - partial(correct_var_names, **correction_fields[field]) - ) - if field == "_time": - preprocess_ops.append( - partial(correct_time_entries, **correction_fields[field]) - ) - if preprocess_ops: - corrector = partial(_preprocess_correct, ops=preprocess_ops) - return corrector(ds) - return ds - - -def correct_unit_names(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: - """Correct unit names.""" - key = "_corrected_units" - for var, val in _iter_entry_key(d, m, "variables", key, p): - if val: - d[var].attrs["units"] = val - - # FIXME: This is no longer relevant. Performed under dimension conversion step. 
- val_time = _get_section_entry_key(m, "variables", "time", key, p) - if val_time: - d["time"].attrs["units"] = val_time - - return d - - -# for de-accumulation or conversion to flux -def transform_values(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: - """Transform dataset values according to operation listed.""" - key = "_transformation" - d_out = xr.Dataset(coords=d.coords, attrs=d.attrs) - converted = [] - offset, offset_meaning = None, None - - time_freq = dict() - expected_period = _get_section_entry_key( - m, "dimensions", "time", "_ensure_correct_time", p - ) - if isinstance(expected_period, str): - time_freq["expected_period"] = expected_period - - for vv, trans in _iter_entry_key(d, m, "variables", key, p): - if trans: - if trans == "deaccumulate": - # Time-step accumulated total to time-based flux (de-accumulation) - if offset is None and offset_meaning is None: - try: - offset, offset_meaning = get_time_frequency(d, **time_freq) - except TypeError: - logging.error( - "Unable to parse the time frequency. Verify data integrity before retrying." - ) - raise - - logging.info(f"De-accumulating units for variable `{vv}`.") - with xr.set_options(keep_attrs=True): - out = d[vv].diff(dim="time") - out = d[vv].where( - getattr(d[vv].time.dt, offset_meaning) == offset[0], - out.broadcast_like(d[vv]), - ) - out = units.amount2rate(out, out_units=m["variables"][vv]["units"]) - d_out[vv] = out - converted.append(vv) - elif trans == "amount2rate": - # NOTE: This treatment is no longer needed in xclim v0.43.0+ but is kept for backwards compatibility - # frequency-based totals to time-based flux - logging.info( - f"Performing amount-to-rate units conversion for variable `{vv}`." 
- ) - with xr.set_options(keep_attrs=True): - out = units.amount2rate( - d[vv], - out_units=m["variables"][vv]["units"], - ) - d_out[vv] = out - converted.append(vv) - elif isinstance(trans, str): - if trans.startswith("op "): - op = trans[3] - value = trans[4:].strip() - if value.startswith("attrs"): - value = units.str2pint(d[vv].attrs[value[6:]]) - else: - value = units.str2pint(value) - with xr.set_options(keep_attrs=True): - if op == "+": - value = units.convert_units_to(value, d[vv]) - d_out[vv] = d[vv] + value - elif op == "-": - value = units.convert_units_to(value, d[vv]) - d_out[vv] = d[vv] - value - elif op == "*": - d_out[vv] = units.pint_multiply(d[vv], value) - elif op == "/": - d_out[vv] = units.pint_multiply(d[vv], 1 / value) - else: - raise NotImplementedError( - f"Op transform doesn't implement the «{op}» operator." - ) - converted.append(vv) - else: - raise NotImplementedError(f"Unknown transformation: {trans}") - elif trans is False: - logging.info( - f"No transformations needed for `{vv}` (Explicitly set to False)." - ) - continue - - prev_history = d.attrs.get("history", "") - history = ( - f"Transformed variable `{vv}` values using method `{trans}`. 
{prev_history}" - ) - d_out.attrs.update(dict(history=history)) - - # Copy unconverted variables - for vv in d.data_vars: - if vv not in converted: - d_out[vv] = d[vv] - return d_out - - -def offset_time_dimension(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: - """Offset time dimension using listed frequency.""" - key = "_offset_time" - d_out = xr.Dataset(coords=d.coords, attrs=d.attrs) - converted = [] - offset, offset_meaning = None, None - - time_freq = dict() - expected_period = _get_section_entry_key( - m, "dimensions", "time", "_ensure_correct_time", p - ) - if isinstance(expected_period, str): - time_freq["expected_period"] = expected_period - - for vv, offs in _iter_entry_key(d, m, "dimensions", key, p): - if offs: - # Offset time by value of one time-step - if offset is None and offset_meaning is None: - try: - offset, offset_meaning = get_time_frequency(d, **time_freq) - except TypeError: - logging.error( - "Unable to parse the time frequency. Verify data integrity before retrying." - ) - raise - - logging.info( - f"Offsetting data for `{vv}` by `{offset[0]} {offset_meaning}(s)`." - ) - with xr.set_options(keep_attrs=True): - out = d[vv] - out["time"] = out.time - np.timedelta64(offset[0], offset[1]) - d_out[vv] = out - converted.append(vv) - elif offs is False: - logging.info( - f"No time offsetting needed for `{vv}` in `{p}` (Explicitly set to False)." - ) - continue - prev_history = d.attrs.get("history", "") - history = f"Offset variable `{vv}` values by `{offset[0]} {offset_meaning}(s). 
{prev_history}" - d_out.attrs.update(dict(history=history)) - - # Copy unconverted variables - for vv in d.data_vars: - if vv not in converted: - d_out[vv] = d[vv] - return d_out - - -def invert_value_sign(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: - """Flip value of DataArray.""" - key = "_invert_sign" - d_out = xr.Dataset(coords=d.coords, attrs=d.attrs) - converted = [] - for vv, inv_sign in _iter_entry_key(d, m, "variables", key, p): - if inv_sign: - logging.info(f"Inverting sign for `{vv}` (switching direction of values).") - with xr.set_options(keep_attrs=True): - out = d[vv] - d_out[out.name] = -out - converted.append(vv) - elif inv_sign is False: - logging.info( - f"No sign inversion needed for `{vv}` in `{p}` (Explicitly set to False)." - ) - continue - prev_history = d.attrs.get("history", "") - history = f"Inverted sign for variable `{vv}` (switched direction of values). {prev_history}" - d_out.attrs.update(dict(history=history)) - - # Copy unconverted variables - for vv in d.data_vars: - if vv not in converted: - d_out[vv] = d[vv] - return d_out - - -# For converting variable units to standard workflow units -def cf_units_conversion(d: xr.Dataset, m: dict) -> xr.Dataset: - """Perform pint-based units-conversion.""" - if "time" in m["dimensions"].keys(): - if m["dimensions"]["time"].get("units"): - d["time"]["units"] = m["dimensions"]["time"]["units"] - - for vv, unit in _iter_entry_key(d, m, "variables", "units", None): - if unit: - with xr.set_options(keep_attrs=True): - d[vv] = units.convert_units_to(d[vv], unit, context="hydro") - prev_history = d.attrs.get("history", "") - history = f"Converted variable `{vv}` to CF-compliant units (`{unit}`). 
{prev_history}" - d.attrs.update(dict(history=history)) - - return d - - -# For clipping variable values to an established maximum/minimum -def clip_values(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: - """Clip values to an appropriate range,.""" - key = "_clip_values" - d_out = xr.Dataset(coords=d.coords, attrs=d.attrs) - converted = [] - for vv in d.data_vars: - if vv in m["variables"].keys(): - clip_values = _get_section_entry_key(m, "variables", vv, key, p) - if clip_values: - min_value, max_value = None, None - # Gather unit conversion context, if applicable - context = clip_values.get("context", None) - for op, value in clip_values.items(): - if op == "min": - min_value = xclim.core.units.convert_units_to( - value, d[vv], context - ) - if op == "max": - max_value = xclim.core.units.convert_units_to( - value, d[vv], context - ) - logging.info( - f"Clipping min/max values for `{vv}` ({min_value}/{max_value})." - ) - with xr.set_options(keep_attrs=True): - out = d[vv] - d_out[out.name] = out.clip(min_value, max_value) - converted.append(vv) - elif clip_values is False: - logging.info( - f"No clipping of values needed for `{vv}` in `{p}` (Explicitly set to False)." - ) - continue - else: - logging.info(f"No clipping of values needed for `{vv}` in `{p}`.") - continue - - prev_history = d.attrs.get("history", "") - history = f"Clipped variable `{vv}` with `min={min_value}` and `max={max_value}`. {prev_history}" - d_out.attrs.update(dict(history=history)) - - # Copy unconverted variables - for vv in d.data_vars: - if vv not in converted: - d_out[vv] = d[vv] - - return d_out - - -def ensure_correct_time_frequency(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: - """Ensure that time frequency is consistent with expected frequency for project.""" - key = "_ensure_correct_time" - strict_time = "_strict_time" - - if "time" not in m["dimensions"].keys(): - logging.warning(f"No time corrections listed for project `{p}`. 
Continuing...") - return d - - if "time" not in list(d.variables.keys()): - logging.info( - "No time dimension among data variables: " - f"{' ,'.join([str(v) for v in d.variables.keys()])}. " - "Continuing..." - ) - return d - - if key in m["dimensions"]["time"].keys(): - freq_found = xr.infer_freq(d.time) - if strict_time in m["dimensions"]["time"].keys(): - if not freq_found: - msg = ( - "Time frequency could not be found. There may be missing timesteps." - ) - if m["dimensions"]["time"].get(strict_time): - raise ValueError(msg) - else: - logging.warning(f"{msg} Continuing...") - return d - - correct_time_entry = m["dimensions"]["time"][key] - if isinstance(correct_time_entry, str): - correct_times = [parse_offset(correct_time_entry)[1]] - elif isinstance(correct_time_entry, dict): - correct_times = correct_time_entry.get(p) - if isinstance(correct_times, list): - correct_times = [parse_offset(t)[1] for t in correct_times] - if correct_times is None: - logging.warning(f"No expected times set for specified project `{p}`.") - elif isinstance(correct_time_entry, list): - correct_times = correct_time_entry - else: - logging.warning("No expected times set for family of projects.") - return d - - if freq_found not in correct_times: - error_msg = ( - f"Time frequency {freq_found} not among allowed frequencies: " - f"{', '.join(correct_times) if isinstance(correct_times, list) else correct_times}" - ) - if isinstance(correct_time_entry, dict): - error_msg = f"{error_msg} for project `{p}`." - else: - error_msg = f"{error_msg}." - raise ValueError(error_msg) - - logging.info(f"Resampling dataset with time frequency: {freq_found}.") - with xr.set_options(keep_attrs=True): - d_out = d.assign_coords( - time=d.time.resample(time=freq_found).mean(dim="time").time - ) - d_out.time.attrs.update(d.time.attrs) - - prev_history = d.attrs.get("history", "") - history = f"Resampled time with `freq={freq_found}`. 
{prev_history}" - d_out.attrs.update(dict(history=history)) - return d_out - - return d - - -# For renaming and reordering lat and lon dims -def dimensions_compliance(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: - """Rename dimensions to CF to their equivalents and reorder them if needed. - - Parameters - ---------- - d : xarray.Dataset - Dataset with dimensions to be updated. - p : str - Dataset project name. - m : dict - Metadata definition dictionary for project and variable(s). - - Returns - ------- - xarray.Dataset - """ - rename_dims = dict() - for dim in d.dims: - if dim in m["dimensions"].keys(): - cf_name = _get_section_entry_key( - m, "dimensions", dim, "_cf_dimension_name", p - ) - if cf_name: - rename_dims[dim] = cf_name - d = d.rename(rename_dims) - for new in ["lon", "lat"]: - if new == "lon" and "lon" in d.coords: - if np.any(d.lon > 180): - lon1 = d.lon.where(d.lon <= 180.0, d.lon - 360.0) - d[new] = lon1 - - coord_precision = _get_section_entry_key(m, "dimensions", new, "_precision", p) - if coord_precision is not None: - d[new] = d[new].round(coord_precision) - - # Ensure that lon and lat are written in proper order for plotting purposes - transpose_order = [] - if "lat" in d.dims and "lon" in d.dims: - transpose_order = ["lat", "lon"] - elif "rlat" in d.dims and "rlon" in d.dims: - transpose_order = ["rlat", "rlon"] - if "time" in d.dims and transpose_order: - transpose_order.insert(0, "time") - transpose_order.extend(list(set(d.dims) - set(transpose_order))) - d = d.transpose(*transpose_order) - d = d.sortby(transpose_order) - - # Add dimension original name and update attrs - dim_descriptions = m["dimensions"] - for dim in m["dimensions"].keys(): - cf_name = dim_descriptions[dim].get("_cf_dimension_name") - if cf_name is not None and cf_name in d.dims: - d[cf_name].attrs.update(dict(original_variable=dim)) - else: - # variable name already follows CF standards - cf_name = dim - for field in dim_descriptions[dim].keys(): - if not 
field.startswith("_"): - d[cf_name].attrs.update({field: dim_descriptions[dim][field]}) - - prev_history = d.attrs.get("history", "") - history = f"Transposed and renamed dimensions. {prev_history}" - d.attrs.update(dict(history=history)) - - return d - - -def variable_conversion(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: - """Add variable metadata and remove nonstandard entries. - - Parameters - ---------- - d : xarray.Dataset - Dataset with variable(s) to be updated. - p : str - Dataset project name. - m : dict - Metadata definition dictionary for project and variable(s). - - Returns - ------- - xarray.Dataset - """ - var_descriptions = m["variables"] - var_correction_fields = [ - "_clip_values", - "_corrected_units", - "_invert_sign", - "_offset_time", - "_transformation", - ] - for var in d.variables: - if var in var_descriptions.keys(): - for field in var_correction_fields: - if field in var_descriptions[var].keys(): - del var_descriptions[var][field] - d[var].attrs.update(var_descriptions[var]) - - # Rename data variables - for orig_var_name, cf_name in _iter_entry_key( - d, m, "variables", "_cf_variable_name", None - ): - if cf_name is not None: - d = d.rename({orig_var_name: cf_name}) - d[cf_name].attrs.update(dict(original_variable=orig_var_name)) - del d[cf_name].attrs["_cf_variable_name"] - - return d - - -def metadata_conversion(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: - """Update xarray dataset and data_vars with project-specific metadata fields. - - Parameters - ---------- - d : xarray.Dataset - Dataset with metadata to be updated. - p : str - Dataset project name. - m : dict - Metadata definition dictionary for project and variable(s). 
- - Returns - ------- - xarray.Dataset - """ - logging.info("Converting metadata to CF-like conventions.") - - header = m["Header"] - - # Static handling of version global attributes - miranda_version = header.get("_miranda_version") - if miranda_version: - if isinstance(miranda_version, bool): - header["miranda_version"] = __miranda_version__ - elif isinstance(miranda_version, dict): - if p in miranda_version.keys(): - header["miranda_version"] = __miranda_version__ - else: - logging.warning( - f"`_miranda_version` not set for project `{p}`. Not appending." - ) - if "_miranda_version" in header: - del header["_miranda_version"] - - frequency = m["Header"].get("_frequency") - if frequency: - if isinstance(frequency, bool): - _, m["Header"]["frequency"] = get_time_frequency(d) - elif isinstance(frequency, dict): - if p in frequency.keys(): - m["Header"]["frequency"] = get_time_frequency(d) - else: - logging.warning("`frequency` not set for project. Not appending.") - if "_frequency" in m["Header"]: - del m["Header"]["_frequency"] - - # Conditional handling of global attributes based on project name - for field in [f for f in header if f.startswith("_")]: - if isinstance(header[field], list): - if p in header[field]: - attr_treatment = header[field][p] - else: - logging.warning( - f"Attribute handling (`{field}`) not set for project `{p}`. Continuing..." - ) - continue - elif isinstance(header[field], dict): - attr_treatment = header[field] - else: - raise AttributeError( - f"Attribute treatment configuration for field `{field}` is not properly configured. Verify JSON." - ) - - if field == "_map_attrs": - for attribute, mapping in attr_treatment.items(): - header[mapping] = d.attrs[attribute] - del d.attrs[attribute] - elif field == "_remove_attrs": - for ff in attr_treatment: - del d.attrs[ff] - else: - if field[1:] in d.attrs: - logging.warning( - f"Overwriting `{field[1:]}` based on JSON configuration." 
- ) - header[field[1:]] = attr_treatment - - del header[field] - - # Add global attributes - d.attrs.update(header) - d.attrs.update(dict(project=p)) - - # Date-based versioning - if not d.attrs.get("version"): - d.attrs.update(dict(version=f"v{VERSION}")) - - prev_history = d.attrs.get("history", "") - history = ( - f"[{datetime.datetime.now()}] " - "Converted variables and modified metadata for CF-like compliance: " - f"{prev_history}".strip() - ) - d.attrs.update(dict(history=history)) - - return d diff --git a/miranda/treatments/_variables.py b/miranda/treatments/_variables.py new file mode 100644 index 00000000..5991e4c7 --- /dev/null +++ b/miranda/treatments/_variables.py @@ -0,0 +1,273 @@ +from __future__ import annotations + +import logging.config + +import xarray as xr +import xclim.core.units +from xclim.core import units + +from miranda.treatments.utils import _get_section_entry_key # noqa +from miranda.treatments.utils import _iter_entry_key # noqa +from miranda.units import get_time_frequency + +__all__ = [ + "cf_units_conversion", + "clip_values", + "correct_unit_names", + "invert_value_sign", + "transform_values", + "variable_conversion", +] + + +def correct_unit_names(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: + """Correct unit names.""" + key = "_corrected_units" + for var, val in _iter_entry_key(d, m, "variables", key, p): + if val: + d[var].attrs["units"] = val + prev_history = d.attrs.get("history", "") + history = ( + f"Corrected units name for variable `{var}` to `{val}`. 
{prev_history}" + ) + d.attrs.update(dict(history=history)) + + return d + + +# for de-accumulation or conversion to flux +def transform_values(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: + """Transform dataset values according to operation listed.""" + key = "_transformation" + d_out = xr.Dataset(coords=d.coords, attrs=d.attrs) + converted = [] + offset, offset_meaning = None, None + + time_freq = dict() + expected_period = _get_section_entry_key( + m, "dimensions", "time", "_ensure_correct_time", p + ) + if isinstance(expected_period, str): + time_freq["expected_period"] = expected_period + + for vv, trans in _iter_entry_key(d, m, "variables", key, p): + if trans: + if trans == "deaccumulate": + # Time-step accumulated total to time-based flux (de-accumulation) + if offset is None and offset_meaning is None: + try: + offset, offset_meaning = get_time_frequency(d, **time_freq) + except TypeError: + logging.error( + "Unable to parse the time frequency. Verify data integrity before retrying." + ) + raise + + logging.info(f"De-accumulating units for variable `{vv}`.") + with xr.set_options(keep_attrs=True): + out = d[vv].diff(dim="time") + out = d[vv].where( + getattr(d[vv].time.dt, offset_meaning) == offset[0], + out.broadcast_like(d[vv]), + ) + out = units.amount2rate(out, out_units=m["variables"][vv]["units"]) + d_out[vv] = out + converted.append(vv) + elif trans == "amount2rate": + # NOTE: This treatment is no longer needed in xclim v0.43.0+ but is kept for backwards compatibility + # frequency-based totals to time-based flux + logging.info( + f"Performing amount-to-rate units conversion for variable `{vv}`." 
+ ) + with xr.set_options(keep_attrs=True): + out = units.amount2rate( + d[vv], + out_units=m["variables"][vv]["units"], + ) + d_out[vv] = out + converted.append(vv) + elif isinstance(trans, str): + if trans.startswith("op "): + op = trans[3] + value = trans[4:].strip() + if value.startswith("attrs"): + value = units.str2pint(d[vv].attrs[value[6:]]) + else: + value = units.str2pint(value) + with xr.set_options(keep_attrs=True): + if op == "+": + value = units.convert_units_to(value, d[vv]) + d_out[vv] = d[vv] + value + elif op == "-": + value = units.convert_units_to(value, d[vv]) + d_out[vv] = d[vv] - value + elif op == "*": + d_out[vv] = units.pint_multiply(d[vv], value) + elif op == "/": + d_out[vv] = units.pint_multiply(d[vv], 1 / value) + else: + raise NotImplementedError( + f"Op transform doesn't implement the «{op}» operator." + ) + converted.append(vv) + else: + raise NotImplementedError(f"Unknown transformation: {trans}") + elif trans is False: + logging.info( + f"No transformations needed for `{vv}` (Explicitly set to False)." + ) + continue + + prev_history = d.attrs.get("history", "") + history = ( + f"Transformed variable `{vv}` values using method `{trans}`. {prev_history}" + ) + d_out.attrs.update(dict(history=history)) + + # Copy unconverted variables + for vv in d.data_vars: + if vv not in converted: + d_out[vv] = d[vv] + return d_out + + +def invert_value_sign(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: + """Flip value of DataArray.""" + key = "_invert_sign" + d_out = xr.Dataset(coords=d.coords, attrs=d.attrs) + converted = [] + for vv, inv_sign in _iter_entry_key(d, m, "variables", key, p): + if inv_sign: + logging.info(f"Inverting sign for `{vv}` (switching direction of values).") + with xr.set_options(keep_attrs=True): + out = d[vv] + d_out[out.name] = -out + converted.append(vv) + elif inv_sign is False: + logging.info( + f"No sign inversion needed for `{vv}` in `{p}` (Explicitly set to False)." 
+ ) + continue + prev_history = d.attrs.get("history", "") + history = f"Inverted sign for variable `{vv}` (switched direction of values). {prev_history}" + d_out.attrs.update(dict(history=history)) + + # Copy unconverted variables + for vv in d.data_vars: + if vv not in converted: + d_out[vv] = d[vv] + return d_out + + +# For converting variable units to standard workflow units +def cf_units_conversion(d: xr.Dataset, m: dict) -> xr.Dataset: + """Perform pint-based units-conversion.""" + if "time" in m["dimensions"].keys(): + if m["dimensions"]["time"].get("units"): + d["time"]["units"] = m["dimensions"]["time"]["units"] + + for vv, unit in _iter_entry_key(d, m, "variables", "units", None): + if unit: + with xr.set_options(keep_attrs=True): + d[vv] = units.convert_units_to(d[vv], unit, context="hydro") + prev_history = d.attrs.get("history", "") + history = f"Converted variable `{vv}` to CF-compliant units (`{unit}`). {prev_history}" + d.attrs.update(dict(history=history)) + + return d + + +# For clipping variable values to an established maximum/minimum +def clip_values(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: + """Clip values to an appropriate range,.""" + key = "_clip_values" + d_out = xr.Dataset(coords=d.coords, attrs=d.attrs) + converted = [] + for vv in d.data_vars: + if vv in m["variables"].keys(): + clip_values = _get_section_entry_key(m, "variables", vv, key, p) + if clip_values: + min_value, max_value = None, None + # Gather unit conversion context, if applicable + context = clip_values.get("context", None) + for op, value in clip_values.items(): + if op == "min": + min_value = xclim.core.units.convert_units_to( + value, d[vv], context + ) + if op == "max": + max_value = xclim.core.units.convert_units_to( + value, d[vv], context + ) + logging.info( + f"Clipping min/max values for `{vv}` ({min_value}/{max_value})." 
+ ) + with xr.set_options(keep_attrs=True): + out = d[vv] + d_out[out.name] = out.clip(min_value, max_value) + converted.append(vv) + elif clip_values is False: + logging.info( + f"No clipping of values needed for `{vv}` in `{p}` (Explicitly set to False)." + ) + continue + else: + logging.info(f"No clipping of values needed for `{vv}` in `{p}`.") + continue + + prev_history = d.attrs.get("history", "") + history = f"Clipped variable `{vv}` with `min={min_value}` and `max={max_value}`. {prev_history}" + d_out.attrs.update(dict(history=history)) + + # Copy unconverted variables + for vv in d.data_vars: + if vv not in converted: + d_out[vv] = d[vv] + + return d_out + + +# For renaming and reordering lat and lon dims + + +def variable_conversion(d: xr.Dataset, p: str | None, m: dict) -> xr.Dataset: + """Add variable metadata and remove nonstandard entries. + + Parameters + ---------- + d : xarray.Dataset + Dataset with variable(s) to be updated. + p : str + Dataset project name. + m : dict + Metadata definition dictionary for project and variable(s). 
+ + Returns + ------- + xarray.Dataset + """ + var_descriptions = m["variables"] + var_correction_fields = [ + "_clip_values", + "_corrected_units", + "_invert_sign", + "_offset_time", + "_transformation", + ] + for var in d.variables: + if var in var_descriptions.keys(): + for field in var_correction_fields: + if field in var_descriptions[var].keys(): + del var_descriptions[var][field] + d[var].attrs.update(var_descriptions[var]) + + # Rename data variables + for orig_var_name, cf_name in _iter_entry_key( + d, m, "variables", "_cf_variable_name", p + ): + if cf_name is not None: + d = d.rename({orig_var_name: cf_name}) + d[cf_name].attrs.update(dict(original_variable=orig_var_name)) + del d[cf_name].attrs["_cf_variable_name"] + + return d diff --git a/miranda/treatments/utils.py b/miranda/treatments/utils.py new file mode 100644 index 00000000..e1b15a3a --- /dev/null +++ b/miranda/treatments/utils.py @@ -0,0 +1,64 @@ +"""Utility functions for GIS operations.""" +from __future__ import annotations + +import inspect +import json +from pathlib import Path +from typing import Any + +__all__ = [ + "load_json_data_mappings", +] + + +def _get_section_entry_key(meta, entry, var, key, project): + var_meta = meta[entry].get(var, {}) + if key in var_meta: + if isinstance(var_meta[key], dict): + config = var_meta[key].get(project) + if config is None and "all" in var_meta[key].keys(): + config = var_meta[key].get("all") + return config + return var_meta[key] + return None + + +def _iter_entry_key(ds, meta, entry, key, project): + for vv in set(ds.data_vars).intersection(meta[entry]): + val = _get_section_entry_key(meta, entry, vv, key, project) + yield vv, val + + +def load_json_data_mappings( + project: str, configurations: dict[str, Path] | None = None +) -> dict[str, Any]: + """Load JSON mappings for supported dataset conversions. 
+ + Parameters + ---------- + project : str + configurations: dict, optional + + Returns + ------- + dict[str, Any] + """ + if configurations is None: + calling_frame = inspect.currentframe().f_back + calling_file_path = calling_frame.f_globals["__file__"] + config_folder = Path(calling_file_path).parent / "configs" + + configurations = {} + for configuration in config_folder.glob("*attrs.json"): + project = str(configuration.stem).split("_")[0] + if "|" in project: + for p in project.split("|"): + configurations[p] = configuration + configurations[project] = configuration + + if project in configurations.keys(): + config_file = configurations[project] + metadata_definition = json.load(config_file.open()) + return metadata_definition + else: + raise NotImplementedError(f"Project not supported: {project}") diff --git a/miranda/vocabularies/__init__.py b/miranda/vocabularies/__init__.py index 74f0a223..0e05c103 100644 --- a/miranda/vocabularies/__init__.py +++ b/miranda/vocabularies/__init__.py @@ -1 +1,4 @@ """Controlled Vocabulary module.""" +from __future__ import annotations + +from . 
import eccc, esgf diff --git a/miranda/vocabularies/eccc.py b/miranda/vocabularies/eccc.py index bd739fed..f668ec63 100644 --- a/miranda/vocabularies/eccc.py +++ b/miranda/vocabularies/eccc.py @@ -72,12 +72,14 @@ obs_groupings = dict() obs_groupings["HLY"] = list( - obs_vocabularies["HLY01"] - + obs_vocabularies["HLY01_RCS"] - + obs_vocabularies["HLY03"] - + obs_vocabularies["HLY10"] - + obs_vocabularies["HLY15"] - + obs_vocabularies["HLY21"] + set( + obs_vocabularies["HLY01"] + + obs_vocabularies["HLY01_RCS"] + + obs_vocabularies["HLY03"] + + obs_vocabularies["HLY10"] + + obs_vocabularies["HLY15"] + + obs_vocabularies["HLY21"] + ) ) obs_groupings["DLY"] = list( set( From f5ca6824cc5d501005de1f348166fd78f054eed2 Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Thu, 29 Feb 2024 16:20:11 -0500 Subject: [PATCH 18/33] fix metadata, adjust tests --- miranda/preprocess/_metadata.py | 14 +- .../preprocess/configs/eccc-obs_attrs.json | 163 ++++++++++++++++-- tests/test_utils.py | 37 ++-- 3 files changed, 181 insertions(+), 33 deletions(-) diff --git a/miranda/preprocess/_metadata.py b/miranda/preprocess/_metadata.py index c8fcafa1..842fb92c 100644 --- a/miranda/preprocess/_metadata.py +++ b/miranda/preprocess/_metadata.py @@ -15,23 +15,23 @@ def eccc_variable_metadata( - variable_code: str, + variable_code: str | int, project: str, generation: int | None = None, metadata: dict | None = None, -) -> (dict[str, int | float | str], dict, list[tuple[int, int]], int): +) -> dict[str, Any]: """Return the metadata for a given variable code and project. 
Parameters ---------- - variable_code: str + variable_code: str or int project: {"eccc-ahccd", "eccc-obs", "eccc-obs-summary"} generation: {1, 2, 3}, optional metadata: dict, optional Returns ------- - dict[str, int or str or float], dict, list[tuple[int, int]], int + dict """ if project == "eccc-ahccd": generation = {1: "First", 2: "Second", 3: "Third"}.get(generation) @@ -42,6 +42,10 @@ def eccc_variable_metadata( if not metadata: metadata = load_json_data_mappings(project) + + if isinstance(variable_code, int): + variable_code = str(variable_code).zfill(3) + code = find_project_variable_codes(variable_code, metadata) # Variable metadata @@ -92,7 +96,7 @@ def eccc_variable_metadata( for field in to_delete: del header[field] - return variable_meta, header + return dict(metadata=variable_meta, header=header) def homogenized_column_definitions( diff --git a/miranda/preprocess/configs/eccc-obs_attrs.json b/miranda/preprocess/configs/eccc-obs_attrs.json index 300b559a..8f438b16 100644 --- a/miranda/preprocess/configs/eccc-obs_attrs.json +++ b/miranda/preprocess/configs/eccc-obs_attrs.json @@ -2,13 +2,6 @@ "Header": { "_frequency": true, "_miranda_version": true, - "_missing_flags": "M", - "_missing_values": [ - "-999", - "1e20", - "-9999", - "#####" - ], "acknowledgement": "This data is provided by Environment and Climate Change Canada (ECCC).", "author": "Environment and Climate Change Canada (ECCC)", "contact": "ccsc-cccs@ec.gc.ca", @@ -28,6 +21,8 @@ "variables": { "001": { "_variable_name": "tasmax", + "missing_flags": "M", + "missing_values": "-99999", "long_name": "Daily Maximum Temperature", "original_units": "0.1 °C", "scale_factor": 0.1, @@ -36,6 +31,8 @@ }, "002": { "_variable_name": "tasmin", + "missing_flags": "M", + "missing_values": "-99999", "long_name": "Daily Minimum Temperature", "original_units": "0.1 °C", "scale_factor": 0.1, @@ -44,6 +41,8 @@ }, "003": { "_variable_name": "tas", + "missing_flags": "M", + "missing_values": "-99999", "long_name": 
"Daily Mean Temperature", "original_units": "0.1 °C", "scale_factor": 0.1, @@ -52,6 +51,8 @@ }, "010": { "_variable_name": "prlptot", + "missing_flags": "M", + "missing_values": "-99999", "long_name": "Daily Total Rainfall", "original_units": "0.1 mm day-1", "scale_factor": 0.1, @@ -60,6 +61,8 @@ }, "011": { "_variable_name": "prsntot", + "missing_flags": "M", + "missing_values": "-99999", "long_name": "Daily Total Snowfall", "original_units": "0.1 cm day-1", "scale_factor": 0.1, @@ -68,6 +71,8 @@ }, "012": { "_variable_name": "prcptot", + "missing_flags": "M", + "missing_values": "-99999", "long_name": "Daily Total Precipitation", "original_units": "0.1 mm day-1", "scale_factor": 0.1, @@ -76,6 +81,8 @@ }, "013": { "_variable_name": "sndtot", + "missing_flags": "M", + "missing_values": "-99999", "long_name": "Snow on the Ground", "original_units": "cm", "scale_factor": 1, @@ -84,6 +91,8 @@ }, "014": { "_variable_name": "thunder", + "missing_flags": "M", + "missing_values": "-99999", "long_name": "Thunderstorms", "scale_factor": 1, "standard_name": "thunderstorm_presence", @@ -91,6 +100,8 @@ }, "015": { "_variable_name": "freezing_rain_drizzle", + "missing_flags": "M", + "missing_values": "-99999", "long_name": "Freezing rain or drizzle", "scale_factor": 1, "standard_name": "freeze_rain_drizzle_presence", @@ -98,6 +109,8 @@ }, "016": { "_variable_name": "hail", + "missing_flags": "M", + "missing_values": "-99999", "long_name": "Hail", "scale_factor": 1, "standard_name": "hail_presence", @@ -105,6 +118,8 @@ }, "017": { "_variable_name": "fog_ice_fog", + "missing_flags": "M", + "missing_values": "-99999", "long_name": "Fog or Ice Fog", "scale_factor": 1, "standard_name": "fog_ice_fog_presence", @@ -112,6 +127,8 @@ }, "018": { "_variable_name": "smoke_haze", + "missing_flags": "M", + "missing_values": "-99999", "long_name": "Smoke or Haze", "scale_factor": 1, "standard_name": "smoke_haze_presence", @@ -119,6 +136,8 @@ }, "019": { "_variable_name": "blowing_dust_sand", 
+ "missing_flags": "M", + "missing_values": "-99999", "long_name": "Blowing Dust or Sand", "scale_factor": 1, "standard_name": "blowing_dust_sand_presence", @@ -126,6 +145,8 @@ }, "020": { "_variable_name": "blow_snow", + "missing_flags": "M", + "missing_values": "-99999", "long_name": "Blowing snow", "scale_factor": 1, "standard_name": "blowing_snow_presence", @@ -135,12 +156,16 @@ "_variable_name": "wind_gt_28kt", "long_name": "Wind speed >= 28 Knots", "scale_factor": 1, + "missing_flags": "M", + "missing_values": "-99999", "standard_name": "wind_exceeding_28_knots", "units": "1" }, "022": { "_variable_name": "wind_gt_34kt", "long_name": "Wind speed >= 34 Knots", + "missing_flags": "M", + "missing_values": "-99999", "scale_factor": 1, "standard_name": "wind_exceeding_34_knots", "units": "1" @@ -150,6 +175,8 @@ "long_name": "Direction of extreme gust (16 pts) to December 1976", "original_units": "10's of degrees", "scale_factor": 10, + "missing_flags": "M", + "missing_values": "-99999", "standard_name": "gust_to_direction", "units": "deg" }, @@ -157,6 +184,8 @@ "_variable_name": "gust_speed", "long_name": "Speed of extreme gust", "original_units": "km/h", + "missing_flags": "M", + "missing_values": "-99999", "standard_name": "wind_speed_of_gust", "units": "km h-1" }, @@ -164,6 +193,8 @@ "_variable_name": "gust_hour", "long_name": "UTC hour of extreme gust", "standard_name": "hour_of_extreme_gust", + "missing_flags": "M", + "missing_values": "-99999", "units": "h" }, "061": { @@ -171,6 +202,8 @@ "long_name": "RF1 global solar radiation", "original_units": "0.001 MJ/m", "scale_factor": 0.001, + "missing_flags": "M", + "missing_values": "-99999", "standard_name": "solar_radiation_flux", "units": "MJ m-1" }, @@ -179,6 +212,8 @@ "long_name": "RF2 sky (diffuse) radiation", "original_units": "0.001 MJ/m", "scale_factor": 277.77777777777777, + "missing_flags": "M", + "missing_values": "-99999", "standard_name": "solar_radiation_flux", "units": "MJ m-1" }, @@ -186,6 +221,8 
@@ "_variable_name": "rf3_radiation", "long_name": "RF3 reflected solar radiation", "original_units": "0.001 MJ/m", + "missing_flags": "M", + "missing_values": "-99999", "scale_factor": 277.77777777777777, "standard_name": "solar_radiation_flux", "units": "MJ m-1" @@ -196,11 +233,15 @@ "original_units": "0.001 MJ/m", "scale_factor": 277.77777777777777, "standard_name": "solar_radiation_flux", + "missing_flags": "M", + "missing_values": "-99999", "units": "MJ m-1" }, "067": { "_variable_name": "rf7_radiation", "long_name": "RF7 daylight illumination", + "missing_flags": "M", + "missing_values": "-99999", "original_units": "0.01 Kilolux_hrs", "scale_factor": 0.01, "standard_name": "solar_radiation_flux", @@ -210,6 +251,8 @@ "_variable_name": "rf8_radiation", "long_name": "RF8 direct solar radiation", "original_units": "0.001 MJ/m", + "missing_flags": "M", + "missing_values": "-99999", "scale_factor": 277.77777777777777, "standard_name": "solar_radiation_flux", "units": "W m-2 h-1" @@ -218,6 +261,8 @@ "_variable_name": "wind_dir_45B", "long_name": "Direction - 45B anemometer (8 pts)", "original_units": "10's of degrees", + "missing_flags": "M", + "missing_values": "-99999", "scale_factor": 10, "standard_name": "wind_to_direction", "units": "deg" @@ -225,6 +270,8 @@ "071": { "_variable_name": "ceiling_hgt", "long_name": "Ceiling height of lowest layer of clouds", + "missing_flags": "M", + "missing_values": "-99999", "original_units": "30's of meters", "scale_factor": 30, "standard_name": "ceiling_cloud_height", @@ -235,6 +282,8 @@ "long_name": "Visibility", "original_units": "0.1 km", "scale_factor": 0.1, + "missing_flags": "M", + "missing_values": "-99999", "standard_name": "visibility_in_air", "units": "km" }, @@ -242,6 +291,8 @@ "_variable_name": "psl", "long_name": "Sea Level Pressure", "original_units": "0.01 kPa", + "missing_flags": "M", + "missing_values": "-99999", "scale_factor": 0.01, "standard_name": "air_pressure_at_mean_sea_level", "units": "kPa" @@ -249,6 
+300,8 @@ "074": { "_variable_name": "tds", "long_name": "Dew Point Temperature", + "missing_flags": "M", + "missing_values": "-99999", "original_units": "0.1 °C", "scale_factor": 0.1, "standard_name": "dew_point_temperature", @@ -257,6 +310,8 @@ "075": { "_variable_name": "wind_dir_u2a_16", "long_name": "Wind Direction at 2 m (U2A Anemometer) (16 pts)", + "missing_flags": "M", + "missing_values": "-99999", "original_units": "10's of degrees", "scale_factor": 10, "standard_name": "wind_direction_u2a", @@ -264,6 +319,8 @@ }, "076": { "_variable_name": "wind_speed_u2a", + "missing_flags": "M", + "missing_values": "-99999", "long_name": "Wind Speed - U2A (16 pts) to December 1970", "original_units": "km/h", "scale_factor": 1, @@ -273,6 +330,8 @@ "077": { "_variable_name": "pressure", "long_name": "Station Pressure", + "missing_flags": "M", + "missing_values": "-99999", "original_units": "0.01 kPa", "scale_factor": 0.01, "standard_name": "atmospheric_pressure", @@ -283,6 +342,8 @@ "long_name": "Dry Bulb Temperature", "original_units": "0.1 °C", "scale_factor": 0.1, + "missing_flags": "M", + "missing_values": "-99999", "standard_name": "dry_bulb_temperature", "units": "degC" }, @@ -291,6 +352,8 @@ "long_name": "Wet Bulb temperature", "original_units": "0.1 °C", "scale_factor": 0.1, + "missing_flags": "M", + "missing_values": "-99999", "standard_name": "wet_bulb_temperature", "units": "degC" }, @@ -298,6 +361,8 @@ "_variable_name": "hur", "long_name": "Relative Humidity", "original_units": "%", + "missing_flags": "M", + "missing_values": "-99999", "scale_factor": 1, "standard_name": "relative_humidity", "units": "1" @@ -306,6 +371,8 @@ "_variable_name": "clo", "long_name": "Total Cloud Opacity", "original_units": "%", + "missing_flags": "M", + "missing_values": "-99999", "scale_factor": 10, "standard_name": "cloud_albedo", "units": "1" @@ -314,6 +381,8 @@ "_variable_name": "clt", "long_name": "Total Cloud Amount", "original_units": "%", + "missing_flags": "M", + 
"missing_values": "-99999", "scale_factor": 10, "standard_name": "cloud_area_fraction", "units": "1" @@ -323,20 +392,26 @@ "long_name": "Freezing Rain", "scale_factor": 1, "standard_name": "freezing_rain", - "units": "1" + "units": "1", + "missing_flags": "M", + "missing_values": "-99999" }, "094": { "_variable_name": "ice_pellets", "long_name": "Ice Pellets", "scale_factor": 1, "standard_name": "ice_pellet_presence", - "units": "1" + "units": "1", + "missing_flags": "M", + "missing_values": "-99999" }, "107": { "_variable_name": "1low_cloud_opac", "long_name": "Lowest cloud layer opacity", "original_units": "Tenths", "scale_factor": 10, + "missing_flags": "M", + "missing_values": "-99999", "standard_name": "low_type_cloud_opacity_fraction", "units": "1" }, @@ -345,6 +420,8 @@ "long_name": "Lowest cloud layer amount or condition", "original_units": "Tenths", "scale_factor": 10, + "missing_flags": "M", + "missing_values": "-99999", "standard_name": "low_type_cloud_area_fraction", "units": "1" }, @@ -352,6 +429,8 @@ "_variable_name": "1low_cloud_type", "long_name": "Lowest cloud layer type", "standard_name": "low_type_cloud_type", + "missing_flags": "M", + "missing_values": "-99999", "units": "1" }, "110": { @@ -359,6 +438,8 @@ "long_name": "Lowest cloud layer height", "original_units": "30's of meters", "scale_factor": 30, + "missing_flags": "M", + "missing_values": "-99999", "standard_name": "low_type_cloud_height", "units": "m" }, @@ -366,6 +447,8 @@ "_variable_name": "2low_cloud_opac", "long_name": "Second lowest cloud layer opacity", "original_units": "Tenths", + "missing_flags": "M", + "missing_values": "-99999", "scale_factor": 10, "standard_name": "low_type_cloud_opacity_fraction", "units": "1" @@ -373,6 +456,8 @@ "112": { "_variable_name": "2low_cloud_frac", "long_name": "Second lowest cloud layer amount or condition", + "missing_flags": "M", + "missing_values": "-99999", "original_units": "Tenths", "scale_factor": 10, "standard_name": 
"low_type_cloud_area_fraction", @@ -381,6 +466,8 @@ "113": { "_variable_name": "2low_cloud_type", "long_name": "Second lowest cloud layer type", + "missing_flags": "M", + "missing_values": "-99999", "original_units": "", "scale_factor": 1, "standard_name": "low_type_cloud_type", @@ -390,6 +477,8 @@ "_variable_name": "2low_cloud_hgt", "long_name": "Second lowest cloud layer height", "original_units": "30's of meters", + "missing_flags": "M", + "missing_values": "-99999", "scale_factor": 30, "standard_name": "low_type_cloud_height", "units": "m" @@ -398,12 +487,16 @@ "_variable_name": "3low_cloud_opac", "long_name": "Thirsd lowest cloud layer opacity", "original_units": "Tenths", + "missing_flags": "M", + "missing_values": "-99999", "scale_factor": 10, "standard_name": "low_type_cloud_opacity_fraction", "units": "1" }, "116": { "_variable_name": "3low_cloud_frac", + "missing_flags": "M", + "missing_values": "-99999", "long_name": "Third lowest cloud layer amount or condition", "original_units": "Tenths", "scale_factor": 10, @@ -414,6 +507,8 @@ "_variable_name": "3low_cloud_type", "long_name": "Third lowest cloud layer type", "original_units": "", + "missing_flags": "M", + "missing_values": "-99999", "scale_factor": 1, "standard_name": "low_type_cloud_type", "units": "1" @@ -423,6 +518,8 @@ "long_name": "Third lowest cloud layer height", "original_units": "30's of meters", "scale_factor": 30, + "missing_flags": "M", + "missing_values": "-99999", "standard_name": "low_type_cloud_height", "units": "m" }, @@ -431,6 +528,8 @@ "long_name": "Total Rainfall", "original_units": "0.1 mm", "scale_factor": 0.1, + "missing_flags": "M", + "missing_values": "-99999", "standard_name": "rainfall_flux", "units": "mm h-1" }, @@ -439,6 +538,8 @@ "long_name": "Sunshine", "original_units": "0.1 hrs", "scale_factor": 0.1, + "missing_flags": "M", + "missing_values": "-99999", "standard_name": "duration_of_sunshine", "units": "h" }, @@ -446,6 +547,8 @@ "_variable_name": "wind_dir_u2a_36", 
"long_name": "Wind Direction - U2A (36 pts) from January 1971", "original_units": "10's of degrees", + "missing_flags": "M", + "missing_values": "-99999", "scale_factor": 10, "standard_name": "wind_direction_u2a", "units": "deg" @@ -455,6 +558,8 @@ "long_name": "Total Precipitation (minutes 00-60)", "original_units": "0.1 mm", "scale_factor": 0.1, + "missing_flags": "M", + "missing_values": "-99999", "standard_name": "precipitation_amount", "units": "mm" }, @@ -462,6 +567,8 @@ "_variable_name": "prtot_q1", "long_name": "Total Precipitation (minutes 00-15)", "original_units": "0.1 mm", + "missing_flags": "M", + "missing_values": "-99999", "scale_factor": 0.1, "standard_name": "precipitation_amount", "units": "mm" @@ -472,6 +579,8 @@ "original_units": "0.1 mm", "scale_factor": 0.1, "standard_name": "precipitation_amount", + "missing_flags": "M", + "missing_values": "-99999", "units": "mm" }, "265": { @@ -479,6 +588,8 @@ "long_name": "Total Precipitation (minutes 30-45)", "original_units": "0.1 mm", "scale_factor": 0.1, + "missing_flags": "M", + "missing_values": "-99999", "standard_name": "precipitation_amount", "units": "mm" }, @@ -486,12 +597,16 @@ "_variable_name": "prtot_q4", "long_name": "Total Precipitation (minutes 45-60)", "original_units": "0.1 mm", + "missing_flags": "M", + "missing_values": "-99999", "scale_factor": 0.1, "standard_name": "precipitation_amount", "units": "mm" }, "267": { "_variable_name": "precipitation_weight_q1", + "missing_flags": "M", + "missing_values": "-99999", "long_name": "Precipitation Gauge Weight per Unit Area (at minute 15)", "original_units": "0.1 kg/m²", "scale_factor": 0.1, @@ -504,6 +619,8 @@ "original_units": "0.1 kg/m²", "scale_factor": 0.1, "standard_name": "precipitation_amount", + "missing_flags": "M", + "missing_values": "-99999", "units": "kg m-2" }, "269": { @@ -512,7 +629,9 @@ "original_units": "0.1 kg/m²", "scale_factor": 0.1, "standard_name": "precipitation_amount", - "units": "kg m-2" + "units": "kg m-2", + 
"missing_flags": "M", + "missing_values": "-99999" }, "270": { "_variable_name": "precipitation_weight_q4", @@ -520,11 +639,15 @@ "original_units": "0.1 kg/m²", "scale_factor": 0.1, "standard_name": "precipitation_amount", + "missing_flags": "M", + "missing_values": "-99999", "units": "kg m-2" }, "271": { "_variable_name": "wind_speed_q1", "long_name": "Wind Speed at 2 m (minutes 00-15)", + "missing_flags": "M", + "missing_values": "-99999", "original_units": "0.1 km/h", "scale_factor": 0.1, "standard_name": "wind_speed", @@ -535,6 +658,8 @@ "long_name": "Wind Speed at 2 m (minutes 15-30)", "original_units": "0.1 km/h", "scale_factor": 0.1, + "missing_flags": "M", + "missing_values": "-99999", "standard_name": "wind_speed", "units": "km h-1" }, @@ -544,11 +669,15 @@ "original_units": "0.1 km/h", "scale_factor": 0.1, "standard_name": "wind_speed", - "units": "km h-1" + "units": "km h-1", + "missing_flags": "M", + "missing_values": "-99999" }, "274": { "_variable_name": "wind_speed_q4", "long_name": "Wind Speed at 2 m (minutes 45-60)", + "missing_flags": "M", + "missing_values": "-99999", "original_units": "0.1 km/h", "scale_factor": 0.1, "standard_name": "wind_speed", @@ -558,6 +687,8 @@ "_variable_name": "snd_q4", "long_name": "Snow Depth (at minute 60)", "original_units": "cm", + "missing_flags": "M", + "missing_values": "-99999", "standard_name": "surface_snow_thickness", "units": "cm" }, @@ -565,6 +696,8 @@ "_variable_name": "snd_q1", "long_name": "Snow Depth (at minute 15)", "original_units": "cm", + "missing_flags": "M", + "missing_values": "-99999", "scale_factor": 1, "standard_name": "surface_snow_thickness", "units": "cm" @@ -574,6 +707,8 @@ "long_name": "Snow Depth (at minute 30)", "original_units": "cm", "scale_factor": 1, + "missing_flags": "M", + "missing_values": "-99999", "standard_name": "surface_snow_thickness", "units": "cm" }, @@ -581,6 +716,8 @@ "_variable_name": "snd_q3", "long_name": "Snow Depth (at minute 45)", "original_units": "cm", + 
"missing_flags": "M", + "missing_values": "-99999", "scale_factor": 1, "standard_name": "surface_snow_thickness", "units": "cm" @@ -588,6 +725,8 @@ "279": { "_variable_name": "wind_dir", "long_name": "Wind Direction at 2 m (minutes 50-60)", + "missing_flags": "M", + "missing_values": "-99999", "nc_units": "deg", "original_units": "Degrees", "standard_name": "wind_direction" @@ -597,6 +736,8 @@ "long_name": "Wind Speed at 2 m (minutes 50-60)", "original_units": "0.1 km/h", "scale_factor": 0.1, + "missing_flags": "M", + "missing_values": "-99999", "standard_name": "wind_speed", "units": "km h-1" } diff --git a/tests/test_utils.py b/tests/test_utils.py index ce1bfca3..85aee93a 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -4,9 +4,9 @@ from datetime import date from pathlib import Path -import pytest # noqa +import pytest -import miranda.eccc._utils as eccc_utils # noqa +import miranda.preprocess._metadata as metadata import miranda.utils @@ -28,12 +28,13 @@ def test_hourly_cf_dictionaries(self): codes = list() variables = dict() for key in keys: - variables[key] = eccc_utils.cf_station_metadata(key) - codes.append(variables[key]["standard_name"]) - if variables[key]["standard_name"] == "dry_bulb_temperature": - assert variables[key]["raw_units"] == "degC" - assert variables[key]["units"] == "K" - assert variables[key]["missing_flags"] == "M" + variables[key] = metadata.eccc_variable_metadata(key, "eccc-obs") + var_name = next(iter(variables[key]["metadata"])) + var_metadata = variables[key]["metadata"][var_name] + codes.append(var_metadata["standard_name"]) + if var_metadata["standard_name"] == "dry_bulb_temperature": + assert var_metadata["units"] == "degC" + assert var_metadata["missing_flags"] == "M" assert set(codes) == { "wind_speed_u2a", @@ -57,15 +58,17 @@ def test_daily_cf_dictionaries(self): codes = list() variables = dict() for key in keys: - variables[key] = eccc_utils.cf_station_metadata(key) - codes.append(variables[key]["standard_name"]) 
- if variables[key]["standard_name"].startswith("air_temperature"): - assert variables[key]["raw_units"] == "degC" - assert variables[key]["units"] == "K" - elif variables[key]["standard_name"].endswith("precipitation_amount"): - assert variables[key]["raw_units"] in ["cm", "mm"] - assert variables[key]["units"] == "m" - assert variables[key]["missing_flags"] == "M" + variables[key] = metadata.eccc_variable_metadata(key, "eccc-obs") + + var_name = next(iter(variables[key]["metadata"])) + var_metadata = variables[key]["metadata"][var_name] + codes.append(var_metadata["standard_name"]) + + if var_name.startswith("air_temperature"): + assert var_metadata["units"] == "degC" + elif var_name.endswith("precipitation_amount"): + assert var_metadata["units"] in ["cm", "mm"] + assert var_metadata["missing_flags"] == "M" assert set(codes) == { "air_temperature", From c2e442aa323b0d0a1f425ae5bd58cd3ad2bc65a4 Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Wed, 6 Mar 2024 15:47:04 -0500 Subject: [PATCH 19/33] error handling --- miranda/preprocess/_metadata.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/miranda/preprocess/_metadata.py b/miranda/preprocess/_metadata.py index 842fb92c..f39ff79d 100644 --- a/miranda/preprocess/_metadata.py +++ b/miranda/preprocess/_metadata.py @@ -46,7 +46,10 @@ def eccc_variable_metadata( if isinstance(variable_code, int): variable_code = str(variable_code).zfill(3) - code = find_project_variable_codes(variable_code, metadata) + try: + code = find_project_variable_codes(variable_code, metadata) + except KeyError: + raise KeyError(f"Variable code `{variable_code}` not found in metadata.") # Variable metadata variable_meta = metadata["variables"].get(code) From f5fce92f45766e5a1fdc365623efaca213d30415 Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Wed, 6 Mar 2024 15:48:30 -0500 Subject: [PATCH 20/33] better error handling --- 
miranda/preprocess/_metadata.py | 5 +---- miranda/treatments/_dimensions.py | 4 ++++ 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/miranda/preprocess/_metadata.py b/miranda/preprocess/_metadata.py index f39ff79d..842fb92c 100644 --- a/miranda/preprocess/_metadata.py +++ b/miranda/preprocess/_metadata.py @@ -46,10 +46,7 @@ def eccc_variable_metadata( if isinstance(variable_code, int): variable_code = str(variable_code).zfill(3) - try: - code = find_project_variable_codes(variable_code, metadata) - except KeyError: - raise KeyError(f"Variable code `{variable_code}` not found in metadata.") + code = find_project_variable_codes(variable_code, metadata) # Variable metadata variable_meta = metadata["variables"].get(code) diff --git a/miranda/treatments/_dimensions.py b/miranda/treatments/_dimensions.py index cd56e243..134faf23 100644 --- a/miranda/treatments/_dimensions.py +++ b/miranda/treatments/_dimensions.py @@ -27,6 +27,10 @@ def find_project_variable_codes(code: str, configuration: dict[str, Any]) -> str str """ variable_codes = {} + + if "variables" not in configuration: + raise ValueError("No `variables` section found in configuration. 
Check JSON.") + for variable_code in configuration["variables"]: variable_name = configuration["variables"][variable_code].get("_variable_name") if variable_name: From 75e8015e1f7431b95d3f171c5258a2bf6bbf72bc Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Wed, 6 Mar 2024 15:56:38 -0500 Subject: [PATCH 21/33] import fixes --- miranda/convert/eccc_rdrs.py | 7 ++----- miranda/preprocess/eccc.py | 2 +- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/miranda/convert/eccc_rdrs.py b/miranda/convert/eccc_rdrs.py index e0abfbcf..cd8c63a6 100644 --- a/miranda/convert/eccc_rdrs.py +++ b/miranda/convert/eccc_rdrs.py @@ -11,14 +11,11 @@ from miranda.io import fetch_chunk_config, write_dataset_dict from miranda.scripting import LOGGING_CONFIG +from miranda.treatments import load_json_data_mappings from miranda.units import get_time_frequency from ._aggregation import aggregate -from ._data_definitions import ( - gather_eccc_rdrs, - gather_raw_rdrs_by_years, - load_json_data_mappings, -) +from ._data_definitions import gather_eccc_rdrs, gather_raw_rdrs_by_years from .corrections import dataset_conversion logging.config.dictConfig(LOGGING_CONFIG) diff --git a/miranda/preprocess/eccc.py b/miranda/preprocess/eccc.py index c94f9f9d..9dec2a56 100644 --- a/miranda/preprocess/eccc.py +++ b/miranda/preprocess/eccc.py @@ -9,7 +9,7 @@ from pathlib import Path from typing import Callable -from dask.distributed import ProgressBar +from dask.diagnostics import ProgressBar from miranda.scripting import LOGGING_CONFIG from miranda.storage import file_size, report_file_size From 8c4ffccc7099f4c2511393d60f363dbd78d5ee0d Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Mon, 18 Mar 2024 15:50:05 -0400 Subject: [PATCH 22/33] fix logic --- miranda/preprocess/_metadata.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/miranda/preprocess/_metadata.py 
b/miranda/preprocess/_metadata.py index 842fb92c..db24aae1 100644 --- a/miranda/preprocess/_metadata.py +++ b/miranda/preprocess/_metadata.py @@ -4,7 +4,8 @@ from typing import Any from miranda import __version__ as __miranda_version__ -from miranda.treatments import find_project_variable_codes + +# from miranda.treatments import find_project_variable_codes from miranda.treatments.utils import load_json_data_mappings __all__ = [ @@ -46,13 +47,13 @@ def eccc_variable_metadata( if isinstance(variable_code, int): variable_code = str(variable_code).zfill(3) - code = find_project_variable_codes(variable_code, metadata) + # code = find_project_variable_codes(variable_code, metadata) # Variable metadata - variable_meta = metadata["variables"].get(code) + variable_meta = metadata["variables"].get(variable_code) variable_name = variable_meta.get("_variable_name") if variable_name: - variable_meta["original_variable_name"] = variable_code + variable_meta["original_variable_code"] = variable_code variable_meta = {variable_name: variable_meta} del variable_meta[variable_name]["_variable_name"] else: From 448ba0782919130b4087ae608157aa5f5788fa77 Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Mon, 18 Mar 2024 16:19:57 -0400 Subject: [PATCH 23/33] fix logic --- miranda/preprocess/_metadata.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/miranda/preprocess/_metadata.py b/miranda/preprocess/_metadata.py index db24aae1..77a827ba 100644 --- a/miranda/preprocess/_metadata.py +++ b/miranda/preprocess/_metadata.py @@ -51,11 +51,14 @@ def eccc_variable_metadata( # Variable metadata variable_meta = metadata["variables"].get(variable_code) - variable_name = variable_meta.get("_variable_name") - if variable_name: - variable_meta["original_variable_code"] = variable_code - variable_meta = {variable_name: variable_meta} - del variable_meta[variable_name]["_variable_name"] + variable_name_fields = 
["_variable_name", "_cf_variable_name"] + if set(variable_name_fields).issubset(variable_meta.keys()): + for variable_field in variable_name_fields: + variable_name = variable_meta.get(variable_field) + if variable_name: + variable_meta["original_variable_code"] = variable_code + del variable_meta[variable_field] + variable_meta = {variable_name: variable_meta} else: variable_meta = {variable_code: variable_meta} From 53b9c35d9113f88b64d22180b3a5a23c13d11f47 Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Mon, 18 Mar 2024 16:36:00 -0400 Subject: [PATCH 24/33] fix logic --- miranda/preprocess/_metadata.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/miranda/preprocess/_metadata.py b/miranda/preprocess/_metadata.py index 77a827ba..798a710d 100644 --- a/miranda/preprocess/_metadata.py +++ b/miranda/preprocess/_metadata.py @@ -49,8 +49,11 @@ def eccc_variable_metadata( # code = find_project_variable_codes(variable_code, metadata) + print("stuff") + # Variable metadata variable_meta = metadata["variables"].get(variable_code) + variable_name = "" variable_name_fields = ["_variable_name", "_cf_variable_name"] if set(variable_name_fields).issubset(variable_meta.keys()): for variable_field in variable_name_fields: @@ -61,6 +64,8 @@ def eccc_variable_metadata( variable_meta = {variable_name: variable_meta} else: variable_meta = {variable_code: variable_meta} + if not variable_name: + variable_name = variable_code # Dataset metadata header = metadata.get("Header") From 159fc94dfd47fd251d818cc7ff12d5267bba48f4 Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Mon, 18 Mar 2024 16:49:03 -0400 Subject: [PATCH 25/33] add files to sdist --- pyproject.toml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 89f43a68..2323ecd5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -185,7 +185,11 @@ include = [ "docs/make.bat", 
"tests/*.py", "tox.ini", - "miranda" + "miranda", + "miranda/convert/configs/*.json", + "miranda/preprocess/configs/*.csv", + "miranda/preprocess/configs/*.json", + "miranda/structure/data/*.yml" ] exclude = [ "*.py[co]", From 586f4e6d3500795ae6324ff18229e10d70253a81 Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Mon, 18 Mar 2024 17:05:17 -0400 Subject: [PATCH 26/33] synchronize dependencies --- .github/workflows/main.yml | 4 ++-- pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index a2eaded8..e6779597 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -40,7 +40,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install tox run: | - python -m pip install tox + python -m pip install "tox>=4.5.0" "pip>=23.3.0" flit - name: Run lint and docs testing suite run: | python -m tox -e ${{ matrix.tox-env }} @@ -77,7 +77,7 @@ jobs: sudo apt-get install libgdal-dev - name: Install tox run: | - python -m pip install tox + python -m pip install "tox>=4.5.0" "pip>=23.3.0" flit - name: Test with tox run: | python -m tox -e ${{ matrix.tox-env }} diff --git a/pyproject.toml b/pyproject.toml index 2323ecd5..0a2a58d1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,7 +52,7 @@ dependencies = [ [project.optional-dependencies] dev = [ # Dev tools and testing - "pip >=23.1.2", + "pip >=23.3.0", "bump-my-version >=0.18.3", "watchdog >=3.0.0", "flake8 >=6.1.0", From 6c5586ba95304bec7d6fb565af9fad8e928ab626 Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Mon, 18 Mar 2024 17:05:25 -0400 Subject: [PATCH 27/33] debugging --- miranda/preprocess/_metadata.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/miranda/preprocess/_metadata.py b/miranda/preprocess/_metadata.py index 798a710d..bbe234ec 100644 --- a/miranda/preprocess/_metadata.py +++ 
b/miranda/preprocess/_metadata.py @@ -34,6 +34,8 @@ def eccc_variable_metadata( ------- dict """ + print(locals()) + if project == "eccc-ahccd": generation = {1: "First", 2: "Second", 3: "Third"}.get(generation) if not generation: @@ -43,16 +45,18 @@ def eccc_variable_metadata( if not metadata: metadata = load_json_data_mappings(project) + print(metadata) if isinstance(variable_code, int): variable_code = str(variable_code).zfill(3) # code = find_project_variable_codes(variable_code, metadata) - print("stuff") - # Variable metadata variable_meta = metadata["variables"].get(variable_code) + if variable_meta is None: + raise ValueError(f"No metadata found for variable code: {variable_code}") + variable_name = "" variable_name_fields = ["_variable_name", "_cf_variable_name"] if set(variable_name_fields).issubset(variable_meta.keys()): From e2762d396d24cb4cfb8f2959bd59b0eb05b64bba Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Mon, 18 Mar 2024 17:20:53 -0400 Subject: [PATCH 28/33] chase down bug --- miranda/preprocess/_metadata.py | 3 --- miranda/treatments/utils.py | 6 +++--- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/miranda/preprocess/_metadata.py b/miranda/preprocess/_metadata.py index bbe234ec..867f53e6 100644 --- a/miranda/preprocess/_metadata.py +++ b/miranda/preprocess/_metadata.py @@ -34,8 +34,6 @@ def eccc_variable_metadata( ------- dict """ - print(locals()) - if project == "eccc-ahccd": generation = {1: "First", 2: "Second", 3: "Third"}.get(generation) if not generation: @@ -45,7 +43,6 @@ def eccc_variable_metadata( if not metadata: metadata = load_json_data_mappings(project) - print(metadata) if isinstance(variable_code, int): variable_code = str(variable_code).zfill(3) diff --git a/miranda/treatments/utils.py b/miranda/treatments/utils.py index 9e49bc63..e6f332a6 100644 --- a/miranda/treatments/utils.py +++ b/miranda/treatments/utils.py @@ -51,11 +51,11 @@ def load_json_data_mappings( 
configurations = {} for configuration in config_folder.glob("*attrs.json"): - project = str(configuration.stem).split("_")[0] + project_config = str(configuration.stem).split("_")[0] if "|" in project: - for p in project.split("|"): + for p in project_config.split("|"): configurations[p] = configuration - configurations[project] = configuration + configurations[project_config] = configuration if project in configurations.keys(): config_file = configurations[project] From d20be40dc45aaada37726df9244eff25d9fc9679 Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Tue, 19 Mar 2024 11:20:56 -0400 Subject: [PATCH 29/33] update attr treatments --- miranda/convert/corrections.py | 27 +++++++++++++-------------- miranda/treatments/__init__.py | 25 ++++++++++++------------- 2 files changed, 25 insertions(+), 27 deletions(-) diff --git a/miranda/convert/corrections.py b/miranda/convert/corrections.py index 0fcce7d4..7745cc42 100644 --- a/miranda/convert/corrections.py +++ b/miranda/convert/corrections.py @@ -3,9 +3,8 @@ from __future__ import annotations import datetime -import os +import pathlib from functools import partial -from pathlib import Path from typing import Callable, Iterator, Sequence import xarray as xr @@ -27,7 +26,7 @@ ) from miranda.treatments.utils import load_json_data_mappings -CONFIG_FOLDER = Path(__file__).parent / "data" +CONFIG_FOLDER = pathlib.Path(__file__).parent / "data" CONFIG_FILES = { "EMDNA": "emdna_cf_attrs.json", "ESPO-G6-E5L": "espo-g6-e5l_attrs.json", @@ -89,9 +88,9 @@ def dataset_corrections(ds: xr.Dataset, project: str) -> xr.Dataset: def dataset_conversion( input_files: ( str - | os.PathLike - | Sequence[str | os.PathLike] - | Iterator[os.PathLike] + | pathlib.Path + | Sequence[str | pathlib.Path] + | Iterator[pathlib.Path] | xr.Dataset ), project: str, @@ -107,7 +106,7 @@ def dataset_conversion( Parameters ---------- - input_files : str or os.PathLike or Sequence[str or os.PathLike] or 
Iterator[os.PathLike] or xr.Dataset + input_files : str or pathlib.Path or Sequence[str or pathlib.Path] or Iterator[pathlib.Path] or xr.Dataset Files or objects to be converted. If sent a list or GeneratorType, will open with :py:func:`xarray.open_mfdataset` and concatenate files. project : {"cordex", "cmip5", "cmip6", "ets-grnch", "isimip-ft", "pcic-candcs-u6", "converted"} @@ -136,15 +135,15 @@ def dataset_conversion( if isinstance(input_files, xr.Dataset): ds = input_files else: - if isinstance(input_files, (str, os.PathLike)): - if Path(input_files).is_dir(): + if isinstance(input_files, (str, pathlib.Path)): + if pathlib.Path(input_files).is_dir(): files = [] - files.extend([f for f in Path(input_files).glob("*.nc")]) - files.extend([f for f in Path(input_files).glob("*.zarr")]) + files.extend([f for f in pathlib.Path(input_files).glob("*.nc")]) + files.extend([f for f in pathlib.Path(input_files).glob("*.zarr")]) else: - files = [Path(input_files)] + files = [pathlib.Path(input_files)] elif isinstance(input_files, (Sequence, Iterator)): - files = [Path(f) for f in input_files] + files = [pathlib.Path(f) for f in input_files] else: files = input_files version_hashes = dict() @@ -175,7 +174,7 @@ def dataset_conversion( if domain: ds = subset_domain(ds, domain) - if isinstance(mask, (str, Path)): + if isinstance(mask, (str, pathlib.Path)): mask = xr.open_dataset(mask) if isinstance(mask, (xr.Dataset, xr.DataArray)): if regrid: diff --git a/miranda/treatments/__init__.py b/miranda/treatments/__init__.py index 11e62fd2..91255248 100644 --- a/miranda/treatments/__init__.py +++ b/miranda/treatments/__init__.py @@ -5,7 +5,7 @@ import datetime import logging.config -import xarray as xr +import xarray from miranda import __version__ as __miranda_version__ from miranda.scripting import LOGGING_CONFIG @@ -19,7 +19,7 @@ VERSION = datetime.datetime.now().strftime("%Y.%m.%d") -def metadata_conversion(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: +def 
metadata_conversion(d: xarray.Dataset, p: str, m: dict) -> xarray.Dataset: """Update xarray dataset and data_vars with project-specific metadata fields. Parameters @@ -67,36 +67,35 @@ def metadata_conversion(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: del m["Header"]["_frequency"] # Conditional handling of global attributes based on project name - for field in [f for f in header if f.startswith("_")]: + for field in [f for f in header.keys() if f.startswith("_")]: if isinstance(header[field], list): if p in header[field]: - attr_treatment = header[field][p] + attr_treatments = header[field][p] else: logging.warning( f"Attribute handling (`{field}`) not set for project `{p}`. Continuing..." ) continue elif isinstance(header[field], dict): - attr_treatment = header[field] + attr_treatments = header[field] else: raise AttributeError( f"Attribute treatment configuration for field `{field}` is not properly configured. Verify JSON." ) + if field[1:] in d.attrs: + logging.warning(f"Overwriting `{field[1:]}` based on JSON configuration.") if field == "_map_attrs": - for attribute, mapping in attr_treatment.items(): + for attribute, mapping in attr_treatments.items(): header[mapping] = d.attrs[attribute] del d.attrs[attribute] elif field == "_remove_attrs": - for ff in attr_treatment: + for ff in attr_treatments: del d.attrs[ff] + elif field.startswith("_") and p in attr_treatments: + header[field[1:]] = attr_treatments[p] else: - if field[1:] in d.attrs: - logging.warning( - f"Overwriting `{field[1:]}` based on JSON configuration." 
- ) - header[field[1:]] = attr_treatment - + header[field[1:]] = attr_treatments del header[field] # Add global attributes From 74e7949773a060362e74ead1c1d67b451d6bf49f Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Tue, 19 Mar 2024 11:23:33 -0400 Subject: [PATCH 30/33] fixes --- miranda/io/_input.py | 2 +- templates/restructure_datasets.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/miranda/io/_input.py b/miranda/io/_input.py index d9b0141b..e91992a5 100644 --- a/miranda/io/_input.py +++ b/miranda/io/_input.py @@ -50,7 +50,7 @@ def discover_data( input_files = sorted(list(input_files.glob(f"*.{suffix}"))) else: input_files = input_files.rglob(f"*.{suffix}") - if input_files.is_file(): + elif input_files.is_file(): logging.warning( "Data discovery yielded a single file. Casting to `list[Path]`." ) diff --git a/templates/restructure_datasets.py b/templates/restructure_datasets.py index d10fa8dc..f0d45ee9 100644 --- a/templates/restructure_datasets.py +++ b/templates/restructure_datasets.py @@ -17,5 +17,5 @@ guess=False, method="copy", make_dirs=True, - filename_pattern="*.zarr", + suffix="zarr", ) From 42b957dc62f8a93a3a87e553895552bd7ab76b65 Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Tue, 19 Mar 2024 12:55:33 -0400 Subject: [PATCH 31/33] fix folder name --- miranda/convert/corrections.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/miranda/convert/corrections.py b/miranda/convert/corrections.py index 7745cc42..321b4cc7 100644 --- a/miranda/convert/corrections.py +++ b/miranda/convert/corrections.py @@ -26,7 +26,7 @@ ) from miranda.treatments.utils import load_json_data_mappings -CONFIG_FOLDER = pathlib.Path(__file__).parent / "data" +CONFIG_FOLDER = pathlib.Path(__file__).parent / "configs" CONFIG_FILES = { "EMDNA": "emdna_cf_attrs.json", "ESPO-G6-E5L": "espo-g6-e5l_attrs.json", From 
288582f95dd0628489d9463fa25bcccf4c239fbf Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Wed, 27 Mar 2024 14:48:44 -0400 Subject: [PATCH 32/33] add support for new `h` freq --- miranda/convert/_data_definitions.py | 1 + 1 file changed, 1 insertion(+) diff --git a/miranda/convert/_data_definitions.py b/miranda/convert/_data_definitions.py index bf69606b..91713a62 100644 --- a/miranda/convert/_data_definitions.py +++ b/miranda/convert/_data_definitions.py @@ -86,6 +86,7 @@ # Manually map xarray frequencies to CMIP6/CMIP5 controlled vocabulary. # see: https://github.com/ES-DOC/pyessv-archive xarray_frequencies_to_cmip6like = { + "h": "hr", "H": "hr", "D": "day", "W": "sem", From 55ee882e3992aeb170ee90ec62e37457f8bea368 Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Wed, 27 Mar 2024 14:48:56 -0400 Subject: [PATCH 33/33] dependencies --- environment-dev.yml | 3 +-- miranda/treatments/_variables.py | 6 +++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/environment-dev.yml b/environment-dev.yml index 3127306d..c425ef3d 100644 --- a/environment-dev.yml +++ b/environment-dev.yml @@ -33,7 +33,7 @@ dependencies: - xesmf - zarr # Dev tools and testing - - pip >=23.1.2 + - pip >=23.3.0 - bump-my-version >=0.18.3 - watchdog >=3.0.0 - flake8 >=6.1.0 @@ -48,7 +48,6 @@ dependencies: - blackdoc ==0.3.9 - isort ==5.13.2 - pre-commit >=3.3.2 - - pip - pip: - coverage >=6.2.0,<7.0.0 - coveralls >=3.3.1 diff --git a/miranda/treatments/_variables.py b/miranda/treatments/_variables.py index 5991e4c7..ff96b343 100644 --- a/miranda/treatments/_variables.py +++ b/miranda/treatments/_variables.py @@ -185,12 +185,12 @@ def clip_values(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: converted = [] for vv in d.data_vars: if vv in m["variables"].keys(): - clip_values = _get_section_entry_key(m, "variables", vv, key, p) + clip_vals = _get_section_entry_key(m, "variables", vv, key, p) if 
clip_values: min_value, max_value = None, None # Gather unit conversion context, if applicable - context = clip_values.get("context", None) - for op, value in clip_values.items(): + context = clip_vals.get("context", None) + for op, value in clip_vals.items(): if op == "min": min_value = xclim.core.units.convert_units_to( value, d[vv], context