From 7a9d15c9e13971d880cf219850c30b56d1ab026c Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Mon, 19 Jun 2023 17:57:48 -0400 Subject: [PATCH 01/33] WIP - major refactoring of ECCC --- miranda/eccc/convert.py | 131 ++++ .../eccc/data/eccc_homogenized_cf_attrs.json | 111 +++ .../eccc/data/eccc_obs_summary_cf_attrs.json | 173 +++++ miranda/eccc/eccc_obs_cf_attrs.json | 686 +++++++++++------- miranda/units.py | 1 + templates/eccc_raw_daily_conversion.py | 34 +- templates/eccc_raw_hourly_conversion.py | 68 +- 7 files changed, 909 insertions(+), 295 deletions(-) create mode 100644 miranda/eccc/convert.py create mode 100644 miranda/eccc/data/eccc_homogenized_cf_attrs.json create mode 100644 miranda/eccc/data/eccc_obs_summary_cf_attrs.json diff --git a/miranda/eccc/convert.py b/miranda/eccc/convert.py new file mode 100644 index 00000000..cc86cfe2 --- /dev/null +++ b/miranda/eccc/convert.py @@ -0,0 +1,131 @@ +"""Specialized conversion tools for Environment and Climate Change Canada / Meteorological Service of Canada data.""" + +from __future__ import annotations + +import json +import logging.config +import multiprocessing as mp +import os +import time +from functools import partial +from pathlib import Path + +from miranda.eccc._raw import _convert_station_file +from miranda.eccc._utils import cf_station_metadata +from miranda.scripting import LOGGING_CONFIG + +logging.config.dictConfig(LOGGING_CONFIG) + + +_data_folder = Path(__file__).parent / "data" + +eccc_observation_variables = dict() +eccc_observation_variables["flat"] = [ + v + for v in json.load(open(_data_folder / "eccc_obs_flat_attrs.json"))[ + "variables" + ].keys() +] +eccc_observation_variables["summary"] = [ + attrs["_cf_variable_name"] + for attrs in json.load(open(_data_folder / "eccc_obs_summary_cf_attrs.json"))[ + "variables" + ].values() +] +eccc_observation_variables["homogenized"] = [ + attrs["_cf_variable_name"] + for attrs in 
json.load(open(_data_folder / "eccc_homogenized_cf_attrs.json"))[ + "variables" + ].values() +] + + +def convert_flat_files( + source_files: str | os.PathLike, + output_folder: str | os.PathLike | list[str | int], + variables: str | int | list[str | int], + mode: str = "hourly", + n_workers: int = 4, +) -> None: + """ + + Parameters + ---------- + source_files: str or Path + output_folder: str or Path + variables: str or List[str] + mode: {"hourly", "daily"} + n_workers: int + + Returns + ------- + None + """ + func_time = time.time() + + if mode.lower() in ["h", "hour", "hourly"]: + num_observations = 24 + column_names = ["code", "year", "month", "day", "code_var"] + column_dtypes = [str, float, float, float, str] + elif mode.lower() in ["d", "day", "daily"]: + num_observations = 31 + column_names = ["code", "year", "month", "code_var"] + column_dtypes = [str, float, float, str] + else: + raise NotImplementedError("`mode` must be 'h'/'hourly or 'd'/'daily'.") + + # Preparing the data column headers + for i in range(1, num_observations + 1): + data_entry, flag_entry = f"D{i:0n}", f"F{i:0n}" + column_names.append(data_entry) + column_names.append(flag_entry) + column_dtypes.extend([str, str]) + + if isinstance(variables, (str, int)): + variables = [variables] + + for variable_code in variables: + variable_code = str(variable_code).zfill(3) + metadata = cf_station_metadata(variable_code) + nc_name = metadata["nc_name"] + + rep_nc = Path(output_folder).joinpath(nc_name) + rep_nc.mkdir(parents=True, exist_ok=True) + + # Loop on the files + logging.info( + f"Collecting files for variable '{metadata['standard_name']}' " + f"(filenames containing '{metadata['_table_name']}')." 
+ ) + list_files = list() + if isinstance(source_files, list) or Path(source_files).is_file(): + list_files.append(source_files) + else: + glob_patterns = [g for g in metadata["_table_name"]] + for pattern in glob_patterns: + list_files.extend( + [f for f in Path(source_files).rglob(f"{pattern}*") if f.is_file()] + ) + manager = mp.Manager() + errored_files = manager.list() + converter_func = partial( + _convert_station_file, + output_path=rep_nc, + errored_files=errored_files, + mode=mode, + variable_code=variable_code, + column_names=column_names, + column_dtypes=column_dtypes, + **metadata, + ) + with mp.Pool(processes=n_workers) as pool: + pool.map(converter_func, list_files) + pool.close() + pool.join() + + if errored_files: + logging.warning( + "Some files failed to be properly parsed:\n", ", ".join(errored_files) + ) + + logging.warning(f"Process completed in {time.time() - func_time:.2f} seconds") diff --git a/miranda/eccc/data/eccc_homogenized_cf_attrs.json b/miranda/eccc/data/eccc_homogenized_cf_attrs.json new file mode 100644 index 00000000..92c3b0f1 --- /dev/null +++ b/miranda/eccc/data/eccc_homogenized_cf_attrs.json @@ -0,0 +1,111 @@ +{ + "Header": { + "Conventions": "CF-1.8", + "_product": { + "gen2": "ECCC Adjusted and Homogenized Canadian Climate Data (AHCCD) version 2", + "gen3": "ECCC Adjusted and Homogenized Canadian Climate Data (AHCCD) version 3" + }, + "citation": { + "gen2": "Mekis, É and L.A. Vincent, 2011: An overview of the second generation adjusted daily precipitation dataset for trend analysis in Canada. Atmosphere-Ocean 49(2), 163-177 doi:10.1080/07055900.2011.583910", + "gen3": "Vincent, L.A., M.M. Hartwell and X.L. Wang, 2020: A Third Generation of Homogenized Temperature for Trend Analysis and Monitoring Changes in Canada’s Climate. Atmosphere-Ocean. 
https://doi.org/10.1080/07055900.2020.1765728" + }, + "contact": "info.cccs-ccsc@canada.ca", + "documentation": "https://www.canada.ca/en/environment-climate-change/services/climate-change/canadian-centre-climate-services/display-download/technical-documentation-adjusted-climate-data.html", + "float_missing_value": "1e20", + "frequency": "day", + "institution": "GovCan", + "int_missing_value": "-999", + "license": "https://climate.weather.gc.ca/prods_servs/attachment1_e.html", + "license_type": "permissive", + "organization": "ECCC", + "realm": "atmos", + "table_date": "2023-03-23", + "table_id": "ECCC" + }, + "variable_entry": { + "dm": { + "add_offset": 273.15, + "cell_methods": "time: mean", + "comments": "Station data converted from Mean Temp (°C)", + "frequency": "day", + "grid_mapping": "regular_lon_lat", + "long_name": "Near-Surface Air Temperature", + "original_field": "Mean Temp (°C)", + "out_name": "tas", + "scale_factor": 1, + "standard_name": "air_temperature", + "type": "real", + "units": "K" + }, + "dn": { + "add_offset": 273.15, + "cell_methods": "time: minimum", + "comments": "Station data converted from Min Temp (°C)", + "frequency": "day", + "grid_mapping": "regular_lon_lat", + "long_name": "Daily Minimum Near-Surface Air Temperature", + "original_field": "Min Temp (°C)", + "out_name": "tasmin", + "scale_factor": 1, + "standard_name": "air_temperature", + "type": "real", + "units": "K" + }, + "dr": { + "add_offset": 0, + "cell_methods": "time: mean", + "comments": "Station data converted from Total Rain (mm) using a density of 1000 kg/m³", + "frequency": "day", + "grid_mapping": "regular_lon_lat", + "long_name": "Liquid Precipitation", + "original_field": "Total Rain (mm)", + "out_name": "prlp", + "scale_factor": 1.1574074074074073e-05, + "standard_name": "rainfall_flux", + "type": "real", + "units": "kg m-2 s-1" + }, + "ds": { + "add_offset": 0, + "cell_methods": "time: mean", + "comments": "station data converted from Total Snow (cm) using a 
density of 100 kg/m³", + "frequency": "day", + "grid_mapping": "regular_lon_lat", + "long_name": "Snowfall Flux", + "original_field": "Total Snow (cm)", + "out_name": "prsn", + "scale_factor": 1.1574074074074073e-05, + "standard_name": "snowfall_flux", + "type": "real", + "units": "kg m-2 s-1" + }, + "dt": { + "add_offset": 0, + "cell_methods": "time: mean", + "comments": "Station data converted from Total Precip (mm) using a density of 1000 kg/m³", + "frequency": "day", + "grid_mapping": "regular_lon_lat", + "long_name": "Precipitation", + "original_field": "Total Precip (mm)", + "out_name": "pr", + "scale_factor": 1.1574074074074073e-05, + "standard_name": "precipitation_flux", + "type": "real", + "units": "kg m-2 s-1" + }, + "dx": { + "add_offset": 273.15, + "cell_methods": "time: maximum", + "comments": "station data converted from Max Temp (°C)", + "frequency": "day", + "grid_mapping": "regular_lon_lat", + "long_name": "Daily Maximum Near-Surface Air Temperature", + "original_field": "Max Temp (°C)", + "out_name": "tasmax", + "scale_factor": 1, + "standard_name": "air_temperature", + "type": "real", + "units": "K" + } + } +} diff --git a/miranda/eccc/data/eccc_obs_summary_cf_attrs.json b/miranda/eccc/data/eccc_obs_summary_cf_attrs.json new file mode 100644 index 00000000..b21f224e --- /dev/null +++ b/miranda/eccc/data/eccc_obs_summary_cf_attrs.json @@ -0,0 +1,173 @@ +{ + "Header": { + "Conventions": "CF-1.8", + "contact": "info.cccs-ccsc@canada.ca", + "institution": "GovCan", + "int_missing_value": "-999", + "license": "https://climate.weather.gc.ca/prods_servs/attachment1_e.html", + "license_type": "permissive", + "missing_value": "1e20", + "organization": "ECCC", + "processing_level": "raw", + "realm": "atmos", + "source": "msc", + "table_date": "2023-03-23", + "type": "station-obs" + }, + "variable_entry": { + "cdd": { + "add_offset": 0, + "cell_methods": "time: sum", + "comments": "Station data converted from Cool Deg Days (°C)", + "frequency": "day", + 
"grid_mapping": "regular_lon_lat", + "long_name": "Number of Degrees Celsius Over a Mean Temperature of 18 °C", + "original_variable": "Cool Deg Days (°C)", + "out_name": "cdd", + "scale_factor": 1, + "standard_name": "cooling_degree_days", + "type": "real", + "units": "C" + }, + "hdd": { + "add_offset": 0, + "cell_methods": "time: sum", + "comments": "Station data converted from Heat Deg Days (°C)", + "frequency": "day", + "grid_mapping": "regular_lon_lat", + "long_name": "Number of Degrees Celsius Under a Mean Temperature of 18 °C", + "original_variable": "Heat Deg Days (°C)", + "out_name": "hdd", + "scale_factor": 1, + "standard_name": "heating_degree_days", + "type": "real", + "units": "C" + }, + "pr": { + "add_offset": 0, + "cell_methods": "time: mean", + "comments": "Station data converted from Total Precip (mm) using a density of 1000 kg/m³", + "frequency": "day", + "grid_mapping": "regular_lon_lat", + "long_name": "Precipitation", + "original_variable": "Total Precip (mm)", + "out_name": "pr", + "scale_factor": 1.1574074074074073e-05, + "standard_name": "precipitation_flux", + "type": "real", + "units": "kg m-2 s-1" + }, + "prlp": { + "add_offset": 0, + "cell_methods": "time: mean", + "comments": "Station data converted from Total Rain (mm) using a density of 1000 kg/m³", + "frequency": "day", + "grid_mapping": "regular_lon_lat", + "long_name": "Liquid Precipitation", + "original_variable": "Total Rain (mm)", + "out_name": "prlp", + "scale_factor": 1.1574074074074073e-05, + "standard_name": "rainfall_flux", + "type": "real", + "units": "kg m-2 s-1" + }, + "prsn": { + "add_offset": 0, + "cell_methods": "time: mean", + "comments": "station data converted from Total Snow (cm) using a density of 100 kg/m³", + "frequency": "day", + "grid_mapping": "regular_lon_lat", + "long_name": "Snowfall Flux", + "original_variable": "Total Snow (cm)", + "out_name": "prsn", + "scale_factor": 1.1574074074074073e-05, + "standard_name": "snowfall_flux", + "type": "real", + 
"units": "kg m-2 s-1" + }, + "sfcWindAz": { + "add_offset": 0, + "cell_methods": "time: mean", + "comments": "Station data converted from Dir of Max Gust (10s deg)", + "frequency": "day", + "grid_mapping": "regular_lon_lat", + "long_name": "Direction from which the Daily Maximum Near-Surface Gust Wind Speed maximum Blows", + "original_variable": "Dir of Max Gust (10s deg)", + "out_name": "sfcWindAz", + "scale_factor": 1, + "standard_name": "wind_direction", + "type": "real", + "units": "degree" + }, + "sfcWindMax": { + "add_offset": 0, + "cell_methods": "time: max", + "comments": "Station data converted from Spd of Max Gust (km/h)", + "frequency": "day", + "grid_mapping": "regular_lon_lat", + "long_name": "Daily Maximum Near-Surface Gust Wind Speed maximum", + "original_variable": "Spd of Max Gust (km/h)", + "out_name": "sfcWindMax", + "scale_factor": 0.2777777777777778, + "standard_name": "wind_speed_of_gust maximum", + "type": "real", + "units": "m s-1" + }, + "snd": { + "add_offset": 0, + "cell_methods": "time: mean", + "comments": "Station data converted from Snow on Grnd (cm)", + "frequency": "day", + "grid_mapping": "regular_lon_lat", + "long_name": "Snow Depth", + "original_variable": "Snow on Grnd (cm)", + "out_name": "snd", + "scale_factor": 0.01, + "standard_name": "surface_snow_thickness", + "type": "real", + "units": "m" + }, + "tas": { + "add_offset": 273.15, + "cell_methods": "time: mean", + "comments": "Station data converted from Mean Temp (°C)", + "frequency": "day", + "grid_mapping": "regular_lon_lat", + "long_name": "Near-Surface Air Temperature", + "original_variable": "Mean Temp (°C)", + "out_name": "tas", + "scale_factor": 1, + "standard_name": "air_temperature", + "type": "real", + "units": "K" + }, + "tasmax": { + "add_offset": 273.15, + "cell_methods": "time: maximum", + "comments": "station data converted from Max Temp (°C)", + "frequency": "day", + "grid_mapping": "regular_lon_lat", + "long_name": "Daily Maximum Near-Surface Air 
Temperature", + "original_variable": "Max Temp (°C)", + "out_name": "tasmax", + "scale_factor": 1, + "standard_name": "air_temperature", + "type": "real", + "units": "K" + }, + "tasmin": { + "add_offset": 273.15, + "cell_methods": "time: minimum", + "comments": "Station data converted from Min Temp (°C)", + "frequency": "day", + "grid_mapping": "regular_lon_lat", + "long_name": "Daily Minimum Near-Surface Air Temperature", + "original_variable": "Min Temp (°C)", + "out_name": "tasmin", + "scale_factor": 1, + "standard_name": "air_temperature", + "type": "real", + "units": "K" + } + } +} diff --git a/miranda/eccc/eccc_obs_cf_attrs.json b/miranda/eccc/eccc_obs_cf_attrs.json index 7c882e31..135756c9 100644 --- a/miranda/eccc/eccc_obs_cf_attrs.json +++ b/miranda/eccc/eccc_obs_cf_attrs.json @@ -1,6 +1,8 @@ { "Header": { - "Conventions": "CF-1.8", + "Conventions": "CF-1.9", + "_frequency": true, + "_miranda_version": true, "contact": "climatcentre-climatecentral@ec.gc.ca", "institution": "GovCan", "int_missing_value": "-999", @@ -11,985 +13,1179 @@ "organization": "ECCC", "processing_level": "raw", "realm": "atmos", - "source": "msc", + "source": "MSC", "table_date": "2023-03-23", "type": "station-obs" }, - "variable_entry": { + "dimensions": { + "latitude": { + "_cf_dimension_name": "lat", + "_precision": 4, + "axis": "Y", + "standard_name": "latitude" + }, + "longitude": { + "_cf_dimension_name": "lon", + "_precision": 4, + "axis": "X", + "standard_name": "longitude" + }, + "time": { + "_ensure_correct_time": { + "obs-daily": "1D", + "obs-hourly": "1H" + }, + "_strict_time": false, + "axis": "T", + "long_name": "time", + "standard_name": "time" + } + }, + "variables": { "001": { + "_cf_variable_name": "tasmax", + "_corrected_units": "degC", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "DLY02", "DLY04", "DLY44" ], - "add_offset": 0, - "nc_name": "tasmax", + "_transformation": "op * 0.1 degC", "original_units": "0.1 °C", "original_variable": "Daily 
Maximum Temperature", - "raw_units": "degC", "scale_factor": 0.1, "standard_name": "air_temperature_maximum", "units": "K" }, "002": { + "_cf_variable_name": "tasmin", + "_corrected_units": "degC", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "DLY02", "DLY04", "DLY44" ], - "add_offset": 0, - "nc_name": "tasmin", + "_transformation": "op * 0.1 degC", "original_units": "0.1 °C", "original_variable": "Daily Minimum Temperature", - "raw_units": "degC", "scale_factor": 0.1, "standard_name": "air_temperature_minimum", "units": "K" }, "003": { + "_cf_variable_name": "tas", + "_corrected_units": "degC", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "DLY02", "DLY04", "DLY44" ], - "add_offset": 0, - "nc_name": "tas", + "_transformation": "op * 0.1 degC", "original_units": "0.1 °C", "original_variable": "Daily Mean Temperature", - "raw_units": "degC", "scale_factor": 0.1, "standard_name": "air_temperature", "units": "K" }, "010": { + "_cf_variable_name": "prlptot", + "_corrected_units": "mm", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "DLY02", "DLY04", "DLY44" ], - "add_offset": 0, - "nc_name": "prlptot", + "_transformation": "op * 0.1 mm day-1", "original_units": "0.1 mm day-1", "original_variable": "Daily Total Rainfall", - "raw_units": "mm", "scale_factor": 0.1, "standard_name": "liquid_precipitation_amount", "units": "m" }, "011": { + "_cf_variable_name": "prsntot", + "_corrected_units": "cm", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "DLY02", "DLY04", "DLY44" ], - "add_offset": 0, - "nc_name": "prsntot", + "_transformation": "op * 0.1 cm day-1", "original_units": "0.1 cm day-1", "original_variable": "Daily Total Snowfall", - "raw_units": "cm", "scale_factor": 0.1, "standard_name": "solid_precipitation_amount", "units": "m" }, "012": { + "_cf_variable_name": "prcptot", + "_corrected_units": "mm", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "DLY02", "DLY04", 
"DLY44" ], - "add_offset": 0, - "nc_name": "prcptot", + "_transformation": "op * 0.1 mm day-1", "original_units": "0.1 mm day-1", "original_variable": "Daily Total Precipitation", - "raw_units": "mm", "scale_factor": 0.1, "standard_name": "precipitation_amount", "units": "m" }, "013": { + "_cf_variable_name": "sndtot", + "_corrected_units": "cm", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "DLY02", "DLY04", "DLY44" ], - "add_offset": 0, - "nc_name": "sndtot", + "_transformation": false, "original_units": "cm", "original_variable": "Snow on the Ground", - "raw_units": "cm", "scale_factor": 1, "standard_name": "surface_snow_thickness", "units": "m" }, "014": { + "_cf_variable_name": "thunder", + "_corrected_units": "1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "DLY02", "DLY04", "DLY44" ], - "add_offset": 0, - "nc_name": "thunder", + "_transformation": false, "original_variable": "Thunderstorms", - "raw_units": "1", "scale_factor": 1, "standard_name": "thunderstorm_presence", "units": "1" }, "015": { + "_cf_variable_name": "freezing_rain_drizzle", + "_corrected_units": "1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "DLY02", "DLY04", "DLY44" ], - "add_offset": 0, - "nc_name": "freezing_rain_drizzle", + "_transformation": false, "original_variable": "Freezing rain or drizzle", - "raw_units": "1", "scale_factor": 1, "standard_name": "freeze_rain_drizzle_presence", "units": "1" }, "016": { + "_cf_variable_name": "hail", + "_corrected_units": "1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "DLY02", "DLY04", "DLY44" ], - "add_offset": 0, - "nc_name": "hail", + "_transformation": false, "original_variable": "Hail", - "raw_units": "1", "scale_factor": 1, "standard_name": "hail_presence", "units": "1" }, "017": { + "_cf_variable_name": "fog_ice_fog", + "_corrected_units": "1", "_table_name": [ "DLY02", "DLY04", "DLY44" ], - "add_offset": 0, - "nc_name": "fog_ice_fog", 
"original_variable": "Fog or Ice Fog", - "raw_units": "1", "scale_factor": 1, "standard_name": "fog_ice_fog_presence", "units": "1" }, "018": { + "_cf_variable_name": "smoke_haze", + "_corrected_units": "1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "DLY02", "DLY04" ], - "add_offset": 0, - "nc_name": "smoke_haze", + "_transformation": false, "original_variable": "Smoke or Haze", - "raw_units": "1", "scale_factor": 1, "standard_name": "smoke_haze_presence", "units": "1" }, "019": { + "_cf_variable_name": "blowing_dust_sand", + "_corrected_units": "1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "DLY02", "DLY04" ], - "add_offset": 0, - "nc_name": "blowing_dust_sand", + "_transformation": false, "original_variable": "Blowing Dust or Sand", - "raw_units": "1", "scale_factor": 1, "standard_name": "blowing_dust_sand_presence", "units": "1" }, "020": { + "_cf_variable_name": "blow_snow", + "_corrected_units": "1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "DLY02", "DLY04" ], - "add_offset": 0, - "nc_name": "blow_snow", + "_transformation": false, "original_variable": "Blowing snow", - "raw_units": "1", "scale_factor": 1, "standard_name": "blowing_snow_presence", "units": "1" }, "021": { + "_cf_variable_name": "wind_gt_28kt", + "_corrected_units": "1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "DLY02", "DLY04" ], - "add_offset": 0, - "nc_name": "wind_gt_28kt", + "_transformation": false, "original_variable": "Wind speed >= 28 Knots", - "raw_units": "1", "scale_factor": 1, "standard_name": "wind_exceeding_28_knots", "units": "1" }, "022": { + "_cf_variable_name": "wind_gt_34kt", + "_corrected_units": "1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "DLY02", "DLY04" ], - "add_offset": 0, - "nc_name": "wind_gt_34kt", + "_transformation": false, "original_variable": "Wind speed >= 34 Knots", - "raw_units": "1", "scale_factor": 1, "standard_name": 
"wind_exceeding_34_knots", "units": "1" }, "023": { + "_cf_variable_name": "gust_dir_16pts", + "_corrected_units": "deg", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "DLY02", "DLY04" ], - "add_offset": 0, - "nc_name": "gust_dir_16pts", + "_transformation": "op * 10 deg", "original_units": "10's of degrees", "original_variable": "Direction of extreme gust (16 pts) to December 1976", - "raw_units": "deg", - "scale_factor": 10, "standard_name": "gust_to_direction", "units": "deg" }, "024": { + "_cf_variable_name": "gust_speed", + "_corrected_units": "km h-1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "DLY02", "DLY04" ], - "add_offset": 0, - "nc_name": "gust_speed", + "_transformation": false, "original_units": "km/h", "original_variable": "Speed of extreme gust", - "raw_units": "km h-1", "scale_factor": 1, "standard_name": "wind_speed_of_gust", "units": "m s-1" }, "025": { + "_cf_variable_name": "gust_hour", + "_corrected_units": "h", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "DLY02", "DLY04" ], - "add_offset": 0, - "nc_name": "gust_hour", + "_transformation": false, "original_variable": "UTC hour of extreme gust", - "raw_units": "h", "scale_factor": 1, "standard_name": "hour_of_extreme_gust", "units": "h" }, "061": { + "_cf_variable_name": "rf1_radiation", + "_corrected_units": "MJ m-2", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY11" ], - "add_offset": 0, - "nc_name": "rf1_radiation", + "_transformation": "op / 1000 MJ m-2", "original_units": "0.001 MJ/m", "original_variable": "RF1 global solar radiation", - "raw_units": "W m-2 h-1", - "scale_factor": 277.77777777777777, "standard_name": "solar_radiation_flux", - "units": "W m-2 h-1" + "units": "W h m-2" }, "062": { + "_cf_variable_name": "rf2_radiation", + "_corrected_units": "MJ m-2", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY11" ], - "add_offset": 0, - "nc_name": "rf2_radiation", + 
"_transformation": "op / 1000 MJ m-2", "original_units": "0.001 MJ/m", "original_variable": "RF2 sky (diffuse) radiation", - "raw_units": "W m-2 h-1", - "scale_factor": 277.77777777777777, "standard_name": "solar_radiation_flux", - "units": "W m-2 h-1" + "units": "W h m-2" }, "063": { + "_cf_variable_name": "rf3_radiation", + "_corrected_units": "MJ m-2", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY11" ], - "add_offset": 0, - "nc_name": "rf3_radiation", + "_transformation": "op / 1000 MJ m-2", "original_units": "0.001 MJ/m", "original_variable": "RF3 reflected solar radiation", - "raw_units": "W m-2 h-1", - "scale_factor": 277.77777777777777, "standard_name": "solar_radiation_flux", - "units": "W m-2 h-1" + "units": "W h m-2" }, "064": { + "_cf_variable_name": "rf4_radiation", + "_corrected_units": "MJ m-2", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY11" ], - "add_offset": 0, - "nc_name": "rf4_radiation", + "_transformation": "op / 1000 MJ m-2", "original_units": "0.001 MJ/m", "original_variable": "RF4 net all wave radiation", - "raw_units": "W m-2 h-1", - "scale_factor": 277.77777777777777, "standard_name": "solar_radiation_flux", - "units": "W m-2 h-1" + "units": "W h m-2" }, "067": { + "_cf_variable_name": "rf7_radiation", + "_corrected_units": "lux h", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY11" ], - "add_offset": 0, - "nc_name": "rf7_radiation", + "_transformation": false, "original_units": "0.01 Kilolux_hrs", "original_variable": "RF7 daylight illumination", - "raw_units": "lux h", "scale_factor": 10, "standard_name": "solar_radiation_flux", "units": "lux h" }, "068": { + "_cf_variable_name": "rf8_radiation", + "_corrected_units": "MJ m-2", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY11" ], - "add_offset": 0, - "nc_name": "rf8_radiation", + "_transformation": "op / 1000 MJ m-2", "original_units": "0.001 MJ/m", "original_variable": "RF8 direct solar 
radiation", - "raw_units": "W m-2 h-1", - "scale_factor": 277.77777777777777, "standard_name": "solar_radiation_flux", - "units": "W m-2 h-1" + "units": "W h m-2" }, "069": { + "_cf_variable_name": "wind_dir_45B", + "_corrected_units": "deg", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY15" ], - "add_offset": 0, - "nc_name": "wind_dir_45B", + "_transformation": false, "original_units": "10's of degrees", "original_variable": "Direction - 45B anemometer (8 pts)", - "raw_units": "deg", "scale_factor": 1, "standard_name": "wind_to_direction", "units": "deg" }, "071": { + "_cf_variable_name": "ceiling_hgt", + "_corrected_units": "m", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "ceiling_hgt", + "_transformation": false, "original_units": "30's of meters", "original_variable": "Ceiling height of lowest layer of clouds", - "raw_units": "m", "scale_factor": 30, "standard_name": "ceiling_cloud_height", "units": "m" }, "072": { + "_cf_variable_name": "visibility", + "_corrected_units": "km", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "visibility", + "_transformation": false, "original_units": "0.1 km", "original_variable": "Visibility", - "raw_units": "km", "scale_factor": 0.1, "standard_name": "visibility_in_air", "units": "m" }, "073": { + "_cf_variable_name": "psl", + "_corrected_units": "Pa", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "psl", + "_transformation": false, "original_units": "0.01 kPa", "original_variable": "Sea Level Pressure", - "raw_units": "Pa", "scale_factor": 10, "standard_name": "air_pressure_at_mean_sea_level", "units": "Pa" }, "074": { + "_cf_variable_name": "tds", + "_corrected_units": "degC", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "tds", + 
"_transformation": false, "original_units": "0.1 °C", "original_variable": "Dew Point Temperature", - "raw_units": "degC", "scale_factor": 0.1, "standard_name": "dew_point_temperature", "units": "K" }, "075": { + "_cf_variable_name": "wind_dir_u2a_16", + "_corrected_units": "deg", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "wind_dir_u2a_16", + "_transformation": false, "original_units": "10's of degrees", "original_variable": "Wind Direction at 2 m (U2A Anemometer) (16 pts)", - "raw_units": "deg", "scale_factor": 10, "standard_name": "wind_direction_u2a", "units": "deg" }, "076": { + "_cf_variable_name": "wind_speed_u2a", + "_corrected_units": "km h-1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "wind_speed_u2a", + "_transformation": false, "original_units": "km/h", "original_variable": "Wind Speed - U2A (16 pts) to December 1970", - "raw_units": "km h-1", "scale_factor": 1, "standard_name": "wind_speed_u2a", "units": "m s-1" }, "077": { + "_cf_variable_name": "pressure", + "_corrected_units": "Pa", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "pressure", + "_transformation": false, "original_units": "0.01 kPa", "original_variable": "Station Pressure", - "raw_units": "Pa", "scale_factor": 10, "standard_name": "atmospheric_pressure", "units": "Pa" }, "078": { + "_cf_variable_name": "tas_dry", + "_corrected_units": "degC", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "tas_dry", + "_transformation": false, "original_units": "0.1 °C", "original_variable": "Dry Bulb Temperature", - "raw_units": "degC", "scale_factor": 0.1, "standard_name": "dry_bulb_temperature", "units": "K" }, "079": { + "_cf_variable_name": "tas_wet", + "_corrected_units": "degC", + "_invert_sign": false, + "_offset_time": false, 
"_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "tas_wet", + "_transformation": false, "original_units": "0.1 °C", "original_variable": "Wet Bulb temperature", - "raw_units": "degC", "scale_factor": 0.1, "standard_name": "wet_bulb_temperature", "units": "K" }, "080": { + "_cf_variable_name": "hur", + "_corrected_units": "1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "hur", + "_transformation": false, "original_units": "%", "original_variable": "Relative Humidity", - "raw_units": "1", "scale_factor": 1, "standard_name": "relative_humidity", "units": "1" }, "081": { + "_cf_variable_name": "clo", + "_corrected_units": "1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "clo", + "_transformation": false, "original_units": "%", "original_variable": "Total Cloud Opacity", - "raw_units": "1", "scale_factor": 10, "standard_name": "cloud_albedo", "units": "1" }, "082": { + "_cf_variable_name": "clt", + "_corrected_units": "1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "clt", + "_transformation": false, "original_units": "%", "original_variable": "Total Cloud Amount", - "raw_units": "1", "scale_factor": 10, "standard_name": "cloud_area_fraction", "units": "1" }, "089": { + "_cf_variable_name": "freeze_rain", + "_corrected_units": "1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "freeze_rain", + "_transformation": false, "original_variable": "Freezing Rain", - "raw_units": "1", "scale_factor": 1, "standard_name": "freezing_rain", "units": "1" }, "094": { + "_cf_variable_name": "ice_pellets", + "_corrected_units": "1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "ice_pellets", + "_transformation": false, "original_variable": "Ice Pellets", 
- "raw_units": "1", "scale_factor": 1, "standard_name": "ice_pellet_presence", "units": "1" }, "107": { + "_cf_variable_name": "1low_cloud_opac", + "_corrected_units": "1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "1low_cloud_opac", + "_transformation": false, "original_units": "Tenths", "original_variable": "Lowest cloud layer opacity", - "raw_units": "1", "scale_factor": 10, "standard_name": "low_type_cloud_opacity_fraction", "units": "1" }, "108": { + "_cf_variable_name": "1low_cloud_frac", + "_corrected_units": "1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "1low_cloud_frac", + "_transformation": false, "original_units": "Tenths", "original_variable": "Lowest cloud layer amount or condition", - "raw_units": "1", "scale_factor": 10, "standard_name": "low_type_cloud_area_fraction", "units": "1" }, "109": { + "_cf_variable_name": "1low_cloud_type", + "_corrected_units": "1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "1low_cloud_type", + "_transformation": false, "original_variable": "Lowest cloud layer type", - "raw_units": "1", "scale_factor": 1, "standard_name": "low_type_cloud_type", "units": "1" }, "110": { + "_cf_variable_name": "1low_cloud_hgt", + "_corrected_units": "m", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "1low_cloud_hgt", + "_transformation": false, "original_units": "30's of meters", "original_variable": "Lowest cloud layer height", - "raw_units": "m", "scale_factor": 30, "standard_name": "low_type_cloud_height", "units": "m" }, "111": { + "_cf_variable_name": "2low_cloud_opac", + "_corrected_units": "1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "2low_cloud_opac", + "_transformation": false, "original_units": 
"Tenths", "original_variable": "Second lowest cloud layer opacity", - "raw_units": "1", "scale_factor": 10, "standard_name": "low_type_cloud_opacity_fraction", "units": "1" }, "112": { + "_cf_variable_name": "2low_cloud_frac", + "_corrected_units": "1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "2low_cloud_frac", + "_transformation": false, "original_units": "Tenths", "original_variable": "Second lowest cloud layer amount or condition", - "raw_units": "1", "scale_factor": 10, "standard_name": "low_type_cloud_area_fraction", "units": "1" }, "113": { + "_cf_variable_name": "2low_cloud_type", + "_corrected_units": "1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "2low_cloud_type", + "_transformation": false, "original_units": "", "original_variable": "Second lowest cloud layer type", - "raw_units": "1", "scale_factor": 1, "standard_name": "low_type_cloud_type", "units": "1" }, "114": { + "_cf_variable_name": "2low_cloud_hgt", + "_corrected_units": "m", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "2low_cloud_hgt", + "_transformation": false, "original_units": "30's of meters", "original_variable": "Second lowest cloud layer height", - "raw_units": "m", "scale_factor": 30, "standard_name": "low_type_cloud_height", "units": "m" }, "115": { + "_cf_variable_name": "3low_cloud_opac", + "_corrected_units": "1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "3low_cloud_opac", + "_transformation": false, "original_units": "Tenths", "original_variable": "Thirsd lowest cloud layer opacity", - "raw_units": "1", "scale_factor": 10, "standard_name": "low_type_cloud_opacity_fraction", "units": "1" }, "116": { + "_cf_variable_name": "3low_cloud_frac", + "_corrected_units": "1", + "_invert_sign": false, + "_offset_time": 
false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "3low_cloud_frac", + "_transformation": false, "original_units": "Tenths", "original_variable": "Third lowest cloud layer amount or condition", - "raw_units": "1", "scale_factor": 10, "standard_name": "low_type_cloud_area_fraction", "units": "1" }, "117": { + "_cf_variable_name": "3low_cloud_type", + "_corrected_units": "1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "3low_cloud_type", + "_transformation": false, "original_units": "", "original_variable": "Third lowest cloud layer type", - "raw_units": "1", "scale_factor": 1, "standard_name": "low_type_cloud_type", "units": "1" }, "118": { + "_cf_variable_name": "3low_cloud_hgt", + "_corrected_units": "m", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "3low_cloud_hgt", + "_transformation": false, "original_units": "30's of meters", "original_variable": "Third lowest cloud layer height", - "raw_units": "m", "scale_factor": 30, "standard_name": "low_type_cloud_height", "units": "m" }, "123": { + "_cf_variable_name": "rainfall", + "_corrected_units": "mm h-1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "add_offset": 0, - "nc_name": "rainfall", + "_transformation": false, "original_units": "0.1 mm", "original_variable": "Total Rainfall", - "raw_units": "mm h-1", "scale_factor": 0.1, "standard_name": "rainfall_flux", "units": "kg m2 s-1" }, "133": { + "_cf_variable_name": "sun", + "_corrected_units": "h", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY10" ], - "add_offset": 0, - "nc_name": "sun", + "_transformation": false, "original_units": "0.1 hrs", "original_variable": "Sunshine", - "raw_units": "h", "scale_factor": 0.1, "standard_name": "duration_of_sunshine", "units": "s" }, "156": { + "_cf_variable_name": "wind_dir_u2a_36", + "_corrected_units": "deg", + 
"_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01" ], - "nc_name": "wind_dir_u2a_36", + "_transformation": false, "original_units": "10's of degrees", "original_variable": "Wind Direction - U2A (36 pts) from January 1971", - "raw_units": "deg", + "scale_factor": 10, + "standard_name": "wind_direction_u2a", + "units": "deg" + }, + "209": { + "_cf_variable_name": "wind_character", + "_corrected_units": "", + "_invert_sign": false, + "_offset_time": false, + "_table_name": [ + "HLY01" + ], + "_transformation": false, + "long_name": "wind_direction_u2a", + "original_units": "1, 2", + "original_variable": "Wind character at 10 m", + "scale_factor": 1, + "units": "" + }, + "210": { + "_cf_variable_name": "wind_dir_u2a_36", + "_corrected_units": "deg", + "_invert_sign": false, + "_offset_time": false, + "_table_name": [ + "HLY01" + ], + "_transformation": false, + "original_units": "km/h", + "original_variable": "Wind Direction - U2A (36 pts) from January 1971", "scale_factor": 10, "standard_name": "wind_direction_u2a", "units": "deg" }, "262": { + "_cf_variable_name": "prtot", + "_corrected_units": "mm", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01_RCS" ], - "add_offset": 0, - "nc_name": "prtot", + "_transformation": false, "original_units": "0.1 mm", "original_variable": "Total Precipitation (minutes 00-60)", - "raw_units": "mm", "scale_factor": 0.1, "standard_name": "precipitation_amount", "units": "kg m-2" }, "263": { + "_cf_variable_name": "prtot_q1", + "_corrected_units": "mm", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01_RCS" ], - "add_offset": 0, - "nc_name": "prtot_q1", + "_transformation": false, "original_units": "0.1 mm", "original_variable": "Total Precipitation (minutes 00-15)", - "raw_units": "mm", "scale_factor": 0.1, "standard_name": "precipitation_amount", "units": "kg m-2" }, "264": { + "_cf_variable_name": "prtot_q2", + "_corrected_units": "mm", + "_invert_sign": false, + 
"_offset_time": false, "_table_name": [ "HLY01_RCS" ], - "add_offset": 0, - "nc_name": "prtot_q2", + "_transformation": false, "original_units": "0.1 mm", "original_variable": "Total Precipitation (minutes 15-30)", - "raw_units": "mm", "scale_factor": 0.1, "standard_name": "precipitation_amount", "units": "kg m-2" }, "265": { + "_cf_variable_name": "prtot_q3", + "_corrected_units": "mm", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01_RCS" ], - "add_offset": 0, - "nc_name": "prtot_q3", + "_transformation": false, "original_units": "0.1 mm", "original_variable": "Total Precipitation (minutes 30-45)", - "raw_units": "mm", "scale_factor": 0.1, "standard_name": "precipitation_amount", "units": "kg m-2" }, "266": { + "_cf_variable_name": "prtot_q4", + "_corrected_units": "mm", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01_RCS" ], - "add_offset": 0, - "nc_name": "prtot_q4", + "_transformation": false, "original_units": "0.1 mm", "original_variable": "Total Precipitation (minutes 45-60)", - "raw_units": "mm", "scale_factor": 0.1, "standard_name": "precipitation_amount", "units": "kg m-2" }, "267": { + "_cf_variable_name": "precipitation_weight_q1", + "_corrected_units": "kg m-2", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01_RCS" ], - "add_offset": 0, - "nc_name": "precipitation_weight_q1", + "_transformation": false, "original_units": "0.1 kg/m²", "original_variable": "Precipitation Gauge Weight per Unit Area (at minute 15)", - "raw_units": "kg m-2", "scale_factor": 0.1, "standard_name": "precipitation_amount", "units": "kg m-2" }, "268": { + "_cf_variable_name": "precipitation_weight_q2", + "_corrected_units": "kg m-2", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01_RCS" ], - "add_offset": 0, - "nc_name": "precipitation_weight_q2", + "_transformation": false, "original_units": "0.1 kg/m²", "original_variable": "Precipitation Gauge Weight per Unit Area (at minute 
30)", - "raw_units": "kg m-2", "scale_factor": 0.1, "standard_name": "precipitation_amount", "units": "kg m-2" }, "269": { + "_cf_variable_name": "precipitation_weight_q3", + "_corrected_units": "kg m-2", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01_RCS" ], - "add_offset": 0, - "nc_name": "precipitation_weight_q3", + "_transformation": false, "original_units": "0.1 kg/m²", "original_variable": "Precipitation Gauge Weight per Unit Area (at minute 45)", - "raw_units": "kg m-2", "scale_factor": 0.1, "standard_name": "precipitation_amount", "units": "kg m-2" }, "270": { + "_cf_variable_name": "precipitation_weight_q4", + "_corrected_units": "kg m-2", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01_RCS" ], - "add_offset": 0, - "nc_name": "precipitation_weight_q4", + "_transformation": false, "original_units": "0.1 kg/m²", "original_variable": "Precipitation Gauge Weight per Unit Area (at minute 60)", - "raw_units": "kg m-2", "scale_factor": 0.1, "standard_name": "precipitation_amount", "units": "kg m-2" }, "271": { + "_cf_variable_name": "wind_speed_q1", + "_corrected_units": "km h-1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01_RCS" ], - "add_offset": 0, - "nc_name": "wind_speed_q1", - "nc_units": "m s-1", + "_transformation": false, "original_units": "0.1 km/h", "original_variable": "Wind Speed at 2 m (minutes 00-15)", - "raw_units": "km h-1", "scale_factor": 0.1, - "standard_name": "wind_speed" + "standard_name": "wind_speed", + "units": "m s-1" }, "272": { + "_cf_variable_name": "wind_speed_q2", + "_corrected_units": "km h-1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01_RCS" ], - "add_offset": 0, - "nc_name": "wind_speed_q2", - "nc_units": "m s-1", + "_transformation": false, "original_units": "0.1 km/h", "original_variable": "Wind Speed at 2 m (minutes 15-30)", - "raw_units": "km h-1", "scale_factor": 0.1, - "standard_name": "wind_speed" + 
"standard_name": "wind_speed", + "units": "m s-1" }, "273": { + "_cf_variable_name": "wind_speed_q3", + "_corrected_units": "km h-1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01_RCS" ], - "add_offset": 0, - "nc_name": "wind_speed_q3", - "nc_units": "m s-1", + "_transformation": false, "original_units": "0.1 km/h", "original_variable": "Wind Speed at 2 m (minutes 30-45)", - "raw_units": "km h-1", "scale_factor": 0.1, - "standard_name": "wind_speed" + "standard_name": "wind_speed", + "units": "m s-1" }, "274": { + "_cf_variable_name": "wind_speed_q4", + "_corrected_units": "km h-1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01_RCS" ], - "add_offset": 0, - "nc_name": "wind_speed_q4", - "nc_units": "m s-1", + "_transformation": false, "original_units": "0.1 km/h", "original_variable": "Wind Speed at 2 m (minutes 45-60)", - "raw_units": "km h-1", "scale_factor": 0.1, - "standard_name": "wind_speed" + "standard_name": "wind_speed", + "units": "m s-1" }, "275": { + "_cf_variable_name": "snd_q4", + "_corrected_units": "cm", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01_RCS" ], - "add_offset": 0, - "nc_name": "snd_q4", + "_transformation": false, "original_units": "cm", "original_variable": "Snow Depth (at minute 60)", - "raw_units": "cm", "scale_factor": 1, "standard_name": "surface_snow_thickness", "units": "m" }, "276": { + "_cf_variable_name": "snd_q1", + "_corrected_units": "cm", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01_RCS" ], - "add_offset": 0, - "nc_name": "snd_q1", + "_transformation": false, "original_units": "cm", "original_variable": "Snow Depth (at minute 15)", - "raw_units": "cm", "scale_factor": 1, "standard_name": "surface_snow_thickness", "units": "m" }, "277": { + "_cf_variable_name": "snd_q2", + "_corrected_units": "cm", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01_RCS" ], - "add_offset": 0, - "nc_name": 
"snd_q2", + "_transformation": false, "original_units": "cm", "original_variable": "Snow Depth (at minute 30)", - "raw_units": "cm", "scale_factor": 1, "standard_name": "surface_snow_thickness", "units": "m" }, "278": { + "_cf_variable_name": "snd_q3", + "_corrected_units": "cm", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01_RCS" ], - "add_offset": 0, - "nc_name": "snd_q3", + "_transformation": false, "original_units": "cm", "original_variable": "Snow Depth (at minute 45)", - "raw_units": "cm", "scale_factor": 1, "standard_name": "surface_snow_thickness", "units": "m" }, "279": { + "_cf_variable_name": "wind_dir", + "_corrected_units": "deg", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01_RCS" ], - "add_offset": 0, - "nc_name": "wind_dir", - "nc_units": "deg", + "_transformation": false, "original_units": "Degrees", "original_variable": "Wind Direction at 2 m (minutes 50-60)", - "raw_units": "deg", "scale_factor": 1, - "standard_name": "wind_direction" + "standard_name": "wind_direction", + "units": "deg" }, "280": { + "_cf_variable_name": "wind_speed", + "_corrected_units": "km h-1", + "_invert_sign": false, + "_offset_time": false, "_table_name": [ "HLY01_RCS" ], - "add_offset": 0, - "nc_name": "wind_speed", + "_transformation": false, "original_units": "0.1 km/h", "original_variable": "Wind Speed at 2 m (minutes 50-60)", - "raw_units": "km h-1", "scale_factor": 0.1, "standard_name": "wind_speed", "units": "m s-1" diff --git a/miranda/units.py b/miranda/units.py index 381ac063..4775b62c 100644 --- a/miranda/units.py +++ b/miranda/units.py @@ -7,6 +7,7 @@ import pandas as pd import xarray as xr from xclim.core.calendar import parse_offset +from xclim.core.units import units KiB = int(pow(2, 10)) MiB = int(pow(2, 20)) diff --git a/templates/eccc_raw_daily_conversion.py b/templates/eccc_raw_daily_conversion.py index 4fb64de7..583c8d10 100644 --- a/templates/eccc_raw_daily_conversion.py +++ 
b/templates/eccc_raw_daily_conversion.py @@ -11,25 +11,25 @@ time_step = "daily" n_workers = 3 var_codes = [ - 1, - 2, - 3, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, + # 1, + # 2, + # 3, + # 10, + # 11, + # 12, + # 13, + # 14, + # 15, + # 16, + # 17, + # 18, + # 19, + # 20, + # 21, + # 22, 23, 24, - 25, + # 25, ] in_files = getenv("in") diff --git a/templates/eccc_raw_hourly_conversion.py b/templates/eccc_raw_hourly_conversion.py index 68a24405..0849e182 100644 --- a/templates/eccc_raw_hourly_conversion.py +++ b/templates/eccc_raw_hourly_conversion.py @@ -11,39 +11,41 @@ time_step = "hourly" n_workers = 3 var_codes = [ - 76, - 77, - 78, - 79, - 80, - 89, - 94, - 107, - 108, - 109, - 110, - 123, - 133, - 156, - 262, - 263, - 264, - 265, - 266, - 267, - 268, - 269, - 270, - 271, - 272, - 273, - 274, - 275, - 276, - 277, - 278, - 279, - 280, + 209, + 210 + # 76, + # 77, + # 78, + # 79, + # 80, + # 89, + # 94, + # 107, + # 108, + # 109, + # 110, + # 123, + # 133, + # 156, + # 262, + # 263, + # 264, + # 265, + # 266, + # 267, + # 268, + # 269, + # 270, + # 271, + # 272, + # 273, + # 274, + # 275, + # 276, + # 277, + # 278, + # 279, + # 280, ] in_files = getenv("in") From d1a8c679fb245d1bd5537893a7687b6561ce01b3 Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Tue, 20 Jun 2023 13:53:09 -0400 Subject: [PATCH 02/33] WIP - more units handling --- miranda/eccc/eccc_obs_cf_attrs.json | 178 +++++++++------------------- 1 file changed, 55 insertions(+), 123 deletions(-) diff --git a/miranda/eccc/eccc_obs_cf_attrs.json b/miranda/eccc/eccc_obs_cf_attrs.json index 135756c9..570c318c 100644 --- a/miranda/eccc/eccc_obs_cf_attrs.json +++ b/miranda/eccc/eccc_obs_cf_attrs.json @@ -52,10 +52,9 @@ "DLY04", "DLY44" ], - "_transformation": "op * 0.1 degC", + "_transformation": "op / 10 degC", "original_units": "0.1 °C", "original_variable": "Daily Maximum Temperature", - "scale_factor": 0.1, "standard_name": 
"air_temperature_maximum", "units": "K" }, @@ -69,10 +68,9 @@ "DLY04", "DLY44" ], - "_transformation": "op * 0.1 degC", + "_transformation": "op / 10 degC", "original_units": "0.1 °C", "original_variable": "Daily Minimum Temperature", - "scale_factor": 0.1, "standard_name": "air_temperature_minimum", "units": "K" }, @@ -86,10 +84,9 @@ "DLY04", "DLY44" ], - "_transformation": "op * 0.1 degC", + "_transformation": "op / 10 degC", "original_units": "0.1 °C", "original_variable": "Daily Mean Temperature", - "scale_factor": 0.1, "standard_name": "air_temperature", "units": "K" }, @@ -103,10 +100,9 @@ "DLY04", "DLY44" ], - "_transformation": "op * 0.1 mm day-1", + "_transformation": "op / 10 mm day-1", "original_units": "0.1 mm day-1", "original_variable": "Daily Total Rainfall", - "scale_factor": 0.1, "standard_name": "liquid_precipitation_amount", "units": "m" }, @@ -120,10 +116,9 @@ "DLY04", "DLY44" ], - "_transformation": "op * 0.1 cm day-1", + "_transformation": "op / 10 cm day-1", "original_units": "0.1 cm day-1", "original_variable": "Daily Total Snowfall", - "scale_factor": 0.1, "standard_name": "solid_precipitation_amount", "units": "m" }, @@ -137,10 +132,9 @@ "DLY04", "DLY44" ], - "_transformation": "op * 0.1 mm day-1", + "_transformation": "op / 10 mm day-1", "original_units": "0.1 mm day-1", "original_variable": "Daily Total Precipitation", - "scale_factor": 0.1, "standard_name": "precipitation_amount", "units": "m" }, @@ -157,7 +151,6 @@ "_transformation": false, "original_units": "cm", "original_variable": "Snow on the Ground", - "scale_factor": 1, "standard_name": "surface_snow_thickness", "units": "m" }, @@ -173,7 +166,6 @@ ], "_transformation": false, "original_variable": "Thunderstorms", - "scale_factor": 1, "standard_name": "thunderstorm_presence", "units": "1" }, @@ -189,7 +181,6 @@ ], "_transformation": false, "original_variable": "Freezing rain or drizzle", - "scale_factor": 1, "standard_name": "freeze_rain_drizzle_presence", "units": "1" }, @@ 
-205,7 +196,6 @@ ], "_transformation": false, "original_variable": "Hail", - "scale_factor": 1, "standard_name": "hail_presence", "units": "1" }, @@ -218,7 +208,6 @@ "DLY44" ], "original_variable": "Fog or Ice Fog", - "scale_factor": 1, "standard_name": "fog_ice_fog_presence", "units": "1" }, @@ -233,7 +222,6 @@ ], "_transformation": false, "original_variable": "Smoke or Haze", - "scale_factor": 1, "standard_name": "smoke_haze_presence", "units": "1" }, @@ -248,7 +236,6 @@ ], "_transformation": false, "original_variable": "Blowing Dust or Sand", - "scale_factor": 1, "standard_name": "blowing_dust_sand_presence", "units": "1" }, @@ -263,7 +250,6 @@ ], "_transformation": false, "original_variable": "Blowing snow", - "scale_factor": 1, "standard_name": "blowing_snow_presence", "units": "1" }, @@ -278,7 +264,6 @@ ], "_transformation": false, "original_variable": "Wind speed >= 28 Knots", - "scale_factor": 1, "standard_name": "wind_exceeding_28_knots", "units": "1" }, @@ -293,7 +278,6 @@ ], "_transformation": false, "original_variable": "Wind speed >= 34 Knots", - "scale_factor": 1, "standard_name": "wind_exceeding_34_knots", "units": "1" }, @@ -324,7 +308,6 @@ "_transformation": false, "original_units": "km/h", "original_variable": "Speed of extreme gust", - "scale_factor": 1, "standard_name": "wind_speed_of_gust", "units": "m s-1" }, @@ -339,7 +322,6 @@ ], "_transformation": false, "original_variable": "UTC hour of extreme gust", - "scale_factor": 1, "standard_name": "hour_of_extreme_gust", "units": "h" }, @@ -401,18 +383,17 @@ }, "067": { "_cf_variable_name": "rf7_radiation", - "_corrected_units": "lux h", + "_corrected_units": "klx h", "_invert_sign": false, "_offset_time": false, "_table_name": [ "HLY11" ], - "_transformation": false, + "_transformation": "op / 100 klx h", "original_units": "0.01 Kilolux_hrs", "original_variable": "RF7 daylight illumination", - "scale_factor": 10, "standard_name": "solar_radiation_flux", - "units": "lux h" + "units": "klx h" }, 
"068": { "_cf_variable_name": "rf8_radiation", @@ -436,10 +417,9 @@ "_table_name": [ "HLY15" ], - "_transformation": false, + "_transformation": "op * 10 deg", "original_units": "10's of degrees", "original_variable": "Direction - 45B anemometer (8 pts)", - "scale_factor": 1, "standard_name": "wind_to_direction", "units": "deg" }, @@ -451,10 +431,9 @@ "_table_name": [ "HLY01" ], - "_transformation": false, + "_transformation": "op * 30 m", "original_units": "30's of meters", "original_variable": "Ceiling height of lowest layer of clouds", - "scale_factor": 30, "standard_name": "ceiling_cloud_height", "units": "m" }, @@ -466,10 +445,9 @@ "_table_name": [ "HLY01" ], - "_transformation": false, + "_transformation": "op / 10 km", "original_units": "0.1 km", "original_variable": "Visibility", - "scale_factor": 0.1, "standard_name": "visibility_in_air", "units": "m" }, @@ -481,10 +459,9 @@ "_table_name": [ "HLY01" ], - "_transformation": false, + "_transformation": "op / 100 kPa", "original_units": "0.01 kPa", "original_variable": "Sea Level Pressure", - "scale_factor": 10, "standard_name": "air_pressure_at_mean_sea_level", "units": "Pa" }, @@ -496,10 +473,9 @@ "_table_name": [ "HLY01" ], - "_transformation": false, + "_transformation": "op / 10 degC", "original_units": "0.1 °C", "original_variable": "Dew Point Temperature", - "scale_factor": 0.1, "standard_name": "dew_point_temperature", "units": "K" }, @@ -511,10 +487,9 @@ "_table_name": [ "HLY01" ], - "_transformation": false, + "_transformation": "op * 10 deg", "original_units": "10's of degrees", "original_variable": "Wind Direction at 2 m (U2A Anemometer) (16 pts)", - "scale_factor": 10, "standard_name": "wind_direction_u2a", "units": "deg" }, @@ -529,7 +504,6 @@ "_transformation": false, "original_units": "km/h", "original_variable": "Wind Speed - U2A (16 pts) to December 1970", - "scale_factor": 1, "standard_name": "wind_speed_u2a", "units": "m s-1" }, @@ -541,10 +515,9 @@ "_table_name": [ "HLY01" ], - 
"_transformation": false, + "_transformation": "op / 100 kPa", "original_units": "0.01 kPa", "original_variable": "Station Pressure", - "scale_factor": 10, "standard_name": "atmospheric_pressure", "units": "Pa" }, @@ -556,10 +529,9 @@ "_table_name": [ "HLY01" ], - "_transformation": false, + "_transformation": "op / 10 degC", "original_units": "0.1 °C", "original_variable": "Dry Bulb Temperature", - "scale_factor": 0.1, "standard_name": "dry_bulb_temperature", "units": "K" }, @@ -571,10 +543,9 @@ "_table_name": [ "HLY01" ], - "_transformation": false, + "_transformation": "op / 10 degC", "original_units": "0.1 °C", "original_variable": "Wet Bulb temperature", - "scale_factor": 0.1, "standard_name": "wet_bulb_temperature", "units": "K" }, @@ -589,7 +560,6 @@ "_transformation": false, "original_units": "%", "original_variable": "Relative Humidity", - "scale_factor": 1, "standard_name": "relative_humidity", "units": "1" }, @@ -601,8 +571,8 @@ "_table_name": [ "HLY01" ], - "_transformation": false, - "original_units": "%", + "_transformation": "op * 10", + "original_units": "Tenths", "original_variable": "Total Cloud Opacity", "scale_factor": 10, "standard_name": "cloud_albedo", @@ -616,8 +586,8 @@ "_table_name": [ "HLY01" ], - "_transformation": false, - "original_units": "%", + "_transformation": "op * 10", + "original_units": "Tenths", "original_variable": "Total Cloud Amount", "scale_factor": 10, "standard_name": "cloud_area_fraction", @@ -633,7 +603,6 @@ ], "_transformation": false, "original_variable": "Freezing Rain", - "scale_factor": 1, "standard_name": "freezing_rain", "units": "1" }, @@ -647,7 +616,6 @@ ], "_transformation": false, "original_variable": "Ice Pellets", - "scale_factor": 1, "standard_name": "ice_pellet_presence", "units": "1" }, @@ -659,10 +627,9 @@ "_table_name": [ "HLY01" ], - "_transformation": false, + "_transformation": "op * 10", "original_units": "Tenths", "original_variable": "Lowest cloud layer opacity", - "scale_factor": 10, 
"standard_name": "low_type_cloud_opacity_fraction", "units": "1" }, @@ -674,10 +641,9 @@ "_table_name": [ "HLY01" ], - "_transformation": false, + "_transformation": "op * 10", "original_units": "Tenths", "original_variable": "Lowest cloud layer amount or condition", - "scale_factor": 10, "standard_name": "low_type_cloud_area_fraction", "units": "1" }, @@ -691,7 +657,6 @@ ], "_transformation": false, "original_variable": "Lowest cloud layer type", - "scale_factor": 1, "standard_name": "low_type_cloud_type", "units": "1" }, @@ -703,10 +668,9 @@ "_table_name": [ "HLY01" ], - "_transformation": false, + "_transformation": "op * 30 m", "original_units": "30's of meters", "original_variable": "Lowest cloud layer height", - "scale_factor": 30, "standard_name": "low_type_cloud_height", "units": "m" }, @@ -718,10 +682,9 @@ "_table_name": [ "HLY01" ], - "_transformation": false, + "_transformation": "op * 30 m", "original_units": "Tenths", "original_variable": "Second lowest cloud layer opacity", - "scale_factor": 10, "standard_name": "low_type_cloud_opacity_fraction", "units": "1" }, @@ -733,10 +696,9 @@ "_table_name": [ "HLY01" ], - "_transformation": false, + "_transformation": "op * 10", "original_units": "Tenths", "original_variable": "Second lowest cloud layer amount or condition", - "scale_factor": 10, "standard_name": "low_type_cloud_area_fraction", "units": "1" }, @@ -751,7 +713,6 @@ "_transformation": false, "original_units": "", "original_variable": "Second lowest cloud layer type", - "scale_factor": 1, "standard_name": "low_type_cloud_type", "units": "1" }, @@ -763,10 +724,9 @@ "_table_name": [ "HLY01" ], - "_transformation": false, + "_transformation": "op * 30 m", "original_units": "30's of meters", "original_variable": "Second lowest cloud layer height", - "scale_factor": 30, "standard_name": "low_type_cloud_height", "units": "m" }, @@ -778,10 +738,9 @@ "_table_name": [ "HLY01" ], - "_transformation": false, + "_transformation": "op * 10", "original_units": 
"Tenths", "original_variable": "Thirsd lowest cloud layer opacity", - "scale_factor": 10, "standard_name": "low_type_cloud_opacity_fraction", "units": "1" }, @@ -793,10 +752,9 @@ "_table_name": [ "HLY01" ], - "_transformation": false, + "_transformation": "op * 10", "original_units": "Tenths", "original_variable": "Third lowest cloud layer amount or condition", - "scale_factor": 10, "standard_name": "low_type_cloud_area_fraction", "units": "1" }, @@ -811,7 +769,6 @@ "_transformation": false, "original_units": "", "original_variable": "Third lowest cloud layer type", - "scale_factor": 1, "standard_name": "low_type_cloud_type", "units": "1" }, @@ -823,10 +780,9 @@ "_table_name": [ "HLY01" ], - "_transformation": false, + "_transformation": "op * 30 m", "original_units": "30's of meters", "original_variable": "Third lowest cloud layer height", - "scale_factor": 30, "standard_name": "low_type_cloud_height", "units": "m" }, @@ -838,10 +794,9 @@ "_table_name": [ "HLY01" ], - "_transformation": false, + "_transformation": "op / 10 mm h-1", "original_units": "0.1 mm", "original_variable": "Total Rainfall", - "scale_factor": 0.1, "standard_name": "rainfall_flux", "units": "kg m2 s-1" }, @@ -853,10 +808,9 @@ "_table_name": [ "HLY10" ], - "_transformation": false, + "_transformation": "op / 10 h", "original_units": "0.1 hrs", "original_variable": "Sunshine", - "scale_factor": 0.1, "standard_name": "duration_of_sunshine", "units": "s" }, @@ -868,10 +822,8 @@ "_table_name": [ "HLY01" ], - "_transformation": false, - "original_units": "10's of degrees", + "_transformation": "op * 10 deg", "original_variable": "Wind Direction - U2A (36 pts) from January 1971", - "scale_factor": 10, "standard_name": "wind_direction_u2a", "units": "deg" }, @@ -884,15 +836,15 @@ "HLY01" ], "_transformation": false, + "description": "Gust (G)=1, Squall (Q)=2", "long_name": "wind_direction_u2a", "original_units": "1, 2", "original_variable": "Wind character at 10 m", - "scale_factor": 1, "units": "" 
}, "210": { - "_cf_variable_name": "wind_dir_u2a_36", - "_corrected_units": "deg", + "_cf_variable_name": "", + "_corrected_units": "km h-1", "_invert_sign": false, "_offset_time": false, "_table_name": [ @@ -900,10 +852,9 @@ ], "_transformation": false, "original_units": "km/h", - "original_variable": "Wind Direction - U2A (36 pts) from January 1971", - "scale_factor": 10, - "standard_name": "wind_direction_u2a", - "units": "deg" + "original_variable": "Wind gust speed at 10 m", + "standard_name": "wind_speed_of_gust", + "units": "m s-1" }, "262": { "_cf_variable_name": "prtot", @@ -913,10 +864,9 @@ "_table_name": [ "HLY01_RCS" ], - "_transformation": false, + "_transformation": "op / 10 mm", "original_units": "0.1 mm", "original_variable": "Total Precipitation (minutes 00-60)", - "scale_factor": 0.1, "standard_name": "precipitation_amount", "units": "kg m-2" }, @@ -928,10 +878,9 @@ "_table_name": [ "HLY01_RCS" ], - "_transformation": false, + "_transformation": "op / 10 mm", "original_units": "0.1 mm", "original_variable": "Total Precipitation (minutes 00-15)", - "scale_factor": 0.1, "standard_name": "precipitation_amount", "units": "kg m-2" }, @@ -943,10 +892,9 @@ "_table_name": [ "HLY01_RCS" ], - "_transformation": false, + "_transformation": "op / 10 mm", "original_units": "0.1 mm", "original_variable": "Total Precipitation (minutes 15-30)", - "scale_factor": 0.1, "standard_name": "precipitation_amount", "units": "kg m-2" }, @@ -958,10 +906,9 @@ "_table_name": [ "HLY01_RCS" ], - "_transformation": false, + "_transformation": "op / 10 mm", "original_units": "0.1 mm", "original_variable": "Total Precipitation (minutes 30-45)", - "scale_factor": 0.1, "standard_name": "precipitation_amount", "units": "kg m-2" }, @@ -973,10 +920,9 @@ "_table_name": [ "HLY01_RCS" ], - "_transformation": false, + "_transformation": "op / 10 mm", "original_units": "0.1 mm", "original_variable": "Total Precipitation (minutes 45-60)", - "scale_factor": 0.1, "standard_name": 
"precipitation_amount", "units": "kg m-2" }, @@ -988,10 +934,9 @@ "_table_name": [ "HLY01_RCS" ], - "_transformation": false, + "_transformation": "op / 10 kg m-2", "original_units": "0.1 kg/m²", "original_variable": "Precipitation Gauge Weight per Unit Area (at minute 15)", - "scale_factor": 0.1, "standard_name": "precipitation_amount", "units": "kg m-2" }, @@ -1003,10 +948,9 @@ "_table_name": [ "HLY01_RCS" ], - "_transformation": false, + "_transformation": "op / 10 kg m-2", "original_units": "0.1 kg/m²", "original_variable": "Precipitation Gauge Weight per Unit Area (at minute 30)", - "scale_factor": 0.1, "standard_name": "precipitation_amount", "units": "kg m-2" }, @@ -1018,10 +962,9 @@ "_table_name": [ "HLY01_RCS" ], - "_transformation": false, + "_transformation": "op / 10 kg m-2", "original_units": "0.1 kg/m²", "original_variable": "Precipitation Gauge Weight per Unit Area (at minute 45)", - "scale_factor": 0.1, "standard_name": "precipitation_amount", "units": "kg m-2" }, @@ -1033,10 +976,9 @@ "_table_name": [ "HLY01_RCS" ], - "_transformation": false, + "_transformation": "op / 10 kg m-2", "original_units": "0.1 kg/m²", "original_variable": "Precipitation Gauge Weight per Unit Area (at minute 60)", - "scale_factor": 0.1, "standard_name": "precipitation_amount", "units": "kg m-2" }, @@ -1048,10 +990,9 @@ "_table_name": [ "HLY01_RCS" ], - "_transformation": false, + "_transformation": "op / 10 km h-1", "original_units": "0.1 km/h", "original_variable": "Wind Speed at 2 m (minutes 00-15)", - "scale_factor": 0.1, "standard_name": "wind_speed", "units": "m s-1" }, @@ -1063,10 +1004,9 @@ "_table_name": [ "HLY01_RCS" ], - "_transformation": false, + "_transformation": "op / 10 km h-1", "original_units": "0.1 km/h", "original_variable": "Wind Speed at 2 m (minutes 15-30)", - "scale_factor": 0.1, "standard_name": "wind_speed", "units": "m s-1" }, @@ -1078,10 +1018,9 @@ "_table_name": [ "HLY01_RCS" ], - "_transformation": false, + "_transformation": "op / 10 km 
h-1", "original_units": "0.1 km/h", "original_variable": "Wind Speed at 2 m (minutes 30-45)", - "scale_factor": 0.1, "standard_name": "wind_speed", "units": "m s-1" }, @@ -1093,15 +1032,14 @@ "_table_name": [ "HLY01_RCS" ], - "_transformation": false, + "_transformation": "op / 10 km h-1", "original_units": "0.1 km/h", "original_variable": "Wind Speed at 2 m (minutes 45-60)", - "scale_factor": 0.1, "standard_name": "wind_speed", "units": "m s-1" }, "275": { - "_cf_variable_name": "snd_q4", + "_cf_variable_name": "snd", "_corrected_units": "cm", "_invert_sign": false, "_offset_time": false, @@ -1111,7 +1049,6 @@ "_transformation": false, "original_units": "cm", "original_variable": "Snow Depth (at minute 60)", - "scale_factor": 1, "standard_name": "surface_snow_thickness", "units": "m" }, @@ -1126,7 +1063,6 @@ "_transformation": false, "original_units": "cm", "original_variable": "Snow Depth (at minute 15)", - "scale_factor": 1, "standard_name": "surface_snow_thickness", "units": "m" }, @@ -1141,7 +1077,6 @@ "_transformation": false, "original_units": "cm", "original_variable": "Snow Depth (at minute 30)", - "scale_factor": 1, "standard_name": "surface_snow_thickness", "units": "m" }, @@ -1156,7 +1091,6 @@ "_transformation": false, "original_units": "cm", "original_variable": "Snow Depth (at minute 45)", - "scale_factor": 1, "standard_name": "surface_snow_thickness", "units": "m" }, @@ -1171,7 +1105,6 @@ "_transformation": false, "original_units": "Degrees", "original_variable": "Wind Direction at 2 m (minutes 50-60)", - "scale_factor": 1, "standard_name": "wind_direction", "units": "deg" }, @@ -1183,10 +1116,9 @@ "_table_name": [ "HLY01_RCS" ], - "_transformation": false, + "_transformation": "op / 10 km h-1", "original_units": "0.1 km/h", "original_variable": "Wind Speed at 2 m (minutes 50-60)", - "scale_factor": 0.1, "standard_name": "wind_speed", "units": "m s-1" } From 94fe8669013264a82097e6b84632beadd5dcd2ad Mon Sep 17 00:00:00 2001 From: Zeitsperre 
<10819524+Zeitsperre@users.noreply.github.com> Date: Tue, 20 Jun 2023 15:09:09 -0400 Subject: [PATCH 03/33] WIP - more refactoring - AHCCD incomplete --- miranda/convert/__init__.py | 4 +- miranda/convert/_data_definitions.py | 56 ++++- miranda/convert/_reconstruction.py | 2 +- .../{_data_corrections.py => _treatments.py} | 217 +++--------------- miranda/convert/corrections.py | 157 +++++++++++++ ...f_attrs.json => eccc-canswe_cf_attrs.json} | 15 +- .../data/eccc_homogenized_cf_attrs.json | 18 +- .../data}/eccc_obs_cf_attrs.json | 11 +- miranda/convert/data/eccc_rdrs_cf_attrs.json | 11 +- miranda/convert/data/espo-g6-e5l_attrs.json | 1 + miranda/convert/data/espo-g6-r2_attrs.json | 1 + ...sa_cf_attrs.json => nasa_ag_cf_attrs.json} | 0 .../convert/data/nex-gddp-cmip6_attrs.json | 1 + miranda/convert/{eccc.py => eccc_canswe.py} | 4 +- miranda/{eccc/_raw.py => convert/eccc_obs.py} | 49 ++-- miranda/convert/eccc_rdrs.py | 10 +- miranda/convert/melcc.py | 8 +- miranda/eccc/__init__.py | 1 - miranda/eccc/convert.py | 2 +- templates/eccc_raw_hourly_conversion.py | 6 +- templates/emdna_processing.py | 3 +- templates/era5-land_reanalysis_processing.py | 3 +- templates/espo-g6.py | 3 +- templates/nasa_nex-gddp-cmip6_processing.py | 3 +- 24 files changed, 327 insertions(+), 259 deletions(-) rename miranda/convert/{_data_corrections.py => _treatments.py} (78%) create mode 100644 miranda/convert/corrections.py rename miranda/convert/data/{eccc_cf_attrs.json => eccc-canswe_cf_attrs.json} (60%) rename miranda/{eccc => convert}/data/eccc_homogenized_cf_attrs.json (96%) rename miranda/{eccc => convert/data}/eccc_obs_cf_attrs.json (99%) rename miranda/convert/data/{nasa_cf_attrs.json => nasa_ag_cf_attrs.json} (100%) rename miranda/convert/{eccc.py => eccc_canswe.py} (95%) rename miranda/{eccc/_raw.py => convert/eccc_obs.py} (96%) diff --git a/miranda/convert/__init__.py b/miranda/convert/__init__.py index a38c34ee..179bd597 100644 --- a/miranda/convert/__init__.py +++ 
b/miranda/convert/__init__.py @@ -1,9 +1,9 @@ """Data Conversion module.""" from __future__ import annotations -from . import deh, eccc, ecmwf, hq, melcc, utils +from . import deh, eccc_canswe, ecmwf, hq, melcc, utils from ._aggregation import * -from ._data_corrections import * from ._data_definitions import * +from ._treatments import * # from ._reconstruction import * diff --git a/miranda/convert/_data_definitions.py b/miranda/convert/_data_definitions.py index 24fe4f2f..4ed4741b 100644 --- a/miranda/convert/_data_definitions.py +++ b/miranda/convert/_data_definitions.py @@ -5,6 +5,7 @@ import logging.config import os from pathlib import Path +from typing import Any from miranda.scripting import LOGGING_CONFIG from miranda.storage import report_file_size @@ -21,10 +22,11 @@ "gather_nex", "gather_nrcan_gridded_obs", "gather_raw_rdrs_by_years", - "gather_rdrs", + "gather_eccc_rdrs", "gather_sc_earth", "gather_wfdei_gem_capa", "gather_emdna", + "load_json_data_mappings", "nasa_ag_variables", "nrcan_variables", "project_institutes", @@ -35,6 +37,54 @@ _data_folder = Path(__file__).parent / "data" + +def load_json_data_mappings(project: str) -> dict[str, Any]: + """Load JSON mappings for supported dataset conversions. 
+ + Parameters + ---------- + project : str + + Returns + ------- + dict[str, Any] + """ + data_folder = Path(__file__).resolve().parent / "data" + + if project.startswith("era5"): + metadata_definition = json.load(open(data_folder / "ecmwf_cf_attrs.json")) + elif project in ["rdrs-v21"]: + metadata_definition = json.load(open(data_folder / "eccc_rdrs_cf_attrs.json")) + elif project == "eccc-obs": + metadata_definition = json.load(open(data_folder / "eccc_obs_cf_attrs.json")) + elif project in ["agcfsr", "agmerra2"]: + metadata_definition = json.load(open(data_folder / "nasa_ag_cf_attrs.json")) + elif project in ["cordex", "cmip5", "cmip6"]: + metadata_definition = json.load(open(data_folder / "cmip_ouranos_attrs.json")) + elif project == "ets-grnch": + metadata_definition = json.load(open(data_folder / "ets_grnch_cf_attrs.json")) + elif project == "nrcan-gridded-10km": + raise NotImplementedError() + elif project == "wfdei-gem-capa": + metadata_definition = json.load(open(data_folder / "usask_cf_attrs.json")) + elif project.startswith("melcc"): + metadata_definition = json.load(open(data_folder / "melcc_cf_attrs.json")) + elif project.startswith("ec"): + metadata_definition = json.load(open(data_folder / "eccc-canswe_cf_attrs.json")) + elif project in ["NEX-GDDP-CMIP6"]: + metadata_definition = json.load(open(data_folder / "nex-gddp-cmip6_attrs.json")) + elif project in ["ESPO-G6-R2"]: + metadata_definition = json.load(open(data_folder / "espo-g6-r2_attrs.json")) + elif project in ["ESPO-G6-E5L"]: + metadata_definition = json.load(open(data_folder / "espo-g6-e5l_attrs.json")) + elif project in ["EMDNA"]: + metadata_definition = json.load(open(data_folder / "emdna_cf_attrs.json")) + else: + raise NotImplementedError() + + return metadata_definition + + eccc_rdrs_variables = dict() eccc_rdrs_variables["raw"] = [ v @@ -54,7 +104,7 @@ ].keys() grnch_variables = ["T", "Tmin", "Tmax", "P"] nrcan_variables = ["tasmin", "tasmax", "pr"] -nasa_ag_variables = 
json.load(open(_data_folder / "nasa_cf_attrs.json"))[ +nasa_ag_variables = json.load(open(_data_folder / "nasa_ag_cf_attrs.json"))[ "variables" ].keys() sc_earth_variables = ["prcp", "tdew", "tmean", "trange", "wind"] @@ -236,7 +286,7 @@ def gather_sc_earth(path: str | os.PathLike) -> dict[str, list[Path]]: ) -def gather_rdrs( +def gather_eccc_rdrs( name: str, path: str | os.PathLike, suffix: str, key: str ) -> dict[str, dict[str, list[Path]]]: """Gather RDRS processed source data. diff --git a/miranda/convert/_reconstruction.py b/miranda/convert/_reconstruction.py index 6cabf25d..4963d68e 100644 --- a/miranda/convert/_reconstruction.py +++ b/miranda/convert/_reconstruction.py @@ -17,8 +17,8 @@ from miranda.utils import chunk_iterables from ._aggregation import aggregate as aggregate_func -from ._data_corrections import dataset_corrections from ._data_definitions import project_institutes, xarray_frequencies_to_cmip6like +from .corrections import dataset_corrections logging.config.dictConfig(LOGGING_CONFIG) diff --git a/miranda/convert/_data_corrections.py b/miranda/convert/_treatments.py similarity index 78% rename from miranda/convert/_data_corrections.py rename to miranda/convert/_treatments.py index 5bdbcaf9..a1241940 100644 --- a/miranda/convert/_data_corrections.py +++ b/miranda/convert/_treatments.py @@ -1,14 +1,11 @@ from __future__ import annotations import datetime -import json import logging.config import os import warnings -from collections.abc import Iterator, Sequence from functools import partial from pathlib import Path -from typing import Any, Callable import numpy as np import xarray as xr @@ -18,72 +15,33 @@ from xclim.core.calendar import parse_offset from miranda import __version__ as __miranda_version__ -from miranda.gis import subset_domain from miranda.scripting import LOGGING_CONFIG from miranda.units import get_time_frequency -from .utils import date_parser, find_version_hash +from ._data_definitions import load_json_data_mappings +from 
.utils import date_parser logging.config.dictConfig(LOGGING_CONFIG) VERSION = datetime.datetime.now().strftime("%Y.%m.%d") __all__ = [ - "dataset_corrections", - "dims_conversion", - "dataset_conversion", - "load_json_data_mappings", + "cf_units_conversion", + "clip_values", + "conservative_regrid", + "correct_unit_names", + "dimensions_compliance", + "ensure_correct_time_frequency", + "invert_value_sign", "metadata_conversion", + "offset_time_dimension", + "preprocessing_corrections", "threshold_mask", + "transform_values", "variable_conversion", ] -def load_json_data_mappings(project: str) -> dict[str, Any]: - """Load JSON mappings for supported dataset conversions. - - Parameters - ---------- - project : str - - Returns - ------- - dict[str, Any] - """ - data_folder = Path(__file__).resolve().parent / "data" - - if project.startswith("era5"): - metadata_definition = json.load(open(data_folder / "ecmwf_cf_attrs.json")) - elif project in ["rdrs-v21"]: - metadata_definition = json.load(open(data_folder / "eccc_rdrs_cf_attrs.json")) - elif project in ["agcfsr", "agmerra2"]: # This should handle the AG versions: - metadata_definition = json.load(open(data_folder / "nasa_cf_attrs.json")) - elif project in ["cordex", "cmip5", "cmip6"]: - metadata_definition = json.load(open(data_folder / "cmip_ouranos_attrs.json")) - elif project == "ets-grnch": - metadata_definition = json.load(open(data_folder / "ets_grnch_cf_attrs.json")) - elif project == "nrcan-gridded-10km": - raise NotImplementedError() - elif project == "wfdei-gem-capa": - metadata_definition = json.load(open(data_folder / "usask_cf_attrs.json")) - elif project.startswith("melcc"): - metadata_definition = json.load(open(data_folder / "melcc_cf_attrs.json")) - elif project.startswith("ec"): - metadata_definition = json.load(open(data_folder / "eccc_cf_attrs.json")) - elif project in ["NEX-GDDP-CMIP6"]: - metadata_definition = json.load(open(data_folder / "nex-gddp-cmip6_attrs.json")) - elif project in 
["ESPO-G6-R2"]: - metadata_definition = json.load(open(data_folder / "espo-g6-r2_attrs.json")) - elif project in ["ESPO-G6-E5L"]: - metadata_definition = json.load(open(data_folder / "espo-g6-e5l_attrs.json")) - elif project in ["EMDNA"]: - metadata_definition = json.load(open(data_folder / "emdna_cf_attrs.json")) - else: - raise NotImplementedError() - - return metadata_definition - - def _get_section_entry_key(meta, entry, var, key, project): var_meta = meta[entry].get(var, {}) if key in var_meta: @@ -321,7 +279,8 @@ def _preprocess_correct(d: xr.Dataset, *, ops: list[partial]) -> xr.Dataset: return ds -def _correct_units_names(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: +def correct_unit_names(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: + """Correct unit names.""" key = "_corrected_units" for var, val in _iter_entry_key(d, m, "variables", key, p): if val: @@ -336,7 +295,8 @@ def _correct_units_names(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: # for de-accumulation or conversion to flux -def _transform(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: +def transform_values(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: + """Transform dataset values according to operation listed.""" key = "_transformation" d_out = xr.Dataset(coords=d.coords, attrs=d.attrs) converted = [] @@ -430,7 +390,8 @@ def _transform(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: return d_out -def _offset_time(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: +def offset_time_dimension(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: + """Offset time dimension using listed frequency.""" key = "_offset_time" d_out = xr.Dataset(coords=d.coords, attrs=d.attrs) converted = [] @@ -479,7 +440,8 @@ def _offset_time(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: return d_out -def _invert_sign(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: +def invert_value_sign(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: + """Flip value of DataArray.""" key = "_invert_sign" d_out = xr.Dataset(coords=d.coords, 
attrs=d.attrs) converted = [] @@ -507,7 +469,8 @@ def _invert_sign(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: # For converting variable units to standard workflow units -def _units_cf_conversion(d: xr.Dataset, m: dict) -> xr.Dataset: +def cf_units_conversion(d: xr.Dataset, m: dict) -> xr.Dataset: + """Perform pint-based units-conversion.""" if "time" in m["dimensions"].keys(): if m["dimensions"]["time"].get("units"): d["time"]["units"] = m["dimensions"]["time"]["units"] @@ -524,7 +487,8 @@ def _units_cf_conversion(d: xr.Dataset, m: dict) -> xr.Dataset: # For clipping variable values to an established maximum/minimum -def _clip_values(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: +def clip_values(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: + """Clip values to an appropriate range,.""" key = "_clip_values" d_out = xr.Dataset(coords=d.coords, attrs=d.attrs) converted = [] @@ -572,7 +536,8 @@ def _clip_values(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: return d_out -def _ensure_correct_time(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: +def ensure_correct_time_frequency(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: + """Ensure that time frequency is consistent with expected frequency for project.""" key = "_ensure_correct_time" strict_time = "_strict_time" @@ -643,8 +608,8 @@ def _ensure_correct_time(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: # For renaming and reordering lat and lon dims -def dims_conversion(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: - """Rename dimensions to CF to their equivalents. +def dimensions_compliance(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: + """Rename dimensions to CF to their equivalents and reorder them if needed. 
Parameters ---------- @@ -854,129 +819,3 @@ def metadata_conversion(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: d.attrs.update(dict(history=history)) return d - - -def dataset_corrections(ds: xr.Dataset, project: str) -> xr.Dataset: - """Convert variables to CF-compliant format""" - metadata_definition = load_json_data_mappings(project) - - ds = _correct_units_names(ds, project, metadata_definition) - ds = _transform(ds, project, metadata_definition) - ds = _invert_sign(ds, project, metadata_definition) - ds = _units_cf_conversion(ds, metadata_definition) - ds = _clip_values(ds, project, metadata_definition) - - ds = dims_conversion(ds, project, metadata_definition) - ds = _ensure_correct_time(ds, project, metadata_definition) - ds = _offset_time(ds, project, metadata_definition) - - ds = variable_conversion(ds, project, metadata_definition) - - ds = metadata_conversion(ds, project, metadata_definition) - - ds.attrs["history"] = ( - f"{datetime.datetime.now()}: " - f"Variables converted from original files using miranda.convert.{dataset_corrections.__name__}. " - f"{ds.attrs.get('history')}".strip() - ) - - return ds - - -def dataset_conversion( - input_files: ( - str - | os.PathLike - | Sequence[str | os.PathLike] - | Iterator[os.PathLike] - | xr.Dataset - ), - project: str, - domain: str | None = None, - mask: xr.Dataset | xr.DataArray | None = None, - mask_cutoff: float | bool = False, - regrid: bool = False, - add_version_hashes: bool = True, - preprocess: Callable | str | None = "auto", - **xr_kwargs, -) -> xr.Dataset | xr.DataArray: - """Convert an existing Xarray-compatible dataset to another format with variable corrections applied. - - Parameters - ---------- - input_files : str or os.PathLike or Sequence[str or os.PathLike] or Iterator[os.PathLike] or xr.Dataset - Files or objects to be converted. - If sent a list or GeneratorType, will open with :py:func:`xarray.open_mfdataset` and concatenate files. 
- project : {"cordex", "cmip5", "cmip6", "ets-grnch", "isimip-ft", "pcic-candcs-u6", "converted"} - Project name for decoding/handling purposes. - domain: {"global", "nam", "can", "qc", "mtl"}, optional - Domain to perform subsetting for. Default: None. - mask : Optional[Union[xr.Dataset, xr.DataArray]] - DataArray or single data_variable dataset containing mask. - mask_cutoff : float or bool - If land_sea_mask supplied, the threshold above which to mask with land_sea_mask. Default: False. - regrid : bool - Performing regridding with xesmf. Default: False. - add_version_hashes : bool - If True, version name and sha256sum of source file(s) will be added as a field among the global attributes. - preprocess : callable or str, optional - Preprocessing functions to perform over each Dataset. - Default: "auto" - Run preprocessing fixes based on supplied fields from metadata definition. - Callable - Runs function over Dataset (single) or supplied to `preprocess` (multifile dataset). - **xr_kwargs - Arguments passed directly to xarray. 
- - Returns - ------- - xr.Dataset or xr.DataArray - """ - if isinstance(input_files, xr.Dataset): - ds = input_files - else: - if isinstance(input_files, (str, os.PathLike)): - if Path(input_files).is_dir(): - files = [] - files.extend([f for f in Path(input_files).glob("*.nc")]) - files.extend([f for f in Path(input_files).glob("*.zarr")]) - else: - files = [Path(input_files)] - elif isinstance(input_files, (Sequence, Iterator)): - files = [Path(f) for f in input_files] - else: - files = input_files - version_hashes = dict() - if add_version_hashes: - for file in files: - version_hashes[file.name] = find_version_hash(file) - - preprocess_kwargs = dict() - if preprocess: - if preprocess == "auto": - preprocess_kwargs.update( - preprocess=partial(preprocessing_corrections, project=project) - ) - elif isinstance(preprocess, Callable): - preprocess_kwargs.update(preprocess=preprocess) - - if len(files) == 1: - ds = xr.open_dataset(files[0], **xr_kwargs) - for _, process in preprocess_kwargs.items(): - ds = process(ds) - else: - ds = xr.open_mfdataset(files, **xr_kwargs, **preprocess_kwargs) - if version_hashes: - ds.attrs.update(dict(original_files=str(version_hashes))) - - ds = dataset_corrections(ds, project) - - if domain: - ds = subset_domain(ds, domain) - - if isinstance(mask, (str, Path)): - mask = xr.open_dataset(mask) - if isinstance(mask, (xr.Dataset, xr.DataArray)): - if regrid: - mask = conservative_regrid(ds, mask) - ds = threshold_mask(ds, mask=mask, mask_cutoff=mask_cutoff) - - return ds diff --git a/miranda/convert/corrections.py b/miranda/convert/corrections.py new file mode 100644 index 00000000..259d2b1b --- /dev/null +++ b/miranda/convert/corrections.py @@ -0,0 +1,157 @@ +"""Dataset corrections submodule.""" +from __future__ import annotations + +import datetime +import os +from functools import partial +from pathlib import Path +from typing import Callable, Iterator, Sequence + +import xarray as xr + +from miranda.convert import ( + 
dimensions_compliance, + metadata_conversion, + threshold_mask, + variable_conversion, +) +from miranda.convert._data_definitions import load_json_data_mappings +from miranda.convert._treatments import ( + cf_units_conversion, + clip_values, + conservative_regrid, + correct_unit_names, + ensure_correct_time_frequency, + invert_value_sign, + offset_time_dimension, + preprocessing_corrections, + transform_values, +) +from miranda.convert.utils import find_version_hash +from miranda.gis import subset_domain + + +def dataset_corrections(ds: xr.Dataset, project: str) -> xr.Dataset: + """Convert variables to CF-compliant format""" + metadata_definition = load_json_data_mappings(project) + + ds = correct_unit_names(ds, project, metadata_definition) + ds = transform_values(ds, project, metadata_definition) + ds = invert_value_sign(ds, project, metadata_definition) + ds = cf_units_conversion(ds, metadata_definition) + ds = clip_values(ds, project, metadata_definition) + + ds = dimensions_compliance(ds, project, metadata_definition) + ds = ensure_correct_time_frequency(ds, project, metadata_definition) + ds = offset_time_dimension(ds, project, metadata_definition) + + ds = variable_conversion(ds, project, metadata_definition) + + ds = metadata_conversion(ds, project, metadata_definition) + + ds.attrs["history"] = ( + f"{datetime.datetime.now()}: " + f"Variables converted from original files using miranda.convert.{dataset_corrections.__name__}. 
" + f"{ds.attrs.get('history')}".strip() + ) + + return ds + + +def dataset_conversion( + input_files: ( + str + | os.PathLike + | Sequence[str | os.PathLike] + | Iterator[os.PathLike] + | xr.Dataset + ), + project: str, + domain: str | None = None, + mask: xr.Dataset | xr.DataArray | None = None, + mask_cutoff: float | bool = False, + regrid: bool = False, + add_version_hashes: bool = True, + preprocess: Callable | str | None = "auto", + **xr_kwargs, +) -> xr.Dataset | xr.DataArray: + """Convert an existing Xarray-compatible dataset to another format with variable corrections applied. + + Parameters + ---------- + input_files : str or os.PathLike or Sequence[str or os.PathLike] or Iterator[os.PathLike] or xr.Dataset + Files or objects to be converted. + If sent a list or GeneratorType, will open with :py:func:`xarray.open_mfdataset` and concatenate files. + project : {"cordex", "cmip5", "cmip6", "ets-grnch", "isimip-ft", "pcic-candcs-u6", "converted"} + Project name for decoding/handling purposes. + domain: {"global", "nam", "can", "qc", "mtl"}, optional + Domain to perform subsetting for. Default: None. + mask : Optional[Union[xr.Dataset, xr.DataArray]] + DataArray or single data_variable dataset containing mask. + mask_cutoff : float or bool + If land_sea_mask supplied, the threshold above which to mask with land_sea_mask. Default: False. + regrid : bool + Performing regridding with xesmf. Default: False. + add_version_hashes : bool + If True, version name and sha256sum of source file(s) will be added as a field among the global attributes. + preprocess : callable or str, optional + Preprocessing functions to perform over each Dataset. + Default: "auto" - Run preprocessing fixes based on supplied fields from metadata definition. + Callable - Runs function over Dataset (single) or supplied to `preprocess` (multifile dataset). + **xr_kwargs + Arguments passed directly to xarray. 
+ + Returns + ------- + xr.Dataset or xr.DataArray + """ + if isinstance(input_files, xr.Dataset): + ds = input_files + else: + if isinstance(input_files, (str, os.PathLike)): + if Path(input_files).is_dir(): + files = [] + files.extend([f for f in Path(input_files).glob("*.nc")]) + files.extend([f for f in Path(input_files).glob("*.zarr")]) + else: + files = [Path(input_files)] + elif isinstance(input_files, (Sequence, Iterator)): + files = [Path(f) for f in input_files] + else: + files = input_files + version_hashes = dict() + if add_version_hashes: + for file in files: + version_hashes[file.name] = find_version_hash(file) + + preprocess_kwargs = dict() + if preprocess: + if preprocess == "auto": + preprocess_kwargs.update( + preprocess=partial(preprocessing_corrections, project=project) + ) + elif isinstance(preprocess, Callable): + preprocess_kwargs.update(preprocess=preprocess) + + if len(files) == 1: + ds = xr.open_dataset(files[0], **xr_kwargs) + for _, process in preprocess_kwargs.items(): + ds = process(ds) + else: + ds = xr.open_mfdataset(files, **xr_kwargs, **preprocess_kwargs) + if version_hashes: + ds.attrs.update(dict(original_files=str(version_hashes))) + + ds = dataset_corrections(ds, project) + + if domain: + ds = subset_domain(ds, domain) + + if isinstance(mask, (str, Path)): + mask = xr.open_dataset(mask) + if isinstance(mask, (xr.Dataset, xr.DataArray)): + if regrid: + mask = conservative_regrid(ds, mask) + ds = threshold_mask(ds, mask=mask, mask_cutoff=mask_cutoff) + + return ds diff --git a/miranda/convert/data/eccc_cf_attrs.json b/miranda/convert/data/eccc-canswe_cf_attrs.json similarity index 60% rename from miranda/convert/data/eccc_cf_attrs.json rename to miranda/convert/data/eccc-canswe_cf_attrs.json index 4424ae76..4b48eb98 100644 --- a/miranda/convert/data/eccc_cf_attrs.json +++ b/miranda/convert/data/eccc-canswe_cf_attrs.json @@ -2,31 +2,32 @@ "Header": { "Conventions": "CF-1.9", "_contact": { - "ec-canswe": "vincent.vionnet@canada.ca" 
+ "eccc-canswe": "vincent.vionnet@canada.ca" }, "_doi": { - "ec-canswe": "10.5281/zenodo.6638382" + "eccc-canswe": "10.5281/zenodo.6638382" }, "_license": { - "ec-canswe": "https://open.canada.ca/en/open-government-licence-canada" + "eccc-canswe": "https://open.canada.ca/en/open-government-licence-canada" }, "_miranda_version": true, "_reference": { - "ec-canswe": "https://zenodo.org/record/6638382" + "eccc-canswe": "https://zenodo.org/record/6638382" }, "_source": { - "ec-canswe": "CanSWE" + "eccc-canswe": "CanSWE" }, "_version": { - "ec-canswe": "v4" + "eccc-canswe": "v4" }, "institution": "GovCan", "license_type": { - "ec-canswe": "permissive" + "eccc-canswe": "permissive" }, "organisation": "ECCC", "processing_level": "raw", "realm": "atmos", + "source": "ECCC-CANSWE", "table_date": "2023-03-23", "table_id": "eccc", "type": "station-obs" diff --git a/miranda/eccc/data/eccc_homogenized_cf_attrs.json b/miranda/convert/data/eccc_homogenized_cf_attrs.json similarity index 96% rename from miranda/eccc/data/eccc_homogenized_cf_attrs.json rename to miranda/convert/data/eccc_homogenized_cf_attrs.json index 92c3b0f1..9eac354f 100644 --- a/miranda/eccc/data/eccc_homogenized_cf_attrs.json +++ b/miranda/convert/data/eccc_homogenized_cf_attrs.json @@ -1,24 +1,28 @@ { "Header": { "Conventions": "CF-1.8", + "_citation": { + "gen2": "Mekis, É and L.A. Vincent, 2011: An overview of the second generation adjusted daily precipitation dataset for trend analysis in Canada. Atmosphere-Ocean 49(2), 163-177 doi:10.1080/07055900.2011.583910", + "gen3": "Vincent, L.A., M.M. Hartwell and X.L. Wang, 2020: A Third Generation of Homogenized Temperature for Trend Analysis and Monitoring Changes in Canada’s Climate. Atmosphere-Ocean. 
https://doi.org/10.1080/07055900.2020.1765728" + }, + "_frequency": true, + "_miranda_version": true, + "_missing_values": [ + "-999", + "1e20" + ], "_product": { "gen2": "ECCC Adjusted and Homogenized Canadian Climate Data (AHCCD) version 2", "gen3": "ECCC Adjusted and Homogenized Canadian Climate Data (AHCCD) version 3" }, - "citation": { - "gen2": "Mekis, É and L.A. Vincent, 2011: An overview of the second generation adjusted daily precipitation dataset for trend analysis in Canada. Atmosphere-Ocean 49(2), 163-177 doi:10.1080/07055900.2011.583910", - "gen3": "Vincent, L.A., M.M. Hartwell and X.L. Wang, 2020: A Third Generation of Homogenized Temperature for Trend Analysis and Monitoring Changes in Canada’s Climate. Atmosphere-Ocean. https://doi.org/10.1080/07055900.2020.1765728" - }, "contact": "info.cccs-ccsc@canada.ca", "documentation": "https://www.canada.ca/en/environment-climate-change/services/climate-change/canadian-centre-climate-services/display-download/technical-documentation-adjusted-climate-data.html", - "float_missing_value": "1e20", - "frequency": "day", "institution": "GovCan", - "int_missing_value": "-999", "license": "https://climate.weather.gc.ca/prods_servs/attachment1_e.html", "license_type": "permissive", "organization": "ECCC", "realm": "atmos", + "source": "AHCCD", "table_date": "2023-03-23", "table_id": "ECCC" }, diff --git a/miranda/eccc/eccc_obs_cf_attrs.json b/miranda/convert/data/eccc_obs_cf_attrs.json similarity index 99% rename from miranda/eccc/eccc_obs_cf_attrs.json rename to miranda/convert/data/eccc_obs_cf_attrs.json index 570c318c..68b30487 100644 --- a/miranda/eccc/eccc_obs_cf_attrs.json +++ b/miranda/convert/data/eccc_obs_cf_attrs.json @@ -3,17 +3,22 @@ "Conventions": "CF-1.9", "_frequency": true, "_miranda_version": true, + "_missing_flags": "M", + "_missing_values": [ + "-999", + "1e20", + "-9999", + "#####" + ], "contact": "climatcentre-climatecentral@ec.gc.ca", "institution": "GovCan", - "int_missing_value": "-999", 
"license": "https://climate.weather.gc.ca/prods_servs/attachment1_e.html", "license_preamble": "The data is owned by the Government of Canada (Environment and Climate Change Canada), and fall under the licence agreement for use of Environment and Climate Change Canada data.", "license_type": "permissive", - "missing_value": "1e20", "organization": "ECCC", "processing_level": "raw", "realm": "atmos", - "source": "MSC", + "source": "ECCC-OBS", "table_date": "2023-03-23", "type": "station-obs" }, diff --git a/miranda/convert/data/eccc_rdrs_cf_attrs.json b/miranda/convert/data/eccc_rdrs_cf_attrs.json index d3d985b3..9a946194 100644 --- a/miranda/convert/data/eccc_rdrs_cf_attrs.json +++ b/miranda/convert/data/eccc_rdrs_cf_attrs.json @@ -27,6 +27,7 @@ "organisation": "ECCC", "processing_level": "raw", "realm": "atmos", + "source": "RDRS", "table_date": "2023-03-23", "table_id": "eccc", "type": "reconstruction" @@ -46,8 +47,8 @@ "RDRS_v2.1_A_PR0_SFC": { "_cf_variable_name": "pr", "_corrected_units": false, - "_invert_sign": {}, - "_offset_time": {}, + "_invert_sign": false, + "_offset_time": false, "_transformation": { "rdrs-v21": "amount2rate" }, @@ -63,9 +64,9 @@ "_corrected_units": { "rdrs-v21": "degC" }, - "_invert_sign": {}, - "_offset_time": {}, - "_transformation": {}, + "_invert_sign": false, + "_offset_time": false, + "_transformation": false, "cell_methods": "time: point", "long_name": "1.5 metre temperature", "standard_name": "air_temperature", diff --git a/miranda/convert/data/espo-g6-e5l_attrs.json b/miranda/convert/data/espo-g6-e5l_attrs.json index e4e76045..71a2c80a 100644 --- a/miranda/convert/data/espo-g6-e5l_attrs.json +++ b/miranda/convert/data/espo-g6-e5l_attrs.json @@ -14,6 +14,7 @@ "domain": "NAM", "mip_era": "CMIP6", "processing_level": "biasadjusted", + "source": "ESPO-G6-E5L", "table_date": "2023-04-24", "table_id": "ESPO-G6-E5L", "type": "simulation", diff --git a/miranda/convert/data/espo-g6-r2_attrs.json 
b/miranda/convert/data/espo-g6-r2_attrs.json index ad57313f..c0e73f03 100644 --- a/miranda/convert/data/espo-g6-r2_attrs.json +++ b/miranda/convert/data/espo-g6-r2_attrs.json @@ -14,6 +14,7 @@ "domain": "NAM", "mip_era": "CMIP6", "processing_level": "biasadjusted", + "source": "ESPO-G6-R2", "table_date": "2023-04-24", "table_id": "ESPO-G6-R2", "type": "simulation", diff --git a/miranda/convert/data/nasa_cf_attrs.json b/miranda/convert/data/nasa_ag_cf_attrs.json similarity index 100% rename from miranda/convert/data/nasa_cf_attrs.json rename to miranda/convert/data/nasa_ag_cf_attrs.json diff --git a/miranda/convert/data/nex-gddp-cmip6_attrs.json b/miranda/convert/data/nex-gddp-cmip6_attrs.json index a58f29de..2e962b6e 100644 --- a/miranda/convert/data/nex-gddp-cmip6_attrs.json +++ b/miranda/convert/data/nex-gddp-cmip6_attrs.json @@ -12,6 +12,7 @@ "domain": "QC", "mip_era": "CMIP6", "processing_level": "biasadjusted", + "source": "NASA-NEX-GDDP", "table_date": "2023-04-11", "table_id": "NEX-GDDP-CMIP6", "type": "simulation" diff --git a/miranda/convert/eccc.py b/miranda/convert/eccc_canswe.py similarity index 95% rename from miranda/convert/eccc.py rename to miranda/convert/eccc_canswe.py index d3d9ac93..9ed1237b 100644 --- a/miranda/convert/eccc.py +++ b/miranda/convert/eccc_canswe.py @@ -7,7 +7,7 @@ import pandas as pd import xarray as xr -from ._data_corrections import dataset_corrections +from .corrections import dataset_corrections __all__ = ["convert_canswe"] @@ -55,7 +55,7 @@ def parse_desc(desc): ds.snd.attrs["ancillary_variables"] = "data_flag_snd qc_flag_snd" ds.snw.attrs["ancillary_variables"] = "data_flag_snw qc_flag_snw" - ds = dataset_corrections(ds, "ec-canswe") + ds = dataset_corrections(ds, "eccc-canswe") ds.attrs["frequency"] = "day" date = "-".join(ds.indexes["time"][[0, -1]].strftime("%Y%m")) for var in ["snd", "snw"]: diff --git a/miranda/eccc/_raw.py b/miranda/convert/eccc_obs.py similarity index 96% rename from miranda/eccc/_raw.py rename to 
miranda/convert/eccc_obs.py index d4dc98ff..30c6c776 100644 --- a/miranda/eccc/_raw.py +++ b/miranda/convert/eccc_obs.py @@ -1,3 +1,4 @@ +"""Specialized conversion tools for Environment and Climate Change Canada / Meteorological Service of Canada data.""" ###################################################################### # S.Biner, Ouranos, mai 2019 # @@ -36,21 +37,22 @@ from xclim.core.units import convert_units_to from miranda.archive import group_by_length +from miranda.convert import load_json_data_mappings from miranda.scripting import LOGGING_CONFIG from miranda.storage import file_size, report_file_size -from miranda.units import GiB, MiB from miranda.utils import generic_extract_archive -from ._utils import cf_station_metadata - config.dictConfig(LOGGING_CONFIG) __all__ = [ - "aggregate_stations", + "merge_stations", "convert_flat_files", "merge_converted_variables", ] +KiB = int(pow(2, 10)) +MiB = int(pow(2, 20)) +GiB = int(pow(2, 30)) TABLE_DATE = dt.now().strftime("%d %B %Y") @@ -86,7 +88,7 @@ def _remove_duplicates(ds): def _convert_station_file( - fichier: Path, + file: Path, output_path: Path, errored_files: list[Path], mode: str, @@ -117,11 +119,11 @@ def _convert_station_file( missing_values = {-9999, "#####"} with tempfile.TemporaryDirectory() as temp_folder: - if fichier.suffix in [".gz", ".tar", ".zip", ".7z"]: - data_files = generic_extract_archive(fichier, output_dir=temp_folder) + if file.suffix in [".gz", ".tar", ".zip", ".7z"]: + data_files = generic_extract_archive(file, output_dir=temp_folder) else: - data_files = [fichier] - logging.info(f"Processing file: {fichier}.") + data_files = [file] + logging.info(f"Processing file: {file}.") size_limit = 1 * GiB @@ -325,7 +327,7 @@ def _convert_station_file( history = ( f"{dt.now().strftime('%Y-%m-%d %X')} converted from flat station file " - f"(`{fichier.name}`) to n-dimensional array." + f"(`{file.name}`) to n-dimensional array." 
) # TODO: This info should eventually be sourced from a JSON definition @@ -432,8 +434,8 @@ def convert_flat_files( for variable_code in variables: variable_code = str(variable_code).zfill(3) - metadata = cf_station_metadata(variable_code) - nc_name = metadata["nc_name"] + metadata = load_json_data_mappings("eccc-obs")[variable_code] + nc_name = metadata["cf_variable_name"] rep_nc = Path(output_folder).joinpath(nc_name) rep_nc.mkdir(parents=True, exist_ok=True) @@ -477,7 +479,7 @@ def convert_flat_files( logging.warning(f"Process completed in {time.time() - func_time:.2f} seconds") -def aggregate_stations( +def merge_stations( source_files: str | os.PathLike | None = None, output_folder: str | os.PathLike | None = None, time_step: str = None, @@ -525,6 +527,7 @@ def aggregate_stations( pass elif isinstance(variables, (str, int)): variables = [variables] + # TODO: have the variable gathered from a JSON file elif variables is None: if mode == "hourly": @@ -542,8 +545,8 @@ def aggregate_stations( raise NotImplementedError() for variable_code in variables: - info = cf_station_metadata(variable_code) - variable_name = info["nc_name"] + info = load_json_data_mappings("eccc-obs")["variables"][variable_code] + variable_name = info["cf_variable_name"] logging.info(f"Merging `{variable_name}` using `{time_step}` time step.") # Only perform aggregation on available data with corresponding metadata @@ -869,18 +872,18 @@ def merge_converted_variables( Parameters ---------- - source_files: str, Path - output_folder: str, Path - variables: str or int or list of str or int, optional - station_metadata: str or Path, optional - overwrite: bool - n_workers: int + source_files : str, Path + output_folder : str, Path + variables : str or int or list of str or int, optional + station_metadata : str or Path, optional + overwrite : bool + n_workers : int Returns ------- None """ - meta = load_station_metadata(station_metadata) + meta = load_json_data_mappings("eccc-obs") metadata_file = 
Path(tempfile.NamedTemporaryFile(suffix=".nc", delete=False).name) meta.to_netcdf(metadata_file) @@ -894,7 +897,7 @@ def merge_converted_variables( if not isinstance(variables, list): variables = [variables] for var in variables: - selected_variables.append(cf_station_metadata(var)) + selected_variables.append(meta[var]) variables_found = [x.name for x in source_files.iterdir() if x.is_dir()] if selected_variables: diff --git a/miranda/convert/eccc_rdrs.py b/miranda/convert/eccc_rdrs.py index fa206746..ceff1654 100644 --- a/miranda/convert/eccc_rdrs.py +++ b/miranda/convert/eccc_rdrs.py @@ -13,8 +13,12 @@ from miranda.units import get_time_frequency from ._aggregation import aggregate -from ._data_corrections import dataset_conversion, load_json_data_mappings -from ._data_definitions import gather_raw_rdrs_by_years, gather_rdrs +from ._data_definitions import ( + gather_eccc_rdrs, + gather_raw_rdrs_by_years, + load_json_data_mappings, +) +from .corrections import dataset_conversion logging.config.dictConfig(LOGGING_CONFIG) @@ -159,7 +163,7 @@ def rdrs_to_daily( working_folder = Path(working_folder).expanduser() # GATHER ALL RDRS FILES - gathered = gather_rdrs(project, input_folder, "zarr", "cf") + gathered = gather_eccc_rdrs(project, input_folder, "zarr", "cf") files = gathered["rdrs-v21"] # noqa if process_variables: for vv in [f for f in files.keys() if f not in process_variables]: diff --git a/miranda/convert/melcc.py b/miranda/convert/melcc.py index e7619848..dba999dc 100644 --- a/miranda/convert/melcc.py +++ b/miranda/convert/melcc.py @@ -21,13 +21,11 @@ from xclim.core.units import convert_units_to, pint_multiply, str2pint from miranda import __version__ +from miranda.convert._data_definitions import load_json_data_mappings +from miranda.convert.corrections import dataset_corrections from miranda.scripting import LOGGING_CONFIG -from ._data_corrections import ( - dataset_corrections, - load_json_data_mappings, - metadata_conversion, -) +from ._treatments 
import metadata_conversion logging.config.dictConfig(LOGGING_CONFIG) logger = logging.getLogger(__name__) diff --git a/miranda/eccc/__init__.py b/miranda/eccc/__init__.py index 3fe8fdd3..781076e0 100644 --- a/miranda/eccc/__init__.py +++ b/miranda/eccc/__init__.py @@ -2,5 +2,4 @@ from __future__ import annotations from ._homogenized import * -from ._raw import * from ._summaries import * diff --git a/miranda/eccc/convert.py b/miranda/eccc/convert.py index cc86cfe2..eb94be48 100644 --- a/miranda/eccc/convert.py +++ b/miranda/eccc/convert.py @@ -10,7 +10,7 @@ from functools import partial from pathlib import Path -from miranda.eccc._raw import _convert_station_file +from miranda.convert.eccc_obs import _convert_station_file from miranda.eccc._utils import cf_station_metadata from miranda.scripting import LOGGING_CONFIG diff --git a/templates/eccc_raw_hourly_conversion.py b/templates/eccc_raw_hourly_conversion.py index 0849e182..fcb47b2f 100644 --- a/templates/eccc_raw_hourly_conversion.py +++ b/templates/eccc_raw_hourly_conversion.py @@ -1,8 +1,8 @@ from os import getenv from pathlib import Path -from miranda.eccc import ( - aggregate_stations, +from miranda.convert.eccc_obs import ( + merge_stations, convert_flat_files, merge_converted_variables, ) @@ -76,7 +76,7 @@ n_workers=n_workers, ) - aggregate_stations( + merge_stations( source_files=merged, output_folder=final, time_step=time_step, diff --git a/templates/emdna_processing.py b/templates/emdna_processing.py index 194526fc..3ee9eb07 100644 --- a/templates/emdna_processing.py +++ b/templates/emdna_processing.py @@ -4,6 +4,7 @@ from dask.diagnostics import ProgressBar +import miranda.convert.corrections from miranda import convert, io, structure @@ -23,7 +24,7 @@ def main(): files_by_member = convert.gather_emdna(path) for member, files in files_by_member.items(): if member == "OI": - ds = convert.dataset_conversion( + ds = miranda.convert.corrections.dataset_conversion( files, project="EMDNA", 
preprocess=preprocess_dna ) diff --git a/templates/era5-land_reanalysis_processing.py b/templates/era5-land_reanalysis_processing.py index c58aa430..3fc27945 100644 --- a/templates/era5-land_reanalysis_processing.py +++ b/templates/era5-land_reanalysis_processing.py @@ -1,5 +1,6 @@ from pathlib import Path +import miranda.convert.corrections from miranda import convert, io @@ -7,7 +8,7 @@ def main(): path_era5_land_out = Path("~/Desktop").expanduser() era5_land_files = convert.gather_ecmwf("era5-land", path_era5_land_out) - ds = convert.dataset_conversion( + ds = miranda.convert.corrections.dataset_conversion( era5_land_files, project="era5-land-monthly-means", ) diff --git a/templates/espo-g6.py b/templates/espo-g6.py index ac43168f..ea9a3e4d 100644 --- a/templates/espo-g6.py +++ b/templates/espo-g6.py @@ -4,6 +4,7 @@ from dask.diagnostics import ProgressBar +import miranda.convert.corrections from miranda import convert, io, structure from miranda.decode import Decoder @@ -42,7 +43,7 @@ def main(): ) if not os.path.exists(new_path): # and path not in skip: # open as dataset - ds = convert.dataset_conversion( + ds = miranda.convert.corrections.dataset_conversion( [f], add_version_hashes=False, project=project, diff --git a/templates/nasa_nex-gddp-cmip6_processing.py b/templates/nasa_nex-gddp-cmip6_processing.py index 78f51687..3fb7572b 100644 --- a/templates/nasa_nex-gddp-cmip6_processing.py +++ b/templates/nasa_nex-gddp-cmip6_processing.py @@ -1,5 +1,6 @@ from pathlib import Path +import miranda.convert.corrections from miranda import convert, io @@ -10,7 +11,7 @@ def main(): for path, list_files in nex_files.items(): # open as dataset - ds = convert.dataset_conversion( + ds = miranda.convert.corrections.dataset_conversion( list_files, add_version_hashes=False, project="NEX-GDDP-CMIP6", From d63523bbf9353a1f6d02ae13f8d8434febd9a127 Mon Sep 17 00:00:00 2001 From: Trevor James Smith <10819524+Zeitsperre@users.noreply.github.com> Date: Wed, 21 Jun 2023 15:36:19 
-0400 Subject: [PATCH 04/33] broken - refactoring of station writer --- miranda/convert/eccc_obs.py | 522 +++++++++++++++++------------------- miranda/eccc/geomet.py | 29 ++ 2 files changed, 281 insertions(+), 270 deletions(-) create mode 100644 miranda/eccc/geomet.py diff --git a/miranda/convert/eccc_obs.py b/miranda/convert/eccc_obs.py index 30c6c776..76f9146b 100644 --- a/miranda/convert/eccc_obs.py +++ b/miranda/convert/eccc_obs.py @@ -27,7 +27,6 @@ from datetime import datetime as dt from logging import config from pathlib import Path -from urllib.error import HTTPError import dask.dataframe as dd import numpy as np @@ -56,26 +55,19 @@ TABLE_DATE = dt.now().strftime("%d %B %Y") -def load_station_metadata(meta: str | os.PathLike) -> xr.Dataset: - if meta: - df_inv = pd.read_csv(meta, header=0) - else: - try: - import geopandas as gpd +def fwf_column_definitions(time_frequency: str): + """Return the column widths for the fixed-width format.""" - station_metadata_url = "https://api.weather.gc.ca/collections/climate-stations/items?f=json&limit=15000000" - df_inv = gpd.read_file(station_metadata_url) - except HTTPError as err: - raise RuntimeError( - f"Station metadata table unable to be fetched. 
Considering downloading directly: {err}" - ) - df_inv["LONGITUDE"] = df_inv.geometry.x - df_inv["LATITUDE"] = df_inv.geometry.y - df_inv["ELEVATION"] = df_inv.ELEVATION.astype(float) - df_inv["CLIMATE_IDENTIFIER"] = df_inv["CLIMATE_IDENTIFIER"].astype(str) + if time_frequency.lower() in ["h", "hour", "hourly"]: + num_observations = 24 + column_widths = [7, 4, 2, 2, 3] + [6, 1] * num_observations + elif time_frequency.lower() in ["d", "day", "daily"]: + num_observations = 31 + column_widths = [7, 4, 2, 3] + [6, 1] * num_observations + else: + raise NotImplementedError("`mode` must be 'h'/'hourly or 'd'/'daily'.") - df_inv = df_inv.drop(["geometry"], axis=1) - return df_inv.to_xarray() + return column_widths def _remove_duplicates(ds): @@ -87,6 +79,241 @@ def _remove_duplicates(ds): return ds.sel(time=~ds.get_index("time").duplicated()) +def convert_station( + data: str | os.PathLike, mode: str, using_dask_array: bool = False, **kwargs +): + column_widths = fwf_column_definitions(mode) + + if using_dask_array: + pandas_reader = dd + chunks = dict(blocksize=200 * MiB) + else: + pandas_reader = pd + chunks = dict() + using_dask_array = False + + # Create a dataframe from the files + try: + df = pandas_reader.read_fwf( + data, + widths=column_widths, + names=column_names, + dtype={ + name: data_type for name, data_type in zip(column_names, column_dtypes) + }, + assume_missing=True, + **chunks, + ) + if using_dask_array: + df = c.persist(df) + + except FileNotFoundError: + logging.error(f"File {data} was not found.") + errored_files.append(data) + return + + except (UnicodeDecodeError, Exception) as e: + logging.error( + f"File {data.name} was unable to be read. 
" + f"This is probably an issue with the file: {e}" + ) + errored_files.append(data) + return + + # Loop through the station codes + station_codes = df["code"].unique() + for code in station_codes: + df_code = df[df["code"] == code] + + # Abort if the variable is not found + if using_dask_array: + has_variable_codes = ( + (df_code["code_var"] == variable_code).compute() + ).any() + else: + has_variable_codes = (df_code["code_var"] == variable_code).any() + if not has_variable_codes: + logging.info( + f"Variable `{nc_name}` not found for station code: {code} in file {data}. Continuing..." + ) + continue + + # Perform the data treatment + logging.info(f"Converting `{nc_name}` for station code: {code}") + + # Dump the data into a DataFrame + df_var = df_code[df_code["code_var"] == variable_code].copy() + + # Mask the data according to the missing values flag + df_var = df_var.replace(missing_values, np.nan) + + # Decode the values and flags + dfd = df_var.loc[:, [f"D{i:0n}" for i in range(1, num_observations + 1)]] + dff = df_var.loc[:, [f"F{i:0n}" for i in range(1, num_observations + 1)]] + + # Remove the "NaN" flag + dff = dff.fillna("") + + # Use the flag to mask the values + try: + val = np.asarray(dfd.values, float) + except ValueError as e: + logging.error(f"{e} raised from {dfd}, continuing...") + continue + try: + flag = np.asarray(dff.values, str) + except ValueError as e: + logging.error(f"{e} raised from {dff}, continuing...") + continue + mask = np.isin(flag, missing_flags) + val[mask] = np.nan + + # Treat according to units conversions + val = val * scale_factor + add_offset + + # Create the DataArray + date_summations = dict(time=list()) + if mode == "hourly": + for index, row in df_var.iterrows(): + period = pd.Period( + year=row.year, month=row.month, day=row.day, freq="D" + ) + dates = pd.Series( + pd.date_range( + start=period.start_time, + end=period.end_time, + freq="H", + ) + ) + date_summations["time"].extend(dates) + written_values = 
val.flatten() + written_flags = flag.flatten() + elif mode == "daily": + value_days = list() + flag_days = list() + for i, (index, row) in enumerate(df_var.iterrows()): + period = pd.Period(year=row.year, month=row.month, freq="M") + dates = pd.Series( + pd.date_range( + start=period.start_time, + end=period.end_time, + freq="D", + ) + ) + date_summations["time"].extend(dates) + + value_days.extend( + val[i][range(monthrange(int(row.year), int(row.month))[1])] + ) + flag_days.extend( + flag[i][range(monthrange(int(row.year), int(row.month))[1])] + ) + written_values = value_days + written_flags = flag_days + + ds = xr.Dataset() + da_val = xr.DataArray(written_values, coords=date_summations, dims=["time"]) + + if raw_units != units: + da_val.attrs["units"] = raw_units + da_val = convert_units_to(da_val, units) + else: + da_val.attrs["units"] = units + + da_val = da_val.rename(nc_name) + variable_attributes = dict( + variable_code=variable_code, + standard_name=standard_name, + long_name=long_name, + ) + if "original_units" in kwargs: + variable_attributes["original_units"] = kwargs["original_units"] + da_val.attrs.update(variable_attributes) + + da_flag = xr.DataArray(written_flags, coords=date_summations, dims=["time"]) + da_flag = da_flag.rename("flag") + flag_attributes = dict( + long_name="data flag", + note="See ECCC technical documentation for details", + ) + da_flag.attrs.update(flag_attributes) + + ds[nc_name] = da_val + ds["flag"] = da_flag + + # save the file in NetCDF format + start_year = ds.time.dt.year.values[0] + end_year = ds.time.dt.year.values[-1] + + station_folder = output_path.joinpath(str(code)) + station_folder.mkdir(parents=True, exist_ok=True) + + f_nc = ( + f"{code}_{variable_code}_{nc_name}_" + f"{start_year if start_year == end_year else '_'.join([str(start_year), str(end_year)])}.nc" + ) + + if station_folder.joinpath(f_nc).exists(): + logging.warning(f"File `{f_nc}` already exists. 
Continuing...") + + history = ( + f"{dt.now().strftime('%Y-%m-%d %X')} converted from flat station file " + f"(`{file.name}`) to n-dimensional array." + ) + + # TODO: This info should eventually be sourced from a JSON definition + global_attrs = dict( + Conventions="CF-1.8", + comment="Acquired on demand from data specialists at " + "ECCC Climate Services / Services Climatiques.", + contact="John Richard", + contact_email="climatcentre-climatecentral@ec.gc.ca", + domain="CAN", + ) + if mode == "hourly": + global_attrs.update(dict(frequency="1hr")) + elif mode == "daily": + global_attrs.update(dict(frequency="day")) + global_attrs.update( + dict( + history=history, + internal_comment=f"Converted by {os.environ.get('USER', os.environ.get('USERNAME'))}.", + institution="ECCC", + license="https://climate.weather.gc.ca/prods_servs/attachment1_e.html", + member=code, + processing_level="raw", + redistribution="Redistribution permitted.", + references="https://climate.weather.gc.ca/doc/Technical_Documentation.pdf", + source="historical-station-records", + table_date=TABLE_DATE, + title="Environment and Climate Change Canada (ECCC) weather station observations", + type="station-obs", + usage="The original data is owned by the Government of Canada (Environment and Climate " + "Change Canada), and falls under the licence agreement for use of Environment and " + "Climate Change Canada data", + variable=str(nc_name), + version=f"v{dt.now().strftime('%Y.%m.%V')}", # Year.Month.Week + ) + ) + ds.attrs.update(global_attrs) + + logging.info(f"Exporting to: {station_folder.joinpath(f_nc)}") + ds.to_netcdf(station_folder.joinpath(f_nc)) + del ds + del val + del mask + del flag + del da_val + del da_flag + del dfd + del dff + del written_values + del written_flags + del date_summations + + del df + + def _convert_station_file( file: Path, output_path: Path, @@ -104,16 +331,9 @@ def _convert_station_file( scale_factor: float, standard_name: str, variable_code: str, - **kwargs, + 
**dask_kwargs, ): - if mode.lower() in ["h", "hour", "hourly"]: - num_observations = 24 - column_widths = [7, 4, 2, 2, 3] + [6, 1] * num_observations - elif mode.lower() in ["d", "day", "daily"]: - num_observations = 31 - column_widths = [7, 4, 2, 3] + [6, 1] * num_observations - else: - raise NotImplementedError("`mode` must be 'h'/'hourly or 'd'/'daily'.") + column_widths = fwf_column_definitions(mode) if not missing_values: missing_values = {-9999, "#####"} @@ -132,255 +352,17 @@ def _convert_station_file( logging.info( f"File exceeds {report_file_size(size_limit)} - Using dask.dataframes." ) - pandas_reader = dd - using_dask_array = True - chunks = dict(blocksize=200 * MiB) client = ProgressBar + using_dask = True else: logging.info( f"File below {report_file_size(size_limit)} - Using pandas.dataframes." ) - pandas_reader = pd - chunks = dict() - using_dask_array = False client = contextlib.nullcontext + using_dask = False - with client() as c: - # Create a dataframe from the files - try: - df = pandas_reader.read_fwf( - data, - widths=column_widths, - names=column_names, - dtype={ - name: data_type - for name, data_type in zip(column_names, column_dtypes) - }, - assume_missing=True, - **chunks, - ) - if using_dask_array: - df = c.persist(df) - - except FileNotFoundError: - logging.error(f"File {data} was not found.") - errored_files.append(data) - return - - except (UnicodeDecodeError, Exception) as e: - logging.error( - f"File {data.name} was unable to be read. 
" - f"This is probably an issue with the file: {e}" - ) - errored_files.append(data) - return - - # Loop through the station codes - station_codes = df["code"].unique() - for code in station_codes: - df_code = df[df["code"] == code] - - # Abort if the variable is not found - if using_dask_array: - has_variable_codes = ( - (df_code["code_var"] == variable_code).compute() - ).any() - else: - has_variable_codes = ( - df_code["code_var"] == variable_code - ).any() - if not has_variable_codes: - logging.info( - f"Variable `{nc_name}` not found for station code: {code} in file {data}. Continuing..." - ) - continue - - # Perform the data treatment - logging.info(f"Converting `{nc_name}` for station code: {code}") - - # Dump the data into a DataFrame - df_var = df_code[df_code["code_var"] == variable_code].copy() - - # Mask the data according to the missing values flag - df_var = df_var.replace(missing_values, np.nan) - - # Decode the values and flags - dfd = df_var.loc[ - :, [f"D{i:0n}" for i in range(1, num_observations + 1)] - ] - dff = df_var.loc[ - :, [f"F{i:0n}" for i in range(1, num_observations + 1)] - ] - - # Remove the "NaN" flag - dff = dff.fillna("") - - # Use the flag to mask the values - try: - val = np.asarray(dfd.values, float) - except ValueError as e: - logging.error(f"{e} raised from {dfd}, continuing...") - continue - try: - flag = np.asarray(dff.values, str) - except ValueError as e: - logging.error(f"{e} raised from {dff}, continuing...") - continue - mask = np.isin(flag, missing_flags) - val[mask] = np.nan - - # Treat according to units conversions - val = val * scale_factor + add_offset - - # Create the DataArray - date_summations = dict(time=list()) - if mode == "hourly": - for index, row in df_var.iterrows(): - period = pd.Period( - year=row.year, month=row.month, day=row.day, freq="D" - ) - dates = pd.Series( - pd.date_range( - start=period.start_time, - end=period.end_time, - freq="H", - ) - ) - date_summations["time"].extend(dates) - 
written_values = val.flatten() - written_flags = flag.flatten() - elif mode == "daily": - value_days = list() - flag_days = list() - for i, (index, row) in enumerate(df_var.iterrows()): - period = pd.Period(year=row.year, month=row.month, freq="M") - dates = pd.Series( - pd.date_range( - start=period.start_time, - end=period.end_time, - freq="D", - ) - ) - date_summations["time"].extend(dates) - - value_days.extend( - val[i][ - range(monthrange(int(row.year), int(row.month))[1]) - ] - ) - flag_days.extend( - flag[i][ - range(monthrange(int(row.year), int(row.month))[1]) - ] - ) - written_values = value_days - written_flags = flag_days - - ds = xr.Dataset() - da_val = xr.DataArray( - written_values, coords=date_summations, dims=["time"] - ) - - if raw_units != units: - da_val.attrs["units"] = raw_units - da_val = convert_units_to(da_val, units) - else: - da_val.attrs["units"] = units - - da_val = da_val.rename(nc_name) - variable_attributes = dict( - variable_code=variable_code, - standard_name=standard_name, - long_name=long_name, - ) - if "original_units" in kwargs: - variable_attributes["original_units"] = kwargs["original_units"] - da_val.attrs.update(variable_attributes) - - da_flag = xr.DataArray( - written_flags, coords=date_summations, dims=["time"] - ) - da_flag = da_flag.rename("flag") - flag_attributes = dict( - long_name="data flag", - note="See ECCC technical documentation for details", - ) - da_flag.attrs.update(flag_attributes) - - ds[nc_name] = da_val - ds["flag"] = da_flag - - # save the file in NetCDF format - start_year = ds.time.dt.year.values[0] - end_year = ds.time.dt.year.values[-1] - - station_folder = output_path.joinpath(str(code)) - station_folder.mkdir(parents=True, exist_ok=True) - - f_nc = ( - f"{code}_{variable_code}_{nc_name}_" - f"{start_year if start_year == end_year else '_'.join([str(start_year), str(end_year)])}.nc" - ) - - if station_folder.joinpath(f_nc).exists(): - logging.warning(f"File `{f_nc}` already exists. 
Continuing...") - - history = ( - f"{dt.now().strftime('%Y-%m-%d %X')} converted from flat station file " - f"(`{file.name}`) to n-dimensional array." - ) - - # TODO: This info should eventually be sourced from a JSON definition - global_attrs = dict( - Conventions="CF-1.8", - comment="Acquired on demand from data specialists at " - "ECCC Climate Services / Services Climatiques.", - contact="John Richard", - contact_email="climatcentre-climatecentral@ec.gc.ca", - domain="CAN", - ) - if mode == "hourly": - global_attrs.update(dict(frequency="1hr")) - elif mode == "daily": - global_attrs.update(dict(frequency="day")) - global_attrs.update( - dict( - history=history, - internal_comment=f"Converted by {os.environ.get('USER', os.environ.get('USERNAME'))}.", - institution="ECCC", - license="https://climate.weather.gc.ca/prods_servs/attachment1_e.html", - member=code, - processing_level="raw", - redistribution="Redistribution permitted.", - references="https://climate.weather.gc.ca/doc/Technical_Documentation.pdf", - source="historical-station-records", - table_date=TABLE_DATE, - title="Environment and Climate Change Canada (ECCC) weather station observations", - type="station-obs", - usage="The original data is owned by the Government of Canada (Environment and Climate " - "Change Canada), and falls under the licence agreement for use of Environment and " - "Climate Change Canada data", - variable=str(nc_name), - version=f"v{dt.now().strftime('%Y.%m.%V')}", # Year.Month.Week - ) - ) - ds.attrs.update(global_attrs) - - logging.info(f"Exporting to: {station_folder.joinpath(f_nc)}") - ds.to_netcdf(station_folder.joinpath(f_nc)) - del ds - del val - del mask - del flag - del da_val - del da_flag - del dfd - del dff - del written_values - del written_flags - del date_summations - - del df + with client(**dask_kwargs) as c: + convert_station(data, mode, using_dask=using_dask) if os.listdir(temp_folder): for temporary_file in Path(temp_folder).glob("*"): diff --git 
a/miranda/eccc/geomet.py b/miranda/eccc/geomet.py new file mode 100644 index 00000000..3fb00d72 --- /dev/null +++ b/miranda/eccc/geomet.py @@ -0,0 +1,29 @@ +from __future__ import annotations + +import os +from urllib.error import HTTPError + +import pandas as pd +import xarray as xr + + +def load_station_metadata(meta: str | os.PathLike) -> xr.Dataset: + if meta: + df_inv = pd.read_csv(meta, header=0) + else: + try: + import geopandas as gpd + + station_metadata_url = "https://api.weather.gc.ca/collections/climate-stations/items?f=json&limit=15000000" + df_inv = gpd.read_file(station_metadata_url) + except HTTPError as err: + raise RuntimeError( + f"Station metadata table unable to be fetched. Considering downloading directly: {err}" + ) + df_inv["LONGITUDE"] = df_inv.geometry.x + df_inv["LATITUDE"] = df_inv.geometry.y + df_inv["ELEVATION"] = df_inv.ELEVATION.astype(float) + df_inv["CLIMATE_IDENTIFIER"] = df_inv["CLIMATE_IDENTIFIER"].astype(str) + + df_inv = df_inv.drop(["geometry"], axis=1) + return df_inv.to_xarray() From 500ae2ad5cf751b17a5374fee8ec68a59f9b224f Mon Sep 17 00:00:00 2001 From: Trevor James Smith <10819524+Zeitsperre@users.noreply.github.com> Date: Thu, 6 Jul 2023 14:53:53 -0400 Subject: [PATCH 05/33] broken - more refactoring --- miranda/convert/eccc_obs.py | 57 ++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 33 deletions(-) diff --git a/miranda/convert/eccc_obs.py b/miranda/convert/eccc_obs.py index 76f9146b..32e2c3e1 100644 --- a/miranda/convert/eccc_obs.py +++ b/miranda/convert/eccc_obs.py @@ -27,6 +27,7 @@ from datetime import datetime as dt from logging import config from pathlib import Path +from typing import List, Tuple, Union, Type, Any import dask.dataframe as dd import numpy as np @@ -55,19 +56,23 @@ TABLE_DATE = dt.now().strftime("%d %B %Y") -def fwf_column_definitions(time_frequency: str): - """Return the column widths for the fixed-width format.""" +def _fwf_column_definitions(time_frequency: str) -> 
Tuple[List[str], List[int], List[Type[Union[str, int]]]]: + """Return the column names, widths, and data types for the fixed-width format.""" if time_frequency.lower() in ["h", "hour", "hourly"]: num_observations = 24 + column_names = ["code", "year", "month", "day", "code_var"] column_widths = [7, 4, 2, 2, 3] + [6, 1] * num_observations + column_dtypes = [str, int, int, int, str] elif time_frequency.lower() in ["d", "day", "daily"]: num_observations = 31 + column_names = ["code", "year", "month", "code_var"] column_widths = [7, 4, 2, 3] + [6, 1] * num_observations + column_dtypes = [str, int, int, str] else: raise NotImplementedError("`mode` must be 'h'/'hourly or 'd'/'daily'.") - return column_widths + return column_names, column_widths, column_dtypes def _remove_duplicates(ds): @@ -80,9 +85,10 @@ def _remove_duplicates(ds): def convert_station( - data: str | os.PathLike, mode: str, using_dask_array: bool = False, **kwargs + data: str | os.PathLike, mode: str, using_dask_array: bool = False, *, client: Any, **kwargs ): - column_widths = fwf_column_definitions(mode) + data = Path(data) + column_names, column_widths, column_dtypes = _fwf_column_definitions(mode) if using_dask_array: pandas_reader = dd @@ -105,20 +111,17 @@ def convert_station( **chunks, ) if using_dask_array: - df = c.persist(df) + df = client.persist(df) - except FileNotFoundError: - logging.error(f"File {data} was not found.") - errored_files.append(data) - return + except FileNotFoundError as e: + msg = f"File {data} was not found: {e}" + logging.error(msg) + raise FileNotFoundError(msg) - except (UnicodeDecodeError, Exception) as e: - logging.error( - f"File {data.name} was unable to be read. " - f"This is probably an issue with the file: {e}" - ) - errored_files.append(data) - return + except UnicodeDecodeError as e: + msg = f"File {data.name} was unable to be read. 
This is probably an issue with the file: {e}" + logging.error(msg) + raise UnicodeDecodeError(msg) # Loop through the station codes station_codes = df["code"].unique() @@ -320,8 +323,6 @@ def _convert_station_file( errored_files: list[Path], mode: str, add_offset: float, - column_dtypes: list[str], - column_names: list[str], long_name: str, missing_flags: set[str], missing_values: set[str], @@ -333,8 +334,6 @@ def _convert_station_file( variable_code: str, **dask_kwargs, ): - column_widths = fwf_column_definitions(mode) - if not missing_values: missing_values = {-9999, "#####"} @@ -362,7 +361,10 @@ def _convert_station_file( using_dask = False with client(**dask_kwargs) as c: - convert_station(data, mode, using_dask=using_dask) + try: + convert_station(data, mode, using_dask=using_dask) + except FileNotFoundError: + errored_files.append(data) if os.listdir(temp_folder): for temporary_file in Path(temp_folder).glob("*"): @@ -393,17 +395,6 @@ def convert_flat_files( """ func_time = time.time() - if mode.lower() in ["h", "hour", "hourly"]: - num_observations = 24 - column_names = ["code", "year", "month", "day", "code_var"] - column_dtypes = [str, float, float, float, str] - elif mode.lower() in ["d", "day", "daily"]: - num_observations = 31 - column_names = ["code", "year", "month", "code_var"] - column_dtypes = [str, float, float, str] - else: - raise NotImplementedError("`mode` must be 'h'/'hourly or 'd'/'daily'.") - # Preparing the data column headers for i in range(1, num_observations + 1): data_entry, flag_entry = f"D{i:0n}", f"F{i:0n}" From c781287840afd3d8cc6d055fc1af95fec93fd751 Mon Sep 17 00:00:00 2001 From: Trevor James Smith <10819524+Zeitsperre@users.noreply.github.com> Date: Thu, 6 Jul 2023 17:56:45 -0400 Subject: [PATCH 06/33] broken - more refactoring --- miranda/convert/eccc_obs.py | 41 ++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/miranda/convert/eccc_obs.py b/miranda/convert/eccc_obs.py index 
32e2c3e1..fd827d2f 100644 --- a/miranda/convert/eccc_obs.py +++ b/miranda/convert/eccc_obs.py @@ -27,7 +27,7 @@ from datetime import datetime as dt from logging import config from pathlib import Path -from typing import List, Tuple, Union, Type, Any +from typing import Any, List import dask.dataframe as dd import numpy as np @@ -56,22 +56,33 @@ TABLE_DATE = dt.now().strftime("%d %B %Y") -def _fwf_column_definitions(time_frequency: str) -> Tuple[List[str], List[int], List[Type[Union[str, int]]]]: +def _fwf_column_definitions( + time_frequency: str, +) -> tuple[list[str], list[int], list[type[str | int]]]: """Return the column names, widths, and data types for the fixed-width format.""" + # Preparing the column headers if time_frequency.lower() in ["h", "hour", "hourly"]: num_observations = 24 column_names = ["code", "year", "month", "day", "code_var"] - column_widths = [7, 4, 2, 2, 3] + [6, 1] * num_observations + column_widths = [7, 4, 2, 2, 3] column_dtypes = [str, int, int, int, str] elif time_frequency.lower() in ["d", "day", "daily"]: num_observations = 31 column_names = ["code", "year", "month", "code_var"] - column_widths = [7, 4, 2, 3] + [6, 1] * num_observations + column_widths = [7, 4, 2, 3] column_dtypes = [str, int, int, str] else: raise NotImplementedError("`mode` must be 'h'/'hourly or 'd'/'daily'.") + # Add the data columns + for i in range(1, num_observations + 1): + data_entry, flag_entry = f"D{i:0n}", f"F{i:0n}" + column_names.append(data_entry) + column_names.append(flag_entry) + column_widths.extend([6, 1] * num_observations) + column_dtypes.extend([str, str]) + return column_names, column_widths, column_dtypes @@ -85,7 +96,12 @@ def _remove_duplicates(ds): def convert_station( - data: str | os.PathLike, mode: str, using_dask_array: bool = False, *, client: Any, **kwargs + data: str | os.PathLike, + mode: str, + using_dask_array: bool = False, + *, + client: Any, + **kwargs, ): data = Path(data) column_names, column_widths, column_dtypes = 
_fwf_column_definitions(mode) @@ -362,7 +378,7 @@ def _convert_station_file( with client(**dask_kwargs) as c: try: - convert_station(data, mode, using_dask=using_dask) + convert_station(data, mode, using_dask=using_dask, client=c) except FileNotFoundError: errored_files.append(data) @@ -393,15 +409,6 @@ def convert_flat_files( ------- None """ - func_time = time.time() - - # Preparing the data column headers - for i in range(1, num_observations + 1): - data_entry, flag_entry = f"D{i:0n}", f"F{i:0n}" - column_names.append(data_entry) - column_names.append(flag_entry) - column_dtypes.extend([str, str]) - if isinstance(variables, (str, int)): variables = [variables] @@ -435,8 +442,6 @@ def convert_flat_files( errored_files=errored_files, mode=mode, variable_code=variable_code, - column_names=column_names, - column_dtypes=column_dtypes, **metadata, ) with mp.Pool(processes=n_workers) as pool: @@ -449,8 +454,6 @@ def convert_flat_files( "Some files failed to be properly parsed:\n", ", ".join(errored_files) ) - logging.warning(f"Process completed in {time.time() - func_time:.2f} seconds") - def merge_stations( source_files: str | os.PathLike | None = None, From ce2f6d92985e1c0dd244d7c98a10dc0caffb995e Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Wed, 2 Aug 2023 16:58:48 -0400 Subject: [PATCH 07/33] significant refactoring - WIP * moved many eccc conversion functions to new preprocessing module * created new vocabularies module for CV support of multiple projects * json mappings are now configs * CF conversion will be handled by convert only - WIP --- miranda/__init__.py | 3 +- miranda/convert/__init__.py | 2 +- miranda/convert/_data_definitions.py | 60 +++--- .../{data => configs}/cmip_ouranos_attrs.json | 0 .../{data => configs}/deh_cf_attrs.json | 0 .../eccc-canswe_cf_attrs.json | 0 .../eccc-homogenized_cf_attrs.json} | 1 + .../eccc-obs_cf_attrs.json} | 1 - .../eccc-rdrs_cf_attrs.json} | 0 .../{data => 
configs}/ecmwf_cf_attrs.json | 0 .../{data => configs}/emdna_cf_attrs.json | 0 .../{data => configs}/espo-g6-e5l_attrs.json | 0 .../{data => configs}/espo-g6-r2_attrs.json | 0 .../{data => configs}/ets_grnch_cf_attrs.json | 0 .../{data => configs}/hq_cf_attrs.json | 0 .../{data => configs}/melcc_cf_attrs.json | 0 .../{data => configs}/nasa_ag_cf_attrs.json | 0 .../nex-gddp-cmip6_attrs.json | 0 .../{data => configs}/rvt_raven_attrs.json | 0 .../{data => configs}/usask_cf_attrs.json | 0 miranda/convert/deh.py | 5 +- miranda/convert/hq.py | 6 +- miranda/decode/_decoder.py | 4 +- miranda/eccc/__init__.py | 3 - miranda/eccc/_utils.py | 166 +---------------- miranda/eccc/eccc_homogenized_cf_attrs.json | 111 ----------- miranda/eccc/eccc_obs_summary_cf_attrs.json | 173 ------------------ miranda/eccc/geomet.py | 4 +- miranda/io/_rechunk.py | 2 +- miranda/io/utils.py | 2 +- miranda/preprocess/__init__.py | 1 + miranda/preprocess/_data_definitions.py | 37 ++++ .../_eccc_homogenized.py} | 115 +++++++++++- .../eccc_obs.py => preprocess/_eccc_obs.py} | 8 +- .../_eccc_summaries.py} | 2 +- miranda/preprocess/_treatments.py | 43 +++++ .../configs/eccc-homogenized_attrs.json | 36 ++++ .../configs/eccc-obs-summary_cf_attrs.json} | 0 .../preprocess/configs/eccc-obs_attrs.json | 29 +++ .../{eccc/convert.py => preprocess/eccc.py} | 13 +- .../ecmwf.py => preprocess/ecmwf_tigge.py} | 0 miranda/structure/_structure.py | 6 +- miranda/validators.py | 4 +- miranda/vocabularies/__init__.py | 1 + miranda/vocabularies/eccc.py | 88 +++++++++ miranda/{cv.py => vocabularies/esgf.py} | 2 +- pyproject.toml | 6 +- templates/eccc_raw_hourly_conversion.py | 2 +- templates/eccc_rdrs_processing.py | 2 +- 49 files changed, 425 insertions(+), 513 deletions(-) rename miranda/convert/{data => configs}/cmip_ouranos_attrs.json (100%) rename miranda/convert/{data => configs}/deh_cf_attrs.json (100%) rename miranda/convert/{data => configs}/eccc-canswe_cf_attrs.json (100%) rename 
miranda/convert/{data/eccc_homogenized_cf_attrs.json => configs/eccc-homogenized_cf_attrs.json} (99%) rename miranda/convert/{data/eccc_obs_cf_attrs.json => configs/eccc-obs_cf_attrs.json} (99%) rename miranda/convert/{data/eccc_rdrs_cf_attrs.json => configs/eccc-rdrs_cf_attrs.json} (100%) rename miranda/convert/{data => configs}/ecmwf_cf_attrs.json (100%) rename miranda/convert/{data => configs}/emdna_cf_attrs.json (100%) rename miranda/convert/{data => configs}/espo-g6-e5l_attrs.json (100%) rename miranda/convert/{data => configs}/espo-g6-r2_attrs.json (100%) rename miranda/convert/{data => configs}/ets_grnch_cf_attrs.json (100%) rename miranda/convert/{data => configs}/hq_cf_attrs.json (100%) rename miranda/convert/{data => configs}/melcc_cf_attrs.json (100%) rename miranda/convert/{data => configs}/nasa_ag_cf_attrs.json (100%) rename miranda/convert/{data => configs}/nex-gddp-cmip6_attrs.json (100%) rename miranda/convert/{data => configs}/rvt_raven_attrs.json (100%) rename miranda/convert/{data => configs}/usask_cf_attrs.json (100%) delete mode 100644 miranda/eccc/eccc_homogenized_cf_attrs.json delete mode 100644 miranda/eccc/eccc_obs_summary_cf_attrs.json create mode 100644 miranda/preprocess/__init__.py create mode 100644 miranda/preprocess/_data_definitions.py rename miranda/{eccc/_homogenized.py => preprocess/_eccc_homogenized.py} (74%) rename miranda/{convert/eccc_obs.py => preprocess/_eccc_obs.py} (99%) rename miranda/{eccc/_summaries.py => preprocess/_eccc_summaries.py} (99%) create mode 100644 miranda/preprocess/_treatments.py create mode 100644 miranda/preprocess/configs/eccc-homogenized_attrs.json rename miranda/{eccc/data/eccc_obs_summary_cf_attrs.json => preprocess/configs/eccc-obs-summary_cf_attrs.json} (100%) create mode 100644 miranda/preprocess/configs/eccc-obs_attrs.json rename miranda/{eccc/convert.py => preprocess/eccc.py} (90%) rename miranda/{convert/ecmwf.py => preprocess/ecmwf_tigge.py} (100%) create mode 100644 
miranda/vocabularies/__init__.py create mode 100644 miranda/vocabularies/eccc.py rename miranda/{cv.py => vocabularies/esgf.py} (99%) diff --git a/miranda/__init__.py b/miranda/__init__.py index 8439bcd3..6c9577f1 100644 --- a/miranda/__init__.py +++ b/miranda/__init__.py @@ -23,14 +23,15 @@ from . import ( archive, convert, - cv, decode, io, + preprocess, scripting, structure, units, utils, validators, + vocabularies, ) from .data import DataBase from .storage import FileMeta, StorageState diff --git a/miranda/convert/__init__.py b/miranda/convert/__init__.py index 179bd597..bfc31224 100644 --- a/miranda/convert/__init__.py +++ b/miranda/convert/__init__.py @@ -1,7 +1,7 @@ """Data Conversion module.""" from __future__ import annotations -from . import deh, eccc_canswe, ecmwf, hq, melcc, utils +from . import deh, hq, melcc, utils from ._aggregation import * from ._data_definitions import * from ._treatments import * diff --git a/miranda/convert/_data_definitions.py b/miranda/convert/_data_definitions.py index 4ed4741b..1b6dbc8a 100644 --- a/miranda/convert/_data_definitions.py +++ b/miranda/convert/_data_definitions.py @@ -35,7 +35,7 @@ "xarray_frequencies_to_cmip6like", ] -_data_folder = Path(__file__).parent / "data" +_config_folder = Path(__file__).resolve().parent / "configs" def load_json_data_mappings(project: str) -> dict[str, Any]: @@ -49,38 +49,50 @@ def load_json_data_mappings(project: str) -> dict[str, Any]: ------- dict[str, Any] """ - data_folder = Path(__file__).resolve().parent / "data" - if project.startswith("era5"): - metadata_definition = json.load(open(data_folder / "ecmwf_cf_attrs.json")) + metadata_definition = json.load(open(_config_folder / "ecmwf_cf_attrs.json")) elif project in ["rdrs-v21"]: - metadata_definition = json.load(open(data_folder / "eccc_rdrs_cf_attrs.json")) + metadata_definition = json.load( + open(_config_folder / "eccc-rdrs_cf_attrs.json") + ) elif project == "eccc-obs": - metadata_definition = json.load(open(data_folder / 
"eccc_obs_cf_attrs.json")) + metadata_definition = json.load(open(_config_folder / "eccc-obs_cf_attrs.json")) elif project in ["agcfsr", "agmerra2"]: - metadata_definition = json.load(open(data_folder / "nasa_ag_cf_attrs.json")) + metadata_definition = json.load(open(_config_folder / "nasa_ag_cf_attrs.json")) elif project in ["cordex", "cmip5", "cmip6"]: - metadata_definition = json.load(open(data_folder / "cmip_ouranos_attrs.json")) + metadata_definition = json.load( + open(_config_folder / "cmip_ouranos_attrs.json") + ) elif project == "ets-grnch": - metadata_definition = json.load(open(data_folder / "ets_grnch_cf_attrs.json")) + metadata_definition = json.load( + open(_config_folder / "ets_grnch_cf_attrs.json") + ) elif project == "nrcan-gridded-10km": raise NotImplementedError() elif project == "wfdei-gem-capa": - metadata_definition = json.load(open(data_folder / "usask_cf_attrs.json")) - elif project.startswith("melcc"): - metadata_definition = json.load(open(data_folder / "melcc_cf_attrs.json")) - elif project.startswith("ec"): - metadata_definition = json.load(open(data_folder / "eccc-canswe_cf_attrs.json")) + metadata_definition = json.load(open(_config_folder / "usask_cf_attrs.json")) + elif project == "melcc": + metadata_definition = json.load(open(_config_folder / "melcc_cf_attrs.json")) + elif project == "eccc-canswe": + metadata_definition = json.load( + open(_config_folder / "eccc-canswe_cf_attrs.json") + ) + elif project == "eccc-homogenized": + metadata_definition = json.load( + open(_config_folder / "eccc-homogenized_cf_attrs.json") + ) elif project in ["NEX-GDDP-CMIP6"]: - metadata_definition = json.load(open(data_folder / "nex-gddp-cmip6_attrs.json")) + metadata_definition = json.load( + open(_config_folder / "nex-gddp-cmip6_attrs.json") + ) elif project in ["ESPO-G6-R2"]: - metadata_definition = json.load(open(data_folder / "espo-g6-r2_attrs.json")) + metadata_definition = json.load(open(_config_folder / "espo-g6-r2_attrs.json")) elif project 
in ["ESPO-G6-E5L"]: - metadata_definition = json.load(open(data_folder / "espo-g6-e5l_attrs.json")) + metadata_definition = json.load(open(_config_folder / "espo-g6-e5l_attrs.json")) elif project in ["EMDNA"]: - metadata_definition = json.load(open(data_folder / "emdna_cf_attrs.json")) + metadata_definition = json.load(open(_config_folder / "emdna_cf_attrs.json")) else: - raise NotImplementedError() + raise NotImplementedError(f"Project not supported: {project}") return metadata_definition @@ -88,27 +100,27 @@ def load_json_data_mappings(project: str) -> dict[str, Any]: eccc_rdrs_variables = dict() eccc_rdrs_variables["raw"] = [ v - for v in json.load(open(_data_folder / "eccc_rdrs_cf_attrs.json"))[ + for v in json.load(open(_config_folder / "eccc-rdrs_cf_attrs.json"))[ "variables" ].keys() ] eccc_rdrs_variables["cf"] = [ attrs["_cf_variable_name"] - for attrs in json.load(open(_data_folder / "eccc_rdrs_cf_attrs.json"))[ + for attrs in json.load(open(_config_folder / "eccc-rdrs_cf_attrs.json"))[ "variables" ].values() ] -era5_variables = json.load(open(_data_folder / "ecmwf_cf_attrs.json"))[ +era5_variables = json.load(open(_config_folder / "ecmwf_cf_attrs.json"))[ "variables" ].keys() grnch_variables = ["T", "Tmin", "Tmax", "P"] nrcan_variables = ["tasmin", "tasmax", "pr"] -nasa_ag_variables = json.load(open(_data_folder / "nasa_ag_cf_attrs.json"))[ +nasa_ag_variables = json.load(open(_config_folder / "nasa_ag_cf_attrs.json"))[ "variables" ].keys() sc_earth_variables = ["prcp", "tdew", "tmean", "trange", "wind"] -wfdei_gem_capa_variables = json.load(open(_data_folder / "usask_cf_attrs.json"))[ +wfdei_gem_capa_variables = json.load(open(_config_folder / "usask_cf_attrs.json"))[ "variables" ].keys() diff --git a/miranda/convert/data/cmip_ouranos_attrs.json b/miranda/convert/configs/cmip_ouranos_attrs.json similarity index 100% rename from miranda/convert/data/cmip_ouranos_attrs.json rename to miranda/convert/configs/cmip_ouranos_attrs.json diff --git 
a/miranda/convert/data/deh_cf_attrs.json b/miranda/convert/configs/deh_cf_attrs.json similarity index 100% rename from miranda/convert/data/deh_cf_attrs.json rename to miranda/convert/configs/deh_cf_attrs.json diff --git a/miranda/convert/data/eccc-canswe_cf_attrs.json b/miranda/convert/configs/eccc-canswe_cf_attrs.json similarity index 100% rename from miranda/convert/data/eccc-canswe_cf_attrs.json rename to miranda/convert/configs/eccc-canswe_cf_attrs.json diff --git a/miranda/convert/data/eccc_homogenized_cf_attrs.json b/miranda/convert/configs/eccc-homogenized_cf_attrs.json similarity index 99% rename from miranda/convert/data/eccc_homogenized_cf_attrs.json rename to miranda/convert/configs/eccc-homogenized_cf_attrs.json index 9eac354f..5c777230 100644 --- a/miranda/convert/data/eccc_homogenized_cf_attrs.json +++ b/miranda/convert/configs/eccc-homogenized_cf_attrs.json @@ -6,6 +6,7 @@ "gen3": "Vincent, L.A., M.M. Hartwell and X.L. Wang, 2020: A Third Generation of Homogenized Temperature for Trend Analysis and Monitoring Changes in Canada’s Climate. Atmosphere-Ocean. 
https://doi.org/10.1080/07055900.2020.1765728" }, "_frequency": true, + "_generation": true, "_miranda_version": true, "_missing_values": [ "-999", diff --git a/miranda/convert/data/eccc_obs_cf_attrs.json b/miranda/convert/configs/eccc-obs_cf_attrs.json similarity index 99% rename from miranda/convert/data/eccc_obs_cf_attrs.json rename to miranda/convert/configs/eccc-obs_cf_attrs.json index 68b30487..c504965e 100644 --- a/miranda/convert/data/eccc_obs_cf_attrs.json +++ b/miranda/convert/configs/eccc-obs_cf_attrs.json @@ -17,7 +17,6 @@ "license_type": "permissive", "organization": "ECCC", "processing_level": "raw", - "realm": "atmos", "source": "ECCC-OBS", "table_date": "2023-03-23", "type": "station-obs" diff --git a/miranda/convert/data/eccc_rdrs_cf_attrs.json b/miranda/convert/configs/eccc-rdrs_cf_attrs.json similarity index 100% rename from miranda/convert/data/eccc_rdrs_cf_attrs.json rename to miranda/convert/configs/eccc-rdrs_cf_attrs.json diff --git a/miranda/convert/data/ecmwf_cf_attrs.json b/miranda/convert/configs/ecmwf_cf_attrs.json similarity index 100% rename from miranda/convert/data/ecmwf_cf_attrs.json rename to miranda/convert/configs/ecmwf_cf_attrs.json diff --git a/miranda/convert/data/emdna_cf_attrs.json b/miranda/convert/configs/emdna_cf_attrs.json similarity index 100% rename from miranda/convert/data/emdna_cf_attrs.json rename to miranda/convert/configs/emdna_cf_attrs.json diff --git a/miranda/convert/data/espo-g6-e5l_attrs.json b/miranda/convert/configs/espo-g6-e5l_attrs.json similarity index 100% rename from miranda/convert/data/espo-g6-e5l_attrs.json rename to miranda/convert/configs/espo-g6-e5l_attrs.json diff --git a/miranda/convert/data/espo-g6-r2_attrs.json b/miranda/convert/configs/espo-g6-r2_attrs.json similarity index 100% rename from miranda/convert/data/espo-g6-r2_attrs.json rename to miranda/convert/configs/espo-g6-r2_attrs.json diff --git a/miranda/convert/data/ets_grnch_cf_attrs.json 
b/miranda/convert/configs/ets_grnch_cf_attrs.json similarity index 100% rename from miranda/convert/data/ets_grnch_cf_attrs.json rename to miranda/convert/configs/ets_grnch_cf_attrs.json diff --git a/miranda/convert/data/hq_cf_attrs.json b/miranda/convert/configs/hq_cf_attrs.json similarity index 100% rename from miranda/convert/data/hq_cf_attrs.json rename to miranda/convert/configs/hq_cf_attrs.json diff --git a/miranda/convert/data/melcc_cf_attrs.json b/miranda/convert/configs/melcc_cf_attrs.json similarity index 100% rename from miranda/convert/data/melcc_cf_attrs.json rename to miranda/convert/configs/melcc_cf_attrs.json diff --git a/miranda/convert/data/nasa_ag_cf_attrs.json b/miranda/convert/configs/nasa_ag_cf_attrs.json similarity index 100% rename from miranda/convert/data/nasa_ag_cf_attrs.json rename to miranda/convert/configs/nasa_ag_cf_attrs.json diff --git a/miranda/convert/data/nex-gddp-cmip6_attrs.json b/miranda/convert/configs/nex-gddp-cmip6_attrs.json similarity index 100% rename from miranda/convert/data/nex-gddp-cmip6_attrs.json rename to miranda/convert/configs/nex-gddp-cmip6_attrs.json diff --git a/miranda/convert/data/rvt_raven_attrs.json b/miranda/convert/configs/rvt_raven_attrs.json similarity index 100% rename from miranda/convert/data/rvt_raven_attrs.json rename to miranda/convert/configs/rvt_raven_attrs.json diff --git a/miranda/convert/data/usask_cf_attrs.json b/miranda/convert/configs/usask_cf_attrs.json similarity index 100% rename from miranda/convert/data/usask_cf_attrs.json rename to miranda/convert/configs/usask_cf_attrs.json diff --git a/miranda/convert/deh.py b/miranda/convert/deh.py index 6179d7fd..55581f5d 100644 --- a/miranda/convert/deh.py +++ b/miranda/convert/deh.py @@ -18,11 +18,12 @@ __all__ = ["open_txt"] # CMOR-like attributes -cmor = json.load(open(Path(__file__).parent / "data" / "deh_cf_attrs.json"))[ # noqa +cmor = json.load(open(Path(__file__).parent / "configs" / "deh_cf_attrs.json"))[ "variable_entry" ] -# TODO: 
Some potentially useful attributes were skipped, because they would be complicated to include in a dataset since they vary per station +# TODO: Some potentially useful attributes were skipped +# because they would be complicated to include in a dataset since they vary per station meta_patterns = { "Station: ": "name", "Bassin versant: ": "bv", diff --git a/miranda/convert/hq.py b/miranda/convert/hq.py index 6425f0e1..338fcd92 100644 --- a/miranda/convert/hq.py +++ b/miranda/convert/hq.py @@ -21,7 +21,9 @@ __all__ = ["open_csv"] # CMOR-like attributes -cmor = json.load(open(Path(__file__).parent / "data" / "hq_cf_attrs.json"))["variables"] +cmor = json.load(open(Path(__file__).parent / "configs" / "hq_cf_attrs.json"))[ + "variables" +] fp = r"[-+]?\d*,\d+|\d+" @@ -180,6 +182,6 @@ def to_cf(meta: dict, data: pd.DataFrame, cf_table: dict | None = None) -> xr.Da def open_csv(path: str | Path, cf_table: dict | None = cmor) -> xr.DataArray: - """Extract daily HQ meteo data and convert to xr.DataArray with CF-Convention attributes.""" + """Extract daily HQ meteo configs and convert to xr.DataArray with CF-Convention attributes.""" meta, data = extract_daily(path) return to_cf(meta, data, cf_table) diff --git a/miranda/decode/_decoder.py b/miranda/decode/_decoder.py index 088e0cc5..67433f1c 100644 --- a/miranda/decode/_decoder.py +++ b/miranda/decode/_decoder.py @@ -19,15 +19,15 @@ from pandas._libs.tslibs import NaTType # noqa from miranda.convert.utils import date_parser, find_version_hash # noqa -from miranda.cv import VALIDATION_ENABLED from miranda.scripting import LOGGING_CONFIG from miranda.units import get_time_frequency +from miranda.vocabularies.esgf import VALIDATION_ENABLED from ._time import TIME_UNITS_TO_FREQUENCY, TIME_UNITS_TO_TIMEDELTA, DecoderError if VALIDATION_ENABLED: - from miranda.cv import INSTITUTIONS, PROJECT_MODELS from miranda.validators import FACETS_SCHEMA # noqa + from miranda.vocabularies.esgf import INSTITUTIONS, PROJECT_MODELS 
config.dictConfig(LOGGING_CONFIG) diff --git a/miranda/eccc/__init__.py b/miranda/eccc/__init__.py index 781076e0..c4a33869 100644 --- a/miranda/eccc/__init__.py +++ b/miranda/eccc/__init__.py @@ -1,5 +1,2 @@ """Environment and Climate Change Canada specialized conversion module.""" from __future__ import annotations - -from ._homogenized import * -from ._summaries import * diff --git a/miranda/eccc/_utils.py b/miranda/eccc/_utils.py index afb34770..a501dac6 100644 --- a/miranda/eccc/_utils.py +++ b/miranda/eccc/_utils.py @@ -6,7 +6,7 @@ from miranda.scripting import LOGGING_CONFIG -__all__ = ["cf_station_metadata", "cf_ahccd_metadata"] +__all__ = ["cf_station_metadata"] logging.config.dictConfig(LOGGING_CONFIG) @@ -836,167 +836,3 @@ def cf_station_metadata(variable_code: int | str) -> Mapping[str, int | float | logging.error(f"Hourly variable `{code}` not supported.") raise return variable - - -def cf_ahccd_metadata( - code: str, gen: int -) -> (dict[str, int | float | str], dict, list[tuple[int, int]], int): - """ - - Parameters - ---------- - code: {"dx", "dn", "dm", "dt", "ds", "dr"} - gen: {1, 2, 3} - - Returns - ------- - dict[str, int or str or float], dict, list[tuple[int, int]], int - """ - generation = {1: "First", 2: "Second", 3: "Third"}.get(gen) - - ec_ahccd_attrs = dict( - dx=dict( - variable="tasmax", - units="degC", - standard_name="air_temperature", - long_name="Near-Surface Maximum Daily Air Temperature", - comment=f"ECCC {generation} Generation of Adjusted and Homogenized Temperature Data", - ), - dn=dict( - variable="tasmin", - units="degC", - standard_name="air_temperature", - long_name="Near-Surface Minimum Daily Air Temperature", - comment=f"ECCC {generation} Generation of Adjusted and Homogenized Temperature Data", - ), - dm=dict( - variable="tas", - units="degC", - standard_name="air_temperature", - long_name="Near-Surface Daily Mean Air Temperature", - comment=f"ECCC {generation} Generation of Adjusted and Homogenized Temperature Data", - 
), - dt=dict( - variable="pr", - units="mm d-1", - standard_name="precipitation_flux", - long_name="Daily Total Precipitation", - comment=f"ECCC {generation} Generation of Adjusted and Homogenized Precipitation Data", - ), - ds=dict( - variable="prsn", - units="mm d-1", - standard_name="snowfall_flux", - long_name="Daily Snowfall", - comment=f"ECCC {generation} Generation of Adjusted and Homogenized Precipitation Data", - ), - dr=dict( - variable="prlp", - units="mm d-1", - standard_name="rainfall_flux", - long_name="Daily Rainfall", - comment=f"ECCC {generation} Generation of Adjusted and Homogenized Precipitation Data", - ), - ) - try: - variable = ec_ahccd_attrs[code] - variable["missing_flags"] = "M" - if variable["variable"].startswith("tas"): - variable["NaN_value"] = -9999.9 - column_names = [ - "No", - "StnId", - "Station name", - "Prov", - "FromYear", - "FromMonth", - "ToYear", - "ToMonth", - "%Miss", - "Lat(deg)", - "Long(deg)", - "Elev(m)", - "Joined", - "RCS", - ] - column_spaces = [(0, 5), (5, 6), (6, 8), (8, 9)] - ii = 9 - for i in range(1, 32): - column_spaces.append((ii, ii + 7)) - ii += 7 - column_spaces.append((ii, ii + 1)) - ii += 1 - header_row = 3 - - elif variable["variable"].startswith("pr"): - variable["NaN_value"] = -9999.99 - column_names = [ - "Prov", - "Station name", - "stnid", - "beg yr", - "beg mon", - "end yr", - "end mon", - "lat (deg)", - "long (deg)", - "elev (m)", - "stns joined", - ] - column_spaces = [(0, 4), (4, 5), (5, 7), (7, 8)] - ii = 8 - for i in range(1, 32): - column_spaces.append((ii, ii + 8)) - ii += 8 - column_spaces.append((ii, ii + 1)) - ii += 1 - header_row = 0 - - else: - raise KeyError - - column_names = { - col.lower() - .split("(")[0] - .replace("%", "pct_") - .strip() - .replace(" ", "_"): col - for col in list(column_names) - } - - if gen == 3: - _citation = ( - "Vincent, L.A., M.M. Hartwell and X.L. 
Wang, 2020: A Third Generation of Homogenized " - "Temperature for Trend Analysis and Monitoring Changes in Canada’s Climate. " - "Atmosphere-Ocean. https://doi.org/10.1080/07055900.2020.1765728" - ) - elif gen == 2: - _citation = ( - "Mekis, É and L.A. Vincent, 2011: An overview of the second generation adjusted daily " - "precipitation dataset for trend analysis in Canada. Atmosphere-Ocean 49(2), " - "163-177 doi:10.1080/07055900.2011.583910" - ) - else: - msg = f"Generation '{gen}' not supported." - raise NotImplementedError(msg) - - global_attrs = dict( - title=f"{generation} Generation of Homogenized Daily {variable['variable']} " - "for Canada (Updated to December 2019)", - history=f"{dt.today().strftime('%Y-%m-%d')}: Convert from original format to NetCDF", - type="station_obs", - institute="Environment and Climate Change Canada", - institute_id="ECCC", - dataset_id=f"AHCCD_gen{gen}_day_{variable['variable']}", - frequency="day", - license_type="permissive", - license="https:/open.canada.ca/en/open-government-licence-canada", - citation=_citation, - ) - - except KeyError as e: - msg = f"AHCCD variable '{code}' or generation '{gen}' not supported." - logging.error(msg) - raise NotImplementedError(msg) from e - - return variable, column_names, column_spaces, header_row, global_attrs diff --git a/miranda/eccc/eccc_homogenized_cf_attrs.json b/miranda/eccc/eccc_homogenized_cf_attrs.json deleted file mode 100644 index 92c3b0f1..00000000 --- a/miranda/eccc/eccc_homogenized_cf_attrs.json +++ /dev/null @@ -1,111 +0,0 @@ -{ - "Header": { - "Conventions": "CF-1.8", - "_product": { - "gen2": "ECCC Adjusted and Homogenized Canadian Climate Data (AHCCD) version 2", - "gen3": "ECCC Adjusted and Homogenized Canadian Climate Data (AHCCD) version 3" - }, - "citation": { - "gen2": "Mekis, É and L.A. Vincent, 2011: An overview of the second generation adjusted daily precipitation dataset for trend analysis in Canada. 
Atmosphere-Ocean 49(2), 163-177 doi:10.1080/07055900.2011.583910", - "gen3": "Vincent, L.A., M.M. Hartwell and X.L. Wang, 2020: A Third Generation of Homogenized Temperature for Trend Analysis and Monitoring Changes in Canada’s Climate. Atmosphere-Ocean. https://doi.org/10.1080/07055900.2020.1765728" - }, - "contact": "info.cccs-ccsc@canada.ca", - "documentation": "https://www.canada.ca/en/environment-climate-change/services/climate-change/canadian-centre-climate-services/display-download/technical-documentation-adjusted-climate-data.html", - "float_missing_value": "1e20", - "frequency": "day", - "institution": "GovCan", - "int_missing_value": "-999", - "license": "https://climate.weather.gc.ca/prods_servs/attachment1_e.html", - "license_type": "permissive", - "organization": "ECCC", - "realm": "atmos", - "table_date": "2023-03-23", - "table_id": "ECCC" - }, - "variable_entry": { - "dm": { - "add_offset": 273.15, - "cell_methods": "time: mean", - "comments": "Station data converted from Mean Temp (°C)", - "frequency": "day", - "grid_mapping": "regular_lon_lat", - "long_name": "Near-Surface Air Temperature", - "original_field": "Mean Temp (°C)", - "out_name": "tas", - "scale_factor": 1, - "standard_name": "air_temperature", - "type": "real", - "units": "K" - }, - "dn": { - "add_offset": 273.15, - "cell_methods": "time: minimum", - "comments": "Station data converted from Min Temp (°C)", - "frequency": "day", - "grid_mapping": "regular_lon_lat", - "long_name": "Daily Minimum Near-Surface Air Temperature", - "original_field": "Min Temp (°C)", - "out_name": "tasmin", - "scale_factor": 1, - "standard_name": "air_temperature", - "type": "real", - "units": "K" - }, - "dr": { - "add_offset": 0, - "cell_methods": "time: mean", - "comments": "Station data converted from Total Rain (mm) using a density of 1000 kg/m³", - "frequency": "day", - "grid_mapping": "regular_lon_lat", - "long_name": "Liquid Precipitation", - "original_field": "Total Rain (mm)", - "out_name": "prlp", - 
"scale_factor": 1.1574074074074073e-05, - "standard_name": "rainfall_flux", - "type": "real", - "units": "kg m-2 s-1" - }, - "ds": { - "add_offset": 0, - "cell_methods": "time: mean", - "comments": "station data converted from Total Snow (cm) using a density of 100 kg/m³", - "frequency": "day", - "grid_mapping": "regular_lon_lat", - "long_name": "Snowfall Flux", - "original_field": "Total Snow (cm)", - "out_name": "prsn", - "scale_factor": 1.1574074074074073e-05, - "standard_name": "snowfall_flux", - "type": "real", - "units": "kg m-2 s-1" - }, - "dt": { - "add_offset": 0, - "cell_methods": "time: mean", - "comments": "Station data converted from Total Precip (mm) using a density of 1000 kg/m³", - "frequency": "day", - "grid_mapping": "regular_lon_lat", - "long_name": "Precipitation", - "original_field": "Total Precip (mm)", - "out_name": "pr", - "scale_factor": 1.1574074074074073e-05, - "standard_name": "precipitation_flux", - "type": "real", - "units": "kg m-2 s-1" - }, - "dx": { - "add_offset": 273.15, - "cell_methods": "time: maximum", - "comments": "station data converted from Max Temp (°C)", - "frequency": "day", - "grid_mapping": "regular_lon_lat", - "long_name": "Daily Maximum Near-Surface Air Temperature", - "original_field": "Max Temp (°C)", - "out_name": "tasmax", - "scale_factor": 1, - "standard_name": "air_temperature", - "type": "real", - "units": "K" - } - } -} diff --git a/miranda/eccc/eccc_obs_summary_cf_attrs.json b/miranda/eccc/eccc_obs_summary_cf_attrs.json deleted file mode 100644 index b21f224e..00000000 --- a/miranda/eccc/eccc_obs_summary_cf_attrs.json +++ /dev/null @@ -1,173 +0,0 @@ -{ - "Header": { - "Conventions": "CF-1.8", - "contact": "info.cccs-ccsc@canada.ca", - "institution": "GovCan", - "int_missing_value": "-999", - "license": "https://climate.weather.gc.ca/prods_servs/attachment1_e.html", - "license_type": "permissive", - "missing_value": "1e20", - "organization": "ECCC", - "processing_level": "raw", - "realm": "atmos", - "source": 
"msc", - "table_date": "2023-03-23", - "type": "station-obs" - }, - "variable_entry": { - "cdd": { - "add_offset": 0, - "cell_methods": "time: sum", - "comments": "Station data converted from Cool Deg Days (°C)", - "frequency": "day", - "grid_mapping": "regular_lon_lat", - "long_name": "Number of Degrees Celsius Over a Mean Temperature of 18 °C", - "original_variable": "Cool Deg Days (°C)", - "out_name": "cdd", - "scale_factor": 1, - "standard_name": "cooling_degree_days", - "type": "real", - "units": "C" - }, - "hdd": { - "add_offset": 0, - "cell_methods": "time: sum", - "comments": "Station data converted from Heat Deg Days (°C)", - "frequency": "day", - "grid_mapping": "regular_lon_lat", - "long_name": "Number of Degrees Celsius Under a Mean Temperature of 18 °C", - "original_variable": "Heat Deg Days (°C)", - "out_name": "hdd", - "scale_factor": 1, - "standard_name": "heating_degree_days", - "type": "real", - "units": "C" - }, - "pr": { - "add_offset": 0, - "cell_methods": "time: mean", - "comments": "Station data converted from Total Precip (mm) using a density of 1000 kg/m³", - "frequency": "day", - "grid_mapping": "regular_lon_lat", - "long_name": "Precipitation", - "original_variable": "Total Precip (mm)", - "out_name": "pr", - "scale_factor": 1.1574074074074073e-05, - "standard_name": "precipitation_flux", - "type": "real", - "units": "kg m-2 s-1" - }, - "prlp": { - "add_offset": 0, - "cell_methods": "time: mean", - "comments": "Station data converted from Total Rain (mm) using a density of 1000 kg/m³", - "frequency": "day", - "grid_mapping": "regular_lon_lat", - "long_name": "Liquid Precipitation", - "original_variable": "Total Rain (mm)", - "out_name": "prlp", - "scale_factor": 1.1574074074074073e-05, - "standard_name": "rainfall_flux", - "type": "real", - "units": "kg m-2 s-1" - }, - "prsn": { - "add_offset": 0, - "cell_methods": "time: mean", - "comments": "station data converted from Total Snow (cm) using a density of 100 kg/m³", - "frequency": "day", 
- "grid_mapping": "regular_lon_lat", - "long_name": "Snowfall Flux", - "original_variable": "Total Snow (cm)", - "out_name": "prsn", - "scale_factor": 1.1574074074074073e-05, - "standard_name": "snowfall_flux", - "type": "real", - "units": "kg m-2 s-1" - }, - "sfcWindAz": { - "add_offset": 0, - "cell_methods": "time: mean", - "comments": "Station data converted from Dir of Max Gust (10s deg)", - "frequency": "day", - "grid_mapping": "regular_lon_lat", - "long_name": "Direction from which the Daily Maximum Near-Surface Gust Wind Speed maximum Blows", - "original_variable": "Dir of Max Gust (10s deg)", - "out_name": "sfcWindAz", - "scale_factor": 1, - "standard_name": "wind_direction", - "type": "real", - "units": "degree" - }, - "sfcWindMax": { - "add_offset": 0, - "cell_methods": "time: max", - "comments": "Station data converted from Spd of Max Gust (km/h)", - "frequency": "day", - "grid_mapping": "regular_lon_lat", - "long_name": "Daily Maximum Near-Surface Gust Wind Speed maximum", - "original_variable": "Spd of Max Gust (km/h)", - "out_name": "sfcWindMax", - "scale_factor": 0.2777777777777778, - "standard_name": "wind_speed_of_gust maximum", - "type": "real", - "units": "m s-1" - }, - "snd": { - "add_offset": 0, - "cell_methods": "time: mean", - "comments": "Station data converted from Snow on Grnd (cm)", - "frequency": "day", - "grid_mapping": "regular_lon_lat", - "long_name": "Snow Depth", - "original_variable": "Snow on Grnd (cm)", - "out_name": "snd", - "scale_factor": 0.01, - "standard_name": "surface_snow_thickness", - "type": "real", - "units": "m" - }, - "tas": { - "add_offset": 273.15, - "cell_methods": "time: mean", - "comments": "Station data converted from Mean Temp (°C)", - "frequency": "day", - "grid_mapping": "regular_lon_lat", - "long_name": "Near-Surface Air Temperature", - "original_variable": "Mean Temp (°C)", - "out_name": "tas", - "scale_factor": 1, - "standard_name": "air_temperature", - "type": "real", - "units": "K" - }, - "tasmax": { - 
"add_offset": 273.15, - "cell_methods": "time: maximum", - "comments": "station data converted from Max Temp (°C)", - "frequency": "day", - "grid_mapping": "regular_lon_lat", - "long_name": "Daily Maximum Near-Surface Air Temperature", - "original_variable": "Max Temp (°C)", - "out_name": "tasmax", - "scale_factor": 1, - "standard_name": "air_temperature", - "type": "real", - "units": "K" - }, - "tasmin": { - "add_offset": 273.15, - "cell_methods": "time: minimum", - "comments": "Station data converted from Min Temp (°C)", - "frequency": "day", - "grid_mapping": "regular_lon_lat", - "long_name": "Daily Minimum Near-Surface Air Temperature", - "original_variable": "Min Temp (°C)", - "out_name": "tasmin", - "scale_factor": 1, - "standard_name": "air_temperature", - "type": "real", - "units": "K" - } - } -} diff --git a/miranda/eccc/geomet.py b/miranda/eccc/geomet.py index 3fb00d72..c5446dd5 100644 --- a/miranda/eccc/geomet.py +++ b/miranda/eccc/geomet.py @@ -1,3 +1,4 @@ +"""ECCC Geomet Module.""" from __future__ import annotations import os @@ -7,7 +8,8 @@ import xarray as xr -def load_station_metadata(meta: str | os.PathLike) -> xr.Dataset: +def load_station_metadata(meta: str | os.PathLike | None) -> xr.Dataset: + """Method to load station metadata from a file or URL.""" if meta: df_inv = pd.read_csv(meta, header=0) else: diff --git a/miranda/io/_rechunk.py b/miranda/io/_rechunk.py index 6c25d432..7c55fee4 100644 --- a/miranda/io/_rechunk.py +++ b/miranda/io/_rechunk.py @@ -26,7 +26,7 @@ "translate_time_chunk", ] -_data_folder = Path(__file__).parent / "data" +_data_folder = Path(__file__).parent / "configs" chunk_configurations = json.load(open(_data_folder / "ouranos_chunk_config.json")) diff --git a/miranda/io/utils.py b/miranda/io/utils.py index 33da8643..42f7b5e7 100644 --- a/miranda/io/utils.py +++ b/miranda/io/utils.py @@ -28,7 +28,7 @@ "sort_variables", ] -_data_folder = Path(__file__).parent / "data" +_data_folder = Path(__file__).parent / "configs" 
name_configurations = json.load(open(_data_folder / "ouranos_name_config.json")) diff --git a/miranda/preprocess/__init__.py b/miranda/preprocess/__init__.py new file mode 100644 index 00000000..0ae1f1d6 --- /dev/null +++ b/miranda/preprocess/__init__.py @@ -0,0 +1 @@ +"""Preprocessing tools for Miranda.""" diff --git a/miranda/preprocess/_data_definitions.py b/miranda/preprocess/_data_definitions.py new file mode 100644 index 00000000..815b1048 --- /dev/null +++ b/miranda/preprocess/_data_definitions.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any + +_config_folder = Path(__file__).resolve().parent / "configs" + + +__all__ = ["load_json_data_mappings"] + + +def load_json_data_mappings(project: str) -> dict[str, Any]: + """Load JSON mappings for supported dataset conversions. + + Parameters + ---------- + project : str + + Returns + ------- + dict[str, Any] + """ + if project == "eccc-homogenized": + metadata_definition = json.load( + open(_config_folder / "eccc-homogenized_attrs.json") + ) + elif project == "eccc-obs": + metadata_definition = json.load(open(_config_folder / "eccc-obs_attrs.json")) + elif project == "eccc-obs-summary": + metadata_definition = json.load( + open(_config_folder / "eccc-obs-summary_attrs.json") + ) + else: + raise NotImplementedError(f"Project not supported: {project}") + + return metadata_definition diff --git a/miranda/eccc/_homogenized.py b/miranda/preprocess/_eccc_homogenized.py similarity index 74% rename from miranda/eccc/_homogenized.py rename to miranda/preprocess/_eccc_homogenized.py index de4b04f0..4b4e6706 100644 --- a/miranda/eccc/_homogenized.py +++ b/miranda/preprocess/_eccc_homogenized.py @@ -10,16 +10,121 @@ import xarray as xr from dask.diagnostics import ProgressBar +from miranda.preprocess._data_definitions import load_json_data_mappings +from miranda.preprocess._treatments import basic_metadata_conversion from miranda.scripting import 
LOGGING_CONFIG -from ._utils import cf_ahccd_metadata - logging.config.dictConfig(LOGGING_CONFIG) logger = logging.Logger("miranda") __all__ = ["convert_ahccd", "convert_ahccd_fwf_files"] +def _ahccd_metadata( + gen: int, +) -> (dict[str, int | float | str], dict, list[tuple[int, int]], int): + """ + + Parameters + ---------- + gen: {1, 2, 3} + + Returns + ------- + dict[str, int or str or float], dict, list[tuple[int, int]], int + """ + generation = {1: "First", 2: "Second", 3: "Third"}.get(gen) + if not generation: + raise NotImplementedError(f"Generation '{gen}' not supported") + + config = load_json_data_mappings("eccc-homogenized") + metadata = basic_metadata_conversion("eccc-homogenized", config) + header = metadata["Header"] + + # Conditional handling of global attributes based on generation + for field in [f for f in header if f.startswith("_")]: + if isinstance(header[field], dict): + attr_treatment = header[field]["generation"] + else: + raise AttributeError( + f"Attribute treatment configuration for field `{field}` is not properly configured. Verify JSON." 
+ ) + if field in ["_citation" "_product"]: + for attribute, value in attr_treatment.items(): + if attribute == generation: + header[field[1:]] = value + del header[field] + + return header + + +def _column_definitions( + variable_code: str, metadata: dict +) -> tuple[dict, list[tuple[int, int]], int]: + variable = metadata[variable_code] + variable["missing_flags"] = "M" + if variable["variable"].startswith("tas"): + variable["NaN_value"] = -9999.9 + column_names = [ + "No", + "StnId", + "Station name", + "Prov", + "FromYear", + "FromMonth", + "ToYear", + "ToMonth", + "%Miss", + "Lat(deg)", + "Long(deg)", + "Elev(m)", + "Joined", + "RCS", + ] + column_spaces = [(0, 5), (5, 6), (6, 8), (8, 9)] + ii = 9 + for i in range(1, 32): + column_spaces.append((ii, ii + 7)) + ii += 7 + column_spaces.append((ii, ii + 1)) + ii += 1 + header_row = 3 + + elif variable["variable"].startswith("pr"): + variable["NaN_value"] = -9999.99 + column_names = [ + "Prov", + "Station name", + "stnid", + "beg yr", + "beg mon", + "end yr", + "end mon", + "lat (deg)", + "long (deg)", + "elev (m)", + "stns joined", + ] + column_spaces = [(0, 4), (4, 5), (5, 7), (7, 8)] + ii = 8 + for i in range(1, 32): + column_spaces.append((ii, ii + 8)) + ii += 8 + column_spaces.append((ii, ii + 1)) + ii += 1 + header_row = 0 + + else: + raise KeyError + + column_names = { + col.lower().split("(")[0].replace("%", "pct_").strip().replace(" ", "_"): col + for col in list(column_names) + } + + return column_names, column_spaces, header_row + + def convert_ahccd( data_source: str | Path, output_dir: str | Path, @@ -45,6 +150,8 @@ def convert_ahccd( code = dict(tasmax="dx", tasmin="dn", tas="dm", pr="dt", prsn="ds", prlp="dr").get( variable ) + + attrs = _ahccd_metadata(generation) var, col_names, col_spaces, header_row, global_attrs = cf_ahccd_metadata( code, generation ) @@ -56,7 +163,7 @@ def convert_ahccd( else: raise NotImplementedError(f"Code '{code} for generation {gen}.") - metadata_source = 
Path(__file__).resolve().parent.joinpath("data").joinpath(meta) + metadata_source = Path(__file__).resolve().parent.joinpath("configs").joinpath(meta) if "tas" in variable: metadata = pd.read_csv(metadata_source, header=2) @@ -179,7 +286,7 @@ def convert_ahccd_fwf_files( ) if attrs is None: - attrs, _, _, _, _ = cf_ahccd_metadata(code, generation) + attrs = _ahccd_metadata(generation) if cols_specs is None: _, _, cols_specs, _, _ = cf_ahccd_metadata(code, generation) _, _, _, nhead, _ = cf_ahccd_metadata(code, generation) diff --git a/miranda/convert/eccc_obs.py b/miranda/preprocess/_eccc_obs.py similarity index 99% rename from miranda/convert/eccc_obs.py rename to miranda/preprocess/_eccc_obs.py index fd827d2f..7cfa9249 100644 --- a/miranda/convert/eccc_obs.py +++ b/miranda/preprocess/_eccc_obs.py @@ -45,9 +45,10 @@ config.dictConfig(LOGGING_CONFIG) __all__ = [ - "merge_stations", "convert_flat_files", + "convert_station", "merge_converted_variables", + "merge_stations", ] KiB = int(pow(2, 10)) @@ -60,8 +61,6 @@ def _fwf_column_definitions( time_frequency: str, ) -> tuple[list[str], list[int], list[type[str | int]]]: """Return the column names, widths, and data types for the fixed-width format.""" - - # Preparing the column headers if time_frequency.lower() in ["h", "hour", "hourly"]: num_observations = 24 column_names = ["code", "year", "month", "day", "code_var"] @@ -103,6 +102,7 @@ def convert_station( client: Any, **kwargs, ): + """Convert a single station's data from the fixed-width format to a netCDF file.""" data = Path(data) column_names, column_widths, column_dtypes = _fwf_column_definitions(mode) @@ -690,7 +690,7 @@ def _tmp_zarr( try: ds = xr.open_mfdataset( - nc, combine="nested", concat_dim={"station"}, preprocess=_remove_duplicates + nc, combine="nested", concat_dim="station", preprocess=_remove_duplicates ) except ValueError as e: errored_nc_files = ", ".join([Path(f).name for f in nc]) diff --git a/miranda/eccc/_summaries.py 
b/miranda/preprocess/_eccc_summaries.py similarity index 99% rename from miranda/eccc/_summaries.py rename to miranda/preprocess/_eccc_summaries.py index 8e0b42c1..3c31ba32 100755 --- a/miranda/eccc/_summaries.py +++ b/miranda/preprocess/_eccc_summaries.py @@ -30,7 +30,7 @@ __all__ = ["extract_daily_summaries", "daily_summaries_to_netcdf"] eccc_metadata = json.load( - open(Path(__file__).parent / "eccc_obs_summary_cf_attrs.json") + open(Path(__file__).resolve().parent / "configs" / "eccc-obs-summary_cf_attrs.json") )["variable_entry"] diff --git a/miranda/preprocess/_treatments.py b/miranda/preprocess/_treatments.py new file mode 100644 index 00000000..80d3fd45 --- /dev/null +++ b/miranda/preprocess/_treatments.py @@ -0,0 +1,43 @@ +from __future__ import annotations + +import logging +from typing import Any + +from miranda import __version__ as __miranda_version__ + + +def basic_metadata_conversion( + project: str, metadata: dict +) -> dict[str, dict[str, Any]]: + """Present basic metadata conversion. + + Parameters + ---------- + project : str + Dataset project name. + metadata : dict + Metadata definition dictionary for project and variable(s). + + Returns + ------- + xarray.Dataset + """ + logging.info("Converting metadata.") + header = metadata["Header"] + + # Static handling of version global attributes + miranda_version = header.get("_miranda_version") + if miranda_version: + if isinstance(miranda_version, bool): + header["miranda_version"] = __miranda_version__ + elif isinstance(miranda_version, dict): + if project in miranda_version.keys(): + header["miranda_version"] = __miranda_version__ + else: + logging.warning( + f"`_miranda_version` not set for project `{project}`. Not appending." 
+ ) + if "_miranda_version" in header: + del header["_miranda_version"] + + return metadata diff --git a/miranda/preprocess/configs/eccc-homogenized_attrs.json b/miranda/preprocess/configs/eccc-homogenized_attrs.json new file mode 100644 index 00000000..539ad40c --- /dev/null +++ b/miranda/preprocess/configs/eccc-homogenized_attrs.json @@ -0,0 +1,36 @@ +{ + "Header": { + "_citation": { + "generation": { + "Second": "Mekis, É and L.A. Vincent, 2011: An overview of the second generation adjusted daily precipitation dataset for trend analysis in Canada. Atmosphere-Ocean 49(2), 163-177 doi:10.1080/07055900.2011.583910", + "Third": "Vincent, L.A., M.M. Hartwell and X.L. Wang, 2020: A Third Generation of Homogenized Temperature for Trend Analysis and Monitoring Changes in Canada’s Climate. Atmosphere-Ocean. https://doi.org/10.1080/07055900.2020.1765728" + } + }, + "_converter": true, + "_miranda_version": true, + "_missing_values": [ + "-999", + "1e20" + ], + "_product": { + "generation": { + "Second": "ECCC Adjusted and Homogenized Canadian Climate Data (AHCCD) version 2", + "Third": "ECCC Adjusted and Homogenized Canadian Climate Data (AHCCD) version 3" + } + }, + "acknowledgement": "This data is provided by Environment and Climate Change Canada (ECCC).", + "author": "Environment and Climate Change Canada (ECCC)", + "contact": "info.cccs-ccsc@canada.ca", + "documentation": "https://www.canada.ca/en/environment-climate-change/services/climate-change/canadian-centre-climate-services/display-download/technical-documentation-adjusted-climate-data.html", + "institution": "GovCan", + "license": "https://climate.weather.gc.ca/prods_servs/attachment1_e.html", + "license_preamble": "The data is owned by the Government of Canada (Environment and Climate Change Canada), and fall under the licence agreement for use of Environment and Climate Change Canada data.", + "license_type": "permissive", + "organization": "ECCC", + "processing_level": "adjusted", + "realm": "atmos", + 
"source": "AHCCD", + "table_date": "2023-03-23", + "table_id": "ECCC" + } +} diff --git a/miranda/eccc/data/eccc_obs_summary_cf_attrs.json b/miranda/preprocess/configs/eccc-obs-summary_cf_attrs.json similarity index 100% rename from miranda/eccc/data/eccc_obs_summary_cf_attrs.json rename to miranda/preprocess/configs/eccc-obs-summary_cf_attrs.json diff --git a/miranda/preprocess/configs/eccc-obs_attrs.json b/miranda/preprocess/configs/eccc-obs_attrs.json new file mode 100644 index 00000000..7265ca71 --- /dev/null +++ b/miranda/preprocess/configs/eccc-obs_attrs.json @@ -0,0 +1,29 @@ +{ + "Header": { + "_converter": true, + "_frequency": true, + "_miranda_version": true, + "_missing_flags": "M", + "_missing_values": [ + "-999", + "1e20", + "-9999", + "#####" + ], + "acknowledgement": "This data is provided by Environment and Climate Change Canada (ECCC).", + "author": "Environment and Climate Change Canada (ECCC)", + "contact": "climatcentre-climatecentral@ec.gc.ca", + "documentation": "https://climate.weather.gc.ca/doc/Technical_Documentation.pdf", + "institution": "GovCan", + "license": "https://climate.weather.gc.ca/prods_servs/attachment1_e.html", + "license_preamble": "The data is owned by the Government of Canada (Environment and Climate Change Canada), and fall under the licence agreement for use of Environment and Climate Change Canada data.", + "license_type": "permissive", + "organization": "ECCC", + "processing_level": "raw", + "source": "ECCC-OBS", + "table_date": "2023-08-02", + "title": "Environment and Climate Change Canada (ECCC) weather station observations", + "type": "station-obs", + "usage": "The original data is owned by the Government of Canada (Environment and Climate Change Canada), and falls under the licence agreement for use of Environment and Climate Change Canada data" + } +} diff --git a/miranda/eccc/convert.py b/miranda/preprocess/eccc.py similarity index 90% rename from miranda/eccc/convert.py rename to miranda/preprocess/eccc.py index 
eb94be48..aab3595d 100644 --- a/miranda/eccc/convert.py +++ b/miranda/preprocess/eccc.py @@ -10,31 +10,28 @@ from functools import partial from pathlib import Path -from miranda.convert.eccc_obs import _convert_station_file from miranda.eccc._utils import cf_station_metadata +from miranda.preprocess._eccc_obs import _convert_station_file from miranda.scripting import LOGGING_CONFIG logging.config.dictConfig(LOGGING_CONFIG) -_data_folder = Path(__file__).parent / "data" +_data_folder = Path(__file__).parent / "configs" eccc_observation_variables = dict() eccc_observation_variables["flat"] = [ - v - for v in json.load(open(_data_folder / "eccc_obs_flat_attrs.json"))[ - "variables" - ].keys() + v for v in json.load(open(_data_folder / "eccc-obs_attrs.json"))["variables"].keys() ] eccc_observation_variables["summary"] = [ attrs["_cf_variable_name"] - for attrs in json.load(open(_data_folder / "eccc_obs_summary_cf_attrs.json"))[ + for attrs in json.load(open(_data_folder / "eccc-obs-summary_attrs.json"))[ "variables" ].values() ] eccc_observation_variables["homogenized"] = [ attrs["_cf_variable_name"] - for attrs in json.load(open(_data_folder / "eccc_homogenized_cf_attrs.json"))[ + for attrs in json.load(open(_data_folder / "eccc-homogenized_attrs.json"))[ "variables" ].values() ] diff --git a/miranda/convert/ecmwf.py b/miranda/preprocess/ecmwf_tigge.py similarity index 100% rename from miranda/convert/ecmwf.py rename to miranda/preprocess/ecmwf_tigge.py diff --git a/miranda/structure/_structure.py b/miranda/structure/_structure.py index cac8b53b..1c2b321a 100644 --- a/miranda/structure/_structure.py +++ b/miranda/structure/_structure.py @@ -12,10 +12,10 @@ import yaml from schema import SchemaError -from miranda.cv import VALIDATION_ENABLED from miranda.decode import Decoder, DecoderError, guess_project from miranda.io import discover_data from miranda.scripting import LOGGING_CONFIG +from miranda.vocabularies.esgf import VALIDATION_ENABLED if VALIDATION_ENABLED: from 
miranda.validators import validation_schemas @@ -306,7 +306,9 @@ def build_path_from_schema( Path or None """ if schema is None: - schema = Path(__file__).parent.joinpath("data").joinpath("ouranos_schema.yml") + schema = ( + Path(__file__).parent.joinpath("configs").joinpath("ouranos_schema.yml") + ) tree = parse_schema(facets, schema, top_folder) branch = tree[0] diff --git a/miranda/validators.py b/miranda/validators.py index 75551a9d..02ffc7bd 100644 --- a/miranda/validators.py +++ b/miranda/validators.py @@ -8,12 +8,12 @@ from pandas._libs.tslibs import NaTType # noqa from schema import Literal, Optional, Or, Regex, Schema -from .cv import VALIDATION_ENABLED +from miranda.vocabularies.esgf import VALIDATION_ENABLED __all__ = ["url_validate"] if VALIDATION_ENABLED: - from .cv import ( + from miranda.vocabularies.esgf import ( ACTIVITIES, BIAS_ADJUST_INSTITUTIONS, DRIVING_MODELS, diff --git a/miranda/vocabularies/__init__.py b/miranda/vocabularies/__init__.py new file mode 100644 index 00000000..74f0a223 --- /dev/null +++ b/miranda/vocabularies/__init__.py @@ -0,0 +1 @@ +"""Controlled Vocabulary module.""" diff --git a/miranda/vocabularies/eccc.py b/miranda/vocabularies/eccc.py new file mode 100644 index 00000000..f86ebb53 --- /dev/null +++ b/miranda/vocabularies/eccc.py @@ -0,0 +1,88 @@ +"""Definition lists of variables from ECCC for each type of archive.""" + +# For more information see the ECCC Technical Documentation + +__all__ = [ + "DLY", + "DLY02", + "DLY03", + "DLY04", + "DLY12", + "DLY13", + "DLY21", + "DLY44", + "HLY", + "HLY01", + "HLY01_RCS", + "HLY03", + "HLY10", + "HLY15", + "HLY21", + "MLY", + "MLY04", +] + +# Hourly Data + +HLY01 = [] +HLY01.extend(list(range(71, 123))) # Hourly variables +HLY01.extend([209, 210]) # Wind character and gust speed +HLY01.extend(list(range(219, 231))) # Cloud layers +HLY01.append(244) # Precipitation type +HLY01.append(260) # Freezing fog + +HLY01_RCS = HLY01.copy() +HLY01_RCS.extend( + list(range(262, 281)) +) # 
Reference Climate Surface (RCS) weather stations + +HLY03 = [] +HLY03.extend(list(range(123, 133))) # Hourly rainfall +HLY03.extend([160, 161]) + +HLY10 = [] +HLY10.extend(list(range(61, 69))) # Sunshine +HLY10.extend([133, 169, 170, 171, 172]) # Solar radiation + +HLY15 = [69, 70, 76, 156] # Wind + +HLY21 = [123] # Fischer/Porter precipitation + +HLY = list(set(HLY01 + HLY01_RCS + HLY03 + HLY10 + HLY15 + HLY21)) + +# Daily Data + +DLY02 = [] +DLY02.extend(list(range(1, 26))) # Daily variables +DLY02.append(157) # Direction of extreme gust +DLY02.append(179) # Daily bright sunshine + +DLY03 = [] +DLY03.extend(list(range(124, 133))) +DLY03.extend([160, 161]) + +DLY04 = DLY02.copy() + +DLY12 = [] +DLY12.extend(list(range(134, 151))) # Soil temperatures + +DLY13 = list(range(151, 156)) # Pan evaporation + +DLY21 = [12] # Precipitation +DLY21.extend(list(range(127, 133))) # Precipitation over time +DLY21.append(161) # Most precipitation in 25 hours + +DLY44 = [] +DLY44.extend([1, 2, 3]) # Temperature +DLY44.extend(list(range(10, 18))) # Precipitation + +DLY = list(set(DLY02 + DLY03 + DLY04 + DLY12 + DLY13 + DLY21 + DLY44)) + +# Monthly data + +MLY04 = [] +MLY04.extend(list(range(26, 39))) # Days with variables +MLY04.extend(list(range(39, 61))) # Means of variables +MLY04.append(158) # Direction of extreme gust + +MLY = list(set(MLY04)) diff --git a/miranda/cv.py b/miranda/vocabularies/esgf.py similarity index 99% rename from miranda/cv.py rename to miranda/vocabularies/esgf.py index 93d1f3c2..c22e0f13 100644 --- a/miranda/cv.py +++ b/miranda/vocabularies/esgf.py @@ -1,4 +1,4 @@ -"""Controlled Vocabulary module.""" +"""ESGF Vocabularies.""" from __future__ import annotations import warnings diff --git a/pyproject.toml b/pyproject.toml index 2b5c171c..ed3ca140 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -100,7 +100,11 @@ remote = [ [tool.black] target-version = [ - "py37" + "py37", + "py38", + "py39", + "py310", + "py311" ] [tool.coverage.run] diff --git 
a/templates/eccc_raw_hourly_conversion.py b/templates/eccc_raw_hourly_conversion.py index fcb47b2f..c72e3777 100644 --- a/templates/eccc_raw_hourly_conversion.py +++ b/templates/eccc_raw_hourly_conversion.py @@ -1,7 +1,7 @@ from os import getenv from pathlib import Path -from miranda.convert.eccc_obs import ( +from miranda.preprocess._eccc_obs import ( merge_stations, convert_flat_files, merge_converted_variables, diff --git a/templates/eccc_rdrs_processing.py b/templates/eccc_rdrs_processing.py index c7563157..f2313cfa 100644 --- a/templates/eccc_rdrs_processing.py +++ b/templates/eccc_rdrs_processing.py @@ -1,7 +1,7 @@ import logging from pathlib import Path -from miranda.convert.eccc_rdrs import convert_rdrs, rdrs_to_daily +from miranda.preprocess.eccc_rdrs import convert_rdrs, rdrs_to_daily from miranda.io import concat_rechunk_zarr From 9baaad505ddffe08847032771ad740b2c1c66f49 Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Thu, 3 Aug 2023 15:52:34 -0400 Subject: [PATCH 08/33] working version of ahccd conversion --- miranda/convert/corrections.py | 10 +- miranda/io/_rechunk.py | 2 +- miranda/io/utils.py | 2 +- miranda/preprocess/_eccc_homogenized.py | 347 +++++++++--------- .../configs}/ahccd_gen2_precipitation.csv | 0 .../configs}/ahccd_gen3_temperature.csv | 0 .../configs/eccc-homogenized_attrs.json | 83 ++++- 7 files changed, 259 insertions(+), 185 deletions(-) rename miranda/{eccc/data => preprocess/configs}/ahccd_gen2_precipitation.csv (100%) rename miranda/{eccc/data => preprocess/configs}/ahccd_gen3_temperature.csv (100%) diff --git a/miranda/convert/corrections.py b/miranda/convert/corrections.py index 259d2b1b..8a5ef0ee 100644 --- a/miranda/convert/corrections.py +++ b/miranda/convert/corrections.py @@ -9,23 +9,21 @@ import xarray as xr -from miranda.convert import ( - dimensions_compliance, - metadata_conversion, - threshold_mask, - variable_conversion, -) from miranda.convert._data_definitions import 
load_json_data_mappings from miranda.convert._treatments import ( cf_units_conversion, clip_values, conservative_regrid, correct_unit_names, + dimensions_compliance, ensure_correct_time_frequency, invert_value_sign, + metadata_conversion, offset_time_dimension, preprocessing_corrections, + threshold_mask, transform_values, + variable_conversion, ) from miranda.convert.utils import find_version_hash from miranda.gis import subset_domain diff --git a/miranda/io/_rechunk.py b/miranda/io/_rechunk.py index 7c55fee4..6c25d432 100644 --- a/miranda/io/_rechunk.py +++ b/miranda/io/_rechunk.py @@ -26,7 +26,7 @@ "translate_time_chunk", ] -_data_folder = Path(__file__).parent / "configs" +_data_folder = Path(__file__).parent / "data" chunk_configurations = json.load(open(_data_folder / "ouranos_chunk_config.json")) diff --git a/miranda/io/utils.py b/miranda/io/utils.py index 42f7b5e7..33da8643 100644 --- a/miranda/io/utils.py +++ b/miranda/io/utils.py @@ -28,7 +28,7 @@ "sort_variables", ] -_data_folder = Path(__file__).parent / "configs" +_data_folder = Path(__file__).parent / "data" name_configurations = json.load(open(_data_folder / "ouranos_name_config.json")) diff --git a/miranda/preprocess/_eccc_homogenized.py b/miranda/preprocess/_eccc_homogenized.py index 4b4e6706..01530156 100644 --- a/miranda/preprocess/_eccc_homogenized.py +++ b/miranda/preprocess/_eccc_homogenized.py @@ -8,7 +8,6 @@ import numpy as np import pandas as pd import xarray as xr -from dask.diagnostics import ProgressBar from miranda.preprocess._data_definitions import load_json_data_mappings from miranda.preprocess._treatments import basic_metadata_conversion @@ -20,13 +19,15 @@ __all__ = ["convert_ahccd", "convert_ahccd_fwf_files"] -def _ahccd_metadata( +def _ahccd_variable_metadata( + variable_code: str, gen: int, ) -> (dict[str, int | float | str], dict, list[tuple[int, int]], int): """ Parameters ---------- + variable_code: {"dm", "dn", "dr", "ds", "dt", "dx"} gen: {1, 2, 3} Returns @@ -39,8 +40,19 
@@ def _ahccd_metadata( config = load_json_data_mappings("eccc-homogenized") metadata = basic_metadata_conversion("eccc-homogenized", config) - header = metadata["Header"] + variable_meta = metadata["variables"].get(variable_code) + if not variable_meta: + raise NotImplementedError(f"Variable `{variable_code}` not supported.") + + variable_name = variable_meta.get("_variable_name") + if variable_name: + variable_meta = {variable_name: variable_meta} + del variable_meta[variable_name]["_variable_name"] + else: + variable_meta = {variable_code: variable_meta} + + header = metadata["Header"] # Conditional handling of global attributes based on generation for field in [f for f in header if f.startswith("_")]: if isinstance(header[field], dict): @@ -55,16 +67,21 @@ def _ahccd_metadata( header[field[1:]] = value del header[field] - return header + return variable_meta, header + +def _ahccd_station_metadata(code): + pass -def _column_definitions( - variable_code: str, metadata: dict + +def _ahccd_column_definitions( + variable_code: str, ) -> tuple[dict, list[tuple[int, int]], int]: - variable = metadata[variable_code] - variable["missing_flags"] = "M" - if variable["variable"].startswith("tas"): - variable["NaN_value"] = -9999.9 + config = load_json_data_mappings("eccc-homogenized") + metadata = basic_metadata_conversion("eccc-homogenized", config) + + variable = metadata["variables"][variable_code]["_variable_name"] + if variable.startswith("tas"): column_names = [ "No", "StnId", @@ -90,8 +107,7 @@ def _column_definitions( ii += 1 header_row = 3 - elif variable["variable"].startswith("pr"): - variable["NaN_value"] = -9999.99 + elif variable.startswith("pr"): column_names = [ "Prov", "Station name", @@ -125,146 +141,12 @@ def _column_definitions( return column_names, column_spaces, header_row -def convert_ahccd( - data_source: str | Path, - output_dir: str | Path, - variable: str, - generation: int | None = None, -) -> None: - """Convert Adjusted and Homogenized Canadian 
Climate Dataset files. - - Parameters - ---------- - data_source: str or Path - output_dir: str or Path - variable: str - generation: int, optional - - Returns - ------- - None - """ - output_dir = Path(output_dir).resolve().joinpath(variable) - output_dir.mkdir(parents=True, exist_ok=True) - - code = dict(tasmax="dx", tasmin="dn", tas="dm", pr="dt", prsn="ds", prlp="dr").get( - variable - ) - - attrs = _ahccd_metadata(generation) - var, col_names, col_spaces, header_row, global_attrs = cf_ahccd_metadata( - code, generation - ) - gen = {2: "Second", 3: "Third"}.get(generation) - if generation == 3 and code in {"dx", "dn", "dm"}: - meta = "ahccd_gen3_temperature.csv" - elif generation == 2 and code in {"dt", "ds", "dr"}: - meta = "ahccd_gen2_precipitation.csv" - - else: - raise NotImplementedError(f"Code '{code} for generation {gen}.") - metadata_source = Path(__file__).resolve().parent.joinpath("configs").joinpath(meta) - - if "tas" in variable: - metadata = pd.read_csv(metadata_source, header=2) - metadata.columns = col_names.keys() - cols_specs = col_spaces - - elif "pr" in variable: - metadata = pd.read_csv(metadata_source, header=3) - metadata.columns = col_names.keys() - cols_specs = col_spaces - for index, row in metadata.iterrows(): - if isinstance(row["stnid"], str): - metadata.loc[index, "stnid"] = metadata.loc[index, "stnid"].replace( - " ", "" - ) - else: - raise KeyError(f"{variable} does not include 'pr' or 'tas'.") - - # Convert station .txt files to netcdf - for ff in Path(data_source).glob("*d*.txt"): - outfile = output_dir.joinpath(ff.name.replace(".txt", ".nc")) - if not outfile.exists(): - logger.info(ff.name) - - stid = ff.name.replace(code, "").split(".txt")[0] - try: - metadata_st = metadata[metadata["stnid"] == int(stid)] - except ValueError: - metadata_st = metadata[metadata["stnid"] == stid] - - if len(metadata_st) == 1: - ds_out = convert_ahccd_fwf_files( - ff, metadata_st, variable, generation, cols_specs, var - ) - ds_out.attrs = 
global_attrs - - ds_out.to_netcdf(outfile, engine="h5netcdf") - else: - logger.warning( - f"metadata info for station {ff.name} not found : skipping" - ) - - # merge individual stations to single .nc file - # variable - ncfiles = list(output_dir.glob("*.nc")) - outfile = output_dir.parent.joinpath( - "merged_stations", f"ahccd_gen{generation}_{variable}.nc" - ) - - if not outfile.exists(): - logger.info("merging stations :", variable) - with ProgressBar(): - ds_ahccd = xr.open_mfdataset( - ncfiles, concat_dim="station", combine="nested" - ).load() - - for coord in ds_ahccd.coords: - # xarray object datatypes mix string and int (e.g. stnid) convert to string for merged nc files - # Do not apply to datetime object - if coord != "time" and ds_ahccd[coord].dtype == "O": - ds_ahccd[coord] = ds_ahccd[coord].astype(str) - - for v in ds_ahccd.data_vars: - # xarray object datatypes mix string and int (e.g. stnid) convert to string for merged nc files - # Do not apply to flag timeseries - if ds_ahccd[v].dtype == "O" and "flag" not in v: - logger.info(v) - ds_ahccd[v] = ds_ahccd[v].astype(str) - - ds_ahccd[f"{variable}_flag"].attrs[ - "long_name" - ] = f"{ds_ahccd[f'{variable}'].attrs['long_name']} flag" - ds_ahccd.lon.attrs["units"] = "degrees_east" - ds_ahccd.lon.attrs["long_name"] = "longitude" - ds_ahccd.lat.attrs["units"] = "degrees_north" - ds_ahccd.lat.attrs["long_name"] = "latitude" - - for clean_name, orig_name in col_names.items(): - if clean_name in ["lat", "long"]: - continue - ds_ahccd[clean_name].attrs["long_name"] = orig_name - - outfile.parent.mkdir(parents=True, exist_ok=True) - ds_ahccd.to_netcdf( - outfile, engine="h5netcdf", format="NETCDF4_CLASSIC", mode="w" - ) - - del ds_ahccd - for nc in outfile.parent.glob("*.nc"): - logger.info(nc) - ds = xr.open_dataset(nc) - logger.info(ds) - - def convert_ahccd_fwf_files( ff: Path | str, metadata: pd.DataFrame, variable: str, - generation: int = None, - cols_specs: list[tuple[int, int]] | None = None, - attrs: 
dict | None = None, + *, + generation: int, ) -> xr.Dataset: """Convert AHCCD fixed-width files. @@ -273,9 +155,7 @@ def convert_ahccd_fwf_files( ff: str or Path metadata: pandas.DataFrame variable: str - generation - cols_specs - attrs + generation: int Returns ------- @@ -285,26 +165,25 @@ def convert_ahccd_fwf_files( variable ) - if attrs is None: - attrs = _ahccd_metadata(generation) - if cols_specs is None: - _, _, cols_specs, _, _ = cf_ahccd_metadata(code, generation) - _, _, _, nhead, _ = cf_ahccd_metadata(code, generation) + variable_meta, global_attrs = _ahccd_variable_metadata(code, generation) + col_names, cols_specs, header = _ahccd_column_definitions(code) - df = pd.read_fwf(ff, header=nhead, colspecs=cols_specs) + df = pd.read_fwf(ff, header=header, colspecs=cols_specs) if "pr" in variable: cols = list(df.columns[0:3]) cols = cols[0::2] cols.extend(list(df.columns[4::2])) flags = list(df.columns[5::2]) dfflags = df[flags] - else: + elif "tas" in variable: cols = [c for c in df.columns if "Unnamed" not in c] flags = [c for c in df.columns if "Unnamed" in c] dfflags = df[flags[2:]] + else: + raise NotImplementedError(f"Variable `{variable}` not supported.") df = df[cols] - df.replace(attrs["NaN_value"], np.NaN, inplace=True) + df.replace(variable_meta[variable]["NaN_value"], np.NaN, inplace=True) for i, j in enumerate(["Year", "Month"]): df = df.rename(columns={df.columns[i]: j}) @@ -316,17 +195,19 @@ def convert_ahccd_fwf_files( index = pd.MultiIndex.from_arrays([df["Year"], df["Month"]]) df.index = index - dfflags.index = index cols = [c for c in df.columns if "Year" not in c and "Month" not in c] df = df[cols] df.columns = np.arange(1, 32) - dfflags.columns = np.arange(1, 32) ds = df.stack().to_frame() ds = ds.rename(columns={0: variable}) + ds.index.names = ["Year", "Month", "Day"] + + dfflags.index = index + dfflags.columns = np.arange(1, 32) ds_flag = dfflags.stack().to_frame() ds_flag = ds_flag.rename(columns={0: "flag"}) - ds.index.names = 
["Year", "Month", "Day"] ds_flag.index.names = ["Year", "Month", "Day"] + ds[f"{variable}_flag"] = ds_flag["flag"] del ds_flag @@ -355,15 +236,12 @@ def convert_ahccd_fwf_files( ) ds.index = pd.to_datetime(time_ds) - ds = ds.to_xarray().rename({"index": "time"}) - ds_out = xr.Dataset(coords={"time": time1}) for v in ds.data_vars: ds_out[v] = ds[v] - ds_out[variable].attrs = attrs - # ds_out + ds_out[variable].attrs = variable_meta[variable] metadata = metadata.to_xarray().rename({"index": "station"}).drop_vars("station") metadata = metadata.assign_coords( { @@ -371,17 +249,19 @@ def convert_ahccd_fwf_files( "station_name": metadata["station_name"], } ) - # ds_out = ds_out.assign_coords({'lon': metadata['long'], 'lat': metadata['lat'], 'elevation': metadata['elev']}) - # ds_out = ds_out.assign_coords(station=metadata.stnid) metadata = metadata.drop_vars(["stnid", "station_name"]) + ds_out[f"{variable}_flag"].attrs["long_name"] = variable_meta[variable]["long_name"] ds_out["lon"] = metadata["long"] - ds_out["lon"].attrs["units"] = "degrees_east" + ds_out.lon.attrs["units"] = "degrees_east" + ds_out.lon.attrs["axis"] = "X" ds_out["lat"] = metadata["lat"] - ds_out["lat"].attrs["units"] = "degrees_north" + ds_out.lat.attrs["units"] = "degrees_north" + ds_out.lat.attrs["axis"] = "Y" ds_out["elev"] = metadata["elev"] - ds_out["elev"].attrs["units"] = "m" + ds_out.elev.attrs["units"] = "m" + ds_out.elev.attrs["axis"] = "Z" metadata = metadata.drop_vars(["long", "lat", "elev"]) for vv in metadata.data_vars: @@ -390,3 +270,128 @@ def convert_ahccd_fwf_files( else: ds_out[vv] = metadata[vv] return ds_out + + +def convert_ahccd( + data_source: str | Path, + output_dir: str | Path, + variable: str, + generation: int, +) -> None: + """Convert Adjusted and Homogenized Canadian Climate Dataset files. 
+ + Parameters + ---------- + data_source: str or Path + output_dir: str or Path + variable: str + generation: int + + Returns + ------- + None + """ + output_dir = Path(output_dir).resolve().joinpath(variable) + output_dir.mkdir(parents=True, exist_ok=True) + + code = dict(tasmax="dx", tasmin="dn", tas="dm", pr="dt", prsn="ds", prlp="dr").get( + variable + ) + + var_meta, global_attrs = _ahccd_variable_metadata(code, generation) + + ( + col_names, + col_spaces, + header_row, + ) = _ahccd_column_definitions(code) + + gen = {2: "Second", 3: "Third"}.get(generation) + if generation == 3 and code in {"dx", "dn", "dm"}: + station_meta = "ahccd_gen3_temperature.csv" + elif generation == 2 and code in {"dt", "ds", "dr"}: + station_meta = "ahccd_gen2_precipitation.csv" + + else: + raise NotImplementedError(f"Code '{code}' for generation {gen}.") + metadata_source = ( + Path(__file__).resolve().parent.joinpath("configs").joinpath(station_meta) + ) + + if "tas" in variable: + metadata = pd.read_csv(metadata_source, header=2) + metadata.columns = col_names.keys() + + elif "pr" in variable: + metadata = pd.read_csv(metadata_source, header=3) + metadata.columns = col_names.keys() + for index, row in metadata.iterrows(): + if isinstance(row["stnid"], str): + metadata.loc[index, "stnid"] = metadata.loc[index, "stnid"].replace( + " ", "" + ) + else: + raise KeyError(f"{variable} does not include 'pr' or 'tas'.") + + # Convert station .txt files to netcdf + for ff in Path(data_source).glob(f"{code}*.txt"): + outfile = output_dir.joinpath(ff.name.replace(".txt", ".nc")) + if not outfile.exists(): + logger.info(ff.name) + + station_id = ff.name[2:].split(".txt")[0] + try: + metadata_st = metadata[metadata["stnid"] == int(station_id)] + except ValueError: + metadata_st = metadata[metadata["stnid"] == station_id] + + if len(metadata_st) == 1: + ds_out = convert_ahccd_fwf_files( + ff, metadata_st, variable, generation=generation + ) + ds_out.attrs = global_attrs + + 
ds_out.to_netcdf(outfile, engine="h5netcdf") + else: + logger.warning( + f"metadata info for station {ff.name} not found : skipping" + ) + + # merge individual stations to single .nc file + # variable + ncfiles = list(output_dir.glob("*.nc")) + outfile = output_dir.parent.joinpath( + "merged_stations", f"ahccd_gen{generation}_{variable}.nc" + ) + + if not outfile.exists(): + logger.info("merging stations :", variable) + ds_ahccd = xr.open_mfdataset( + ncfiles, concat_dim="station", combine="nested" + ).load() + + for coord in ds_ahccd.coords: + # xarray object datatypes mix string and int (e.g. stnid) convert to string for merged nc files + # Do not apply to datetime object + if coord != "time" and ds_ahccd[coord].dtype == "O": + ds_ahccd[coord] = ds_ahccd[coord].astype(str) + + for v in ds_ahccd.data_vars: + # xarray object datatypes mix string and int (e.g. stnid) convert to string for merged nc files + # Do not apply to flag timeseries + if ds_ahccd[v].dtype == "O" and "flag" not in v: + logger.info(v) + ds_ahccd[v] = ds_ahccd[v].astype(str) + + # for clean_name, orig_name in col_names.items(): + # if clean_name in ["lat", "long"]: + # continue + # ds_ahccd[clean_name].attrs["long_name"] = orig_name + + outfile.parent.mkdir(parents=True, exist_ok=True) + ds_ahccd.to_netcdf(outfile, engine="h5netcdf", mode="w") + del ds_ahccd + for nc in outfile.parent.glob("*.nc"): + logger.info(nc) + ds = xr.open_dataset(nc) + logger.info(ds) diff --git a/miranda/eccc/data/ahccd_gen2_precipitation.csv b/miranda/preprocess/configs/ahccd_gen2_precipitation.csv similarity index 100% rename from miranda/eccc/data/ahccd_gen2_precipitation.csv rename to miranda/preprocess/configs/ahccd_gen2_precipitation.csv diff --git a/miranda/eccc/data/ahccd_gen3_temperature.csv b/miranda/preprocess/configs/ahccd_gen3_temperature.csv similarity index 100% rename from miranda/eccc/data/ahccd_gen3_temperature.csv rename to miranda/preprocess/configs/ahccd_gen3_temperature.csv diff --git 
a/miranda/preprocess/configs/eccc-homogenized_attrs.json b/miranda/preprocess/configs/eccc-homogenized_attrs.json index 539ad40c..b1861502 100644 --- a/miranda/preprocess/configs/eccc-homogenized_attrs.json +++ b/miranda/preprocess/configs/eccc-homogenized_attrs.json @@ -6,12 +6,7 @@ "Third": "Vincent, L.A., M.M. Hartwell and X.L. Wang, 2020: A Third Generation of Homogenized Temperature for Trend Analysis and Monitoring Changes in Canada’s Climate. Atmosphere-Ocean. https://doi.org/10.1080/07055900.2020.1765728" } }, - "_converter": true, "_miranda_version": true, - "_missing_values": [ - "-999", - "1e20" - ], "_product": { "generation": { "Second": "ECCC Adjusted and Homogenized Canadian Climate Data (AHCCD) version 2", @@ -21,6 +16,7 @@ "acknowledgement": "This data is provided by Environment and Climate Change Canada (ECCC).", "author": "Environment and Climate Change Canada (ECCC)", "contact": "info.cccs-ccsc@canada.ca", + "dataset_id": "d6813de6-b20a-46cc-8990-01862ae15c5f", "documentation": "https://www.canada.ca/en/environment-climate-change/services/climate-change/canadian-centre-climate-services/display-download/technical-documentation-adjusted-climate-data.html", "institution": "GovCan", "license": "https://climate.weather.gc.ca/prods_servs/attachment1_e.html", @@ -30,7 +26,82 @@ "processing_level": "adjusted", "realm": "atmos", "source": "AHCCD", - "table_date": "2023-03-23", + "table_date": "2023-08-03", "table_id": "ECCC" + }, + "variables": { + "dm": { + "NaN_value": -9999.9, + "_variable_name": "tas", + "cell_methods": "time: mean", + "comments": "Station data converted from Mean Temp (°C)", + "frequency": "day", + "grid_mapping": "regular_lon_lat", + "long_name": "Near-Surface Air Temperature", + "missing_flags": "M", + "original_field": "Mean Temp (°C)", + "units": "degC" + }, + "dn": { + "NaN_value": -9999.9, + "_variable_name": "tasmin", + "cell_methods": "time: minimum", + "comments": "Station data converted from Min Temp (°C)", + "frequency": 
"day", + "grid_mapping": "regular_lon_lat", + "long_name": "Daily Minimum Near-Surface Air Temperature", + "missing_flags": "M", + "original_field": "Min Temp (°C)", + "units": "degC" + }, + "dr": { + "NaN_value": -9999.99, + "_variable_name": "prlp", + "cell_methods": "time: mean", + "comments": "Station data converted from Total Rain (mm) using a density of 1000 kg/m³", + "frequency": "day", + "grid_mapping": "regular_lon_lat", + "long_name": "Liquid Precipitation", + "missing_flags": "M", + "original_field": "Total Rain (mm)", + "units": "mm" + }, + "ds": { + "NaN_value": -9999.99, + "_variable_name": "prsn", + "cell_methods": "time: mean", + "comments": "station data converted from Total Snow (cm) using a density of 100 kg/m³", + "frequency": "day", + "grid_mapping": "regular_lon_lat", + "long_name": "Snowfall Flux", + "missing_flags": "M", + "original_field": "Total Snow (cm)", + "units": "mm" + }, + "dt": { + "NaN_value": -9999.99, + "_variable_name": "pr", + "cell_methods": "time: mean", + "comments": "Station data converted from Total Precip (mm) using a density of 1000 kg/m³", + "frequency": "day", + "grid_mapping": "regular_lon_lat", + "long_name": "Precipitation", + "missing_flags": "M", + "original_field": "Total Precip (mm)", + "units": "mm" + }, + "dx": { + "NaN_value": -9999.9, + "_variable_name": "tasmax", + "cell_methods": "time: maximum", + "comments": "station data converted from Max Temp (°C)", + "frequency": "day", + "grid_mapping": "regular_lon_lat", + "long_name": "Daily Maximum Near-Surface Air Temperature", + "missing_flags": "M", + "original_field": "Max Temp (°C)", + "standard_name": "air_temperature", + "units": "degC" + } } } From 53fc8f01357c69d8d6318cc51699d399af54bcef Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Mon, 7 Aug 2023 10:14:23 -0400 Subject: [PATCH 09/33] working version of ahccd conversion --- miranda/preprocess/_eccc_homogenized.py | 37 ++- 
miranda/preprocess/_treatments.py | 1 - .../configs/ahccd_gen2_precipitation.csv | 302 +++++++++--------- .../configs/eccc-homogenized_attrs.json | 5 +- 4 files changed, 175 insertions(+), 170 deletions(-) diff --git a/miranda/preprocess/_eccc_homogenized.py b/miranda/preprocess/_eccc_homogenized.py index 01530156..f400b5b1 100644 --- a/miranda/preprocess/_eccc_homogenized.py +++ b/miranda/preprocess/_eccc_homogenized.py @@ -16,7 +16,7 @@ logging.config.dictConfig(LOGGING_CONFIG) logger = logging.Logger("miranda") -__all__ = ["convert_ahccd", "convert_ahccd_fwf_files"] +__all__ = ["convert_ahccd", "convert_ahccd_fwf_file"] def _ahccd_variable_metadata( @@ -55,16 +55,22 @@ def _ahccd_variable_metadata( header = metadata["Header"] # Conditional handling of global attributes based on generation for field in [f for f in header if f.startswith("_")]: - if isinstance(header[field], dict): + if isinstance(header[field], bool): + if header[field] and field[1:] == "variable": + header[field[1:]] = variable_name + + elif isinstance(header[field], dict): attr_treatment = header[field]["generation"] + if field in ["_citation", "_product"]: + for attribute, value in attr_treatment.items(): + if attribute == generation: + header[field[1:]] = value + else: raise AttributeError( f"Attribute treatment configuration for field `{field}` is not properly configured. Verify JSON." 
) - if field in ["_citation" "_product"]: - for attribute, value in attr_treatment.items(): - if attribute == generation: - header[field[1:]] = value + del header[field] return variable_meta, header @@ -141,7 +147,7 @@ def _ahccd_column_definitions( return column_names, column_spaces, header_row -def convert_ahccd_fwf_files( +def convert_ahccd_fwf_file( ff: Path | str, metadata: pd.DataFrame, variable: str, @@ -235,7 +241,7 @@ def convert_ahccd_fwf_files( } ) - ds.index = pd.to_datetime(time_ds) + ds.index = pd.to_datetime(time_ds) # noqa ds = ds.to_xarray().rename({"index": "time"}) ds_out = xr.Dataset(coords={"time": time1}) for v in ds.data_vars: @@ -277,6 +283,7 @@ def convert_ahccd( output_dir: str | Path, variable: str, generation: int, + merge: bool = False, ) -> None: """Convert Adjusted and Homogenized Canadian Climate Dataset files. @@ -286,6 +293,7 @@ def convert_ahccd( output_dir: str or Path variable: str generation: int + merge: bool Returns ------- @@ -299,7 +307,6 @@ def convert_ahccd( ) var_meta, global_attrs = _ahccd_variable_metadata(code, generation) - ( col_names, col_spaces, @@ -346,7 +353,7 @@ def convert_ahccd( metadata_st = metadata[metadata["stnid"] == station_id] if len(metadata_st) == 1: - ds_out = convert_ahccd_fwf_files( + ds_out = convert_ahccd_fwf_file( ff, metadata_st, variable, generation=generation ) ds_out.attrs = global_attrs @@ -356,10 +363,11 @@ def convert_ahccd( logger.warning( f"metadata info for station {ff.name} not found : skipping" ) + if not merge: + return # merge individual stations to single .nc file - # variable - ncfiles = list(output_dir.glob("*.nc")) + ncfiles = list(output_dir.glob(f"{code}*.nc")) outfile = output_dir.parent.joinpath( "merged_stations", f"ahccd_gen{generation}_{variable}.nc" ) @@ -383,11 +391,6 @@ def convert_ahccd( logger.info(v) ds_ahccd[v] = ds_ahccd[v].astype(str) - # for clean_name, orig_name in col_names.items(): - # if clean_name in ["lat", "long"]: - # continue - # 
ds_ahccd[clean_name].attrs["long_name"] = orig_name - outfile.parent.mkdir(parents=True, exist_ok=True) ds_ahccd.to_netcdf(outfile, engine="h5netcdf", mode="w") del ds_ahccd diff --git a/miranda/preprocess/_treatments.py b/miranda/preprocess/_treatments.py index 80d3fd45..9e667440 100644 --- a/miranda/preprocess/_treatments.py +++ b/miranda/preprocess/_treatments.py @@ -22,7 +22,6 @@ def basic_metadata_conversion( ------- xarray.Dataset """ - logging.info("Converting metadata.") header = metadata["Header"] # Static handling of version global attributes diff --git a/miranda/preprocess/configs/ahccd_gen2_precipitation.csv b/miranda/preprocess/configs/ahccd_gen2_precipitation.csv index ce59df01..6f0c0f3a 100644 --- a/miranda/preprocess/configs/ahccd_gen2_precipitation.csv +++ b/miranda/preprocess/configs/ahccd_gen2_precipitation.csv @@ -8,23 +8,23 @@ BC,ARMSTRONG HULLCAR,1160483,1912,1,1998,12,50.5,-119.216666666667,505,Yes BC,ATLIN,1200560,1906,1,2017,12,59.5666666666667,-133.7,674,No BC,BARKERVILLE,1090660,1888,1,2015,3,53.0691666666667,-121.514722222222,1283,No BC,BEAVERDELL NORTH,1130771,1926,1,2006,9,49.4783333333333,-119.047,838,Yes -BC,BELLA COOLA ,1060841,1899,1,2017,11,52.3875,-126.595833333333,36,Yes +BC,BELLA COOLA,1060841,1899,1,2017,11,52.3875,-126.595833333333,36,Yes BC,BIG CREEK,1080870,1904,1,1998,11,51.6672236111111,-123.073056944444,1175,No -BC,BLUE RIVER ,1160899,1929,1,2017,12,52.1290277777778,-119.289527777778,683,Yes +BC,BLUE RIVER,1160899,1929,1,2017,12,52.1290277777778,-119.289527777778,683,Yes BC,BRISCO,1171020,1924,1,2004,3,50.8205555555556,-116.258055555556,823,No BC,BRITANNIA BEACH FURRY CREEK,1041050,1914,1,2000,4,49.5838888888889,-123.223611111111,9,Yes BC,BURQUITLAM VANCOUVER GOLF COURSE,1101200,1926,1,2005,12,49.2516666666667,-122.876944444444,122,Yes BC,CAPE SCOTT,1031353,1921,1,2016,6,50.7822333333333,-128.427227777778,72,Yes BC,CAPE ST JAMES,1051350,1926,1,1992,8,51.9333333333333,-131.016666666667,89,No 
BC,CASSIAR,1191440,1954,1,1996,8,59.2833333333333,-129.833333333333,1078,No -BC,CELISTA,116146F ,1924,1,2004,7,50.9555555555556,-119.379444444444,515,Yes +BC,CELISTA,116146F,1924,1,2004,7,50.9555555555556,-119.379444444444,515,Yes BC,CHATHAM POINT,1021480,1932,1,2016,2,50.3331944444444,-125.445555555556,23,Yes -BC,COMOX ,1021830,1936,1,2017,12,49.7166666666667,-124.9,26,Yes +BC,COMOX,1021830,1936,1,2017,12,49.7166666666667,-124.9,26,Yes BC,CORTES ISLAND TIBER BAY,1021960,1919,1,2017,12,50.0713888888889,-124.949444444444,15,Yes -BC,CRANBROOK ,1152102,1909,1,2012,11,49.6122222222222,-115.781944444444,939,Yes +BC,CRANBROOK,1152102,1909,1,2012,11,49.6122222222222,-115.781944444444,939,Yes BC,CRESTON,1142160,1912,1,2015,6,49.0970555555556,-116.517833333333,597,No BC,DARFIELD,1162265,1914,1,2017,11,51.2973333333333,-120.182666666667,412,Yes -BC,DAWSON CREEK ,1182285,1952,1,2007,2,55.7416666666667,-120.181944444444,655,Yes +BC,DAWSON CREEK,1182285,1952,1,2007,2,55.7416666666667,-120.181944444444,655,Yes BC,DEASE LAKE,1192340,1945,1,2008,7,58.428335,-130.010556666667,807,No BC,DEER PARK,1142400,1924,1,1995,9,49.4166666666667,-118.05,485,No BC,DRYAD POINT,1062544,1933,1,2017,12,52.1850005555556,-128.112224444444,4,Yes @@ -33,12 +33,12 @@ BC,ESTEVAN POINT,1032730,1924,1,2017,12,49.3835,-126.550833333333,7,No BC,FALLS RIVER,1062790,1932,1,1992,10,53.9833333333333,-129.733333333333,18,No BC,FAUQUIER,1142820,1913,1,2015,6,49.8719444444444,-118.0675,490,No BC,FERNIE,1152850,1914,1,2017,12,49.4888888888889,-115.072222222222,1001,No -BC,FORT NELSON ,1192940,1938,1,2012,11,58.8363888888889,-122.597222222222,382,No +BC,FORT NELSON,1192940,1938,1,2012,11,58.8363888888889,-122.597222222222,382,No BC,FORT ST JAMES,1092970,1895,1,2017,12,54.4552802777778,-124.285556111111,686,No -BC,FORT ST JOHN ,1183000,1931,1,2012,12,56.2380555555556,-120.740277777778,695,Yes +BC,FORT ST JOHN,1183000,1931,1,2012,12,56.2380555555556,-120.740277777778,695,Yes BC,GERMANSEN 
LANDING,1183090,1952,1,2013,11,55.7855277777778,-124.701444444444,766,No BC,GLACIER NP ROGERS PASS,1173191,1909,1,2014,7,51.3009166666667,-117.516388888889,1323,Yes -BC,GOLDEN ,1173210,1908,1,2017,12,51.2983333333333,-116.981666666667,785,No +BC,GOLDEN,1173210,1908,1,2017,12,51.2983333333333,-116.981666666667,785,No BC,GRAND FORKS,1133270,1910,1,2008,3,49.0261666666667,-118.465666666667,532,Yes BC,GRASMERE,1153282,1896,1,1993,11,49.0833333333333,-115.066666666667,869,Yes BC,HAZELTON TEMLEHAN,1073347,1915,1,1997,4,55.2,-127.733333333333,122,Yes @@ -58,7 +58,7 @@ BC,MASSET AIRPORT,1054920,1900,1,2008,6,54.0226111111111,-132.117472222222,7,Yes BC,MCINNES ISLAND,1065010,1954,1,2017,12,52.2616666666667,-128.719444444444,26,No BC,MERRITT STP,1125079,1919,1,2017,12,50.1141677777778,-120.800834722222,609,Yes BC,MICA DAM,1175122,1962,1,2017,12,52.0530555555556,-118.585277777778,579,No -BC,NANAIMO CITY YARD,10253G0 ,1913,1,2017,12,49.1988888888889,-123.987777777778,114,Yes +BC,NANAIMO CITY YARD,10253G0,1913,1,2017,12,49.1988888888889,-123.987777777778,114,Yes BC,NASS CAMP,1075384,1924,1,2015,2,55.2375,-129.029444444444,290,Yes BC,NELSON NE,1145442,1904,1,2017,12,49.5861111111111,-117.206388888889,570,Yes BC,NEW DENVER,1145460,1924,1,2017,12,49.995835,-117.370285,570,No @@ -68,186 +68,186 @@ BC,OOTSA L SKINS L SPILLWAY,1085835,1926,1,2017,7,53.7721666666667,-125.99655555 BC,OSOYOOS WEST,1125865,1954,1,2009,9,49.0319444444444,-119.442777777778,297,Yes BC,PACHENA POINT,1035940,1925,1,2017,12,48.7227777777778,-125.097222222222,37,No BC,PEMBERTON AIRPORT,1086082,1913,1,1991,6,50.3056461111111,-122.734088888889,204,Yes -BC,PENTICTON ,1126150,1907,1,2012,5,49.4630555555556,-119.602222222222,344,Yes +BC,PENTICTON,1126150,1907,1,2012,5,49.4630555555556,-119.602222222222,344,Yes BC,PORT ALICE,1036240,1924,1,2016,4,50.3858361111111,-127.455286111111,21,No -BC,PORT HARDY ,1026270,1944,1,2013,6,50.6802777777778,-127.366111111111,22,No +BC,PORT 
HARDY,1026270,1944,1,2013,6,50.6802777777778,-127.366111111111,22,No BC,POWELL RIVER,1046390,1924,1,2007,7,49.8761111111111,-124.554166666667,52,No -BC,PRINCE GEORGE ,1096450,1913,1,2009,10,53.8908333333333,-122.678888888889,691,Yes -BC,PRINCE RUPERT ,1066481,1909,1,2006,3,54.2925,-130.444722222222,35,Yes -BC,PRINCETON ,1126510,1901,1,2017,12,49.4677777777778,-120.5125,700,Yes +BC,PRINCE GEORGE,1096450,1913,1,2009,10,53.8908333333333,-122.678888888889,691,Yes +BC,PRINCE RUPERT,1066481,1909,1,2006,3,54.2925,-130.444722222222,35,Yes +BC,PRINCETON,1126510,1901,1,2017,12,49.4677777777778,-120.5125,700,Yes BC,QUATSINO,1036570,1895,1,2017,12,50.5336138888889,-127.653335833333,8,No -BC,QUESNEL ,1096630,1900,1,2007,3,53.0261111111111,-122.51,545,Yes +BC,QUESNEL,1096630,1900,1,2007,3,53.0261111111111,-122.51,545,Yes BC,QUINSAM RIVER HATCHERY,1026639,1936,1,2017,12,50.0161111111111,-125.303888888889,46,Yes BC,REVELSTOKE,1176751,1898,1,1999,8,50.9533333333333,-118.166388888889,450,Yes BC,SAANICHTON,1016940,1914,1,2017,12,48.6216666666667,-123.418888888889,61,No -BC,SALMON RM ,1166R45 ,1911,1,2013,2,50.6855577777778,-119.233613611111,527,Yes -BC,SANDSPIT ,1057050,1949,1,2017,3,53.2538888888889,-131.813055555556,6,No +BC,SALMON RM,1166R45,1911,1,2013,2,50.6855577777778,-119.233613611111,527,Yes +BC,SANDSPIT,1057050,1949,1,2017,3,53.2538888888889,-131.813055555556,6,No BC,SEYMOUR FALLS,1107200,1928,1,2003,9,49.4402777777778,-122.971111111111,244,No BC,SHALALTH,1117215,1935,1,2004,4,50.7283333333333,-122.240555555556,244,Yes BC,SHAWNIGAN LAKE,1017230,1911,1,2017,12,48.6469472222222,-123.626408333333,138,No -BC,SMITHERS ,1077500,1922,1,2017,12,54.8247222222222,-127.182777777778,522,Yes +BC,SMITHERS,1077500,1922,1,2017,12,54.8247222222222,-127.182777777778,522,Yes BC,STAVE FALLS,1107680,1910,1,2004,8,49.2333333333333,-122.366666666667,110,No -BC,STEWART ,1067742,1911,1,2016,6,55.9361111111111,-129.985,7,Yes +BC,STEWART,1067742,1911,1,2016,6,55.9361111111111,-129.985,7,Yes 
BC,STILLWATER POWER HOUSE,1047770,1931,1,2007,7,49.7666666666667,-124.316666666667,7,No BC,TATLAYOKO LAKE,1088010,1928,1,2005,4,51.6747222222222,-124.405,870,No -BC,TERRACE ,1068130,1913,1,2013,1,54.4663888888889,-128.5775,217,Yes +BC,TERRACE,1068130,1913,1,2013,1,54.4663888888889,-128.5775,217,Yes BC,TLELL,1058190,1950,1,1999,1,53.5,-131.95,5,No -BC,TOFINO ,1038205,1942,1,2017,12,49.0822222222222,-125.772505555556,24,No +BC,TOFINO,1038205,1942,1,2017,12,49.0822222222222,-125.772505555556,24,No BC,UCLUELET KENNEDY CAMP,1038332,1958,1,2017,12,48.9452833333333,-125.527236111111,30,Yes -BC,VANCOUVER ,1108447,1896,1,2013,6,49.195,-123.181944444444,4,Yes +BC,VANCOUVER,1108447,1896,1,2013,6,49.195,-123.181944444444,4,Yes BC,VAVENBY,1168520,1913,1,2017,12,51.5761111111111,-119.778055555556,445,No BC,VERNON BELLA VISTA,1128553,1900,1,2015,6,50.2643611111111,-119.308861111111,427,Yes -BC,VICTORIA ,1018620,1899,1,2013,7,48.647225,-123.425833333333,19,Yes +BC,VICTORIA,1018620,1899,1,2013,7,48.647225,-123.425833333333,19,Yes BC,WARFIELD,1148700,1928,1,2002,12,49.1,-117.75,606,No BC,WASA,1158730,1924,1,2017,12,49.8239722222222,-115.630777777778,930,No BC,WESTWOLD,1168880,1921,1,2013,5,50.4688911111111,-119.750556388889,609,No -BC,WILLIAMS LAKE ,1098940,1936,1,2012,12,52.1830555555556,-122.054166666667,940,Yes +BC,WILLIAMS LAKE,1098940,1936,1,2012,12,52.1830555555556,-122.054166666667,940,Yes NU,ALERT,2400306,1950,1,2017,12,82.5,-62.3333333333333,65,Yes -NU,BAKER LAKE ,2300500,1949,1,2013,11,64.2988888888889,-96.0777777777778,18,No -YK,BURWASH ,2100182,1967,1,2015,2,61.3666666666667,-139.05,807,No -NU,BYRON BAY ,2400595,1957,1,1993,6,68.75,-109.066666666667,92,No -NU,CAMBRIDGE BAY ,2400600,1940,1,2015,2,69.1080555555556,-105.138333333333,27,No -NU,CAPE DORSET ,2400635,1932,1,2014,11,64.2302777777778,-76.525,50,Yes -NU,CAPE DYER ,2400654,1960,1,1993,3,66.5833333333333,-61.6166666666667,393,No +NU,BAKER LAKE,2300500,1949,1,2013,11,64.2988888888889,-96.0777777777778,18,No 
+YK,BURWASH,2100182,1967,1,2015,2,61.3666666666667,-139.05,807,No +NU,BYRON BAY,2400595,1957,1,1993,6,68.75,-109.066666666667,92,No +NU,CAMBRIDGE BAY,2400600,1940,1,2015,2,69.1080555555556,-105.138333333333,27,No +NU,CAPE DORSET,2400635,1932,1,2014,11,64.2302777777778,-76.525,50,Yes +NU,CAPE DYER,2400654,1960,1,1993,3,66.5833333333333,-61.6166666666667,393,No NU,CAPE HOOPER,2400660,1958,1,2007,9,68.4725,-66.8152777777778,390,No -NT,CAPE PARRY ,2200675,1960,1,1993,3,70.1666666666667,-124.716666666667,87,No +NT,CAPE PARRY,2200675,1960,1,1993,3,70.1666666666667,-124.716666666667,87,No YK,CARMACKS,2100300,1964,1,2008,2,62.1,-136.3,525,No -NU,CHESTERFIELD INLET ,2300707,1931,1,2014,11,63.3469444444444,-90.7311111111111,10,Yes +NU,CHESTERFIELD INLET,2300707,1931,1,2014,11,63.3469444444444,-90.7311111111111,10,Yes NU,CLINTON POINT,2300750,1957,1,1993,6,69.5833333333333,-120.8,101,No -NU,CLYDE ,2400800,1946,1,2002,6,70.4861111111111,-68.5166666666667,27,No -NU,CORAL HARBOUR ,2301000,1945,1,2015,5,64.1933333333333,-83.3594444444445,64,No -YK,DAWSON ,2100402,1901,1,2015,2,64.0430555555556,-139.127777777778,370,Yes +NU,CLYDE,2400800,1946,1,2002,6,70.4861111111111,-68.5166666666667,27,No +NU,CORAL HARBOUR,2301000,1945,1,2015,5,64.1933333333333,-83.3594444444445,64,No +YK,DAWSON,2100402,1901,1,2015,2,64.0430555555556,-139.127777777778,370,Yes NU,DEWAR LAKES,2401030,1958,1,1993,3,68.65,-71.1666666666667,527,No YK,DRURY CREEK,2100460,1970,1,2009,4,62.2019444444444,-134.39,609,No NU,EUREKA,2401200,1948,1,2016,2,79.9833333333333,-85.9333333333333,10,No -NT,FORT GOOD HOPE ,2201400,1945,1,2014,11,66.2408333333333,-128.650833333333,82,No -NT,FORT MCPHERSON ,2201601,1932,1,2014,11,67.4077777777778,-134.860277777778,35,Yes +NT,FORT GOOD HOPE,2201400,1945,1,2014,11,66.2408333333333,-128.650833333333,82,No +NT,FORT MCPHERSON,2201601,1932,1,2014,11,67.4077777777778,-134.860277777778,35,Yes NT,FORT RELIANCE,2201903,1949,1,2007,8,62.7113888888889,-109.168333333333,168,Yes -NT,FORT RESOLUTION 
,2202000,1931,1,2014,11,61.1808333333333,-113.689722222222,160,No -NT,FORT SIMPSON ,2202101,1898,1,2014,10,61.7602777777778,-121.236666666667,169,Yes -NT,FORT SMITH ,2202200,1915,1,2014,11,60.0202777777778,-111.961944444444,205,Yes +NT,FORT RESOLUTION,2202000,1931,1,2014,11,61.1808333333333,-113.689722222222,160,No +NT,FORT SIMPSON,2202101,1898,1,2014,10,61.7602777777778,-121.236666666667,169,Yes +NT,FORT SMITH,2202200,1915,1,2014,11,60.0202777777778,-111.961944444444,205,Yes NU,FOX FIVE,2400570,1959,1,2007,9,67.5355555555556,-63.7888888888889,584,No -NU,GLADMAN POINT ,2402340,1957,1,1992,7,68.6666666666667,-97.8,14,No +NU,GLADMAN POINT,2402340,1957,1,1992,7,68.6666666666667,-97.8,14,No YK,HAINES JUNCTION,2100631,1945,1,2008,9,60.7495444444445,-137.50525,596,Yes -NU,HALL BEACH ,2402350,1957,1,2014,12,68.7758333333333,-81.2425,8,No -NT,HAY RIVER ,2202400,1909,1,2014,9,60.8397222222222,-115.782777777778,166,Yes +NU,HALL BEACH,2402350,1957,1,2014,12,68.7758333333333,-81.2425,8,No +NT,HAY RIVER,2202400,1909,1,2014,9,60.8397222222222,-115.782777777778,166,Yes NT,INUVIK,2202578,1957,1,2007,11,68.3166666666667,-133.516666666667,103,Yes NU,IQALUIT,2402592,1946,1,2007,11,63.7472222222222,-68.5444444444445,34,Yes -NU,JENNY LIND ISLAND ,2302650,1958,1,1992,7,68.65,-101.733333333333,18,No -YK,KOMAKUK BEACH ,2100685,1959,1,1993,6,69.5833333333333,-140.183333333333,7,No -NU,KUGAARUK ,2303092,1957,1,2012,8,68.5405555555556,-89.7972222222222,17,Yes -NU,KUGLUKTUK ,2300902,1931,1,2014,12,67.8166666666667,-115.143888888889,23,Yes -NU,LADY FRANKLIN POINT ,2302680,1958,1,1993,3,68.5,-113.216666666667,16,No +NU,JENNY LIND ISLAND,2302650,1958,1,1992,7,68.65,-101.733333333333,18,No +YK,KOMAKUK BEACH,2100685,1959,1,1993,6,69.5833333333333,-140.183333333333,7,No +NU,KUGAARUK,2303092,1957,1,2012,8,68.5405555555556,-89.7972222222222,17,Yes +NU,KUGLUKTUK,2300902,1931,1,2014,12,67.8166666666667,-115.143888888889,23,Yes +NU,LADY FRANKLIN POINT,2302680,1958,1,1993,3,68.5,-113.216666666667,16,No 
NU,LONGSTAFF BLUFF,2402684,1958,1,1991,6,68.8986111111111,-75.1408333333333,161,No -NU,LUPIN,230N002 ,1959,1,2007,7,65.7552916666667,-111.245841666667,488,Yes +NU,LUPIN,230N002,1959,1,2007,7,65.7552916666667,-111.245841666667,488,Yes NU,MACKAR INLET,2402686,1958,1,1992,5,68.3,-85.6666666666667,395,No -YK,MAYO ,2100700,1925,1,2013,11,63.6166666666667,-135.866666666667,504,No -NT,MOULD BAY,250M001 ,1948,1,2007,11,76.2375166666667,-119.347233333333,2,Yes -NU,NANISIVIK ,2402730,1938,1,2010,12,72.9833333333333,-84.6166666666667,642,Yes +YK,MAYO,2100700,1925,1,2013,11,63.6166666666667,-135.866666666667,504,No +NT,MOULD BAY,250M001,1948,1,2007,11,76.2375166666667,-119.347233333333,2,Yes +NU,NANISIVIK,2402730,1938,1,2010,12,72.9833333333333,-84.6166666666667,642,Yes NT,NICHOLSON PENINSULA,2202750,1958,1,1993,6,69.9333333333333,-128.966666666667,89,No -NT,NORMAN WELLS ,2202800,1943,1,2012,10,65.2825,-126.800277777778,73,No -YK,OLD CROW ,2100800,1952,1,2015,2,67.5705555555556,-139.839166666667,251,No +NT,NORMAN WELLS,2202800,1943,1,2012,10,65.2825,-126.800277777778,73,No +YK,OLD CROW,2100800,1952,1,2015,2,67.5705555555556,-139.839166666667,251,No YK,PELLY RANCH,2100880,1952,1,2015,3,62.8166666666667,-137.366666666667,454,No NU,RESOLUTE CARS,2403500,1948,1,2014,11,74.7169444444445,-94.9694444444445,67,No YK,ROSS RIVER YTG,2100941,1967,1,2008,2,61.9833333333333,-132.45,698,Yes -NT,SACHS HARBOUR ,2503650,1956,1,2013,2,72,-125.266666666667,86,No -NU,SHEPHERD BAY ,2303685,1957,1,1993,3,68.8166666666667,-93.4333333333333,43,No -YK,SHINGLE POINT ,2100950,1957,1,1993,3,68.95,-137.216666666667,49,No +NT,SACHS HARBOUR,2503650,1956,1,2013,2,72,-125.266666666667,86,No +NU,SHEPHERD BAY,2303685,1957,1,1993,3,68.8166666666667,-93.4333333333333,43,No +YK,SHINGLE POINT,2100950,1957,1,1993,3,68.95,-137.216666666667,49,No YK,SWIFT RIVER,2101081,1967,1,2008,2,60,-131.183333333333,891,No -YK,TESLIN ,2101100,1944,1,2013,12,60.1741388888889,-132.735888888889,705,No 
+YK,TESLIN,2101100,1944,1,2013,12,60.1741388888889,-132.735888888889,705,No YK,TUCHITUA,2101135,1967,1,2014,9,60.9333333333333,-129.216666666667,724,No NT,TUKTOYAKTUK,2203910,1957,1,1993,6,69.45,-133,18,No -NT,TULITA ,2201700,1904,1,2014,12,64.9086111111111,-125.568333333333,101,No -NT,ULUKHAKTOK ,2502501,1941,1,2010,6,70.7627777777778,-117.806111111111,36,Yes -YK,WATSON LAKE ,2101200,1939,1,2014,12,60.1165,-128.822333333333,687,No -YK,WHITEHORSE ,2101300,1942,1,2012,12,60.7095,-135.068833333333,706,No +NT,TULITA,2201700,1904,1,2014,12,64.9086111111111,-125.568333333333,101,No +NT,ULUKHAKTOK,2502501,1941,1,2010,6,70.7627777777778,-117.806111111111,36,Yes +YK,WATSON LAKE,2101200,1939,1,2014,12,60.1165,-128.822333333333,687,No +YK,WHITEHORSE,2101300,1942,1,2012,12,60.7095,-135.068833333333,706,No NT,WRIGLEY ,2204000,1944,1,2014,10,63.2094444444445,-123.436666666667,149,No NT,YELLOWKNIFE ,2204100,1943,1,2013,1,62.4627777777778,-114.440277777778,206,No NT,YOHIN,2204300,1957,1,2007,9,61.2419444444444,-123.741666666667,204,No -AB,ATHABASCA,3060L20 ,1918,1,2017,12,54.7222230555556,-113.2880575,515,Yes +AB,ATHABASCA,3060L20,1918,1,2017,12,54.7222230555556,-113.2880575,515,Yes AB,BANFF,3050519,1894,1,2007,11,51.1933583333333,-115.552236111111,1397,Yes AB,BEAVER MINES,3050600,1913,1,2012,3,49.4672277777778,-114.176955555556,1257,No AB,BEAVERLODGE,3070600,1916,1,2007,11,55.1966672222222,-119.396413888889,745,Yes -AB,CALGARY ,3031093,1885,1,2012,7,51.1138888888889,-114.020277777778,1084,No +AB,CALGARY,3031093,1885,1,2012,7,51.1138888888889,-114.020277777778,1084,No AB,CALMAR,3011120,1915,1,2016,12,53.2897241666667,-113.863057777778,720,No AB,CAMPSIE,3061200,1910,1,2013,10,54.1322227777778,-114.677778888889,671,No AB,CAMROSE,3011240,1946,1,2007,11,53.0347222222222,-112.814166666667,739,No AB,CARWAY,3031400,1915,1,2011,11,48.999725,-113.376111111111,1354,No -AB,CLARESHOLM MEADOW CREEK,3031F5F ,1913,1,2005,3,49.9375222222222,-113.737519444444,1035,No -AB,COLD LAKE 
,3081680,1926,1,2017,12,54.4166666666667,-110.283333333333,541,Yes +AB,CLARESHOLM MEADOW CREEK,3031F5F,1913,1,2005,3,49.9375222222222,-113.737519444444,1035,No +AB,COLD LAKE,3081680,1926,1,2017,12,54.4166666666667,-110.283333333333,541,Yes AB,CORONATION,3011887,1928,1,2007,11,52.0741666666667,-111.449444444444,791,Yes -AB,CROWSNEST,3051R4R ,1913,1,2007,11,49.627525,-114.48195,1303,Yes +AB,CROWSNEST,3051R4R,1913,1,2007,11,49.627525,-114.48195,1303,Yes AB,DRUMHELLER ANDREW,3022136,1954,1,2008,3,51.4666666666667,-112.866666666667,719,No -AB,EDMONTON ,3012205,1883,1,2012,4,53.3166666666667,-113.583333333333,723,Yes +AB,EDMONTON,3012205,1883,1,2012,4,53.3166666666667,-113.583333333333,723,Yes AB,EDSON,3062246,1920,1,2007,11,53.5802797222222,-116.453335277778,927,Yes AB,ELK POINT,3012280,1913,1,1997,6,53.8833333333333,-111.066666666667,605,No AB,ENILDA-BERG,3062427,1932,1,2005,4,55.4166666666667,-116.3,591,Yes AB,FAIRVIEW THREE FOX FARM,3072539,1932,1,1999,12,56.0833333333333,-118.533333333333,604,Yes -AB,FORT CHIPEWYAN ,3072658,1884,1,2007,8,58.7666666666667,-111.116666666667,232,Yes -AB,FORT MCMURRAY ,3062693,1920,1,2007,11,56.65,-111.216666666667,369,Yes +AB,FORT CHIPEWYAN,3072658,1884,1,2007,8,58.7666666666667,-111.116666666667,232,Yes +AB,FORT MCMURRAY,3062693,1920,1,2007,11,56.65,-111.216666666667,369,Yes AB,FORT VERMILION,3072723,1909,1,2007,11,58.3823055555556,-116.040166666667,289,Yes AB,GLEICHEN,3032800,1903,1,2006,3,50.8833333333333,-113.05,905,No -AB,GRANDE PRAIRIE ,3072920,1931,1,2013,9,55.1797222222222,-118.885,669,Yes +AB,GRANDE PRAIRIE,3072920,1931,1,2013,9,55.1797222222222,-118.885,669,Yes AB,HIGHWOOD AU,3053250,1903,1,2011,9,50.5511111111111,-114.370555555556,1580,Yes -AB,HINTON VALLEY,306A009 ,1917,1,2017,12,53.40381,-117.537620277778,1011,Yes +AB,HINTON VALLEY,306A009,1917,1,2017,12,53.40381,-117.537620277778,1011,Yes AB,JASPER WARDEN,3053536,1936,1,2007,11,52.9263888888889,-118.029722222222,1020,Yes 
AB,JENNER,3023560,1916,1,2008,1,50.7222277777778,-111.195852777778,755,No AB,KEG RIVER,3073641,1936,1,2009,1,57.75,-117.616666666667,405,Yes AB,LACOMBE,3023722,1908,1,2007,11,52.4488905555556,-113.755834722222,860,Yes -AB,LETHBRIDGE ,3033880,1902,1,2007,8,49.6302777777778,-112.799722222222,929,Yes -AB,MEDICINE HAT ,3034480,1886,1,2006,5,50.0188888888889,-110.720833333333,717,No +AB,LETHBRIDGE,3033880,1902,1,2007,8,49.6302777777778,-112.799722222222,929,Yes +AB,MEDICINE HAT,3034480,1886,1,2006,5,50.0188888888889,-110.720833333333,717,No AB,MOUNTAIN VIEW,3034720,1913,1,2006,3,49.1269555555556,-113.630016666667,1339,No AB,OLDS,3024920,1914,1,2015,6,51.7833333333333,-114.1,1040,No AB,ONEFOUR,3044923,1928,1,2007,10,49.1166666666667,-110.466666666667,935,Yes -AB,PEACE RIVER ,3075040,1908,1,2014,5,56.2269444444444,-117.447222222222,571,Yes +AB,PEACE RIVER,3075040,1908,1,2014,5,56.2269444444444,-117.447222222222,571,Yes AB,PINCHER CREEK,3035206,1915,1,2007,11,49.5205555555556,-113.997222222222,1190,Yes AB,RANFURLY 2NW,3015405,1905,1,2014,11,53.4166666666667,-111.733333333333,673,Yes AB,ROCKY MTN HOUSE,3015523,1917,1,2007,11,52.4213905555556,-114.912223055556,988,Yes AB,SCOTFIELD,3025770,1913,1,2007,10,51.5833555555556,-111.363611666667,762,Yes AB,SION,3015960,1906,1,2004,12,53.8833333333333,-114.116666666667,701,No -AB,SLAVE LAKE ,3065999,1925,1,2007,8,55.2833333333333,-114.783333333333,583,Yes +AB,SLAVE LAKE,3065999,1925,1,2007,8,55.2833333333333,-114.783333333333,583,Yes AB,STETTLER NORTH,3016119,1919,1,2001,8,52.3333333333333,-112.716666666667,821,Yes AB,VAUXHALL,3036682,1914,1,2007,11,50.05,-112.133333333333,779,Yes AB,WABASCA,3076908,1915,1,2009,1,55.9666666666667,-113.833333333333,545,Yes -AB,WHITECOURT ,3067372,1943,1,2009,5,54.1438888888889,-115.786666666667,782,Yes +AB,WHITECOURT,3067372,1943,1,2009,5,54.1438888888889,-115.786666666667,782,Yes SK,ANEROID,4020160,1922,1,2005,4,49.7166666666667,-107.3,754,No 
SK,BANGOR,4010400,1951,1,2005,2,50.9,-102.283333333333,526,No -SK,BUFFALO NARROWS ,4060982,1962,1,2012,11,55.8333333333333,-108.433333333333,440,Yes +SK,BUFFALO NARROWS,4060982,1962,1,2012,11,55.8333333333333,-108.433333333333,440,Yes SK,CEYLON,4011441,1922,1,2002,12,49.3833333333333,-104.65,753,Yes SK,CHAPLIN,4021520,1904,1,1995,9,50.4666666666667,-106.65,672,No SK,COLLINS BAY CAMECO,4061632,1965,1,2017,12,58.1833333333333,-103.7,490,Yes SK,COTE,4011846,1913,1,2006,3,51.5166666666667,-101.783333333333,450,Yes SK,CREE LAKE,4061861,1962,1,1993,8,57.35,-107.133333333333,495,Yes SK,DAVIDSON,4012120,1922,1,2005,10,51.2666666666667,-105.983333333333,619,No -SK,ESTEVAN ,4012400,1902,1,2015,2,49.2166666666667,-102.966666666667,581,Yes +SK,ESTEVAN,4012400,1902,1,2015,2,49.2166666666667,-102.966666666667,581,Yes SK,HIGH POINT,4023240,1929,1,2017,7,50.9786127777778,-107.935278611111,645,No SK,HUDSON BAY,4083323,1943,1,2013,12,52.8833333333333,-102.583333333333,422,Yes SK,INDIAN HEAD,4013480,1895,1,2007,11,50.55,-103.65,579,No SK,ISLAND FALLS,4063560,1931,1,2004,9,55.5333333333333,-102.35,299,No SK,KELLIHER,4013660,1908,1,2017,12,51.2574166666667,-103.753027777778,676,Yes SK,KEY LAKE,4063755,1977,1,2017,12,57.25,-105.616666666667,509,No -SK,KINDERSLEY ,4043900,1942,1,2013,11,51.5166666666667,-109.183333333333,694,Yes +SK,KINDERSLEY,4043900,1942,1,2013,11,51.5166666666667,-109.183333333333,694,Yes SK,KLINTONEL,4024080,1911,1,1994,1,49.6833333333333,-108.916666666667,1074,No -SK,LA RONGE ,4064150,1923,1,2013,10,55.15,-105.266666666667,379,Yes -SK,LEADER AIRPORT,402DAF0 ,1923,1,2007,11,50.9094638888889,-109.501391666667,676,Yes +SK,LA RONGE,4064150,1923,1,2013,10,55.15,-105.266666666667,379,Yes +SK,LEADER AIRPORT,402DAF0,1923,1,2007,11,50.9094638888889,-109.501391666667,676,Yes SK,LOON LAKE EPF,4064600,1930,1,2005,10,54.05,-109.1,543,Yes SK,MANOR,4014913,1922,1,2004,7,49.6166666666667,-102.1,633,Yes SK,MELFORT,4055079,1910,1,2007,11,52.8166666666667,-104.6,490,Yes SK,MOOSE 
JAW,4015322,1895,1,2007,11,50.3316805555556,-105.537508333333,577,Yes SK,MOOSOMIN,4015360,1900,1,2000,9,50.1333333333333,-101.666666666667,576,No -SK,NIPAWIN ,4075518,1911,1,2005,9,53.3333333333333,-104,372,Yes +SK,NIPAWIN,4075518,1911,1,2005,9,53.3333333333333,-104,372,Yes SK,NORTH BATTLEFORD,4045605,1894,1,2007,11,52.7666666666667,-108.25,548,Yes SK,OUTLOOK,4055736,1915,1,2007,11,51.4833333333333,-107.05,541,Yes SK,PASWEGIN,4015960,1951,1,2003,9,51.9833333333333,-103.916666666667,533,No SK,PELLY,4086000,1952,1,2016,3,52.0833333333333,-101.866666666667,509,No SK,PILGER,4056120,1913,1,2011,9,52.4166666666667,-105.15,552,No -SK,PRINCE LBERT ,4056240,1889,1,2013,11,53.2166666666667,-105.666666666667,428,Yes -SK,REGINA ,4016560,1898,1,2007,11,50.4333333333333,-104.666666666667,577,No +SK,PRINCE LBERT,4056240,1889,1,2013,11,53.2166666666667,-105.666666666667,428,Yes +SK,REGINA,4016560,1898,1,2007,11,50.4333333333333,-104.666666666667,577,No SK,SASKATOON DIEFENBAKER ,4057120,1900,1,2007,11,52.1666666666667,-106.716666666667,504,No SK,SCOTT,4047241,1911,1,2007,11,52.35974,-108.834723333333,660,Yes SK,SWIFT CURRENT,4028060,1886,1,2007,11,50.2666666666667,-107.733333333333,825,Yes SK,TONKIN,4019082,1941,1,2016,1,51.2,-102.233333333333,527,Yes -SK,URANIUM CITY,406QLD0 ,1953,1,2007,10,59.5666666666667,-108.483333333333,318,Yes +SK,URANIUM CITY,406QLD0,1953,1,2007,10,59.5666666666667,-108.483333333333,318,Yes SK,VAL-MARIE,4038400,1937,1,2010,5,49.3700138888889,-107.847525,808,No SK,WASECA,4048520,1908,1,2014,12,53.1308555555556,-109.403902777778,638,No SK,WASKESIU LAKE,4068559,1966,1,2007,11,53.9166666666667,-106.066666666667,569,Yes @@ -258,13 +258,13 @@ MB,ARBORG,5030080,1951,1,2016,6,50.9333333333333,-97.0833333333333,224,No MB,BERENS RIVER,5030203,1905,1,2013,11,52.3597366666667,-97.0219533333333,222,Yes MB,BIRTLE,5010240,1917,1,2000,11,50.4333333333333,-101.05,522,No MB,BISSETT,5030282,1933,1,1997,6,51.0333333333333,-95.7,259,Yes -MB,BRANDON 
,5010480,1890,1,2012,12,49.91,-99.9519444444445,409,Yes +MB,BRANDON,5010480,1890,1,2012,12,49.91,-99.9519444444445,409,Yes MB,CHURCHILL,5060606,1932,1,2015,12,58.7333333333333,-94.0666666666667,29,Yes MB,CYPRESS RIVER,5010640,1948,1,2012,3,49.55,-99.0833333333333,374,No MB,DAUPHIN,5040681,1911,1,2007,10,51.1003888888889,-100.056888888889,305,Yes MB,EMERSON,5020882,1942,1,2003,1,49,-97.2375,242,Yes MB,FLIN FLON,5050920,1927,1,2017,12,54.7666666666667,-101.883333333333,320,No -MB,GILLAM ,5061001,1943,1,2014,10,56.3575,-94.7105555555556,145,Yes +MB,GILLAM,5061001,1943,1,2014,10,56.3575,-94.7105555555556,145,Yes MB,GIMLI,5031039,1944,1,2008,3,50.6333333333333,-97.0166666666667,223,Yes MB,GRAND RAPIDS HYDRO,5031111,1962,1,2017,12,53.1580558333333,-99.2833444444444,223,Yes MB,GREAT FALLS,5031200,1923,1,2002,12,50.4666666666667,-96,249,No @@ -273,17 +273,17 @@ MB,LANGRUTH WEST,5041535,1958,1,2005,2,50.4138888888889,-98.8027777777778,264,Ye MB,LYNN LAKE,5061648,1952,1,2007,11,56.8638888888889,-101.076111111111,357,Yes MB,MORDEN,5021849,1888,1,2007,11,49.1876388888889,-98.0839444444444,298,Yes MB,NEEPAWA MURRAY 6 SOUTHWEST,5042004,1881,1,2008,11,50.15,-99.5666666666667,412,Yes -MB,NINETTE,50220M0 ,1916,1,1996,5,49.4166666666667,-99.65,419,Yes +MB,NINETTE,50220M0,1916,1,1996,5,49.4166666666667,-99.65,419,Yes MB,NORWAY HOUSE,5062045,1896,1,2007,11,53.9666666666667,-97.85,224,Yes MB,PIERSON,5012080,1933,1,2007,3,49.1833333333333,-101.266666666667,469,No MB,PINAWA WNRE,5032162,1915,1,2017,3,50.1805555555556,-96.0583333333333,267,Yes MB,PORTAGE LA PRAIRIE,5012321,1942,1,2017,12,49.95,-98.2666666666667,259,Yes MB,SPRAGUE,5022759,1916,1,2007,11,49.0236111111111,-95.5983358333333,329,Yes MB,STEINBACH,5022780,1956,1,2005,3,49.5333333333333,-96.7666666666667,254,No -MB,SWAN RIVER,504K80K ,1960,1,2007,10,52.1149722222222,-101.232916666667,335,Yes -MB,THE PAS ,5052880,1910,1,2014,11,53.9666666666667,-101.1,270,Yes +MB,SWAN 
RIVER,504K80K,1960,1,2007,10,52.1149722222222,-101.232916666667,335,Yes +MB,THE PAS,5052880,1910,1,2014,11,53.9666666666667,-101.1,270,Yes MB,THOMPSON ,5062922,1967,1,2014,11,55.8033333333333,-97.8625,222,No -MB,WINNIPEG RICHARDSON ,5023222,1872,1,2007,11,49.9166666666667,-97.2333333333333,239,Yes +MB,WINNIPEG RICHARDSON,5023222,1872,1,2007,11,49.9166666666667,-97.2333333333333,239,Yes ON,AMHERSTBURG,6130257,1917,1,2017,12,42.1033583333333,-83.0944633333333,182,Yes ON,ARMSTRONG JELLIEN,6040330,1939,1,1992,10,50.25,-89.1,341,Yes ON,ATIKOKAN MARMION,6020384,1919,1,2007,7,48.8,-91.5833333333333,442,Yes @@ -293,20 +293,20 @@ ON,BIG TROUT LAKE,6010738,1939,1,1992,10,53.8333333333333,-89.8666666666667,224, ON,BISCOTASING,6060773,1914,1,2000,10,47.3,-82.1,407,No ON,BROCKVILLE PCC,6100971,1915,1,2017,12,44.6,-75.6666666666667,96,Yes ON,CAMERON FALLS,6041109,1924,1,1998,8,49.15,-88.35,229,No -ON,CHAPLEAU ,6061361,1914,1,2015,3,47.82,-83.3466666666667,447,Yes +ON,CHAPLEAU,6061361,1914,1,2015,3,47.82,-83.3466666666667,447,Yes ON,CORNWALL,6101874,1951,1,2017,12,45.0155783333333,-74.7489,64,No -ON,DRYDEN ,6032119,1914,1,2005,1,49.8333333333333,-92.75,413,Yes -ON,EARLTON ,6072225,1939,1,2005,1,47.7,-79.85,243,No -ON,FORT FRANCES ,6022476,1912,1,2011,5,48.65,-93.4333333333333,342,Yes -ON,GERALDTON ,6042716,1950,1,2015,2,49.7828027777778,-86.9305694444445,349,Yes +ON,DRYDEN,6032119,1914,1,2005,1,49.8333333333333,-92.75,413,Yes +ON,EARLTON,6072225,1939,1,2005,1,47.7,-79.85,243,No +ON,FORT FRANCES,6022476,1912,1,2011,5,48.65,-93.4333333333333,342,Yes +ON,GERALDTON,6042716,1950,1,2015,2,49.7828027777778,-86.9305694444445,349,Yes ON,GODFREY,6102857,1924,1,2003,5,44.5666666666667,-76.6333333333333,160,Yes -ON,GORE BAY ,6092925,1916,1,1994,1,45.8833333333333,-82.5666666666667,194,Yes +ON,GORE BAY,6092925,1916,1,1994,1,45.8833333333333,-82.5666666666667,194,Yes ON,HALIBURTON,6163171,1883,1,2017,12,45.0322483333333,-78.531115,330,Yes -ON,HAMILTON 
,6153194,1866,1,2011,12,43.1716866666667,-79.9341766666667,238,Yes -ON,HORNEPAYNE ,6053575,1917,1,1995,7,49.2,-84.7666666666667,335,Yes +ON,HAMILTON,6153194,1866,1,2011,12,43.1716866666667,-79.9341766666667,238,Yes +ON,HORNEPAYNE,6053575,1917,1,1995,7,49.2,-84.7666666666667,335,Yes ON,IROQUOIS FALLS,6073810,1913,1,1998,12,48.75,-80.6666666666667,259,No -ON,KAPUSKASING ,6073975,1918,1,2014,9,49.4138888888889,-82.4675,227,Yes -ON,KENORA ,6034075,1900,1,2013,2,49.7902791666667,-94.3652786111111,406,Yes +ON,KAPUSKASING,6073975,1918,1,2014,9,49.4138888888889,-82.4675,227,Yes +ON,KENORA,6034075,1900,1,2013,2,49.7902791666667,-94.3652786111111,406,Yes ON,KINGSTON PUMPING STATION,6104175,1872,1,2007,12,44.2439033333333,-76.4805666666667,77,Yes ON,LANSDOWNE HOUSE,6014350,1941,1,1989,6,52.2333333333333,-87.8833333333333,255,No ON,LONDON AIRPORT,6144475,1883,1,2017,4,43.0330555555556,-81.1511111111111,278,Yes @@ -315,68 +315,68 @@ ON,MADAWASKA,6084770,1916,1,2000,11,45.5,-77.9833333333333,316,No ON,MINE CENTRE SOUTHWEST,6025205,1914,1,2017,12,48.7597388888889,-92.6227777777778,361,Yes ON,MOOSONEE,6075425,1892,1,2017,12,51.2666666666667,-80.65,10,Yes ON,MORRISBURG,6105460,1913,1,2008,12,44.9236183333333,-75.1883433333333,82,No -ON,NORTH BAY ,6085700,1915,1,2013,1,46.3636111111111,-79.4227777777778,370,Yes +ON,NORTH BAY,6085700,1915,1,2013,1,46.3636111111111,-79.4227777777778,370,Yes ON,ORANGEVILLE MOE,6155790,1887,1,2015,12,43.9183516666667,-80.0864066666667,412,Yes ON,ORILLIA BRAIN,6115811,1871,1,2017,12,44.6027777777778,-79.4388888888889,250,Yes ON,OTTAWA,6105976,1890,1,2017,12,45.3833333333333,-75.7166666666667,79,No ON,OWEN SOUND MOE,6116132,1879,1,2007,12,44.5833333333333,-80.9333333333333,179,Yes -ON,PELEE ISLAND ,6136336,1888,1,1994,9,41.7833333333333,-82.6833333333333,174,Yes -ON,PETERBOROUGH ,6166418,1866,1,2007,5,44.2333333333333,-78.3666666666667,191,Yes -ON,PICKLE LAKE ,6016527,1933,1,2012,7,51.4463888888889,-90.2141666666667,386,Yes -ON,RED LAKE 
,6016975,1939,1,2012,5,51.0669444444445,-93.7930555555556,386,No +ON,PELEE ISLAND,6136336,1888,1,1994,9,41.7833333333333,-82.6833333333333,174,Yes +ON,PETERBOROUGH,6166418,1866,1,2007,5,44.2333333333333,-78.3666666666667,191,Yes +ON,PICKLE LAKE,6016527,1933,1,2012,7,51.4463888888889,-90.2141666666667,386,Yes +ON,RED LAKE,6016975,1939,1,2012,5,51.0669444444445,-93.7930555555556,386,No ON,RIDGETOWN,6137149,1883,1,1997,4,42.45,-81.8833333333333,206,Yes -ON,SAULT STE MARIE ,6057592,1945,1,2012,3,46.4833333333333,-84.5094444444444,192,Yes -ON,SIOUX LOOKOUT ,6037775,1914,1,2013,2,50.1166666666667,-91.9,383,Yes +ON,SAULT STE MARIE,6057592,1945,1,2012,3,46.4833333333333,-84.5094444444444,192,Yes +ON,SIOUX LOOKOUT,6037775,1914,1,2013,2,50.1166666666667,-91.9,383,Yes ON,SMOKY FALLS,6077845,1934,1,1997,4,50.0666666666667,-82.1666666666667,183,No -ON,SUDBURY ,6068150,1921,1,2013,3,46.6255555555556,-80.7977777777778,348,Yes -ON,TERRACE BAY ,6048231,1910,1,2007,9,48.8166666666667,-87.1,290,Yes -ON,TIMMINS VICTOR POWER ,6078285,1955,1,2011,2,48.5697222222222,-81.3766666666667,295,No +ON,SUDBURY,6068150,1921,1,2013,3,46.6255555555556,-80.7977777777778,348,Yes +ON,TERRACE BAY,6048231,1910,1,2007,9,48.8166666666667,-87.1,290,Yes +ON,TIMMINS VICTOR POWER,6078285,1955,1,2011,2,48.5697222222222,-81.3766666666667,295,No ON,TOBERMORY CYPRUS LAKE,6128323,1915,1,1994,12,45.2333333333333,-81.5333333333333,190,Yes ON,TORONTO,6158350,1840,1,2017,4,43.6666666666667,-79.4,113,No -ON,TORONTO LESTER B. PEARSON ,6158733,1938,1,2013,6,43.6772222222222,-79.6305555555556,173,No +ON,TORONTO LESTER B. 
PEARSON,6158733,1938,1,2013,6,43.6772222222222,-79.6305555555556,173,No ON,TRANQUILLO RIDGE,6048864,1877,1,2007,12,48.2333333333333,-89.5166666666667,317,Yes ON,VINELAND,6139141,1919,1,2013,12,43.15,-79.4166666666667,110,Yes ON,WALLACEBURG,6139265,1906,1,1997,4,42.5833333333333,-82.4,177,No -ON,WAWA ,6059D09 ,1940,1,2014,9,47.9666666666667,-84.7833333333333,287,Yes +ON,WAWA ,6059D09,1940,1,2014,9,47.9666666666667,-84.7833333333333,287,Yes ON,WELLAND,6139445,1873,1,2014,8,42.9925266666667,-79.2611383333333,175,No -ON,WIARTON ,6119500,1948,1,2014,11,44.7458333333333,-81.1072222222222,222,No -ON,WINDSOR ,6139525,1866,1,2014,10,42.2755555555556,-82.9555555555556,190,Yes +ON,WIARTON,6119500,1948,1,2014,11,44.7458333333333,-81.1072222222222,222,No +ON,WINDSOR,6139525,1866,1,2014,10,42.2755555555556,-82.9555555555556,190,Yes ON,WOODSTOCK,6149625,1870,1,2017,12,43.1361233333333,-80.7705666666667,282,No QC,ARMAGH,7050240,1916,1,1994,5,46.75,-70.5333333333333,358,Yes QC,ARUNDEL,7030310,1914,1,2017,5,45.95,-74.6166666666667,191,Yes -QC,BAGOTVILLE ,7060400,1876,1,2017,12,48.3333333333333,-71,159,Yes +QC,BAGOTVILLE,7060400,1876,1,2017,12,48.3333333333333,-71,159,Yes QC,BARRAGE ANGLIERS,7080452,1911,1,1996,5,47.5519444444444,-79.2358333333333,267,No QC,BARRAGE TEMISCAMINGUE,7080468,1910,1,1995,10,46.7097222222222,-79.1011111111111,181,No QC,BELLETERRE,7080600,1952,1,2004,4,47.3833333333333,-78.7,322,No QC,BROME,7020840,1877,1,2014,7,45.1833333333333,-72.5666666666667,206,No QC,CAUSAPSCAL,7051200,1921,1,2017,8,48.3666666666667,-67.2333333333333,168,No -QC,CHIBOUGAMAU CHAPAIS ,7091404,1937,1,2016,11,49.7666666666667,-74.5333333333333,387,Yes +QC,CHIBOUGAMAU CHAPAIS,7091404,1937,1,2016,11,49.7666666666667,-74.5333333333333,387,Yes QC,CHELSEA,7031360,1928,1,2017,8,45.5166666666667,-75.7833333333333,113,No QC,DONNACONA,7012071,1919,1,2008,11,46.6833333333333,-71.7333333333333,46,Yes QC,DRUMMONDVILLE,7022160,1914,1,2017,8,45.8833333333333,-72.4833333333333,82,No QC,GASPE 
,7052605,1916,1,2013,3,48.7769444444445,-64.4780555555556,33,Yes QC,GRANDE VALLEE,7052865,1883,1,2004,4,49.2,-65.15,8,Yes -QC,ILES DE LA MADELEINE ,705C2G9 ,1934,1,2002,11,47.4166666666667,-61.7833333333333,11,Yes +QC,ILES DE LA MADELEINE,705C2G9,1934,1,2002,11,47.4166666666667,-61.7833333333333,11,Yes QC,INUKJUAK,7103282,1938,1,1994,2,58.4666666666667,-78.0833333333333,24,No QC,JOLIETTE VILLE,7013362,1914,1,2011,4,46.0166666666667,-73.4333333333333,56,Yes -QC,KUUJJUAQ ,7113534,1947,1,2014,3,58.1,-68.4166666666667,39,No -QC,KUUJJUARAPIK ,7103536,1934,1,2014,4,55.2833333333333,-77.75,10,No +QC,KUUJJUAQ,7113534,1947,1,2014,3,58.1,-68.4166666666667,39,No +QC,KUUJJUARAPIK,7103536,1934,1,2014,4,55.2833333333333,-77.75,10,No QC,LA MALBAIE,7043960,1914,1,2004,4,47.6666666666667,-70.15,23,No QC,LA POCATIERE,7054095,1913,1,1996,3,47.35,-70.0333333333333,31,No QC,LA SARRE,7094120,1952,1,2004,4,48.7833333333333,-79.2166666666667,244,No QC,LA TUQUE,7074240,1912,1,2004,4,47.4,-72.7833333333333,152,No QC,LABRIEVILLE,7043540,1955,1,1994,12,49.3,-69.55,152,No -QC,LAC BERRY,709CEE9 ,1914,1,2017,8,48.8,-78.2833333333333,305,Yes +QC,LAC BERRY,709CEE9,1914,1,2017,8,48.8,-78.2833333333333,305,Yes QC,LAUZON,7024254,1872,1,2017,8,46.8166666666667,-71.1,69,Yes QC,LEBEL SUR QUEVILLON,7094275,1967,1,2004,4,49.05,-76.9666666666667,305,No QC,LENNOXVILLE,7024280,1915,1,1995,10,45.3688888888889,-71.8236111111111,181,No QC,LES BUISSONS,7044288,1947,1,2017,8,49.1166666666667,-68.3833333333333,15,Yes QC,LES CEDRES,7014290,1913,1,2017,8,45.3,-74.05,47,No -QC,MATAGAMI ,7094639,1964,1,1991,6,49.7666666666667,-77.8166666666667,281,Yes +QC,MATAGAMI,7094639,1964,1,1991,6,49.7666666666667,-77.8166666666667,281,Yes QC,MONT LAURIER,7035160,1920,1,2014,6,46.5666666666667,-75.55,244,Yes -QC,MONT-JOLI ,7055120,1943,1,2013,3,48.6,-68.2166666666667,52,No -QC,MONTREAL/PIERRE ELLIOTT TRUDEAU ,7025250,1872,1,2016,9,45.4666666666667,-73.75,36,Yes -QC,NATASHQUAN 
,7045400,1915,1,2003,3,50.1833333333333,-61.8166666666667,11,No +QC,MONT-JOLI,7055120,1943,1,2013,3,48.6,-68.2166666666667,52,No +QC,MONTREAL/PIERRE ELLIOTT TRUDEAU,7025250,1872,1,2016,9,45.4666666666667,-73.75,36,Yes +QC,NATASHQUAN,7045400,1915,1,2003,3,50.1833333333333,-61.8166666666667,11,No QC,NICOLET,7025440,1914,1,2017,8,46.2,-72.6166666666667,30,No QC,NOMININGUE,7035520,1914,1,2013,11,46.4,-75.0833333333333,274,No QC,NORMANDIN,7065640,1936,1,1992,8,48.85,-72.5333333333333,137,No @@ -384,8 +384,8 @@ QC,PARENT S,7075799,1943,1,2004,4,47.9166666666667,-74.6166666666667,410,Yes QC,POINTE AU CHENE,7036063,1919,1,2009,6,45.65,-74.8,51,Yes QC,QUAQTAQ,7116270,1930,1,1988,5,61.05,-69.6333333333333,30,Yes QC,RIMOUSKI,7056480,1877,1,2017,8,48.45,-68.5166666666667,36,Yes -QC,ROBERVAL ,7066685,1914,1,2014,3,48.5166666666667,-72.2666666666667,179,Yes -QC,SCHEFFERVILLE ,7117825,1949,1,1993,9,54.8,-66.8166666666667,522,No +QC,ROBERVAL,7066685,1914,1,2014,3,48.5166666666667,-72.2666666666667,179,Yes +QC,SCHEFFERVILLE,7117825,1949,1,1993,9,54.8,-66.8166666666667,522,No QC,SENNETERRE,7097900,1940,1,1994,5,48.3333333333333,-77.2666666666667,310,Yes QC,SEPT-ILES,7047912,1945,1,2017,5,50.2166666666667,-66.25,53,Yes QC,SHAWINIGAN,7018000,1902,1,2004,4,46.5666666666667,-72.75,122,No @@ -402,43 +402,43 @@ QC,TADOUSSAC,7048320,1914,1,2004,4,48.15,-69.7,70,No QC,TETE A LA BALEINE,7048421,1912,1,1995,3,50.7,-59.3166666666667,9,Yes QC,THETFORD MINES,7028441,1922,1,2016,7,46.1,-71.35,381,Yes QC,TRINITE DES MONTS,7058520,1951,1,2004,4,48.1333333333333,-68.4833333333333,262,No -QC,VAL-D'OR ,7098600,1952,1,2017,12,48.0563888888889,-77.7866666666667,337,No +QC,VAL-D'OR,7098600,1952,1,2017,12,48.0563888888889,-77.7866666666667,337,No QC,VILLE MARIE,7088760,1914,1,2004,4,47.35,-79.4333333333333,213,No QC,WRIGHT,7038975,1914,1,2017,8,46.0666666666667,-76.05,142,Yes NS,ANNAPOLIS ROYAL,8200100,1915,1,2007,12,44.75,-65.5166666666667,8,No 
NB,AROOSTOOK,8100300,1920,1,2017,12,46.7122222222222,-67.7155555555556,91,Yes -NB,BATHURST ,8100503,1884,1,2013,10,47.6291805555556,-65.7483388888889,59,Yes +NB,BATHURST,8100503,1884,1,2013,10,47.6291805555556,-65.7483388888889,59,Yes NL,BAY D'ESPOIR,8400413,1968,1,2017,12,47.9833333333333,-55.8,23,No NS,BEAR RIVER,8200500,1915,1,2006,2,44.5666666666667,-65.6333333333333,8,Yes NL,BURGEO,8400798,1939,1,1995,7,47.6166666666667,-57.6166666666667,11,Yes NL,CARTWRIGHT,8501100,1936,1,2015,3,53.7083333333333,-57.035,14,No -NB,CHARLO ,8100880,1934,1,2002,10,47.9833333333333,-66.3333333333333,40,Yes +NB,CHARLO,8100880,1934,1,2002,10,47.9833333333333,-66.3333333333333,40,Yes PE,CHARLOTTETOWN ,8300300,1872,1,2012,9,46.2886166666667,-63.1286305555556,49,Yes -NL,CHURCHILL FALLS,850A131 ,1969,1,1998,4,53.5333333333333,-63.9666666666667,489,Yes +NL,CHURCHILL FALLS,850A131,1969,1,1998,4,53.5333333333333,-63.9666666666667,489,Yes NS,COLLEGEVILLE,8201000,1916,1,2014,6,45.4833333333333,-62.0166666666667,76,No NL,CORNER BROOK,8401300,1933,1,2017,12,48.95,-57.95,5,No NL,DANIELS HARBOUR,8401400,1947,1,1998,1,50.2363888888889,-57.5811111111111,19,No NL,DEER LAKE ,8401501,1933,1,2012,3,49.2166666666667,-57.4,22,Yes NS,DEMING,8201410,1884,1,2011,12,45.2163908333333,-61.1778027777778,16,Yes NB,DOAKTOWN,8101200,1944,1,2009,6,46.5525138888889,-66.1402916666667,38,No -NB,EDMUNDSTON,810AL00 ,1916,1,2009,7,47.3463888888889,-68.1877777777778,163,Yes +NB,EDMUNDSTON,810AL00,1916,1,2009,7,47.3463888888889,-68.1877777777778,163,Yes NL,EXPLOITS DAM,8401550,1956,1,2009,2,48.7666666666667,-56.6,154,No NB,FREDERICTON ,8101500,1874,1,2010,4,45.8721305555556,-66.5278916666667,21,Yes -NL,GANDER ,8401700,1937,1,2012,3,48.9463888888889,-54.5769444444444,151,No -NL,GOOSE ,8501900,1942,1,2017,12,53.3166666666667,-60.4166666666667,49,No +NL,GANDER,8401700,1937,1,2012,3,48.9463888888889,-54.5769444444444,151,No +NL,GOOSE,8501900,1942,1,2017,12,53.3166666666667,-60.4166666666667,49,No NL,GRAND 
FALLS,8402050,1937,1,2009,1,48.9333333333333,-55.6666666666667,60,No -NS,GREENWOOD ,8202000,1943,1,2017,12,44.9833333333333,-64.9166666666667,28,No -NS,HALIFAX STANFIELD ,8202250,1872,1,2012,9,44.8800166666667,-63.5000138888889,145,Yes +NS,GREENWOOD,8202000,1943,1,2017,12,44.9833333333333,-64.9166666666667,28,No +NS,HALIFAX STANFIELD,8202250,1872,1,2012,9,44.8800166666667,-63.5000138888889,145,Yes NL,ISLE UX MORTS,8402450,1909,1,2004,10,47.5833333333333,-58.9666666666667,5,Yes NB,KEDGWICK,8102300,1932,1,1994,9,47.65,-67.35,274,No NS,LIVERPOOL BIG FALLS,8203100,1940,1,2012,10,44.1333333333333,-64.9333333333333,50,No -NL,MAKKOVIK ,8502NHR ,1942,1,2014,11,55.0822222222222,-59.1886111111111,71,Yes -NL,MARY'S HARBOUR ,8502591,1881,1,1998,1,52.3036111111111,-55.8336111111111,12,Yes -NB,MIRAMICHI ,8101000,1873,1,2005,8,47.0094694444444,-65.4677888888889,33,Yes -NB,MONCTON ,8103200,1898,1,2012,6,46.1053055555556,-64.6838055555556,71,Yes +NL,MAKKOVIK,8502NHR,1942,1,2014,11,55.0822222222222,-59.1886111111111,71,Yes +NL,MARY'S HARBOUR,8502591,1881,1,1998,1,52.3036111111111,-55.8336111111111,12,Yes +NB,MIRAMICHI,8101000,1873,1,2005,8,47.0094694444444,-65.4677888888889,33,Yes +NB,MONCTON,8103200,1898,1,2012,6,46.1053055555556,-64.6838055555556,71,Yes PE,MONTICELLO,8300447,1960,1,2003,12,46.4666666666667,-62.4666666666667,32,No NS,MOUNT UNIACKE,8203600,1920,1,2003,7,44.9,-63.8333333333333,159,No -NL,NAIN ,8502800,1939,1,2013,3,56.55,-61.6833333333333,7,No +NL,NAIN,8502800,1939,1,2013,3,56.55,-61.6833333333333,7,No NS,NAPPAN,8203700,1913,1,2003,7,45.7666666666667,-64.25,20,No NB,NEPISIGUIT FALLS,8103500,1922,1,2006,2,47.4,-65.7833333333333,106,No NL,NORTH HARBOUR,8402874,1939,1,2007,11,47.1333333333333,-53.6666666666667,11,Yes @@ -446,22 +446,22 @@ NS,PARRSBORO,8204400,1897,1,2002,9,45.4,-64.3333333333333,24,No NL,PLUM POINT,8402958,1972,1,2016,6,51.0666666666667,-56.8833333333333,6,No NB,REXTON,8104400,1923,1,2009,12,46.6666666666667,-64.8666666666667,5,No NS,SABLE 
ISLAND,8204700,1891,1,2001,12,43.9322222222222,-60.0094444444444,5,No -NB,SAINT JOHN ,8104900,1871,1,2012,6,45.3180555555556,-65.8855694444444,109,Yes +NB,SAINT JOHN,8104900,1871,1,2012,6,45.3180555555556,-65.8855694444444,109,Yes NL,SPRINGDALE,8403700,1956,1,1993,6,49.5,-56.0833333333333,23,No NS,SPRINGFIELD,8205200,1920,1,2003,8,44.6666666666667,-64.85,167,No -NL,ST ANTHONY ,840C401 ,1883,1,2008,1,51.3833333333333,-56.1,33,Yes -NL,ST JOHN'S ,8403506,1874,1,2012,3,47.6222222222222,-52.7427777777778,141,Yes +NL,ST ANTHONY,840C401,1883,1,2008,1,51.3833333333333,-56.1,33,Yes +NL,ST JOHN'S,8403506,1874,1,2012,3,47.6222222222222,-52.7427777777778,141,Yes NS,ST MARGARET'S BAY,8204800,1922,1,2017,12,44.7,-63.9,17,No -NL,STEPHENVILLE ,8403800,1935,1,2014,10,48.5333333333333,-58.55,26,Yes -PE,SUMMERSIDE ,8300700,1936,1,2002,6,46.4388888888889,-63.8316666666667,20,Yes +NL,STEPHENVILLE,8403800,1935,1,2014,10,48.5333333333333,-58.55,26,Yes +PE,SUMMERSIDE,8300700,1936,1,2002,6,46.4388888888889,-63.8316666666667,20,Yes NB,SUSSEX,8105200,1898,1,2009,5,45.7166666666667,-65.5333333333333,21,No -NS,SYDNEY ,8205700,1870,1,2014,8,46.1666666666667,-60.0481388888889,62,Yes +NS,SYDNEY,8205700,1870,1,2014,8,46.1666666666667,-60.0481388888889,62,Yes NS,TRURO,8205990,1910,1,2002,10,45.3666666666667,-63.2666666666667,40,Yes NS,UPPER STEWIACKE,8206200,1916,1,2008,4,45.2166666666667,-63,23,No -NL,WABUSH LAKE ,8504175,1961,1,2013,2,52.9272222222222,-66.8741666666667,551,No +NL,WABUSH LAKE,8504175,1961,1,2013,2,52.9272222222222,-66.8741666666667,551,No NL,WESTBROOK ST LAWRENCE,8404201,1957,1,1995,7,46.95,-55.3833333333333,31,No NS,WESTPORT,8206260,1937,1,1993,6,44.25,-66.3666666666667,18,Yes NS,WHITE ROCK,8206316,1913,1,2017,6,45.05,-64.3833333333333,38,Yes NB,WOODSTOCK,8105600,1914,1,2017,12,46.1702777777778,-67.5536111111111,153,No NS,WRECK COVE BROOK,8206450,1951,1,2012,12,46.5333333333333,-60.45,76,Yes -NS,YARMOUTH ,8206500,1880,1,2012,4,43.8308333333333,-66.0886111111111,43,Yes 
+NS,YARMOUTH,8206500,1880,1,2012,4,43.8308333333333,-66.0886111111111,43,Yes diff --git a/miranda/preprocess/configs/eccc-homogenized_attrs.json b/miranda/preprocess/configs/eccc-homogenized_attrs.json index b1861502..e35bba41 100644 --- a/miranda/preprocess/configs/eccc-homogenized_attrs.json +++ b/miranda/preprocess/configs/eccc-homogenized_attrs.json @@ -13,11 +13,13 @@ "Third": "ECCC Adjusted and Homogenized Canadian Climate Data (AHCCD) version 3" } }, + "_variable": true, "acknowledgement": "This data is provided by Environment and Climate Change Canada (ECCC).", "author": "Environment and Climate Change Canada (ECCC)", "contact": "info.cccs-ccsc@canada.ca", "dataset_id": "d6813de6-b20a-46cc-8990-01862ae15c5f", "documentation": "https://www.canada.ca/en/environment-climate-change/services/climate-change/canadian-centre-climate-services/display-download/technical-documentation-adjusted-climate-data.html", + "domain": "AMNO", "institution": "GovCan", "license": "https://climate.weather.gc.ca/prods_servs/attachment1_e.html", "license_preamble": "The data is owned by the Government of Canada (Environment and Climate Change Canada), and fall under the licence agreement for use of Environment and Climate Change Canada data.", @@ -27,7 +29,8 @@ "realm": "atmos", "source": "AHCCD", "table_date": "2023-08-03", - "table_id": "ECCC" + "table_id": "ECCC", + "type": "station-obs" }, "variables": { "dm": { From f454cb61ec9ec62f8f14b504e87d86d931a09db4 Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Mon, 7 Aug 2023 14:03:27 -0400 Subject: [PATCH 10/33] naming and more dynamic handling of variables --- miranda/io/_output.py | 15 +- miranda/io/utils.py | 12 +- miranda/preprocess/__init__.py | 5 + miranda/preprocess/_eccc_homogenized.py | 154 ++++++++++++------ .../configs/ahccd_gen3_temperature.csv | 14 +- .../configs/eccc-homogenized_attrs.json | 2 + templates/ahccd_preprocess.py | 10 ++ 7 files changed, 146 insertions(+), 66 
deletions(-) create mode 100644 templates/ahccd_preprocess.py diff --git a/miranda/io/_output.py b/miranda/io/_output.py index 015a5c88..8d0667dc 100644 --- a/miranda/io/_output.py +++ b/miranda/io/_output.py @@ -34,6 +34,7 @@ def write_dataset( ds: xr.DataArray | xr.Dataset, output_path: str | os.PathLike, output_format: str, + output_name: str | None = None, chunks: dict | None = None, overwrite: bool = False, compute: bool = True, @@ -48,6 +49,8 @@ def write_dataset( Output folder path. output_format: {"netcdf", "zarr"} Output data container type. + output_name: str, optional + Output file name. chunks : dict, optional Chunking layout to be written to new files. If None, chunking will be left to the relevant backend engine. overwrite : bool @@ -64,11 +67,15 @@ def write_dataset( if isinstance(output_path, str): output_path = Path(output_path) - outfile = name_output_file(ds, output_format) - outfile_path = output_path.joinpath(outfile) + if not output_name: + output_name = name_output_file(ds, output_format) + else: + output_name = str(output_name) + + outfile_path = output_path.joinpath(output_name) if overwrite and outfile_path.exists(): - logging.warning(f"Removing existing {output_format} files for {outfile}.") + logging.warning(f"Removing existing {output_format} files for {output_name}.") if outfile_path.is_dir(): shutil.rmtree(outfile_path) if outfile_path.is_file(): @@ -78,7 +85,7 @@ def write_dataset( freq = ds.attrs["frequency"] # TOD0: check that this is really there chunks = fetch_chunk_config(priority="time", freq=freq, dims=ds.dims) - logging.info(f"Writing {outfile}.") + logging.info(f"Writing {output_name}.") write_object = delayed_write( ds, outfile_path, diff --git a/miranda/io/utils.py b/miranda/io/utils.py index 33da8643..15fc771c 100644 --- a/miranda/io/utils.py +++ b/miranda/io/utils.py @@ -33,7 +33,9 @@ def name_output_file( - ds_or_dict: xr.Dataset | dict[str, str], output_format: str + ds_or_dict: xr.Dataset | dict[str, str], + 
output_format: str, + data_vars: str | None = None, ) -> str: """Name an output file based on facets within a Dataset or a dictionary. @@ -43,6 +45,8 @@ def name_output_file( A miranda-converted Dataset or a dictionary containing the appropriate facets. output_format : {"netcdf", "zarr"} Output filetype to be used for generating filename suffix. + data_vars : str, optional + If using a Dataset, the name of the data variable to be used for naming the file. Returns ------- @@ -62,7 +66,9 @@ def name_output_file( facets["suffix"] = suffix if isinstance(ds_or_dict, xr.Dataset): - if len(ds_or_dict.data_vars) == 1: + if data_vars is not None: + facets["variable"] = data_vars + elif len(ds_or_dict.data_vars) == 1: facets["variable"] = list(ds_or_dict.data_vars.keys())[0] elif ( len(ds_or_dict.data_vars) == 2 @@ -73,7 +79,7 @@ def name_output_file( ][0] else: raise NotImplementedError( - f"Too many `data_vars` in Dataset: {' ,'.join(ds_or_dict.data_vars.keys())}." + f"Too many `data_vars` in Dataset: {', '.join(ds_or_dict.data_vars.keys())}." 
) for f in [ "bias_adjust_project", diff --git a/miranda/preprocess/__init__.py b/miranda/preprocess/__init__.py index 0ae1f1d6..4601a7cc 100644 --- a/miranda/preprocess/__init__.py +++ b/miranda/preprocess/__init__.py @@ -1 +1,6 @@ """Preprocessing tools for Miranda.""" +from __future__ import annotations + +from ._eccc_homogenized import * +from ._eccc_obs import * +from ._eccc_summaries import * diff --git a/miranda/preprocess/_eccc_homogenized.py b/miranda/preprocess/_eccc_homogenized.py index f400b5b1..33f7c9ca 100644 --- a/miranda/preprocess/_eccc_homogenized.py +++ b/miranda/preprocess/_eccc_homogenized.py @@ -9,6 +9,8 @@ import pandas as pd import xarray as xr +from miranda.io import write_dataset +from miranda.io.utils import name_output_file from miranda.preprocess._data_definitions import load_json_data_mappings from miranda.preprocess._treatments import basic_metadata_conversion from miranda.scripting import LOGGING_CONFIG @@ -16,7 +18,29 @@ logging.config.dictConfig(LOGGING_CONFIG) logger = logging.Logger("miranda") -__all__ = ["convert_ahccd", "convert_ahccd_fwf_file"] +__all__ = ["convert_ahccd", "convert_ahccd_fwf_file", "merge_ahccd"] + + +def _ahccd_variable_code(code: str): + config = load_json_data_mappings("eccc-homogenized") + variable_codes = {} + for variable_code in config["variables"]: + variable_name = config["variables"][variable_code].get("_variable_name") + if variable_name: + variable_codes[variable_name] = variable_code + else: + raise AttributeError( + f"Variable `{variable_code}` is not properly configured. Verify JSON." 
+ ) + + if code in variable_codes.values(): + variable = code + else: + variable = variable_codes.get(code) + if not variable: + raise NotImplementedError(f"Variable `{code}` not supported.") + + return variable def _ahccd_variable_metadata( @@ -27,7 +51,7 @@ def _ahccd_variable_metadata( Parameters ---------- - variable_code: {"dm", "dn", "dr", "ds", "dt", "dx"} + variable_code gen: {1, 2, 3} Returns @@ -40,11 +64,9 @@ def _ahccd_variable_metadata( config = load_json_data_mappings("eccc-homogenized") metadata = basic_metadata_conversion("eccc-homogenized", config) + code = _ahccd_variable_code(variable_code) - variable_meta = metadata["variables"].get(variable_code) - if not variable_meta: - raise NotImplementedError(f"Variable `{variable_code}` not supported.") - + variable_meta = metadata["variables"].get(code) variable_name = variable_meta.get("_variable_name") if variable_name: variable_meta = {variable_name: variable_meta} @@ -53,24 +75,25 @@ def _ahccd_variable_metadata( variable_meta = {variable_code: variable_meta} header = metadata["Header"] + to_delete = [] # Conditional handling of global attributes based on generation for field in [f for f in header if f.startswith("_")]: if isinstance(header[field], bool): - if header[field] and field[1:] == "variable": + if header[field] and field == "_variable": header[field[1:]] = variable_name - elif isinstance(header[field], dict): attr_treatment = header[field]["generation"] if field in ["_citation" "_product"]: for attribute, value in attr_treatment.items(): if attribute == generation: header[field[1:]] = value - else: raise AttributeError( f"Attribute treatment configuration for field `{field}` is not properly configured. Verify JSON." 
) + to_delete.append(field) + for field in to_delete: del header[field] return variable_meta, header @@ -167,9 +190,7 @@ def convert_ahccd_fwf_file( ------- xarray.Dataset """ - code = dict(tasmax="dx", tasmin="dn", tas="dm", pr="dt", prsn="ds", prlp="dr").get( - variable - ) + code = _ahccd_variable_code(variable) variable_meta, global_attrs = _ahccd_variable_metadata(code, generation) col_names, cols_specs, header = _ahccd_column_definitions(code) @@ -282,8 +303,10 @@ def convert_ahccd( data_source: str | Path, output_dir: str | Path, variable: str, + *, generation: int, merge: bool = False, + overwrite: bool = False, ) -> None: """Convert Adjusted and Homogenized Canadian Climate Dataset files. @@ -294,6 +317,7 @@ def convert_ahccd( variable: str generation: int merge: bool + overwrite: bool Returns ------- @@ -302,10 +326,7 @@ def convert_ahccd( output_dir = Path(output_dir).resolve().joinpath(variable) output_dir.mkdir(parents=True, exist_ok=True) - code = dict(tasmax="dx", tasmin="dn", tas="dm", pr="dt", prsn="ds", prlp="dr").get( - variable - ) - + code = _ahccd_variable_code(variable) var_meta, global_attrs = _ahccd_variable_metadata(code, generation) ( col_names, @@ -343,14 +364,11 @@ def convert_ahccd( # Convert station .txt files to netcdf for ff in Path(data_source).glob(f"{code}*.txt"): outfile = output_dir.joinpath(ff.name.replace(".txt", ".nc")) - if not outfile.exists(): + if not outfile.exists() or overwrite: logger.info(ff.name) station_id = ff.name[2:].split(".txt")[0] - try: - metadata_st = metadata[metadata["stnid"] == int(station_id)] - except ValueError: - metadata_st = metadata[metadata["stnid"] == station_id] + metadata_st = metadata[metadata["stnid"] == station_id] if len(metadata_st) == 1: ds_out = convert_ahccd_fwf_file( @@ -363,38 +381,70 @@ def convert_ahccd( logger.warning( f"metadata info for station {ff.name} not found : skipping" ) - if not merge: - return + else: + logger.info(f"{outfile.name} already exists: Skipping...") + if 
merge: + merge_ahccd(data_source, output_dir, variable) + return + + +def merge_ahccd( + data_source: str | Path, + output_dir: str | Path | None = None, + variable: str | None = None, + overwrite: bool = False, +) -> None: + """Merge Adjusted and Homogenized Canadian Climate Dataset files.""" + if variable: + code = _ahccd_variable_code(variable) + glob_pattern = f"{code}*.nc" + output_dir = Path(output_dir).resolve().joinpath(variable) + else: + glob_pattern = "*.nc" + output_dir = Path(output_dir).resolve() + output_dir.mkdir(parents=True, exist_ok=True) + + # Merge individual stations to single .nc file + ds_ahccd = xr.open_mfdataset( + list(data_source.glob(glob_pattern)), concat_dim="station", combine="nested" + ) - # merge individual stations to single .nc file - ncfiles = list(output_dir.glob(f"{code}*.nc")) - outfile = output_dir.parent.joinpath( - "merged_stations", f"ahccd_gen{generation}_{variable}.nc" + for coord in ds_ahccd.coords: + # xarray object datatypes mix string and int (e.g. stnid) convert to string for merged nc files + # Do not apply to datetime object + if coord != "time" and ds_ahccd[coord].dtype == "O": + ds_ahccd[coord] = ds_ahccd[coord].astype(str) + + variables_found = set() + for v in ds_ahccd.data_vars: + # xarray object datatypes mix string and int (e.g. stnid) convert to string for merged nc files + # Do not apply to flag timeseries + if ds_ahccd[v].dtype == "O" and "flag" not in v: + ds_ahccd[v] = ds_ahccd[v].astype(str) + try: + variables_found.add(_ahccd_variable_code(str(v))) + except NotImplementedError: + pass + + # Name output file + ds_ahccd.attrs["variable"] = ", ".join(variables_found) + variables = "-".join(variables_found) + output_name = name_output_file(ds_ahccd, "netcdf", variables) + logger.info( + f"Many variables found. Merging station files in {data_source} as `{output_name}`." 
) - if not outfile.exists(): - logger.info("merging stations :", variable) - ds_ahccd = xr.open_mfdataset( - ncfiles, concat_dim="station", combine="nested" - ).load() - - for coord in ds_ahccd.coords: - # xarray object datatypes mix string and int (e.g. stnid) convert to string for merged nc files - # Do not apply to datetime object - if coord != "time" and ds_ahccd[coord].dtype == "O": - ds_ahccd[coord] = ds_ahccd[coord].astype(str) - - for v in ds_ahccd.data_vars: - # xarray object datatypes mix string and int (e.g. stnid) convert to string for merged nc files - # Do not apply to flag timeseries - if ds_ahccd[v].dtype == "O" and "flag" not in v: - logger.info(v) - ds_ahccd[v] = ds_ahccd[v].astype(str) - - outfile.parent.mkdir(parents=True, exist_ok=True) - ds_ahccd.to_netcdf(outfile, engine="h5netcdf", mode="w") + try: + logger.info(f"Writing merged file to: {output_dir}.") + write_dataset( + ds_ahccd, + output_dir, + output_format="netcdf", + output_name=output_name, + chunks={"time": 365}, + overwrite=overwrite, + compute=True, + ) del ds_ahccd - for nc in outfile.parent.glob("*.nc"): - logger.info(nc) - ds = xr.open_dataset(nc) - logger.info(ds) + except FileExistsError: + logger.info("Merged file already exists. 
Use overwrite=`True` to overwrite.") diff --git a/miranda/preprocess/configs/ahccd_gen3_temperature.csv b/miranda/preprocess/configs/ahccd_gen3_temperature.csv index 8c56a6b5..4a65dc15 100644 --- a/miranda/preprocess/configs/ahccd_gen3_temperature.csv +++ b/miranda/preprocess/configs/ahccd_gen3_temperature.csv @@ -24,7 +24,7 @@ No,StnId,Station name,Prov,FromYear,FromMonth,ToYear,ToMonth,%Miss,Lat(deg),Long 21,1161663,CLINTON_AUT,BC,1993,1,2019,12,4.6,51.1,-121.5,105,y,y 22,1021830,COMOX,BC,1935,11,2019,12,1.2,49.7,-124.9,2,y,n 23,1021960,CORTES_ISLAND,BC,1947,3,2019,2,9.9,50,-124.9,1,y,n -24,1012010,COWICHAN_BAY_CHERRY_,BC,1913,10,1984,3,7.7,48.7,-123.5,0,n,n +24,1012010,COWICHAN_BAY_CHERRY,BC,1913,10,1984,3,7.7,48.7,-123.5,0,n,n 25,1152106,CRANBROOK,BC,1901,1,2019,12,6.6,49.6,-115.7,92,y,y 26,114B1F0,CRESTON,BC,1912,6,2019,12,0.5,49,-116.5,64,y,y 27,1022250,CUMBERLAND,BC,1922,5,1977,6,4.7,49.6,-125,15,n,n @@ -102,7 +102,7 @@ No,StnId,Station name,Prov,FromYear,FromMonth,ToYear,ToMonth,%Miss,Lat(deg),Long 99,1176755,REVELSTOKE,BC,1898,5,2019,12,7.3,50.9,-118.1,44,y,y 100,1016940,SAANICHTON_CDA,BC,1914,3,2019,7,0.6,48.6,-123.4,6,n,n 101,1167337,SALMON_ARM,BC,1911,7,2019,12,1.1,50.5,-119.3,41,y,n -102,1016995,SALTSPRING_,BC,1909,11,2019,12,1,48.8,-123.5,4,y,n +102,1016995,SALTSPRING,BC,1909,11,2019,12,1,48.8,-123.5,4,y,n 103,1057051,SANDSPIT,BC,1945,9,2019,12,4.2,53.2,-131.8,0,y,y 104,1017099,SATURNA_CAPMON,BC,1989,6,2019,12,3,48.7,-123.1,17,y,y 105,1017230,SHAWNIGAN_LAKE,BC,1913,4,2019,12,0.6,48.6,-123.6,15,n,n @@ -620,7 +620,7 @@ No,StnId,Station name,Prov,FromYear,FromMonth,ToYear,ToMonth,%Miss,Lat(deg),Long 617,7055122,MONT_JOLI,QUE,1875,10,2019,12,0.6,48.6,-68.2,5,y,y 618,7035160,MONT_LAURIER,QUE,1920,7,2014,6,7.2,46.5,-75.5,24,y,n 619,7024745,MONTREAL_TAVISH,QUE,1871,7,2019,12,2.8,45.5,-73.5,7,y,n -620,702S006,MONTREAL__TRUDEAU_IN,QUE,1953,1,2019,12,0.5,45.4,-73.7,3,y,y +620,702S006,MONTREAL_TRUDEAU_INTERNATIONAL,QUE,1953,1,2019,12,0.5,45.4,-73.7,3,y,y 
621,7045401,NATASHQUAN,QUE,1914,10,2019,12,4.1,50.1,-61.8,1,y,y 622,7055422,NEW_CARLISLE,QUE,1963,1,2019,12,17.8,48,-65.3,4,y,n 623,7025442,NICOLET,QUE,1913,11,2019,12,2.9,46.2,-72.6,0,y,n @@ -657,8 +657,8 @@ No,StnId,Station name,Prov,FromYear,FromMonth,ToYear,ToMonth,%Miss,Lat(deg),Long 654,7016800,ST_ALBAN,QUE,1949,9,2019,10,2.3,46.7,-72,7,n,n 655,7066820,ST_AMBROISE,QUE,1954,9,2019,10,4.5,48.5,-71.3,12,n,n 656,702FQLF,ST_ANICET,QUE,1960,11,2019,12,2,45.1,-74.2,4,y,y -657,7056930,ST_CAMILLE_,QUE,1963,7,2019,10,2,46.4,-70.2,39,n,n -658,7016960,ST_CHARLES_DE_MANDE_,QUE,1976,6,2019,10,21.4,46.3,-73.3,16,n,n +657,7056930,ST_CAMILLE,QUE,1963,7,2019,10,2,46.4,-70.2,39,n,n +658,7016960,ST_CHARLES_DE_MANDE,QUE,1976,6,2019,10,21.4,46.3,-73.3,16,n,n 659,7017080,ST_COME,QUE,1950,12,2018,11,4.6,46.2,-73.7,24,n,n 660,7027083,ST_COME_DE_LINIERE,QUE,1965,9,2019,10,3.7,46,-70.5,24,n,n 661,7027200,ST_EPHREM,QUE,1929,2,2019,10,18.1,46,-70.9,31,n,n @@ -666,7 +666,7 @@ No,StnId,Station name,Prov,FromYear,FromMonth,ToYear,ToMonth,%Miss,Lat(deg),Long 663,7027259,ST_FLAVIEN,QUE,1963,1,2016,8,2.1,46.4,-71.5,13,n,n 664,7027302,ST_GUILLAUME,QUE,1963,1,2015,10,7.6,45.8,-72.7,4,n,n 665,7037310,ST_HIPPOLYTE,QUE,1961,2,2019,10,4.9,45.9,-74,36,n,n -666,7027329,ST_HUBERT_MONT_,QUE,1953,1,2019,12,0.8,45.5,-73.4,2,y,n +666,7027329,ST_HUBERT_MONT,QUE,1953,1,2019,12,0.8,45.5,-73.4,2,y,n 667,7027361,ST_HYACINTHE,QUE,1935,1,2019,10,8.4,45.5,-72.9,3,y,n 668,7037400,ST_JEROME,QUE,1932,5,2019,10,4.3,45.8,-74,17,n,n 669,7027516,ST_LUDGER,QUE,1964,10,2019,10,3.1,45.7,-70.6,33,n,n @@ -778,6 +778,6 @@ No,StnId,Station name,Prov,FromYear,FromMonth,ToYear,ToMonth,%Miss,Lat(deg),Long 775,8403603,ST_JOHN_WEST,NFLD,1950,11,2019,12,6.6,47.5,-52.7,11,y,y 776,8403619,ST_LAWRENCE,NFLD,1989,11,2019,12,14.6,46.9,-55.3,4,y,y 777,8403820,STEPHENVILLE,NFLD,1895,6,2019,12,6.6,48.5,-58.5,5,y,y -778,8403851,TERRA_NOVA_NAT_PARK_,NFLD,1962,3,2019,12,7.1,48.5,-53.9,10,y,y 
+778,8403851,TERRA_NOVA_NAT_PARK,NFLD,1962,3,2019,12,7.1,48.5,-53.9,10,y,y 779,8504177,WABUSH_LAKE,NFLD,1960,11,2019,12,0.8,52.9,-66.8,55,y,y 780,8404343,WRECKHOUSE,NFLD,1981,6,2019,12,1.5,47.7,-59.3,3,y,y diff --git a/miranda/preprocess/configs/eccc-homogenized_attrs.json b/miranda/preprocess/configs/eccc-homogenized_attrs.json index e35bba41..fc3efb6e 100644 --- a/miranda/preprocess/configs/eccc-homogenized_attrs.json +++ b/miranda/preprocess/configs/eccc-homogenized_attrs.json @@ -20,12 +20,14 @@ "dataset_id": "d6813de6-b20a-46cc-8990-01862ae15c5f", "documentation": "https://www.canada.ca/en/environment-climate-change/services/climate-change/canadian-centre-climate-services/display-download/technical-documentation-adjusted-climate-data.html", "domain": "AMNO", + "frequency": "day", "institution": "GovCan", "license": "https://climate.weather.gc.ca/prods_servs/attachment1_e.html", "license_preamble": "The data is owned by the Government of Canada (Environment and Climate Change Canada), and fall under the licence agreement for use of Environment and Climate Change Canada data.", "license_type": "permissive", "organization": "ECCC", "processing_level": "adjusted", + "project": "AHCCD", "realm": "atmos", "source": "AHCCD", "table_date": "2023-08-03", diff --git a/templates/ahccd_preprocess.py b/templates/ahccd_preprocess.py new file mode 100644 index 00000000..2f413a5f --- /dev/null +++ b/templates/ahccd_preprocess.py @@ -0,0 +1,10 @@ +from pathlib import Path + +from miranda.preprocess import convert_ahccd, merge_ahccd + +in_files = Path("~/Desktop/ec_data/ahccd").expanduser() +output = Path().cwd().parent / "test" +variable = "tas" + +convert_ahccd(in_files, output, variable, generation=3) +merge_ahccd(output.joinpath("tas"), output, variable, overwrite=True) From 9339f30ace1c23c7d9f2ce4bf853463c62e11e6b Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Mon, 7 Aug 2023 15:36:35 -0400 Subject: [PATCH 11/33] working 
version --- miranda/io/_output.py | 11 +++++-- miranda/io/data/ouranos_chunk_config.json | 14 ++++++++ miranda/preprocess/_eccc_homogenized.py | 39 +++++++++++------------ templates/ahccd_preprocess.py | 2 +- 4 files changed, 42 insertions(+), 24 deletions(-) diff --git a/miranda/io/_output.py b/miranda/io/_output.py index 8d0667dc..4027ffbd 100644 --- a/miranda/io/_output.py +++ b/miranda/io/_output.py @@ -82,8 +82,15 @@ def write_dataset( outfile_path.unlink() if chunks is None and "frequency" in ds.attrs: - freq = ds.attrs["frequency"] # TOD0: check that this is really there - chunks = fetch_chunk_config(priority="time", freq=freq, dims=ds.dims) + freq = ds.attrs.get("frequency") + if not freq: + raise ValueError( + "If 'chunks' are not provided, the 'frequency' attribute must be set." + ) + if "lat" in ds.dims and "lon" in ds.dims: + chunks = fetch_chunk_config(priority="time", freq=freq, dims=ds.dims) + elif "lat" not in ds.dims and "lon" not in ds.dims: + chunks = fetch_chunk_config(priority="stations", freq=freq, dims=ds.dims) logging.info(f"Writing {output_name}.") write_object = delayed_write( diff --git a/miranda/io/data/ouranos_chunk_config.json b/miranda/io/data/ouranos_chunk_config.json index 2ac759b7..0f18928d 100644 --- a/miranda/io/data/ouranos_chunk_config.json +++ b/miranda/io/data/ouranos_chunk_config.json @@ -37,6 +37,20 @@ } } }, + "stations": { + "1hr": { + "default": { + "station": 50, + "time": "5 years" + } + }, + "day": { + "default": { + "station": 200, + "time": "10 years" + } + } + }, "time": { "1hr": { "default": { diff --git a/miranda/preprocess/_eccc_homogenized.py b/miranda/preprocess/_eccc_homogenized.py index 33f7c9ca..ae08af01 100644 --- a/miranda/preprocess/_eccc_homogenized.py +++ b/miranda/preprocess/_eccc_homogenized.py @@ -3,6 +3,7 @@ import calendar import logging.config +import warnings from pathlib import Path import numpy as np @@ -29,9 +30,11 @@ def _ahccd_variable_code(code: str): if variable_name: 
variable_codes[variable_name] = variable_code else: - raise AttributeError( - f"Variable `{variable_code}` is not properly configured. Verify JSON." + warnings.warn( + f"Variable `{variable_code}` does not have accompanying `variable_name`. " + f"Verify JSON. Continuing with `{variable_code}` as `variable_name`." ) + variable_codes[variable_code] = variable_code if code in variable_codes.values(): variable = code @@ -69,6 +72,7 @@ def _ahccd_variable_metadata( variable_meta = metadata["variables"].get(code) variable_name = variable_meta.get("_variable_name") if variable_name: + variable_meta["original_variable_name"] = variable_code variable_meta = {variable_name: variable_meta} del variable_meta[variable_name]["_variable_name"] else: @@ -99,10 +103,6 @@ def _ahccd_variable_metadata( return variable_meta, header -def _ahccd_station_metadata(code): - pass - - def _ahccd_column_definitions( variable_code: str, ) -> tuple[dict, list[tuple[int, int]], int]: @@ -270,13 +270,8 @@ def convert_ahccd_fwf_file( ds_out[variable].attrs = variable_meta[variable] metadata = metadata.to_xarray().rename({"index": "station"}).drop_vars("station") - metadata = metadata.assign_coords( - { - "stnid": metadata["stnid"].astype(str), - "station_name": metadata["station_name"], - } - ) - ds_out = ds_out.assign_coords(station=metadata.stnid) + metadata = metadata.assign_coords(dict(station_name=metadata["station_name"])) + ds_out = ds_out.assign_coords(station=metadata.stnid.astype(str)) metadata = metadata.drop_vars(["stnid", "station_name"]) ds_out[f"{variable}_flag"].attrs["long_name"] = variable_meta[variable]["long_name"] @@ -367,7 +362,7 @@ def convert_ahccd( if not outfile.exists() or overwrite: logger.info(ff.name) - station_id = ff.name[2:].split(".txt")[0] + station_id = ff.stem[2:] metadata_st = metadata[metadata["stnid"] == station_id] if len(metadata_st) == 1: @@ -410,14 +405,14 @@ def merge_ahccd( ) for coord in ds_ahccd.coords: - # xarray object datatypes mix string and int 
(e.g. stnid) convert to string for merged nc files + # xarray object datatypes mix string and int (e.g. station) convert to string for merged nc files # Do not apply to datetime object if coord != "time" and ds_ahccd[coord].dtype == "O": ds_ahccd[coord] = ds_ahccd[coord].astype(str) variables_found = set() for v in ds_ahccd.data_vars: - # xarray object datatypes mix string and int (e.g. stnid) convert to string for merged nc files + # xarray object datatypes mix string and int (e.g. station) convert to string for merged nc files # Do not apply to flag timeseries if ds_ahccd[v].dtype == "O" and "flag" not in v: ds_ahccd[v] = ds_ahccd[v].astype(str) @@ -428,11 +423,14 @@ def merge_ahccd( # Name output file ds_ahccd.attrs["variable"] = ", ".join(variables_found) - variables = "-".join(variables_found) + if len(variables_found) > 1: + variables = "-".join(variables_found) + logger.info( + f"Many variables found. Merging station and variables files in {data_source}." + ) + else: + variables = variables_found.pop() output_name = name_output_file(ds_ahccd, "netcdf", variables) - logger.info( - f"Many variables found. Merging station files in {data_source} as `{output_name}`." 
- ) try: logger.info(f"Writing merged file to: {output_dir}.") @@ -441,7 +439,6 @@ def merge_ahccd( output_dir, output_format="netcdf", output_name=output_name, - chunks={"time": 365}, overwrite=overwrite, compute=True, ) diff --git a/templates/ahccd_preprocess.py b/templates/ahccd_preprocess.py index 2f413a5f..27a88072 100644 --- a/templates/ahccd_preprocess.py +++ b/templates/ahccd_preprocess.py @@ -7,4 +7,4 @@ variable = "tas" convert_ahccd(in_files, output, variable, generation=3) -merge_ahccd(output.joinpath("tas"), output, variable, overwrite=True) +merge_ahccd(output.joinpath("tas"), output.joinpath("merged"), variable, overwrite=True) From e096824729c214bd7b4cd78475e7ae54afe81c5b Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Mon, 7 Aug 2023 16:54:14 -0400 Subject: [PATCH 12/33] begin work on obs-summaries --- miranda/preprocess/_data_definitions.py | 40 ++++- miranda/preprocess/_eccc_homogenized.py | 40 ++--- miranda/preprocess/_eccc_obs.py | 2 +- miranda/preprocess/_eccc_summaries.py | 2 +- .../configs/eccc-homogenized_attrs.json | 4 +- ...attrs.json => eccc-obs-summary_attrs.json} | 147 ++++++++---------- .../preprocess/configs/eccc-obs_attrs.json | 4 +- ...preprocess.py => eccc-ahccd_preprocess.py} | 0 templates/eccc-obs_preprocess.py | 0 templates/eccc_ahccd_conversion.py | 28 ---- 10 files changed, 121 insertions(+), 146 deletions(-) rename miranda/preprocess/configs/{eccc-obs-summary_cf_attrs.json => eccc-obs-summary_attrs.json} (54%) rename templates/{ahccd_preprocess.py => eccc-ahccd_preprocess.py} (100%) create mode 100644 templates/eccc-obs_preprocess.py delete mode 100644 templates/eccc_ahccd_conversion.py diff --git a/miranda/preprocess/_data_definitions.py b/miranda/preprocess/_data_definitions.py index 815b1048..f73251e9 100644 --- a/miranda/preprocess/_data_definitions.py +++ b/miranda/preprocess/_data_definitions.py @@ -1,13 +1,14 @@ from __future__ import annotations import json +import 
warnings from pathlib import Path from typing import Any _config_folder = Path(__file__).resolve().parent / "configs" -__all__ = ["load_json_data_mappings"] +__all__ = ["load_json_data_mappings", "find_project_variable_codes"] def load_json_data_mappings(project: str) -> dict[str, Any]: @@ -35,3 +36,40 @@ def load_json_data_mappings(project: str) -> dict[str, Any]: raise NotImplementedError(f"Project not supported: {project}") return metadata_definition + + +def find_project_variable_codes(code: str, table: str) -> str: + """Find the variable code for a given variable name and project. + + Parameters + ---------- + code : str + Variable name. + table : str + Project name. + + Returns + ------- + str + """ + config = load_json_data_mappings(table) + variable_codes = {} + for variable_code in config["variables"]: + variable_name = config["variables"][variable_code].get("_variable_name") + if variable_name: + variable_codes[variable_name] = variable_code + else: + warnings.warn( + f"Variable `{variable_code}` does not have accompanying `variable_name`. " + f"Verify JSON. Continuing with `{variable_code}` as `variable_name`." 
+ ) + variable_codes[variable_code] = variable_code + + if code in variable_codes.values(): + variable = code + else: + variable = variable_codes.get(code) + if not variable: + raise NotImplementedError(f"Variable `{code}` not supported.") + + return variable diff --git a/miranda/preprocess/_eccc_homogenized.py b/miranda/preprocess/_eccc_homogenized.py index ae08af01..7e4a7559 100644 --- a/miranda/preprocess/_eccc_homogenized.py +++ b/miranda/preprocess/_eccc_homogenized.py @@ -3,7 +3,6 @@ import calendar import logging.config -import warnings from pathlib import Path import numpy as np @@ -12,7 +11,10 @@ from miranda.io import write_dataset from miranda.io.utils import name_output_file -from miranda.preprocess._data_definitions import load_json_data_mappings +from miranda.preprocess._data_definitions import ( + find_project_variable_codes, + load_json_data_mappings, +) from miranda.preprocess._treatments import basic_metadata_conversion from miranda.scripting import LOGGING_CONFIG @@ -22,30 +24,6 @@ __all__ = ["convert_ahccd", "convert_ahccd_fwf_file", "merge_ahccd"] -def _ahccd_variable_code(code: str): - config = load_json_data_mappings("eccc-homogenized") - variable_codes = {} - for variable_code in config["variables"]: - variable_name = config["variables"][variable_code].get("_variable_name") - if variable_name: - variable_codes[variable_name] = variable_code - else: - warnings.warn( - f"Variable `{variable_code}` does not have accompanying `variable_name`. " - f"Verify JSON. Continuing with `{variable_code}` as `variable_name`." 
- ) - variable_codes[variable_code] = variable_code - - if code in variable_codes.values(): - variable = code - else: - variable = variable_codes.get(code) - if not variable: - raise NotImplementedError(f"Variable `{code}` not supported.") - - return variable - - def _ahccd_variable_metadata( variable_code: str, gen: int, @@ -67,7 +45,7 @@ def _ahccd_variable_metadata( config = load_json_data_mappings("eccc-homogenized") metadata = basic_metadata_conversion("eccc-homogenized", config) - code = _ahccd_variable_code(variable_code) + code = find_project_variable_codes(variable_code, "eccc-homogenized") variable_meta = metadata["variables"].get(code) variable_name = variable_meta.get("_variable_name") @@ -190,7 +168,7 @@ def convert_ahccd_fwf_file( ------- xarray.Dataset """ - code = _ahccd_variable_code(variable) + code = find_project_variable_codes(variable, "eccc-homogenized") variable_meta, global_attrs = _ahccd_variable_metadata(code, generation) col_names, cols_specs, header = _ahccd_column_definitions(code) @@ -321,7 +299,7 @@ def convert_ahccd( output_dir = Path(output_dir).resolve().joinpath(variable) output_dir.mkdir(parents=True, exist_ok=True) - code = _ahccd_variable_code(variable) + code = find_project_variable_codes(variable, "eccc-homogenized") var_meta, global_attrs = _ahccd_variable_metadata(code, generation) ( col_names, @@ -391,7 +369,7 @@ def merge_ahccd( ) -> None: """Merge Adjusted and Homogenized Canadian Climate Dataset files.""" if variable: - code = _ahccd_variable_code(variable) + code = find_project_variable_codes(variable, "eccc-homogenized") glob_pattern = f"{code}*.nc" output_dir = Path(output_dir).resolve().joinpath(variable) else: @@ -417,7 +395,7 @@ def merge_ahccd( if ds_ahccd[v].dtype == "O" and "flag" not in v: ds_ahccd[v] = ds_ahccd[v].astype(str) try: - variables_found.add(_ahccd_variable_code(str(v))) + variables_found.add(find_project_variable_codes(str(v), "eccc-homogenized")) except NotImplementedError: pass diff --git 
a/miranda/preprocess/_eccc_obs.py b/miranda/preprocess/_eccc_obs.py index 7cfa9249..5691ee81 100644 --- a/miranda/preprocess/_eccc_obs.py +++ b/miranda/preprocess/_eccc_obs.py @@ -37,7 +37,7 @@ from xclim.core.units import convert_units_to from miranda.archive import group_by_length -from miranda.convert import load_json_data_mappings +from miranda.preprocess._data_definitions import load_json_data_mappings from miranda.scripting import LOGGING_CONFIG from miranda.storage import file_size, report_file_size from miranda.utils import generic_extract_archive diff --git a/miranda/preprocess/_eccc_summaries.py b/miranda/preprocess/_eccc_summaries.py index 3c31ba32..118d1c31 100755 --- a/miranda/preprocess/_eccc_summaries.py +++ b/miranda/preprocess/_eccc_summaries.py @@ -30,7 +30,7 @@ __all__ = ["extract_daily_summaries", "daily_summaries_to_netcdf"] eccc_metadata = json.load( - open(Path(__file__).resolve().parent / "configs" / "eccc-obs-summary_cf_attrs.json") + open(Path(__file__).resolve().parent / "configs" / "eccc-obs-summary_attrs.json") )["variable_entry"] diff --git a/miranda/preprocess/configs/eccc-homogenized_attrs.json b/miranda/preprocess/configs/eccc-homogenized_attrs.json index fc3efb6e..a56c5b51 100644 --- a/miranda/preprocess/configs/eccc-homogenized_attrs.json +++ b/miranda/preprocess/configs/eccc-homogenized_attrs.json @@ -19,7 +19,7 @@ "contact": "info.cccs-ccsc@canada.ca", "dataset_id": "d6813de6-b20a-46cc-8990-01862ae15c5f", "documentation": "https://www.canada.ca/en/environment-climate-change/services/climate-change/canadian-centre-climate-services/display-download/technical-documentation-adjusted-climate-data.html", - "domain": "AMNO", + "domain": "CAN", "frequency": "day", "institution": "GovCan", "license": "https://climate.weather.gc.ca/prods_servs/attachment1_e.html", @@ -29,7 +29,7 @@ "processing_level": "adjusted", "project": "AHCCD", "realm": "atmos", - "source": "AHCCD", + "source": "msc", "table_date": "2023-08-03", "table_id": "ECCC", 
"type": "station-obs" diff --git a/miranda/preprocess/configs/eccc-obs-summary_cf_attrs.json b/miranda/preprocess/configs/eccc-obs-summary_attrs.json similarity index 54% rename from miranda/preprocess/configs/eccc-obs-summary_cf_attrs.json rename to miranda/preprocess/configs/eccc-obs-summary_attrs.json index b21f224e..11b3dc51 100644 --- a/miranda/preprocess/configs/eccc-obs-summary_cf_attrs.json +++ b/miranda/preprocess/configs/eccc-obs-summary_attrs.json @@ -1,173 +1,160 @@ { "Header": { - "Conventions": "CF-1.8", + "_miranda_version": true, + "_variable": true, + "acknowledgement": "This data is provided by Environment and Climate Change Canada (ECCC).", + "author": "Environment and Climate Change Canada (ECCC)", "contact": "info.cccs-ccsc@canada.ca", + "dataset_id": "b24efb37-11b6-5d03-ab19-5759f83db546", + "documentation": "https://climate.weather.gc.ca/doc/Technical_Documentation.pdf", + "domain": "CAN", + "frequency": "mon", "institution": "GovCan", - "int_missing_value": "-999", "license": "https://climate.weather.gc.ca/prods_servs/attachment1_e.html", + "license_preamble": "The data is owned by the Government of Canada (Environment and Climate Change Canada), and fall under the licence agreement for use of Environment and Climate Change Canada data.", "license_type": "permissive", - "missing_value": "1e20", "organization": "ECCC", "processing_level": "raw", + "product": "A cross-country summary of the averages and extremes for the month, including precipitation totals, max-min temperatures, and degree days.", + "project": "ECCC-SUMMARIES", "realm": "atmos", "source": "msc", - "table_date": "2023-03-23", + "table_date": "2023-08-07", + "table_id": "ECCC", "type": "station-obs" }, "variable_entry": { "cdd": { - "add_offset": 0, + "_variable_name": "cdd", "cell_methods": "time: sum", "comments": "Station data converted from Cool Deg Days (°C)", "frequency": "day", "grid_mapping": "regular_lon_lat", "long_name": "Number of Degrees Celsius Over a Mean 
Temperature of 18 °C", - "original_variable": "Cool Deg Days (°C)", - "out_name": "cdd", - "scale_factor": 1, - "standard_name": "cooling_degree_days", + "original_field": "Cool Deg Days (°C)", "type": "real", - "units": "C" + "units": "degC" }, "hdd": { - "add_offset": 0, + "_variable_name": "hdd", "cell_methods": "time: sum", "comments": "Station data converted from Heat Deg Days (°C)", - "frequency": "day", + "frequency": "mon", "grid_mapping": "regular_lon_lat", "long_name": "Number of Degrees Celsius Under a Mean Temperature of 18 °C", - "original_variable": "Heat Deg Days (°C)", - "out_name": "hdd", - "scale_factor": 1, - "standard_name": "heating_degree_days", + "original_field": "Heat Deg Days (°C)", "type": "real", - "units": "C" + "units": "degC" }, "pr": { - "add_offset": 0, + "_variable_name": "pr", "cell_methods": "time: mean", "comments": "Station data converted from Total Precip (mm) using a density of 1000 kg/m³", - "frequency": "day", + "frequency": "mon", "grid_mapping": "regular_lon_lat", "long_name": "Precipitation", - "original_variable": "Total Precip (mm)", - "out_name": "pr", - "scale_factor": 1.1574074074074073e-05, - "standard_name": "precipitation_flux", + "original_field": "Total Precip (mm)", "type": "real", - "units": "kg m-2 s-1" + "units": "mm" }, "prlp": { - "add_offset": 0, + "_variable_name": "prlp", "cell_methods": "time: mean", "comments": "Station data converted from Total Rain (mm) using a density of 1000 kg/m³", - "frequency": "day", + "frequency": "mon", "grid_mapping": "regular_lon_lat", "long_name": "Liquid Precipitation", - "original_variable": "Total Rain (mm)", - "out_name": "prlp", - "scale_factor": 1.1574074074074073e-05, - "standard_name": "rainfall_flux", + "original_field": "Total Rain (mm)", "type": "real", - "units": "kg m-2 s-1" + "units": "mm" }, "prsn": { - "add_offset": 0, + "_variable_name": "prsn", "cell_methods": "time: mean", "comments": "station data converted from Total Snow (cm) using a density of 100 
kg/m³", - "frequency": "day", + "frequency": "mon", "grid_mapping": "regular_lon_lat", "long_name": "Snowfall Flux", - "original_variable": "Total Snow (cm)", - "out_name": "prsn", - "scale_factor": 1.1574074074074073e-05, - "standard_name": "snowfall_flux", + "original_field": "Total Snow (cm)", "type": "real", - "units": "kg m-2 s-1" + "units": "cm" }, "sfcWindAz": { - "add_offset": 0, + "_variable_name": "sfcWindAz", "cell_methods": "time: mean", "comments": "Station data converted from Dir of Max Gust (10s deg)", - "frequency": "day", + "frequency": "mon", "grid_mapping": "regular_lon_lat", "long_name": "Direction from which the Daily Maximum Near-Surface Gust Wind Speed maximum Blows", - "original_variable": "Dir of Max Gust (10s deg)", - "out_name": "sfcWindAz", - "scale_factor": 1, - "standard_name": "wind_direction", + "original_field": "Dir of Max Gust (10s deg)", "type": "real", "units": "degree" }, "sfcWindMax": { - "add_offset": 0, + "_variable_name": "sfcWindMax", "cell_methods": "time: max", "comments": "Station data converted from Spd of Max Gust (km/h)", - "frequency": "day", + "frequency": "mon", "grid_mapping": "regular_lon_lat", "long_name": "Daily Maximum Near-Surface Gust Wind Speed maximum", - "original_variable": "Spd of Max Gust (km/h)", - "out_name": "sfcWindMax", - "scale_factor": 0.2777777777777778, - "standard_name": "wind_speed_of_gust maximum", + "original_field": "Spd of Max Gust (km/h)", "type": "real", - "units": "m s-1" + "units": "km h-1" }, "snd": { - "add_offset": 0, + "_variable_name": "snd", "cell_methods": "time: mean", "comments": "Station data converted from Snow on Grnd (cm)", - "frequency": "day", + "frequency": "mon", "grid_mapping": "regular_lon_lat", "long_name": "Snow Depth", - "original_variable": "Snow on Grnd (cm)", - "out_name": "snd", - "scale_factor": 0.01, - "standard_name": "surface_snow_thickness", + "original_field": "Snow on Grnd (cm)", "type": "real", - "units": "m" + "units": "cm" }, "tas": { - 
"add_offset": 273.15, + "_variable_name": "tas", "cell_methods": "time: mean", - "comments": "Station data converted from Mean Temp (°C)", - "frequency": "day", + "comments": "Station data converted from Mean Temperature (°C)", + "frequency": "mon", "grid_mapping": "regular_lon_lat", "long_name": "Near-Surface Air Temperature", - "original_variable": "Mean Temp (°C)", - "out_name": "tas", - "scale_factor": 1, - "standard_name": "air_temperature", + "original_field": "Mean Temperature", + "type": "real", + "units": "degC" + }, + "tas_days": { + "_variable_name": "tas_days", + "cell_methods": "time: count", + "comments": "Station data converted from Days With Valid Mean Temperature", + "frequency": "mon", + "grid_mapping": "regular_lon_lat", + "long_name": "Number of Days With Valid Near-Surface Air Temperature", + "original_field": "Days With Valid Mean Temp", "type": "real", - "units": "K" + "units": "1" }, "tasmax": { - "add_offset": 273.15, + "_variable_name": "tasmax", "cell_methods": "time: maximum", "comments": "station data converted from Max Temp (°C)", - "frequency": "day", + "frequency": "mon", "grid_mapping": "regular_lon_lat", "long_name": "Daily Maximum Near-Surface Air Temperature", - "original_variable": "Max Temp (°C)", - "out_name": "tasmax", - "scale_factor": 1, - "standard_name": "air_temperature", + "original_field": "Max Temp (°C)", "type": "real", - "units": "K" + "units": "degC" }, "tasmin": { - "add_offset": 273.15, + "_variable_name": "tasmin", "cell_methods": "time: minimum", "comments": "Station data converted from Min Temp (°C)", - "frequency": "day", + "frequency": "mon", "grid_mapping": "regular_lon_lat", "long_name": "Daily Minimum Near-Surface Air Temperature", - "original_variable": "Min Temp (°C)", - "out_name": "tasmin", - "scale_factor": 1, - "standard_name": "air_temperature", + "original_field": "Min Temp (°C)", "type": "real", - "units": "K" + "units": "degC" } } } diff --git a/miranda/preprocess/configs/eccc-obs_attrs.json 
b/miranda/preprocess/configs/eccc-obs_attrs.json index 7265ca71..82d93e55 100644 --- a/miranda/preprocess/configs/eccc-obs_attrs.json +++ b/miranda/preprocess/configs/eccc-obs_attrs.json @@ -12,7 +12,7 @@ ], "acknowledgement": "This data is provided by Environment and Climate Change Canada (ECCC).", "author": "Environment and Climate Change Canada (ECCC)", - "contact": "climatcentre-climatecentral@ec.gc.ca", + "contact": "ccsc-cccs@ec.gc.ca", "documentation": "https://climate.weather.gc.ca/doc/Technical_Documentation.pdf", "institution": "GovCan", "license": "https://climate.weather.gc.ca/prods_servs/attachment1_e.html", @@ -20,7 +20,7 @@ "license_type": "permissive", "organization": "ECCC", "processing_level": "raw", - "source": "ECCC-OBS", + "source": "msc", "table_date": "2023-08-02", "title": "Environment and Climate Change Canada (ECCC) weather station observations", "type": "station-obs", diff --git a/templates/ahccd_preprocess.py b/templates/eccc-ahccd_preprocess.py similarity index 100% rename from templates/ahccd_preprocess.py rename to templates/eccc-ahccd_preprocess.py diff --git a/templates/eccc-obs_preprocess.py b/templates/eccc-obs_preprocess.py new file mode 100644 index 00000000..e69de29b diff --git a/templates/eccc_ahccd_conversion.py b/templates/eccc_ahccd_conversion.py deleted file mode 100644 index e29dd643..00000000 --- a/templates/eccc_ahccd_conversion.py +++ /dev/null @@ -1,28 +0,0 @@ -from os import getenv -from pathlib import Path - -from miranda.eccc import convert_ahccd - -if __name__ == "__main__": - in_files = getenv("in") - out_files = getenv("out") - - source_files = Path(in_files) - output_path = Path(out_files) - - source_var_gens = { - "Generation3/Homog_daily_mean_temp_v2019/": ("tas", 3), - "Generation3/Homog_daily_max_temp_v2019/": ("tasmax", 3), - "Generation3/Homog_daily_min_temp_v2019/": ("tasmin", 3), - "Generation2/Adj_Daily_Total_v2017/": ("pr", 2), - "Generation2/Adj_Daily_Snow_v2017/": ("prsn", 2), - 
"Generation2/Adj_Daily_Rain_v2017/": ("prlp", 2), - } - - for folder, (variable, generation) in source_var_gens.items(): - convert_ahccd( - source_files.expanduser().joinpath(folder), - output_path, - variable, - generation, - ) From 268cca198ddbe78b73431b93332fccbdcb0c8f55 Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Wed, 9 Aug 2023 12:26:17 -0400 Subject: [PATCH 13/33] finishing touches on ahccd --- miranda/preprocess/_eccc_homogenized.py | 56 ++++++++++++++---- miranda/preprocess/_eccc_obs.py | 79 ++----------------------- miranda/preprocess/eccc.py | 5 -- 3 files changed, 48 insertions(+), 92 deletions(-) diff --git a/miranda/preprocess/_eccc_homogenized.py b/miranda/preprocess/_eccc_homogenized.py index 7e4a7559..00a417da 100644 --- a/miranda/preprocess/_eccc_homogenized.py +++ b/miranda/preprocess/_eccc_homogenized.py @@ -4,6 +4,7 @@ import calendar import logging.config from pathlib import Path +from typing import Any, Dict, List, Tuple, Type import numpy as np import pandas as pd @@ -83,7 +84,7 @@ def _ahccd_variable_metadata( def _ahccd_column_definitions( variable_code: str, -) -> tuple[dict, list[tuple[int, int]], int]: +) -> tuple[dict, list[tuple[int, int]], dict[str, type[str | int | float] | Any], int]: config = load_json_data_mappings("eccc-homogenized") metadata = basic_metadata_conversion("eccc-homogenized", config) @@ -105,8 +106,25 @@ def _ahccd_column_definitions( "Joined", "RCS", ] + dtypes = [ + str, + str, + str, + str, + int, + int, + int, + int, + float, + float, + float, + int, + str, + str, + ] column_spaces = [(0, 5), (5, 6), (6, 8), (8, 9)] ii = 9 + # 31 days in a month for i in range(1, 32): column_spaces.append((ii, ii + 7)) ii += 7 @@ -128,8 +146,10 @@ def _ahccd_column_definitions( "elev (m)", "stns joined", ] + dtypes = [str, str, str, int, int, int, int, float, float, int, str] column_spaces = [(0, 4), (4, 5), (5, 7), (7, 8)] ii = 8 + # 31 days in a month for i in range(1, 32): 
column_spaces.append((ii, ii + 8)) ii += 8 @@ -144,8 +164,12 @@ def _ahccd_column_definitions( col.lower().split("(")[0].replace("%", "pct_").strip().replace(" ", "_"): col for col in list(column_names) } + # + column_dtypes = {} + for col in column_names.keys(): + column_dtypes[col] = dtypes[list(column_names.keys()).index(col)] - return column_names, column_spaces, header_row + return column_names, column_spaces, column_dtypes, header_row def convert_ahccd_fwf_file( @@ -171,9 +195,9 @@ def convert_ahccd_fwf_file( code = find_project_variable_codes(variable, "eccc-homogenized") variable_meta, global_attrs = _ahccd_variable_metadata(code, generation) - col_names, cols_specs, header = _ahccd_column_definitions(code) + column_names, column_spaces, column_dtypes, header = _ahccd_column_definitions(code) - df = pd.read_fwf(ff, header=header, colspecs=cols_specs) + df = pd.read_fwf(ff, header=header, colspecs=column_spaces, dtype=column_dtypes) if "pr" in variable: cols = list(df.columns[0:3]) cols = cols[0::2] @@ -302,8 +326,9 @@ def convert_ahccd( code = find_project_variable_codes(variable, "eccc-homogenized") var_meta, global_attrs = _ahccd_variable_metadata(code, generation) ( - col_names, - col_spaces, + column_names, + column_spaces, + column_dtypes, header_row, ) = _ahccd_column_definitions(code) @@ -321,11 +346,11 @@ def convert_ahccd( if "tas" in variable: metadata = pd.read_csv(metadata_source, header=2) - metadata.columns = col_names.keys() + metadata.columns = column_names.keys() elif "pr" in variable: metadata = pd.read_csv(metadata_source, header=3) - metadata.columns = col_names.keys() + metadata.columns = column_names.keys() for index, row in metadata.iterrows(): if isinstance(row["stnid"], str): metadata.loc[index, "stnid"] = metadata.loc[index, "stnid"].replace( @@ -336,8 +361,8 @@ def convert_ahccd( # Convert station .txt files to netcdf for ff in Path(data_source).glob(f"{code}*.txt"): - outfile = output_dir.joinpath(ff.name.replace(".txt", ".nc")) 
- if not outfile.exists() or overwrite: + output_name = ff.name.replace(".txt", ".nc") + if not output_dir.joinpath(output_name).exists() or overwrite: logger.info(ff.name) station_id = ff.stem[2:] @@ -349,13 +374,20 @@ def convert_ahccd( ) ds_out.attrs = global_attrs - ds_out.to_netcdf(outfile, engine="h5netcdf") + write_dataset( + ds_out, + output_dir, + output_format="netcdf", + output_name=output_name, + overwrite=overwrite, + compute=True, + ) else: logger.warning( f"metadata info for station {ff.name} not found : skipping" ) else: - logger.info(f"{outfile.name} already exists: Skipping...") + logger.info(f"{output_name} already exists: Skipping...") if merge: merge_ahccd(data_source, output_dir, variable) return diff --git a/miranda/preprocess/_eccc_obs.py b/miranda/preprocess/_eccc_obs.py index 5691ee81..f4601058 100644 --- a/miranda/preprocess/_eccc_obs.py +++ b/miranda/preprocess/_eccc_obs.py @@ -45,19 +45,14 @@ config.dictConfig(LOGGING_CONFIG) __all__ = [ - "convert_flat_files", "convert_station", "merge_converted_variables", "merge_stations", ] - -KiB = int(pow(2, 10)) -MiB = int(pow(2, 20)) -GiB = int(pow(2, 30)) TABLE_DATE = dt.now().strftime("%d %B %Y") -def _fwf_column_definitions( +def _obs_fwf_column_definitions( time_frequency: str, ) -> tuple[list[str], list[int], list[type[str | int]]]: """Return the column names, widths, and data types for the fixed-width format.""" @@ -104,7 +99,7 @@ def convert_station( ): """Convert a single station's data from the fixed-width format to a netCDF file.""" data = Path(data) - column_names, column_widths, column_dtypes = _fwf_column_definitions(mode) + column_names, column_widths, column_dtypes = _obs_fwf_column_definitions(mode) if using_dask_array: pandas_reader = dd @@ -360,7 +355,8 @@ def _convert_station_file( data_files = [file] logging.info(f"Processing file: {file}.") - size_limit = 1 * GiB + # 1 GiB + size_limit = 2**30 for data in data_files: if file_size(data) > size_limit and "dask" in sys.modules: 
@@ -388,73 +384,6 @@ def _convert_station_file( temporary_file.unlink() -def convert_flat_files( - source_files: str | os.PathLike, - output_folder: str | os.PathLike | list[str | int], - variables: str | int | list[str | int], - mode: str = "hourly", - n_workers: int = 4, -) -> None: - """ - - Parameters - ---------- - source_files: str or Path - output_folder: str or Path - variables: str or List[str] - mode: {"hourly", "daily"} - n_workers: int - - Returns - ------- - None - """ - if isinstance(variables, (str, int)): - variables = [variables] - - for variable_code in variables: - variable_code = str(variable_code).zfill(3) - metadata = load_json_data_mappings("eccc-obs")[variable_code] - nc_name = metadata["cf_variable_name"] - - rep_nc = Path(output_folder).joinpath(nc_name) - rep_nc.mkdir(parents=True, exist_ok=True) - - # Loop on the files - logging.info( - f"Collecting files for variable '{metadata['standard_name']}' " - f"(filenames containing '{metadata['_table_name']}')." - ) - list_files = list() - if isinstance(source_files, list) or Path(source_files).is_file(): - list_files.append(source_files) - else: - glob_patterns = [g for g in metadata["_table_name"]] - for pattern in glob_patterns: - list_files.extend( - [f for f in Path(source_files).rglob(f"{pattern}*") if f.is_file()] - ) - manager = mp.Manager() - errored_files = manager.list() - converter_func = functools.partial( - _convert_station_file, - output_path=rep_nc, - errored_files=errored_files, - mode=mode, - variable_code=variable_code, - **metadata, - ) - with mp.Pool(processes=n_workers) as pool: - pool.map(converter_func, list_files) - pool.close() - pool.join() - - if errored_files: - logging.warning( - "Some files failed to be properly parsed:\n", ", ".join(errored_files) - ) - - def merge_stations( source_files: str | os.PathLike | None = None, output_folder: str | os.PathLike | None = None, diff --git a/miranda/preprocess/eccc.py b/miranda/preprocess/eccc.py index aab3595d..688ed196 
100644 --- a/miranda/preprocess/eccc.py +++ b/miranda/preprocess/eccc.py @@ -6,7 +6,6 @@ import logging.config import multiprocessing as mp import os -import time from functools import partial from pathlib import Path @@ -58,8 +57,6 @@ def convert_flat_files( ------- None """ - func_time = time.time() - if mode.lower() in ["h", "hour", "hourly"]: num_observations = 24 column_names = ["code", "year", "month", "day", "code_var"] @@ -124,5 +121,3 @@ def convert_flat_files( logging.warning( "Some files failed to be properly parsed:\n", ", ".join(errored_files) ) - - logging.warning(f"Process completed in {time.time() - func_time:.2f} seconds") From 8728c66b1747abe8669e054be1351651f3db9b65 Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Wed, 9 Aug 2023 17:24:26 -0400 Subject: [PATCH 14/33] significant refactoring --- miranda/eccc/_utils.py | 838 ------------------ miranda/preprocess/_eccc_homogenized.py | 171 +--- miranda/preprocess/_eccc_obs.py | 165 ++-- miranda/preprocess/_metadata.py | 190 ++++ miranda/preprocess/_treatments.py | 42 - .../preprocess/configs/eccc-obs_attrs.json | 730 ++++++++++++++- miranda/preprocess/eccc.py | 233 +++-- miranda/vocabularies/eccc.py | 125 +-- 8 files changed, 1198 insertions(+), 1296 deletions(-) delete mode 100644 miranda/eccc/_utils.py create mode 100644 miranda/preprocess/_metadata.py delete mode 100644 miranda/preprocess/_treatments.py diff --git a/miranda/eccc/_utils.py b/miranda/eccc/_utils.py deleted file mode 100644 index a501dac6..00000000 --- a/miranda/eccc/_utils.py +++ /dev/null @@ -1,838 +0,0 @@ -from __future__ import annotations - -import logging.config -from collections.abc import Mapping -from datetime import datetime as dt - -from miranda.scripting import LOGGING_CONFIG - -__all__ = ["cf_station_metadata"] - -logging.config.dictConfig(LOGGING_CONFIG) - - -def cf_station_metadata(variable_code: int | str) -> Mapping[str, int | float | str]: - """ - - Parameters - 
---------- - variable_code: int or str - - Returns - ------- - dict - """ - ec_hourly_variables = { - "001": { - "_table_name": {"DLY02", "DLY04", "DLY44"}, - "original_units": "0.1 °C", - "raw_units": "degC", - "units": "K", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Daily Maximum Temperature", - "standard_name": "air_temperature_maximum", - "nc_name": "tasmax", - }, - "002": { - "_table_name": {"DLY02", "DLY04", "DLY44"}, - "original_units": "0.1 °C", - "raw_units": "degC", - "units": "K", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Daily Minimum Temperature", - "standard_name": "air_temperature_minimum", - "nc_name": "tasmin", - }, - "003": { - "_table_name": {"DLY02", "DLY04", "DLY44"}, - "original_units": "0.1 °C", - "raw_units": "degC", - "units": "K", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Daily Mean Temperature", - "standard_name": "air_temperature", - "nc_name": "tas", - }, - "010": { - "_table_name": {"DLY02", "DLY04", "DLY44"}, - "original_units": "0.1 mm day-1", - "raw_units": "mm", - "units": "m", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Daily Total Rainfall", - "standard_name": "liquid_precipitation_amount", - "nc_name": "prlptot", - }, - "011": { - "_table_name": {"DLY02", "DLY04", "DLY44"}, - "original_units": "0.1 cm day-1", - "raw_units": "cm", - "units": "m", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Daily Total Snowfall", - "standard_name": "solid_precipitation_amount", - "nc_name": "prsntot", - }, - "012": { - "_table_name": {"DLY02", "DLY04", "DLY44"}, - "original_units": "0.1 mm day-1", - "raw_units": "mm", - "units": "m", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Daily Total Precipitation", - "standard_name": "precipitation_amount", - "nc_name": "prcptot", - }, - "013": { - "_table_name": {"DLY02", "DLY04", "DLY44"}, - "original_units": "cm", - "raw_units": "cm", - "units": "m", - "scale_factor": 1, - "add_offset": 0, - "long_name": "Snow on the 
Ground", - "standard_name": "surface_snow_thickness", - "nc_name": "sndtot", - }, - "014": { - "_table_name": {"DLY02", "DLY04", "DLY44"}, - "raw_units": "1", - "units": "1", - "scale_factor": 1, - "add_offset": 0, - "long_name": "Thunderstorms", - "standard_name": "thunderstorm_presence", - "nc_name": "thunder", - }, - "015": { - "_table_name": {"DLY02", "DLY04", "DLY44"}, - "raw_units": "1", - "units": "1", - "scale_factor": 1, - "add_offset": 0, - "long_name": "Freezing rain or drizzle", - "standard_name": "freeze_rain_drizzle_presence", - "nc_name": "freezing_rain_drizzle", - }, - "016": { - "_table_name": {"DLY02", "DLY04", "DLY44"}, - "raw_units": "1", - "units": "1", - "scale_factor": 1, - "add_offset": 0, - "long_name": "Hail", - "standard_name": "hail_presence", - "nc_name": "hail", - }, - "017": { - "_table_name": {"DLY02", "DLY04", "DLY44"}, - "raw_units": "1", - "units": "1", - "scale_factor": 1, - "add_offset": 0, - "long_name": "Fog or Ice Fog", - "standard_name": "fog_ice_fog_presence", - "nc_name": "fog_ice_fog", - }, - "018": { - "_table_name": {"DLY02", "DLY04"}, - "raw_units": "1", - "units": "1", - "scale_factor": 1, - "add_offset": 0, - "long_name": "Smoke or Haze", - "standard_name": "smoke_haze_presence", - "nc_name": "smoke_haze", - }, - "019": { - "_table_name": {"DLY02", "DLY04"}, - "raw_units": "1", - "units": "1", - "scale_factor": 1, - "add_offset": 0, - "long_name": "Blowing Dust or Sand", - "standard_name": "blowing_dust_sand_presence", - "nc_name": "blowing_dust_sand", - }, - "020": { - "_table_name": {"DLY02", "DLY04"}, - "raw_units": "1", - "units": "1", - "scale_factor": 1, - "add_offset": 0, - "long_name": "Blowing snow", - "standard_name": "blowing_snow_presence", - "nc_name": "blow_snow", - }, - "021": { - "_table_name": {"DLY02", "DLY04"}, - "raw_units": "1", - "units": "1", - "scale_factor": 1, - "add_offset": 0, - "long_name": "Wind speed >= 28 Knots", - "standard_name": "wind_exceeding_28_knots", - "nc_name": 
"wind_gt_28kt", - }, - "022": { - "_table_name": {"DLY02", "DLY04"}, - "raw_units": "1", - "units": "1", - "scale_factor": 1, - "add_offset": 0, - "long_name": "Wind speed >= 34 Knots", - "standard_name": "wind_exceeding_34_knots", - "nc_name": "wind_gt_34kt", - }, - "023": { - "_table_name": {"DLY02", "DLY04"}, - "original_units": "10's of degrees", - "raw_units": "deg", - "units": "deg", - "scale_factor": 10, - "add_offset": 0, - "long_name": "Direction of extreme gust (16 pts) to December 1976", - "standard_name": "gust_to_direction", - "nc_name": "gust_dir_16pts", - }, - "024": { - "_table_name": {"DLY02", "DLY04"}, - "original_units": "km/h", - "raw_units": "km h-1", - "units": "m s-1", - "scale_factor": 1, - "add_offset": 0, - "long_name": "Speed of extreme gust", - "standard_name": "wind_speed_of_gust", - "nc_name": "gust_speed", - }, - "025": { - "_table_name": {"DLY02", "DLY04"}, - "raw_units": "h", - "units": "h", - "scale_factor": 1, - "add_offset": 0, - "long_name": "UTC hour of extreme gust", - "standard_name": "hour_of_extreme_gust", - "nc_name": "gust_hour", - }, - "061": { - "_table_name": {"HLY11"}, - "original_units": "0.001 MJ/m", - "raw_units": "W m-2 h-1", - "units": "W m-2 h-1", - "scale_factor": 1e6 / (60 * 60), - "add_offset": 0, - "long_name": "RF1 global solar radiation", - "standard_name": "solar_radiation_flux", - "nc_name": "rf1_radiation", - }, - "062": { - "_table_name": {"HLY11"}, - "original_units": "0.001 MJ/m", - "raw_units": "W m-2 h-1", - "units": "W m-2 h-1", - "scale_factor": 1e6 / (60 * 60), - "add_offset": 0, - "long_name": "RF2 sky (diffuse) radiation", - "standard_name": "solar_radiation_flux", - "nc_name": "rf2_radiation", - }, - "063": { - "_table_name": {"HLY11"}, - "original_units": "0.001 MJ/m", - "raw_units": "W m-2 h-1", - "units": "W m-2 h-1", - "scale_factor": 1e6 / (60 * 60), - "add_offset": 0, - "long_name": "RF3 reflected solar radiation", - "standard_name": "solar_radiation_flux", - "nc_name": "rf3_radiation", 
- }, - "064": { - "_table_name": {"HLY11"}, - "original_units": "0.001 MJ/m", - "raw_units": "W m-2 h-1", - "units": "W m-2 h-1", - "scale_factor": 1e6 / (60 * 60), - "add_offset": 0, - "long_name": "RF4 net all wave radiation", - "standard_name": "solar_radiation_flux", - "nc_name": "rf4_radiation", - }, - "067": { - "_table_name": {"HLY11"}, - "original_units": "0.01 Kilolux_hrs", - "raw_units": "lux h", - "units": "lux h", - "scale_factor": 10, - "add_offset": 0, - "long_name": "RF7 daylight illumination", - "standard_name": "solar_radiation_flux", - "nc_name": "rf7_radiation", - }, - "068": { - "_table_name": {"HLY11"}, - "original_units": "0.001 MJ/m", - "raw_units": "W m-2 h-1", - "units": "W m-2 h-1", - "scale_factor": 1e6 / (60 * 60), - "add_offset": 0, - "long_name": "RF8 direct solar radiation", - "standard_name": "solar_radiation_flux", - "nc_name": "rf8_radiation", - }, - "069": { - "_table_name": {"HLY15"}, - "original_units": "10's of degrees", - "raw_units": "deg", - "units": "deg", - "scale_factor": 1, - "add_offset": 0, - "long_name": "Direction - 45B anemometer (8 pts)", - "standard_name": "wind_to_direction", - "nc_name": "wind_dir_45B", - }, - "071": { - "_table_name": {"HLY01"}, - "original_units": "30's of meters", - "raw_units": "m", - "units": "m", - "scale_factor": 30, - "add_offset": 0, - "long_name": "Ceiling height of lowest layer of clouds", - "standard_name": "ceiling_cloud_height", - "nc_name": "ceiling_hgt", - }, - "072": { - "_table_name": {"HLY01"}, - "original_units": "0.1 km", - "raw_units": "km", - "units": "m", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Visibility", - "standard_name": "visibility_in_air", - "nc_name": "visibility", - }, - "073": { - "_table_name": {"HLY01"}, - "original_units": "0.01 kPa", - "raw_units": "Pa", - "units": "Pa", - "scale_factor": 10, - "add_offset": 0, - "long_name": "Sea Level Pressure", - "standard_name": "air_pressure_at_mean_sea_level", - "nc_name": "psl", - }, - "074": { - 
"_table_name": {"HLY01"}, - "original_units": "0.1 °C", - "raw_units": "degC", - "units": "K", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Dew Point Temperature", - "standard_name": "dew_point_temperature", - "nc_name": "tds", - }, - "075": { - "_table_name": {"HLY01"}, - "original_units": "10's of degrees", - "raw_units": "deg", - "units": "deg", - "scale_factor": 10, - "add_offset": 0, - "long_name": "Wind Direction at 2 m (U2A Anemometer) (16 pts)", - "standard_name": "wind_direction_u2a", - "nc_name": "wind_dir_u2a_16", - }, - "076": { - "_table_name": {"HLY01"}, - "original_units": "km/h", - "raw_units": "km h-1", - "units": "m s-1", - "scale_factor": 1, - "add_offset": 0, - "long_name": "Wind Speed - U2A (16 pts) to December 1970", - "standard_name": "wind_speed_u2a", - "nc_name": "wind_speed_u2a", - }, - "077": { - "_table_name": {"HLY01"}, - "original_units": "0.01 kPa", - "raw_units": "Pa", - "units": "Pa", - "scale_factor": 10, - "add_offset": 0, - "long_name": "Station Pressure", - "standard_name": "atmospheric_pressure", - "nc_name": "pressure", - }, - "078": { - "_table_name": {"HLY01"}, - "original_units": "0.1 °C", - "raw_units": "degC", - "units": "K", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Dry Bulb Temperature", - "standard_name": "dry_bulb_temperature", - "nc_name": "tas_dry", - }, - "079": { - "_table_name": {"HLY01"}, - "original_units": "0.1 °C", - "raw_units": "degC", - "units": "K", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Wet Bulb temperature", - "standard_name": "wet_bulb_temperature", - "nc_name": "tas_wet", - }, - "080": { - "_table_name": {"HLY01"}, - "original_units": "%", - "raw_units": "1", - "units": "1", - "scale_factor": 1, - "add_offset": 0, - "long_name": "Relative Humidity", - "standard_name": "relative_humidity", - "nc_name": "hur", - }, - "081": { - "_table_name": {"HLY01"}, - "original_units": "%", - "raw_units": "1", - "units": "1", - "scale_factor": 10, - "add_offset": 0, - 
"long_name": "Total Cloud Opacity", - "standard_name": "cloud_albedo", - "nc_name": "clo", - }, - "082": { - "_table_name": {"HLY01"}, - "original_units": "%", - "raw_units": "1", - "units": "1", - "scale_factor": 10, - "add_offset": 0, - "long_name": "Total Cloud Amount", - "standard_name": "cloud_area_fraction", - "nc_name": "clt", - }, - "089": { - "_table_name": {"HLY01"}, - "raw_units": "1", - "units": "1", - "scale_factor": 1, - "add_offset": 0, - "long_name": "Freezing Rain", - "standard_name": "freezing_rain", - "nc_name": "freeze_rain", - }, - "094": { - "_table_name": {"HLY01"}, - "raw_units": "1", - "units": "1", - "scale_factor": 1, - "add_offset": 0, - "long_name": "Ice Pellets", - "standard_name": "ice_pellet_presence", - "nc_name": "ice_pellets", - }, - "107": { - "_table_name": {"HLY01"}, - "original_units": "Tenths", - "raw_units": "1", - "units": "1", - "scale_factor": 10, - "add_offset": 0, - "long_name": "Lowest cloud layer opacity", - "standard_name": "low_type_cloud_opacity_fraction", - "nc_name": "1low_cloud_opac", - }, - "108": { - "_table_name": {"HLY01"}, - "original_units": "Tenths", - "raw_units": "1", - "units": "1", - "scale_factor": 10, - "add_offset": 0, - "long_name": "Lowest cloud layer amount or condition", - "standard_name": "low_type_cloud_area_fraction", - "nc_name": "1low_cloud_frac", - }, - "109": { - "_table_name": {"HLY01"}, - "raw_units": "1", - "units": "1", - "scale_factor": 1, - "add_offset": 0, - "long_name": "Lowest cloud layer type", - "standard_name": "low_type_cloud_type", - "nc_name": "1low_cloud_type", - }, - "110": { - "_table_name": {"HLY01"}, - "original_units": "30's of meters", - "raw_units": "m", - "units": "m", - "scale_factor": 30, - "add_offset": 0, - "long_name": "Lowest cloud layer height", - "standard_name": "low_type_cloud_height", - "nc_name": "1low_cloud_hgt", - }, - "111": { - "_table_name": {"HLY01"}, - "original_units": "Tenths", - "raw_units": "1", - "units": "1", - "scale_factor": 10, - 
"add_offset": 0, - "long_name": "Second lowest cloud layer opacity", - "standard_name": "low_type_cloud_opacity_fraction", - "nc_name": "2low_cloud_opac", - }, - "112": { - "_table_name": {"HLY01"}, - "original_units": "Tenths", - "raw_units": "1", - "units": "1", - "scale_factor": 10, - "add_offset": 0, - "long_name": "Second lowest cloud layer amount or condition", - "standard_name": "low_type_cloud_area_fraction", - "nc_name": "2low_cloud_frac", - }, - "113": { - "_table_name": {"HLY01"}, - "original_units": "", - "raw_units": "1", - "units": "1", - "scale_factor": 1, - "add_offset": 0, - "long_name": "Second lowest cloud layer type", - "standard_name": "low_type_cloud_type", - "nc_name": "2low_cloud_type", - }, - "114": { - "_table_name": {"HLY01"}, - "original_units": "30's of meters", - "raw_units": "m", - "units": "m", - "scale_factor": 30, - "add_offset": 0, - "long_name": "Second lowest cloud layer height", - "standard_name": "low_type_cloud_height", - "nc_name": "2low_cloud_hgt", - }, - "115": { - "_table_name": {"HLY01"}, - "original_units": "Tenths", - "raw_units": "1", - "units": "1", - "scale_factor": 10, - "add_offset": 0, - "long_name": "Thirsd lowest cloud layer opacity", - "standard_name": "low_type_cloud_opacity_fraction", - "nc_name": "3low_cloud_opac", - }, - "116": { - "_table_name": {"HLY01"}, - "original_units": "Tenths", - "raw_units": "1", - "units": "1", - "scale_factor": 10, - "add_offset": 0, - "long_name": "Third lowest cloud layer amount or condition", - "standard_name": "low_type_cloud_area_fraction", - "nc_name": "3low_cloud_frac", - }, - "117": { - "_table_name": {"HLY01"}, - "original_units": "", - "raw_units": "1", - "units": "1", - "scale_factor": 1, - "add_offset": 0, - "long_name": "Third lowest cloud layer type", - "standard_name": "low_type_cloud_type", - "nc_name": "3low_cloud_type", - }, - "118": { - "_table_name": {"HLY01"}, - "original_units": "30's of meters", - "raw_units": "m", - "units": "m", - "scale_factor": 30, - 
"add_offset": 0, - "long_name": "Third lowest cloud layer height", - "standard_name": "low_type_cloud_height", - "nc_name": "3low_cloud_hgt", - }, - "123": { - "_table_name": {"HLY01"}, - "original_units": "0.1 mm", - "raw_units": "mm h-1", - "units": "kg m2 s-1", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Total Rainfall", - "standard_name": "rainfall_flux", - "nc_name": "rainfall", - }, - "133": { - "_table_name": {"HLY10"}, - "original_units": "0.1 hrs", - "raw_units": "h", - "units": "s", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Sunshine", - "standard_name": "duration_of_sunshine", - "nc_name": "sun", - }, - "156": { - "_table_name": {"HLY01"}, - "original_units": "10's of degrees", - "raw_units": "deg", - "units": "deg", - "scale_factor": 10, - "long_name": "Wind Direction - U2A (36 pts) from January 1971", - "standard_name": "wind_direction_u2a", - "nc_name": "wind_dir_u2a_36", - }, - "262": { - "_table_name": {"HLY01_RCS"}, - "original_units": "0.1 mm", - "raw_units": "mm", - "units": "kg m-2", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Total Precipitation (minutes 00-60)", - "standard_name": "precipitation_amount", - "nc_name": "prtot", - }, - "263": { - "_table_name": {"HLY01_RCS"}, - "original_units": "0.1 mm", - "raw_units": "mm", - "units": "kg m-2", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Total Precipitation (minutes 00-15)", - "standard_name": "precipitation_amount", - "nc_name": "prtot_q1", - }, - "264": { - "_table_name": {"HLY01_RCS"}, - "original_units": "0.1 mm", - "raw_units": "mm", - "units": "kg m-2", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Total Precipitation (minutes 15-30)", - "standard_name": "precipitation_amount", - "nc_name": "prtot_q2", - }, - "265": { - "_table_name": {"HLY01_RCS"}, - "original_units": "0.1 mm", - "raw_units": "mm", - "units": "kg m-2", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Total Precipitation (minutes 30-45)", - 
"standard_name": "precipitation_amount", - "nc_name": "prtot_q3", - }, - "266": { - "_table_name": {"HLY01_RCS"}, - "original_units": "0.1 mm", - "raw_units": "mm", - "units": "kg m-2", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Total Precipitation (minutes 45-60)", - "standard_name": "precipitation_amount", - "nc_name": "prtot_q4", - }, - "267": { - "_table_name": {"HLY01_RCS"}, - "original_units": "0.1 kg/m²", - "raw_units": "kg m-2", - "units": "kg m-2", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Precipitation Gauge Weight per Unit Area (at minute 15)", - "standard_name": "precipitation_amount", - "nc_name": "precipitation_weight_q1", - }, - "268": { - "_table_name": {"HLY01_RCS"}, - "original_units": "0.1 kg/m²", - "raw_units": "kg m-2", - "units": "kg m-2", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Precipitation Gauge Weight per Unit Area (at minute 30)", - "standard_name": "precipitation_amount", - "nc_name": "precipitation_weight_q2", - }, - "269": { - "_table_name": {"HLY01_RCS"}, - "original_units": "0.1 kg/m²", - "raw_units": "kg m-2", - "units": "kg m-2", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Precipitation Gauge Weight per Unit Area (at minute 45)", - "standard_name": "precipitation_amount", - "nc_name": "precipitation_weight_q3", - }, - "270": { - "_table_name": {"HLY01_RCS"}, - "original_units": "0.1 kg/m²", - "raw_units": "kg m-2", - "units": "kg m-2", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Precipitation Gauge Weight per Unit Area (at minute 60)", - "standard_name": "precipitation_amount", - "nc_name": "precipitation_weight_q4", - }, - "271": { - "_table_name": {"HLY01_RCS"}, - "original_units": "0.1 km/h", - "raw_units": "km h-1", - "nc_units": "m s-1", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Wind Speed at 2 m (minutes 00-15)", - "standard_name": "wind_speed", - "nc_name": "wind_speed_q1", - }, - "272": { - "_table_name": {"HLY01_RCS"}, - 
"original_units": "0.1 km/h", - "raw_units": "km h-1", - "nc_units": "m s-1", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Wind Speed at 2 m (minutes 15-30)", - "standard_name": "wind_speed", - "nc_name": "wind_speed_q2", - }, - "273": { - "_table_name": {"HLY01_RCS"}, - "original_units": "0.1 km/h", - "raw_units": "km h-1", - "nc_units": "m s-1", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Wind Speed at 2 m (minutes 30-45)", - "standard_name": "wind_speed", - "nc_name": "wind_speed_q3", - }, - "274": { - "_table_name": {"HLY01_RCS"}, - "original_units": "0.1 km/h", - "raw_units": "km h-1", - "nc_units": "m s-1", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Wind Speed at 2 m (minutes 45-60)", - "standard_name": "wind_speed", - "nc_name": "wind_speed_q4", - }, - "275": { - "_table_name": {"HLY01_RCS"}, - "original_units": "cm", - "raw_units": "cm", - "units": "m", - "scale_factor": 1, - "add_offset": 0, - "long_name": "Snow Depth (at minute 60)", - "standard_name": "surface_snow_thickness", - "nc_name": "snd_q4", - }, - "276": { - "_table_name": {"HLY01_RCS"}, - "original_units": "cm", - "raw_units": "cm", - "units": "m", - "scale_factor": 1, - "add_offset": 0, - "long_name": "Snow Depth (at minute 15)", - "standard_name": "surface_snow_thickness", - "nc_name": "snd_q1", - }, - "277": { - "_table_name": {"HLY01_RCS"}, - "original_units": "cm", - "raw_units": "cm", - "units": "m", - "scale_factor": 1, - "add_offset": 0, - "long_name": "Snow Depth (at minute 30)", - "standard_name": "surface_snow_thickness", - "nc_name": "snd_q2", - }, - "278": { - "_table_name": {"HLY01_RCS"}, - "original_units": "cm", - "raw_units": "cm", - "units": "m", - "scale_factor": 1, - "add_offset": 0, - "long_name": "Snow Depth (at minute 45)", - "standard_name": "surface_snow_thickness", - "nc_name": "snd_q3", - }, - "279": { - "_table_name": {"HLY01_RCS"}, - "original_units": "Degrees", - "raw_units": "deg", - "nc_units": "deg", - "scale_factor": 1, 
- "add_offset": 0, - "long_name": "Wind Direction at 2 m (minutes 50-60)", - "standard_name": "wind_direction", - "nc_name": "wind_dir", - }, - "280": { - "_table_name": {"HLY01_RCS"}, - "original_units": "0.1 km/h", - "raw_units": "km h-1", - "units": "m s-1", - "scale_factor": 0.1, - "add_offset": 0, - "long_name": "Wind Speed at 2 m (minutes 50-60)", - "standard_name": "wind_speed", - "nc_name": "wind_speed", - }, - } - code = str(variable_code).zfill(3) - if code in ["061"]: - raise NotImplementedError() - try: - variable = ec_hourly_variables[code] - variable["missing_flags"] = "M" - variable["missing_values"] = {-9999, "#####"} - variable["least_significant_digit"] = "" - except KeyError: - logging.error(f"Hourly variable `{code}` not supported.") - raise - return variable diff --git a/miranda/preprocess/_eccc_homogenized.py b/miranda/preprocess/_eccc_homogenized.py index 00a417da..a16b3e14 100644 --- a/miranda/preprocess/_eccc_homogenized.py +++ b/miranda/preprocess/_eccc_homogenized.py @@ -4,7 +4,6 @@ import calendar import logging.config from pathlib import Path -from typing import Any, Dict, List, Tuple, Type import numpy as np import pandas as pd @@ -12,11 +11,11 @@ from miranda.io import write_dataset from miranda.io.utils import name_output_file -from miranda.preprocess._data_definitions import ( - find_project_variable_codes, - load_json_data_mappings, +from miranda.preprocess._data_definitions import find_project_variable_codes +from miranda.preprocess._metadata import ( + eccc_variable_metadata, + homogenized_column_definitions, ) -from miranda.preprocess._treatments import basic_metadata_conversion from miranda.scripting import LOGGING_CONFIG logging.config.dictConfig(LOGGING_CONFIG) @@ -25,153 +24,6 @@ __all__ = ["convert_ahccd", "convert_ahccd_fwf_file", "merge_ahccd"] -def _ahccd_variable_metadata( - variable_code: str, - gen: int, -) -> (dict[str, int | float | str], dict, list[tuple[int, int]], int): - """ - - Parameters - ---------- - 
variable_code - gen: {1, 2, 3} - - Returns - ------- - dict[str, int or str or float], dict, list[tuple[int, int]], int - """ - generation = {1: "First", 2: "Second", 3: "Third"}.get(gen) - if not generation: - raise NotImplementedError(f"Generation '{gen}' not supported") - - config = load_json_data_mappings("eccc-homogenized") - metadata = basic_metadata_conversion("eccc-homogenized", config) - code = find_project_variable_codes(variable_code, "eccc-homogenized") - - variable_meta = metadata["variables"].get(code) - variable_name = variable_meta.get("_variable_name") - if variable_name: - variable_meta["original_variable_name"] = variable_code - variable_meta = {variable_name: variable_meta} - del variable_meta[variable_name]["_variable_name"] - else: - variable_meta = {variable_code: variable_meta} - - header = metadata["Header"] - to_delete = [] - # Conditional handling of global attributes based on generation - for field in [f for f in header if f.startswith("_")]: - if isinstance(header[field], bool): - if header[field] and field == "_variable": - header[field[1:]] = variable_name - elif isinstance(header[field], dict): - attr_treatment = header[field]["generation"] - if field in ["_citation" "_product"]: - for attribute, value in attr_treatment.items(): - if attribute == generation: - header[field[1:]] = value - else: - raise AttributeError( - f"Attribute treatment configuration for field `{field}` is not properly configured. Verify JSON." 
- ) - to_delete.append(field) - - for field in to_delete: - del header[field] - - return variable_meta, header - - -def _ahccd_column_definitions( - variable_code: str, -) -> tuple[dict, list[tuple[int, int]], dict[str, type[str | int | float] | Any], int]: - config = load_json_data_mappings("eccc-homogenized") - metadata = basic_metadata_conversion("eccc-homogenized", config) - - variable = metadata["variables"][variable_code]["_variable_name"] - if variable.startswith("tas"): - column_names = [ - "No", - "StnId", - "Station name", - "Prov", - "FromYear", - "FromMonth", - "ToYear", - "ToMonth", - "%Miss", - "Lat(deg)", - "Long(deg)", - "Elev(m)", - "Joined", - "RCS", - ] - dtypes = [ - str, - str, - str, - str, - int, - int, - int, - int, - float, - float, - float, - int, - str, - str, - ] - column_spaces = [(0, 5), (5, 6), (6, 8), (8, 9)] - ii = 9 - # 31 days in a month - for i in range(1, 32): - column_spaces.append((ii, ii + 7)) - ii += 7 - column_spaces.append((ii, ii + 1)) - ii += 1 - header_row = 3 - - elif variable.startswith("pr"): - column_names = [ - "Prov", - "Station name", - "stnid", - "beg yr", - "beg mon", - "end yr", - "end mon", - "lat (deg)", - "long (deg)", - "elev (m)", - "stns joined", - ] - dtypes = [str, str, str, int, int, int, int, float, float, int, str] - column_spaces = [(0, 4), (4, 5), (5, 7), (7, 8)] - ii = 8 - # 31 days in a month - for i in range(1, 32): - column_spaces.append((ii, ii + 8)) - ii += 8 - column_spaces.append((ii, ii + 1)) - ii += 1 - header_row = 0 - - else: - raise KeyError - - column_names = { - col.lower().split("(")[0].replace("%", "pct_").strip().replace(" ", "_"): col - for col in list(column_names) - } - # - column_dtypes = {} - for col in column_names.keys(): - column_dtypes[col] = dtypes[list(column_names.keys()).index(col)] - - return column_names, column_spaces, column_dtypes, header_row - - def convert_ahccd_fwf_file( ff: Path | str, metadata: pd.DataFrame, @@ -194,8 +46,12 @@ def convert_ahccd_fwf_file( 
""" code = find_project_variable_codes(variable, "eccc-homogenized") - variable_meta, global_attrs = _ahccd_variable_metadata(code, generation) - column_names, column_spaces, column_dtypes, header = _ahccd_column_definitions(code) + variable_meta, global_attrs = eccc_variable_metadata( + code, "eccc-homogenized", generation + ) + column_names, column_spaces, column_dtypes, header = homogenized_column_definitions( + code + ) df = pd.read_fwf(ff, header=header, colspecs=column_spaces, dtype=column_dtypes) if "pr" in variable: @@ -324,20 +180,21 @@ def convert_ahccd( output_dir.mkdir(parents=True, exist_ok=True) code = find_project_variable_codes(variable, "eccc-homogenized") - var_meta, global_attrs = _ahccd_variable_metadata(code, generation) + var_meta, global_attrs = eccc_variable_metadata( + code, "eccc-homogenized", generation + ) ( column_names, column_spaces, column_dtypes, header_row, - ) = _ahccd_column_definitions(code) + ) = homogenized_column_definitions(code) gen = {2: "Second", 3: "Third"}.get(generation) if generation == 3 and code in {"dx", "dn", "dm"}: station_meta = "ahccd_gen3_temperature.csv" elif generation == 2 and code in {"dt", "ds", "dr"}: station_meta = "ahccd_gen2_precipitation.csv" - else: raise NotImplementedError(f"Code '{code} for generation {gen}.") metadata_source = ( diff --git a/miranda/preprocess/_eccc_obs.py b/miranda/preprocess/_eccc_obs.py index f4601058..6ff64d8f 100644 --- a/miranda/preprocess/_eccc_obs.py +++ b/miranda/preprocess/_eccc_obs.py @@ -1,33 +1,18 @@ """Specialized conversion tools for Environment and Climate Change Canada / Meteorological Service of Canada data.""" -###################################################################### -# S.Biner, Ouranos, mai 2019 -# -# methodologie -# -# 1) on rassemble les fichiers netcdf des differentes eccc en un seul fichier netCDF. -# -# 2) on scan les fichiers sources annuels en cherchant une variable et on sauve -# ce qu'on trouve dans des fichiers netcdf. 
On applique aussi les flags -# et on fait les changements d'unites -# -# obtenu via http://climate.weather.gc.ca/index_e.html en cliquant sur 'about the data' -####################################################################### from __future__ import annotations -import contextlib import functools import logging import multiprocessing as mp import os import re -import sys import tempfile import time from calendar import monthrange from datetime import datetime as dt from logging import config from pathlib import Path -from typing import Any, List +from typing import Any import dask.dataframe as dd import numpy as np @@ -37,10 +22,13 @@ from xclim.core.units import convert_units_to from miranda.archive import group_by_length -from miranda.preprocess._data_definitions import load_json_data_mappings +from miranda.preprocess._data_definitions import ( + find_project_variable_codes, + load_json_data_mappings, +) +from miranda.preprocess._metadata import eccc_variable_metadata, obs_column_definitions from miranda.scripting import LOGGING_CONFIG -from miranda.storage import file_size, report_file_size -from miranda.utils import generic_extract_archive +from miranda.vocabularies.eccc import obs_vocabularies config.dictConfig(LOGGING_CONFIG) @@ -52,32 +40,58 @@ TABLE_DATE = dt.now().strftime("%d %B %Y") -def _obs_fwf_column_definitions( - time_frequency: str, -) -> tuple[list[str], list[int], list[type[str | int]]]: - """Return the column names, widths, and data types for the fixed-width format.""" - if time_frequency.lower() in ["h", "hour", "hourly"]: - num_observations = 24 - column_names = ["code", "year", "month", "day", "code_var"] - column_widths = [7, 4, 2, 2, 3] - column_dtypes = [str, int, int, int, str] - elif time_frequency.lower() in ["d", "day", "daily"]: - num_observations = 31 - column_names = ["code", "year", "month", "code_var"] - column_widths = [7, 4, 2, 3] - column_dtypes = [str, int, int, str] +def convert_observation( + data_source: str | Path | 
list[str | Path], + output_dir: str | Path, + variable: str, + *, + generation: int | None = None, + merge: bool = False, + overwrite: bool = False, +): + """Convert a single station's data from the fixed-width format to a netCDF file.""" + + output_dir = Path(output_dir).resolve().joinpath(variable) + output_dir.mkdir(parents=True, exist_ok=True) + + code = find_project_variable_codes(variable, "eccc-obs") + var_meta, global_attrs = eccc_variable_metadata(code, "eccc-obs", generation) + ( + column_names, + column_spaces, + column_dtypes, + header_row, + ) = obs_column_definitions(code) + + archives = list() + if isinstance(data_source, list) or Path(data_source).is_file(): + archives.append(data_source) else: - raise NotImplementedError("`mode` must be 'h'/'hourly or 'd'/'daily'.") + tables = [] + for repository in obs_vocabularies: + if code in repository.values(): + tables.append(str(repository.keys())) + logging.info( + f"Collecting files for variable '{variable}'. " + f"Filename patterns containing variable code '{code}: {', '.join(tables)}'." 
+ ) + for table in tables: + archives.extend([f for f in Path(data_source).rglob(f"{table}*.gz")]) + + # Create the output directory + output_variable_dir = Path(output_dir).joinpath(variable) + output_variable_dir.mkdir(parents=True, exist_ok=True) - # Add the data columns - for i in range(1, num_observations + 1): - data_entry, flag_entry = f"D{i:0n}", f"F{i:0n}" - column_names.append(data_entry) - column_names.append(flag_entry) - column_widths.extend([6, 1] * num_observations) - column_dtypes.extend([str, str]) + # Loop on the files + errored_files = [] + for file in archives: + # FIXME: convert the file using the appropriate function + pass - return column_names, column_widths, column_dtypes + if errored_files: + logging.warning( + "Some files failed to be properly parsed:\n", ", ".join(errored_files) + ) def _remove_duplicates(ds): @@ -91,6 +105,7 @@ def _remove_duplicates(ds): def convert_station( data: str | os.PathLike, + variable: str, mode: str, using_dask_array: bool = False, *, @@ -99,11 +114,13 @@ def convert_station( ): """Convert a single station's data from the fixed-width format to a netCDF file.""" data = Path(data) - column_names, column_widths, column_dtypes = _obs_fwf_column_definitions(mode) + variable_code = find_project_variable_codes(variable, "eccc-obs") + column_names, column_widths, column_dtypes, header = obs_column_definitions(mode) if using_dask_array: pandas_reader = dd - chunks = dict(blocksize=200 * MiB) + # set the blocksize to 200 MB + chunks = dict(blocksize=200 * 2**20) else: pandas_reader = pd chunks = dict() @@ -148,12 +165,12 @@ def convert_station( has_variable_codes = (df_code["code_var"] == variable_code).any() if not has_variable_codes: logging.info( - f"Variable `{nc_name}` not found for station code: {code} in file {data}. Continuing..." + f"Variable `{variable}` not found for station code: {code} in file {data}. Continuing..." 
) continue # Perform the data treatment - logging.info(f"Converting `{nc_name}` for station code: {code}") + logging.info(f"Converting `{variable}` for station code: {code}") # Dump the data into a DataFrame df_var = df_code[df_code["code_var"] == variable_code].copy() @@ -328,62 +345,6 @@ def convert_station( del df -def _convert_station_file( - file: Path, - output_path: Path, - errored_files: list[Path], - mode: str, - add_offset: float, - long_name: str, - missing_flags: set[str], - missing_values: set[str], - nc_name: str, - raw_units: str, - units: str, - scale_factor: float, - standard_name: str, - variable_code: str, - **dask_kwargs, -): - if not missing_values: - missing_values = {-9999, "#####"} - - with tempfile.TemporaryDirectory() as temp_folder: - if file.suffix in [".gz", ".tar", ".zip", ".7z"]: - data_files = generic_extract_archive(file, output_dir=temp_folder) - else: - data_files = [file] - logging.info(f"Processing file: {file}.") - - # 1 GiB - size_limit = 2**30 - - for data in data_files: - if file_size(data) > size_limit and "dask" in sys.modules: - logging.info( - f"File exceeds {report_file_size(size_limit)} - Using dask.dataframes." - ) - client = ProgressBar - using_dask = True - else: - logging.info( - f"File below {report_file_size(size_limit)} - Using pandas.dataframes." 
- ) - client = contextlib.nullcontext - using_dask = False - - with client(**dask_kwargs) as c: - try: - convert_station(data, mode, using_dask=using_dask, client=c) - except FileNotFoundError: - errored_files.append(data) - - if os.listdir(temp_folder): - for temporary_file in Path(temp_folder).glob("*"): - if temporary_file in data_files: - temporary_file.unlink() - - def merge_stations( source_files: str | os.PathLike | None = None, output_folder: str | os.PathLike | None = None, diff --git a/miranda/preprocess/_metadata.py b/miranda/preprocess/_metadata.py new file mode 100644 index 00000000..a663c25a --- /dev/null +++ b/miranda/preprocess/_metadata.py @@ -0,0 +1,190 @@ +from __future__ import annotations + +import logging +from typing import Any + +from miranda import __version__ as __miranda_version__ +from miranda.preprocess._data_definitions import ( + find_project_variable_codes, + load_json_data_mappings, +) + +__all__ = [ + "eccc_variable_metadata", + "homogenized_column_definitions", + "obs_column_definitions", +] + + +def eccc_variable_metadata( + variable_code: str, project: str, gen: int | None = None +) -> (dict[str, int | float | str], dict, list[tuple[int, int]], int): + """ + + Parameters + ---------- + variable_code: str + project: {"eccc-homogenized", "eccc-obs", "eccc-obs-summary"} + gen: {1, 2, 3}, optional + + Returns + ------- + dict[str, int or str or float], dict, list[tuple[int, int]], int + """ + if project == "eccc-homogenized": + generation = {1: "First", 2: "Second", 3: "Third"}.get(gen) + if not generation: + raise NotImplementedError(f"Generation '{gen}' not supported") + else: + generation = None + + metadata = load_json_data_mappings(project) + code = find_project_variable_codes(variable_code, project) + + # Variable metadata + variable_meta = metadata["variables"].get(code) + variable_name = variable_meta.get("_variable_name") + if variable_name: + variable_meta["original_variable_name"] = variable_code + variable_meta = 
{variable_name: variable_meta}
+        del variable_meta[variable_name]["_variable_name"]
+    else:
+        variable_meta = {variable_code: variable_meta}
+
+    # Dataset metadata
+    header = metadata.get("Header")
+    # Static handling of version global attributes
+    miranda_version = header.get("_miranda_version")
+    if miranda_version:
+        if isinstance(miranda_version, bool):
+            header["miranda_version"] = __miranda_version__
+        elif isinstance(miranda_version, dict):
+            if project in miranda_version.keys():
+                header["miranda_version"] = __miranda_version__
+            else:
+                logging.warning(
+                    f"`_miranda_version` not set for project `{project}`. Not appending."
+                )
+    if "_miranda_version" in header:
+        del header["_miranda_version"]
+
+    to_delete = []
+    # Conditional handling of global attributes based on fields
+    for field in [f for f in header if f.startswith("_")]:
+        if isinstance(header[field], bool):
+            if header[field] and field == "_variable":
+                header[field[1:]] = variable_name
+        elif isinstance(header[field], dict) and generation:
+            attr_treatment = header[field]["generation"]
+            if field in ["_citation", "_product"]:
+                for attribute, value in attr_treatment.items():
+                    if attribute == generation:
+                        header[field[1:]] = value
+            else:
+                raise AttributeError(
+                    f"Attribute treatment configuration for field `{field}` is not properly configured. Verify JSON."
+ ) + to_delete.append(field) + + for field in to_delete: + del header[field] + + return variable_meta, header + + +def homogenized_column_definitions( + variable_code: str, +) -> tuple[dict, list[tuple[int, int]], dict[str, type[str | int | float] | Any], int]: + metadata = load_json_data_mappings("eccc-homogenized") + + variable = metadata["variables"][variable_code]["_variable_name"] + if variable.startswith("tas"): + column_dtypes = { + "No": str, + "StnId": str, + "Station name": str, + "Prov": str, + "FromYear": int, + "FromMonth": int, + "ToYear": int, + "ToMonth": int, + "%Miss": float, + "Lat(deg)": float, + "Long(deg)": float, + "Elev(m)": int, + "Joined": str, + "RCS": str, + } + column_spaces = [(0, 5), (5, 6), (6, 8), (8, 9)] + ii = 9 + # 31 days in a month + for i in range(1, 32): + column_spaces.append((ii, ii + 7)) + ii += 7 + column_spaces.append((ii, ii + 1)) + ii += 1 + header_row = 3 + + elif variable.startswith("pr"): + column_dtypes = { + "Prov": str, + "Station name": str, + "stnid": str, + "beg yr": int, + "beg mon": int, + "end yr": int, + "end mon": int, + "lat (deg)": float, + "long (deg)": float, + "elev (m)": int, + "stns joined": str, + } + column_spaces = [(0, 4), (4, 5), (5, 7), (7, 8)] + ii = 8 + # 31 days in a month + for i in range(1, 32): + column_spaces.append((ii, ii + 8)) + ii += 8 + column_spaces.append((ii, ii + 1)) + ii += 1 + header_row = 0 + + else: + raise KeyError + + column_names = { + col.lower().split("(")[0].replace("%", "pct_").strip().replace(" ", "_"): col + for col in list(column_dtypes.keys()) + } + + return column_names, column_spaces, column_dtypes, header_row + + +def obs_column_definitions( + time_frequency: str, +) -> tuple[list[str], list[int], list[type[str | int]], int]: + """Return the column names, widths, and data types for the fixed-width format.""" + if time_frequency.lower() in ["h", "hour", "hourly"]: + num_observations = 24 + column_names = ["code", "year", "month", "day", "code_var"] + 
column_widths = [7, 4, 2, 2, 3]
+        column_dtypes = [str, int, int, int, str]
+    elif time_frequency.lower() in ["d", "day", "daily"]:
+        num_observations = 31
+        column_names = ["code", "year", "month", "code_var"]
+        column_widths = [7, 4, 2, 3]
+        column_dtypes = [str, int, int, str]
+    else:
+        raise NotImplementedError("`mode` must be 'h'/'hourly' or 'd'/'daily'.")
+
+    header = 0
+
+    # Add the data columns
+    for i in range(1, num_observations + 1):
+        data_entry, flag_entry = f"D{i:0n}", f"F{i:0n}"
+        column_names.append(data_entry)
+        column_names.append(flag_entry)
+        column_widths.extend([6, 1] * num_observations)
+        column_dtypes.extend([str, str])
+
+    return column_names, column_widths, column_dtypes, header
diff --git a/miranda/preprocess/_treatments.py b/miranda/preprocess/_treatments.py
deleted file mode 100644
index 9e667440..00000000
--- a/miranda/preprocess/_treatments.py
+++ /dev/null
@@ -1,42 +0,0 @@
-from __future__ import annotations
-
-import logging
-from typing import Any
-
-from miranda import __version__ as __miranda_version__
-
-
-def basic_metadata_conversion(
-    project: str, metadata: dict
-) -> dict[str, dict[str, Any]]:
-    """Present basic metadata conversion.
-
-    Parameters
-    ----------
-    project : str
-        Dataset project name.
-    metadata : dict
-        Metadata definition dictionary for project and variable(s).
-
-    Returns
-    -------
-    xarray.Dataset
-    """
-    header = metadata["Header"]
-
-    # Static handling of version global attributes
-    miranda_version = header.get("_miranda_version")
-    if miranda_version:
-        if isinstance(miranda_version, bool):
-            header["miranda_version"] = __miranda_version__
-        elif isinstance(miranda_version, dict):
-            if project in miranda_version.keys():
-                header["miranda_version"] = __miranda_version__
-            else:
-                logging.warning(
-                    f"`_miranda_version` not set for project `{project}`. Not appending."
- ) - if "_miranda_version" in header: - del header["_miranda_version"] - - return metadata diff --git a/miranda/preprocess/configs/eccc-obs_attrs.json b/miranda/preprocess/configs/eccc-obs_attrs.json index 82d93e55..7eb3b7b6 100644 --- a/miranda/preprocess/configs/eccc-obs_attrs.json +++ b/miranda/preprocess/configs/eccc-obs_attrs.json @@ -1,6 +1,5 @@ { "Header": { - "_converter": true, "_frequency": true, "_miranda_version": true, "_missing_flags": "M", @@ -25,5 +24,734 @@ "title": "Environment and Climate Change Canada (ECCC) weather station observations", "type": "station-obs", "usage": "The original data is owned by the Government of Canada (Environment and Climate Change Canada), and falls under the licence agreement for use of Environment and Climate Change Canada data" + }, + "variables": { + "001": { + "_variable_name": "tasmax", + "least_significant_digit": "", + "long_name": "Daily Maximum Temperature", + "original_units": "0.1 °C", + "raw_units": "degC", + "scale_factor": 0.1, + "standard_name": "air_temperature_maximum", + "units": "K" + }, + "002": { + "_variable_name": "tasmin", + "least_significant_digit": "", + "long_name": "Daily Minimum Temperature", + "original_units": "0.1 °C", + "raw_units": "degC", + "scale_factor": 0.1, + "standard_name": "air_temperature_minimum", + "units": "K" + }, + "003": { + "_variable_name": "tas", + "least_significant_digit": "", + "long_name": "Daily Mean Temperature", + "original_units": "0.1 °C", + "raw_units": "degC", + "scale_factor": 0.1, + "standard_name": "air_temperature", + "units": "K" + }, + "010": { + "_variable_name": "prlptot", + "least_significant_digit": "", + "long_name": "Daily Total Rainfall", + "original_units": "0.1 mm day-1", + "raw_units": "mm", + "scale_factor": 0.1, + "standard_name": "liquid_precipitation_amount", + "units": "m" + }, + "011": { + "_variable_name": "prsntot", + "least_significant_digit": "", + "long_name": "Daily Total Snowfall", + "original_units": "0.1 cm day-1", + 
"raw_units": "cm", + "scale_factor": 0.1, + "standard_name": "solid_precipitation_amount", + "units": "m" + }, + "012": { + "_variable_name": "prcptot", + "least_significant_digit": "", + "long_name": "Daily Total Precipitation", + "original_units": "0.1 mm day-1", + "raw_units": "mm", + "scale_factor": 0.1, + "standard_name": "precipitation_amount", + "units": "m" + }, + "013": { + "_variable_name": "sndtot", + "least_significant_digit": "", + "long_name": "Snow on the Ground", + "original_units": "cm", + "raw_units": "cm", + "scale_factor": 1, + "standard_name": "surface_snow_thickness", + "units": "m" + }, + "014": { + "_variable_name": "thunder", + "least_significant_digit": "", + "long_name": "Thunderstorms", + "raw_units": "1", + "scale_factor": 1, + "standard_name": "thunderstorm_presence", + "units": "1" + }, + "015": { + "_variable_name": "freezing_rain_drizzle", + "least_significant_digit": "", + "long_name": "Freezing rain or drizzle", + "raw_units": "1", + "scale_factor": 1, + "standard_name": "freeze_rain_drizzle_presence", + "units": "1" + }, + "016": { + "_variable_name": "hail", + "least_significant_digit": "", + "long_name": "Hail", + "raw_units": "1", + "scale_factor": 1, + "standard_name": "hail_presence", + "units": "1" + }, + "017": { + "_variable_name": "fog_ice_fog", + "least_significant_digit": "", + "long_name": "Fog or Ice Fog", + "raw_units": "1", + "scale_factor": 1, + "standard_name": "fog_ice_fog_presence", + "units": "1" + }, + "018": { + "_variable_name": "smoke_haze", + "least_significant_digit": "", + "long_name": "Smoke or Haze", + "raw_units": "1", + "scale_factor": 1, + "standard_name": "smoke_haze_presence", + "units": "1" + }, + "019": { + "_variable_name": "blowing_dust_sand", + "least_significant_digit": "", + "long_name": "Blowing Dust or Sand", + "raw_units": "1", + "scale_factor": 1, + "standard_name": "blowing_dust_sand_presence", + "units": "1" + }, + "020": { + "_variable_name": "blow_snow", + 
"least_significant_digit": "", + "long_name": "Blowing snow", + "raw_units": "1", + "scale_factor": 1, + "standard_name": "blowing_snow_presence", + "units": "1" + }, + "021": { + "_variable_name": "wind_gt_28kt", + "least_significant_digit": "", + "long_name": "Wind speed >= 28 Knots", + "raw_units": "1", + "scale_factor": 1, + "standard_name": "wind_exceeding_28_knots", + "units": "1" + }, + "022": { + "_variable_name": "wind_gt_34kt", + "least_significant_digit": "", + "long_name": "Wind speed >= 34 Knots", + "raw_units": "1", + "scale_factor": 1, + "standard_name": "wind_exceeding_34_knots", + "units": "1" + }, + "023": { + "_variable_name": "gust_dir_16pts", + "least_significant_digit": "", + "long_name": "Direction of extreme gust (16 pts) to December 1976", + "original_units": "10's of degrees", + "raw_units": "deg", + "scale_factor": 10, + "standard_name": "gust_to_direction", + "units": "deg" + }, + "024": { + "_variable_name": "gust_speed", + "least_significant_digit": "", + "long_name": "Speed of extreme gust", + "original_units": "km/h", + "raw_units": "km h-1", + "scale_factor": 1, + "standard_name": "wind_speed_of_gust", + "units": "m s-1" + }, + "025": { + "_variable_name": "gust_hour", + "least_significant_digit": "", + "long_name": "UTC hour of extreme gust", + "raw_units": "h", + "scale_factor": 1, + "standard_name": "hour_of_extreme_gust", + "units": "h" + }, + "061": { + "_variable_name": "rf1_radiation", + "least_significant_digit": "", + "long_name": "RF1 global solar radiation", + "original_units": "0.001 MJ/m", + "raw_units": "W m-2 h-1", + "scale_factor": 277.77777777777777, + "standard_name": "solar_radiation_flux", + "units": "W m-2 h-1" + }, + "062": { + "_variable_name": "rf2_radiation", + "least_significant_digit": "", + "long_name": "RF2 sky (diffuse) radiation", + "original_units": "0.001 MJ/m", + "raw_units": "W m-2 h-1", + "scale_factor": 277.77777777777777, + "standard_name": "solar_radiation_flux", + "units": "W m-2 h-1" + }, + 
"063": { + "_variable_name": "rf3_radiation", + "least_significant_digit": "", + "long_name": "RF3 reflected solar radiation", + "original_units": "0.001 MJ/m", + "raw_units": "W m-2 h-1", + "scale_factor": 277.77777777777777, + "standard_name": "solar_radiation_flux", + "units": "W m-2 h-1" + }, + "064": { + "_variable_name": "rf4_radiation", + "least_significant_digit": "", + "long_name": "RF4 net all wave radiation", + "original_units": "0.001 MJ/m", + "raw_units": "W m-2 h-1", + "scale_factor": 277.77777777777777, + "standard_name": "solar_radiation_flux", + "units": "W m-2 h-1" + }, + "067": { + "_variable_name": "rf7_radiation", + "least_significant_digit": "", + "long_name": "RF7 daylight illumination", + "original_units": "0.01 Kilolux_hrs", + "raw_units": "lux h", + "scale_factor": 10, + "standard_name": "solar_radiation_flux", + "units": "lux h" + }, + "068": { + "_variable_name": "rf8_radiation", + "least_significant_digit": "", + "long_name": "RF8 direct solar radiation", + "original_units": "0.001 MJ/m", + "raw_units": "W m-2 h-1", + "scale_factor": 277.77777777777777, + "standard_name": "solar_radiation_flux", + "units": "W m-2 h-1" + }, + "069": { + "_variable_name": "wind_dir_45B", + "least_significant_digit": "", + "long_name": "Direction - 45B anemometer (8 pts)", + "original_units": "10's of degrees", + "raw_units": "deg", + "scale_factor": 1, + "standard_name": "wind_to_direction", + "units": "deg" + }, + "071": { + "_variable_name": "ceiling_hgt", + "least_significant_digit": "", + "long_name": "Ceiling height of lowest layer of clouds", + "original_units": "30's of meters", + "raw_units": "m", + "scale_factor": 30, + "standard_name": "ceiling_cloud_height", + "units": "m" + }, + "072": { + "_variable_name": "visibility", + "least_significant_digit": "", + "long_name": "Visibility", + "original_units": "0.1 km", + "raw_units": "km", + "scale_factor": 0.1, + "standard_name": "visibility_in_air", + "units": "m" + }, + "073": { + "_variable_name": 
"psl", + "least_significant_digit": "", + "long_name": "Sea Level Pressure", + "original_units": "0.01 kPa", + "raw_units": "Pa", + "scale_factor": 10, + "standard_name": "air_pressure_at_mean_sea_level", + "units": "Pa" + }, + "074": { + "_variable_name": "tds", + "least_significant_digit": "", + "long_name": "Dew Point Temperature", + "original_units": "0.1 °C", + "raw_units": "degC", + "scale_factor": 0.1, + "standard_name": "dew_point_temperature", + "units": "K" + }, + "075": { + "_variable_name": "wind_dir_u2a_16", + "least_significant_digit": "", + "long_name": "Wind Direction at 2 m (U2A Anemometer) (16 pts)", + "original_units": "10's of degrees", + "raw_units": "deg", + "scale_factor": 10, + "standard_name": "wind_direction_u2a", + "units": "deg" + }, + "076": { + "_variable_name": "wind_speed_u2a", + "least_significant_digit": "", + "long_name": "Wind Speed - U2A (16 pts) to December 1970", + "original_units": "km/h", + "raw_units": "km h-1", + "scale_factor": 1, + "standard_name": "wind_speed_u2a", + "units": "m s-1" + }, + "077": { + "_variable_name": "pressure", + "least_significant_digit": "", + "long_name": "Station Pressure", + "original_units": "0.01 kPa", + "raw_units": "Pa", + "scale_factor": 10, + "standard_name": "atmospheric_pressure", + "units": "Pa" + }, + "078": { + "_variable_name": "tas_dry", + "least_significant_digit": "", + "long_name": "Dry Bulb Temperature", + "original_units": "0.1 °C", + "raw_units": "degC", + "scale_factor": 0.1, + "standard_name": "dry_bulb_temperature", + "units": "K" + }, + "079": { + "_variable_name": "tas_wet", + "least_significant_digit": "", + "long_name": "Wet Bulb temperature", + "original_units": "0.1 °C", + "raw_units": "degC", + "scale_factor": 0.1, + "standard_name": "wet_bulb_temperature", + "units": "K" + }, + "080": { + "_variable_name": "hur", + "least_significant_digit": "", + "long_name": "Relative Humidity", + "original_units": "%", + "raw_units": "1", + "scale_factor": 1, + "standard_name": 
"relative_humidity", + "units": "1" + }, + "081": { + "_variable_name": "clo", + "least_significant_digit": "", + "long_name": "Total Cloud Opacity", + "original_units": "%", + "raw_units": "1", + "scale_factor": 10, + "standard_name": "cloud_albedo", + "units": "1" + }, + "082": { + "_variable_name": "clt", + "least_significant_digit": "", + "long_name": "Total Cloud Amount", + "original_units": "%", + "raw_units": "1", + "scale_factor": 10, + "standard_name": "cloud_area_fraction", + "units": "1" + }, + "089": { + "_variable_name": "freeze_rain", + "least_significant_digit": "", + "long_name": "Freezing Rain", + "raw_units": "1", + "scale_factor": 1, + "standard_name": "freezing_rain", + "units": "1" + }, + "094": { + "_variable_name": "ice_pellets", + "least_significant_digit": "", + "long_name": "Ice Pellets", + "raw_units": "1", + "scale_factor": 1, + "standard_name": "ice_pellet_presence", + "units": "1" + }, + "107": { + "_variable_name": "1low_cloud_opac", + "least_significant_digit": "", + "long_name": "Lowest cloud layer opacity", + "original_units": "Tenths", + "raw_units": "1", + "scale_factor": 10, + "standard_name": "low_type_cloud_opacity_fraction", + "units": "1" + }, + "108": { + "_variable_name": "1low_cloud_frac", + "least_significant_digit": "", + "long_name": "Lowest cloud layer amount or condition", + "original_units": "Tenths", + "raw_units": "1", + "scale_factor": 10, + "standard_name": "low_type_cloud_area_fraction", + "units": "1" + }, + "109": { + "_variable_name": "1low_cloud_type", + "least_significant_digit": "", + "long_name": "Lowest cloud layer type", + "raw_units": "1", + "scale_factor": 1, + "standard_name": "low_type_cloud_type", + "units": "1" + }, + "110": { + "_variable_name": "1low_cloud_hgt", + "least_significant_digit": "", + "long_name": "Lowest cloud layer height", + "original_units": "30's of meters", + "raw_units": "m", + "scale_factor": 30, + "standard_name": "low_type_cloud_height", + "units": "m" + }, + "111": { + 
"_variable_name": "2low_cloud_opac",
+      "least_significant_digit": "",
+      "long_name": "Second lowest cloud layer opacity",
+      "original_units": "Tenths",
+      "raw_units": "1",
+      "scale_factor": 10,
+      "standard_name": "low_type_cloud_opacity_fraction",
+      "units": "1"
+    },
+    "112": {
+      "_variable_name": "2low_cloud_frac",
+      "least_significant_digit": "",
+      "long_name": "Second lowest cloud layer amount or condition",
+      "original_units": "Tenths",
+      "raw_units": "1",
+      "scale_factor": 10,
+      "standard_name": "low_type_cloud_area_fraction",
+      "units": "1"
+    },
+    "113": {
+      "_variable_name": "2low_cloud_type",
+      "least_significant_digit": "",
+      "long_name": "Second lowest cloud layer type",
+      "original_units": "",
+      "raw_units": "1",
+      "scale_factor": 1,
+      "standard_name": "low_type_cloud_type",
+      "units": "1"
+    },
+    "114": {
+      "_variable_name": "2low_cloud_hgt",
+      "least_significant_digit": "",
+      "long_name": "Second lowest cloud layer height",
+      "original_units": "30's of meters",
+      "raw_units": "m",
+      "scale_factor": 30,
+      "standard_name": "low_type_cloud_height",
+      "units": "m"
+    },
+    "115": {
+      "_variable_name": "3low_cloud_opac",
+      "least_significant_digit": "",
+      "long_name": "Third lowest cloud layer opacity",
+      "original_units": "Tenths",
+      "raw_units": "1",
+      "scale_factor": 10,
+      "standard_name": "low_type_cloud_opacity_fraction",
+      "units": "1"
+    },
+    "116": {
+      "_variable_name": "3low_cloud_frac",
+      "least_significant_digit": "",
+      "long_name": "Third lowest cloud layer amount or condition",
+      "original_units": "Tenths",
+      "raw_units": "1",
+      "scale_factor": 10,
+      "standard_name": "low_type_cloud_area_fraction",
+      "units": "1"
+    },
+    "117": {
+      "_variable_name": "3low_cloud_type",
+      "least_significant_digit": "",
+      "long_name": "Third lowest cloud layer type",
+      "original_units": "",
+      "raw_units": "1",
+      "scale_factor": 1,
+      "standard_name": "low_type_cloud_type",
+      "units": "1"
+    },
+    "118": {
+      "_variable_name": "3low_cloud_hgt",
+      "least_significant_digit": 
"", + "long_name": "Third lowest cloud layer height", + "original_units": "30's of meters", + "raw_units": "m", + "scale_factor": 30, + "standard_name": "low_type_cloud_height", + "units": "m" + }, + "123": { + "_variable_name": "rainfall", + "least_significant_digit": "", + "long_name": "Total Rainfall", + "original_units": "0.1 mm", + "raw_units": "mm h-1", + "scale_factor": 0.1, + "standard_name": "rainfall_flux", + "units": "kg m2 s-1" + }, + "133": { + "_variable_name": "sun", + "least_significant_digit": "", + "long_name": "Sunshine", + "original_units": "0.1 hrs", + "raw_units": "h", + "scale_factor": 0.1, + "standard_name": "duration_of_sunshine", + "units": "s" + }, + "156": { + "_variable_name": "wind_dir_u2a_36", + "least_significant_digit": "", + "long_name": "Wind Direction - U2A (36 pts) from January 1971", + "original_units": "10's of degrees", + "raw_units": "deg", + "scale_factor": 10, + "standard_name": "wind_direction_u2a", + "units": "deg" + }, + "262": { + "_variable_name": "prtot", + "least_significant_digit": "", + "long_name": "Total Precipitation (minutes 00-60)", + "original_units": "0.1 mm", + "raw_units": "mm", + "scale_factor": 0.1, + "standard_name": "precipitation_amount", + "units": "kg m-2" + }, + "263": { + "_variable_name": "prtot_q1", + "least_significant_digit": "", + "long_name": "Total Precipitation (minutes 00-15)", + "original_units": "0.1 mm", + "raw_units": "mm", + "scale_factor": 0.1, + "standard_name": "precipitation_amount", + "units": "kg m-2" + }, + "264": { + "_variable_name": "prtot_q2", + "least_significant_digit": "", + "long_name": "Total Precipitation (minutes 15-30)", + "original_units": "0.1 mm", + "raw_units": "mm", + "scale_factor": 0.1, + "standard_name": "precipitation_amount", + "units": "kg m-2" + }, + "265": { + "_variable_name": "prtot_q3", + "least_significant_digit": "", + "long_name": "Total Precipitation (minutes 30-45)", + "original_units": "0.1 mm", + "raw_units": "mm", + "scale_factor": 0.1, + 
"standard_name": "precipitation_amount", + "units": "kg m-2" + }, + "266": { + "_variable_name": "prtot_q4", + "least_significant_digit": "", + "long_name": "Total Precipitation (minutes 45-60)", + "original_units": "0.1 mm", + "raw_units": "mm", + "scale_factor": 0.1, + "standard_name": "precipitation_amount", + "units": "kg m-2" + }, + "267": { + "_variable_name": "precipitation_weight_q1", + "least_significant_digit": "", + "long_name": "Precipitation Gauge Weight per Unit Area (at minute 15)", + "original_units": "0.1 kg/m²", + "raw_units": "kg m-2", + "scale_factor": 0.1, + "standard_name": "precipitation_amount", + "units": "kg m-2" + }, + "268": { + "_variable_name": "precipitation_weight_q2", + "least_significant_digit": "", + "long_name": "Precipitation Gauge Weight per Unit Area (at minute 30)", + "original_units": "0.1 kg/m²", + "raw_units": "kg m-2", + "scale_factor": 0.1, + "standard_name": "precipitation_amount", + "units": "kg m-2" + }, + "269": { + "_variable_name": "precipitation_weight_q3", + "least_significant_digit": "", + "long_name": "Precipitation Gauge Weight per Unit Area (at minute 45)", + "original_units": "0.1 kg/m²", + "raw_units": "kg m-2", + "scale_factor": 0.1, + "standard_name": "precipitation_amount", + "units": "kg m-2" + }, + "270": { + "_variable_name": "precipitation_weight_q4", + "least_significant_digit": "", + "long_name": "Precipitation Gauge Weight per Unit Area (at minute 60)", + "original_units": "0.1 kg/m²", + "raw_units": "kg m-2", + "scale_factor": 0.1, + "standard_name": "precipitation_amount", + "units": "kg m-2" + }, + "271": { + "_variable_name": "wind_speed_q1", + "least_significant_digit": "", + "long_name": "Wind Speed at 2 m (minutes 00-15)", + "nc_units": "m s-1", + "original_units": "0.1 km/h", + "raw_units": "km h-1", + "scale_factor": 0.1, + "standard_name": "wind_speed" + }, + "272": { + "_variable_name": "wind_speed_q2", + "least_significant_digit": "", + "long_name": "Wind Speed at 2 m (minutes 15-30)", 
+ "nc_units": "m s-1", + "original_units": "0.1 km/h", + "raw_units": "km h-1", + "scale_factor": 0.1, + "standard_name": "wind_speed" + }, + "273": { + "_variable_name": "wind_speed_q3", + "least_significant_digit": "", + "long_name": "Wind Speed at 2 m (minutes 30-45)", + "nc_units": "m s-1", + "original_units": "0.1 km/h", + "raw_units": "km h-1", + "scale_factor": 0.1, + "standard_name": "wind_speed" + }, + "274": { + "_variable_name": "wind_speed_q4", + "least_significant_digit": "", + "long_name": "Wind Speed at 2 m (minutes 45-60)", + "nc_units": "m s-1", + "original_units": "0.1 km/h", + "raw_units": "km h-1", + "scale_factor": 0.1, + "standard_name": "wind_speed" + }, + "275": { + "_variable_name": "snd_q4", + "least_significant_digit": "", + "long_name": "Snow Depth (at minute 60)", + "original_units": "cm", + "raw_units": "cm", + "scale_factor": 1, + "standard_name": "surface_snow_thickness", + "units": "m" + }, + "276": { + "_variable_name": "snd_q1", + "least_significant_digit": "", + "long_name": "Snow Depth (at minute 15)", + "original_units": "cm", + "raw_units": "cm", + "scale_factor": 1, + "standard_name": "surface_snow_thickness", + "units": "m" + }, + "277": { + "_variable_name": "snd_q2", + "least_significant_digit": "", + "long_name": "Snow Depth (at minute 30)", + "original_units": "cm", + "raw_units": "cm", + "scale_factor": 1, + "standard_name": "surface_snow_thickness", + "units": "m" + }, + "278": { + "_variable_name": "snd_q3", + "least_significant_digit": "", + "long_name": "Snow Depth (at minute 45)", + "original_units": "cm", + "raw_units": "cm", + "scale_factor": 1, + "standard_name": "surface_snow_thickness", + "units": "m" + }, + "279": { + "_variable_name": "wind_dir", + "least_significant_digit": "", + "long_name": "Wind Direction at 2 m (minutes 50-60)", + "nc_units": "deg", + "original_units": "Degrees", + "raw_units": "deg", + "scale_factor": 1, + "standard_name": "wind_direction" + }, + "280": { + "_variable_name": 
"wind_speed", + "least_significant_digit": "", + "long_name": "Wind Speed at 2 m (minutes 50-60)", + "original_units": "0.1 km/h", + "raw_units": "km h-1", + "scale_factor": 0.1, + "standard_name": "wind_speed", + "units": "m s-1" + } } } diff --git a/miranda/preprocess/eccc.py b/miranda/preprocess/eccc.py index 688ed196..be087085 100644 --- a/miranda/preprocess/eccc.py +++ b/miranda/preprocess/eccc.py @@ -2,122 +2,163 @@ from __future__ import annotations +import contextlib import json import logging.config import multiprocessing as mp import os +import tempfile from functools import partial from pathlib import Path +from typing import Callable + +from dask.distributed import ProgressBar -from miranda.eccc._utils import cf_station_metadata -from miranda.preprocess._eccc_obs import _convert_station_file from miranda.scripting import LOGGING_CONFIG +from miranda.storage import file_size, report_file_size +from miranda.utils import generic_extract_archive logging.config.dictConfig(LOGGING_CONFIG) _data_folder = Path(__file__).parent / "configs" -eccc_observation_variables = dict() -eccc_observation_variables["flat"] = [ - v for v in json.load(open(_data_folder / "eccc-obs_attrs.json"))["variables"].keys() -] -eccc_observation_variables["summary"] = [ - attrs["_cf_variable_name"] - for attrs in json.load(open(_data_folder / "eccc-obs-summary_attrs.json"))[ - "variables" - ].values() -] -eccc_observation_variables["homogenized"] = [ - attrs["_cf_variable_name"] - for attrs in json.load(open(_data_folder / "eccc-homogenized_attrs.json"))[ - "variables" - ].values() -] - - -def convert_flat_files( - source_files: str | os.PathLike, - output_folder: str | os.PathLike | list[str | int], - variables: str | int | list[str | int], - mode: str = "hourly", - n_workers: int = 4, -) -> None: - """ + +def _run_func_on_archive_with_optional_dask( + file: Path, + function: Callable, + errored_files: list[Path], + **dask_kwargs, +): + r"""Run a function on a file archive, extracting 
it if necessary. + + Notes + ----- + If the file is larger than 1 GiB or dask_kwargs are passed, dask.dataframes will be used. + Partial function requires the function to accept the following parameters: + - file: Path + - using_dask: bool + - client: dask.distributed.Client Parameters ---------- - source_files: str or Path - output_folder: str or Path - variables: str or List[str] - mode: {"hourly", "daily"} - n_workers: int + file: Path + File archive to process. + function: Callable + Function to run on the file. + errored_files: list[Path] + List of files that errored during processing. + \*\*dask_kwargs + Keyword arguments to pass to dask.distributed.Client. Returns ------- - None + """ - if mode.lower() in ["h", "hour", "hourly"]: - num_observations = 24 - column_names = ["code", "year", "month", "day", "code_var"] - column_dtypes = [str, float, float, float, str] - elif mode.lower() in ["d", "day", "daily"]: - num_observations = 31 - column_names = ["code", "year", "month", "code_var"] - column_dtypes = [str, float, float, str] - else: - raise NotImplementedError("`mode` must be 'h'/'hourly or 'd'/'daily'.") - - # Preparing the data column headers - for i in range(1, num_observations + 1): - data_entry, flag_entry = f"D{i:0n}", f"F{i:0n}" - column_names.append(data_entry) - column_names.append(flag_entry) - column_dtypes.extend([str, str]) - - if isinstance(variables, (str, int)): - variables = [variables] - - for variable_code in variables: - variable_code = str(variable_code).zfill(3) - metadata = cf_station_metadata(variable_code) - nc_name = metadata["nc_name"] - - rep_nc = Path(output_folder).joinpath(nc_name) - rep_nc.mkdir(parents=True, exist_ok=True) - - # Loop on the files - logging.info( - f"Collecting files for variable '{metadata['standard_name']}' " - f"(filenames containing '{metadata['_table_name']}')." 
- ) - list_files = list() - if isinstance(source_files, list) or Path(source_files).is_file(): - list_files.append(source_files) + + with tempfile.TemporaryDirectory() as temp_folder: + if file.suffix in [".gz", ".tar", ".zip", ".7z"]: + data_files = generic_extract_archive(file, output_dir=temp_folder) else: - glob_patterns = [g for g in metadata["_table_name"]] - for pattern in glob_patterns: - list_files.extend( - [f for f in Path(source_files).rglob(f"{pattern}*") if f.is_file()] + data_files = [file] + logging.info(f"Processing file: {file}.") + + # 1 GiB + size_limit = 2**30 + + for data in data_files: + size = file_size(data) + if size > size_limit or dask_kwargs: + if size > size_limit: + logging.info( + f"File exceeds {report_file_size(size_limit)} - Using dask.dataframes." + ) + client = ProgressBar + using_dask = True + else: + logging.info( + f"File below {report_file_size(size_limit)} - Using pandas.dataframes." ) - manager = mp.Manager() - errored_files = manager.list() - converter_func = partial( - _convert_station_file, - output_path=rep_nc, - errored_files=errored_files, - mode=mode, - variable_code=variable_code, - column_names=column_names, - column_dtypes=column_dtypes, - **metadata, - ) - with mp.Pool(processes=n_workers) as pool: - pool.map(converter_func, list_files) - pool.close() - pool.join() - - if errored_files: - logging.warning( - "Some files failed to be properly parsed:\n", ", ".join(errored_files) - ) + client = contextlib.nullcontext + using_dask = False + + with client(**dask_kwargs) as c: + try: + function(data, using_dask=using_dask, client=c) + except FileNotFoundError: + errored_files.append(data) + + if os.listdir(temp_folder): + for temporary_file in Path(temp_folder).glob("*"): + if temporary_file in data_files: + temporary_file.unlink() + + +# def convert_flat_files( +# source_files: str | os.PathLike, +# output_folder: str | os.PathLike | list[str | int], +# variables: str | int | list[str | int], +# project: str = 
"eccc-obs", +# mode: str = "hourly", +# **dask_kwargs, +# ) -> None: +# """ +# +# Parameters +# ---------- +# source_files: str or Path +# output_folder: str or Path +# variables: str or List[str] +# project: {"eccc-obs", "eccc-obs-summary", "eccc-homogenized"} +# mode: {"hourly", "daily"} +# +# Returns +# ------- +# None +# """ +# +# if isinstance(variables, (str, int)): +# variables = [variables] +# +# for variable_code in variables: +# variable_code = str(variable_code).zfill(3) +# metadata = load_json_data_mappings("eccc-obs").get(variable_code) +# +# +# +# # Loop on the files +# logging.info( +# f"Collecting files for variable '{metadata['standard_name']}' " +# f"(filenames containing '{metadata['_table_name']}')." +# ) +# list_files = list() +# if isinstance(source_files, list) or Path(source_files).is_file(): +# list_files.append(source_files) +# else: +# glob_patterns = [g for g in metadata["_table_name"]] +# for pattern in glob_patterns: +# list_files.extend( +# [f for f in Path(source_files).rglob(f"{pattern}*") if f.is_file()] +# ) +# +# +# +# +# manager = mp.Manager() +# errored_files = manager.list() +# converter_func = partial( +# _convert_station_file, +# output_path=rep_nc, +# errored_files=errored_files, +# mode=mode, +# variable_code=variable_code, +# column_names=column_names, +# column_dtypes=column_dtypes, +# **metadata, +# ) +# with mp.Pool(processes=n_workers) as pool: +# pool.map(converter_func, list_files) +# pool.close() +# pool.join() +# +# diff --git a/miranda/vocabularies/eccc.py b/miranda/vocabularies/eccc.py index f86ebb53..bd739fed 100644 --- a/miranda/vocabularies/eccc.py +++ b/miranda/vocabularies/eccc.py @@ -3,86 +3,91 @@ # For more information see the ECCC Technical Documentation __all__ = [ - "DLY", - "DLY02", - "DLY03", - "DLY04", - "DLY12", - "DLY13", - "DLY21", - "DLY44", - "HLY", - "HLY01", - "HLY01_RCS", - "HLY03", - "HLY10", - "HLY15", - "HLY21", - "MLY", - "MLY04", + "obs_groupings", + "obs_vocabularies", ] 
+obs_vocabularies = dict() + # Hourly Data -HLY01 = [] -HLY01.extend(list(range(71, 123))) # Hourly variables -HLY01.extend([209, 210]) # Wind character and gust speed -HLY01.extend(list(range(219, 231))) # Cloud layers -HLY01.append(244) # Precipitation type -HLY01.append(260) # Freezing fog +obs_vocabularies["HLY01"] = [] +obs_vocabularies["HLY01"].extend(list(range(71, 123))) # Hourly variables +obs_vocabularies["HLY01"].extend([209, 210]) # Wind character and gust speed +obs_vocabularies["HLY01"].extend(list(range(219, 231))) # Cloud layers +obs_vocabularies["HLY01"].append(244) # Precipitation type +obs_vocabularies["HLY01"].append(260) # Freezing fog -HLY01_RCS = HLY01.copy() -HLY01_RCS.extend( +obs_vocabularies["HLY01_RCS"] = obs_vocabularies["HLY01"].copy() +obs_vocabularies["HLY01_RCS"].extend( list(range(262, 281)) ) # Reference Climate Surface (RCS) weather stations -HLY03 = [] -HLY03.extend(list(range(123, 133))) # Hourly rainfall -HLY03.extend([160, 161]) - -HLY10 = [] -HLY10.extend(list(range(61, 69))) # Sunshine -HLY10.extend([133, 169, 170, 171, 172]) # Solar radiation +obs_vocabularies["HLY03"] = [] +obs_vocabularies["HLY03"].extend(list(range(123, 133))) # Hourly rainfall +obs_vocabularies["HLY03"].extend([160, 161]) -HLY15 = [69, 70, 76, 156] # Wind +obs_vocabularies["HLY10"] = [] +obs_vocabularies["HLY10"].extend(list(range(61, 69))) # Sunshine +obs_vocabularies["HLY10"].extend([133, 169, 170, 171, 172]) # Solar radiation -HLY21 = [123] # Fischer/Porter precipitation +obs_vocabularies["HLY15"] = [69, 70, 76, 156] # Wind -HLY = list(set(HLY01 + HLY01_RCS + HLY03 + HLY10 + HLY15 + HLY21)) +obs_vocabularies["HLY21"] = [123] # Fischer/Porter precipitation # Daily Data -DLY02 = [] -DLY02.extend(list(range(1, 26))) # Daily variables -DLY02.append(157) # Direction of extreme gust -DLY02.append(179) # Daily bright sunshine +obs_vocabularies["DLY02"] = [] +obs_vocabularies["DLY02"].extend(list(range(1, 26))) # Daily variables 
+obs_vocabularies["DLY02"].append(157) # Direction of extreme gust +obs_vocabularies["DLY02"].append(179) # Daily bright sunshine -DLY03 = [] -DLY03.extend(list(range(124, 133))) -DLY03.extend([160, 161]) +obs_vocabularies["DLY03"] = [] +obs_vocabularies["DLY03"].extend(list(range(124, 133))) +obs_vocabularies["DLY03"].extend([160, 161]) -DLY04 = DLY02.copy() +obs_vocabularies["DLY04"] = obs_vocabularies["DLY02"].copy() -DLY12 = [] -DLY12.extend(list(range(134, 151))) # Soil temperatures +obs_vocabularies["DLY12"] = [] +obs_vocabularies["DLY12"].extend(list(range(134, 151))) # Soil temperatures -DLY13 = list(range(151, 156)) # Pan evaporation +obs_vocabularies["DLY13"] = list(range(151, 156)) # Pan evaporation -DLY21 = [12] # Precipitation -DLY21.extend(list(range(127, 133))) # Precipitation over time -DLY21.append(161) # Most precipitation in 25 hours +obs_vocabularies["DLY21"] = [12] # Precipitation +obs_vocabularies["DLY21"].extend(list(range(127, 133))) # Precipitation over time +obs_vocabularies["DLY21"].append(161) # Most precipitation in 25 hours -DLY44 = [] -DLY44.extend([1, 2, 3]) # Temperature -DLY44.extend(list(range(10, 18))) # Precipitation - -DLY = list(set(DLY02 + DLY03 + DLY04 + DLY12 + DLY13 + DLY21 + DLY44)) +obs_vocabularies["DLY44"] = [] +obs_vocabularies["DLY44"].extend([1, 2, 3]) # Temperature +obs_vocabularies["DLY44"].extend(list(range(10, 18))) # Precipitation # Monthly data -MLY04 = [] -MLY04.extend(list(range(26, 39))) # Days with variables -MLY04.extend(list(range(39, 61))) # Means of variables -MLY04.append(158) # Direction of extreme gust - -MLY = list(set(MLY04)) +obs_vocabularies["MLY04"] = [] +obs_vocabularies["MLY04"].extend(list(range(26, 39))) # Days with variables +obs_vocabularies["MLY04"].extend(list(range(39, 61))) # Means of variables +obs_vocabularies["MLY04"].append(158) # Direction of extreme gust + +# Groupings + +obs_groupings = dict() +obs_groupings["HLY"] = list( + obs_vocabularies["HLY01"] + + 
obs_vocabularies["HLY01_RCS"] + + obs_vocabularies["HLY03"] + + obs_vocabularies["HLY10"] + + obs_vocabularies["HLY15"] + + obs_vocabularies["HLY21"] +) +obs_groupings["DLY"] = list( + set( + obs_vocabularies["DLY02"] + + obs_vocabularies["DLY03"] + + obs_vocabularies["DLY04"] + + obs_vocabularies["DLY12"] + + obs_vocabularies["DLY13"] + + obs_vocabularies["DLY21"] + + obs_vocabularies["DLY44"] + ) +) +obs_groupings["MLY"] = list(set(obs_vocabularies["MLY04"])) From 010d3ea7d4180fffba4113d3a203bd46537345b5 Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Thu, 10 Aug 2023 12:36:13 -0400 Subject: [PATCH 15/33] reduce amount of unit conversions --- miranda/preprocess/_eccc_obs.py | 26 +- .../preprocess/configs/eccc-obs_attrs.json | 243 ++++-------------- miranda/preprocess/eccc.py | 7 +- 3 files changed, 60 insertions(+), 216 deletions(-) diff --git a/miranda/preprocess/_eccc_obs.py b/miranda/preprocess/_eccc_obs.py index 6ff64d8f..a3c55dfc 100644 --- a/miranda/preprocess/_eccc_obs.py +++ b/miranda/preprocess/_eccc_obs.py @@ -40,6 +40,15 @@ TABLE_DATE = dt.now().strftime("%d %B %Y") +def _remove_duplicates(ds): + if any(ds.get_index("time").duplicated()): + logging.info( + f"Found {ds.get_index('time').duplicated().sum()} duplicated time coordinates " + f"for station {ds.station_id.values}. Assuming first value." + ) + return ds.sel(time=~ds.get_index("time").duplicated()) + + def convert_observation( data_source: str | Path | list[str | Path], output_dir: str | Path, @@ -94,15 +103,6 @@ def convert_observation( ) -def _remove_duplicates(ds): - if any(ds.get_index("time").duplicated()): - logging.info( - f"Found {ds.get_index('time').duplicated().sum()} duplicated time coordinates " - f"for station {ds.station_id.values}. Assuming first value." 
- ) - return ds.sel(time=~ds.get_index("time").duplicated()) - - def convert_station( data: str | os.PathLike, variable: str, @@ -665,7 +665,7 @@ def _combine_years( if _verbose: logging.info(f"Opening: {', '.join([p.name for p in nc_files])}") - ds = xr.open_mfdataset(nc_files, combine="nested", concat_dim={"time"}) + ds = xr.open_mfdataset(nc_files, combine="nested", concat_dim="time") outfile = Path(out_folder).joinpath( f'{nc_files[0].name.split(f"_{varia}_")[0]}_{varia}_' f"{ds.time.dt.year.min().values}-{ds.time.dt.year.max().values}.nc" @@ -702,9 +702,9 @@ def _combine_years( "elevation", ] for vv in meta.data_vars: - if vv.lower() not in keep_coords: + if str(vv).lower() not in keep_coords: continue - ds = ds.assign_coords({vv.lower(): meta[vv]}) + ds = ds.assign_coords({str(vv).lower(): meta[vv]}) for vv in ds.data_vars: if ds[vv].dtype == "O": @@ -730,7 +730,6 @@ def merge_converted_variables( source_files: str | os.PathLike, output_folder: str | os.PathLike, variables: str | int | list[str | int] | None = None, - station_metadata: str | os.PathLike | None = None, overwrite: bool = False, n_workers: int = 1, ) -> None: @@ -741,7 +740,6 @@ def merge_converted_variables( source_files : str, Path output_folder : str, Path variables : str or int or list of str or int, optional - station_metadata : str or Path, optional overwrite : bool n_workers : int diff --git a/miranda/preprocess/configs/eccc-obs_attrs.json b/miranda/preprocess/configs/eccc-obs_attrs.json index 7eb3b7b6..300b559a 100644 --- a/miranda/preprocess/configs/eccc-obs_attrs.json +++ b/miranda/preprocess/configs/eccc-obs_attrs.json @@ -28,730 +28,577 @@ "variables": { "001": { "_variable_name": "tasmax", - "least_significant_digit": "", "long_name": "Daily Maximum Temperature", "original_units": "0.1 °C", - "raw_units": "degC", "scale_factor": 0.1, "standard_name": "air_temperature_maximum", - "units": "K" + "units": "degC" }, "002": { "_variable_name": "tasmin", - "least_significant_digit": "", 
"long_name": "Daily Minimum Temperature", "original_units": "0.1 °C", - "raw_units": "degC", "scale_factor": 0.1, "standard_name": "air_temperature_minimum", - "units": "K" + "units": "degC" }, "003": { "_variable_name": "tas", - "least_significant_digit": "", "long_name": "Daily Mean Temperature", "original_units": "0.1 °C", - "raw_units": "degC", "scale_factor": 0.1, "standard_name": "air_temperature", - "units": "K" + "units": "degC" }, "010": { "_variable_name": "prlptot", - "least_significant_digit": "", "long_name": "Daily Total Rainfall", "original_units": "0.1 mm day-1", - "raw_units": "mm", "scale_factor": 0.1, "standard_name": "liquid_precipitation_amount", - "units": "m" + "units": "mmn day-1" }, "011": { "_variable_name": "prsntot", - "least_significant_digit": "", "long_name": "Daily Total Snowfall", "original_units": "0.1 cm day-1", - "raw_units": "cm", "scale_factor": 0.1, "standard_name": "solid_precipitation_amount", - "units": "m" + "units": "cm day-1" }, "012": { "_variable_name": "prcptot", - "least_significant_digit": "", "long_name": "Daily Total Precipitation", "original_units": "0.1 mm day-1", - "raw_units": "mm", "scale_factor": 0.1, "standard_name": "precipitation_amount", - "units": "m" + "units": "mm day-1" }, "013": { "_variable_name": "sndtot", - "least_significant_digit": "", "long_name": "Snow on the Ground", "original_units": "cm", - "raw_units": "cm", "scale_factor": 1, "standard_name": "surface_snow_thickness", - "units": "m" + "units": "cm" }, "014": { "_variable_name": "thunder", - "least_significant_digit": "", "long_name": "Thunderstorms", - "raw_units": "1", "scale_factor": 1, "standard_name": "thunderstorm_presence", "units": "1" }, "015": { "_variable_name": "freezing_rain_drizzle", - "least_significant_digit": "", "long_name": "Freezing rain or drizzle", - "raw_units": "1", "scale_factor": 1, "standard_name": "freeze_rain_drizzle_presence", "units": "1" }, "016": { "_variable_name": "hail", - "least_significant_digit": "", 
"long_name": "Hail", - "raw_units": "1", "scale_factor": 1, "standard_name": "hail_presence", "units": "1" }, "017": { "_variable_name": "fog_ice_fog", - "least_significant_digit": "", "long_name": "Fog or Ice Fog", - "raw_units": "1", "scale_factor": 1, "standard_name": "fog_ice_fog_presence", "units": "1" }, "018": { "_variable_name": "smoke_haze", - "least_significant_digit": "", "long_name": "Smoke or Haze", - "raw_units": "1", "scale_factor": 1, "standard_name": "smoke_haze_presence", "units": "1" }, "019": { "_variable_name": "blowing_dust_sand", - "least_significant_digit": "", "long_name": "Blowing Dust or Sand", - "raw_units": "1", "scale_factor": 1, "standard_name": "blowing_dust_sand_presence", "units": "1" }, "020": { "_variable_name": "blow_snow", - "least_significant_digit": "", "long_name": "Blowing snow", - "raw_units": "1", "scale_factor": 1, "standard_name": "blowing_snow_presence", "units": "1" }, "021": { "_variable_name": "wind_gt_28kt", - "least_significant_digit": "", "long_name": "Wind speed >= 28 Knots", - "raw_units": "1", "scale_factor": 1, "standard_name": "wind_exceeding_28_knots", "units": "1" }, "022": { "_variable_name": "wind_gt_34kt", - "least_significant_digit": "", "long_name": "Wind speed >= 34 Knots", - "raw_units": "1", "scale_factor": 1, "standard_name": "wind_exceeding_34_knots", "units": "1" }, "023": { "_variable_name": "gust_dir_16pts", - "least_significant_digit": "", "long_name": "Direction of extreme gust (16 pts) to December 1976", "original_units": "10's of degrees", - "raw_units": "deg", "scale_factor": 10, "standard_name": "gust_to_direction", "units": "deg" }, "024": { "_variable_name": "gust_speed", - "least_significant_digit": "", "long_name": "Speed of extreme gust", "original_units": "km/h", - "raw_units": "km h-1", - "scale_factor": 1, "standard_name": "wind_speed_of_gust", - "units": "m s-1" + "units": "km h-1" }, "025": { "_variable_name": "gust_hour", - "least_significant_digit": "", "long_name": "UTC hour 
of extreme gust", - "raw_units": "h", - "scale_factor": 1, "standard_name": "hour_of_extreme_gust", "units": "h" }, "061": { "_variable_name": "rf1_radiation", - "least_significant_digit": "", "long_name": "RF1 global solar radiation", "original_units": "0.001 MJ/m", - "raw_units": "W m-2 h-1", - "scale_factor": 277.77777777777777, + "scale_factor": 0.001, "standard_name": "solar_radiation_flux", - "units": "W m-2 h-1" + "units": "MJ m-1" }, "062": { "_variable_name": "rf2_radiation", - "least_significant_digit": "", "long_name": "RF2 sky (diffuse) radiation", "original_units": "0.001 MJ/m", - "raw_units": "W m-2 h-1", "scale_factor": 277.77777777777777, "standard_name": "solar_radiation_flux", - "units": "W m-2 h-1" + "units": "MJ m-1" }, "063": { "_variable_name": "rf3_radiation", - "least_significant_digit": "", "long_name": "RF3 reflected solar radiation", "original_units": "0.001 MJ/m", - "raw_units": "W m-2 h-1", "scale_factor": 277.77777777777777, "standard_name": "solar_radiation_flux", - "units": "W m-2 h-1" + "units": "MJ m-1" }, "064": { "_variable_name": "rf4_radiation", - "least_significant_digit": "", "long_name": "RF4 net all wave radiation", "original_units": "0.001 MJ/m", - "raw_units": "W m-2 h-1", "scale_factor": 277.77777777777777, "standard_name": "solar_radiation_flux", - "units": "W m-2 h-1" + "units": "MJ m-1" }, "067": { "_variable_name": "rf7_radiation", - "least_significant_digit": "", "long_name": "RF7 daylight illumination", "original_units": "0.01 Kilolux_hrs", - "raw_units": "lux h", - "scale_factor": 10, + "scale_factor": 0.01, "standard_name": "solar_radiation_flux", - "units": "lux h" + "units": "klux h" }, "068": { "_variable_name": "rf8_radiation", - "least_significant_digit": "", "long_name": "RF8 direct solar radiation", "original_units": "0.001 MJ/m", - "raw_units": "W m-2 h-1", "scale_factor": 277.77777777777777, "standard_name": "solar_radiation_flux", "units": "W m-2 h-1" }, "069": { "_variable_name": "wind_dir_45B", - 
"least_significant_digit": "", "long_name": "Direction - 45B anemometer (8 pts)", "original_units": "10's of degrees", - "raw_units": "deg", - "scale_factor": 1, + "scale_factor": 10, "standard_name": "wind_to_direction", "units": "deg" }, "071": { "_variable_name": "ceiling_hgt", - "least_significant_digit": "", "long_name": "Ceiling height of lowest layer of clouds", "original_units": "30's of meters", - "raw_units": "m", "scale_factor": 30, "standard_name": "ceiling_cloud_height", "units": "m" }, "072": { "_variable_name": "visibility", - "least_significant_digit": "", "long_name": "Visibility", "original_units": "0.1 km", - "raw_units": "km", "scale_factor": 0.1, "standard_name": "visibility_in_air", - "units": "m" + "units": "km" }, "073": { "_variable_name": "psl", - "least_significant_digit": "", "long_name": "Sea Level Pressure", "original_units": "0.01 kPa", - "raw_units": "Pa", - "scale_factor": 10, + "scale_factor": 0.01, "standard_name": "air_pressure_at_mean_sea_level", - "units": "Pa" + "units": "kPa" }, "074": { "_variable_name": "tds", - "least_significant_digit": "", "long_name": "Dew Point Temperature", "original_units": "0.1 °C", - "raw_units": "degC", "scale_factor": 0.1, "standard_name": "dew_point_temperature", - "units": "K" + "units": "degC" }, "075": { "_variable_name": "wind_dir_u2a_16", - "least_significant_digit": "", "long_name": "Wind Direction at 2 m (U2A Anemometer) (16 pts)", "original_units": "10's of degrees", - "raw_units": "deg", "scale_factor": 10, "standard_name": "wind_direction_u2a", "units": "deg" }, "076": { "_variable_name": "wind_speed_u2a", - "least_significant_digit": "", "long_name": "Wind Speed - U2A (16 pts) to December 1970", "original_units": "km/h", - "raw_units": "km h-1", "scale_factor": 1, "standard_name": "wind_speed_u2a", - "units": "m s-1" + "units": "km h-1" }, "077": { "_variable_name": "pressure", - "least_significant_digit": "", "long_name": "Station Pressure", "original_units": "0.01 kPa", - 
"raw_units": "Pa", - "scale_factor": 10, + "scale_factor": 0.01, "standard_name": "atmospheric_pressure", - "units": "Pa" + "units": "kPa" }, "078": { "_variable_name": "tas_dry", - "least_significant_digit": "", "long_name": "Dry Bulb Temperature", "original_units": "0.1 °C", - "raw_units": "degC", "scale_factor": 0.1, "standard_name": "dry_bulb_temperature", - "units": "K" + "units": "degC" }, "079": { "_variable_name": "tas_wet", - "least_significant_digit": "", "long_name": "Wet Bulb temperature", "original_units": "0.1 °C", - "raw_units": "degC", "scale_factor": 0.1, "standard_name": "wet_bulb_temperature", - "units": "K" + "units": "degC" }, "080": { "_variable_name": "hur", - "least_significant_digit": "", "long_name": "Relative Humidity", "original_units": "%", - "raw_units": "1", "scale_factor": 1, "standard_name": "relative_humidity", "units": "1" }, "081": { "_variable_name": "clo", - "least_significant_digit": "", "long_name": "Total Cloud Opacity", "original_units": "%", - "raw_units": "1", "scale_factor": 10, "standard_name": "cloud_albedo", "units": "1" }, "082": { "_variable_name": "clt", - "least_significant_digit": "", "long_name": "Total Cloud Amount", "original_units": "%", - "raw_units": "1", "scale_factor": 10, "standard_name": "cloud_area_fraction", "units": "1" }, "089": { "_variable_name": "freeze_rain", - "least_significant_digit": "", "long_name": "Freezing Rain", - "raw_units": "1", "scale_factor": 1, "standard_name": "freezing_rain", "units": "1" }, "094": { "_variable_name": "ice_pellets", - "least_significant_digit": "", "long_name": "Ice Pellets", - "raw_units": "1", "scale_factor": 1, "standard_name": "ice_pellet_presence", "units": "1" }, "107": { "_variable_name": "1low_cloud_opac", - "least_significant_digit": "", "long_name": "Lowest cloud layer opacity", "original_units": "Tenths", - "raw_units": "1", "scale_factor": 10, "standard_name": "low_type_cloud_opacity_fraction", "units": "1" }, "108": { "_variable_name": 
"1low_cloud_frac", - "least_significant_digit": "", "long_name": "Lowest cloud layer amount or condition", "original_units": "Tenths", - "raw_units": "1", "scale_factor": 10, "standard_name": "low_type_cloud_area_fraction", "units": "1" }, "109": { "_variable_name": "1low_cloud_type", - "least_significant_digit": "", "long_name": "Lowest cloud layer type", - "raw_units": "1", - "scale_factor": 1, "standard_name": "low_type_cloud_type", "units": "1" }, "110": { "_variable_name": "1low_cloud_hgt", - "least_significant_digit": "", "long_name": "Lowest cloud layer height", "original_units": "30's of meters", - "raw_units": "m", "scale_factor": 30, "standard_name": "low_type_cloud_height", "units": "m" }, "111": { "_variable_name": "2low_cloud_opac", - "least_significant_digit": "", "long_name": "Second lowest cloud layer opacity", "original_units": "Tenths", - "raw_units": "1", "scale_factor": 10, "standard_name": "low_type_cloud_opacity_fraction", "units": "1" }, "112": { "_variable_name": "2low_cloud_frac", - "least_significant_digit": "", "long_name": "Second lowest cloud layer amount or condition", "original_units": "Tenths", - "raw_units": "1", "scale_factor": 10, "standard_name": "low_type_cloud_area_fraction", "units": "1" }, "113": { "_variable_name": "2low_cloud_type", - "least_significant_digit": "", "long_name": "Second lowest cloud layer type", "original_units": "", - "raw_units": "1", "scale_factor": 1, "standard_name": "low_type_cloud_type", "units": "1" }, "114": { "_variable_name": "2low_cloud_hgt", - "least_significant_digit": "", "long_name": "Second lowest cloud layer height", "original_units": "30's of meters", - "raw_units": "m", "scale_factor": 30, "standard_name": "low_type_cloud_height", "units": "m" }, "115": { "_variable_name": "3low_cloud_opac", - "least_significant_digit": "", "long_name": "Thirsd lowest cloud layer opacity", "original_units": "Tenths", - "raw_units": "1", "scale_factor": 10, "standard_name": 
"low_type_cloud_opacity_fraction", "units": "1" }, "116": { "_variable_name": "3low_cloud_frac", - "least_significant_digit": "", "long_name": "Third lowest cloud layer amount or condition", "original_units": "Tenths", - "raw_units": "1", "scale_factor": 10, "standard_name": "low_type_cloud_area_fraction", "units": "1" }, "117": { "_variable_name": "3low_cloud_type", - "least_significant_digit": "", "long_name": "Third lowest cloud layer type", "original_units": "", - "raw_units": "1", "scale_factor": 1, "standard_name": "low_type_cloud_type", "units": "1" }, "118": { "_variable_name": "3low_cloud_hgt", - "least_significant_digit": "", "long_name": "Third lowest cloud layer height", "original_units": "30's of meters", - "raw_units": "m", "scale_factor": 30, "standard_name": "low_type_cloud_height", "units": "m" }, "123": { "_variable_name": "rainfall", - "least_significant_digit": "", "long_name": "Total Rainfall", "original_units": "0.1 mm", - "raw_units": "mm h-1", "scale_factor": 0.1, "standard_name": "rainfall_flux", - "units": "kg m2 s-1" + "units": "mm h-1" }, "133": { "_variable_name": "sun", - "least_significant_digit": "", "long_name": "Sunshine", "original_units": "0.1 hrs", - "raw_units": "h", "scale_factor": 0.1, "standard_name": "duration_of_sunshine", - "units": "s" + "units": "h" }, "156": { "_variable_name": "wind_dir_u2a_36", - "least_significant_digit": "", "long_name": "Wind Direction - U2A (36 pts) from January 1971", "original_units": "10's of degrees", - "raw_units": "deg", "scale_factor": 10, "standard_name": "wind_direction_u2a", "units": "deg" }, "262": { "_variable_name": "prtot", - "least_significant_digit": "", "long_name": "Total Precipitation (minutes 00-60)", "original_units": "0.1 mm", - "raw_units": "mm", "scale_factor": 0.1, "standard_name": "precipitation_amount", - "units": "kg m-2" + "units": "mm" }, "263": { "_variable_name": "prtot_q1", - "least_significant_digit": "", "long_name": "Total Precipitation (minutes 00-15)", 
"original_units": "0.1 mm", - "raw_units": "mm", "scale_factor": 0.1, "standard_name": "precipitation_amount", - "units": "kg m-2" + "units": "mm" }, "264": { "_variable_name": "prtot_q2", - "least_significant_digit": "", "long_name": "Total Precipitation (minutes 15-30)", "original_units": "0.1 mm", - "raw_units": "mm", "scale_factor": 0.1, "standard_name": "precipitation_amount", - "units": "kg m-2" + "units": "mm" }, "265": { "_variable_name": "prtot_q3", - "least_significant_digit": "", "long_name": "Total Precipitation (minutes 30-45)", "original_units": "0.1 mm", - "raw_units": "mm", "scale_factor": 0.1, "standard_name": "precipitation_amount", - "units": "kg m-2" + "units": "mm" }, "266": { "_variable_name": "prtot_q4", - "least_significant_digit": "", "long_name": "Total Precipitation (minutes 45-60)", "original_units": "0.1 mm", - "raw_units": "mm", "scale_factor": 0.1, "standard_name": "precipitation_amount", - "units": "kg m-2" + "units": "mm" }, "267": { "_variable_name": "precipitation_weight_q1", - "least_significant_digit": "", "long_name": "Precipitation Gauge Weight per Unit Area (at minute 15)", "original_units": "0.1 kg/m²", - "raw_units": "kg m-2", "scale_factor": 0.1, "standard_name": "precipitation_amount", "units": "kg m-2" }, "268": { "_variable_name": "precipitation_weight_q2", - "least_significant_digit": "", "long_name": "Precipitation Gauge Weight per Unit Area (at minute 30)", "original_units": "0.1 kg/m²", - "raw_units": "kg m-2", "scale_factor": 0.1, "standard_name": "precipitation_amount", "units": "kg m-2" }, "269": { "_variable_name": "precipitation_weight_q3", - "least_significant_digit": "", "long_name": "Precipitation Gauge Weight per Unit Area (at minute 45)", "original_units": "0.1 kg/m²", - "raw_units": "kg m-2", "scale_factor": 0.1, "standard_name": "precipitation_amount", "units": "kg m-2" }, "270": { "_variable_name": "precipitation_weight_q4", - "least_significant_digit": "", "long_name": "Precipitation Gauge Weight per 
Unit Area (at minute 60)", "original_units": "0.1 kg/m²", - "raw_units": "kg m-2", "scale_factor": 0.1, "standard_name": "precipitation_amount", "units": "kg m-2" }, "271": { "_variable_name": "wind_speed_q1", - "least_significant_digit": "", "long_name": "Wind Speed at 2 m (minutes 00-15)", - "nc_units": "m s-1", "original_units": "0.1 km/h", - "raw_units": "km h-1", "scale_factor": 0.1, - "standard_name": "wind_speed" + "standard_name": "wind_speed", + "units": "km h-1" }, "272": { "_variable_name": "wind_speed_q2", - "least_significant_digit": "", "long_name": "Wind Speed at 2 m (minutes 15-30)", - "nc_units": "m s-1", "original_units": "0.1 km/h", - "raw_units": "km h-1", "scale_factor": 0.1, - "standard_name": "wind_speed" + "standard_name": "wind_speed", + "units": "km h-1" }, "273": { "_variable_name": "wind_speed_q3", - "least_significant_digit": "", "long_name": "Wind Speed at 2 m (minutes 30-45)", - "nc_units": "m s-1", "original_units": "0.1 km/h", - "raw_units": "km h-1", "scale_factor": 0.1, - "standard_name": "wind_speed" + "standard_name": "wind_speed", + "units": "km h-1" }, "274": { "_variable_name": "wind_speed_q4", - "least_significant_digit": "", "long_name": "Wind Speed at 2 m (minutes 45-60)", - "nc_units": "m s-1", "original_units": "0.1 km/h", - "raw_units": "km h-1", "scale_factor": 0.1, - "standard_name": "wind_speed" + "standard_name": "wind_speed", + "units": "km h-1" }, "275": { "_variable_name": "snd_q4", - "least_significant_digit": "", "long_name": "Snow Depth (at minute 60)", "original_units": "cm", - "raw_units": "cm", - "scale_factor": 1, "standard_name": "surface_snow_thickness", - "units": "m" + "units": "cm" }, "276": { "_variable_name": "snd_q1", - "least_significant_digit": "", "long_name": "Snow Depth (at minute 15)", "original_units": "cm", - "raw_units": "cm", "scale_factor": 1, "standard_name": "surface_snow_thickness", - "units": "m" + "units": "cm" }, "277": { "_variable_name": "snd_q2", - "least_significant_digit": "", 
"long_name": "Snow Depth (at minute 30)", "original_units": "cm", - "raw_units": "cm", "scale_factor": 1, "standard_name": "surface_snow_thickness", - "units": "m" + "units": "cm" }, "278": { "_variable_name": "snd_q3", - "least_significant_digit": "", "long_name": "Snow Depth (at minute 45)", "original_units": "cm", - "raw_units": "cm", "scale_factor": 1, "standard_name": "surface_snow_thickness", - "units": "m" + "units": "cm" }, "279": { "_variable_name": "wind_dir", - "least_significant_digit": "", "long_name": "Wind Direction at 2 m (minutes 50-60)", "nc_units": "deg", "original_units": "Degrees", - "raw_units": "deg", - "scale_factor": 1, "standard_name": "wind_direction" }, "280": { "_variable_name": "wind_speed", - "least_significant_digit": "", "long_name": "Wind Speed at 2 m (minutes 50-60)", "original_units": "0.1 km/h", - "raw_units": "km h-1", "scale_factor": 0.1, "standard_name": "wind_speed", - "units": "m s-1" + "units": "km h-1" } } } diff --git a/miranda/preprocess/eccc.py b/miranda/preprocess/eccc.py index be087085..9631090c 100644 --- a/miranda/preprocess/eccc.py +++ b/miranda/preprocess/eccc.py @@ -3,12 +3,9 @@ from __future__ import annotations import contextlib -import json import logging.config -import multiprocessing as mp import os import tempfile -from functools import partial from pathlib import Path from typing import Callable @@ -69,7 +66,9 @@ def _run_func_on_archive_with_optional_dask( for data in data_files: size = file_size(data) if size > size_limit or dask_kwargs: - if size > size_limit: + if dask_kwargs: + logging.info(f"`dask_kwargs` provided - Using dask.dataframes.") + elif size > size_limit: logging.info( f"File exceeds {report_file_size(size_limit)} - Using dask.dataframes." 
) From 7d8fdf9734adea87b110aef9a53f641304695ce9 Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Mon, 14 Aug 2023 14:26:01 -0400 Subject: [PATCH 16/33] refactoring - move treatments to new module, load_json_data_mappings as a dynamic and shared utility, renaming of configuration files for better coupling --- miranda/convert/__init__.py | 1 - miranda/convert/_data_definitions.py | 71 ++----------------- ...trs.json => agcfsr|agmerra2_cf_attrs.json} | 0 ... => cmip5|cmip6|cordex_ouranos_attrs.json} | 0 ...cf_attrs.json => eccc-ahccd_cf_attrs.json} | 0 ...ttrs.json => era5|era5-land_cf_attrs.json} | 0 ..._cf_attrs.json => ets-grnch_cf_attrs.json} | 0 ...ttrs.json => wfdei-gem-capa_cf_attrs.json} | 0 miranda/convert/corrections.py | 41 +++++++++-- miranda/convert/melcc.py | 6 +- miranda/preprocess/_data_definitions.py | 33 +-------- miranda/preprocess/_eccc_obs.py | 4 +- ...nized_attrs.json => eccc-ahccd_attrs.json} | 0 miranda/preprocess/eccc.py | 2 +- miranda/treatments/__init__.py | 5 ++ .../{convert => treatments}/_treatments.py | 43 ++++++++++- 16 files changed, 94 insertions(+), 112 deletions(-) rename miranda/convert/configs/{nasa_ag_cf_attrs.json => agcfsr|agmerra2_cf_attrs.json} (100%) rename miranda/convert/configs/{cmip_ouranos_attrs.json => cmip5|cmip6|cordex_ouranos_attrs.json} (100%) rename miranda/convert/configs/{eccc-homogenized_cf_attrs.json => eccc-ahccd_cf_attrs.json} (100%) rename miranda/convert/configs/{ecmwf_cf_attrs.json => era5|era5-land_cf_attrs.json} (100%) rename miranda/convert/configs/{ets_grnch_cf_attrs.json => ets-grnch_cf_attrs.json} (100%) rename miranda/convert/configs/{usask_cf_attrs.json => wfdei-gem-capa_cf_attrs.json} (100%) rename miranda/preprocess/configs/{eccc-homogenized_attrs.json => eccc-ahccd_attrs.json} (100%) create mode 100644 miranda/treatments/__init__.py rename miranda/{convert => treatments}/_treatments.py (95%) diff --git a/miranda/convert/__init__.py 
b/miranda/convert/__init__.py index bfc31224..3533bcac 100644 --- a/miranda/convert/__init__.py +++ b/miranda/convert/__init__.py @@ -4,6 +4,5 @@ from . import deh, hq, melcc, utils from ._aggregation import * from ._data_definitions import * -from ._treatments import * # from ._reconstruction import * diff --git a/miranda/convert/_data_definitions.py b/miranda/convert/_data_definitions.py index 1b6dbc8a..e592da65 100644 --- a/miranda/convert/_data_definitions.py +++ b/miranda/convert/_data_definitions.py @@ -5,7 +5,6 @@ import logging.config import os from pathlib import Path -from typing import Any from miranda.scripting import LOGGING_CONFIG from miranda.storage import report_file_size @@ -26,7 +25,6 @@ "gather_sc_earth", "gather_wfdei_gem_capa", "gather_emdna", - "load_json_data_mappings", "nasa_ag_variables", "nrcan_variables", "project_institutes", @@ -38,65 +36,6 @@ _config_folder = Path(__file__).resolve().parent / "configs" -def load_json_data_mappings(project: str) -> dict[str, Any]: - """Load JSON mappings for supported dataset conversions. 
- - Parameters - ---------- - project : str - - Returns - ------- - dict[str, Any] - """ - if project.startswith("era5"): - metadata_definition = json.load(open(_config_folder / "ecmwf_cf_attrs.json")) - elif project in ["rdrs-v21"]: - metadata_definition = json.load( - open(_config_folder / "eccc-rdrs_cf_attrs.json") - ) - elif project == "eccc-obs": - metadata_definition = json.load(open(_config_folder / "eccc-obs_cf_attrs.json")) - elif project in ["agcfsr", "agmerra2"]: - metadata_definition = json.load(open(_config_folder / "nasa_ag_cf_attrs.json")) - elif project in ["cordex", "cmip5", "cmip6"]: - metadata_definition = json.load( - open(_config_folder / "cmip_ouranos_attrs.json") - ) - elif project == "ets-grnch": - metadata_definition = json.load( - open(_config_folder / "ets_grnch_cf_attrs.json") - ) - elif project == "nrcan-gridded-10km": - raise NotImplementedError() - elif project == "wfdei-gem-capa": - metadata_definition = json.load(open(_config_folder / "usask_cf_attrs.json")) - elif project == "melcc": - metadata_definition = json.load(open(_config_folder / "melcc_cf_attrs.json")) - elif project == "eccc-canswe": - metadata_definition = json.load( - open(_config_folder / "eccc-canswe_cf_attrs.json") - ) - elif project == "eccc-homogenized": - metadata_definition = json.load( - open(_config_folder / "eccc-homogenized_cf_attrs.json") - ) - elif project in ["NEX-GDDP-CMIP6"]: - metadata_definition = json.load( - open(_config_folder / "nex-gddp-cmip6_attrs.json") - ) - elif project in ["ESPO-G6-R2"]: - metadata_definition = json.load(open(_config_folder / "espo-g6-r2_attrs.json")) - elif project in ["ESPO-G6-E5L"]: - metadata_definition = json.load(open(_config_folder / "espo-g6-e5l_attrs.json")) - elif project in ["EMDNA"]: - metadata_definition = json.load(open(_config_folder / "emdna_cf_attrs.json")) - else: - raise NotImplementedError(f"Project not supported: {project}") - - return metadata_definition - - eccc_rdrs_variables = dict() 
eccc_rdrs_variables["raw"] = [ v @@ -111,18 +50,18 @@ def load_json_data_mappings(project: str) -> dict[str, Any]: ].values() ] -era5_variables = json.load(open(_config_folder / "ecmwf_cf_attrs.json"))[ +era5_variables = json.load(open(_config_folder / "era5|era5-land_cf_attrs.json"))[ "variables" ].keys() grnch_variables = ["T", "Tmin", "Tmax", "P"] nrcan_variables = ["tasmin", "tasmax", "pr"] -nasa_ag_variables = json.load(open(_config_folder / "nasa_ag_cf_attrs.json"))[ +nasa_ag_variables = json.load(open(_config_folder / "agcfsr|agmerra2_cf_attrs.json"))[ "variables" ].keys() sc_earth_variables = ["prcp", "tdew", "tmean", "trange", "wind"] -wfdei_gem_capa_variables = json.load(open(_config_folder / "usask_cf_attrs.json"))[ - "variables" -].keys() +wfdei_gem_capa_variables = json.load( + open(_config_folder / "wfdei-gem-capa_cf_attrs.json") +)["variables"].keys() project_institutes = { "cfsr": "ncar", diff --git a/miranda/convert/configs/nasa_ag_cf_attrs.json b/miranda/convert/configs/agcfsr|agmerra2_cf_attrs.json similarity index 100% rename from miranda/convert/configs/nasa_ag_cf_attrs.json rename to miranda/convert/configs/agcfsr|agmerra2_cf_attrs.json diff --git a/miranda/convert/configs/cmip_ouranos_attrs.json b/miranda/convert/configs/cmip5|cmip6|cordex_ouranos_attrs.json similarity index 100% rename from miranda/convert/configs/cmip_ouranos_attrs.json rename to miranda/convert/configs/cmip5|cmip6|cordex_ouranos_attrs.json diff --git a/miranda/convert/configs/eccc-homogenized_cf_attrs.json b/miranda/convert/configs/eccc-ahccd_cf_attrs.json similarity index 100% rename from miranda/convert/configs/eccc-homogenized_cf_attrs.json rename to miranda/convert/configs/eccc-ahccd_cf_attrs.json diff --git a/miranda/convert/configs/ecmwf_cf_attrs.json b/miranda/convert/configs/era5|era5-land_cf_attrs.json similarity index 100% rename from miranda/convert/configs/ecmwf_cf_attrs.json rename to miranda/convert/configs/era5|era5-land_cf_attrs.json diff --git 
a/miranda/convert/configs/ets_grnch_cf_attrs.json b/miranda/convert/configs/ets-grnch_cf_attrs.json similarity index 100% rename from miranda/convert/configs/ets_grnch_cf_attrs.json rename to miranda/convert/configs/ets-grnch_cf_attrs.json diff --git a/miranda/convert/configs/usask_cf_attrs.json b/miranda/convert/configs/wfdei-gem-capa_cf_attrs.json similarity index 100% rename from miranda/convert/configs/usask_cf_attrs.json rename to miranda/convert/configs/wfdei-gem-capa_cf_attrs.json diff --git a/miranda/convert/corrections.py b/miranda/convert/corrections.py index 8a5ef0ee..e2157de2 100644 --- a/miranda/convert/corrections.py +++ b/miranda/convert/corrections.py @@ -9,8 +9,9 @@ import xarray as xr -from miranda.convert._data_definitions import load_json_data_mappings -from miranda.convert._treatments import ( +from miranda.convert.utils import find_version_hash +from miranda.gis import subset_domain +from miranda.treatments import ( cf_units_conversion, clip_values, conservative_regrid, @@ -18,6 +19,7 @@ dimensions_compliance, ensure_correct_time_frequency, invert_value_sign, + load_json_data_mappings, metadata_conversion, offset_time_dimension, preprocessing_corrections, @@ -25,13 +27,42 @@ transform_values, variable_conversion, ) -from miranda.convert.utils import find_version_hash -from miranda.gis import subset_domain + +CONFIG_FOLDER = Path(__file__).parent / "data" +CONFIG_FILES = { + "EMDNA": "emdna_cf_attrs.json", + "ESPO-G6-E5L": "espo-g6-e5l_attrs.json", + "ESPO-G6-R2": "espo-g6-r2_attrs.json", + "NEX-GDDP-CMIP6": "nex-gddp-cmip6_attrs.json", + "agcfsr": "agcfsr|agmerra2_cf_attrs.json", + "agmerra2": "agcfsr|agmerra2_cf_attrs.json", + "cmip": "cmip5|cmip6|cordex_ouranos_attrs.json", + "cordex": "cmip5|cmip6|cordex_ouranos_attrs.json", + "eccc-canswe": "eccc-canswe_cf_attrs.json", + "eccc-ahccd": "eccc-ahccd_cf_attrs.json", + "eccc-obs": "eccc-obs_cf_attrs.json", + "era5-land": "era5|era5-land_cf_attrs.json", + "era5-land-monthly-means": 
"era5|era5-land_cf_attrs.json", + "era5-pressure-levels": "era5|era5-land_cf_attrs.json", + "era5-pressure-levels-monthly-means": "era5|era5-land_cf_attrs.json", + "era5-pressure-levels-monthly-means-preliminary-back-extension": "era5|era5-land_cf_attrs.json", + "era5-pressure-levels-preliminary-back-extension": "era5|era5-land_cf_attrs.json", + "era5-single-levels": "era5|era5-land_cf_attrs.json", + "era5-single-levels-monthly-means": "era5|era5-land_cf_attrs.json", + "era5-single-levels-monthly-means-preliminary-back-extension": "era5|era5-land_cf_attrs.json", + "era5-single-levels-preliminary-back-extension": "era5|era5-land_cf_attrs.json", + "ets-grnch": "ets-grnch_cf_attrs.json", + "melcc": "melcc_cf_attrs.json", + "rdrs-v21": "eccc-rdrs_cf_attrs.json", + "wfdei-gem-capa": "wfdei-gem-capa_cf_attrs.json", +} +for k, v in CONFIG_FILES.items(): + CONFIG_FILES[k] = CONFIG_FOLDER / v def dataset_corrections(ds: xr.Dataset, project: str) -> xr.Dataset: """Convert variables to CF-compliant format""" - metadata_definition = load_json_data_mappings(project) + metadata_definition = load_json_data_mappings(project, CONFIG_FILES) ds = correct_unit_names(ds, project, metadata_definition) ds = transform_values(ds, project, metadata_definition) diff --git a/miranda/convert/melcc.py b/miranda/convert/melcc.py index dba999dc..5dd034ec 100644 --- a/miranda/convert/melcc.py +++ b/miranda/convert/melcc.py @@ -21,11 +21,9 @@ from xclim.core.units import convert_units_to, pint_multiply, str2pint from miranda import __version__ -from miranda.convert._data_definitions import load_json_data_mappings from miranda.convert.corrections import dataset_corrections from miranda.scripting import LOGGING_CONFIG - -from ._treatments import metadata_conversion +from miranda.treatments import load_json_data_mappings, metadata_conversion logging.config.dictConfig(LOGGING_CONFIG) logger = logging.getLogger(__name__) @@ -562,7 +560,7 @@ def convert_snow_table(file: str | Path, output: str | Path): ) 
ds.attrs.update(frequency="2sem") - meta = load_json_data_mappings("melcc-snow") + meta = load_json_data_mappings("melcc") ds = metadata_conversion(ds, "melcc-snow", meta) date = "-".join(ds.indexes["time"][[0, -1]].strftime("%Y%m")) # Save diff --git a/miranda/preprocess/_data_definitions.py b/miranda/preprocess/_data_definitions.py index f73251e9..e99d5458 100644 --- a/miranda/preprocess/_data_definitions.py +++ b/miranda/preprocess/_data_definitions.py @@ -1,41 +1,14 @@ from __future__ import annotations -import json import warnings from pathlib import Path -from typing import Any -_config_folder = Path(__file__).resolve().parent / "configs" - - -__all__ = ["load_json_data_mappings", "find_project_variable_codes"] +from miranda.treatments import load_json_data_mappings +_config_folder = Path(__file__).resolve().parent / "configs" -def load_json_data_mappings(project: str) -> dict[str, Any]: - """Load JSON mappings for supported dataset conversions. - - Parameters - ---------- - project : str - - Returns - ------- - dict[str, Any] - """ - if project == "eccc-homogenized": - metadata_definition = json.load( - open(_config_folder / "eccc-homogenized_attrs.json") - ) - elif project == "eccc-obs": - metadata_definition = json.load(open(_config_folder / "eccc-obs_attrs.json")) - elif project == "eccc-obs-summary": - metadata_definition = json.load( - open(_config_folder / "eccc-obs-summary_attrs.json") - ) - else: - raise NotImplementedError(f"Project not supported: {project}") - return metadata_definition +__all__ = ["find_project_variable_codes"] def find_project_variable_codes(code: str, table: str) -> str: diff --git a/miranda/preprocess/_eccc_obs.py b/miranda/preprocess/_eccc_obs.py index a3c55dfc..5ca34dc4 100644 --- a/miranda/preprocess/_eccc_obs.py +++ b/miranda/preprocess/_eccc_obs.py @@ -119,7 +119,7 @@ def convert_station( if using_dask_array: pandas_reader = dd - # set the blocksize to 200 MB + # set the block size to 200 MB chunks = dict(blocksize=200 * 
2**20) else: pandas_reader = pd @@ -149,7 +149,7 @@ def convert_station( except UnicodeDecodeError as e: msg = f"File {data.name} was unable to be read. This is probably an issue with the file: {e}" logging.error(msg) - raise UnicodeDecodeError(msg) + raise # Loop through the station codes station_codes = df["code"].unique() diff --git a/miranda/preprocess/configs/eccc-homogenized_attrs.json b/miranda/preprocess/configs/eccc-ahccd_attrs.json similarity index 100% rename from miranda/preprocess/configs/eccc-homogenized_attrs.json rename to miranda/preprocess/configs/eccc-ahccd_attrs.json diff --git a/miranda/preprocess/eccc.py b/miranda/preprocess/eccc.py index 9631090c..8b40b848 100644 --- a/miranda/preprocess/eccc.py +++ b/miranda/preprocess/eccc.py @@ -67,7 +67,7 @@ def _run_func_on_archive_with_optional_dask( size = file_size(data) if size > size_limit or dask_kwargs: if dask_kwargs: - logging.info(f"`dask_kwargs` provided - Using dask.dataframes.") + logging.info("`dask_kwargs` provided - Using dask.dataframes.") elif size > size_limit: logging.info( f"File exceeds {report_file_size(size_limit)} - Using dask.dataframes." 
diff --git a/miranda/treatments/__init__.py b/miranda/treatments/__init__.py new file mode 100644 index 00000000..57319980 --- /dev/null +++ b/miranda/treatments/__init__.py @@ -0,0 +1,5 @@ +"""Treatments module.""" + +from __future__ import annotations + +from miranda.treatments._treatments import * diff --git a/miranda/convert/_treatments.py b/miranda/treatments/_treatments.py similarity index 95% rename from miranda/convert/_treatments.py rename to miranda/treatments/_treatments.py index e53e0051..d7e1cd99 100644 --- a/miranda/convert/_treatments.py +++ b/miranda/treatments/_treatments.py @@ -1,11 +1,14 @@ from __future__ import annotations import datetime +import inspect +import json import logging.config import os import warnings from functools import partial from pathlib import Path +from typing import Any, Dict import numpy as np import xarray as xr @@ -15,12 +18,10 @@ from xclim.core.calendar import parse_offset from miranda import __version__ as __miranda_version__ +from miranda.convert.utils import date_parser from miranda.scripting import LOGGING_CONFIG from miranda.units import get_time_frequency -from ._data_definitions import load_json_data_mappings -from .utils import date_parser - logging.config.dictConfig(LOGGING_CONFIG) VERSION = datetime.datetime.now().strftime("%Y.%m.%d") @@ -33,6 +34,7 @@ "dimensions_compliance", "ensure_correct_time_frequency", "invert_value_sign", + "load_json_data_mappings", "metadata_conversion", "offset_time_dimension", "preprocessing_corrections", @@ -42,6 +44,41 @@ ] +def load_json_data_mappings( + project: str, configurations: dict[str, Path] | None = None +) -> dict[str, Any]: + """Load JSON mappings for supported dataset conversions. 
+ + Parameters + ---------- + project : str + configurations: dict, optional + + Returns + ------- + dict[str, Any] + """ + if configurations is None: + calling_frame = inspect.currentframe().f_back + calling_file_path = calling_frame.f_globals["__file__"] + config_folder = Path(calling_file_path).parent / "configs" + + configurations = {} + for configuration in config_folder.glob("*attrs.json"): + file_project = str(configuration.stem).split("_")[0] + if "|" in file_project: + for p in file_project.split("|"): + configurations[p] = configuration + configurations[file_project] = configuration + + if project in configurations.keys(): + config_file = configurations[project] + metadata_definition = json.load(config_file.open()) + return metadata_definition + else: + raise NotImplementedError(f"Project not supported: {project}") + + def _get_section_entry_key(meta, entry, var, key, project): var_meta = meta[entry].get(var, {}) if key in var_meta: From 318957e47b8aa5f33452f7dc3e209eb7810beea8 Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Mon, 14 Aug 2023 17:38:38 -0400 Subject: [PATCH 17/33] more refactoring --- miranda/convert/__init__.py | 2 - .../convert/configs/eccc-ahccd_cf_attrs.json | 43 +- .../configs/era5|era5-land_cf_attrs.json | 4 +- miranda/convert/corrections.py | 6 +- miranda/convert/melcc.py | 3 +- miranda/gis/__init__.py | 1 + miranda/gis/utils.py | 149 +++ miranda/preprocess/__init__.py | 2 +- miranda/preprocess/_data_definitions.py | 48 - .../{_eccc_homogenized.py => _eccc_ahccd.py} | 29 +- miranda/preprocess/_eccc_obs.py | 6 +- miranda/preprocess/_metadata.py | 37 +- .../preprocess/configs/eccc-ahccd_attrs.json | 19 + miranda/preprocess/eccc.py | 5 +- miranda/treatments/__init__.py | 115 ++- miranda/treatments/_dimensions.py | 243 +++++ miranda/treatments/_preprocessing.py | 111 +++ miranda/treatments/_treatments.py | 854 ------------------ miranda/treatments/_variables.py | 273 ++++++ miranda/treatments/utils.py | 64 ++ 
miranda/vocabularies/__init__.py | 3 + miranda/vocabularies/eccc.py | 14 +- 22 files changed, 1069 insertions(+), 962 deletions(-) create mode 100644 miranda/gis/utils.py delete mode 100644 miranda/preprocess/_data_definitions.py rename miranda/preprocess/{_eccc_homogenized.py => _eccc_ahccd.py} (92%) create mode 100644 miranda/treatments/_dimensions.py create mode 100644 miranda/treatments/_preprocessing.py delete mode 100644 miranda/treatments/_treatments.py create mode 100644 miranda/treatments/_variables.py create mode 100644 miranda/treatments/utils.py diff --git a/miranda/convert/__init__.py b/miranda/convert/__init__.py index 3533bcac..d57a32ec 100644 --- a/miranda/convert/__init__.py +++ b/miranda/convert/__init__.py @@ -4,5 +4,3 @@ from . import deh, hq, melcc, utils from ._aggregation import * from ._data_definitions import * - -# from ._reconstruction import * diff --git a/miranda/convert/configs/eccc-ahccd_cf_attrs.json b/miranda/convert/configs/eccc-ahccd_cf_attrs.json index 5c777230..594de4e2 100644 --- a/miranda/convert/configs/eccc-ahccd_cf_attrs.json +++ b/miranda/convert/configs/eccc-ahccd_cf_attrs.json @@ -1,6 +1,6 @@ { "Header": { - "Conventions": "CF-1.8", + "Conventions": "CF-1.9", "_citation": { "gen2": "Mekis, É and L.A. Vincent, 2011: An overview of the second generation adjusted daily precipitation dataset for trend analysis in Canada. Atmosphere-Ocean 49(2), 163-177 doi:10.1080/07055900.2011.583910", "gen3": "Vincent, L.A., M.M. Hartwell and X.L. Wang, 2020: A Third Generation of Homogenized Temperature for Trend Analysis and Monitoring Changes in Canada’s Climate. Atmosphere-Ocean. 
https://doi.org/10.1080/07055900.2020.1765728" @@ -27,8 +27,30 @@ "table_date": "2023-03-23", "table_id": "ECCC" }, - "variable_entry": { + "dimensions": { + "lat": { + "axis": "Y", + "long_name": "Latitude", + "standard_name": "latitude", + "units": "degrees_north" + }, + "long": { + "_cf_dimension_name": "lon", + "axis": "X", + "long_name": "Longitude", + "standard_name": "longitude", + "units": "degrees_east" + }, + "time": { + "axis": "T", + "calendar": "gregorian", + "long_name": "Time", + "standard_name": "time" + } + }, + "variables": { "dm": { + "_cf_variable_name": "tas", "add_offset": 273.15, "cell_methods": "time: mean", "comments": "Station data converted from Mean Temp (°C)", @@ -36,13 +58,12 @@ "grid_mapping": "regular_lon_lat", "long_name": "Near-Surface Air Temperature", "original_field": "Mean Temp (°C)", - "out_name": "tas", - "scale_factor": 1, "standard_name": "air_temperature", "type": "real", "units": "K" }, "dn": { + "_cf_variable_name": "tasmin", "add_offset": 273.15, "cell_methods": "time: minimum", "comments": "Station data converted from Min Temp (°C)", @@ -50,55 +71,51 @@ "grid_mapping": "regular_lon_lat", "long_name": "Daily Minimum Near-Surface Air Temperature", "original_field": "Min Temp (°C)", - "out_name": "tasmin", - "scale_factor": 1, "standard_name": "air_temperature", "type": "real", "units": "K" }, "dr": { - "add_offset": 0, + "_cf_variable_name": "prlp", "cell_methods": "time: mean", "comments": "Station data converted from Total Rain (mm) using a density of 1000 kg/m³", "frequency": "day", "grid_mapping": "regular_lon_lat", "long_name": "Liquid Precipitation", "original_field": "Total Rain (mm)", - "out_name": "prlp", "scale_factor": 1.1574074074074073e-05, "standard_name": "rainfall_flux", "type": "real", "units": "kg m-2 s-1" }, "ds": { - "add_offset": 0, + "_cf_variable_name": "prsn", "cell_methods": "time: mean", "comments": "station data converted from Total Snow (cm) using a density of 100 kg/m³", "frequency": "day", 
"grid_mapping": "regular_lon_lat", "long_name": "Snowfall Flux", "original_field": "Total Snow (cm)", - "out_name": "prsn", "scale_factor": 1.1574074074074073e-05, "standard_name": "snowfall_flux", "type": "real", "units": "kg m-2 s-1" }, "dt": { - "add_offset": 0, + "_cf_variable_name": "pr", "cell_methods": "time: mean", "comments": "Station data converted from Total Precip (mm) using a density of 1000 kg/m³", "frequency": "day", "grid_mapping": "regular_lon_lat", "long_name": "Precipitation", "original_field": "Total Precip (mm)", - "out_name": "pr", "scale_factor": 1.1574074074074073e-05, "standard_name": "precipitation_flux", "type": "real", "units": "kg m-2 s-1" }, "dx": { + "_cf_variable_name": "tasmax", "add_offset": 273.15, "cell_methods": "time: maximum", "comments": "station data converted from Max Temp (°C)", @@ -106,8 +123,6 @@ "grid_mapping": "regular_lon_lat", "long_name": "Daily Maximum Near-Surface Air Temperature", "original_field": "Max Temp (°C)", - "out_name": "tasmax", - "scale_factor": 1, "standard_name": "air_temperature", "type": "real", "units": "K" diff --git a/miranda/convert/configs/era5|era5-land_cf_attrs.json b/miranda/convert/configs/era5|era5-land_cf_attrs.json index 1a0afa83..1cf1257c 100644 --- a/miranda/convert/configs/era5|era5-land_cf_attrs.json +++ b/miranda/convert/configs/era5|era5-land_cf_attrs.json @@ -45,6 +45,7 @@ "era5-land-monthly-means": 4 }, "axis": "Y", + "long_name": "Latitude", "standard_name": "latitude" }, "longitude": { @@ -54,6 +55,7 @@ "era5-land-monthly-means": 4 }, "axis": "X", + "long_name": "Longitude", "standard_name": "longitude" }, "time": { @@ -71,7 +73,7 @@ }, "_strict_time": false, "axis": "T", - "long_name": "time", + "long_name": "Time", "standard_name": "time" } }, diff --git a/miranda/convert/corrections.py b/miranda/convert/corrections.py index e2157de2..44e1247d 100644 --- a/miranda/convert/corrections.py +++ b/miranda/convert/corrections.py @@ -10,23 +10,21 @@ import xarray as xr from 
miranda.convert.utils import find_version_hash -from miranda.gis import subset_domain +from miranda.gis import conservative_regrid, subset_domain, threshold_mask from miranda.treatments import ( cf_units_conversion, clip_values, - conservative_regrid, correct_unit_names, dimensions_compliance, ensure_correct_time_frequency, invert_value_sign, - load_json_data_mappings, metadata_conversion, offset_time_dimension, preprocessing_corrections, - threshold_mask, transform_values, variable_conversion, ) +from miranda.treatments.utils import load_json_data_mappings CONFIG_FOLDER = Path(__file__).parent / "data" CONFIG_FILES = { diff --git a/miranda/convert/melcc.py b/miranda/convert/melcc.py index 5dd034ec..d5bb084d 100644 --- a/miranda/convert/melcc.py +++ b/miranda/convert/melcc.py @@ -23,7 +23,8 @@ from miranda import __version__ from miranda.convert.corrections import dataset_corrections from miranda.scripting import LOGGING_CONFIG -from miranda.treatments import load_json_data_mappings, metadata_conversion +from miranda.treatments import metadata_conversion +from miranda.treatments.utils import load_json_data_mappings logging.config.dictConfig(LOGGING_CONFIG) logger = logging.getLogger(__name__) diff --git a/miranda/gis/__init__.py b/miranda/gis/__init__.py index fe49f1c9..288cf522 100644 --- a/miranda/gis/__init__.py +++ b/miranda/gis/__init__.py @@ -2,3 +2,4 @@ from __future__ import annotations from ._domains import * +from .utils import * diff --git a/miranda/gis/utils.py b/miranda/gis/utils.py new file mode 100644 index 00000000..1d5dddbd --- /dev/null +++ b/miranda/gis/utils.py @@ -0,0 +1,149 @@ +"""Utility functions for GIS operations.""" +from __future__ import annotations + +import datetime +import logging +import warnings + +import numpy as np +import xarray as xr + +__all__ = [ + "conservative_regrid", + "threshold_mask", +] + + +def _simple_fix_dims(d: xr.Dataset | xr.DataArray) -> xr.Dataset | xr.DataArray: + """Adjust dimensions found in a file so that 
it can be used for regridding purposes.""" + if "lon" not in d.dims or "lat" not in d.dims: + dim_rename = dict() + for dim in d.dims: + if str(dim).lower().startswith("lon"): + dim_rename[str(dim)] = "lon" + if str(dim).lower().startswith("lat"): + dim_rename[str(dim)] = "lat" + d = d.rename(dim_rename) + if np.any(d.lon > 180): + lon_wrapped = d.lon.where(d.lon <= 180.0, d.lon - 360.0) + d["lon"] = lon_wrapped + d = d.sortby(["lon"]) + + if "time" in d.dims: + d = d.isel(time=0, drop=True) + + return d + + +def conservative_regrid( + ds: xr.DataArray | xr.Dataset, ref_grid: xr.DataArray | xr.Dataset +) -> xr.DataArray | xr.Dataset: + """Perform a conservative_normed regridding""" + try: + import xesmf as xe # noqa + except ModuleNotFoundError: + raise ModuleNotFoundError( + "This function requires the `xesmf` library which is not installed. " + "Regridding step will be skipped." + ) + + ref_grid = _simple_fix_dims(ref_grid) + method = "conservative_normed" + + logging.info( + f"Performing regridding and masking with `xesmf` using method: {method}." + ) + + regridder = xe.Regridder(ds, ref_grid, method, periodic=False) + ds = regridder(ds) + + ds.attrs["history"] = ( + f"{datetime.datetime.now()}:" + f"Regridded dataset using xesmf with method: {method}. " + f"{ds.attrs.get('history')}".strip() + ) + return ds + + +def threshold_mask( + ds: xr.Dataset | xr.DataArray, + *, + mask: xr.Dataset | xr.DataArray, + mask_cutoff: float | bool = False, +) -> xr.Dataset | xr.DataArray: + """Land-Sea mask operations. + + Parameters + ---------- + ds : xr.Dataset or str or os.PathLike + mask : xr.Dataset or xr.DataArray + mask_cutoff : float or bool + + Returns + ------- + xr.Dataset or xr.DataArray + """ + mask = _simple_fix_dims(mask) + + if isinstance(mask, xr.Dataset): + if len(mask.data_vars) == 1: + mask_variable = list(mask.data_vars)[0] + mask = mask[mask_variable] + else: + raise ValueError( + "More than one data variable found in land-sea mask. 
Supply a DataArray instead." + ) + else: + mask_variable = mask.name + + try: + from clisops.core import subset_bbox # noqa + + log_msg = f"Masking dataset with {mask_variable}." + if mask_cutoff: + log_msg = f"{log_msg.strip('.')} at `{mask_cutoff}` cutoff value." + logging.info(log_msg) + + lon_bounds = np.array([ds.lon.min(), ds.lon.max()]) + lat_bounds = np.array([ds.lat.min(), ds.lat.max()]) + + mask_subset = subset_bbox( + mask, + lon_bnds=lon_bounds, + lat_bnds=lat_bounds, + ).load() + except ModuleNotFoundError: + log_msg = ( + "This function requires the `clisops` library which is not installed. " + "subsetting step will be skipped." + ) + warnings.warn(log_msg) + mask_subset = mask.load() + + if mask_subset.dtype == bool: + if mask_cutoff: + logging.warning("Mask value cutoff set for boolean mask. Ignoring.") + mask_subset = mask_subset.where(mask) + else: + mask_subset = mask_subset.where(mask >= mask_cutoff) + ds = ds.where(mask_subset.notnull()) + + if mask_subset.min() >= 0: + if mask_subset.max() <= 1.00000001: + cutoff_info = f"{mask_cutoff * 100} %" + elif mask_subset.max() <= 100.00000001: + cutoff_info = f"{mask_cutoff} %" + else: + cutoff_info = f"{mask_cutoff}" + else: + cutoff_info = f"{mask_cutoff}" + ds.attrs["mask_cutoff"] = cutoff_info + + prev_history = ds.attrs.get("history", "") + history_msg = f"Mask calculated using `{mask_variable}`." + if mask_cutoff: + history_msg = f"{history_msg.strip('.')} with cutoff value `{cutoff_info}`." 
+ history = f"{history_msg} {prev_history}".strip() + ds.attrs.update(dict(history=history)) + + return ds diff --git a/miranda/preprocess/__init__.py b/miranda/preprocess/__init__.py index 4601a7cc..03673b8d 100644 --- a/miranda/preprocess/__init__.py +++ b/miranda/preprocess/__init__.py @@ -1,6 +1,6 @@ """Preprocessing tools for Miranda.""" from __future__ import annotations -from ._eccc_homogenized import * +from ._eccc_ahccd import * from ._eccc_obs import * from ._eccc_summaries import * diff --git a/miranda/preprocess/_data_definitions.py b/miranda/preprocess/_data_definitions.py deleted file mode 100644 index e99d5458..00000000 --- a/miranda/preprocess/_data_definitions.py +++ /dev/null @@ -1,48 +0,0 @@ -from __future__ import annotations - -import warnings -from pathlib import Path - -from miranda.treatments import load_json_data_mappings - -_config_folder = Path(__file__).resolve().parent / "configs" - - -__all__ = ["find_project_variable_codes"] - - -def find_project_variable_codes(code: str, table: str) -> str: - """Find the variable code for a given variable name and project. - - Parameters - ---------- - code : str - Variable name. - table : str - Project name. - - Returns - ------- - str - """ - config = load_json_data_mappings(table) - variable_codes = {} - for variable_code in config["variables"]: - variable_name = config["variables"][variable_code].get("_variable_name") - if variable_name: - variable_codes[variable_name] = variable_code - else: - warnings.warn( - f"Variable `{variable_code}` does not have accompanying `variable_name`. " - f"Verify JSON. Continuing with `{variable_code}` as `variable_name`." 
- ) - variable_codes[variable_code] = variable_code - - if code in variable_codes.values(): - variable = code - else: - variable = variable_codes.get(code) - if not variable: - raise NotImplementedError(f"Variable `{code}` not supported.") - - return variable diff --git a/miranda/preprocess/_eccc_homogenized.py b/miranda/preprocess/_eccc_ahccd.py similarity index 92% rename from miranda/preprocess/_eccc_homogenized.py rename to miranda/preprocess/_eccc_ahccd.py index a16b3e14..8d76b05c 100644 --- a/miranda/preprocess/_eccc_homogenized.py +++ b/miranda/preprocess/_eccc_ahccd.py @@ -11,12 +11,12 @@ from miranda.io import write_dataset from miranda.io.utils import name_output_file -from miranda.preprocess._data_definitions import find_project_variable_codes from miranda.preprocess._metadata import ( eccc_variable_metadata, homogenized_column_definitions, ) from miranda.scripting import LOGGING_CONFIG +from miranda.treatments import find_project_variable_codes, load_json_data_mappings logging.config.dictConfig(LOGGING_CONFIG) logger = logging.Logger("miranda") @@ -44,16 +44,19 @@ def convert_ahccd_fwf_file( ------- xarray.Dataset """ - code = find_project_variable_codes(variable, "eccc-homogenized") + configuration = load_json_data_mappings("eccc-ahccd") + code = find_project_variable_codes(variable, configuration) variable_meta, global_attrs = eccc_variable_metadata( - code, "eccc-homogenized", generation + code, "eccc-ahccd", generation, configuration ) column_names, column_spaces, column_dtypes, header = homogenized_column_definitions( code ) df = pd.read_fwf(ff, header=header, colspecs=column_spaces, dtype=column_dtypes) + + # Handle different variable types if "pr" in variable: cols = list(df.columns[0:3]) cols = cols[0::2] @@ -67,6 +70,7 @@ def convert_ahccd_fwf_file( else: raise NotImplementedError(f"Variable `{variable}` not supported.") + # Extract relevant columns df = df[cols] df.replace(variable_meta[variable]["NaN_value"], np.NaN, inplace=True) @@ -133,6 
+137,7 @@ def convert_ahccd_fwf_file( metadata = metadata.drop_vars(["stnid", "station_name"]) ds_out[f"{variable}_flag"].attrs["long_name"] = variable_meta[variable]["long_name"] + ds_out["lon"] = metadata["long"] ds_out.lon.attrs["units"] = "degrees_east" ds_out.lon.attrs["axis"] = "X" @@ -140,9 +145,9 @@ def convert_ahccd_fwf_file( ds_out.lat.attrs["units"] = "degrees_north" ds_out.lat.attrs["axis"] = "Y" ds_out["elev"] = metadata["elev"] - ds_out.elev.attrs["units"] = "m" + ds_out.elev.attrs["units"] = "meters" + ds_out.elev.attrs["positive"] = "up" ds_out.elev.attrs["axis"] = "Z" - metadata = metadata.drop_vars(["long", "lat", "elev"]) for vv in metadata.data_vars: if metadata[vv].dtype == "O" and (variable not in vv): @@ -176,12 +181,14 @@ def convert_ahccd( ------- None """ + configuration = load_json_data_mappings("eccc-ahccd") + output_dir = Path(output_dir).resolve().joinpath(variable) output_dir.mkdir(parents=True, exist_ok=True) - code = find_project_variable_codes(variable, "eccc-homogenized") - var_meta, global_attrs = eccc_variable_metadata( - code, "eccc-homogenized", generation + code = find_project_variable_codes(variable, configuration) + variable_meta, global_attrs = eccc_variable_metadata( + code, "eccc-ahccd", generation, configuration ) ( column_names, @@ -257,8 +264,10 @@ def merge_ahccd( overwrite: bool = False, ) -> None: """Merge Adjusted and Homogenized Canadian Climate Dataset files.""" + configuration = load_json_data_mappings("eccc-ahccd") + if variable: - code = find_project_variable_codes(variable, "eccc-homogenized") + code = find_project_variable_codes(variable, configuration) glob_pattern = f"{code}*.nc" output_dir = Path(output_dir).resolve().joinpath(variable) else: @@ -284,7 +293,7 @@ def merge_ahccd( if ds_ahccd[v].dtype == "O" and "flag" not in v: ds_ahccd[v] = ds_ahccd[v].astype(str) try: - variables_found.add(find_project_variable_codes(str(v), "eccc-homogenized")) + variables_found.add(find_project_variable_codes(str(v), 
configuration)) except NotImplementedError: pass diff --git a/miranda/preprocess/_eccc_obs.py b/miranda/preprocess/_eccc_obs.py index 5ca34dc4..e48cc756 100644 --- a/miranda/preprocess/_eccc_obs.py +++ b/miranda/preprocess/_eccc_obs.py @@ -22,12 +22,9 @@ from xclim.core.units import convert_units_to from miranda.archive import group_by_length -from miranda.preprocess._data_definitions import ( - find_project_variable_codes, - load_json_data_mappings, -) from miranda.preprocess._metadata import eccc_variable_metadata, obs_column_definitions from miranda.scripting import LOGGING_CONFIG +from miranda.treatments import find_project_variable_codes, load_json_data_mappings from miranda.vocabularies.eccc import obs_vocabularies config.dictConfig(LOGGING_CONFIG) @@ -59,7 +56,6 @@ def convert_observation( overwrite: bool = False, ): """Convert a single station's data from the fixed-width format to a netCDF file.""" - output_dir = Path(output_dir).resolve().joinpath(variable) output_dir.mkdir(parents=True, exist_ok=True) diff --git a/miranda/preprocess/_metadata.py b/miranda/preprocess/_metadata.py index a663c25a..9fb53af3 100644 --- a/miranda/preprocess/_metadata.py +++ b/miranda/preprocess/_metadata.py @@ -4,10 +4,8 @@ from typing import Any from miranda import __version__ as __miranda_version__ -from miranda.preprocess._data_definitions import ( - find_project_variable_codes, - load_json_data_mappings, -) +from miranda.treatments import find_project_variable_codes +from miranda.treatments.utils import load_json_data_mappings __all__ = [ "eccc_variable_metadata", @@ -17,29 +15,34 @@ def eccc_variable_metadata( - variable_code: str, project: str, gen: int | None = None + variable_code: str, + project: str, + generation: int | None = None, + metadata: dict | None = None, ) -> (dict[str, int | float | str], dict, list[tuple[int, int]], int): """ Parameters ---------- variable_code: str - project: {"eccc-homogenized", "eccc-obs", "eccc-obs-summary"} - gen: {1, 2, 3}, optional 
+ project: {"eccc-ahccd", "eccc-obs", "eccc-obs-summary"} + generation: {1, 2, 3}, optional + metadata: dict, optional Returns ------- dict[str, int or str or float], dict, list[tuple[int, int]], int """ - if project == "eccc-homogenized": - generation = {1: "First", 2: "Second", 3: "Third"}.get(gen) + if project == "eccc-ahccd": + generation = {1: "First", 2: "Second", 3: "Third"}.get(generation) if not generation: - raise NotImplementedError(f"Generation '{gen}' not supported") + raise NotImplementedError(f"Generation '{generation}' not supported") else: generation = None - metadata = load_json_data_mappings(project) - code = find_project_variable_codes(variable_code, project) + if not metadata: + metadata = load_json_data_mappings(project) + code = find_project_variable_codes(variable_code, metadata) # Variable metadata variable_meta = metadata["variables"].get(code) @@ -95,6 +98,16 @@ def eccc_variable_metadata( def homogenized_column_definitions( variable_code: str, ) -> tuple[dict, list[tuple[int, int]], dict[str, type[str | int | float] | Any], int]: + """Return the column names, widths, and data types for the AHCCD fixed-width format data. 
+ + Parameters + ---------- + variable_code : str + + Returns + ------- + tuple[dict, list[tuple[int, int]], dict[str, type[str | int | float] | Any], int] + """ metadata = load_json_data_mappings("eccc-homogenized") variable = metadata["variables"][variable_code]["_variable_name"] diff --git a/miranda/preprocess/configs/eccc-ahccd_attrs.json b/miranda/preprocess/configs/eccc-ahccd_attrs.json index a56c5b51..3de37b07 100644 --- a/miranda/preprocess/configs/eccc-ahccd_attrs.json +++ b/miranda/preprocess/configs/eccc-ahccd_attrs.json @@ -34,6 +34,25 @@ "table_id": "ECCC", "type": "station-obs" }, + "dimensions:": { + "lat": { + "axis": "Y", + "long_name": "Latitude", + "standard_name": "latitude", + "units": "degrees_north" + }, + "long": { + "axis": "X", + "long_name": "Longitude", + "standard_name": "longitude", + "units": "degrees_east" + }, + "time": { + "axis": "T", + "long_name": "Time", + "standard_name": "time" + } + }, "variables": { "dm": { "NaN_value": -9999.9, diff --git a/miranda/preprocess/eccc.py b/miranda/preprocess/eccc.py index 8b40b848..30648057 100644 --- a/miranda/preprocess/eccc.py +++ b/miranda/preprocess/eccc.py @@ -26,7 +26,7 @@ def _run_func_on_archive_with_optional_dask( function: Callable, errored_files: list[Path], **dask_kwargs, -): +) -> None: r"""Run a function on a file archive, extracting it if necessary. 
Notes @@ -50,9 +50,8 @@ def _run_func_on_archive_with_optional_dask( Returns ------- - + None """ - with tempfile.TemporaryDirectory() as temp_folder: if file.suffix in [".gz", ".tar", ".zip", ".7z"]: data_files = generic_extract_archive(file, output_dir=temp_folder) diff --git a/miranda/treatments/__init__.py b/miranda/treatments/__init__.py index 57319980..11e62fd2 100644 --- a/miranda/treatments/__init__.py +++ b/miranda/treatments/__init__.py @@ -2,4 +2,117 @@ from __future__ import annotations -from miranda.treatments._treatments import * +import datetime +import logging.config + +import xarray as xr + +from miranda import __version__ as __miranda_version__ +from miranda.scripting import LOGGING_CONFIG +from miranda.treatments._dimensions import * +from miranda.treatments._preprocessing import * +from miranda.treatments._variables import * +from miranda.treatments.utils import * +from miranda.units import get_time_frequency + +logging.config.dictConfig(LOGGING_CONFIG) +VERSION = datetime.datetime.now().strftime("%Y.%m.%d") + + +def metadata_conversion(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: + """Update xarray dataset and data_vars with project-specific metadata fields. + + Parameters + ---------- + d : xarray.Dataset + Dataset with metadata to be updated. + p : str + Dataset project name. + m : dict + Metadata definition dictionary for project and variable(s). + + Returns + ------- + xarray.Dataset + """ + logging.info("Converting metadata to CF-like conventions.") + + header = m["Header"] + + # Static handling of version global attributes + miranda_version = header.get("_miranda_version") + if miranda_version: + if isinstance(miranda_version, bool): + header["miranda_version"] = __miranda_version__ + elif isinstance(miranda_version, dict): + if p in miranda_version.keys(): + header["miranda_version"] = __miranda_version__ + else: + logging.warning( + f"`_miranda_version` not set for project `{p}`. Not appending." 
+ ) + if "_miranda_version" in header: + del header["_miranda_version"] + + frequency = m["Header"].get("_frequency") + if frequency: + if isinstance(frequency, bool): + _, m["Header"]["frequency"] = get_time_frequency(d) + elif isinstance(frequency, dict): + if p in frequency.keys(): + m["Header"]["frequency"] = get_time_frequency(d) + else: + logging.warning("`frequency` not set for project. Not appending.") + if "_frequency" in m["Header"]: + del m["Header"]["_frequency"] + + # Conditional handling of global attributes based on project name + for field in [f for f in header if f.startswith("_")]: + if isinstance(header[field], list): + if p in header[field]: + attr_treatment = header[field][p] + else: + logging.warning( + f"Attribute handling (`{field}`) not set for project `{p}`. Continuing..." + ) + continue + elif isinstance(header[field], dict): + attr_treatment = header[field] + else: + raise AttributeError( + f"Attribute treatment configuration for field `{field}` is not properly configured. Verify JSON." + ) + + if field == "_map_attrs": + for attribute, mapping in attr_treatment.items(): + header[mapping] = d.attrs[attribute] + del d.attrs[attribute] + elif field == "_remove_attrs": + for ff in attr_treatment: + del d.attrs[ff] + else: + if field[1:] in d.attrs: + logging.warning( + f"Overwriting `{field[1:]}` based on JSON configuration." 
+ ) + header[field[1:]] = attr_treatment + + del header[field] + + # Add global attributes + d.attrs.update(header) + d.attrs.update(dict(project=p)) + + # Date-based versioning + if not d.attrs.get("version"): + d.attrs.update(dict(version=f"v{VERSION}")) + + prev_history = d.attrs.get("history", "") + history = ( + f"[{datetime.datetime.now()}] " + "Converted variables and modified metadata for CF-like compliance: " + f"{prev_history}".strip() + ) + d.attrs.update(dict(history=history)) + + return d diff --git a/miranda/treatments/_dimensions.py b/miranda/treatments/_dimensions.py new file mode 100644 index 00000000..cd56e243 --- /dev/null +++ b/miranda/treatments/_dimensions.py @@ -0,0 +1,243 @@ +from __future__ import annotations + +import logging +import warnings +from typing import Any + +import numpy as np +import xarray as xr +from xclim.core.calendar import parse_offset + +from miranda.treatments.utils import _get_section_entry_key, _iter_entry_key # noqa +from miranda.units import get_time_frequency + + +def find_project_variable_codes(code: str, configuration: dict[str, Any]) -> str: + """Find the variable code for a given variable name and project. + + Parameters + ---------- + code : str + Variable name. + configuration : dict + Configuration dictionary. + + Returns + ------- + str + """ + variable_codes = {} + for variable_code in configuration["variables"]: + variable_name = configuration["variables"][variable_code].get("_variable_name") + if variable_name: + variable_codes[variable_name] = variable_code + else: + warnings.warn( + f"Variable `{variable_code}` does not have accompanying `variable_name`. " + f"Verify JSON. Continuing with `{variable_code}` as `variable_name`." 
+ ) + variable_codes[variable_code] = variable_code + + if code in variable_codes.values(): + variable = code + else: + variable = variable_codes.get(code) + if not variable: + raise NotImplementedError(f"Variable `{code}` not supported.") + + return variable + + +def dimensions_compliance(ds: xr.Dataset, project: str, metadata: dict) -> xr.Dataset: + """Rename dimensions to CF to their equivalents and reorder them if needed. + + Parameters + ---------- + ds : xarray.Dataset + Dataset with dimensions to be updated. + project : str + Dataset project name. + metadata : dict + Metadata definition dictionary for project and variable(s). + + Returns + ------- + xarray.Dataset + """ + rename_dims = dict() + for dim in ds.dims: + if dim in metadata["dimensions"].keys(): + cf_name = _get_section_entry_key( + metadata, "dimensions", dim, "_cf_dimension_name", project + ) + if cf_name: + rename_dims[dim] = cf_name + + # Rename dimensions + logging.info(f"Renaming dimensions: {', '.join(rename_dims.keys())}.") + ds = ds.rename(rename_dims) + for new in ["lon", "lat"]: + if new == "lon" and "lon" in ds.coords: + if np.any(ds.lon > 180): + lon1 = ds.lon.where(ds.lon <= 180.0, ds.lon - 360.0) + ds[new] = lon1 + + coord_precision = _get_section_entry_key( + metadata, "dimensions", new, "_precision", project + ) + if coord_precision is not None: + ds[new] = ds[new].round(coord_precision) + + # Ensure that lon and lat are written in proper order for plotting purposes + logging.info("Reordering dimensions.") + transpose_order = [] + if "lat" in ds.dims and "lon" in ds.dims: + transpose_order = ["lat", "lon"] + elif "rlat" in ds.dims and "rlon" in ds.dims: + transpose_order = ["rlat", "rlon"] + if "time" in ds.dims and transpose_order: + transpose_order.insert(0, "time") + transpose_order.extend(list(set(ds.dims) - set(transpose_order))) + ds = ds.transpose(*transpose_order) + ds = ds.sortby(transpose_order) + + # Add dimension original name and update attrs + logging.info("Updating 
dimension attributes.") + dim_descriptions = metadata["dimensions"] + for dim in metadata["dimensions"].keys(): + cf_name = dim_descriptions[dim].get("_cf_dimension_name") + if cf_name is not None and cf_name in ds.dims: + ds[cf_name].attrs.update(dict(original_variable=dim)) + else: + # variable name already follows CF standards + cf_name = dim + for field in dim_descriptions[dim].keys(): + if not field.startswith("_"): + ds[cf_name].attrs.update({field: dim_descriptions[dim][field]}) + + prev_history = ds.attrs.get("history", "") + history = f"Transposed and renamed dimensions. {prev_history}" + ds.attrs.update(dict(history=history)) + + return ds + + +def ensure_correct_time_frequency(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: + """Ensure that time frequency is consistent with expected frequency for project.""" + key = "_ensure_correct_time" + strict_time = "_strict_time" + + if "time" not in m["dimensions"].keys(): + warnings.warn(f"No time corrections listed for project `{p}`. Continuing...") + return d + + if "time" not in list(d.variables.keys()): + logging.info( + "No time dimension among data variables: " + f"{' ,'.join([str(v) for v in d.variables.keys()])}. " + "Continuing..." + ) + return d + + if key in m["dimensions"]["time"].keys(): + freq_found = xr.infer_freq(d.time) + if strict_time in m["dimensions"]["time"].keys(): + if not freq_found: + msg = ( + "Time frequency could not be found. There may be missing timesteps." 
+ ) + if m["dimensions"]["time"].get(strict_time): + raise ValueError(msg) + else: + warnings.warn(f"{msg} Continuing...") + return d + + correct_time_entry = m["dimensions"]["time"][key] + if isinstance(correct_time_entry, str): + correct_times = [parse_offset(correct_time_entry)[1]] + elif isinstance(correct_time_entry, dict): + correct_times = correct_time_entry.get(p) + if isinstance(correct_times, list): + correct_times = [parse_offset(t)[1] for t in correct_times] + if correct_times is None: + warnings.warn(f"No expected times set for specified project `{p}`.") + elif isinstance(correct_time_entry, list): + correct_times = correct_time_entry + else: + warnings.warn("No expected times set for family of projects.") + return d + + if freq_found not in correct_times: + error_msg = ( + f"Time frequency {freq_found} not among allowed frequencies: " + f"{', '.join(correct_times) if isinstance(correct_times, list) else correct_times}" + ) + if isinstance(correct_time_entry, dict): + error_msg = f"{error_msg} for project `{p}`." + else: + error_msg = f"{error_msg}." + raise ValueError(error_msg) + + logging.info(f"Resampling dataset with time frequency: {freq_found}.") + with xr.set_options(keep_attrs=True): + d_out = d.assign_coords( + time=d.time.resample(time=freq_found).mean(dim="time").time + ) + d_out.time.attrs.update(d.time.attrs) + + prev_history = d.attrs.get("history", "") + history = f"Resampled time with `freq={freq_found}`. 
{prev_history}" + d_out.attrs.update(dict(history=history)) + return d_out + + return d + + +def offset_time_dimension(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: + """Offset time dimension using listed frequency.""" + key = "_offset_time" + d_out = xr.Dataset(coords=d.coords, attrs=d.attrs) + converted = [] + offset, offset_meaning = None, None + + time_freq = dict() + expected_period = _get_section_entry_key( + m, "dimensions", "time", "_ensure_correct_time", p + ) + if isinstance(expected_period, str): + time_freq["expected_period"] = expected_period + + for vv, offs in _iter_entry_key(d, m, "dimensions", key, p): + if offs: + # Offset time by value of one time-step + if offset is None and offset_meaning is None: + try: + offset, offset_meaning = get_time_frequency(d, **time_freq) + except TypeError: + logging.error( + "Unable to parse the time frequency. Verify data integrity before retrying." + ) + raise + + logging.info( + f"Offsetting data for `{vv}` by `{offset[0]} {offset_meaning}(s)`." + ) + with xr.set_options(keep_attrs=True): + out = d[vv] + out["time"] = out.time - np.timedelta64(offset[0], offset[1]) + d_out[vv] = out + converted.append(vv) + elif offs is False: + logging.info( + f"No time offsetting needed for `{vv}` in `{p}` (Explicitly set to False)." + ) + continue + prev_history = d.attrs.get("history", "") + history = f"Offset variable `{vv}` values by `{offset[0]} {offset_meaning}(s). 
{prev_history}" + d_out.attrs.update(dict(history=history)) + + # Copy unconverted variables + for vv in d.data_vars: + if vv not in converted: + d_out[vv] = d[vv] + return d_out diff --git a/miranda/treatments/_preprocessing.py b/miranda/treatments/_preprocessing.py new file mode 100644 index 00000000..7d411d6b --- /dev/null +++ b/miranda/treatments/_preprocessing.py @@ -0,0 +1,111 @@ +from __future__ import annotations + +from functools import partial +from pathlib import Path +from typing import Any, Dict + +import numpy as np +import xarray as xr + +from miranda.convert.utils import date_parser + + +def correct_time_entries( + ds: xr.Dataset, + split: str = "_", + location: int = -1, + field: str = "time", +) -> xr.Dataset: + """Correct time entries in dataset. + + Parameters + ---------- + ds : xarray.Dataset + split : str + location : int + field : str + + Returns + ------- + xarray.Dataset + """ + filename = ds.encoding["source"] + date = date_parser(Path(filename).stem.split(split)[location]) + vals = np.arange(len(ds[field])) + days_since = f"days since {date}" + time = xr.coding.times.decode_cf_datetime( + vals, units=days_since, calendar="standard" + ) + ds = ds.assign_coords({field: time}) + + prev_history = ds.attrs.get("history", "") + history = ( + f"Time index recalculated in preprocessing step ({days_since}). {prev_history}" + ) + ds.attrs.update(dict(history=history)) + + return ds + + +def correct_var_names( + ds: xr.Dataset, split: str = "_", location: int = 0 +) -> xr.Dataset: + """Correct variable names in dataset. + + Parameters + ---------- + ds : xarray.Dataset + split : str + location : int + + Returns + ------- + xarray.Dataset + """ + filename = ds.encoding["source"] + new_name = Path(filename).stem.split(split)[location] + old_name = list(ds.data_vars.keys())[0] + + prev_history = ds.attrs.get("history", "") + history = f"Variable renamed in preprocessing step ({old_name}: {new_name}). 
{prev_history}" + ds.attrs.update(dict(history=history)) + + return ds.rename({old_name: new_name}) + + +def preprocessing_corrections( + ds: xr.Dataset, configuration: dict[str, Any] +) -> xr.Dataset: + """Corrections function dispatcher to ensure minimal dataset validity on open. + + Parameters + ---------- + ds : xarray.Dataset + configuration : dict + + Returns + ------- + xarray.Dataset + """ + + def _preprocess_correct(d: xr.Dataset, *, ops: list[partial]) -> xr.Dataset: + for correction in ops: + d = correction(d) + return d + + correction_fields = configuration.get("_preprocess") + if correction_fields: + preprocess_ops = [] + for field in correction_fields: + if field == "_variable_name": + preprocess_ops.append( + partial(correct_var_names, **correction_fields[field]) + ) + if field == "_time": + preprocess_ops.append( + partial(correct_time_entries, **correction_fields[field]) + ) + if preprocess_ops: + corrector = partial(_preprocess_correct, ops=preprocess_ops) + return corrector(ds) + return ds diff --git a/miranda/treatments/_treatments.py b/miranda/treatments/_treatments.py deleted file mode 100644 index d7e1cd99..00000000 --- a/miranda/treatments/_treatments.py +++ /dev/null @@ -1,854 +0,0 @@ -from __future__ import annotations - -import datetime -import inspect -import json -import logging.config -import os -import warnings -from functools import partial -from pathlib import Path -from typing import Any, Dict - -import numpy as np -import xarray as xr -import xclim.core.units -from xarray.coding import times -from xclim.core import units -from xclim.core.calendar import parse_offset - -from miranda import __version__ as __miranda_version__ -from miranda.convert.utils import date_parser -from miranda.scripting import LOGGING_CONFIG -from miranda.units import get_time_frequency - -logging.config.dictConfig(LOGGING_CONFIG) - -VERSION = datetime.datetime.now().strftime("%Y.%m.%d") - -__all__ = [ - "cf_units_conversion", - "clip_values", - 
"conservative_regrid", - "correct_unit_names", - "dimensions_compliance", - "ensure_correct_time_frequency", - "invert_value_sign", - "load_json_data_mappings", - "metadata_conversion", - "offset_time_dimension", - "preprocessing_corrections", - "threshold_mask", - "transform_values", - "variable_conversion", -] - - -def load_json_data_mappings( - project: str, configurations: dict[str, Path] | None = None -) -> dict[str, Any]: - """Load JSON mappings for supported dataset conversions. - - Parameters - ---------- - project : str - configurations: dict, optional - - Returns - ------- - dict[str, Any] - """ - if configurations is None: - calling_frame = inspect.currentframe().f_back - calling_file_path = calling_frame.f_globals["__file__"] - config_folder = Path(calling_file_path).parent / "configs" - - configurations = {} - for configuration in config_folder.glob("*attrs.json"): - project = str(configuration.stem).split("_")[0] - if "|" in project: - for p in project.split("|"): - configurations[p] = configuration - configurations[project] = configuration - - if project in configurations.keys(): - config_file = configurations[project] - metadata_definition = json.load(config_file.open()) - return metadata_definition - else: - raise NotImplementedError(f"Project not supported: {project}") - - -def _get_section_entry_key(meta, entry, var, key, project): - var_meta = meta[entry].get(var, {}) - if key in var_meta: - if isinstance(var_meta[key], dict): - config = var_meta[key].get(project) - if config is None and "all" in var_meta[key].keys(): - config = var_meta[key].get("all") - return config - return var_meta[key] - return None - - -def _iter_entry_key(ds, meta, entry, key, project): - for vv in set(ds.data_vars).intersection(meta[entry]): - val = _get_section_entry_key(meta, entry, vv, key, project) - yield vv, val - - -def _simple_fix_dims(d: xr.Dataset | xr.DataArray) -> xr.Dataset | xr.DataArray: - """Adjust dimensions found in a file so that it can be used for 
regridding purposes.""" - if "lon" not in d.dims or "lat" not in d.dims: - dim_rename = dict() - for dim in d.dims: - if str(dim).lower().startswith("lon"): - dim_rename[str(dim)] = "lon" - if str(dim).lower().startswith("lat"): - dim_rename[str(dim)] = "lat" - d = d.rename(dim_rename) - if np.any(d.lon > 180): - lon_wrapped = d.lon.where(d.lon <= 180.0, d.lon - 360.0) - d["lon"] = lon_wrapped - d = d.sortby(["lon"]) - - if "time" in d.dims: - d = d.isel(time=0, drop=True) - - return d - - -def conservative_regrid( - ds: xr.DataArray | xr.Dataset, ref_grid: xr.DataArray | xr.Dataset -) -> xr.DataArray | xr.Dataset: - """Perform a conservative_normed regridding""" - try: - import xesmf as xe # noqa - except ModuleNotFoundError: - raise ModuleNotFoundError( - "This function requires the `xesmf` library which is not installed. " - "Regridding step will be skipped." - ) - - ref_grid = _simple_fix_dims(ref_grid) - method = "conservative_normed" - - logging.info( - f"Performing regridding and masking with `xesmf` using method: {method}." - ) - - regridder = xe.Regridder(ds, ref_grid, method, periodic=False) - ds = regridder(ds) - - ds.attrs["history"] = ( - f"{datetime.datetime.now()}:" - f"Regridded dataset using xesmf with method: {method}. " - f"{ds.attrs.get('history')}".strip() - ) - return ds - - -def threshold_mask( - ds: xr.Dataset | xr.DataArray, - *, - mask: xr.Dataset | xr.DataArray, - mask_cutoff: float | bool = False, -) -> xr.Dataset | xr.DataArray: - """Land-Sea mask operations. - - Parameters - ---------- - ds : xr.Dataset or str or os.PathLike - mask : xr.Dataset or xr.DataArray - mask_cutoff : float or bool - - Returns - ------- - xr.Dataset or xr.DataArray - """ - mask = _simple_fix_dims(mask) - - if isinstance(mask, xr.Dataset): - if len(mask.data_vars) == 1: - mask_variable = list(mask.data_vars)[0] - mask = mask[mask_variable] - else: - raise ValueError( - "More than one data variable found in land-sea mask. Supply a DataArray instead." 
- ) - else: - mask_variable = mask.name - - try: - from clisops.core import subset_bbox # noqa - - log_msg = f"Masking dataset with {mask_variable}." - if mask_cutoff: - log_msg = f"{log_msg.strip('.')} at `{mask_cutoff}` cutoff value." - logging.info(log_msg) - - lon_bounds = np.array([ds.lon.min(), ds.lon.max()]) - lat_bounds = np.array([ds.lat.min(), ds.lat.max()]) - - mask_subset = subset_bbox( - mask, - lon_bnds=lon_bounds, - lat_bnds=lat_bounds, - ).load() - except ModuleNotFoundError: - log_msg = ( - "This function requires the `clisops` library which is not installed. " - "subsetting step will be skipped." - ) - warnings.warn(log_msg) - mask_subset = mask.load() - - if mask_subset.dtype == bool: - if mask_cutoff: - logging.warning("Mask value cutoff set for boolean mask. Ignoring.") - mask_subset = mask_subset.where(mask) - else: - mask_subset = mask_subset.where(mask >= mask_cutoff) - ds = ds.where(mask_subset.notnull()) - - if mask_subset.min() >= 0: - if mask_subset.max() <= 1.00000001: - cutoff_info = f"{mask_cutoff * 100} %" - elif mask_subset.max() <= 100.00000001: - cutoff_info = f"{mask_cutoff} %" - else: - cutoff_info = f"{mask_cutoff}" - else: - cutoff_info = f"{mask_cutoff}" - ds.attrs["mask_cutoff"] = cutoff_info - - prev_history = ds.attrs.get("history", "") - history_msg = f"Mask calculated using `{mask_variable}`." - if mask_cutoff: - history_msg = f"{history_msg.strip('.')} with cutoff value `{cutoff_info}`." 
- history = f"{history_msg} {prev_history}".strip() - ds.attrs.update(dict(history=history)) - - return ds - - -def correct_time_entries( - d: xr.Dataset, - split: str = "_", - location: int = -1, - field: str = "time", -) -> xr.Dataset: - filename = d.encoding["source"] - date = date_parser(Path(filename).stem.split(split)[location]) - vals = np.arange(len(d[field])) - days_since = f"days since {date}" - time = xr.coding.times.decode_cf_datetime( - vals, units=days_since, calendar="standard" - ) - d = d.assign_coords({field: time}) - - prev_history = d.attrs.get("history", "") - history = ( - f"Time index recalculated in preprocessing step ({days_since}). {prev_history}" - ) - d.attrs.update(dict(history=history)) - - return d - - -def correct_var_names(d: xr.Dataset, split: str = "_", location: int = 0) -> xr.Dataset: - """ - - Parameters - ---------- - d : xarray.Dataset - split : str - location : int - - Returns - ------- - xarray.Dataset - """ - filename = d.encoding["source"] - new_name = Path(filename).stem.split(split)[location] - old_name = list(d.data_vars.keys())[0] - - prev_history = d.attrs.get("history", "") - history = f"Variable renamed in preprocessing step ({old_name}: {new_name}). {prev_history}" - d.attrs.update(dict(history=history)) - - return d.rename({old_name: new_name}) - - -def preprocessing_corrections(ds: xr.Dataset, project: str) -> xr.Dataset: - """Corrections function dispatcher to ensure minimal dataset validity on open. 
- - Parameters - ---------- - ds : xarray.Dataset - project : str - - Returns - ------- - xarray.Dataset - """ - - def _preprocess_correct(d: xr.Dataset, *, ops: list[partial]) -> xr.Dataset: - for correction in ops: - d = correction(d) - return d - - correction_fields = load_json_data_mappings(project).get("_preprocess") - if correction_fields: - preprocess_ops = [] - for field in correction_fields: - if field == "_variable_name": - preprocess_ops.append( - partial(correct_var_names, **correction_fields[field]) - ) - if field == "_time": - preprocess_ops.append( - partial(correct_time_entries, **correction_fields[field]) - ) - if preprocess_ops: - corrector = partial(_preprocess_correct, ops=preprocess_ops) - return corrector(ds) - return ds - - -def correct_unit_names(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: - """Correct unit names.""" - key = "_corrected_units" - for var, val in _iter_entry_key(d, m, "variables", key, p): - if val: - d[var].attrs["units"] = val - - # FIXME: This is no longer relevant. Performed under dimension conversion step. 
- val_time = _get_section_entry_key(m, "variables", "time", key, p) - if val_time: - d["time"].attrs["units"] = val_time - - return d - - -# for de-accumulation or conversion to flux -def transform_values(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: - """Transform dataset values according to operation listed.""" - key = "_transformation" - d_out = xr.Dataset(coords=d.coords, attrs=d.attrs) - converted = [] - offset, offset_meaning = None, None - - time_freq = dict() - expected_period = _get_section_entry_key( - m, "dimensions", "time", "_ensure_correct_time", p - ) - if isinstance(expected_period, str): - time_freq["expected_period"] = expected_period - - for vv, trans in _iter_entry_key(d, m, "variables", key, p): - if trans: - if trans == "deaccumulate": - # Time-step accumulated total to time-based flux (de-accumulation) - if offset is None and offset_meaning is None: - try: - offset, offset_meaning = get_time_frequency(d, **time_freq) - except TypeError: - logging.error( - "Unable to parse the time frequency. Verify data integrity before retrying." - ) - raise - - logging.info(f"De-accumulating units for variable `{vv}`.") - with xr.set_options(keep_attrs=True): - out = d[vv].diff(dim="time") - out = d[vv].where( - getattr(d[vv].time.dt, offset_meaning) == offset[0], - out.broadcast_like(d[vv]), - ) - out = units.amount2rate(out, out_units=m["variables"][vv]["units"]) - d_out[vv] = out - converted.append(vv) - elif trans == "amount2rate": - # NOTE: This treatment is no longer needed in xclim v0.43.0+ but is kept for backwards compatibility - # frequency-based totals to time-based flux - logging.info( - f"Performing amount-to-rate units conversion for variable `{vv}`." 
- ) - with xr.set_options(keep_attrs=True): - out = units.amount2rate( - d[vv], - out_units=m["variables"][vv]["units"], - ) - d_out[vv] = out - converted.append(vv) - elif isinstance(trans, str): - if trans.startswith("op "): - op = trans[3] - value = trans[4:].strip() - if value.startswith("attrs"): - value = units.str2pint(d[vv].attrs[value[6:]]) - else: - value = units.str2pint(value) - with xr.set_options(keep_attrs=True): - if op == "+": - value = units.convert_units_to(value, d[vv]) - d_out[vv] = d[vv] + value - elif op == "-": - value = units.convert_units_to(value, d[vv]) - d_out[vv] = d[vv] - value - elif op == "*": - d_out[vv] = units.pint_multiply(d[vv], value) - elif op == "/": - d_out[vv] = units.pint_multiply(d[vv], 1 / value) - else: - raise NotImplementedError( - f"Op transform doesn't implement the «{op}» operator." - ) - converted.append(vv) - else: - raise NotImplementedError(f"Unknown transformation: {trans}") - elif trans is False: - logging.info( - f"No transformations needed for `{vv}` (Explicitly set to False)." - ) - continue - - prev_history = d.attrs.get("history", "") - history = ( - f"Transformed variable `{vv}` values using method `{trans}`. 
{prev_history}" - ) - d_out.attrs.update(dict(history=history)) - - # Copy unconverted variables - for vv in d.data_vars: - if vv not in converted: - d_out[vv] = d[vv] - return d_out - - -def offset_time_dimension(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: - """Offset time dimension using listed frequency.""" - key = "_offset_time" - d_out = xr.Dataset(coords=d.coords, attrs=d.attrs) - converted = [] - offset, offset_meaning = None, None - - time_freq = dict() - expected_period = _get_section_entry_key( - m, "dimensions", "time", "_ensure_correct_time", p - ) - if isinstance(expected_period, str): - time_freq["expected_period"] = expected_period - - for vv, offs in _iter_entry_key(d, m, "dimensions", key, p): - if offs: - # Offset time by value of one time-step - if offset is None and offset_meaning is None: - try: - offset, offset_meaning = get_time_frequency(d, **time_freq) - except TypeError: - logging.error( - "Unable to parse the time frequency. Verify data integrity before retrying." - ) - raise - - logging.info( - f"Offsetting data for `{vv}` by `{offset[0]} {offset_meaning}(s)`." - ) - with xr.set_options(keep_attrs=True): - out = d[vv] - out["time"] = out.time - np.timedelta64(offset[0], offset[1]) - d_out[vv] = out - converted.append(vv) - elif offs is False: - logging.info( - f"No time offsetting needed for `{vv}` in `{p}` (Explicitly set to False)." - ) - continue - prev_history = d.attrs.get("history", "") - history = f"Offset variable `{vv}` values by `{offset[0]} {offset_meaning}(s). 
{prev_history}" - d_out.attrs.update(dict(history=history)) - - # Copy unconverted variables - for vv in d.data_vars: - if vv not in converted: - d_out[vv] = d[vv] - return d_out - - -def invert_value_sign(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: - """Flip value of DataArray.""" - key = "_invert_sign" - d_out = xr.Dataset(coords=d.coords, attrs=d.attrs) - converted = [] - for vv, inv_sign in _iter_entry_key(d, m, "variables", key, p): - if inv_sign: - logging.info(f"Inverting sign for `{vv}` (switching direction of values).") - with xr.set_options(keep_attrs=True): - out = d[vv] - d_out[out.name] = -out - converted.append(vv) - elif inv_sign is False: - logging.info( - f"No sign inversion needed for `{vv}` in `{p}` (Explicitly set to False)." - ) - continue - prev_history = d.attrs.get("history", "") - history = f"Inverted sign for variable `{vv}` (switched direction of values). {prev_history}" - d_out.attrs.update(dict(history=history)) - - # Copy unconverted variables - for vv in d.data_vars: - if vv not in converted: - d_out[vv] = d[vv] - return d_out - - -# For converting variable units to standard workflow units -def cf_units_conversion(d: xr.Dataset, m: dict) -> xr.Dataset: - """Perform pint-based units-conversion.""" - if "time" in m["dimensions"].keys(): - if m["dimensions"]["time"].get("units"): - d["time"]["units"] = m["dimensions"]["time"]["units"] - - for vv, unit in _iter_entry_key(d, m, "variables", "units", None): - if unit: - with xr.set_options(keep_attrs=True): - d[vv] = units.convert_units_to(d[vv], unit, context="hydro") - prev_history = d.attrs.get("history", "") - history = f"Converted variable `{vv}` to CF-compliant units (`{unit}`). 
{prev_history}" - d.attrs.update(dict(history=history)) - - return d - - -# For clipping variable values to an established maximum/minimum -def clip_values(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: - """Clip values to an appropriate range,.""" - key = "_clip_values" - d_out = xr.Dataset(coords=d.coords, attrs=d.attrs) - converted = [] - for vv in d.data_vars: - if vv in m["variables"].keys(): - clip_values = _get_section_entry_key(m, "variables", vv, key, p) - if clip_values: - min_value, max_value = None, None - # Gather unit conversion context, if applicable - context = clip_values.get("context", None) - for op, value in clip_values.items(): - if op == "min": - min_value = xclim.core.units.convert_units_to( - value, d[vv], context - ) - if op == "max": - max_value = xclim.core.units.convert_units_to( - value, d[vv], context - ) - logging.info( - f"Clipping min/max values for `{vv}` ({min_value}/{max_value})." - ) - with xr.set_options(keep_attrs=True): - out = d[vv] - d_out[out.name] = out.clip(min_value, max_value) - converted.append(vv) - elif clip_values is False: - logging.info( - f"No clipping of values needed for `{vv}` in `{p}` (Explicitly set to False)." - ) - continue - else: - logging.info(f"No clipping of values needed for `{vv}` in `{p}`.") - continue - - prev_history = d.attrs.get("history", "") - history = f"Clipped variable `{vv}` with `min={min_value}` and `max={max_value}`. {prev_history}" - d_out.attrs.update(dict(history=history)) - - # Copy unconverted variables - for vv in d.data_vars: - if vv not in converted: - d_out[vv] = d[vv] - - return d_out - - -def ensure_correct_time_frequency(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: - """Ensure that time frequency is consistent with expected frequency for project.""" - key = "_ensure_correct_time" - strict_time = "_strict_time" - - if "time" not in m["dimensions"].keys(): - logging.warning(f"No time corrections listed for project `{p}`. 
Continuing...") - return d - - if "time" not in list(d.variables.keys()): - logging.info( - "No time dimension among data variables: " - f"{' ,'.join([str(v) for v in d.variables.keys()])}. " - "Continuing..." - ) - return d - - if key in m["dimensions"]["time"].keys(): - freq_found = xr.infer_freq(d.time) - if strict_time in m["dimensions"]["time"].keys(): - if not freq_found: - msg = ( - "Time frequency could not be found. There may be missing timesteps." - ) - if m["dimensions"]["time"].get(strict_time): - raise ValueError(msg) - else: - logging.warning(f"{msg} Continuing...") - return d - - correct_time_entry = m["dimensions"]["time"][key] - if isinstance(correct_time_entry, str): - correct_times = [parse_offset(correct_time_entry)[1]] - elif isinstance(correct_time_entry, dict): - correct_times = correct_time_entry.get(p) - if isinstance(correct_times, list): - correct_times = [parse_offset(t)[1] for t in correct_times] - if correct_times is None: - logging.warning(f"No expected times set for specified project `{p}`.") - elif isinstance(correct_time_entry, list): - correct_times = correct_time_entry - else: - logging.warning("No expected times set for family of projects.") - return d - - if freq_found not in correct_times: - error_msg = ( - f"Time frequency {freq_found} not among allowed frequencies: " - f"{', '.join(correct_times) if isinstance(correct_times, list) else correct_times}" - ) - if isinstance(correct_time_entry, dict): - error_msg = f"{error_msg} for project `{p}`." - else: - error_msg = f"{error_msg}." - raise ValueError(error_msg) - - logging.info(f"Resampling dataset with time frequency: {freq_found}.") - with xr.set_options(keep_attrs=True): - d_out = d.assign_coords( - time=d.time.resample(time=freq_found).mean(dim="time").time - ) - d_out.time.attrs.update(d.time.attrs) - - prev_history = d.attrs.get("history", "") - history = f"Resampled time with `freq={freq_found}`. 
{prev_history}" - d_out.attrs.update(dict(history=history)) - return d_out - - return d - - -# For renaming and reordering lat and lon dims -def dimensions_compliance(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: - """Rename dimensions to CF to their equivalents and reorder them if needed. - - Parameters - ---------- - d : xarray.Dataset - Dataset with dimensions to be updated. - p : str - Dataset project name. - m : dict - Metadata definition dictionary for project and variable(s). - - Returns - ------- - xarray.Dataset - """ - rename_dims = dict() - for dim in d.dims: - if dim in m["dimensions"].keys(): - cf_name = _get_section_entry_key( - m, "dimensions", dim, "_cf_dimension_name", p - ) - if cf_name: - rename_dims[dim] = cf_name - d = d.rename(rename_dims) - for new in ["lon", "lat"]: - if new == "lon" and "lon" in d.coords: - if np.any(d.lon > 180): - lon1 = d.lon.where(d.lon <= 180.0, d.lon - 360.0) - d[new] = lon1 - - coord_precision = _get_section_entry_key(m, "dimensions", new, "_precision", p) - if coord_precision is not None: - d[new] = d[new].round(coord_precision) - - # Ensure that lon and lat are written in proper order for plotting purposes - transpose_order = [] - if "lat" in d.dims and "lon" in d.dims: - transpose_order = ["lat", "lon"] - elif "rlat" in d.dims and "rlon" in d.dims: - transpose_order = ["rlat", "rlon"] - if "time" in d.dims and transpose_order: - transpose_order.insert(0, "time") - transpose_order.extend(list(set(d.dims) - set(transpose_order))) - d = d.transpose(*transpose_order) - d = d.sortby(transpose_order) - - # Add dimension original name and update attrs - dim_descriptions = m["dimensions"] - for dim in m["dimensions"].keys(): - cf_name = dim_descriptions[dim].get("_cf_dimension_name") - if cf_name is not None and cf_name in d.dims: - d[cf_name].attrs.update(dict(original_variable=dim)) - else: - # variable name already follows CF standards - cf_name = dim - for field in dim_descriptions[dim].keys(): - if not 
field.startswith("_"): - d[cf_name].attrs.update({field: dim_descriptions[dim][field]}) - - prev_history = d.attrs.get("history", "") - history = f"Transposed and renamed dimensions. {prev_history}" - d.attrs.update(dict(history=history)) - - return d - - -def variable_conversion(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: - """Add variable metadata and remove nonstandard entries. - - Parameters - ---------- - d : xarray.Dataset - Dataset with variable(s) to be updated. - p : str - Dataset project name. - m : dict - Metadata definition dictionary for project and variable(s). - - Returns - ------- - xarray.Dataset - """ - var_descriptions = m["variables"] - var_correction_fields = [ - "_clip_values", - "_corrected_units", - "_invert_sign", - "_offset_time", - "_transformation", - ] - for var in d.variables: - if var in var_descriptions.keys(): - for field in var_correction_fields: - if field in var_descriptions[var].keys(): - del var_descriptions[var][field] - d[var].attrs.update(var_descriptions[var]) - - # Rename data variables - for orig_var_name, cf_name in _iter_entry_key( - d, m, "variables", "_cf_variable_name", None - ): - if cf_name is not None: - d = d.rename({orig_var_name: cf_name}) - d[cf_name].attrs.update(dict(original_variable=orig_var_name)) - del d[cf_name].attrs["_cf_variable_name"] - - return d - - -def metadata_conversion(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: - """Update xarray dataset and data_vars with project-specific metadata fields. - - Parameters - ---------- - d : xarray.Dataset - Dataset with metadata to be updated. - p : str - Dataset project name. - m : dict - Metadata definition dictionary for project and variable(s). 
- - Returns - ------- - xarray.Dataset - """ - logging.info("Converting metadata to CF-like conventions.") - - header = m["Header"] - - # Static handling of version global attributes - miranda_version = header.get("_miranda_version") - if miranda_version: - if isinstance(miranda_version, bool): - header["miranda_version"] = __miranda_version__ - elif isinstance(miranda_version, dict): - if p in miranda_version.keys(): - header["miranda_version"] = __miranda_version__ - else: - logging.warning( - f"`_miranda_version` not set for project `{p}`. Not appending." - ) - if "_miranda_version" in header: - del header["_miranda_version"] - - frequency = m["Header"].get("_frequency") - if frequency: - if isinstance(frequency, bool): - _, m["Header"]["frequency"] = get_time_frequency(d) - elif isinstance(frequency, dict): - if p in frequency.keys(): - m["Header"]["frequency"] = get_time_frequency(d) - else: - logging.warning("`frequency` not set for project. Not appending.") - if "_frequency" in m["Header"]: - del m["Header"]["_frequency"] - - # Conditional handling of global attributes based on project name - for field in [f for f in header if f.startswith("_")]: - if isinstance(header[field], list): - if p in header[field]: - attr_treatment = header[field][p] - else: - logging.warning( - f"Attribute handling (`{field}`) not set for project `{p}`. Continuing..." - ) - continue - elif isinstance(header[field], dict): - attr_treatment = header[field] - else: - raise AttributeError( - f"Attribute treatment configuration for field `{field}` is not properly configured. Verify JSON." - ) - - if field == "_map_attrs": - for attribute, mapping in attr_treatment.items(): - header[mapping] = d.attrs[attribute] - del d.attrs[attribute] - elif field == "_remove_attrs": - for ff in attr_treatment: - del d.attrs[ff] - else: - if field[1:] in d.attrs: - logging.warning( - f"Overwriting `{field[1:]}` based on JSON configuration." 
- ) - header[field[1:]] = attr_treatment - - del header[field] - - # Add global attributes - d.attrs.update(header) - d.attrs.update(dict(project=p)) - - # Date-based versioning - if not d.attrs.get("version"): - d.attrs.update(dict(version=f"v{VERSION}")) - - prev_history = d.attrs.get("history", "") - history = ( - f"[{datetime.datetime.now()}] " - "Converted variables and modified metadata for CF-like compliance: " - f"{prev_history}".strip() - ) - d.attrs.update(dict(history=history)) - - return d diff --git a/miranda/treatments/_variables.py b/miranda/treatments/_variables.py new file mode 100644 index 00000000..5991e4c7 --- /dev/null +++ b/miranda/treatments/_variables.py @@ -0,0 +1,273 @@ +from __future__ import annotations + +import logging.config + +import xarray as xr +import xclim.core.units +from xclim.core import units + +from miranda.treatments.utils import _get_section_entry_key # noqa +from miranda.treatments.utils import _iter_entry_key # noqa +from miranda.units import get_time_frequency + +__all__ = [ + "cf_units_conversion", + "clip_values", + "correct_unit_names", + "invert_value_sign", + "transform_values", + "variable_conversion", +] + + +def correct_unit_names(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: + """Correct unit names.""" + key = "_corrected_units" + for var, val in _iter_entry_key(d, m, "variables", key, p): + if val: + d[var].attrs["units"] = val + prev_history = d.attrs.get("history", "") + history = ( + f"Corrected units name for variable `{var}` to `{val}`. 
{prev_history}" + ) + d.attrs.update(dict(history=history)) + + return d + + +# for de-accumulation or conversion to flux +def transform_values(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: + """Transform dataset values according to operation listed.""" + key = "_transformation" + d_out = xr.Dataset(coords=d.coords, attrs=d.attrs) + converted = [] + offset, offset_meaning = None, None + + time_freq = dict() + expected_period = _get_section_entry_key( + m, "dimensions", "time", "_ensure_correct_time", p + ) + if isinstance(expected_period, str): + time_freq["expected_period"] = expected_period + + for vv, trans in _iter_entry_key(d, m, "variables", key, p): + if trans: + if trans == "deaccumulate": + # Time-step accumulated total to time-based flux (de-accumulation) + if offset is None and offset_meaning is None: + try: + offset, offset_meaning = get_time_frequency(d, **time_freq) + except TypeError: + logging.error( + "Unable to parse the time frequency. Verify data integrity before retrying." + ) + raise + + logging.info(f"De-accumulating units for variable `{vv}`.") + with xr.set_options(keep_attrs=True): + out = d[vv].diff(dim="time") + out = d[vv].where( + getattr(d[vv].time.dt, offset_meaning) == offset[0], + out.broadcast_like(d[vv]), + ) + out = units.amount2rate(out, out_units=m["variables"][vv]["units"]) + d_out[vv] = out + converted.append(vv) + elif trans == "amount2rate": + # NOTE: This treatment is no longer needed in xclim v0.43.0+ but is kept for backwards compatibility + # frequency-based totals to time-based flux + logging.info( + f"Performing amount-to-rate units conversion for variable `{vv}`." 
+ ) + with xr.set_options(keep_attrs=True): + out = units.amount2rate( + d[vv], + out_units=m["variables"][vv]["units"], + ) + d_out[vv] = out + converted.append(vv) + elif isinstance(trans, str): + if trans.startswith("op "): + op = trans[3] + value = trans[4:].strip() + if value.startswith("attrs"): + value = units.str2pint(d[vv].attrs[value[6:]]) + else: + value = units.str2pint(value) + with xr.set_options(keep_attrs=True): + if op == "+": + value = units.convert_units_to(value, d[vv]) + d_out[vv] = d[vv] + value + elif op == "-": + value = units.convert_units_to(value, d[vv]) + d_out[vv] = d[vv] - value + elif op == "*": + d_out[vv] = units.pint_multiply(d[vv], value) + elif op == "/": + d_out[vv] = units.pint_multiply(d[vv], 1 / value) + else: + raise NotImplementedError( + f"Op transform doesn't implement the «{op}» operator." + ) + converted.append(vv) + else: + raise NotImplementedError(f"Unknown transformation: {trans}") + elif trans is False: + logging.info( + f"No transformations needed for `{vv}` (Explicitly set to False)." + ) + continue + + prev_history = d.attrs.get("history", "") + history = ( + f"Transformed variable `{vv}` values using method `{trans}`. {prev_history}" + ) + d_out.attrs.update(dict(history=history)) + + # Copy unconverted variables + for vv in d.data_vars: + if vv not in converted: + d_out[vv] = d[vv] + return d_out + + +def invert_value_sign(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: + """Flip value of DataArray.""" + key = "_invert_sign" + d_out = xr.Dataset(coords=d.coords, attrs=d.attrs) + converted = [] + for vv, inv_sign in _iter_entry_key(d, m, "variables", key, p): + if inv_sign: + logging.info(f"Inverting sign for `{vv}` (switching direction of values).") + with xr.set_options(keep_attrs=True): + out = d[vv] + d_out[out.name] = -out + converted.append(vv) + elif inv_sign is False: + logging.info( + f"No sign inversion needed for `{vv}` in `{p}` (Explicitly set to False)." 
+ ) + continue + prev_history = d.attrs.get("history", "") + history = f"Inverted sign for variable `{vv}` (switched direction of values). {prev_history}" + d_out.attrs.update(dict(history=history)) + + # Copy unconverted variables + for vv in d.data_vars: + if vv not in converted: + d_out[vv] = d[vv] + return d_out + + +# For converting variable units to standard workflow units +def cf_units_conversion(d: xr.Dataset, m: dict) -> xr.Dataset: + """Perform pint-based units-conversion.""" + if "time" in m["dimensions"].keys(): + if m["dimensions"]["time"].get("units"): + d["time"]["units"] = m["dimensions"]["time"]["units"] + + for vv, unit in _iter_entry_key(d, m, "variables", "units", None): + if unit: + with xr.set_options(keep_attrs=True): + d[vv] = units.convert_units_to(d[vv], unit, context="hydro") + prev_history = d.attrs.get("history", "") + history = f"Converted variable `{vv}` to CF-compliant units (`{unit}`). {prev_history}" + d.attrs.update(dict(history=history)) + + return d + + +# For clipping variable values to an established maximum/minimum +def clip_values(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: + """Clip values to an appropriate range,.""" + key = "_clip_values" + d_out = xr.Dataset(coords=d.coords, attrs=d.attrs) + converted = [] + for vv in d.data_vars: + if vv in m["variables"].keys(): + clip_values = _get_section_entry_key(m, "variables", vv, key, p) + if clip_values: + min_value, max_value = None, None + # Gather unit conversion context, if applicable + context = clip_values.get("context", None) + for op, value in clip_values.items(): + if op == "min": + min_value = xclim.core.units.convert_units_to( + value, d[vv], context + ) + if op == "max": + max_value = xclim.core.units.convert_units_to( + value, d[vv], context + ) + logging.info( + f"Clipping min/max values for `{vv}` ({min_value}/{max_value})." 
+ ) + with xr.set_options(keep_attrs=True): + out = d[vv] + d_out[out.name] = out.clip(min_value, max_value) + converted.append(vv) + elif clip_values is False: + logging.info( + f"No clipping of values needed for `{vv}` in `{p}` (Explicitly set to False)." + ) + continue + else: + logging.info(f"No clipping of values needed for `{vv}` in `{p}`.") + continue + + prev_history = d.attrs.get("history", "") + history = f"Clipped variable `{vv}` with `min={min_value}` and `max={max_value}`. {prev_history}" + d_out.attrs.update(dict(history=history)) + + # Copy unconverted variables + for vv in d.data_vars: + if vv not in converted: + d_out[vv] = d[vv] + + return d_out + + +# For renaming and reordering lat and lon dims + + +def variable_conversion(d: xr.Dataset, p: str | None, m: dict) -> xr.Dataset: + """Add variable metadata and remove nonstandard entries. + + Parameters + ---------- + d : xarray.Dataset + Dataset with variable(s) to be updated. + p : str + Dataset project name. + m : dict + Metadata definition dictionary for project and variable(s). 
+ + Returns + ------- + xarray.Dataset + """ + var_descriptions = m["variables"] + var_correction_fields = [ + "_clip_values", + "_corrected_units", + "_invert_sign", + "_offset_time", + "_transformation", + ] + for var in d.variables: + if var in var_descriptions.keys(): + for field in var_correction_fields: + if field in var_descriptions[var].keys(): + del var_descriptions[var][field] + d[var].attrs.update(var_descriptions[var]) + + # Rename data variables + for orig_var_name, cf_name in _iter_entry_key( + d, m, "variables", "_cf_variable_name", p + ): + if cf_name is not None: + d = d.rename({orig_var_name: cf_name}) + d[cf_name].attrs.update(dict(original_variable=orig_var_name)) + del d[cf_name].attrs["_cf_variable_name"] + + return d diff --git a/miranda/treatments/utils.py b/miranda/treatments/utils.py new file mode 100644 index 00000000..e1b15a3a --- /dev/null +++ b/miranda/treatments/utils.py @@ -0,0 +1,64 @@ +"""Utility functions for GIS operations.""" +from __future__ import annotations + +import inspect +import json +from pathlib import Path +from typing import Any + +__all__ = [ + "load_json_data_mappings", +] + + +def _get_section_entry_key(meta, entry, var, key, project): + var_meta = meta[entry].get(var, {}) + if key in var_meta: + if isinstance(var_meta[key], dict): + config = var_meta[key].get(project) + if config is None and "all" in var_meta[key].keys(): + config = var_meta[key].get("all") + return config + return var_meta[key] + return None + + +def _iter_entry_key(ds, meta, entry, key, project): + for vv in set(ds.data_vars).intersection(meta[entry]): + val = _get_section_entry_key(meta, entry, vv, key, project) + yield vv, val + + +def load_json_data_mappings( + project: str, configurations: dict[str, Path] | None = None +) -> dict[str, Any]: + """Load JSON mappings for supported dataset conversions. 
+ + Parameters + ---------- + project : str + configurations: dict, optional + + Returns + ------- + dict[str, Any] + """ + if configurations is None: + calling_frame = inspect.currentframe().f_back + calling_file_path = calling_frame.f_globals["__file__"] + config_folder = Path(calling_file_path).parent / "configs" + + configurations = {} + for configuration in config_folder.glob("*attrs.json"): + project = str(configuration.stem).split("_")[0] + if "|" in project: + for p in project.split("|"): + configurations[p] = configuration + configurations[project] = configuration + + if project in configurations.keys(): + config_file = configurations[project] + metadata_definition = json.load(config_file.open()) + return metadata_definition + else: + raise NotImplementedError(f"Project not supported: {project}") diff --git a/miranda/vocabularies/__init__.py b/miranda/vocabularies/__init__.py index 74f0a223..0e05c103 100644 --- a/miranda/vocabularies/__init__.py +++ b/miranda/vocabularies/__init__.py @@ -1 +1,4 @@ """Controlled Vocabulary module.""" +from __future__ import annotations + +from . 
import eccc, esgf diff --git a/miranda/vocabularies/eccc.py b/miranda/vocabularies/eccc.py index bd739fed..f668ec63 100644 --- a/miranda/vocabularies/eccc.py +++ b/miranda/vocabularies/eccc.py @@ -72,12 +72,14 @@ obs_groupings = dict() obs_groupings["HLY"] = list( - obs_vocabularies["HLY01"] - + obs_vocabularies["HLY01_RCS"] - + obs_vocabularies["HLY03"] - + obs_vocabularies["HLY10"] - + obs_vocabularies["HLY15"] - + obs_vocabularies["HLY21"] + set( + obs_vocabularies["HLY01"] + + obs_vocabularies["HLY01_RCS"] + + obs_vocabularies["HLY03"] + + obs_vocabularies["HLY10"] + + obs_vocabularies["HLY15"] + + obs_vocabularies["HLY21"] + ) ) obs_groupings["DLY"] = list( set( From f5ca6824cc5d501005de1f348166fd78f054eed2 Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Thu, 29 Feb 2024 16:20:11 -0500 Subject: [PATCH 18/33] fix metadata, adjust tests --- miranda/preprocess/_metadata.py | 14 +- .../preprocess/configs/eccc-obs_attrs.json | 163 ++++++++++++++++-- tests/test_utils.py | 37 ++-- 3 files changed, 181 insertions(+), 33 deletions(-) diff --git a/miranda/preprocess/_metadata.py b/miranda/preprocess/_metadata.py index c8fcafa1..842fb92c 100644 --- a/miranda/preprocess/_metadata.py +++ b/miranda/preprocess/_metadata.py @@ -15,23 +15,23 @@ def eccc_variable_metadata( - variable_code: str, + variable_code: str | int, project: str, generation: int | None = None, metadata: dict | None = None, -) -> (dict[str, int | float | str], dict, list[tuple[int, int]], int): +) -> dict[str, Any]: """Return the metadata for a given variable code and project. 
Parameters ---------- - variable_code: str + variable_code: str or int project: {"eccc-ahccd", "eccc-obs", "eccc-obs-summary"} generation: {1, 2, 3}, optional metadata: dict, optional Returns ------- - dict[str, int or str or float], dict, list[tuple[int, int]], int + dict """ if project == "eccc-ahccd": generation = {1: "First", 2: "Second", 3: "Third"}.get(generation) @@ -42,6 +42,10 @@ def eccc_variable_metadata( if not metadata: metadata = load_json_data_mappings(project) + + if isinstance(variable_code, int): + variable_code = str(variable_code).zfill(3) + code = find_project_variable_codes(variable_code, metadata) # Variable metadata @@ -92,7 +96,7 @@ def eccc_variable_metadata( for field in to_delete: del header[field] - return variable_meta, header + return dict(metadata=variable_meta, header=header) def homogenized_column_definitions( diff --git a/miranda/preprocess/configs/eccc-obs_attrs.json b/miranda/preprocess/configs/eccc-obs_attrs.json index 300b559a..8f438b16 100644 --- a/miranda/preprocess/configs/eccc-obs_attrs.json +++ b/miranda/preprocess/configs/eccc-obs_attrs.json @@ -2,13 +2,6 @@ "Header": { "_frequency": true, "_miranda_version": true, - "_missing_flags": "M", - "_missing_values": [ - "-999", - "1e20", - "-9999", - "#####" - ], "acknowledgement": "This data is provided by Environment and Climate Change Canada (ECCC).", "author": "Environment and Climate Change Canada (ECCC)", "contact": "ccsc-cccs@ec.gc.ca", @@ -28,6 +21,8 @@ "variables": { "001": { "_variable_name": "tasmax", + "missing_flags": "M", + "missing_values": "-99999", "long_name": "Daily Maximum Temperature", "original_units": "0.1 °C", "scale_factor": 0.1, @@ -36,6 +31,8 @@ }, "002": { "_variable_name": "tasmin", + "missing_flags": "M", + "missing_values": "-99999", "long_name": "Daily Minimum Temperature", "original_units": "0.1 °C", "scale_factor": 0.1, @@ -44,6 +41,8 @@ }, "003": { "_variable_name": "tas", + "missing_flags": "M", + "missing_values": "-99999", "long_name": 
"Daily Mean Temperature", "original_units": "0.1 °C", "scale_factor": 0.1, @@ -52,6 +51,8 @@ }, "010": { "_variable_name": "prlptot", + "missing_flags": "M", + "missing_values": "-99999", "long_name": "Daily Total Rainfall", "original_units": "0.1 mm day-1", "scale_factor": 0.1, @@ -60,6 +61,8 @@ }, "011": { "_variable_name": "prsntot", + "missing_flags": "M", + "missing_values": "-99999", "long_name": "Daily Total Snowfall", "original_units": "0.1 cm day-1", "scale_factor": 0.1, @@ -68,6 +71,8 @@ }, "012": { "_variable_name": "prcptot", + "missing_flags": "M", + "missing_values": "-99999", "long_name": "Daily Total Precipitation", "original_units": "0.1 mm day-1", "scale_factor": 0.1, @@ -76,6 +81,8 @@ }, "013": { "_variable_name": "sndtot", + "missing_flags": "M", + "missing_values": "-99999", "long_name": "Snow on the Ground", "original_units": "cm", "scale_factor": 1, @@ -84,6 +91,8 @@ }, "014": { "_variable_name": "thunder", + "missing_flags": "M", + "missing_values": "-99999", "long_name": "Thunderstorms", "scale_factor": 1, "standard_name": "thunderstorm_presence", @@ -91,6 +100,8 @@ }, "015": { "_variable_name": "freezing_rain_drizzle", + "missing_flags": "M", + "missing_values": "-99999", "long_name": "Freezing rain or drizzle", "scale_factor": 1, "standard_name": "freeze_rain_drizzle_presence", @@ -98,6 +109,8 @@ }, "016": { "_variable_name": "hail", + "missing_flags": "M", + "missing_values": "-99999", "long_name": "Hail", "scale_factor": 1, "standard_name": "hail_presence", @@ -105,6 +118,8 @@ }, "017": { "_variable_name": "fog_ice_fog", + "missing_flags": "M", + "missing_values": "-99999", "long_name": "Fog or Ice Fog", "scale_factor": 1, "standard_name": "fog_ice_fog_presence", @@ -112,6 +127,8 @@ }, "018": { "_variable_name": "smoke_haze", + "missing_flags": "M", + "missing_values": "-99999", "long_name": "Smoke or Haze", "scale_factor": 1, "standard_name": "smoke_haze_presence", @@ -119,6 +136,8 @@ }, "019": { "_variable_name": "blowing_dust_sand", 
+ "missing_flags": "M", + "missing_values": "-99999", "long_name": "Blowing Dust or Sand", "scale_factor": 1, "standard_name": "blowing_dust_sand_presence", @@ -126,6 +145,8 @@ }, "020": { "_variable_name": "blow_snow", + "missing_flags": "M", + "missing_values": "-99999", "long_name": "Blowing snow", "scale_factor": 1, "standard_name": "blowing_snow_presence", @@ -135,12 +156,16 @@ "_variable_name": "wind_gt_28kt", "long_name": "Wind speed >= 28 Knots", "scale_factor": 1, + "missing_flags": "M", + "missing_values": "-99999", "standard_name": "wind_exceeding_28_knots", "units": "1" }, "022": { "_variable_name": "wind_gt_34kt", "long_name": "Wind speed >= 34 Knots", + "missing_flags": "M", + "missing_values": "-99999", "scale_factor": 1, "standard_name": "wind_exceeding_34_knots", "units": "1" @@ -150,6 +175,8 @@ "long_name": "Direction of extreme gust (16 pts) to December 1976", "original_units": "10's of degrees", "scale_factor": 10, + "missing_flags": "M", + "missing_values": "-99999", "standard_name": "gust_to_direction", "units": "deg" }, @@ -157,6 +184,8 @@ "_variable_name": "gust_speed", "long_name": "Speed of extreme gust", "original_units": "km/h", + "missing_flags": "M", + "missing_values": "-99999", "standard_name": "wind_speed_of_gust", "units": "km h-1" }, @@ -164,6 +193,8 @@ "_variable_name": "gust_hour", "long_name": "UTC hour of extreme gust", "standard_name": "hour_of_extreme_gust", + "missing_flags": "M", + "missing_values": "-99999", "units": "h" }, "061": { @@ -171,6 +202,8 @@ "long_name": "RF1 global solar radiation", "original_units": "0.001 MJ/m", "scale_factor": 0.001, + "missing_flags": "M", + "missing_values": "-99999", "standard_name": "solar_radiation_flux", "units": "MJ m-1" }, @@ -179,6 +212,8 @@ "long_name": "RF2 sky (diffuse) radiation", "original_units": "0.001 MJ/m", "scale_factor": 277.77777777777777, + "missing_flags": "M", + "missing_values": "-99999", "standard_name": "solar_radiation_flux", "units": "MJ m-1" }, @@ -186,6 +221,8 
@@ "_variable_name": "rf3_radiation", "long_name": "RF3 reflected solar radiation", "original_units": "0.001 MJ/m", + "missing_flags": "M", + "missing_values": "-99999", "scale_factor": 277.77777777777777, "standard_name": "solar_radiation_flux", "units": "MJ m-1" @@ -196,11 +233,15 @@ "original_units": "0.001 MJ/m", "scale_factor": 277.77777777777777, "standard_name": "solar_radiation_flux", + "missing_flags": "M", + "missing_values": "-99999", "units": "MJ m-1" }, "067": { "_variable_name": "rf7_radiation", "long_name": "RF7 daylight illumination", + "missing_flags": "M", + "missing_values": "-99999", "original_units": "0.01 Kilolux_hrs", "scale_factor": 0.01, "standard_name": "solar_radiation_flux", @@ -210,6 +251,8 @@ "_variable_name": "rf8_radiation", "long_name": "RF8 direct solar radiation", "original_units": "0.001 MJ/m", + "missing_flags": "M", + "missing_values": "-99999", "scale_factor": 277.77777777777777, "standard_name": "solar_radiation_flux", "units": "W m-2 h-1" @@ -218,6 +261,8 @@ "_variable_name": "wind_dir_45B", "long_name": "Direction - 45B anemometer (8 pts)", "original_units": "10's of degrees", + "missing_flags": "M", + "missing_values": "-99999", "scale_factor": 10, "standard_name": "wind_to_direction", "units": "deg" @@ -225,6 +270,8 @@ "071": { "_variable_name": "ceiling_hgt", "long_name": "Ceiling height of lowest layer of clouds", + "missing_flags": "M", + "missing_values": "-99999", "original_units": "30's of meters", "scale_factor": 30, "standard_name": "ceiling_cloud_height", @@ -235,6 +282,8 @@ "long_name": "Visibility", "original_units": "0.1 km", "scale_factor": 0.1, + "missing_flags": "M", + "missing_values": "-99999", "standard_name": "visibility_in_air", "units": "km" }, @@ -242,6 +291,8 @@ "_variable_name": "psl", "long_name": "Sea Level Pressure", "original_units": "0.01 kPa", + "missing_flags": "M", + "missing_values": "-99999", "scale_factor": 0.01, "standard_name": "air_pressure_at_mean_sea_level", "units": "kPa" @@ -249,6 
+300,8 @@ "074": { "_variable_name": "tds", "long_name": "Dew Point Temperature", + "missing_flags": "M", + "missing_values": "-99999", "original_units": "0.1 °C", "scale_factor": 0.1, "standard_name": "dew_point_temperature", @@ -257,6 +310,8 @@ "075": { "_variable_name": "wind_dir_u2a_16", "long_name": "Wind Direction at 2 m (U2A Anemometer) (16 pts)", + "missing_flags": "M", + "missing_values": "-99999", "original_units": "10's of degrees", "scale_factor": 10, "standard_name": "wind_direction_u2a", @@ -264,6 +319,8 @@ }, "076": { "_variable_name": "wind_speed_u2a", + "missing_flags": "M", + "missing_values": "-99999", "long_name": "Wind Speed - U2A (16 pts) to December 1970", "original_units": "km/h", "scale_factor": 1, @@ -273,6 +330,8 @@ "077": { "_variable_name": "pressure", "long_name": "Station Pressure", + "missing_flags": "M", + "missing_values": "-99999", "original_units": "0.01 kPa", "scale_factor": 0.01, "standard_name": "atmospheric_pressure", @@ -283,6 +342,8 @@ "long_name": "Dry Bulb Temperature", "original_units": "0.1 °C", "scale_factor": 0.1, + "missing_flags": "M", + "missing_values": "-99999", "standard_name": "dry_bulb_temperature", "units": "degC" }, @@ -291,6 +352,8 @@ "long_name": "Wet Bulb temperature", "original_units": "0.1 °C", "scale_factor": 0.1, + "missing_flags": "M", + "missing_values": "-99999", "standard_name": "wet_bulb_temperature", "units": "degC" }, @@ -298,6 +361,8 @@ "_variable_name": "hur", "long_name": "Relative Humidity", "original_units": "%", + "missing_flags": "M", + "missing_values": "-99999", "scale_factor": 1, "standard_name": "relative_humidity", "units": "1" @@ -306,6 +371,8 @@ "_variable_name": "clo", "long_name": "Total Cloud Opacity", "original_units": "%", + "missing_flags": "M", + "missing_values": "-99999", "scale_factor": 10, "standard_name": "cloud_albedo", "units": "1" @@ -314,6 +381,8 @@ "_variable_name": "clt", "long_name": "Total Cloud Amount", "original_units": "%", + "missing_flags": "M", + 
"missing_values": "-99999", "scale_factor": 10, "standard_name": "cloud_area_fraction", "units": "1" @@ -323,20 +392,26 @@ "long_name": "Freezing Rain", "scale_factor": 1, "standard_name": "freezing_rain", - "units": "1" + "units": "1", + "missing_flags": "M", + "missing_values": "-99999" }, "094": { "_variable_name": "ice_pellets", "long_name": "Ice Pellets", "scale_factor": 1, "standard_name": "ice_pellet_presence", - "units": "1" + "units": "1", + "missing_flags": "M", + "missing_values": "-99999" }, "107": { "_variable_name": "1low_cloud_opac", "long_name": "Lowest cloud layer opacity", "original_units": "Tenths", "scale_factor": 10, + "missing_flags": "M", + "missing_values": "-99999", "standard_name": "low_type_cloud_opacity_fraction", "units": "1" }, @@ -345,6 +420,8 @@ "long_name": "Lowest cloud layer amount or condition", "original_units": "Tenths", "scale_factor": 10, + "missing_flags": "M", + "missing_values": "-99999", "standard_name": "low_type_cloud_area_fraction", "units": "1" }, @@ -352,6 +429,8 @@ "_variable_name": "1low_cloud_type", "long_name": "Lowest cloud layer type", "standard_name": "low_type_cloud_type", + "missing_flags": "M", + "missing_values": "-99999", "units": "1" }, "110": { @@ -359,6 +438,8 @@ "long_name": "Lowest cloud layer height", "original_units": "30's of meters", "scale_factor": 30, + "missing_flags": "M", + "missing_values": "-99999", "standard_name": "low_type_cloud_height", "units": "m" }, @@ -366,6 +447,8 @@ "_variable_name": "2low_cloud_opac", "long_name": "Second lowest cloud layer opacity", "original_units": "Tenths", + "missing_flags": "M", + "missing_values": "-99999", "scale_factor": 10, "standard_name": "low_type_cloud_opacity_fraction", "units": "1" @@ -373,6 +456,8 @@ "112": { "_variable_name": "2low_cloud_frac", "long_name": "Second lowest cloud layer amount or condition", + "missing_flags": "M", + "missing_values": "-99999", "original_units": "Tenths", "scale_factor": 10, "standard_name": 
"low_type_cloud_area_fraction", @@ -381,6 +466,8 @@ "113": { "_variable_name": "2low_cloud_type", "long_name": "Second lowest cloud layer type", + "missing_flags": "M", + "missing_values": "-99999", "original_units": "", "scale_factor": 1, "standard_name": "low_type_cloud_type", @@ -390,6 +477,8 @@ "_variable_name": "2low_cloud_hgt", "long_name": "Second lowest cloud layer height", "original_units": "30's of meters", + "missing_flags": "M", + "missing_values": "-99999", "scale_factor": 30, "standard_name": "low_type_cloud_height", "units": "m" @@ -398,12 +487,16 @@ "_variable_name": "3low_cloud_opac", "long_name": "Thirsd lowest cloud layer opacity", "original_units": "Tenths", + "missing_flags": "M", + "missing_values": "-99999", "scale_factor": 10, "standard_name": "low_type_cloud_opacity_fraction", "units": "1" }, "116": { "_variable_name": "3low_cloud_frac", + "missing_flags": "M", + "missing_values": "-99999", "long_name": "Third lowest cloud layer amount or condition", "original_units": "Tenths", "scale_factor": 10, @@ -414,6 +507,8 @@ "_variable_name": "3low_cloud_type", "long_name": "Third lowest cloud layer type", "original_units": "", + "missing_flags": "M", + "missing_values": "-99999", "scale_factor": 1, "standard_name": "low_type_cloud_type", "units": "1" @@ -423,6 +518,8 @@ "long_name": "Third lowest cloud layer height", "original_units": "30's of meters", "scale_factor": 30, + "missing_flags": "M", + "missing_values": "-99999", "standard_name": "low_type_cloud_height", "units": "m" }, @@ -431,6 +528,8 @@ "long_name": "Total Rainfall", "original_units": "0.1 mm", "scale_factor": 0.1, + "missing_flags": "M", + "missing_values": "-99999", "standard_name": "rainfall_flux", "units": "mm h-1" }, @@ -439,6 +538,8 @@ "long_name": "Sunshine", "original_units": "0.1 hrs", "scale_factor": 0.1, + "missing_flags": "M", + "missing_values": "-99999", "standard_name": "duration_of_sunshine", "units": "h" }, @@ -446,6 +547,8 @@ "_variable_name": "wind_dir_u2a_36", 
"long_name": "Wind Direction - U2A (36 pts) from January 1971", "original_units": "10's of degrees", + "missing_flags": "M", + "missing_values": "-99999", "scale_factor": 10, "standard_name": "wind_direction_u2a", "units": "deg" @@ -455,6 +558,8 @@ "long_name": "Total Precipitation (minutes 00-60)", "original_units": "0.1 mm", "scale_factor": 0.1, + "missing_flags": "M", + "missing_values": "-99999", "standard_name": "precipitation_amount", "units": "mm" }, @@ -462,6 +567,8 @@ "_variable_name": "prtot_q1", "long_name": "Total Precipitation (minutes 00-15)", "original_units": "0.1 mm", + "missing_flags": "M", + "missing_values": "-99999", "scale_factor": 0.1, "standard_name": "precipitation_amount", "units": "mm" @@ -472,6 +579,8 @@ "original_units": "0.1 mm", "scale_factor": 0.1, "standard_name": "precipitation_amount", + "missing_flags": "M", + "missing_values": "-99999", "units": "mm" }, "265": { @@ -479,6 +588,8 @@ "long_name": "Total Precipitation (minutes 30-45)", "original_units": "0.1 mm", "scale_factor": 0.1, + "missing_flags": "M", + "missing_values": "-99999", "standard_name": "precipitation_amount", "units": "mm" }, @@ -486,12 +597,16 @@ "_variable_name": "prtot_q4", "long_name": "Total Precipitation (minutes 45-60)", "original_units": "0.1 mm", + "missing_flags": "M", + "missing_values": "-99999", "scale_factor": 0.1, "standard_name": "precipitation_amount", "units": "mm" }, "267": { "_variable_name": "precipitation_weight_q1", + "missing_flags": "M", + "missing_values": "-99999", "long_name": "Precipitation Gauge Weight per Unit Area (at minute 15)", "original_units": "0.1 kg/m²", "scale_factor": 0.1, @@ -504,6 +619,8 @@ "original_units": "0.1 kg/m²", "scale_factor": 0.1, "standard_name": "precipitation_amount", + "missing_flags": "M", + "missing_values": "-99999", "units": "kg m-2" }, "269": { @@ -512,7 +629,9 @@ "original_units": "0.1 kg/m²", "scale_factor": 0.1, "standard_name": "precipitation_amount", - "units": "kg m-2" + "units": "kg m-2", + 
"missing_flags": "M", + "missing_values": "-99999" }, "270": { "_variable_name": "precipitation_weight_q4", @@ -520,11 +639,15 @@ "original_units": "0.1 kg/m²", "scale_factor": 0.1, "standard_name": "precipitation_amount", + "missing_flags": "M", + "missing_values": "-99999", "units": "kg m-2" }, "271": { "_variable_name": "wind_speed_q1", "long_name": "Wind Speed at 2 m (minutes 00-15)", + "missing_flags": "M", + "missing_values": "-99999", "original_units": "0.1 km/h", "scale_factor": 0.1, "standard_name": "wind_speed", @@ -535,6 +658,8 @@ "long_name": "Wind Speed at 2 m (minutes 15-30)", "original_units": "0.1 km/h", "scale_factor": 0.1, + "missing_flags": "M", + "missing_values": "-99999", "standard_name": "wind_speed", "units": "km h-1" }, @@ -544,11 +669,15 @@ "original_units": "0.1 km/h", "scale_factor": 0.1, "standard_name": "wind_speed", - "units": "km h-1" + "units": "km h-1", + "missing_flags": "M", + "missing_values": "-99999" }, "274": { "_variable_name": "wind_speed_q4", "long_name": "Wind Speed at 2 m (minutes 45-60)", + "missing_flags": "M", + "missing_values": "-99999", "original_units": "0.1 km/h", "scale_factor": 0.1, "standard_name": "wind_speed", @@ -558,6 +687,8 @@ "_variable_name": "snd_q4", "long_name": "Snow Depth (at minute 60)", "original_units": "cm", + "missing_flags": "M", + "missing_values": "-99999", "standard_name": "surface_snow_thickness", "units": "cm" }, @@ -565,6 +696,8 @@ "_variable_name": "snd_q1", "long_name": "Snow Depth (at minute 15)", "original_units": "cm", + "missing_flags": "M", + "missing_values": "-99999", "scale_factor": 1, "standard_name": "surface_snow_thickness", "units": "cm" @@ -574,6 +707,8 @@ "long_name": "Snow Depth (at minute 30)", "original_units": "cm", "scale_factor": 1, + "missing_flags": "M", + "missing_values": "-99999", "standard_name": "surface_snow_thickness", "units": "cm" }, @@ -581,6 +716,8 @@ "_variable_name": "snd_q3", "long_name": "Snow Depth (at minute 45)", "original_units": "cm", + 
"missing_flags": "M", + "missing_values": "-99999", "scale_factor": 1, "standard_name": "surface_snow_thickness", "units": "cm" @@ -588,6 +725,8 @@ "279": { "_variable_name": "wind_dir", "long_name": "Wind Direction at 2 m (minutes 50-60)", + "missing_flags": "M", + "missing_values": "-99999", "nc_units": "deg", "original_units": "Degrees", "standard_name": "wind_direction" @@ -597,6 +736,8 @@ "long_name": "Wind Speed at 2 m (minutes 50-60)", "original_units": "0.1 km/h", "scale_factor": 0.1, + "missing_flags": "M", + "missing_values": "-99999", "standard_name": "wind_speed", "units": "km h-1" } diff --git a/tests/test_utils.py b/tests/test_utils.py index ce1bfca3..85aee93a 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -4,9 +4,9 @@ from datetime import date from pathlib import Path -import pytest # noqa +import pytest -import miranda.eccc._utils as eccc_utils # noqa +import miranda.preprocess._metadata as metadata import miranda.utils @@ -28,12 +28,13 @@ def test_hourly_cf_dictionaries(self): codes = list() variables = dict() for key in keys: - variables[key] = eccc_utils.cf_station_metadata(key) - codes.append(variables[key]["standard_name"]) - if variables[key]["standard_name"] == "dry_bulb_temperature": - assert variables[key]["raw_units"] == "degC" - assert variables[key]["units"] == "K" - assert variables[key]["missing_flags"] == "M" + variables[key] = metadata.eccc_variable_metadata(key, "eccc-obs") + var_name = next(iter(variables[key]["metadata"])) + var_metadata = variables[key]["metadata"][var_name] + codes.append(var_metadata["standard_name"]) + if var_metadata["standard_name"] == "dry_bulb_temperature": + assert var_metadata["units"] == "degC" + assert var_metadata["missing_flags"] == "M" assert set(codes) == { "wind_speed_u2a", @@ -57,15 +58,17 @@ def test_daily_cf_dictionaries(self): codes = list() variables = dict() for key in keys: - variables[key] = eccc_utils.cf_station_metadata(key) - codes.append(variables[key]["standard_name"]) 
- if variables[key]["standard_name"].startswith("air_temperature"): - assert variables[key]["raw_units"] == "degC" - assert variables[key]["units"] == "K" - elif variables[key]["standard_name"].endswith("precipitation_amount"): - assert variables[key]["raw_units"] in ["cm", "mm"] - assert variables[key]["units"] == "m" - assert variables[key]["missing_flags"] == "M" + variables[key] = metadata.eccc_variable_metadata(key, "eccc-obs") + + var_name = next(iter(variables[key]["metadata"])) + var_metadata = variables[key]["metadata"][var_name] + codes.append(var_metadata["standard_name"]) + + if var_name.startswith("air_temperature"): + assert var_metadata["units"] == "degC" + elif var_name.endswith("precipitation_amount"): + assert var_metadata["units"] in ["cm", "mm"] + assert var_metadata["missing_flags"] == "M" assert set(codes) == { "air_temperature", From c2e442aa323b0d0a1f425ae5bd58cd3ad2bc65a4 Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Wed, 6 Mar 2024 15:47:04 -0500 Subject: [PATCH 19/33] error handling --- miranda/preprocess/_metadata.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/miranda/preprocess/_metadata.py b/miranda/preprocess/_metadata.py index 842fb92c..f39ff79d 100644 --- a/miranda/preprocess/_metadata.py +++ b/miranda/preprocess/_metadata.py @@ -46,7 +46,10 @@ def eccc_variable_metadata( if isinstance(variable_code, int): variable_code = str(variable_code).zfill(3) - code = find_project_variable_codes(variable_code, metadata) + try: + code = find_project_variable_codes(variable_code, metadata) + except KeyError: + raise KeyError(f"Variable code `{variable_code}` not found in metadata.") # Variable metadata variable_meta = metadata["variables"].get(code) From f5fce92f45766e5a1fdc365623efaca213d30415 Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Wed, 6 Mar 2024 15:48:30 -0500 Subject: [PATCH 20/33] better error handling --- 
miranda/preprocess/_metadata.py | 5 +---- miranda/treatments/_dimensions.py | 4 ++++ 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/miranda/preprocess/_metadata.py b/miranda/preprocess/_metadata.py index f39ff79d..842fb92c 100644 --- a/miranda/preprocess/_metadata.py +++ b/miranda/preprocess/_metadata.py @@ -46,10 +46,7 @@ def eccc_variable_metadata( if isinstance(variable_code, int): variable_code = str(variable_code).zfill(3) - try: - code = find_project_variable_codes(variable_code, metadata) - except KeyError: - raise KeyError(f"Variable code `{variable_code}` not found in metadata.") + code = find_project_variable_codes(variable_code, metadata) # Variable metadata variable_meta = metadata["variables"].get(code) diff --git a/miranda/treatments/_dimensions.py b/miranda/treatments/_dimensions.py index cd56e243..134faf23 100644 --- a/miranda/treatments/_dimensions.py +++ b/miranda/treatments/_dimensions.py @@ -27,6 +27,10 @@ def find_project_variable_codes(code: str, configuration: dict[str, Any]) -> str str """ variable_codes = {} + + if "variables" not in configuration: + raise ValueError("No `variables` section found in configuration. 
Check JSON.") + for variable_code in configuration["variables"]: variable_name = configuration["variables"][variable_code].get("_variable_name") if variable_name: From 75e8015e1f7431b95d3f171c5258a2bf6bbf72bc Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Wed, 6 Mar 2024 15:56:38 -0500 Subject: [PATCH 21/33] import fixes --- miranda/convert/eccc_rdrs.py | 7 ++----- miranda/preprocess/eccc.py | 2 +- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/miranda/convert/eccc_rdrs.py b/miranda/convert/eccc_rdrs.py index e0abfbcf..cd8c63a6 100644 --- a/miranda/convert/eccc_rdrs.py +++ b/miranda/convert/eccc_rdrs.py @@ -11,14 +11,11 @@ from miranda.io import fetch_chunk_config, write_dataset_dict from miranda.scripting import LOGGING_CONFIG +from miranda.treatments import load_json_data_mappings from miranda.units import get_time_frequency from ._aggregation import aggregate -from ._data_definitions import ( - gather_eccc_rdrs, - gather_raw_rdrs_by_years, - load_json_data_mappings, -) +from ._data_definitions import gather_eccc_rdrs, gather_raw_rdrs_by_years from .corrections import dataset_conversion logging.config.dictConfig(LOGGING_CONFIG) diff --git a/miranda/preprocess/eccc.py b/miranda/preprocess/eccc.py index c94f9f9d..9dec2a56 100644 --- a/miranda/preprocess/eccc.py +++ b/miranda/preprocess/eccc.py @@ -9,7 +9,7 @@ from pathlib import Path from typing import Callable -from dask.distributed import ProgressBar +from dask.diagnostics import ProgressBar from miranda.scripting import LOGGING_CONFIG from miranda.storage import file_size, report_file_size From 8c4ffccc7099f4c2511393d60f363dbd78d5ee0d Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Mon, 18 Mar 2024 15:50:05 -0400 Subject: [PATCH 22/33] fix logic --- miranda/preprocess/_metadata.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/miranda/preprocess/_metadata.py 
b/miranda/preprocess/_metadata.py index 842fb92c..db24aae1 100644 --- a/miranda/preprocess/_metadata.py +++ b/miranda/preprocess/_metadata.py @@ -4,7 +4,8 @@ from typing import Any from miranda import __version__ as __miranda_version__ -from miranda.treatments import find_project_variable_codes + +# from miranda.treatments import find_project_variable_codes from miranda.treatments.utils import load_json_data_mappings __all__ = [ @@ -46,13 +47,13 @@ def eccc_variable_metadata( if isinstance(variable_code, int): variable_code = str(variable_code).zfill(3) - code = find_project_variable_codes(variable_code, metadata) + # code = find_project_variable_codes(variable_code, metadata) # Variable metadata - variable_meta = metadata["variables"].get(code) + variable_meta = metadata["variables"].get(variable_code) variable_name = variable_meta.get("_variable_name") if variable_name: - variable_meta["original_variable_name"] = variable_code + variable_meta["original_variable_code"] = variable_code variable_meta = {variable_name: variable_meta} del variable_meta[variable_name]["_variable_name"] else: From 448ba0782919130b4087ae608157aa5f5788fa77 Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Mon, 18 Mar 2024 16:19:57 -0400 Subject: [PATCH 23/33] fix logic --- miranda/preprocess/_metadata.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/miranda/preprocess/_metadata.py b/miranda/preprocess/_metadata.py index db24aae1..77a827ba 100644 --- a/miranda/preprocess/_metadata.py +++ b/miranda/preprocess/_metadata.py @@ -51,11 +51,14 @@ def eccc_variable_metadata( # Variable metadata variable_meta = metadata["variables"].get(variable_code) - variable_name = variable_meta.get("_variable_name") - if variable_name: - variable_meta["original_variable_code"] = variable_code - variable_meta = {variable_name: variable_meta} - del variable_meta[variable_name]["_variable_name"] + variable_name_fields = 
["_variable_name", "_cf_variable_name"] + if set(variable_name_fields).issubset(variable_meta.keys()): + for variable_field in variable_name_fields: + variable_name = variable_meta.get(variable_field) + if variable_name: + variable_meta["original_variable_code"] = variable_code + del variable_meta[variable_field] + variable_meta = {variable_name: variable_meta} else: variable_meta = {variable_code: variable_meta} From 53b9c35d9113f88b64d22180b3a5a23c13d11f47 Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Mon, 18 Mar 2024 16:36:00 -0400 Subject: [PATCH 24/33] fix logic --- miranda/preprocess/_metadata.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/miranda/preprocess/_metadata.py b/miranda/preprocess/_metadata.py index 77a827ba..798a710d 100644 --- a/miranda/preprocess/_metadata.py +++ b/miranda/preprocess/_metadata.py @@ -49,8 +49,11 @@ def eccc_variable_metadata( # code = find_project_variable_codes(variable_code, metadata) + print("stuff") + # Variable metadata variable_meta = metadata["variables"].get(variable_code) + variable_name = "" variable_name_fields = ["_variable_name", "_cf_variable_name"] if set(variable_name_fields).issubset(variable_meta.keys()): for variable_field in variable_name_fields: @@ -61,6 +64,8 @@ def eccc_variable_metadata( variable_meta = {variable_name: variable_meta} else: variable_meta = {variable_code: variable_meta} + if not variable_name: + variable_name = variable_code # Dataset metadata header = metadata.get("Header") From 159fc94dfd47fd251d818cc7ff12d5267bba48f4 Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Mon, 18 Mar 2024 16:49:03 -0400 Subject: [PATCH 25/33] add files to sdist --- pyproject.toml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 89f43a68..2323ecd5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -185,7 +185,11 @@ include = [ "docs/make.bat", 
"tests/*.py", "tox.ini", - "miranda" + "miranda", + "miranda/convert/configs/*.json", + "miranda/preprocess/configs/*.csv", + "miranda/preprocess/configs/*.json", + "miranda/structure/data/*.yml" ] exclude = [ "*.py[co]", From 586f4e6d3500795ae6324ff18229e10d70253a81 Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Mon, 18 Mar 2024 17:05:17 -0400 Subject: [PATCH 26/33] synchronize dependencies --- .github/workflows/main.yml | 4 ++-- pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index a2eaded8..e6779597 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -40,7 +40,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install tox run: | - python -m pip install tox + python -m pip install "tox>=4.5.0" "pip>=23.3.0" flit - name: Run lint and docs testing suite run: | python -m tox -e ${{ matrix.tox-env }} @@ -77,7 +77,7 @@ jobs: sudo apt-get install libgdal-dev - name: Install tox run: | - python -m pip install tox + python -m pip install "tox>=4.5.0" "pip>=23.3.0" flit - name: Test with tox run: | python -m tox -e ${{ matrix.tox-env }} diff --git a/pyproject.toml b/pyproject.toml index 2323ecd5..0a2a58d1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,7 +52,7 @@ dependencies = [ [project.optional-dependencies] dev = [ # Dev tools and testing - "pip >=23.1.2", + "pip >=23.3.0", "bump-my-version >=0.18.3", "watchdog >=3.0.0", "flake8 >=6.1.0", From 6c5586ba95304bec7d6fb565af9fad8e928ab626 Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Mon, 18 Mar 2024 17:05:25 -0400 Subject: [PATCH 27/33] debugging --- miranda/preprocess/_metadata.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/miranda/preprocess/_metadata.py b/miranda/preprocess/_metadata.py index 798a710d..bbe234ec 100644 --- a/miranda/preprocess/_metadata.py +++ 
b/miranda/preprocess/_metadata.py @@ -34,6 +34,8 @@ def eccc_variable_metadata( ------- dict """ + print(locals()) + if project == "eccc-ahccd": generation = {1: "First", 2: "Second", 3: "Third"}.get(generation) if not generation: @@ -43,16 +45,18 @@ def eccc_variable_metadata( if not metadata: metadata = load_json_data_mappings(project) + print(metadata) if isinstance(variable_code, int): variable_code = str(variable_code).zfill(3) # code = find_project_variable_codes(variable_code, metadata) - print("stuff") - # Variable metadata variable_meta = metadata["variables"].get(variable_code) + if variable_meta is None: + raise ValueError(f"No metadata found for variable code: {variable_code}") + variable_name = "" variable_name_fields = ["_variable_name", "_cf_variable_name"] if set(variable_name_fields).issubset(variable_meta.keys()): From e2762d396d24cb4cfb8f2959bd59b0eb05b64bba Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Mon, 18 Mar 2024 17:20:53 -0400 Subject: [PATCH 28/33] chase down bug --- miranda/preprocess/_metadata.py | 3 --- miranda/treatments/utils.py | 6 +++--- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/miranda/preprocess/_metadata.py b/miranda/preprocess/_metadata.py index bbe234ec..867f53e6 100644 --- a/miranda/preprocess/_metadata.py +++ b/miranda/preprocess/_metadata.py @@ -34,8 +34,6 @@ def eccc_variable_metadata( ------- dict """ - print(locals()) - if project == "eccc-ahccd": generation = {1: "First", 2: "Second", 3: "Third"}.get(generation) if not generation: @@ -45,7 +43,6 @@ def eccc_variable_metadata( if not metadata: metadata = load_json_data_mappings(project) - print(metadata) if isinstance(variable_code, int): variable_code = str(variable_code).zfill(3) diff --git a/miranda/treatments/utils.py b/miranda/treatments/utils.py index 9e49bc63..e6f332a6 100644 --- a/miranda/treatments/utils.py +++ b/miranda/treatments/utils.py @@ -51,11 +51,11 @@ def load_json_data_mappings( 
configurations = {} for configuration in config_folder.glob("*attrs.json"): - project = str(configuration.stem).split("_")[0] + project_config = str(configuration.stem).split("_")[0] if "|" in project: - for p in project.split("|"): + for p in project_config.split("|"): configurations[p] = configuration - configurations[project] = configuration + configurations[project_config] = configuration if project in configurations.keys(): config_file = configurations[project] From d20be40dc45aaada37726df9244eff25d9fc9679 Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Tue, 19 Mar 2024 11:20:56 -0400 Subject: [PATCH 29/33] update attr treatments --- miranda/convert/corrections.py | 27 +++++++++++++-------------- miranda/treatments/__init__.py | 25 ++++++++++++------------- 2 files changed, 25 insertions(+), 27 deletions(-) diff --git a/miranda/convert/corrections.py b/miranda/convert/corrections.py index 0fcce7d4..7745cc42 100644 --- a/miranda/convert/corrections.py +++ b/miranda/convert/corrections.py @@ -3,9 +3,8 @@ from __future__ import annotations import datetime -import os +import pathlib from functools import partial -from pathlib import Path from typing import Callable, Iterator, Sequence import xarray as xr @@ -27,7 +26,7 @@ ) from miranda.treatments.utils import load_json_data_mappings -CONFIG_FOLDER = Path(__file__).parent / "data" +CONFIG_FOLDER = pathlib.Path(__file__).parent / "data" CONFIG_FILES = { "EMDNA": "emdna_cf_attrs.json", "ESPO-G6-E5L": "espo-g6-e5l_attrs.json", @@ -89,9 +88,9 @@ def dataset_corrections(ds: xr.Dataset, project: str) -> xr.Dataset: def dataset_conversion( input_files: ( str - | os.PathLike - | Sequence[str | os.PathLike] - | Iterator[os.PathLike] + | pathlib.Path + | Sequence[str | pathlib.Path] + | Iterator[pathlib.Path] | xr.Dataset ), project: str, @@ -107,7 +106,7 @@ def dataset_conversion( Parameters ---------- - input_files : str or os.PathLike or Sequence[str or os.PathLike] or 
Iterator[os.PathLike] or xr.Dataset + input_files : str or pathlib.Path or Sequence[str or pathlib.Path] or Iterator[pathlib.Path] or xr.Dataset Files or objects to be converted. If sent a list or GeneratorType, will open with :py:func:`xarray.open_mfdataset` and concatenate files. project : {"cordex", "cmip5", "cmip6", "ets-grnch", "isimip-ft", "pcic-candcs-u6", "converted"} @@ -136,15 +135,15 @@ def dataset_conversion( if isinstance(input_files, xr.Dataset): ds = input_files else: - if isinstance(input_files, (str, os.PathLike)): - if Path(input_files).is_dir(): + if isinstance(input_files, (str, pathlib.Path)): + if pathlib.Path(input_files).is_dir(): files = [] - files.extend([f for f in Path(input_files).glob("*.nc")]) - files.extend([f for f in Path(input_files).glob("*.zarr")]) + files.extend([f for f in pathlib.Path(input_files).glob("*.nc")]) + files.extend([f for f in pathlib.Path(input_files).glob("*.zarr")]) else: - files = [Path(input_files)] + files = [pathlib.Path(input_files)] elif isinstance(input_files, (Sequence, Iterator)): - files = [Path(f) for f in input_files] + files = [pathlib.Path(f) for f in input_files] else: files = input_files version_hashes = dict() @@ -175,7 +174,7 @@ def dataset_conversion( if domain: ds = subset_domain(ds, domain) - if isinstance(mask, (str, Path)): + if isinstance(mask, (str, pathlib.Path)): mask = xr.open_dataset(mask) if isinstance(mask, (xr.Dataset, xr.DataArray)): if regrid: diff --git a/miranda/treatments/__init__.py b/miranda/treatments/__init__.py index 11e62fd2..91255248 100644 --- a/miranda/treatments/__init__.py +++ b/miranda/treatments/__init__.py @@ -5,7 +5,7 @@ import datetime import logging.config -import xarray as xr +import xarray from miranda import __version__ as __miranda_version__ from miranda.scripting import LOGGING_CONFIG @@ -19,7 +19,7 @@ VERSION = datetime.datetime.now().strftime("%Y.%m.%d") -def metadata_conversion(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: +def 
metadata_conversion(d: xarray.Dataset, p: str, m: dict) -> xarray.Dataset: """Update xarray dataset and data_vars with project-specific metadata fields. Parameters @@ -67,36 +67,35 @@ def metadata_conversion(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: del m["Header"]["_frequency"] # Conditional handling of global attributes based on project name - for field in [f for f in header if f.startswith("_")]: + for field in [f for f in header.keys() if f.startswith("_")]: if isinstance(header[field], list): if p in header[field]: - attr_treatment = header[field][p] + attr_treatments = header[field][p] else: logging.warning( f"Attribute handling (`{field}`) not set for project `{p}`. Continuing..." ) continue elif isinstance(header[field], dict): - attr_treatment = header[field] + attr_treatments = header[field] else: raise AttributeError( f"Attribute treatment configuration for field `{field}` is not properly configured. Verify JSON." ) + if field[1:] in d.attrs: + logging.warning(f"Overwriting `{field[1:]}` based on JSON configuration.") if field == "_map_attrs": - for attribute, mapping in attr_treatment.items(): + for attribute, mapping in attr_treatments.items(): header[mapping] = d.attrs[attribute] del d.attrs[attribute] elif field == "_remove_attrs": - for ff in attr_treatment: + for ff in attr_treatments: del d.attrs[ff] + elif field.startswith("_") and p in attr_treatments: + header[field[1:]] = attr_treatments[p] else: - if field[1:] in d.attrs: - logging.warning( - f"Overwriting `{field[1:]}` based on JSON configuration." 
- ) - header[field[1:]] = attr_treatment - + header[field[1:]] = attr_treatments del header[field] # Add global attributes From 74e7949773a060362e74ead1c1d67b451d6bf49f Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Tue, 19 Mar 2024 11:23:33 -0400 Subject: [PATCH 30/33] fixes --- miranda/io/_input.py | 2 +- templates/restructure_datasets.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/miranda/io/_input.py b/miranda/io/_input.py index d9b0141b..e91992a5 100644 --- a/miranda/io/_input.py +++ b/miranda/io/_input.py @@ -50,7 +50,7 @@ def discover_data( input_files = sorted(list(input_files.glob(f"*.{suffix}"))) else: input_files = input_files.rglob(f"*.{suffix}") - if input_files.is_file(): + elif input_files.is_file(): logging.warning( "Data discovery yielded a single file. Casting to `list[Path]`." ) diff --git a/templates/restructure_datasets.py b/templates/restructure_datasets.py index d10fa8dc..f0d45ee9 100644 --- a/templates/restructure_datasets.py +++ b/templates/restructure_datasets.py @@ -17,5 +17,5 @@ guess=False, method="copy", make_dirs=True, - filename_pattern="*.zarr", + suffix="zarr", ) From 42b957dc62f8a93a3a87e553895552bd7ab76b65 Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Tue, 19 Mar 2024 12:55:33 -0400 Subject: [PATCH 31/33] fix folder name --- miranda/convert/corrections.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/miranda/convert/corrections.py b/miranda/convert/corrections.py index 7745cc42..321b4cc7 100644 --- a/miranda/convert/corrections.py +++ b/miranda/convert/corrections.py @@ -26,7 +26,7 @@ ) from miranda.treatments.utils import load_json_data_mappings -CONFIG_FOLDER = pathlib.Path(__file__).parent / "data" +CONFIG_FOLDER = pathlib.Path(__file__).parent / "configs" CONFIG_FILES = { "EMDNA": "emdna_cf_attrs.json", "ESPO-G6-E5L": "espo-g6-e5l_attrs.json", From 
288582f95dd0628489d9463fa25bcccf4c239fbf Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Wed, 27 Mar 2024 14:48:44 -0400 Subject: [PATCH 32/33] add support for new `h` freq --- miranda/convert/_data_definitions.py | 1 + 1 file changed, 1 insertion(+) diff --git a/miranda/convert/_data_definitions.py b/miranda/convert/_data_definitions.py index bf69606b..91713a62 100644 --- a/miranda/convert/_data_definitions.py +++ b/miranda/convert/_data_definitions.py @@ -86,6 +86,7 @@ # Manually map xarray frequencies to CMIP6/CMIP5 controlled vocabulary. # see: https://github.com/ES-DOC/pyessv-archive xarray_frequencies_to_cmip6like = { + "h": "hr", "H": "hr", "D": "day", "W": "sem", From 55ee882e3992aeb170ee90ec62e37457f8bea368 Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Wed, 27 Mar 2024 14:48:56 -0400 Subject: [PATCH 33/33] dependencies --- environment-dev.yml | 3 +-- miranda/treatments/_variables.py | 6 +++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/environment-dev.yml b/environment-dev.yml index 3127306d..c425ef3d 100644 --- a/environment-dev.yml +++ b/environment-dev.yml @@ -33,7 +33,7 @@ dependencies: - xesmf - zarr # Dev tools and testing - - pip >=23.1.2 + - pip >=23.3.0 - bump-my-version >=0.18.3 - watchdog >=3.0.0 - flake8 >=6.1.0 @@ -48,7 +48,6 @@ dependencies: - blackdoc ==0.3.9 - isort ==5.13.2 - pre-commit >=3.3.2 - - pip - pip: - coverage >=6.2.0,<7.0.0 - coveralls >=3.3.1 diff --git a/miranda/treatments/_variables.py b/miranda/treatments/_variables.py index 5991e4c7..ff96b343 100644 --- a/miranda/treatments/_variables.py +++ b/miranda/treatments/_variables.py @@ -185,12 +185,12 @@ def clip_values(d: xr.Dataset, p: str, m: dict) -> xr.Dataset: converted = [] for vv in d.data_vars: if vv in m["variables"].keys(): - clip_values = _get_section_entry_key(m, "variables", vv, key, p) + clip_vals = _get_section_entry_key(m, "variables", vv, key, p) if 
clip_values: min_value, max_value = None, None # Gather unit conversion context, if applicable - context = clip_values.get("context", None) - for op, value in clip_values.items(): + context = clip_vals.get("context", None) + for op, value in clip_vals.items(): if op == "min": min_value = xclim.core.units.convert_units_to( value, d[vv], context