#!/usr/bin/env python3
"""
Convert AmeriFlux daily CSV files (_DD_) to CF-compliant NetCDF files.

- Only keeps variables that work with daily data:
  GPP (GPP_DT_VUT_REF), PR, TAS
- Radiation, RECO, and NEE variables are skipped
- Output is stored in _output folder
- Site names are extracted from the original CSV filenames
"""

import numpy as np
import pandas as pd
import xarray as xr
from pathlib import Path

# ----------------------------
# Configuration
# ----------------------------
RAW_DIR = Path("_raw")
OUTPUT_DIR = Path("_output")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# AmeriFlux FLUXNET-format CSVs flag missing data with -9999; converting the
# sentinel to NaN at read time keeps gap days out of the NetCDF as fake values.
MISSING_SENTINEL = -9999

# ----------------------------
# Variable mapping
# (long_name, short_name, CSV column, units)
# ----------------------------
VARMAP = [
    ("gross_primary_productivity", "gpp", "GPP_DT_VUT_REF", "gC m-2 d-1"),
    # ("net_ecosystem_exchange", "nee", "NEE_VUT_REF", "gC m-2 d-1"),  # Not used
    # ("ecosystem_respiration", "reco", "RECO_VUT_REF", "gC m-2 d-1"),  # Not used
    ("precipitation", "pr", "P_F", "mm d-1"),
    ("surface_air_temperature", "tas", "TA_F", "K"),
    # Radiation variables skipped for daily processing
    # ("surface_downward_shortwave_radiation", "rsds", "SW_IN_F", "W m-2"),
    # ("surface_upward_shortwave_radiation", "rsus", "SW_OUT", "W m-2"),
    # ("surface_downward_longwave_radiation", "rlds", "LW_IN_F", "W m-2"),
    # ("surface_upward_longwave_radiation", "rlus", "LW_OUT", "W m-2"),
]


def _site_from_filename(csvfile):
    """Extract the site code from an AmeriFlux CSV filename.

    ICs_2025_DD.csv     -> ICs   (site is the first underscore token)
    AMF_ICs_DD_2025.csv -> ICs   (skip the "AMF" prefix token)
    """
    parts = csvfile.stem.split("_")
    return parts[1] if parts[0] == "AMF" else parts[0]


def _parse_time(df, name):
    """Return *df* indexed by a parsed daily 'time' column, sorted ascending.

    Accepts either a TIMESTAMP or TIMESTAMP_START column in YYYYMMDD form.
    Raises ValueError if neither column is present.
    """
    if "TIMESTAMP" in df.columns:
        stamps = df["TIMESTAMP"]
    elif "TIMESTAMP_START" in df.columns:
        stamps = df["TIMESTAMP_START"]
    else:
        raise ValueError(f"No recognizable timestamp column in {name}")
    df["time"] = pd.to_datetime(stamps, format="%Y%m%d")
    return df.set_index("time").sort_index()


def main():
    """Read every *_DD*.csv in RAW_DIR and write one NetCDF file per variable.

    Each output file holds a (time, site) array for a single variable, with
    one column per site found in RAW_DIR.
    """
    data_by_var = {v[1]: [] for v in VARMAP}
    site_names = []
    time_index = None

    print("Reading daily CSV files...")

    for csvfile in sorted(RAW_DIR.glob("*_DD*.csv")):
        site = _site_from_filename(csvfile)
        print(f"  → {site}")

        # na_values maps the AmeriFlux -9999 missing-data sentinel to NaN so
        # gap days do not pollute the output as real measurements.
        df = pd.read_csv(csvfile, na_values=[MISSING_SENTINEL])
        df = _parse_time(df, csvfile.name)

        # NOTE(review): the time axis is taken from the FIRST file; every
        # later site is reindexed onto it (days outside that range are
        # silently dropped, days missing from a site become NaN). Confirm all
        # sites cover the same period, or switch to the union of all indices.
        if time_index is None:
            time_index = df.index

        # ----------------------------
        # Extract variables
        # ----------------------------
        for long_name, short, amf_col, units in VARMAP:
            if amf_col not in df.columns:
                print(f"     ⚠️ Missing {amf_col} at {site}, filling with NaNs")
                values = np.full(len(time_index), np.nan)
            else:
                values = df.reindex(time_index)[amf_col].to_numpy(dtype=float)
                if amf_col == "TA_F":
                    # AmeriFlux TA_F is reported in deg C; convert to Kelvin
                    # so the data agree with the declared "K" units attribute.
                    values = values + 273.15
            data_by_var[short].append(values)

        site_names.append(site)

    if not site_names:
        # Fail loudly with a clear message instead of np.vstack([]) raising
        # an opaque ValueError further down.
        raise SystemExit(f"No *_DD*.csv files found in {RAW_DIR}/ — nothing to convert.")

    # ----------------------------
    # Create NetCDF files by variable
    # ----------------------------
    print("\nCreating NetCDF files in _output/ ...")
    for long_name, short, amf_col, units in VARMAP:
        data_array = np.vstack(data_by_var[short]).T  # shape: (time, site)

        da = xr.DataArray(
            data_array,
            dims=("time", "site"),
            coords={
                "time": time_index,
                "site": site_names,
            },
            name=short,
            attrs={
                "long_name": long_name,
                "units": units,
                "source": "AmeriFlux daily _DD_ CSV",
                "product": "VUT_REF" if "VUT" in amf_col else "standard",
            },
        )

        outfile = OUTPUT_DIR / f"{short}_daily.nc"
        da.to_dataset().to_netcdf(outfile)
        print(f"  ✔ Saved {outfile}")

    print("\nAll done!")


if __name__ == "__main__":
    main()