From d5081d949da30d237f20de34cc8a9b8e1bb9fdd0 Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Mon, 25 Sep 2023 16:11:51 -0400 Subject: [PATCH 01/12] lazy save_to_netcdf --- xscen/io.py | 44 +++++++++++++++++++------------------------- 1 file changed, 19 insertions(+), 25 deletions(-) diff --git a/xscen/io.py b/xscen/io.py index 6bde0cc4..b859ae3f 100644 --- a/xscen/io.py +++ b/xscen/io.py @@ -274,12 +274,24 @@ def clean_incomplete(path: Union[str, os.PathLike], complete: Sequence[str]) -> sh.rmtree(fold) +def _coerce_attrs(attrs): + """Ensure no funky objects in attrs.""" + for k in list(attrs.keys()): + if not ( + isinstance(attrs[k], (str, float, int, np.ndarray)) + or isinstance(attrs[k], (tuple, list)) + and isinstance(attrs[k][0], (str, float, int)) + ): + attrs[k] = str(attrs[k]) + + @parse_config def save_to_netcdf( ds: xr.Dataset, filename: str, *, rechunk: Optional[dict] = None, + compute: bool = True, netcdf_kwargs: Optional[dict] = None, ) -> None: """Save a Dataset to NetCDF, rechunking if requested. @@ -295,6 +307,8 @@ def save_to_netcdf( Spatial dimensions can be generalized as 'X' and 'Y', which will be mapped to the actual grid type's dimension names. Rechunking is only done on *data* variables sharing dimensions with this argument. + compute : bool + Whether to start the computation or return a delayed object. netcdf_kwargs : dict, optional Additional arguments to send to_netcdf() @@ -317,21 +331,11 @@ def save_to_netcdf( netcdf_kwargs.setdefault("engine", "h5netcdf") netcdf_kwargs.setdefault("format", "NETCDF4") - # Ensure no funky objects in attrs: - def coerce_attrs(attrs): - for k in attrs.keys(): - if not ( - isinstance(attrs[k], (str, float, int, np.ndarray)) - or isinstance(attrs[k], (tuple, list)) - and isinstance(attrs[k][0], (str, float, int)) - ): - attrs[k] = str(attrs[k]) - - coerce_attrs(ds.attrs) + _coerce_attrs(ds.attrs) for var in ds.variables.values(): - coerce_attrs(var.attrs) + _coerce_attrs(var.attrs) - ds.to_netcdf(filename, **netcdf_kwargs) + return ds.to_netcdf(filename, compute=compute, **netcdf_kwargs) @parse_config @@ -438,19 +442,9 @@ def _skip(var): if len(ds.data_vars) == 0: return None - # Ensure no funky objects in attrs: - def coerce_attrs(attrs): - for k in list(attrs.keys()): - if not ( - isinstance(attrs[k], (str, float, int, np.ndarray)) - or isinstance(attrs[k], (tuple, list)) - and isinstance(attrs[k][0], (str, float, int)) - ): - attrs[k] = str(attrs[k]) - - coerce_attrs(ds.attrs) + _coerce_attrs(ds.attrs) for var in ds.variables.values(): - coerce_attrs(var.attrs) + _coerce_attrs(var.attrs) if itervar: zarr_kwargs["compute"] = True From ba8d219f770a9cb99cee06cce59f213209966f00 Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Tue, 26 Sep 2023 16:28:52 -0400 Subject: [PATCH 02/12] Complete version --- xscen/io.py | 224 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 223 insertions(+), 1 deletion(-) diff --git a/xscen/io.py b/xscen/io.py index b859ae3f..06e0575d 100644 --- a/xscen/io.py +++ b/xscen/io.py @@ -4,11 +4,12 @@ import shutil as sh from collections.abc import Sequence from pathlib import Path -from typing import Optional, Union +from typing import List, Optional, Union import h5py import netCDF4 import numpy as np +import pandas as pd import xarray as xr import zarr from rechunker import rechunk as _rechunk @@ -487,6 +488,227 @@ def _skip(var): raise +def _to_dataframe( + data: xr.DataArray, + index: list[str], + column: list[str], + coords: list[str], + coords_dims: dict, +): + """Convert a DataArray to a DataFrame with support for MultiColumn.""" + df = data.to_dataframe() + if not column: + # Fast track for the easy case where xarray's default is already what we want. + return df + df_data = ( + df[[data.name]] + .reset_index() + .pivot(index=index, columns=column) + .droplevel(None, axis=1) + ) + dfs = [] + for v in coords: + drop_cols = [c for c in column if c not in coords_dims[v]] + cols = [c for c in column if c in coords_dims[v]] + dfc = ( + df[[v]] + .reset_index() + .drop(columns=drop_cols) + .pivot(index=index, columns=cols) + ) + cols = dfc.columns + # The "None" level has the aux coord name we want it either at the same level as variable, or at lowest missing level otherwise. + varname_lvl = "variable" if "variable" in drop_cols else drop_cols[-1] + cols = cols.rename( + varname_lvl + if not isinstance(cols, pd.MultiIndex) + else [nm or varname_lvl for nm in cols.name] + ) + if isinstance(df_data.columns, pd.MultiIndex) or isinstance( + cols, pd.MultiIndex + ): + # handle different depth of multicolumns, expand MultiCol of coord with None for missing levels. + cols = pd.MultiIndex.from_arrays( + [ + cols.get_level_values(lvl) if lvl in cols.names else [None] + for lvl in df_data.columns.names + ] + ) + dfc.columns = cols + dfs.append( + dfc[~dfc.index.duplicated()] + ) # We dropped columns thus the index is not unique anymore + dfs.append(df_data) + return pd.concat(dfs, axis=1) + + +def to_table( + ds: Union[xr.Dataset, xr.DataArray], + *, + index: Union[None, str, Sequence[str]] = None, + column: Union[None, str, Sequence[str]] = None, + sheet: Union[None, str, Sequence[str]] = None, + coords: Union[bool, Sequence[str]] = True, +) -> Union[pd.DataFrame, dict]: + """Convert a dataset to a pandas DataFrame with support for multicolumns and multisheet. + + This function will trigger a computation of the dataset. + + Parameters + ---------- + ds : xr.Dataset or xr.DataArray + Dataset or DataArray to be saved. + If a Dataset with more than one variable is given, the dimension "variable" + must appear in one of `index`, `column` or `sheet`. + index : str or sequence of str, optional + Name of the dimension(s) to use as index. + Default is all data dimensions. + column : str or sequence of str, optional + Name of the dimension(s) to use as index. + Default is "variable", i.e. the name of the variable(s). + sheet : str or sequence of str, optional + Name of the dimension(s) to use as sheet names. + coords: bool or sequence of str + A list of auxiliary coordinates to add to the columns (as would variables). + If True, all (if any) are added. + + Returns + ------- + pd.DataFrame or dict + DataFrame with a MultiIndex with levels `index` and MultiColumn with levels `column`. + If `sheet` is given, the output is dictionary with keys for each unique "sheet" dimensions tuple, values are DataFrames. + """ + if isinstance(ds, xr.Dataset): + da = ds.to_array(name="data") + if len(ds) == 1: + da = da.isel(variable=0).rename(data=da.variable.values[0]) + + def _ensure_list(seq): + if isinstance(seq, str): + return [seq] + return list(seq) + + index = _ensure_list(index or (set(da.dims) - {"variable"})) + column = _ensure_list(column or (["variable"] if len(ds) > 1 else [])) + sheet = _ensure_list(sheet or []) + + needed_dims = index + column + sheet + if len(set(needed_dims)) != len(needed_dims): + raise ValueError( + f"Repeated dimension names. Got index={index}, column={column} and sheet={sheet}." + "Each dimension should appear only once." + ) + if set(needed_dims) != set(da.dims): + raise ValueError( + f"Passed index, column and sheet do not match available dimensions. Got {needed_dims}, data has {da.dims}." + ) + + coords = coords or [] + if coords is not True: + drop = set(ds.coords.keys()) - set(da.dims) - set(coords) + da = da.drop_vars(drop) + else: + coords = list(set(ds.coords.keys()) - set(da.dims)) + if len(coords) > 1 and "variable" in index: + raise NotImplementedError( + "Keeping auxiliary coords is implemented when 'variable' is in the index." + ) + + table_kwargs = dict( + index=index, + column=column, + coords=coords, + coords_dims={c: ds[c].dims for c in coords}, + ) + if sheet: + out = {} + das = da.stack(sheet=sheet) + for elem in das.sheet: + out[elem.item()] = _to_dataframe( + das.sel(sheet=elem, drop=True), **table_kwargs + ) + return out + return _to_dataframe(da, **table_kwargs) + + +TABLE_FORMATS = {".csv": "csv", ".xls": "excel", ".xlsx": "excel"} + + +def save_to_table( + ds: Union[xr.Dataset, xr.DataArray], + filename: str, + output_format: Optional[str] = None, + *, + index: Union[None, str, Sequence[str]] = None, + column: Union[None, str, Sequence[str]] = "variable", + sheet: Union[None, str, Sequence[str]] = None, + coords: Union[bool, Sequence[str]] = True, + sep: str = "_", + **kwargs, +): + """Save the dataset to a tabular file (csv, excel, ...). + + This function will trigger a computation of the dataset. + + Parameters + ---------- + ds : xr.Dataset or xr.DataArray + Dataset or DataArray to be saved. + If a Dataset with more than one variable is given, the dimension "variable" + must appear in one of `index`, `column` or `sheet`. + filename : str + Name of the file to be saved. + output_format: {'csv', 'excel', ...}, optional + The output format. If None (default), it is inferred + from the extension of `filename`. Not all possible output format are supported for inference. + Valid values are any that matches a :py:class:`pandas.DataFrame` method like "df.to_{format}". + index : str or sequence of str, optional + Name of the dimension(s) to use as index. + Default is all data dimensions. + column : str or sequence of str, optional + Name of the dimension(s) to use as index. + Default is "variable", i.e. the name of the variable(s). + sheet : str or sequence of str, optional + Name of the dimension(s) to use as sheet names. + Only valid if the output format is excel. + coords: bool or sequence of str + A list of auxiliary coordinates to add to the columns (as would variables). + If True, all (if any) are added. + sep : str + For output formats other than excel and for sheet names, + index, column and sheet names from multiple dimensions are + constructed by concatenating values with this separator. + kwargs: + Other arguments passed to the panda function. + """ + filename = Path(filename) + + if output_format is None: + output_format = TABLE_FORMATS.get(filename.suffix) + if output_format is None: + raise ValueError( + f"Output format could not be inferred from filename {filename.name}. Please pass `output_format`." + ) + + if sheet is not None and output_format != "excel": + raise ValueError( + f"Argument `sheet` is only valid with excel as the output format. Got {output_format}." + ) + + out = to_table(ds, index=index, column=column, sheet=sheet, coords=coords) + + if sheet: + with pd.ExcelWriter(filename, engine=kwargs.get("engine")) as writer: + for sheet_name, df in out.items(): + df.to_excel(writer, sheet_name=sep.join(sheet_name), **kwargs) + else: + if isinstance(out.columns, pd.MultiIndex): + out.columns = out.columns.map(lambda lvls: sep.join(map(str, lvls))) + if isinstance(out.index, pd.MultiIndex): + out.index = out.index.map(lambda lvls: sep.join(map(str, lvls))) + getattr(out, f"to_{output_format}")(filename, **kwargs) + + def rechunk_for_saving(ds, rechunk): """Rechunk before saving to .zarr or .nc, generalized as Y/X for different axes lat/lon, rlat/rlon. From e3c9fd81ab213443ea435543ff58d3ad4b456510 Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Tue, 26 Sep 2023 16:30:21 -0400 Subject: [PATCH 03/12] upd hist --- HISTORY.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/HISTORY.rst b/HISTORY.rst index 56c6c41b..75b1ccef 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -15,6 +15,7 @@ New features and enhancements * Added the ability to search for simulations that reach a given warming level. (:pull:`251`). * ``xs.spatial_mean`` now accepts the ``region="global"`` keyword to perform a global average (:issue:`94`, :pull:`260`). * ``xs.spatial_mean`` with ``method='xESMF'`` will also automatically segmentize polygons (down to a 1° resolution) to ensure a correct average (:pull:`260`). +* ``xs.io.save_to_table`` and ``xs.io.to_table`` to transform datasets and arrays to DataFrames, but with support for multi-columns and multi-sheets. Breaking changes ^^^^^^^^^^^^^^^^ From ed97cd81476befb20e2ae1b3461378b94803d0ac Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Thu, 28 Sep 2023 11:52:03 -0400 Subject: [PATCH 04/12] Add a test - season sort key --- tests/test_io.py | 47 +++++++++++++++++++++++++++++++++++++++++++++++ xscen/io.py | 16 ++++++++++------ xscen/utils.py | 36 ++++++++++++++++++++++++++++++++++++ 3 files changed, 93 insertions(+), 6 deletions(-) diff --git a/tests/test_io.py b/tests/test_io.py index 6fa6efaa..ef9359f3 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -1,5 +1,6 @@ import numpy as np import pytest +import xarray as xr import xscen as xs @@ -66,3 +67,49 @@ def test_variables(self, datablock_3d): for v in ds_ch.data_vars: for dim, chunks in zip(list(ds.dims), ds_ch[v].chunks): assert chunks[0] == new_chunks[v][dim] + + +class TestToTable: + ds = xs.utils.unstack_dates( + xr.merge( + [ + xs.testing.datablock_3d( + np.random.random_sample((20, 3, 2)), + v, + "lon", + 0, + "lat", + 0, + 1, + 1, + "1993-01-01", + "QS-JAN", + ) + for v in ["tas", "pr", "snw"] + ] + ) + .stack(site=["lat", "lon"]) + .reset_index("site") + .assign_coords(site=list("abcdef")) + ).transpose("season", "time", "site") + + def test_normal(self): + # Default + tab = xs.io.to_table(self.ds) + assert tab.shape == (120, 5) # 3 vars + 2 aux coords + assert tab.columns.names == ["variable"] + assert tab.index.names == ["season", "time", "site"] + # Season order is chronological, rather than alphabetical + assert tab.xs("1993", level="time").xs( + "a", level="site" + ).index.get_level_values("season") == ["JFM", "AMJ", "JAS", "OND"] + + # Variable in the index, thus no coords + tab = xs.io.to_table( + self.ds, index=["time", "variable"], column=["season", "site"], coords=False + ) + assert tab.shape == (15, 24) + assert tab.columns.names == ["season", "site"] + np.testing.assert_array_equal( + tab.loc[("1993", "pr"), ("JFM",)], self.ds.pr.sel(time="1993", season="JFM") + ) diff --git a/xscen/io.py b/xscen/io.py index 06e0575d..11cb8d2f 100644 --- a/xscen/io.py +++ b/xscen/io.py @@ -4,7 +4,7 @@ import shutil as sh from collections.abc import Sequence from pathlib import Path -from typing import List, Optional, Union +from typing import Optional, Union import h5py import netCDF4 @@ -17,7 +17,7 @@ from .config import parse_config from .scripting import TimeoutException -from .utils import translate_time_chunk +from .utils import season_sort_key, translate_time_chunk logger = logging.getLogger(__name__) @@ -27,10 +27,12 @@ "estimate_chunks", "get_engine", "rechunk", + "rechunk_for_saving", + "save_to_table", "save_to_netcdf", "save_to_zarr", "subset_maxsize", - "rechunk_for_saving", + "to_table", ] @@ -532,14 +534,15 @@ def _to_dataframe( [ cols.get_level_values(lvl) if lvl in cols.names else [None] for lvl in df_data.columns.names - ] + ], + names=df_data.columns.names, ) dfc.columns = cols dfs.append( dfc[~dfc.index.duplicated()] ) # We dropped columns thus the index is not unique anymore dfs.append(df_data) - return pd.concat(dfs, axis=1) + return pd.concat(dfs, axis=1).sort_index(level=index, key=season_sort_key) def to_table( @@ -577,6 +580,7 @@ def to_table( pd.DataFrame or dict DataFrame with a MultiIndex with levels `index` and MultiColumn with levels `column`. If `sheet` is given, the output is dictionary with keys for each unique "sheet" dimensions tuple, values are DataFrames. + The DataFrames are always sorted with level priority as given in `index` and in ascending order,. """ if isinstance(ds, xr.Dataset): da = ds.to_array(name="data") @@ -611,7 +615,7 @@ def _ensure_list(seq): coords = list(set(ds.coords.keys()) - set(da.dims)) if len(coords) > 1 and "variable" in index: raise NotImplementedError( - "Keeping auxiliary coords is implemented when 'variable' is in the index." + "Keeping auxiliary coords is not implemented when 'variable' is in the index. Pass `coords=False` or put 'variable' in `column` instead." ) table_kwargs = dict( diff --git a/xscen/utils.py b/xscen/utils.py index cf1e2571..0ad5a790 100644 --- a/xscen/utils.py +++ b/xscen/utils.py @@ -1238,3 +1238,39 @@ def standardize_periods(periods, multiple=True): f"'period' should be a single instance of [start, end], received {len(periods)}." ) return periods[0] + + +def season_sort_key(idx: pd.Index, name: str = None): + """Get a proper sort key for a "season" or "month" index to avoid alphabetical sorting. + + If any of the values in the index is not recognized as a 3-letter + season code or a 3-letter month abbreviation, the operation is + aborted and the index is returned untouched. + DJF is the first season of the year. + + Parameters + ---------- + idx : pd.Index + Any array that implements a `map` method. + If name is "month", index elements are expected to be 3-letter month abbreviations, uppercase (JAN, FEB, etc). + If name is "season", index elements are expected to be 3-letter season abbreviations, uppercase (DJF, AMJ, OND, etc.) + If anything else, the index is returned untouched. + name : str, optional + The index name. By default, the `name` attribute of the index is used, if present. + + Returns + ------- + idx : Integer sort key for months and seasons, the input index untouched otherwise. + """ + try: + if (name or getattr(idx, "name", None)) == "season": + m = "DJFMAMJJASONDJ" + return idx.map(m.index) + if (name or getattr(idx, "name", None)) == "month": + m = list(xr.coding.cftime_offsets._MONTH_ABBREVIATIONS.values()) + return idx.map(m.index) + except (ValueError, TypeError): + # ValueError if string not in seasons, or value not in months + # TypeError if season element was not a string. + pass + return idx From 3d47138b20ef10a12e416425bfc7d5899c6eaf60 Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Thu, 28 Sep 2023 12:25:31 -0400 Subject: [PATCH 05/12] Rename index to row to avoid name collision with pandas args --- tests/test_io.py | 2 +- xscen/io.py | 80 ++++++++++++++++++++++++++---------------------- 2 files changed, 44 insertions(+), 38 deletions(-) diff --git a/tests/test_io.py b/tests/test_io.py index ef9359f3..5b7f3d1a 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -106,7 +106,7 @@ def test_normal(self): # Variable in the index, thus no coords tab = xs.io.to_table( - self.ds, index=["time", "variable"], column=["season", "site"], coords=False + self.ds, row=["time", "variable"], column=["season", "site"], coords=False ) assert tab.shape == (15, 24) assert tab.columns.names == ["season", "site"] diff --git a/xscen/io.py b/xscen/io.py index 11cb8d2f..dcf163e2 100644 --- a/xscen/io.py +++ b/xscen/io.py @@ -492,7 +492,7 @@ def _skip(var): def _to_dataframe( data: xr.DataArray, - index: list[str], + row: list[str], column: list[str], coords: list[str], coords_dims: dict, @@ -505,7 +505,7 @@ def _to_dataframe( df_data = ( df[[data.name]] .reset_index() - .pivot(index=index, columns=column) + .pivot(index=row, columns=column) .droplevel(None, axis=1) ) dfs = [] @@ -513,10 +513,7 @@ def _to_dataframe( drop_cols = [c for c in column if c not in coords_dims[v]] cols = [c for c in column if c in coords_dims[v]] dfc = ( - df[[v]] - .reset_index() - .drop(columns=drop_cols) - .pivot(index=index, columns=cols) + df[[v]].reset_index().drop(columns=drop_cols).pivot(index=row, columns=cols) ) cols = dfc.columns # The "None" level has the aux coord name we want it either at the same level as variable, or at lowest missing level otherwise. @@ -542,13 +539,13 @@ def _to_dataframe( dfc[~dfc.index.duplicated()] ) # We dropped columns thus the index is not unique anymore dfs.append(df_data) - return pd.concat(dfs, axis=1).sort_index(level=index, key=season_sort_key) + return pd.concat(dfs, axis=1).sort_index(level=row, key=season_sort_key) def to_table( ds: Union[xr.Dataset, xr.DataArray], *, - index: Union[None, str, Sequence[str]] = None, + row: Union[None, str, Sequence[str]] = None, column: Union[None, str, Sequence[str]] = None, sheet: Union[None, str, Sequence[str]] = None, coords: Union[bool, Sequence[str]] = True, @@ -562,12 +559,12 @@ def to_table( ds : xr.Dataset or xr.DataArray Dataset or DataArray to be saved. If a Dataset with more than one variable is given, the dimension "variable" - must appear in one of `index`, `column` or `sheet`. - index : str or sequence of str, optional - Name of the dimension(s) to use as index. + must appear in one of `row`, `column` or `sheet`. + row : str or sequence of str, optional + Name of the dimension(s) to use as indexes (rows). Default is all data dimensions. column : str or sequence of str, optional - Name of the dimension(s) to use as index. + Name of the dimension(s) to use as columns. Default is "variable", i.e. the name of the variable(s). sheet : str or sequence of str, optional Name of the dimension(s) to use as sheet names. @@ -578,9 +575,9 @@ def to_table( Returns ------- pd.DataFrame or dict - DataFrame with a MultiIndex with levels `index` and MultiColumn with levels `column`. + DataFrame with a MultiIndex with levels `row` and MultiColumn with levels `column`. If `sheet` is given, the output is dictionary with keys for each unique "sheet" dimensions tuple, values are DataFrames. - The DataFrames are always sorted with level priority as given in `index` and in ascending order,. + The DataFrames are always sorted with level priority as given in `row` and in ascending order,. """ if isinstance(ds, xr.Dataset): da = ds.to_array(name="data") @@ -592,19 +589,19 @@ def _ensure_list(seq): return [seq] return list(seq) - index = _ensure_list(index or (set(da.dims) - {"variable"})) + row = _ensure_list(row or (set(da.dims) - {"variable"})) column = _ensure_list(column or (["variable"] if len(ds) > 1 else [])) sheet = _ensure_list(sheet or []) - needed_dims = index + column + sheet + needed_dims = row + column + sheet if len(set(needed_dims)) != len(needed_dims): raise ValueError( - f"Repeated dimension names. Got index={index}, column={column} and sheet={sheet}." + f"Repeated dimension names. Got row={row}, column={column} and sheet={sheet}." "Each dimension should appear only once." ) if set(needed_dims) != set(da.dims): raise ValueError( - f"Passed index, column and sheet do not match available dimensions. Got {needed_dims}, data has {da.dims}." + f"Passed row, column and sheet do not match available dimensions. Got {needed_dims}, data has {da.dims}." ) coords = coords or [] @@ -613,13 +610,14 @@ def _ensure_list(seq): da = da.drop_vars(drop) else: coords = list(set(ds.coords.keys()) - set(da.dims)) - if len(coords) > 1 and "variable" in index: + if len(coords) > 1 and ("variable" in row or "variable" in sheet): raise NotImplementedError( - "Keeping auxiliary coords is not implemented when 'variable' is in the index. Pass `coords=False` or put 'variable' in `column` instead." + "Keeping auxiliary coords is not implemented when 'variable' is in the row or in the sheets." + "Pass `coords=False` or put 'variable' in `column` instead." ) table_kwargs = dict( - index=index, + row=row, column=column, coords=coords, coords_dims={c: ds[c].dims for c in coords}, @@ -643,11 +641,12 @@ def save_to_table( filename: str, output_format: Optional[str] = None, *, - index: Union[None, str, Sequence[str]] = None, + row: Union[None, str, Sequence[str]] = None, column: Union[None, str, Sequence[str]] = "variable", sheet: Union[None, str, Sequence[str]] = None, coords: Union[bool, Sequence[str]] = True, - sep: str = "_", + col_sep: str = "_", + row_sep: str = None, **kwargs, ): """Save the dataset to a tabular file (csv, excel, ...). @@ -659,18 +658,18 @@ def save_to_table( ds : xr.Dataset or xr.DataArray Dataset or DataArray to be saved. If a Dataset with more than one variable is given, the dimension "variable" - must appear in one of `index`, `column` or `sheet`. + must appear in one of `row`, `column` or `sheet`. filename : str Name of the file to be saved. output_format: {'csv', 'excel', ...}, optional The output format. If None (default), it is inferred from the extension of `filename`. Not all possible output format are supported for inference. Valid values are any that matches a :py:class:`pandas.DataFrame` method like "df.to_{format}". - index : str or sequence of str, optional - Name of the dimension(s) to use as index. + row : str or sequence of str, optional + Name of the dimension(s) to use as indexes (rows). Default is all data dimensions. column : str or sequence of str, optional - Name of the dimension(s) to use as index. + Name of the dimension(s) to use as columns. Default is "variable", i.e. the name of the variable(s). sheet : str or sequence of str, optional Name of the dimension(s) to use as sheet names. @@ -678,10 +677,11 @@ def save_to_table( coords: bool or sequence of str A list of auxiliary coordinates to add to the columns (as would variables). If True, all (if any) are added. - sep : str - For output formats other than excel and for sheet names, - index, column and sheet names from multiple dimensions are - constructed by concatenating values with this separator. + col_sep : str, + Multi-columns (except in excel) and sheet names are concatenated with this separator. + row_sep : str, optional + Multi-index names are concatenated with this separator, except in excel. + If None (default), each level is written in its own column. kwargs: Other arguments passed to the panda function. """ @@ -699,17 +699,23 @@ def save_to_table( f"Argument `sheet` is only valid with excel as the output format. Got {output_format}." ) - out = to_table(ds, index=index, column=column, sheet=sheet, coords=coords) + out = to_table(ds, row=row, column=column, sheet=sheet, coords=coords) if sheet: with pd.ExcelWriter(filename, engine=kwargs.get("engine")) as writer: for sheet_name, df in out.items(): - df.to_excel(writer, sheet_name=sep.join(sheet_name), **kwargs) + df.to_excel(writer, sheet_name=col_sep.join(sheet_name), **kwargs) else: - if isinstance(out.columns, pd.MultiIndex): - out.columns = out.columns.map(lambda lvls: sep.join(map(str, lvls))) - if isinstance(out.index, pd.MultiIndex): - out.index = out.index.map(lambda lvls: sep.join(map(str, lvls))) + if output_format != "excel" and isinstance(out.columns, pd.MultiIndex): + out.columns = out.columns.map(lambda lvls: col_sep.join(map(str, lvls))) + if ( + output_format != "excel" + and row_sep is not None + and isinstance(out.index, pd.MultiIndex) + ): + new_name = row_sep.join(out.index.names) + out.index = out.index.map(lambda lvls: row_sep.join(map(str, lvls))) + out.index.name = new_name getattr(out, f"to_{output_format}")(filename, **kwargs) From 951a694319bb6fae6eb3f68fe249ff355ca34e4c Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Thu, 28 Sep 2023 15:09:28 -0400 Subject: [PATCH 06/12] keep order --- xscen/io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xscen/io.py b/xscen/io.py index dcf163e2..d4c007ea 100644 --- a/xscen/io.py +++ b/xscen/io.py @@ -589,7 +589,7 @@ def _ensure_list(seq): return [seq] return list(seq) - row = _ensure_list(row or (set(da.dims) - {"variable"})) + row = _ensure_list(row or (list(da.dims) - {"variable"})) column = _ensure_list(column or (["variable"] if len(ds) > 1 else [])) sheet = _ensure_list(sheet or []) From c0df03c20d5264a0450540ac7a508e92afd9ca1e Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Thu, 28 Sep 2023 15:26:44 -0400 Subject: [PATCH 07/12] coding while in a meeting is impolite --- xscen/io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xscen/io.py b/xscen/io.py index d4c007ea..c090c374 100644 --- a/xscen/io.py +++ b/xscen/io.py @@ -589,7 +589,7 @@ def _ensure_list(seq): return [seq] return list(seq) - row = _ensure_list(row or (list(da.dims) - {"variable"})) + row = _ensure_list(row or ([d for d in da.dims if d != "variable"])) column = _ensure_list(column or (["variable"] if len(ds) > 1 else [])) sheet = _ensure_list(sheet or []) From 99df3e2e080a2a45801cd9f9d7db3b8160dbc709 Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Thu, 28 Sep 2023 15:35:24 -0400 Subject: [PATCH 08/12] fix test --- tests/test_io.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/test_io.py b/tests/test_io.py index 5b7f3d1a..c8083246 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -100,9 +100,12 @@ def test_normal(self): assert tab.columns.names == ["variable"] assert tab.index.names == ["season", "time", "site"] # Season order is chronological, rather than alphabetical - assert tab.xs("1993", level="time").xs( - "a", level="site" - ).index.get_level_values("season") == ["JFM", "AMJ", "JAS", "OND"] + np.testing.assert_array_equal( + tab.xs("1993", level="time") + .xs("a", level="site") + .index.get_level_values("season"), + ["JFM", "AMJ", "JAS", "OND"], + ) # Variable in the index, thus no coords tab = xs.io.to_table( From 9dbb8af7caae13dcc5cbedaf8d67fef20b0a19ed Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Fri, 29 Sep 2023 11:56:25 -0400 Subject: [PATCH 09/12] Add simple toc generation --- xscen/data/fr/LC_MESSAGES/xscen.po | 26 +++++++++++--- xscen/io.py | 58 +++++++++++++++++++++++++++--- 2 files changed, 75 insertions(+), 9 deletions(-) diff --git a/xscen/data/fr/LC_MESSAGES/xscen.po b/xscen/data/fr/LC_MESSAGES/xscen.po index 34f5a8cd..e7d76e0a 100644 --- a/xscen/data/fr/LC_MESSAGES/xscen.po +++ b/xscen/data/fr/LC_MESSAGES/xscen.po @@ -7,7 +7,7 @@ msgid "" msgstr "" "Project-Id-Version: xscen 0.6.18b0\n" "Report-Msgid-Bugs-To: Rondeau-Genesse.Gabriel@ouranos.ca\n" -"POT-Creation-Date: 2023-09-08 15:17-0400\n" +"POT-Creation-Date: 2023-09-29 11:45-0400\n" "PO-Revision-Date: 2023-08-15 16:48-0400\n" "Last-Translator: Pascal Bourgault \n" "Language: fr\n" @@ -18,18 +18,34 @@ msgstr "" "Content-Transfer-Encoding: 8bit\n" "Generated-By: Babel 2.12.1\n" -#: xscen/aggregate.py:184 +#: xscen/aggregate.py:185 msgid "{window}-year mean of {attr}." msgstr "Moyenne {window} ans de {attr}." -#: xscen/aggregate.py:318 +#: xscen/aggregate.py:319 msgid "{attr1}: {kind} delta compared to {refhoriz}." msgstr "{attr1}: Delta {kind} comparé à {refhoriz}." -#: xscen/diagnostics.py:500 +#: xscen/diagnostics.py:501 msgid "Ranking of measure performance" msgstr "Classement de performance de la mesure" -#: xscen/diagnostics.py:559 +#: xscen/diagnostics.py:560 msgid "Fraction of improved grid cells" msgstr "Fraction de points de grille améliorés" + +#: xscen/io.py:650 +msgid "Variable" +msgstr "Variable" + +#: xscen/io.py:650 +msgid "Description" +msgstr "Description" + +#: xscen/io.py:650 +msgid "Units" +msgstr "Unités" + +#: xscen/io.py:654 +msgid "Content" +msgstr "Contenu" diff --git a/xscen/io.py b/xscen/io.py index c090c374..8ae86f9b 100644 --- a/xscen/io.py +++ b/xscen/io.py @@ -14,10 +14,12 @@ import zarr from rechunker import rechunk as _rechunk from xclim.core.calendar import get_calendar +from xclim.core.options import METADATA_LOCALES +from xclim.core.options import OPTIONS as XC_OPTIONS from .config import parse_config from .scripting import TimeoutException -from .utils import season_sort_key, translate_time_chunk +from .utils import TRANSLATOR, season_sort_key, translate_time_chunk logger = logging.getLogger(__name__) @@ -26,6 +28,7 @@ "clean_incomplete", "estimate_chunks", "get_engine", + "make_toc", "rechunk", "rechunk_for_saving", "save_to_table", @@ -633,6 +636,36 @@ def _ensure_list(seq): return _to_dataframe(da, **table_kwargs) +def make_toc(ds: Union[xr.Dataset, xr.DataArray], loc: str = None) -> pd.DataFrame: + """Make a table of content describing a dataset's variables. + + This return a simple DataFrame with variable names as index, the long_name as "description" and units. + Column names and long names are taken from the activated locale if found, otherwise the english version is taken. + """ + if loc is None: + loc = (XC_OPTIONS[METADATA_LOCALES] or ["en"])[0] + locsuf = "" if loc == "en" else f"_{loc}" + _ = TRANSLATOR[loc] # Combine translation and gettext parsing (like it usually is) + + if isinstance(ds, xr.DataArray): + ds = ds.to_dataset() + + toc = pd.DataFrame.from_records( + [ + { + _("Variable"): vv, + _("Description"): da.attrs.get( + f"long_name{locsuf}", da.attrs.get("long_name") + ), + _("Units"): da.attrs.get("units"), + } + for vv, da in ds.data_vars.items() + ], + ).set_index(_("Variable")) + toc.attrs["name"] = _("Content") + return toc + + TABLE_FORMATS = {".csv": "csv", ".xls": "excel", ".xlsx": "excel"} @@ -647,6 +680,7 @@ def save_to_table( coords: Union[bool, Sequence[str]] = True, col_sep: str = "_", row_sep: str = None, + add_toc: Union[bool, pd.DataFrame] = False, **kwargs, ): """Save the dataset to a tabular file (csv, excel, ...). @@ -682,8 +716,13 @@ def save_to_table( row_sep : str, optional Multi-index names are concatenated with this separator, except in excel. If None (default), each level is written in its own column. + add_toc : bool or DataFrame + A table of content to add as the first sheet. Only valid if the output format is excel. + If True, :py:func:`make_toc` is used to generate the toc. + The sheet name of the toc can be given through the "name" attribute of the DataFrame, otherwise "Content" is used. kwargs: - Other arguments passed to the panda function. + Other arguments passed to the pandas function. + If the output format is excel and multiple sheets are requested, "engine" will be passed to :py:class:`pandas.ExcelWriter`. """ filename = Path(filename) @@ -698,11 +737,22 @@ def save_to_table( raise ValueError( f"Argument `sheet` is only valid with excel as the output format. Got {output_format}." ) + if add_toc is not False and output_format != "excel": + raise ValueError( + f"A TOC was requested, but the output format is not Excel. Got {output_format}." + ) out = to_table(ds, row=row, column=column, sheet=sheet, coords=coords) - if sheet: - with pd.ExcelWriter(filename, engine=kwargs.get("engine")) as writer: + if add_toc is not False: + if not sheet: + out = {("data",): out} + if add_toc is True: + add_toc = make_toc(ds) + out = {(add_toc.attrs.get("name", "Content"),): add_toc, **out} + + if sheet or (add_toc is not False): + with pd.ExcelWriter(filename, engine=kwargs.pop("engine", None)) as writer: for sheet_name, df in out.items(): df.to_excel(writer, sheet_name=col_sep.join(sheet_name), **kwargs) else: From 30254590e5a51ac49e8a79d8f4698bcb88ede847 Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Fri, 29 Sep 2023 12:24:50 -0400 Subject: [PATCH 10/12] Better dim guessing - get engine kwargs from kwargs --- xscen/io.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/xscen/io.py b/xscen/io.py index 8ae86f9b..5bb876ae 100644 --- a/xscen/io.py +++ b/xscen/io.py @@ -3,6 +3,7 @@ import os import shutil as sh from collections.abc import Sequence +from inspect import signature from pathlib import Path from typing import Optional, Union @@ -592,9 +593,18 @@ def _ensure_list(seq): return [seq] return list(seq) - row = _ensure_list(row or ([d for d in da.dims if d != "variable"])) - column = _ensure_list(column or (["variable"] if len(ds) > 1 else [])) - sheet = _ensure_list(sheet or []) + passed_dims = set().union( + _ensure_list(row or []), _ensure_list(column or []), _ensure_list(sheet or []) + ) + if row is None: + row = [d for d in da.dims if d != "variable" and d not in passed_dims] + row = _ensure_list(row) + if column is None: + column = ["variable"] if len(ds) > 1 and "variable" not in passed_dims else [] + column = _ensure_list(column) + if sheet is None: + sheet = [] + sheet = _ensure_list(sheet) needed_dims = row + column + sheet if len(set(needed_dims)) != len(needed_dims): @@ -722,7 +732,7 @@ def save_to_table( The sheet name of the toc can be given through the "name" attribute of the DataFrame, otherwise "Content" is used. kwargs: Other arguments passed to the pandas function. - If the output format is excel and multiple sheets are requested, "engine" will be passed to :py:class:`pandas.ExcelWriter`. + If the output format is excel, kwargs to :py:class:`pandas.ExcelWriter` can be given here as well. """ filename = Path(filename) @@ -752,7 +762,12 @@ def save_to_table( out = {(add_toc.attrs.get("name", "Content"),): add_toc, **out} if sheet or (add_toc is not False): - with pd.ExcelWriter(filename, engine=kwargs.pop("engine", None)) as writer: + engine_kwargs = {} # Extract engine kwargs + for arg in signature(pd.ExcelWriter).parameters: + if arg in kwargs: + engine_kwargs[arg] = kwargs.pop(arg) + + with pd.ExcelWriter(filename, **engine_kwargs) as writer: for sheet_name, df in out.items(): df.to_excel(writer, sheet_name=col_sep.join(sheet_name), **kwargs) else: From 4383bc8dfb689b2e580382f51ed7c59a11b69e06 Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Fri, 29 Sep 2023 14:15:30 -0400 Subject: [PATCH 11/12] Add save_to_table to top pkg - more details in hist --- HISTORY.rst | 2 +- xscen/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/HISTORY.rst b/HISTORY.rst index 75b1ccef..24b8f090 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -15,7 +15,7 @@ New features and enhancements * Added the ability to search for simulations that reach a given warming level. (:pull:`251`). * ``xs.spatial_mean`` now accepts the ``region="global"`` keyword to perform a global average (:issue:`94`, :pull:`260`). * ``xs.spatial_mean`` with ``method='xESMF'`` will also automatically segmentize polygons (down to a 1° resolution) to ensure a correct average (:pull:`260`). -* ``xs.io.save_to_table`` and ``xs.io.to_table`` to transform datasets and arrays to DataFrames, but with support for multi-columns and multi-sheets. +* ``xs.save_to_table`` and ``xs.io.to_table`` to transform datasets and arrays to DataFrames, but with support for multi-columns, multi-sheets and localized table of content generation. Breaking changes ^^^^^^^^^^^^^^^^ diff --git a/xscen/__init__.py b/xscen/__init__.py index 6d8b718e..002f2a61 100644 --- a/xscen/__init__.py +++ b/xscen/__init__.py @@ -36,7 +36,7 @@ subset_warming_level, ) from .indicators import compute_indicators # noqa -from .io import save_to_netcdf, save_to_zarr # noqa +from .io import save_to_netcdf, save_to_table, save_to_zarr # noqa from .reduce import build_reduction_data, reduce_ensemble from .regrid import * from .scripting import ( From db8786c513eccedde7251799d715d7f3d2223a69 Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Mon, 2 Oct 2023 11:35:40 -0400 Subject: [PATCH 12/12] All single string in `coords` --- xscen/io.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/xscen/io.py b/xscen/io.py index 5bb876ae..87235767 100644 --- a/xscen/io.py +++ b/xscen/io.py @@ -552,7 +552,7 @@ def to_table( row: Union[None, str, Sequence[str]] = None, column: Union[None, str, Sequence[str]] = None, sheet: Union[None, str, Sequence[str]] = None, - coords: Union[bool, Sequence[str]] = True, + coords: Union[bool, str, Sequence[str]] = True, ) -> Union[pd.DataFrame, dict]: """Convert a dataset to a pandas DataFrame with support for multicolumns and multisheet. @@ -572,7 +572,7 @@ def to_table( Default is "variable", i.e. the name of the variable(s). sheet : str or sequence of str, optional Name of the dimension(s) to use as sheet names. - coords: bool or sequence of str + coords: bool or str or sequence of str A list of auxiliary coordinates to add to the columns (as would variables). If True, all (if any) are added. @@ -617,8 +617,8 @@ def _ensure_list(seq): f"Passed row, column and sheet do not match available dimensions. Got {needed_dims}, data has {da.dims}." ) - coords = coords or [] if coords is not True: + coords = _ensure_list(coords or []) drop = set(ds.coords.keys()) - set(da.dims) - set(coords) da = da.drop_vars(drop) else: