From a677cceb2adf000cd706e31d43c2b1999b8a3428 Mon Sep 17 00:00:00 2001 From: Henry Date: Wed, 10 Apr 2024 12:51:10 +0200 Subject: [PATCH] :sparkles: write counts per bin for Fig. 4a, make general fct - generalized code used for Fig. 2g --- project/01_0_split_data.ipynb | 42 ++++++++++++++++++++++++++++ project/01_0_split_data.py | 21 ++++++++++++++ project/01_2_performance_plots.ipynb | 13 +++------ project/01_2_performance_plots.py | 13 +++------ vaep/pandas/__init__.py | 18 +++++++++++- 5 files changed, 88 insertions(+), 19 deletions(-) diff --git a/project/01_0_split_data.ipynb b/project/01_0_split_data.ipynb index 29dcb55d7..35fde68b8 100644 --- a/project/01_0_split_data.ipynb +++ b/project/01_0_split_data.ipynb @@ -1351,6 +1351,18 @@ "vaep.savefig(ax.get_figure(), fname)" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "ed941a81", + "metadata": {}, + "outputs": [], + "source": [ + "counts_per_bin = vaep.pandas.get_counts_per_bin(df=splits_df, bins=bins)\n", + "counts_per_bin.to_excel(fname.with_suffix('.xlsx'))\n", + "counts_per_bin" + ] + }, { "cell_type": "code", "execution_count": null, @@ -1373,6 +1385,36 @@ "vaep.savefig(ax.get_figure(), fname)" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "eede54fd", + "metadata": {}, + "outputs": [], + "source": [ + "# Save binned counts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e6e1f6b", + "metadata": {}, + "outputs": [], + "source": [ + "counts_per_bin = dict()\n", + "for col in splits_df.columns:\n", + " _series = (pd.cut(splits_df[col], bins=bins)\n", + " .to_frame()\n", + " .groupby(col)\n", + " .size())\n", + " _series.index.name = 'bin'\n", + " counts_per_bin[col] = _series\n", + "counts_per_bin = pd.DataFrame(counts_per_bin)\n", + "counts_per_bin.to_excel(fname.with_suffix('.xlsx'))\n", + "counts_per_bin" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/project/01_0_split_data.py b/project/01_0_split_data.py index 
c01198a44..9b4061096 100644 --- a/project/01_0_split_data.py +++ b/project/01_0_split_data.py @@ -857,6 +857,11 @@ def join_as_str(seq): figures[fname.name] = fname vaep.savefig(ax.get_figure(), fname) +# %% +counts_per_bin = vaep.pandas.get_counts_per_bin(df=splits_df, bins=bins) +counts_per_bin.to_excel(fname.with_suffix('.xlsx')) +counts_per_bin + # %% ax = splits_df.drop('train', axis=1).plot.hist(bins=bins, xticks=list(bins), @@ -873,6 +878,22 @@ def join_as_str(seq): figures[fname.name] = fname vaep.savefig(ax.get_figure(), fname) +# %% +# Save binned counts + +# %% +counts_per_bin = dict() +for col in splits_df.columns: + _series = (pd.cut(splits_df[col], bins=bins) + .to_frame() + .groupby(col) + .size()) + _series.index.name = 'bin' + counts_per_bin[col] = _series +counts_per_bin = pd.DataFrame(counts_per_bin) +counts_per_bin.to_excel(fname.with_suffix('.xlsx')) +counts_per_bin + # %% [markdown] # plot training data missing plots diff --git a/project/01_2_performance_plots.ipynb b/project/01_2_performance_plots.ipynb index c7b03e4ac..263026456 100644 --- a/project/01_2_performance_plots.ipynb +++ b/project/01_2_performance_plots.ipynb @@ -929,15 +929,10 @@ "metadata": {}, "outputs": [], "source": [ - "counts_per_bin = dict()\n", - "for col in [TARGET_COL, *ORDER_MODELS[:top_n]]:\n", - " _series = (pd.cut(pred_test[col], bins=bins)\n", - " .to_frame()\n", - " .groupby(col)\n", - " .size())\n", - " _series.index.name = 'bin'\n", - " counts_per_bin[col] = _series\n", - "counts_per_bin = pd.DataFrame(counts_per_bin)\n", + "counts_per_bin = vaep.pandas.get_counts_per_bin(df=pred_test,\n", + " bins=bins,\n", + " columns=[TARGET_COL, *ORDER_MODELS[:top_n]])\n", + "\n", "counts_per_bin.to_excel(fname.with_suffix('.xlsx'))\n", "counts_per_bin" ] diff --git a/project/01_2_performance_plots.py b/project/01_2_performance_plots.py index 8beb9cd81..80f3f7c02 100644 --- a/project/01_2_performance_plots.py +++ b/project/01_2_performance_plots.py @@ -477,15 +477,10 @@ def 
build_text(s):
 vaep.savefig(fig, name=fname)
 
 # %%
-counts_per_bin = dict()
-for col in [TARGET_COL, *ORDER_MODELS[:top_n]]:
-    _series = (pd.cut(pred_test[col], bins=bins)
-               .to_frame()
-               .groupby(col)
-               .size())
-    _series.index.name = 'bin'
-    counts_per_bin[col] = _series
-counts_per_bin = pd.DataFrame(counts_per_bin)
+counts_per_bin = vaep.pandas.get_counts_per_bin(df=pred_test,
+                                                bins=bins,
+                                                columns=[TARGET_COL, *ORDER_MODELS[:top_n]])
+
 counts_per_bin.to_excel(fname.with_suffix('.xlsx'))
 counts_per_bin
 
diff --git a/vaep/pandas/__init__.py b/vaep/pandas/__init__.py
index e433e6763..ffaa60b17 100644
--- a/vaep/pandas/__init__.py
+++ b/vaep/pandas/__init__.py
@@ -1,7 +1,7 @@
 import collections.abc
 from collections import namedtuple
 from types import SimpleNamespace
-from typing import Iterable
+from typing import Iterable, List, Optional
 
 import numpy as np
 import omegaconf
@@ -283,3 +283,37 @@ def get_lower_whiskers(df: pd.DataFrame, factor: float = 1.5) -> pd.Series:
     iqr = ret.loc['75%'] - ret.loc['25%']
     ret = ret.loc['25%'] - iqr * factor
     return ret
+
+
+def get_counts_per_bin(df: pd.DataFrame, bins: range, columns: Optional[List[str]] = None) -> pd.DataFrame:
+    """Return counts per bin for selected columns in DataFrame.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Data whose columns are binned.
+    bins : range
+        Bin edges, passed on to `pandas.cut`.
+    columns : Optional[List[str]], optional
+        Columns to count, by default None, i.e. all columns of `df`.
+
+    Returns
+    -------
+    pd.DataFrame
+        Counts with one row per bin and one column per selected column.
+        Empty bins are kept with a count of zero.
+    """
+    counts_per_bin = dict()
+    if columns is None:
+        columns = df.columns.to_list()
+    for col in columns:
+        # observed=False keeps empty bins (count 0); stated explicitly as the
+        # default for categorical groupers changes in newer pandas versions
+        _series = (pd.cut(df[col], bins=bins)
+                   .to_frame()
+                   .groupby(col, observed=False)
+                   .size())
+        _series.index.name = 'bin'
+        counts_per_bin[col] = _series
+    counts_per_bin = pd.DataFrame(counts_per_bin)
+    return counts_per_bin