Skip to content

Commit

Permalink
✨ write counts per bin for Fig. 4a, make general fct
Browse files Browse the repository at this point in the history
- generalized code used for Fig. 2g
  • Loading branch information
Henry committed Apr 10, 2024
1 parent 51a6b63 commit a677cce
Show file tree
Hide file tree
Showing 5 changed files with 88 additions and 19 deletions.
42 changes: 42 additions & 0 deletions project/01_0_split_data.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -1351,6 +1351,18 @@
"vaep.savefig(ax.get_figure(), fname)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ed941a81",
"metadata": {},
"outputs": [],
"source": [
"counts_per_bin = vaep.pandas.get_counts_per_bin(df=splits_df, bins=bins)\n",
"counts_per_bin.to_excel(fname.with_suffix('.xlsx'))\n",
"counts_per_bin"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand All @@ -1373,6 +1385,36 @@
"vaep.savefig(ax.get_figure(), fname)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "eede54fd",
"metadata": {},
"outputs": [],
"source": [
"# Save binned counts"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1e6e1f6b",
"metadata": {},
"outputs": [],
"source": [
"counts_per_bin = dict()\n",
"for col in splits_df.columns:\n",
" _series = (pd.cut(splits_df[col], bins=bins)\n",
" .to_frame()\n",
" .groupby(col)\n",
" .size())\n",
" _series.index.name = 'bin'\n",
" counts_per_bin[col] = _series\n",
"counts_per_bin = pd.DataFrame(counts_per_bin)\n",
"counts_per_bin.to_excel(fname.with_suffix('.xlsx'))\n",
"counts_per_bin"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down
21 changes: 21 additions & 0 deletions project/01_0_split_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -857,6 +857,11 @@ def join_as_str(seq):
figures[fname.name] = fname
vaep.savefig(ax.get_figure(), fname)

# %%
# Tabulate, for every column of splits_df, how many values fall into each bin
# and write the table to the path of `fname` with the suffix swapped to '.xlsx'.
counts_per_bin = vaep.pandas.get_counts_per_bin(df=splits_df, bins=bins)
counts_per_bin.to_excel(fname.with_suffix('.xlsx'))
counts_per_bin  # trailing expression of the cell (presumably shown as cell output)

# %%
ax = splits_df.drop('train', axis=1).plot.hist(bins=bins,
xticks=list(bins),
Expand All @@ -873,6 +878,22 @@ def join_as_str(seq):
figures[fname.name] = fname
vaep.savefig(ax.get_figure(), fname)

# %%
# Save binned counts

# %%
# Use the shared helper instead of repeating the binning loop inline; with
# `columns` left unset it counts every column of splits_df, exactly as the
# hand-written loop over `splits_df.columns` did.
counts_per_bin = vaep.pandas.get_counts_per_bin(df=splits_df, bins=bins)
# Store the table under the current `fname` with the suffix swapped to '.xlsx'.
counts_per_bin.to_excel(fname.with_suffix('.xlsx'))
counts_per_bin

# %% [markdown]
# plot training data missing plots

Expand Down
13 changes: 4 additions & 9 deletions project/01_2_performance_plots.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -929,15 +929,10 @@
"metadata": {},
"outputs": [],
"source": [
"counts_per_bin = dict()\n",
"for col in [TARGET_COL, *ORDER_MODELS[:top_n]]:\n",
" _series = (pd.cut(pred_test[col], bins=bins)\n",
" .to_frame()\n",
" .groupby(col)\n",
" .size())\n",
" _series.index.name = 'bin'\n",
" counts_per_bin[col] = _series\n",
"counts_per_bin = pd.DataFrame(counts_per_bin)\n",
"counts_per_bin = vaep.pandas.get_counts_per_bin(df=pred_test,\n",
" bins=bins,\n",
" columns=[TARGET_COL, *ORDER_MODELS[:top_n]])\n",
"\n",
"counts_per_bin.to_excel(fname.with_suffix('.xlsx'))\n",
"counts_per_bin"
]
Expand Down
13 changes: 4 additions & 9 deletions project/01_2_performance_plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -477,15 +477,10 @@ def build_text(s):
vaep.savefig(fig, name=fname)

# %%
counts_per_bin = dict()
for col in [TARGET_COL, *ORDER_MODELS[:top_n]]:
_series = (pd.cut(pred_test[col], bins=bins)
.to_frame()
.groupby(col)
.size())
_series.index.name = 'bin'
counts_per_bin[col] = _series
counts_per_bin = pd.DataFrame(counts_per_bin)
counts_per_bin = vaep.pandas.get_counts_per_bin(df=pred_test,
bins=bins,
columns=[TARGET_COL, *ORDER_MODELS[:top_n]])

counts_per_bin.to_excel(fname.with_suffix('.xlsx'))
counts_per_bin

Expand Down
18 changes: 17 additions & 1 deletion vaep/pandas/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import collections.abc
from collections import namedtuple
from types import SimpleNamespace
from typing import Iterable
from typing import Iterable, List, Optional

import numpy as np
import omegaconf
Expand Down Expand Up @@ -283,3 +283,19 @@ def get_lower_whiskers(df: pd.DataFrame, factor: float = 1.5) -> pd.Series:
iqr = ret.loc['75%'] - ret.loc['25%']
ret = ret.loc['25%'] - iqr * factor
return ret


def get_counts_per_bin(df: pd.DataFrame, bins: range, columns: Optional[List[str]] = None) -> pd.DataFrame:
    """Return counts per bin for selected columns in DataFrame.

    Parameters
    ----------
    df : pd.DataFrame
        Data holding the columns whose values are binned.
    bins : range
        Bin specification, handed unchanged to `pandas.cut`.
    columns : Optional[List[str]], optional
        Columns of `df` to count. If None, all columns of `df` are used.

    Returns
    -------
    pd.DataFrame
        One column per selected input column, indexed by the bin intervals
        (index name 'bin'). Each value is the number of entries of that
        column falling into the bin; bins without entries are kept with
        a count of zero.
    """
    counts_per_bin = dict()
    if columns is None:
        columns = df.columns.to_list()
    for col in columns:
        # `pd.cut` yields a categorical. Request observed=False explicitly so
        # that empty bins stay in the result: relying on the implicit default
        # raises a FutureWarning in recent pandas and would silently drop
        # empty bins once the default changes.
        _series = (pd.cut(df[col], bins=bins)
                   .to_frame()
                   .groupby(col, observed=False)
                   .size())
        _series.index.name = 'bin'
        counts_per_bin[col] = _series
    counts_per_bin = pd.DataFrame(counts_per_bin)
    return counts_per_bin

0 comments on commit a677cce

Please sign in to comment.