Skip to content

Commit

Permalink
🎨 add feat name to initial plots, write error information per median …
Browse files Browse the repository at this point in the history
…bin of features
  • Loading branch information
Henry committed Apr 10, 2024
1 parent 559afaf commit 51a6b63
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 122 deletions.
109 changes: 30 additions & 79 deletions project/01_2_performance_plots.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -231,14 +231,17 @@
},
"outputs": [],
"source": [
"fig, axes = plt.subplots(1, 2, sharey=True)\n",
"fig, axes = plt.subplots(1, 2, sharey=True, sharex=True)\n",
"\n",
"vaep.plotting.data.plot_observations(data.val_y.unstack(), ax=axes[0],\n",
" title='Validation split', size=1)\n",
" title='Validation split', size=1, xlabel='')\n",
"vaep.plotting.data.plot_observations(data.test_y.unstack(), ax=axes[1],\n",
" title='Test split', size=1)\n",
"\n",
" title='Test split', size=1, xlabel='')\n",
"fig.suptitle(\"Simulated missing values per sample\", size=8)\n",
"# hide axis and use only for common x label\n",
"fig.add_subplot(111, frameon=False)\n",
"plt.tick_params(labelcolor='none', which='both', top=False, bottom=False, left=False, right=False)\n",
"plt.xlabel(f'Samples ordered by identified {data.val_y.index.names[-1]}')\n",
"group = 1\n",
"fname = args.out_figures / f'2_{group}_fake_na_val_test_splits.png'\n",
"figures[fname.stem] = fname\n",
Expand Down Expand Up @@ -278,7 +281,9 @@
"outputs": [],
"source": [
"prop = freq_feat / len(data.train_X.index.levels[0])\n",
"prop.sort_values().to_frame().plot()"
"prop.sort_values().to_frame().plot(\n",
" xlabel=f'{data.val_y.index.names[-1]}',\n",
" ylabel='Proportion of identification in samples')"
]
},
{
Expand Down Expand Up @@ -551,6 +556,7 @@
"execution_count": null,
"id": "a2440887-b5f2-45a1-90cd-d15ef9bfa0a7",
"metadata": {
"lines_to_next_cell": 2,
"tags": []
},
"outputs": [],
Expand All @@ -561,39 +567,6 @@
"TOP_N_ORDER"
]
},
{
"cell_type": "markdown",
"id": "dd6818ca-460a-4f14-be85-c4309057e161",
"metadata": {},
"source": [
"### Correlation overall"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3aa7831e-ebf3-4de4-af6c-c4b2a8b00373",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"pred_val_corr = pred_val.corr()\n",
"ax = (pred_val_corr\n",
" .loc[TARGET_COL, ORDER_MODELS]\n",
" .plot\n",
" .bar(\n",
" # title='Correlation between Fake NA and model predictions on validation data',\n",
" ylabel='correlation overall'))\n",
"ax = vaep.plotting.add_height_to_barplot(ax)\n",
"ax.set_xticklabels(ax.get_xticklabels(), rotation=45,\n",
" horizontalalignment='right')\n",
"fname = args.out_figures / f'2_{group}_pred_corr_val_overall.pdf'\n",
"figures[fname.stem] = fname\n",
"vaep.savefig(ax.get_figure(), name=fname)\n",
"pred_val_corr"
]
},
{
"cell_type": "markdown",
"id": "0ac5f058-c580-4676-83c8-768bdb30f526",
Expand Down Expand Up @@ -925,7 +898,7 @@
" COLORS_TO_USE[:top_n],\n",
" axes):\n",
"\n",
" ax, _ = vaep.plotting.data.plot_histogram_intensities(\n",
" ax, bins = vaep.plotting.data.plot_histogram_intensities(\n",
" pred_test[TARGET_COL],\n",
" color='grey',\n",
" min_max=min_max,\n",
Expand All @@ -949,36 +922,24 @@
"vaep.savefig(fig, name=fname)"
]
},
{
"cell_type": "markdown",
"id": "116a7b7e",
"metadata": {},
"source": [
"### Correlation overall"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b42efaec-4556-45e9-a813-66da159e771c",
"metadata": {
"tags": []
},
"id": "843a917f",
"metadata": {},
"outputs": [],
"source": [
"pred_test_corr = pred_test.corr()\n",
"ax = pred_test_corr.loc[TARGET_COL, ORDER_MODELS].plot.bar(\n",
" # title='Corr. between Fake NA and model predictions on test data',\n",
" ylabel='correlation coefficient overall',\n",
" ylim=(0.7, 1)\n",
")\n",
"ax = vaep.plotting.add_height_to_barplot(ax)\n",
"ax.set_xticklabels(ax.get_xticklabels(), rotation=45,\n",
" horizontalalignment='right')\n",
"fname = args.out_figures / f'2_{group}_pred_corr_test_overall.pdf'\n",
"figures[fname.stem] = fname\n",
"vaep.savefig(ax.get_figure(), name=fname)\n",
"pred_test_corr"
"counts_per_bin = dict()\n",
"for col in [TARGET_COL, *ORDER_MODELS[:top_n]]:\n",
" _series = (pd.cut(pred_test[col], bins=bins)\n",
" .to_frame()\n",
" .groupby(col)\n",
" .size())\n",
" _series.index.name = 'bin'\n",
" counts_per_bin[col] = _series\n",
"counts_per_bin = pd.DataFrame(counts_per_bin)\n",
"counts_per_bin.to_excel(fname.with_suffix('.xlsx'))\n",
"counts_per_bin"
]
},
{
Expand Down Expand Up @@ -1180,7 +1141,6 @@
"execution_count": null,
"id": "9993d145-8b78-4769-838a-01721900a3c7",
"metadata": {
"lines_to_next_cell": 0,
"tags": []
},
"outputs": [],
Expand Down Expand Up @@ -1479,7 +1439,7 @@
"execution_count": null,
"id": "2a578570",
"metadata": {
"lines_to_next_cell": 0
"lines_to_next_cell": 2
},
"outputs": [],
"source": [
Expand All @@ -1506,30 +1466,21 @@
" fname = args.out_figures / f'2_{group}_test_errors_binned_by_feat_medians_sel.pdf'\n",
" figures[fname.stem] = fname\n",
" vaep.savefig(ax.get_figure(), name=fname)\n",
" plt.show(fig)\n",
"\n",
" dumps[fname.stem] = fname.with_suffix('.csv')\n",
" errors_binned.to_csv(fname.with_suffix('.csv'))\n",
" vaep.plotting.make_large_descriptors(6)\n",
" # ax.xaxis.set_tick_params(rotation=0) # horizontal\n",
" display(errors_binned)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8f9ea7f1",
"metadata": {
"lines_to_next_cell": 2
},
"outputs": [],
"source": [
"\n",
" # ! only used for reporting\n",
" plotted = vaep.plotting.errors.get_data_for_errors_by_median(\n",
" errors=errors_binned,\n",
" feat_name=FEAT_NAME_DISPLAY,\n",
" metric_name=METRIC\n",
" )\n",
" plotted.to_excel(fname.with_suffix('.xlsx'), index=False)\n",
" plotted"
" display(plotted)"
]
},
{
Expand Down
70 changes: 27 additions & 43 deletions project/01_2_performance_plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,14 +139,17 @@ def build_text(s):
args.data, file_format=args.file_format)

# %%
fig, axes = plt.subplots(1, 2, sharey=True)
fig, axes = plt.subplots(1, 2, sharey=True, sharex=True)

vaep.plotting.data.plot_observations(data.val_y.unstack(), ax=axes[0],
title='Validation split', size=1)
title='Validation split', size=1, xlabel='')
vaep.plotting.data.plot_observations(data.test_y.unstack(), ax=axes[1],
title='Test split', size=1)

title='Test split', size=1, xlabel='')
fig.suptitle("Simulated missing values per sample", size=8)
# hide axis and use only for common x label
fig.add_subplot(111, frameon=False)
plt.tick_params(labelcolor='none', which='both', top=False, bottom=False, left=False, right=False)
plt.xlabel(f'Samples ordered by identified {data.val_y.index.names[-1]}')
group = 1
fname = args.out_figures / f'2_{group}_fake_na_val_test_splits.png'
figures[fname.stem] = fname
Expand All @@ -163,7 +166,9 @@ def build_text(s):

# %%
prop = freq_feat / len(data.train_X.index.levels[0])
prop.sort_values().to_frame().plot()
prop.sort_values().to_frame().plot(
xlabel=f'{data.val_y.index.names[-1]}',
ylabel='Proportion of identification in samples')

# %% [markdown]
# View training data in wide format
Expand Down Expand Up @@ -287,24 +292,6 @@ def build_text(s):
color in zip(TOP_N_ORDER, COLORS_TO_USE)}
TOP_N_ORDER

# %% [markdown]
# ### Correlation overall

# %%
pred_val_corr = pred_val.corr()
ax = (pred_val_corr
.loc[TARGET_COL, ORDER_MODELS]
.plot
.bar(
# title='Correlation between Fake NA and model predictions on validation data',
ylabel='correlation overall'))
ax = vaep.plotting.add_height_to_barplot(ax)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45,
horizontalalignment='right')
fname = args.out_figures / f'2_{group}_pred_corr_val_overall.pdf'
figures[fname.stem] = fname
vaep.savefig(ax.get_figure(), name=fname)
pred_val_corr

# %% [markdown]
# ### Correlation per sample
Expand Down Expand Up @@ -466,7 +453,7 @@ def build_text(s):
COLORS_TO_USE[:top_n],
axes):

ax, _ = vaep.plotting.data.plot_histogram_intensities(
ax, bins = vaep.plotting.data.plot_histogram_intensities(
pred_test[TARGET_COL],
color='grey',
min_max=min_max,
Expand All @@ -489,23 +476,18 @@ def build_text(s):
figures[fname.stem] = fname
vaep.savefig(fig, name=fname)

# %% [markdown]
# ### Correlation overall

# %%
pred_test_corr = pred_test.corr()
ax = pred_test_corr.loc[TARGET_COL, ORDER_MODELS].plot.bar(
# title='Corr. between Fake NA and model predictions on test data',
ylabel='correlation coefficient overall',
ylim=(0.7, 1)
)
ax = vaep.plotting.add_height_to_barplot(ax)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45,
horizontalalignment='right')
fname = args.out_figures / f'2_{group}_pred_corr_test_overall.pdf'
figures[fname.stem] = fname
vaep.savefig(ax.get_figure(), name=fname)
pred_test_corr
# Tabulate, for the target column and the top-N models, how many test values
# fall into each interval spanned by the `bins` edges returned by the histogram
# plotting call of the preceding cell.
bin_counts = {}
for col in [TARGET_COL, *ORDER_MODELS[:top_n]]:
    # NOTE(review): pd.cut builds right-closed intervals by default -- confirm
    # this matches how the histogram assigns values lying exactly on a bin edge.
    _series = (pd.cut(pred_test[col], bins=bins)
               .to_frame()
               # observed=False keeps empty bins (count 0), so every column
               # shares the same bin index, and pins the behaviour against the
               # changing pandas default for categorical groupers.
               .groupby(col, observed=False)
               .size())
    _series.index.name = 'bin'
    bin_counts[col] = _series
# One row per bin, one column per target/model.
counts_per_bin = pd.DataFrame(bin_counts)
# `fname` is the figure path set in the preceding cell; the table is written
# under the same stem with an .xlsx suffix.
counts_per_bin.to_excel(fname.with_suffix('.xlsx'))
counts_per_bin

# %% [markdown]
# ### Correlation per sample
Expand Down Expand Up @@ -628,6 +610,7 @@ def highlight_min(s, color, tolerence=0.00001):
)
else:
print("None found")

# %% [markdown]
# ### Error plot

Expand Down Expand Up @@ -814,20 +797,21 @@ def highlight_min(s, color, tolerence=0.00001):
fname = args.out_figures / f'2_{group}_test_errors_binned_by_feat_medians_sel.pdf'
figures[fname.stem] = fname
vaep.savefig(ax.get_figure(), name=fname)
plt.show(fig)

dumps[fname.stem] = fname.with_suffix('.csv')
errors_binned.to_csv(fname.with_suffix('.csv'))
vaep.plotting.make_large_descriptors(6)
# ax.xaxis.set_tick_params(rotation=0) # horizontal
display(errors_binned)
# %%

# # ! only used for reporting
plotted = vaep.plotting.errors.get_data_for_errors_by_median(
errors=errors_binned,
feat_name=FEAT_NAME_DISPLAY,
metric_name=METRIC
)
plotted.to_excel(fname.with_suffix('.xlsx'), index=False)
plotted
display(plotted)


# %% [markdown]
Expand Down

0 comments on commit 51a6b63

Please sign in to comment.