From 51a6b632e4505c907c03075dd4f25fe4cf14af2a Mon Sep 17 00:00:00 2001 From: Henry Date: Wed, 10 Apr 2024 11:26:20 +0200 Subject: [PATCH] :art: add feat name to initial plots, write error information per median bin of features --- project/01_2_performance_plots.ipynb | 109 ++++++++------------------- project/01_2_performance_plots.py | 70 +++++++---------- 2 files changed, 57 insertions(+), 122 deletions(-) diff --git a/project/01_2_performance_plots.ipynb b/project/01_2_performance_plots.ipynb index 6eb47aa89..c7b03e4ac 100644 --- a/project/01_2_performance_plots.ipynb +++ b/project/01_2_performance_plots.ipynb @@ -231,14 +231,17 @@ }, "outputs": [], "source": [ - "fig, axes = plt.subplots(1, 2, sharey=True)\n", + "fig, axes = plt.subplots(1, 2, sharey=True, sharex=True)\n", "\n", "vaep.plotting.data.plot_observations(data.val_y.unstack(), ax=axes[0],\n", - " title='Validation split', size=1)\n", + " title='Validation split', size=1, xlabel='')\n", "vaep.plotting.data.plot_observations(data.test_y.unstack(), ax=axes[1],\n", - " title='Test split', size=1)\n", - "\n", + " title='Test split', size=1, xlabel='')\n", "fig.suptitle(\"Simulated missing values per sample\", size=8)\n", + "# hide axis and use only for common x label\n", + "fig.add_subplot(111, frameon=False)\n", + "plt.tick_params(labelcolor='none', which='both', top=False, bottom=False, left=False, right=False)\n", + "plt.xlabel(f'Samples ordered by identified {data.val_y.index.names[-1]}')\n", "group = 1\n", "fname = args.out_figures / f'2_{group}_fake_na_val_test_splits.png'\n", "figures[fname.stem] = fname\n", @@ -278,7 +281,9 @@ "outputs": [], "source": [ "prop = freq_feat / len(data.train_X.index.levels[0])\n", - "prop.sort_values().to_frame().plot()" + "prop.sort_values().to_frame().plot(\n", + " xlabel=f'{data.val_y.index.names[-1]}',\n", + " ylabel='Proportion of identification in samples')" ] }, { @@ -551,6 +556,7 @@ "execution_count": null, "id": "a2440887-b5f2-45a1-90cd-d15ef9bfa0a7", "metadata": { + "lines_to_next_cell": 2, "tags": [] }, "outputs": [], @@ -561,39 +567,6 @@ "TOP_N_ORDER" ] }, - { - "cell_type": "markdown", - "id": "dd6818ca-460a-4f14-be85-c4309057e161", - "metadata": {}, - "source": [ - "### Correlation overall" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3aa7831e-ebf3-4de4-af6c-c4b2a8b00373", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "pred_val_corr = pred_val.corr()\n", - "ax = (pred_val_corr\n", - " .loc[TARGET_COL, ORDER_MODELS]\n", - " .plot\n", - " .bar(\n", - " # title='Correlation between Fake NA and model predictions on validation data',\n", - " ylabel='correlation overall'))\n", - "ax = vaep.plotting.add_height_to_barplot(ax)\n", - "ax.set_xticklabels(ax.get_xticklabels(), rotation=45,\n", - " horizontalalignment='right')\n", - "fname = args.out_figures / f'2_{group}_pred_corr_val_overall.pdf'\n", - "figures[fname.stem] = fname\n", - "vaep.savefig(ax.get_figure(), name=fname)\n", - "pred_val_corr" - ] - }, { "cell_type": "markdown", "id": "0ac5f058-c580-4676-83c8-768bdb30f526", @@ -925,7 +898,7 @@ " COLORS_TO_USE[:top_n],\n", " axes):\n", "\n", - " ax, _ = vaep.plotting.data.plot_histogram_intensities(\n", + " ax, bins = vaep.plotting.data.plot_histogram_intensities(\n", " pred_test[TARGET_COL],\n", " color='grey',\n", " min_max=min_max,\n", @@ -949,36 +922,24 @@ "vaep.savefig(fig, name=fname)" ] }, - { - "cell_type": "markdown", - "id": "116a7b7e", - "metadata": {}, - "source": [ - "### Correlation overall" - ] - }, { "cell_type": "code", "execution_count": null, - "id": "b42efaec-4556-45e9-a813-66da159e771c", - "metadata": { - "tags": [] - }, + "id": "843a917f", + "metadata": {}, "outputs": [], "source": [ - "pred_test_corr = pred_test.corr()\n", - "ax = pred_test_corr.loc[TARGET_COL, ORDER_MODELS].plot.bar(\n", - " # title='Corr. between Fake NA and model predictions on test data',\n", - " ylabel='correlation coefficient overall',\n", - " ylim=(0.7, 1)\n", - ")\n", - "ax = vaep.plotting.add_height_to_barplot(ax)\n", - "ax.set_xticklabels(ax.get_xticklabels(), rotation=45,\n", - " horizontalalignment='right')\n", - "fname = args.out_figures / f'2_{group}_pred_corr_test_overall.pdf'\n", - "figures[fname.stem] = fname\n", - "vaep.savefig(ax.get_figure(), name=fname)\n", - "pred_test_corr" + "counts_per_bin = dict()\n", + "for col in [TARGET_COL, *ORDER_MODELS[:top_n]]:\n", + " _series = (pd.cut(pred_test[col], bins=bins)\n", + " .to_frame()\n", + " .groupby(col)\n", + " .size())\n", + " _series.index.name = 'bin'\n", + " counts_per_bin[col] = _series\n", + "counts_per_bin = pd.DataFrame(counts_per_bin)\n", + "counts_per_bin.to_excel(fname.with_suffix('.xlsx'))\n", + "counts_per_bin" ] }, { @@ -1180,7 +1141,6 @@ "execution_count": null, "id": "9993d145-8b78-4769-838a-01721900a3c7", "metadata": { - "lines_to_next_cell": 0, "tags": [] }, "outputs": [], @@ -1479,7 +1439,7 @@ "execution_count": null, "id": "2a578570", "metadata": { - "lines_to_next_cell": 0 + "lines_to_next_cell": 2 }, "outputs": [], "source": [ @@ -1506,22 +1466,13 @@ " fname = args.out_figures / f'2_{group}_test_errors_binned_by_feat_medians_sel.pdf'\n", " figures[fname.stem] = fname\n", " vaep.savefig(ax.get_figure(), name=fname)\n", + " plt.show(fig)\n", + "\n", " dumps[fname.stem] = fname.with_suffix('.csv')\n", " errors_binned.to_csv(fname.with_suffix('.csv'))\n", " vaep.plotting.make_large_descriptors(6)\n", " # ax.xaxis.set_tick_params(rotation=0) # horizontal\n", - " display(errors_binned)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8f9ea7f1", - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ + "\n", " # ! only used for reporting\n", " plotted = vaep.plotting.errors.get_data_for_errors_by_median(\n", " errors=errors_binned,\n", @@ -1529,7 +1480,7 @@ " metric_name=METRIC\n", " )\n", " plotted.to_excel(fname.with_suffix('.xlsx'), index=False)\n", - " plotted" + " display(plotted)" ] }, { diff --git a/project/01_2_performance_plots.py b/project/01_2_performance_plots.py index c6b351569..8beb9cd81 100644 --- a/project/01_2_performance_plots.py +++ b/project/01_2_performance_plots.py @@ -139,14 +139,17 @@ def build_text(s): args.data, file_format=args.file_format) # %% -fig, axes = plt.subplots(1, 2, sharey=True) +fig, axes = plt.subplots(1, 2, sharey=True, sharex=True) vaep.plotting.data.plot_observations(data.val_y.unstack(), ax=axes[0], - title='Validation split', size=1) + title='Validation split', size=1, xlabel='') vaep.plotting.data.plot_observations(data.test_y.unstack(), ax=axes[1], - title='Test split', size=1) - + title='Test split', size=1, xlabel='') fig.suptitle("Simulated missing values per sample", size=8) +# hide axis and use only for common x label +fig.add_subplot(111, frameon=False) +plt.tick_params(labelcolor='none', which='both', top=False, bottom=False, left=False, right=False) +plt.xlabel(f'Samples ordered by identified {data.val_y.index.names[-1]}') group = 1 fname = args.out_figures / f'2_{group}_fake_na_val_test_splits.png' figures[fname.stem] = fname @@ -163,7 +166,9 @@ def build_text(s): # %% prop = freq_feat / len(data.train_X.index.levels[0]) -prop.sort_values().to_frame().plot() +prop.sort_values().to_frame().plot( + xlabel=f'{data.val_y.index.names[-1]}', + ylabel='Proportion of identification in samples') # %% [markdown] # View training data in wide format @@ -287,24 +292,6 @@ def build_text(s): color in zip(TOP_N_ORDER, COLORS_TO_USE)} TOP_N_ORDER -# %% [markdown] -# ### Correlation overall - -# %% -pred_val_corr = pred_val.corr() -ax = (pred_val_corr - .loc[TARGET_COL, ORDER_MODELS] - .plot - .bar( - # title='Correlation between Fake NA and model predictions on validation data', - ylabel='correlation overall')) -ax = vaep.plotting.add_height_to_barplot(ax) -ax.set_xticklabels(ax.get_xticklabels(), rotation=45, - horizontalalignment='right') -fname = args.out_figures / f'2_{group}_pred_corr_val_overall.pdf' -figures[fname.stem] = fname -vaep.savefig(ax.get_figure(), name=fname) -pred_val_corr # %% [markdown] # ### Correlation per sample @@ -466,7 +453,7 @@ def build_text(s): COLORS_TO_USE[:top_n], axes): - ax, _ = vaep.plotting.data.plot_histogram_intensities( + ax, bins = vaep.plotting.data.plot_histogram_intensities( pred_test[TARGET_COL], color='grey', min_max=min_max, @@ -489,23 +476,18 @@ def build_text(s): figures[fname.stem] = fname vaep.savefig(fig, name=fname) -# %% [markdown] -# ### Correlation overall - # %% -pred_test_corr = pred_test.corr() -ax = pred_test_corr.loc[TARGET_COL, ORDER_MODELS].plot.bar( - # title='Corr. between Fake NA and model predictions on test data', - ylabel='correlation coefficient overall', - ylim=(0.7, 1) -) -ax = vaep.plotting.add_height_to_barplot(ax) -ax.set_xticklabels(ax.get_xticklabels(), rotation=45, - horizontalalignment='right') -fname = args.out_figures / f'2_{group}_pred_corr_test_overall.pdf' -figures[fname.stem] = fname -vaep.savefig(ax.get_figure(), name=fname) -pred_test_corr +counts_per_bin = dict() +for col in [TARGET_COL, *ORDER_MODELS[:top_n]]: + _series = (pd.cut(pred_test[col], bins=bins) + .to_frame() + .groupby(col) + .size()) + _series.index.name = 'bin' + counts_per_bin[col] = _series +counts_per_bin = pd.DataFrame(counts_per_bin) +counts_per_bin.to_excel(fname.with_suffix('.xlsx')) +counts_per_bin # %% [markdown] # ### Correlation per sample @@ -628,6 +610,7 @@ def highlight_min(s, color, tolerence=0.00001): ) else: print("None found") + # %% [markdown] # ### Error plot @@ -814,12 +797,13 @@ def highlight_min(s, color, tolerence=0.00001): fname = args.out_figures / f'2_{group}_test_errors_binned_by_feat_medians_sel.pdf' figures[fname.stem] = fname vaep.savefig(ax.get_figure(), name=fname) + plt.show(fig) + dumps[fname.stem] = fname.with_suffix('.csv') errors_binned.to_csv(fname.with_suffix('.csv')) vaep.plotting.make_large_descriptors(6) # ax.xaxis.set_tick_params(rotation=0) # horizontal - display(errors_binned) - # %% + # # ! only used for reporting plotted = vaep.plotting.errors.get_data_for_errors_by_median( errors=errors_binned, @@ -827,7 +811,7 @@ def highlight_min(s, color, tolerence=0.00001): metric_name=METRIC ) plotted.to_excel(fname.with_suffix('.xlsx'), index=False) - plotted + display(plotted) # %% [markdown]