From 51a6b632e4505c907c03075dd4f25fe4cf14af2a Mon Sep 17 00:00:00 2001
From: Henry <henry.webel@cpr.ku.dk>
Date: Wed, 10 Apr 2024 11:26:20 +0200
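Subject: [PATCH] :art: add feat name to initial plots, write error information
 per median bin of features

- share the x-axis of the validation/test split plots and add one common
  x-label that names the feature level
- label both axes of the feature-proportion plot
- drop the "Correlation overall" bar plots for validation and test data
- write counts per histogram bin (target column and top-N models) to .xlsx
- merge the reporting-only cell into the binned-error cell and display the
  reported table instead of the raw binned errors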
---
 project/01_2_performance_plots.ipynb | 109 ++++++++-------------------
 project/01_2_performance_plots.py    |  70 +++++++----------
 2 files changed, 57 insertions(+), 122 deletions(-)

diff --git a/project/01_2_performance_plots.ipynb b/project/01_2_performance_plots.ipynb
index 6eb47aa89..c7b03e4ac 100644
--- a/project/01_2_performance_plots.ipynb
+++ b/project/01_2_performance_plots.ipynb
@@ -231,14 +231,17 @@
    },
    "outputs": [],
    "source": [
-    "fig, axes = plt.subplots(1, 2, sharey=True)\n",
+    "fig, axes = plt.subplots(1, 2, sharey=True, sharex=True)\n",
     "\n",
     "vaep.plotting.data.plot_observations(data.val_y.unstack(), ax=axes[0],\n",
-    "                                     title='Validation split', size=1)\n",
+    "                                     title='Validation split', size=1, xlabel='')\n",
     "vaep.plotting.data.plot_observations(data.test_y.unstack(), ax=axes[1],\n",
-    "                                     title='Test split', size=1)\n",
-    "\n",
+    "                                     title='Test split', size=1, xlabel='')\n",
     "fig.suptitle(\"Simulated missing values per sample\", size=8)\n",
+    "# hide axis and use only for common x label\n",
+    "fig.add_subplot(111, frameon=False)\n",
+    "plt.tick_params(labelcolor='none', which='both', top=False, bottom=False, left=False, right=False)\n",
+    "plt.xlabel(f'Samples ordered by identified {data.val_y.index.names[-1]}')\n",
     "group = 1\n",
     "fname = args.out_figures / f'2_{group}_fake_na_val_test_splits.png'\n",
     "figures[fname.stem] = fname\n",
@@ -278,7 +281,9 @@
    "outputs": [],
    "source": [
     "prop = freq_feat / len(data.train_X.index.levels[0])\n",
-    "prop.sort_values().to_frame().plot()"
+    "prop.sort_values().to_frame().plot(\n",
+    "    xlabel=f'{data.val_y.index.names[-1]}',\n",
+    "    ylabel='Proportion of identification in samples')"
    ]
   },
   {
@@ -551,6 +556,7 @@
    "execution_count": null,
    "id": "a2440887-b5f2-45a1-90cd-d15ef9bfa0a7",
    "metadata": {
+    "lines_to_next_cell": 2,
     "tags": []
    },
    "outputs": [],
@@ -561,39 +567,6 @@
     "TOP_N_ORDER"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "id": "dd6818ca-460a-4f14-be85-c4309057e161",
-   "metadata": {},
-   "source": [
-    "### Correlation overall"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "3aa7831e-ebf3-4de4-af6c-c4b2a8b00373",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "pred_val_corr = pred_val.corr()\n",
-    "ax = (pred_val_corr\n",
-    "      .loc[TARGET_COL, ORDER_MODELS]\n",
-    "      .plot\n",
-    "      .bar(\n",
-    "          # title='Correlation between Fake NA and model predictions on validation data',\n",
-    "          ylabel='correlation overall'))\n",
-    "ax = vaep.plotting.add_height_to_barplot(ax)\n",
-    "ax.set_xticklabels(ax.get_xticklabels(), rotation=45,\n",
-    "                   horizontalalignment='right')\n",
-    "fname = args.out_figures / f'2_{group}_pred_corr_val_overall.pdf'\n",
-    "figures[fname.stem] = fname\n",
-    "vaep.savefig(ax.get_figure(), name=fname)\n",
-    "pred_val_corr"
-   ]
-  },
   {
    "cell_type": "markdown",
    "id": "0ac5f058-c580-4676-83c8-768bdb30f526",
@@ -925,7 +898,7 @@
     "        COLORS_TO_USE[:top_n],\n",
     "        axes):\n",
     "\n",
-    "    ax, _ = vaep.plotting.data.plot_histogram_intensities(\n",
+    "    ax, bins = vaep.plotting.data.plot_histogram_intensities(\n",
     "        pred_test[TARGET_COL],\n",
     "        color='grey',\n",
     "        min_max=min_max,\n",
@@ -949,36 +922,24 @@
     "vaep.savefig(fig, name=fname)"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "id": "116a7b7e",
-   "metadata": {},
-   "source": [
-    "### Correlation overall"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "b42efaec-4556-45e9-a813-66da159e771c",
-   "metadata": {
-    "tags": []
-   },
+   "id": "843a917f",
+   "metadata": {},
    "outputs": [],
    "source": [
-    "pred_test_corr = pred_test.corr()\n",
-    "ax = pred_test_corr.loc[TARGET_COL, ORDER_MODELS].plot.bar(\n",
-    "    # title='Corr. between Fake NA and model predictions on test data',\n",
-    "    ylabel='correlation coefficient overall',\n",
-    "    ylim=(0.7, 1)\n",
-    ")\n",
-    "ax = vaep.plotting.add_height_to_barplot(ax)\n",
-    "ax.set_xticklabels(ax.get_xticklabels(), rotation=45,\n",
-    "                   horizontalalignment='right')\n",
-    "fname = args.out_figures / f'2_{group}_pred_corr_test_overall.pdf'\n",
-    "figures[fname.stem] = fname\n",
-    "vaep.savefig(ax.get_figure(), name=fname)\n",
-    "pred_test_corr"
+    "counts_per_bin = dict()\n",
+    "for col in [TARGET_COL, *ORDER_MODELS[:top_n]]:\n",
+    "    _series = (pd.cut(pred_test[col], bins=bins)\n",
+    "               .to_frame()\n",
+    "               .groupby(col)\n",
+    "               .size())\n",
+    "    _series.index.name = 'bin'\n",
+    "    counts_per_bin[col] = _series\n",
+    "counts_per_bin = pd.DataFrame(counts_per_bin)\n",
+    "counts_per_bin.to_excel(fname.with_suffix('.xlsx'))\n",
+    "counts_per_bin"
    ]
   },
   {
@@ -1180,7 +1141,6 @@
    "execution_count": null,
    "id": "9993d145-8b78-4769-838a-01721900a3c7",
    "metadata": {
-    "lines_to_next_cell": 0,
     "tags": []
    },
    "outputs": [],
@@ -1479,7 +1439,7 @@
    "execution_count": null,
    "id": "2a578570",
    "metadata": {
-    "lines_to_next_cell": 0
+    "lines_to_next_cell": 2
    },
    "outputs": [],
    "source": [
@@ -1506,22 +1466,13 @@
     "    fname = args.out_figures / f'2_{group}_test_errors_binned_by_feat_medians_sel.pdf'\n",
     "    figures[fname.stem] = fname\n",
     "    vaep.savefig(ax.get_figure(), name=fname)\n",
+    "    plt.show()  # pyplot.show takes no figure argument; it shows all open figures\n",
+    "\n",
     "    dumps[fname.stem] = fname.with_suffix('.csv')\n",
     "    errors_binned.to_csv(fname.with_suffix('.csv'))\n",
     "    vaep.plotting.make_large_descriptors(6)\n",
     "    # ax.xaxis.set_tick_params(rotation=0) # horizontal\n",
-    "    display(errors_binned)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "8f9ea7f1",
-   "metadata": {
-    "lines_to_next_cell": 2
-   },
-   "outputs": [],
-   "source": [
+    "\n",
     "    # ! only used for reporting\n",
     "    plotted = vaep.plotting.errors.get_data_for_errors_by_median(\n",
     "        errors=errors_binned,\n",
@@ -1529,7 +1480,7 @@
     "        metric_name=METRIC\n",
     "    )\n",
     "    plotted.to_excel(fname.with_suffix('.xlsx'), index=False)\n",
-    "    plotted"
+    "    display(plotted)"
    ]
   },
   {
diff --git a/project/01_2_performance_plots.py b/project/01_2_performance_plots.py
index c6b351569..8beb9cd81 100644
--- a/project/01_2_performance_plots.py
+++ b/project/01_2_performance_plots.py
@@ -139,14 +139,17 @@ def build_text(s):
     args.data, file_format=args.file_format)
 
 # %%
-fig, axes = plt.subplots(1, 2, sharey=True)
+fig, axes = plt.subplots(1, 2, sharey=True, sharex=True)
 
 vaep.plotting.data.plot_observations(data.val_y.unstack(), ax=axes[0],
-                                     title='Validation split', size=1)
+                                     title='Validation split', size=1, xlabel='')
 vaep.plotting.data.plot_observations(data.test_y.unstack(), ax=axes[1],
-                                     title='Test split', size=1)
-
+                                     title='Test split', size=1, xlabel='')
 fig.suptitle("Simulated missing values per sample", size=8)
+# hide axis and use only for common x label
+fig.add_subplot(111, frameon=False)
+plt.tick_params(labelcolor='none', which='both', top=False, bottom=False, left=False, right=False)
+plt.xlabel(f'Samples ordered by identified {data.val_y.index.names[-1]}')
 group = 1
 fname = args.out_figures / f'2_{group}_fake_na_val_test_splits.png'
 figures[fname.stem] = fname
@@ -163,7 +166,9 @@ def build_text(s):
 
 # %%
 prop = freq_feat / len(data.train_X.index.levels[0])
-prop.sort_values().to_frame().plot()
+prop.sort_values().to_frame().plot(
+    xlabel=f'{data.val_y.index.names[-1]}',
+    ylabel='Proportion of identification in samples')
 
 # %% [markdown]
 # View training data in wide format
@@ -287,24 +292,6 @@ def build_text(s):
                        color in zip(TOP_N_ORDER, COLORS_TO_USE)}
 TOP_N_ORDER
 
-# %% [markdown]
-# ### Correlation overall
-
-# %%
-pred_val_corr = pred_val.corr()
-ax = (pred_val_corr
-      .loc[TARGET_COL, ORDER_MODELS]
-      .plot
-      .bar(
-          # title='Correlation between Fake NA and model predictions on validation data',
-          ylabel='correlation overall'))
-ax = vaep.plotting.add_height_to_barplot(ax)
-ax.set_xticklabels(ax.get_xticklabels(), rotation=45,
-                   horizontalalignment='right')
-fname = args.out_figures / f'2_{group}_pred_corr_val_overall.pdf'
-figures[fname.stem] = fname
-vaep.savefig(ax.get_figure(), name=fname)
-pred_val_corr
 
 # %% [markdown]
 # ### Correlation per sample
@@ -466,7 +453,7 @@ def build_text(s):
         COLORS_TO_USE[:top_n],
         axes):
 
-    ax, _ = vaep.plotting.data.plot_histogram_intensities(
+    ax, bins = vaep.plotting.data.plot_histogram_intensities(
         pred_test[TARGET_COL],
         color='grey',
         min_max=min_max,
@@ -489,23 +476,18 @@ def build_text(s):
 figures[fname.stem] = fname
 vaep.savefig(fig, name=fname)
 
-# %% [markdown]
-# ### Correlation overall
-
 # %%
-pred_test_corr = pred_test.corr()
-ax = pred_test_corr.loc[TARGET_COL, ORDER_MODELS].plot.bar(
-    # title='Corr. between Fake NA and model predictions on test data',
-    ylabel='correlation coefficient overall',
-    ylim=(0.7, 1)
-)
-ax = vaep.plotting.add_height_to_barplot(ax)
-ax.set_xticklabels(ax.get_xticklabels(), rotation=45,
-                   horizontalalignment='right')
-fname = args.out_figures / f'2_{group}_pred_corr_test_overall.pdf'
-figures[fname.stem] = fname
-vaep.savefig(ax.get_figure(), name=fname)
-pred_test_corr
+counts_per_bin = dict()
+for col in [TARGET_COL, *ORDER_MODELS[:top_n]]:
+    _series = (pd.cut(pred_test[col], bins=bins)
+               .to_frame()
+               .groupby(col)
+               .size())
+    _series.index.name = 'bin'
+    counts_per_bin[col] = _series
+counts_per_bin = pd.DataFrame(counts_per_bin)
+counts_per_bin.to_excel(fname.with_suffix('.xlsx'))
+counts_per_bin
 
 # %% [markdown]
 # ### Correlation per sample
@@ -628,6 +610,7 @@ def highlight_min(s, color, tolerence=0.00001):
             )
 else:
     print("None found")
+
 # %% [markdown]
 # ### Error plot
 
@@ -814,12 +797,13 @@ def highlight_min(s, color, tolerence=0.00001):
     fname = args.out_figures / f'2_{group}_test_errors_binned_by_feat_medians_sel.pdf'
     figures[fname.stem] = fname
     vaep.savefig(ax.get_figure(), name=fname)
+    plt.show()  # pyplot.show takes no figure argument; it shows all open figures
+
     dumps[fname.stem] = fname.with_suffix('.csv')
     errors_binned.to_csv(fname.with_suffix('.csv'))
     vaep.plotting.make_large_descriptors(6)
     # ax.xaxis.set_tick_params(rotation=0) # horizontal
-    display(errors_binned)
-    # %%
+
     # # ! only used for reporting
     plotted = vaep.plotting.errors.get_data_for_errors_by_median(
         errors=errors_binned,
@@ -827,7 +811,7 @@ def highlight_min(s, color, tolerence=0.00001):
         metric_name=METRIC
     )
     plotted.to_excel(fname.with_suffix('.xlsx'), index=False)
-    plotted
+    display(plotted)
 
 
 # %% [markdown]