Commit abfddee

✨ add some plots and preprocessing to tutorial
Henry committed Feb 8, 2024
1 parent 18caa3a commit abfddee
Showing 2 changed files with 150 additions and 15 deletions.
107 changes: 97 additions & 10 deletions project/04_1_train_pimms_models.ipynb
@@ -121,39 +121,126 @@
},
{
"cell_type": "markdown",
"id": "693b1ee5",
"id": "c6c788f0",
"metadata": {},
"source": []
},
{
"cell_type": "markdown",
"id": "2ab8dc7f",
"metadata": {},
"source": [
"Transform to long-data format:"
"Transform the data using the logarithm, here using base 2:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "646ea5bb",
"metadata": {},
"id": "554d4fa7",
"metadata": {
"lines_to_next_cell": 2
},
"outputs": [],
"source": [
"df = df.stack().to_frame('intensity')\n",
"df = np.log2(df + 1)\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"id": "2ab8dc7f",
"id": "fbce73d1",
"metadata": {},
"source": [
"Transform the data using the logarithm, here using base 2:"
"two plots on data availability:\n",
"\n",
"1. proportion of missing values per feature median (N = protein groups)\n",
"2. CDF of available intensities per protein group"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "554d4fa7",
"id": "536793bb",
"metadata": {
"lines_to_next_cell": 2
},
"outputs": [],
"source": [
"ax = vaep.plotting.data.plot_feat_median_over_prop_missing(\n",
" data=df, type='boxplot')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "313bc55e",
"metadata": {
"lines_to_next_cell": 2
},
"outputs": [],
"source": [
"df.notna().sum().sort_values().plot()"
]
},
{
"cell_type": "markdown",
"id": "cf54a3af",
"metadata": {},
"source": [
"define a minimum feature and sample frequency for a feature to be included"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "28ad27de",
"metadata": {
"lines_to_next_cell": 2
},
"outputs": [],
"source": [
"SELECT_FEAT = True\n",
"\n",
"\n",
"def select_features(df, feat_prevalence=.2, axis=0):\n",
" N = df.shape[axis]\n",
" minimum_freq = N * feat_prevalence\n",
" freq = df.notna().sum(axis=axis)\n",
" mask = freq >= minimum_freq\n",
" print(f\"Drop {(~mask).sum()} along axis {axis}.\")\n",
" freq = freq.loc[mask]\n",
" if axis == 0:\n",
" df = df.loc[:, mask]\n",
" else:\n",
" df = df.loc[mask]\n",
" return df\n",
"\n",
"\n",
"if SELECT_FEAT:\n",
" # potentially this can take a few iterations to stabilize.\n",
" df = select_features(df, feat_prevalence=.2)\n",
" df = select_features(df=df, feat_prevalence=.3, axis=1)\n",
"df.shape"
]
},
{
"cell_type": "markdown",
"id": "693b1ee5",
"metadata": {},
"source": [
"Transform to long-data format:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "646ea5bb",
"metadata": {
"lines_to_next_cell": 2
},
"outputs": [],
"source": [
"df = np.log2(df)\n",
"df = df.stack().to_frame('intensity')\n",
"df.head()"
]
},
@@ -320,7 +407,7 @@
"df = pd.read_csv(fn_intensities, index_col=0)\n",
"df.index.name = 'Sample ID' # already set\n",
"df.columns.name = 'protein group' # not set due to csv disk file format\n",
"df = np.log2(df) # log transform\n",
"df = np.log2(df + 1) # log transform\n",
"df.head()"
]
},
58 changes: 53 additions & 5 deletions project/04_1_train_pimms_models.py
@@ -63,19 +63,67 @@
df.head()

# %% [markdown]
# Transform to long-data format:
#

# %% [markdown]
# Transform the data using the logarithm, here using base 2:

# %%
df = df.stack().to_frame('intensity')
df = np.log2(df + 1)
df.head()
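
A small aside (not part of the committed file): the `+ 1` offset, presumably added to keep any zero intensities finite under the base-2 logarithm, behaves as sketched below.

# Illustrative check only; `x` is a made-up toy array, not data from the tutorial.
import numpy as np

x = np.array([0.0, 1.0, 1024.0])
with np.errstate(divide="ignore"):   # np.log2(0) is -inf and would otherwise warn
    print(np.log2(x))                # [-inf   0.  10.]
print(np.log2(x + 1))                # [ 0.  1.  ~10.0014]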


# %% [markdown]
# Transform the data using the logarithm, here using base 2:
# Two plots on data availability:
#
# 1. proportion of missing values vs. feature median intensity (N = protein groups)
# 2. number of available intensities per protein group, sorted (an empirical CDF view)

# %%
ax = vaep.plotting.data.plot_feat_median_over_prop_missing(
data=df, type='boxplot')


# %%
df = np.log2(df)
df.notna().sum().sort_values().plot()


# %% [markdown]
# Define a minimum feature and sample prevalence required for a feature (or sample) to be kept:

# %%
SELECT_FEAT = True


def select_features(df, feat_prevalence=.2, axis=0):
N = df.shape[axis]
minimum_freq = N * feat_prevalence
freq = df.notna().sum(axis=axis)
mask = freq >= minimum_freq
print(f"Drop {(~mask).sum()} along axis {axis}.")
freq = freq.loc[mask]
if axis == 0:
df = df.loc[:, mask]
else:
df = df.loc[mask]
return df
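
A toy check (illustrative only, not from the repository) of the axis convention: with the default `axis=0`, prevalence is counted over samples and sparse feature columns are dropped; with `axis=1`, it is counted over features and sparse sample rows are dropped.

# Hypothetical toy frame: three features over four samples.
import numpy as np
import pandas as pd

toy = pd.DataFrame({
    "feat_a": [1.0, 2.0, 3.0, 4.0],           # observed in 4/4 samples
    "feat_b": [np.nan, np.nan, np.nan, 4.0],   # observed in 1/4 samples
    "feat_c": [1.0, np.nan, 3.0, 4.0],         # observed in 3/4 samples
})
print(select_features(toy, feat_prevalence=.5).columns.tolist())
# prints "Drop 1 along axis 0." and ['feat_a', 'feat_c']: feat_b falls below 50%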


if SELECT_FEAT:
# potentially this can take a few iterations to stabilize.
df = select_features(df, feat_prevalence=.2)
df = select_features(df=df, feat_prevalence=.3, axis=1)
df.shape
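
The comment above notes that the filtering may take a few iterations to stabilize, since dropping sparse samples changes feature prevalence and vice versa. A minimal sketch of such a loop, reusing the `select_features` helper from this diff (illustrative, not part of the commit):

if SELECT_FEAT:
    previous_shape = None
    while previous_shape != df.shape:          # stop once nothing more is dropped
        previous_shape = df.shape
        df = select_features(df, feat_prevalence=.2)               # drop sparse features
        df = select_features(df=df, feat_prevalence=.3, axis=1)    # drop sparse samples
df.shape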


# %% [markdown]
# Transform to long-data format:

# %%
df = df.stack().to_frame('intensity')
df.head()


# %% [markdown]
# The resulting DataFrame with one column has a `MultiIndex` with the sample and feature identifiers.
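
For reference, a tiny pandas illustration (made-up data, not from the tutorial) of what stacking a wide matrix with missing values produces: a one-column DataFrame indexed by a two-level `MultiIndex` of sample and feature, with NaN entries dropped by pandas' default stacking behaviour.

# Hypothetical 2x2 wide matrix with one missing intensity.
import numpy as np
import pandas as pd

wide = pd.DataFrame(
    {"prot_A": [20.1, np.nan], "prot_B": [18.3, 19.0]},
    index=pd.Index(["sample_1", "sample_2"], name="Sample ID"),
)
wide.columns.name = "protein group"
long = wide.stack().to_frame("intensity")
print(long.index.names)  # ['Sample ID', 'protein group']
print(long)              # 3 rows: the NaN cell is not kept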

@@ -161,7 +209,7 @@
df = pd.read_csv(fn_intensities, index_col=0)
df.index.name = 'Sample ID' # already set
df.columns.name = 'protein group' # not set due to csv disk file format
df = np.log2(df) # log transform
df = np.log2(df + 1) # log transform
df.head()

# %% [markdown]
