
Merge pull request #55 from RasmussenLab/further_R_methods
Methods:

- Added GSimp.
- Reduced the dimensionality of the example data in the GitHub Action so that
  GSimp finishes (~1 h); the method does not scale to larger data.
- Added the MNAR algorithm of MSIMPUTE.

Data:

- Ensure that the training data has at least 4 samples per feature (MSIMPUTE includes that check).
- Formatted and updated workflow configs and declarations (v1 & v2). Added a script for command creation.
Henry Webel authored Nov 26, 2023
2 parents 089cc8e + 89046b4 commit 29a549a
Showing 28 changed files with 1,804 additions and 525 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yaml
@@ -65,7 +65,7 @@ jobs:
run: |
cd project
snakemake -p -c1 --configfile config/single_dev_dataset/example/config.yaml -n
snakemake -p -c2 -k --configfile config/single_dev_dataset/example/config.yaml
snakemake -p -c1 -k --configfile config/single_dev_dataset/example/config.yaml
- name: Archive results
uses: actions/upload-artifact@v3
with:
58 changes: 43 additions & 15 deletions project/01_0_split_data.ipynb
@@ -28,7 +28,7 @@
"\n",
"import vaep\n",
"from vaep.io.datasplits import DataSplits\n",
"from vaep.sampling import feature_frequency, sample_data\n",
"from vaep.sampling import feature_frequency\n",
"\n",
"from vaep.analyzers import analyzers\n",
"from vaep.analyzers.analyzers import AnalyzePeptides\n",
@@ -245,7 +245,8 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"In case there are multiple features for each intensity values (currenlty: peptide sequence and charge), combine the column names to a single str index.\n",
"In case there are multiple features for each intensity values (currenlty: peptide sequence and charge),\n",
"combine the column names to a single str index.\n",
"\n",
"> The Collaborative Modeling approach will need a single feature column."
]
@@ -290,7 +291,7 @@
"if params.fn_rawfile_metadata:\n",
" df_meta = pd.read_csv(params.fn_rawfile_metadata, index_col=0)\n",
"else:\n",
" logger.warning(f\"No metadata for samples provided, create placeholder.\")\n",
" logger.warning(\"No metadata for samples provided, create placeholder.\")\n",
" if params.meta_date_col:\n",
" raise ValueError(\n",
" f\"No metadata provided, but data column set: {params.meta_date_col}\")\n",
@@ -346,7 +347,8 @@
"if params.min_RT_time:\n",
" logger.info(\n",
" \"Metadata should have 'MS max RT' entry from ThermoRawFileParser\")\n",
" msg = f\"Minimum RT time maxiumum is set to {params.min_RT_time} minutes (to exclude too short runs, which are potentially fractions).\"\n",
" msg = (f\"Minimum RT time maxiumum is set to {params.min_RT_time} minutes\"\n",
" \" (to exclude too short runs, which are potentially fractions).\")\n",
" # can be integrated into query string\n",
" mask_RT = df_meta['MS max RT'] >= params.min_RT_time\n",
" msg += f\" Total number of samples retained: {int(mask_RT.sum())}\"\n",
@@ -598,7 +600,7 @@
"outputs": [],
"source": [
"ax = df.notna().sum(axis=0).sort_values().plot()\n",
"_new_labels = [l.get_text().split(';')[0] for l in ax.get_xticklabels()]\n",
"_new_labels = [l_.get_text().split(';')[0] for l_ in ax.get_xticklabels()]\n",
"_ = ax.set_xticklabels(_new_labels, rotation=45,\n",
" horizontalalignment='right')\n",
"ax.set_xlabel('feature prevalence')\n",
@@ -1000,13 +1002,6 @@
"outputs": [],
"source": [
"group = 2\n",
"# if not mnar:\n",
"# fake_na, splits.train_X = sample_data(df_long.squeeze(),\n",
"# sample_index_to_drop=0,\n",
"# weights=freq_per_feature,\n",
"# frac=0.1,\n",
"# random_state=params.random_state,)\n",
"# assert len(splits.train_X) > len(fake_na)\n",
"! move parameter checks to start of script\n",
"if 0.0 <= params.frac_mnar <= 1.0:\n",
" fig, axes = plt.subplots(1, 2, figsize=(8, 2))\n",
@@ -1146,9 +1141,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"lines_to_next_cell": 2
},
"metadata": {},
"outputs": [],
"source": [
"diff = (splits\n",
@@ -1168,6 +1161,41 @@
"diff"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Some tools require at least 4 observation in the training data,\n",
"which is a good requirment. Due to \"MNAR\" sampling, most measurments\n",
"of a features can end up in the validation or test data.\n",
"\n",
"In that case: Move the validation measurments back to the training data.\n",
"If after this procedure the condition is still not met, a value error is raised."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"mask_min_4_measurments = splits.train_X.groupby(level=1).count() < 4\n",
"if mask_min_4_measurments.any():\n",
" idx = mask_min_4_measurments.loc[mask_min_4_measurments].index\n",
" logger.warning(f\"Features with less than 4 measurments in training data: {idx.to_list()}\")\n",
" to_remove = splits.val_y.loc[pd.IndexSlice[:, idx]]\n",
" print(\"To remove from validation data: \")\n",
" display(to_remove)\n",
" splits.train_X = pd.concat([splits.train_X, to_remove])\n",
" splits.val_y = splits.val_y.drop(to_remove.index)\n",
" # check condition again\n",
" mask_min_4_measurments = splits.train_X.groupby(level=1).count() < 4\n",
" if mask_min_4_measurments.any():\n",
" idx = mask_min_4_measurments.loc[mask_min_4_measurments].index\n",
" raise ValueError(\"Some features still have less than 4 measurments in training data\"\n",
" f\" after removing the features from the validation data: {idx.to_list()}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
43 changes: 31 additions & 12 deletions project/01_0_split_data.py
@@ -32,7 +32,7 @@

import vaep
from vaep.io.datasplits import DataSplits
from vaep.sampling import feature_frequency, sample_data
from vaep.sampling import feature_frequency

from vaep.analyzers import analyzers
from vaep.analyzers.analyzers import AnalyzePeptides
@@ -174,7 +174,8 @@ def add_meta_data(df: pd.DataFrame, df_meta: pd.DataFrame):


# %% [markdown]
# In case there are multiple features for each intensity values (currenlty: peptide sequence and charge), combine the column names to a single str index.
# In case there are multiple features for each intensity value (currently: peptide sequence and charge),
# combine the column names to a single str index.
#
# > The Collaborative Modeling approach will need a single feature column.

@@ -203,7 +204,7 @@ def join_as_str(seq):
if params.fn_rawfile_metadata:
df_meta = pd.read_csv(params.fn_rawfile_metadata, index_col=0)
else:
logger.warning(f"No metadata for samples provided, create placeholder.")
logger.warning("No metadata for samples provided, create placeholder.")
if params.meta_date_col:
raise ValueError(
f"No metadata provided, but data column set: {params.meta_date_col}")
@@ -236,7 +237,8 @@ def join_as_str(seq):
if params.min_RT_time:
logger.info(
"Metadata should have 'MS max RT' entry from ThermoRawFileParser")
msg = f"Minimum RT time maxiumum is set to {params.min_RT_time} minutes (to exclude too short runs, which are potentially fractions)."
msg = (f"Minimum RT time maxiumum is set to {params.min_RT_time} minutes"
" (to exclude too short runs, which are potentially fractions).")
# can be integrated into query string
mask_RT = df_meta['MS max RT'] >= params.min_RT_time
msg += f" Total number of samples retained: {int(mask_RT.sum())}"
@@ -378,7 +380,7 @@ def join_as_str(seq):

# %%
ax = df.notna().sum(axis=0).sort_values().plot()
_new_labels = [l.get_text().split(';')[0] for l in ax.get_xticklabels()]
_new_labels = [l_.get_text().split(';')[0] for l_ in ax.get_xticklabels()]
_ = ax.set_xticklabels(_new_labels, rotation=45,
horizontalalignment='right')
ax.set_xlabel('feature prevalence')
@@ -608,13 +610,6 @@ def join_as_str(seq):

# %%
group = 2
# if not mnar:
# fake_na, splits.train_X = sample_data(df_long.squeeze(),
# sample_index_to_drop=0,
# weights=freq_per_feature,
# frac=0.1,
# random_state=params.random_state,)
# assert len(splits.train_X) > len(fake_na)
# ! move parameter checks to start of script
if 0.0 <= params.frac_mnar <= 1.0:
fig, axes = plt.subplots(1, 2, figsize=(8, 2))
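The body of this branch is not shown in the excerpt. As a hedged sketch of one common way to use a frac_mnar fraction in [0, 1] — masking the lowest intensities as MNAR and a random remainder as MCAR; this is an assumption for illustration, not the repository's actual sampling code — the split of simulated missing values could look like this:

# Assumed illustration only: split simulated missing values into an MNAR part
# (lowest intensities preferred) and an MCAR part (uniformly at random).
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
intensities = pd.Series(rng.normal(25, 3, size=1_000))  # toy long-format intensities
frac_mnar = 0.25          # hypothetical parameter value
n_simulated = 100         # hypothetical number of values to hold out
n_mnar = int(round(n_simulated * frac_mnar))
n_mcar = n_simulated - n_mnar

idx_mnar = intensities.nsmallest(n_mnar).index                                  # intensity-dependent (MNAR)
idx_mcar = intensities.drop(idx_mnar).sample(n=n_mcar, random_state=42).index   # random (MCAR)
simulated_na = intensities.loc[idx_mnar.union(idx_mcar)]
print(len(simulated_na))  # 100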
@@ -743,6 +738,30 @@ def join_as_str(seq):
splits.test_y = splits.test_y.drop(to_remove.index)
diff

# %% [markdown]
# Some tools require at least 4 observations per feature in the training data,
# which is a good requirement. Due to "MNAR" sampling, most measurements
# of a feature can end up in the validation or test data.
#
# In that case: move the validation measurements back to the training data.
# If the condition is still not met after this procedure, a ValueError is raised.

# %%
mask_min_4_measurments = splits.train_X.groupby(level=1).count() < 4
if mask_min_4_measurments.any():
idx = mask_min_4_measurments.loc[mask_min_4_measurments].index
logger.warning(f"Features with less than 4 measurments in training data: {idx.to_list()}")
to_remove = splits.val_y.loc[pd.IndexSlice[:, idx]]
print("To remove from validation data: ")
display(to_remove)
splits.train_X = pd.concat([splits.train_X, to_remove])
splits.val_y = splits.val_y.drop(to_remove.index)
# check condition again
mask_min_4_measurments = splits.train_X.groupby(level=1).count() < 4
if mask_min_4_measurments.any():
idx = mask_min_4_measurments.loc[mask_min_4_measurments].index
raise ValueError("Some features still have less than 4 measurments in training data"
f" after removing the features from the validation data: {idx.to_list()}")

# %% [markdown]
# ### Save in long format