
Merge pull request #55 from RasmussenLab/further_R_methods
Methods:

- Added GSimp.
- Reduced the dimensionality of the example data in the GitHub Action so that
  GSimp finishes (~1 h); the method does not scale to larger data.
- Added the MNAR algorithm of MSIMPUTE.

Data:

- Ensure that the training data has at least 4 samples per feature (MSIMPUTE includes that check).
- Formatted and updated workflow configs and declarations (v1 & v2). Added a script for command creation.
Henry Webel authored Nov 26, 2023
2 parents 089cc8e + 89046b4 commit 29a549a
Showing 28 changed files with 1,804 additions and 525 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yaml
@@ -65,7 +65,7 @@ jobs:
run: |
cd project
snakemake -p -c1 --configfile config/single_dev_dataset/example/config.yaml -n
snakemake -p -c2 -k --configfile config/single_dev_dataset/example/config.yaml
snakemake -p -c1 -k --configfile config/single_dev_dataset/example/config.yaml
- name: Archive results
uses: actions/upload-artifact@v3
with:
58 changes: 43 additions & 15 deletions project/01_0_split_data.ipynb
@@ -28,7 +28,7 @@
"\n",
"import vaep\n",
"from vaep.io.datasplits import DataSplits\n",
"from vaep.sampling import feature_frequency, sample_data\n",
"from vaep.sampling import feature_frequency\n",
"\n",
"from vaep.analyzers import analyzers\n",
"from vaep.analyzers.analyzers import AnalyzePeptides\n",
@@ -245,7 +245,8 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"In case there are multiple features for each intensity values (currenlty: peptide sequence and charge), combine the column names to a single str index.\n",
"In case there are multiple features for each intensity values (currenlty: peptide sequence and charge),\n",
"combine the column names to a single str index.\n",
"\n",
"> The Collaborative Modeling approach will need a single feature column."
]
@@ -290,7 +291,7 @@
"if params.fn_rawfile_metadata:\n",
" df_meta = pd.read_csv(params.fn_rawfile_metadata, index_col=0)\n",
"else:\n",
" logger.warning(f\"No metadata for samples provided, create placeholder.\")\n",
" logger.warning(\"No metadata for samples provided, create placeholder.\")\n",
" if params.meta_date_col:\n",
" raise ValueError(\n",
" f\"No metadata provided, but data column set: {params.meta_date_col}\")\n",
@@ -346,7 +347,8 @@
"if params.min_RT_time:\n",
" logger.info(\n",
" \"Metadata should have 'MS max RT' entry from ThermoRawFileParser\")\n",
" msg = f\"Minimum RT time maxiumum is set to {params.min_RT_time} minutes (to exclude too short runs, which are potentially fractions).\"\n",
" msg = (f\"Minimum RT time maxiumum is set to {params.min_RT_time} minutes\"\n",
" \" (to exclude too short runs, which are potentially fractions).\")\n",
" # can be integrated into query string\n",
" mask_RT = df_meta['MS max RT'] >= params.min_RT_time\n",
" msg += f\" Total number of samples retained: {int(mask_RT.sum())}\"\n",
@@ -598,7 +600,7 @@
"outputs": [],
"source": [
"ax = df.notna().sum(axis=0).sort_values().plot()\n",
"_new_labels = [l.get_text().split(';')[0] for l in ax.get_xticklabels()]\n",
"_new_labels = [l_.get_text().split(';')[0] for l_ in ax.get_xticklabels()]\n",
"_ = ax.set_xticklabels(_new_labels, rotation=45,\n",
" horizontalalignment='right')\n",
"ax.set_xlabel('feature prevalence')\n",
@@ -1000,13 +1002,6 @@
"outputs": [],
"source": [
"group = 2\n",
"# if not mnar:\n",
"# fake_na, splits.train_X = sample_data(df_long.squeeze(),\n",
"# sample_index_to_drop=0,\n",
"# weights=freq_per_feature,\n",
"# frac=0.1,\n",
"# random_state=params.random_state,)\n",
"# assert len(splits.train_X) > len(fake_na)\n",
"! move parameter checks to start of script\n",
"if 0.0 <= params.frac_mnar <= 1.0:\n",
" fig, axes = plt.subplots(1, 2, figsize=(8, 2))\n",
@@ -1146,9 +1141,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"lines_to_next_cell": 2
},
"metadata": {},
"outputs": [],
"source": [
"diff = (splits\n",
@@ -1168,6 +1161,41 @@
"diff"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Some tools require at least 4 observation in the training data,\n",
"which is a good requirment. Due to \"MNAR\" sampling, most measurments\n",
"of a features can end up in the validation or test data.\n",
"\n",
"In that case: Move the validation measurments back to the training data.\n",
"If after this procedure the condition is still not met, a value error is raised."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"mask_min_4_measurments = splits.train_X.groupby(level=1).count() < 4\n",
"if mask_min_4_measurments.any():\n",
" idx = mask_min_4_measurments.loc[mask_min_4_measurments].index\n",
" logger.warning(f\"Features with less than 4 measurments in training data: {idx.to_list()}\")\n",
" to_remove = splits.val_y.loc[pd.IndexSlice[:, idx]]\n",
" print(\"To remove from validation data: \")\n",
" display(to_remove)\n",
" splits.train_X = pd.concat([splits.train_X, to_remove])\n",
" splits.val_y = splits.val_y.drop(to_remove.index)\n",
" # check condition again\n",
" mask_min_4_measurments = splits.train_X.groupby(level=1).count() < 4\n",
" if mask_min_4_measurments.any():\n",
" idx = mask_min_4_measurments.loc[mask_min_4_measurments].index\n",
" raise ValueError(\"Some features still have less than 4 measurments in training data\"\n",
" f\" after removing the features from the validation data: {idx.to_list()}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
43 changes: 31 additions & 12 deletions project/01_0_split_data.py
@@ -32,7 +32,7 @@

import vaep
from vaep.io.datasplits import DataSplits
from vaep.sampling import feature_frequency, sample_data
from vaep.sampling import feature_frequency

from vaep.analyzers import analyzers
from vaep.analyzers.analyzers import AnalyzePeptides
@@ -174,7 +174,8 @@ def add_meta_data(df: pd.DataFrame, df_meta: pd.DataFrame):


# %% [markdown]
# In case there are multiple features for each intensity values (currenlty: peptide sequence and charge), combine the column names to a single str index.
# In case there are multiple features for each intensity value (currently: peptide sequence and charge),
# combine the column names to a single str index.
#
# > The Collaborative Modeling approach will need a single feature column.

@@ -203,7 +204,7 @@ def join_as_str(seq):
if params.fn_rawfile_metadata:
df_meta = pd.read_csv(params.fn_rawfile_metadata, index_col=0)
else:
logger.warning(f"No metadata for samples provided, create placeholder.")
logger.warning("No metadata for samples provided, create placeholder.")
if params.meta_date_col:
raise ValueError(
f"No metadata provided, but data column set: {params.meta_date_col}")
@@ -236,7 +237,8 @@ def join_as_str(seq):
if params.min_RT_time:
logger.info(
"Metadata should have 'MS max RT' entry from ThermoRawFileParser")
msg = f"Minimum RT time maxiumum is set to {params.min_RT_time} minutes (to exclude too short runs, which are potentially fractions)."
msg = (f"Minimum RT time maxiumum is set to {params.min_RT_time} minutes"
" (to exclude too short runs, which are potentially fractions).")
# can be integrated into query string
mask_RT = df_meta['MS max RT'] >= params.min_RT_time
msg += f" Total number of samples retained: {int(mask_RT.sum())}"
@@ -378,7 +380,7 @@ def join_as_str(seq):

# %%
ax = df.notna().sum(axis=0).sort_values().plot()
_new_labels = [l.get_text().split(';')[0] for l in ax.get_xticklabels()]
_new_labels = [l_.get_text().split(';')[0] for l_ in ax.get_xticklabels()]
_ = ax.set_xticklabels(_new_labels, rotation=45,
horizontalalignment='right')
ax.set_xlabel('feature prevalence')
@@ -608,13 +610,6 @@ def join_as_str(seq):

# %%
group = 2
# if not mnar:
# fake_na, splits.train_X = sample_data(df_long.squeeze(),
# sample_index_to_drop=0,
# weights=freq_per_feature,
# frac=0.1,
# random_state=params.random_state,)
# assert len(splits.train_X) > len(fake_na)
# ! move parameter checks to start of script
if 0.0 <= params.frac_mnar <= 1.0:
fig, axes = plt.subplots(1, 2, figsize=(8, 2))
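The body of this branch is not shown in the excerpt. As a hedged sketch of one common way to use a frac_mnar fraction in [0, 1] — masking the lowest intensities as MNAR and a random remainder as MCAR; this is an assumption for illustration, not the repository's actual sampling code — the split of simulated missing values could look like this:

# Assumed illustration only: split simulated missing values into an MNAR part
# (lowest intensities preferred) and an MCAR part (uniformly at random).
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
intensities = pd.Series(rng.normal(25, 3, size=1_000))  # toy long-format intensities
frac_mnar = 0.25          # hypothetical parameter value
n_simulated = 100         # hypothetical number of values to hold out
n_mnar = int(round(n_simulated * frac_mnar))
n_mcar = n_simulated - n_mnar

idx_mnar = intensities.nsmallest(n_mnar).index                                  # intensity-dependent (MNAR)
idx_mcar = intensities.drop(idx_mnar).sample(n=n_mcar, random_state=42).index   # random (MCAR)
simulated_na = intensities.loc[idx_mnar.union(idx_mcar)]
print(len(simulated_na))  # 100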
@@ -743,6 +738,30 @@ def join_as_str(seq):
splits.test_y = splits.test_y.drop(to_remove.index)
diff

# %% [markdown]
# Some tools require at least 4 observations per feature in the training data,
# which is a good requirement. Due to "MNAR" sampling, most measurements
# of a feature can end up in the validation or test data.
#
# In that case: move the validation measurements back to the training data.
# If the condition is still not met after this procedure, a ValueError is raised.

# %%
mask_min_4_measurments = splits.train_X.groupby(level=1).count() < 4
if mask_min_4_measurments.any():
idx = mask_min_4_measurments.loc[mask_min_4_measurments].index
logger.warning(f"Features with less than 4 measurments in training data: {idx.to_list()}")
to_remove = splits.val_y.loc[pd.IndexSlice[:, idx]]
print("To remove from validation data: ")
display(to_remove)
splits.train_X = pd.concat([splits.train_X, to_remove])
splits.val_y = splits.val_y.drop(to_remove.index)
# check condition again
mask_min_4_measurments = splits.train_X.groupby(level=1).count() < 4
if mask_min_4_measurments.any():
idx = mask_min_4_measurments.loc[mask_min_4_measurments].index
raise ValueError("Some features still have less than 4 measurments in training data"
f" after removing the features from the validation data: {idx.to_list()}")

# %% [markdown]
# ### Save in long format