Skip to content

Commit

Permalink
🎨🐛 update overfitting analysis (25MNAR)
Browse files Browse the repository at this point in the history
- 🐛 remove the metadata file path (fn_rawfile_metadata) from the train_X.yaml configs
- also run the KNN comparison with workflow v2 using a 25% share of MNAR (frac_mnar: 0.25)
  • Loading branch information
Henry committed Dec 7, 2023
1 parent 052ed78 commit 49d628b
Show file tree
Hide file tree
Showing 16 changed files with 73 additions and 54 deletions.
17 changes: 15 additions & 2 deletions project/03_1_best_models_comparison.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
"logger = setup_logger(logger=logging.getLogger('vaep'), level=10)\n",
"\n",
"plt.rcParams['figure.figsize'] = [4.0, 2.0]\n",
"vaep.plotting.make_large_descriptors(5)"
"vaep.plotting.make_large_descriptors(7)"
]
},
{
Expand Down Expand Up @@ -93,7 +93,10 @@
"min_max_MAE = (selected\n",
" .loc[pd.IndexSlice[:, 'MAE', :]]\n",
" .groupby('model')\n",
" .agg(['min', 'max']))\n",
" .agg(['min', 'max'])\n",
" .stack()\n",
" .T\n",
" .loc[IDX[0]])\n",
"min_max_MAE.to_excel(writer, sheet_name='min_max_MAE')\n",
"min_max_MAE"
]
Expand Down Expand Up @@ -182,6 +185,16 @@
"vaep.savefig(fig, FOLDER / \"model_performance_repeated_runs.pdf\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0813889a",
"metadata": {},
"outputs": [],
"source": [
"writer.close()"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down
10 changes: 8 additions & 2 deletions project/03_1_best_models_comparison.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
logger = setup_logger(logger=logging.getLogger('vaep'), level=10)

plt.rcParams['figure.figsize'] = [4.0, 2.0]
vaep.plotting.make_large_descriptors(5)
vaep.plotting.make_large_descriptors(7)

# %%
IDX = [['proteinGroups', 'peptides', 'evidence'],
Expand Down Expand Up @@ -63,7 +63,10 @@
min_max_MAE = (selected
.loc[pd.IndexSlice[:, 'MAE', :]]
.groupby('model')
.agg(['min', 'max']))
.agg(['min', 'max'])
.stack()
.T
.loc[IDX[0]])
min_max_MAE.to_excel(writer, sheet_name='min_max_MAE')
min_max_MAE

Expand Down Expand Up @@ -114,3 +117,6 @@
vaep.savefig(fig, FOLDER / "model_performance_repeated_runs.pdf")

# %%
writer.close()

# %%
32 changes: 16 additions & 16 deletions project/config/knn_comparison/hela_pgs_large/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,23 @@
config_split: config/knn_comparison/hela_pgs_large/split.yaml
config_train: runs/knn_comparison/hela_pgs_large/configs_train/train_{model}.yaml
folder_experiment: runs/knn_comparison/hela_pgs_large
fn_rawfile_metadata: data/dev_datasets/df_intensities_proteinGroups_long/metadata.csv
fn_rawfile_metadata: None
file_format: csv
cuda: False
models:
- Median:
model: Median
- 3NN:
neighbors: 3
model: KNN
- 5NN:
neighbors: 5
model: KNN
- 10NN:
neighbors: 10
model: KNN
- 15NN:
neighbors: 15
model: KNN
- Median:
model: Median
- 3NN:
neighbors: 3
model: KNN
- 5NN:
neighbors: 5
model: KNN
- 10NN:
neighbors: 10
model: KNN
- 15NN:
neighbors: 15
model: KNN
NAGuideR_methods:
- KNN_IMPUTE
- KNN_IMPUTE
2 changes: 1 addition & 1 deletion project/config/knn_comparison/hela_pgs_large/split.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
FN_INTENSITIES: data/dev_datasets/df_intensities_proteinGroups_long/Q_Exactive_HF_X_Orbitrap_6070.pkl
sample_completeness: 0.5
min_RT_time: 120
frac_mnar: 0.25
18 changes: 8 additions & 10 deletions project/config/permuted_dataset/config.yaml
Original file line number Diff line number Diff line change
@@ -1,18 +1,16 @@
# config for Snakefile_v1
# fit the same models to the permuted data as to the original data
config_split: config/permuted_dataset/split.yaml # proteinGroups
config_split: config/permuted_dataset/split.yaml # proteinGroups
config_train: config/single_dev_dataset/proteinGroups/train_{model}.yaml
folder_experiment: runs/permuted #/proteinGroups
fn_rawfile_metadata: # no metadata for permuted data
cuda: False
models:
- Median
- CF
- DAE
- VAE
- KNN
- Median
- CF
- DAE
- VAE
- KNN
NAGuideR_methods:
- lls
- knnmethod
- rf
# - impseq # fails
- KNN_IMPUTE
# - RF
1 change: 1 addition & 0 deletions project/config/permuted_dataset/split.yaml
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
FN_INTENSITIES: data/dev_datasets/df_intensities_proteinGroups_long/Q_Exactive_HF_X_Orbitrap_6070_permuted.pkl
sample_completeness: 0.5
frac_mnar: 0.25
15 changes: 8 additions & 7 deletions project/config/repeat_best/split.yaml
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
epochs_max:
- 100
- 100
repeats: 5
folder: "runs/repeat_best_split"
levels:
- proteinGroups
- peptides
- evidence
- proteinGroups
- peptides
- evidence
fn_rawfile_metadata: data/dev_datasets/df_intensities_{level}_long/metadata.csv
config_split: 'config/single_dev_dataset/{level}/split.yaml'
config_train: 'config/single_dev_dataset/{level}/train_{model}.yaml'
repitition_name: 'repeat'
config_split: "config/single_dev_dataset/{level}/split.yaml"
config_train: "config/single_dev_dataset/{level}/train_{model}.yaml"
repitition_name: "repeat"
file_format: pkl
cuda: True
15 changes: 8 additions & 7 deletions project/config/repeat_best/train.yaml
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
epochs_max:
- 100
- 100
repeats: 5
folder: "runs/repeat_best_train"
levels:
- proteinGroups
- peptides
- evidence
- proteinGroups
- peptides
- evidence
fn_rawfile_metadata: data/dev_datasets/df_intensities_{level}_long/metadata.csv
config_split: 'config/single_dev_dataset/{level}/split.yaml'
config_train: 'config/single_dev_dataset/{level}/train_{model}.yaml'
repitition_name: 'repeat'
config_split: "config/single_dev_dataset/{level}/split.yaml"
config_train: "config/single_dev_dataset/{level}/train_{model}.yaml"
repitition_name: "repeat"
file_format: pkl
cuda: True
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
file_format: csv
fn_rawfile_metadata: data/dev_datasets/df_intensities_proteinGroups_long/metadata.csv
latent_dim: 50
batch_size: 32768
epochs_max: 100
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
file_format: csv
fn_rawfile_metadata: data/dev_datasets/df_intensities_proteinGroups_long/metadata.csv
latent_dim: 25
batch_size: 64
epochs_max: 100
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,2 @@
neighbors: 3
file_format: csv
fn_rawfile_metadata: data/dev_datasets/df_intensities_proteinGroups_long/metadata.csv
file_format: csv
Original file line number Diff line number Diff line change
@@ -1,2 +1 @@
file_format: csv
fn_rawfile_metadata: data/dev_datasets/df_intensities_proteinGroups_long/metadata.csv
file_format: csv
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
# models_training:
file_format: csv
fn_rawfile_metadata: data/dev_datasets/df_intensities_proteinGroups_long/metadata.csv
latent_dim: 25
batch_size: 64
epochs_max: 50
Expand Down
4 changes: 2 additions & 2 deletions project/workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ rule comparison:
out=f"{{folder_experiment}}/{nb_stem}.o",
shell:
"papermill {input.nb} {output.nb:q}"
" -r fn_rawfile_metadata {params.meta_data:q}"
" -p fn_rawfile_metadata {params.meta_data:q}"
" -r folder_experiment {wildcards.folder_experiment:q}"
" -r models {params.models:q}"
" && jupyter nbconvert --to html {output.nb:q}"
Expand Down Expand Up @@ -179,7 +179,7 @@ rule train_models:
"papermill {input.nb:q} {output.nb:q}"
" -f {input.configfile:q}"
" -r folder_experiment {params.folder_experiment:q}"
" -r fn_rawfile_metadata {params.meta_data:q}"
" -p fn_rawfile_metadata {params.meta_data:q}"
" -r model_key {wildcards.model:q}"
" 2> {log.err}"
" && jupyter nbconvert --to html {output.nb:q}"
Expand Down
2 changes: 2 additions & 0 deletions project/workflow/Snakefile_best_repeated_split.smk
Original file line number Diff line number Diff line change
Expand Up @@ -97,11 +97,13 @@ rule train_models:
model_key="{model}",
meta_data=config["fn_rawfile_metadata"],
file_format=config["file_format"],
cuda=config["cuda"],
shell:
"papermill {input.nb} {output.nb}"
" -f {input.configfile}"
" -r folder_experiment {params.folder_experiment}"
" -r fn_rawfile_metadata {params.meta_data}"
" -r file_format {params.file_format}"
" -r model_key {params.model_key}"
" -p cuda {params.cuda}"
" && jupyter nbconvert --to html {output.nb}"
2 changes: 2 additions & 0 deletions project/workflow/Snakefile_best_repeated_train.smk
Original file line number Diff line number Diff line change
Expand Up @@ -91,11 +91,13 @@ rule train_models:
model_key="{model}_{repeat}",
meta_data=config["fn_rawfile_metadata"],
file_format=config["file_format"],
cuda=config['cuda'],
shell:
"papermill {input.nb} {output.nb}"
" -f {input.configfile}"
" -r folder_experiment {params.folder_experiment}"
" -r fn_rawfile_metadata {params.meta_data}"
" -r file_format {params.file_format}"
" -r model_key {params.model_key}"
" -p cuda {params.cuda}"
" && jupyter nbconvert --to html {output.nb}"

0 comments on commit 49d628b

Please sign in to comment.