🚧 update Snakefile v2
- cluster execution and renamed files
- format
Henry committed Oct 16, 2023
1 parent fd1d07d commit 02b2d9f
Showing 2 changed files with 86 additions and 41 deletions.
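
The "cluster execution" bullet corresponds to the new localrules declaration, the per-rule resources (mem_mb, walltime), and the .e/.o log-path params added in the Snakefile below. A minimal sketch of one way to consume them — the qsub flags are an assumption, not part of this commit, and rules that do not define these resources and params (e.g. dump_train_config) would additionally need --default-resources or a profile:

    snakemake -s project/workflow/Snakefile_v2 \
        --configfile project/config/knn_comparison/ald_pgs_all/config.yaml \
        --jobs 10 \
        --cluster "qsub -l walltime={resources.walltime},mem={resources.mem_mb} -e {params.err} -o {params.out}"

Rules listed under localrules (all, comparison, the two transform rules, create_splits) run in the main snakemake process; everything else is submitted as a cluster job.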
30 changes: 15 additions & 15 deletions project/config/knn_comparison/ald_pgs_all/config.yaml
@@ -5,19 +5,19 @@ fn_rawfile_metadata: data/ALD_study/processed/raw_meta.csv
file_format: pkl
cuda: False
models:
- Median:
model: Median
- 3NN:
neighbors: 3
model: KNN
- 5NN:
neighbors: 5
model: KNN
- 10NN:
neighbors: 10
model: KNN
- 15NN:
neighbors: 15
model: KNN
- Median:
model: Median
- 3NN:
neighbors: 3
model: KNN
- 5NN:
neighbors: 5
model: KNN
- 10NN:
neighbors: 10
model: KNN
- 15NN:
neighbors: 15
model: KNN
NAGuideR_methods:
- KNN_IMPUTE
- KNN_IMPUTE
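
The config.yaml hunk is formatting only: the same five model entries and the single NAGuideR method are listed before and after (15 additions, 15 deletions). An optional round-trip check that the reformatted block still parses as expected — the one-liner is not part of the repo and assumes PyYAML is installed:

    python -c 'import yaml; c = yaml.safe_load(open("project/config/knn_comparison/ald_pgs_all/config.yaml")); print([list(m) for m in c["models"]], c["NAGuideR_methods"])'
    # expected, given the entries above:
    # [['Median'], ['3NN'], ['5NN'], ['10NN'], ['15NN']] ['KNN_IMPUTE']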
97 changes: 71 additions & 26 deletions project/workflow/Snakefile_v2
@@ -3,16 +3,31 @@ Document how all the notebooks for a single experiment are connected.
"""
from snakemake.logging import logger


configfile: "config/single_dev_dataset/proteinGroups_N50/config.yaml"


MAX_WALLTIME = "24:00:00"
# Thinnode resource sharing: 40 cores and 196 GB RAM (minus 2 GB for snakemake)
# JOB_RAM_MB = int(204_800 / 40 * config['THREADS_MQ'])
JOB_RAM_MB = "4gb"
folder_experiment = config["folder_experiment"]
logger.info(f"{folder_experiment = }")


# local rules are executed in the process (job) running snakemake
localrules:
all,
comparison,
transform_NAGuideR_predictions,
transform_data_to_wide_format,
create_splits,


rule all:
input:
f"{folder_experiment}/figures/errors_binned_by_int_test.pdf",
f"{folder_experiment}/01_2_performance_summary.xlsx"
f"{folder_experiment}/figures/2_1_test_errors_binned_by_int.pdf",
f"{folder_experiment}/01_2_performance_summary.xlsx",


nb = "01_2_performance_plots.ipynb"
@@ -33,98 +48,118 @@ else:
if config["NAGuideR_methods"]:
MODELS += config["NAGuideR_methods"]

print(model_configs)
nb_stem = "01_2_performance_summary"


print(MODELS)

# import pdb; pdb.set_trace()

rule comparison:
input:
nb=nb,
runs=expand(
"{folder_experiment}/preds/pred_test_{model}.csv",
folder_experiment=folder_experiment,
folder_experiment=config["folder_experiment"],
model=MODELS,
),
output:
xlsx="{folder_experiment}/01_2_performance_summary.xlsx",
pdf="{folder_experiment}/figures/errors_binned_by_int_test.pdf",
xlsx=f"{{folder_experiment}}/{nb_stem}.xlsx",
pdf="{folder_experiment}/figures/2_1_test_errors_binned_by_int.pdf",
nb="{folder_experiment}" f"/{nb}",
params:
meta_data=config["fn_rawfile_metadata"],
models=",".join(MODELS),
err=f"{{folder_experiment}}/{nb_stem}.e",
out=f"{{folder_experiment}}/{nb_stem}.o",
shell:
"papermill {input.nb} {output.nb}"
" -r fn_rawfile_metadata {params.meta_data:q}"
" -r folder_experiment {wildcards.folder_experiment:q}"
" -r models {params.models:q}"
" && jupyter nbconvert --to html {output.nb}"


##########################################################################################
# train NAGuideR methods
nb_stem = "01_1_transfer_NAGuideR_pred"


rule transform_NAGuideR_predictions:
input:
input:
dumps=expand(
"{{folder_experiment}}/preds/pred_all_{method}.csv",
method=config["NAGuideR_methods"],
),
nb=f"{nb_stem}.ipynb",
output:
# "{{folder_experiment}}/preds/pred_real_na_{method}.csv"),
expand( (
"{{folder_experiment}}/preds/pred_val_{method}.csv",
"{{folder_experiment}}/preds/pred_test_{method}.csv"),
# "{{folder_experiment}}/preds/pred_real_na_{method}.csv"),
expand(
(
"{{folder_experiment}}/preds/pred_val_{method}.csv",
"{{folder_experiment}}/preds/pred_test_{method}.csv",
),
method=config["NAGuideR_methods"],
),
nb="{folder_experiment}/01_1_transfer_NAGuideR_pred.ipynb",
benchmark:
"{folder_experiment}/"f"{nb_stem}.tsv",
"{folder_experiment}/" f"{nb_stem}.tsv"
params:
err=f"{{folder_experiment}}/{nb_stem}.e",
out=f"{{folder_experiment}}/{nb_stem}.o",
folder_experiment="{folder_experiment}",
# https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#non-file-parameters-for-rules
dumps_as_str=lambda wildcards, input: ','.join(input.dumps)
dumps_as_str=lambda wildcards, input: ",".join(input.dumps),
shell:
"papermill {input.nb} {output.nb}"
" -r folder_experiment {params.folder_experiment}"
" -p dumps {params.dumps_as_str}"
" && jupyter nbconvert --to html {output.nb}"


rule train_NAGuideR_model:
input:
nb="01_1_train_NAGuideR_methods.ipynb",
train_split="{folder_experiment}/data/data_wide_sample_cols.csv",
output:
nb="{folder_experiment}/01_1_train_NAGuideR_{method}.ipynb",
dump=temp("{folder_experiment}/preds/pred_all_{method}.csv")
dump="{folder_experiment}/preds/pred_all_{method}.csv",
resources:
mem_mb=JOB_RAM_MB,
walltime=MAX_WALLTIME,
threads: 1 # R is single threaded
benchmark:
"{folder_experiment}/01_1_train_NAGuideR_{method}.tsv"
params:
err="{folder_experiment}/01_1_train_NAGuideR_{method}.e",
out="{folder_experiment}/01_1_train_NAGuideR_{method}.o",
folder_experiment="{folder_experiment}",
method="{method}",
name="{method}",
shell:
"papermill {input.nb} {output.nb}"
" -r train_split {input.train_split}"
" -r method {params.method}"
" -r folder_experiment {params.folder_experiment}"
" && jupyter nbconvert --to html {output.nb}"


nb_stem = "01_0_transform_data_to_wide_format"


rule transform_data_to_wide_format:
input:
nb="01_0_transform_data_to_wide_format.ipynb",
nb=f"{nb_stem}.ipynb",
train_split="{folder_experiment}/data/train_X.csv",
output:
nb="{folder_experiment}/01_0_transform_data_to_wide_format.ipynb",
train_split=temp("{folder_experiment}/data/data_wide_sample_cols.csv"),
train_split="{folder_experiment}/data/data_wide_sample_cols.csv",
params:
folder_experiment="{folder_experiment}",
err=f"{{folder_experiment}}/{nb_stem}.e",
out=f"{{folder_experiment}}/{nb_stem}.o",
shell:
"papermill {input.nb} {output.nb}"
" -r folder_experiment {params.folder_experiment}"
" && jupyter nbconvert --to html {output.nb}"


##########################################################################################
# train models in python
rule train_models:
@@ -134,12 +169,15 @@ rule train_models:
configfile=config["config_train"],
output:
nb="{folder_experiment}/01_1_train_{model}.ipynb",
pred="{folder_experiment}/preds/pred_test_{model}.csv"
pred="{folder_experiment}/preds/pred_test_{model}.csv",
benchmark:
"{folder_experiment}/01_1_train_{model}.tsv"
params:
folder_experiment="{folder_experiment}",
meta_data=config["fn_rawfile_metadata"],
err="{folder_experiment}/01_1_train_{model}.e",
out="{folder_experiment}/01_1_train_{model}.o",
name="{model}",
shell:
"papermill {input.nb} {output.nb}"
" -f {input.configfile}"
@@ -148,32 +186,39 @@ rule train_models:
" -r model_key {wildcards.model}"
" && jupyter nbconvert --to html {output.nb}"


##########################################################################################
# create config file dumps for each model


rule dump_train_config:
output:
configfile=config["config_train"]
configfile=config["config_train"],
run:
import yaml

with open(output.configfile, "w") as f:
yaml.dump(model_configs[wildcards.model], f)


##########################################################################################
# Create Data splits
# separate workflow by level -> provide custom configs
nb = "01_0_split_data.ipynb"
nb_stem = "01_0_split_data"


rule create_splits:
input:
nb=nb,
configfile=config["config_split"],
nb=f"{nb_stem}.ipynb",
configfile=f"{folder_experiment}/{nb_stem}.yaml",
output:
train_split="{folder_experiment}/data/train_X.csv",
nb="{folder_experiment}" f"/{nb}",
nb="{folder_experiment}" f"/{nb_stem}.ipynb",
params:
folder_experiment="{folder_experiment}",
meta_data=config["fn_rawfile_metadata"],
err=f"{{folder_experiment}}/{nb_stem}.e",
out=f"{{folder_experiment}}/{nb_stem}.o",
shell:
"papermill {input.nb} {output.nb}"
" -f {input.configfile}"

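With temp() dropped from the train_NAGuideR_model dump and from the wide-format split (see the rules above), intermediate files now persist, so a single method's predictions can be re-requested by asking snakemake for the concrete output file. A sketch — FOLDER stands for whatever the config's folder_experiment key points to, which this diff does not show:

    FOLDER=path/to/experiment_folder  # placeholder: the configured folder_experiment
    snakemake -s project/workflow/Snakefile_v2 \
        --configfile project/config/knn_comparison/ald_pgs_all/config.yaml \
        --jobs 1 \
        "$FOLDER/preds/pred_all_KNN_IMPUTE.csv"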