From 33a3b3b6f381251eff2b40715c0d6093aed19b56 Mon Sep 17 00:00:00 2001 From: Michael-Geuenich Date: Mon, 29 May 2023 16:01:45 -0400 Subject: [PATCH] cell type assignment --- pipeline/cell-type-assignment/CyTOFLDA.R | 4 + pipeline/cell-type-assignment/Seurat.R | 35 --- .../active-learning-accuracy.R | 12 +- .../predict-random-forest.py | 10 +- .../random-forest-train.py | 23 +- pipeline/cell-type-assignment/scmap.R | 8 + .../simulate-active-learner.R | 3 + pipeline/cell-type-assignment/singleR.R | 4 + .../subset-simulated-active-learner-n-cells.R | 1 + pipeline/cell-type-predictions.smk | 217 +++++++++--------- 10 files changed, 157 insertions(+), 160 deletions(-) diff --git a/pipeline/cell-type-assignment/CyTOFLDA.R b/pipeline/cell-type-assignment/CyTOFLDA.R index cc25c32..5364090 100644 --- a/pipeline/cell-type-assignment/CyTOFLDA.R +++ b/pipeline/cell-type-assignment/CyTOFLDA.R @@ -72,4 +72,8 @@ if(is.null(snakemake@wildcards[['cell_selection']])){ df_output$cell_selection <- snakemake@wildcards[['cell_selection']] } +if(!is.null(snakemake@wildcards$cell_selection)){ + df_output$pred_cells <- snakemake@wildcards$cell_selection +} + write_tsv(df_output, snakemake@output[['prediction']]) \ No newline at end of file diff --git a/pipeline/cell-type-assignment/Seurat.R b/pipeline/cell-type-assignment/Seurat.R index 1cf9e99..6bfa8b1 100644 --- a/pipeline/cell-type-assignment/Seurat.R +++ b/pipeline/cell-type-assignment/Seurat.R @@ -9,7 +9,6 @@ sce <- readRDS(snakemake@input[['training_rds']]) markers <- read_yaml(snakemake@input[['markers']])$cell_types unique_markers <- unlist(markers) %>% unique() - ### [PROCESS DATA] ### if(snakemake@params[['mod']] == 'scRNASeq' | snakemake@params[['mod']] == 'snRNASeq'){ # Normalize scRNASeq data @@ -101,40 +100,6 @@ assignments %>% select(cell_id, predicted_cell_type, prediction_params, cluster, modality) %>% write_tsv(snakemake@output[['assignments']]) -### Create diagnostic plots -# Positive markers -# lapply(1:length(markers), function(x){ -# cell_types <- names(markers[x]) %>% -# gsub(" ", "-", .) - -# out_path <- snakemake@params[['positive_markers_diagnostic']] %>% -# gsub("\\[", "{", .) %>% -# gsub("\\]", "}", .) -# out <- glue(out_path) - -# pdf(out, width = 12) -# print(FeaturePlot(seu, features = markers[[x]]$positive)) -# dev.off() -# }) - -# # Negative markers -# lapply(1:length(markers), function(x){ -# cell_types <- names(markers[x]) %>% -# gsub(" ", "-", .) - -# out_path <- snakemake@params[['negative_markers_diagnostic']] %>% -# gsub("\\[", "{", .) %>% -# gsub("\\]", "}", .) -# out <- glue(out_path) - -# if(!is.null(markers[[x]]$negative)){ -# pdf(out, width = 12) -# print(FeaturePlot(seu, features = markers[[x]]$negative)) -# dev.off() -# } -# }) - - ### Cell type umap Idents(seu) <- assignments$predicted_cell_type diff --git a/pipeline/cell-type-assignment/active-learning-accuracy.R b/pipeline/cell-type-assignment/active-learning-accuracy.R index 99062ed..1a6b2e4 100644 --- a/pipeline/cell-type-assignment/active-learning-accuracy.R +++ b/pipeline/cell-type-assignment/active-learning-accuracy.R @@ -24,6 +24,7 @@ if(grepl("maxp", snakemake@wildcards$strat)){ df_expression <- load_scs(sce) df_expression$cell_type <- NA df_expression$gt_cell_type <- sce$CellType +df_expression$non_corrupted_gt_cell_type <- sce$CellType df_expression$iteration <- NA df_expression$corrupted <- NA @@ -63,6 +64,7 @@ df_PCA <- bind_cols( df_PCA[,1:min(20, ncol(df_PCA))], tibble(cell_type = df_expression$cell_type, gt_cell_type = df_expression$gt_cell_type, + non_corrupted_gt_cell_type = df_expression$non_corrupted_gt_cell_type, iteration = df_expression$iteration) ) @@ -74,24 +76,26 @@ f1_scores <- tibble( ) for(i in 1:iters){ + print(i) annotated_cells <- df_PCA %>% filter(!is.na(cell_type)) %>% filter(cell_type != "Skipped", cell_type != "Unclear") + print(dim(annotated_cells)) left_cells <- df_PCA %>% filter(is.na(cell_type)) - ModelFit <- fit_AL_classifier(select(annotated_cells, -gt_cell_type, -iteration), + ModelFit <- fit_AL_classifier(select(annotated_cells, -gt_cell_type, -iteration, -non_corrupted_gt_cell_type), snakemake@wildcards$AL_alg) ### Calculate F1-score predicted_scores <- predict(ModelFit, - select(left_cells, -X1, -cell_type, -gt_cell_type, -iteration), + select(left_cells, -X1, -cell_type, -gt_cell_type, -iteration, -non_corrupted_gt_cell_type), type = "raw") preds <- tibble( cell_id = left_cells$X1, - annotated_cell_type = left_cells$gt_cell_type, + annotated_cell_type = left_cells$non_corrupted_gt_cell_type, predicted_cell_type = predicted_scores ) @@ -106,7 +110,7 @@ for(i in 1:iters){ # Continue AL - predict probabilities predicted_scores <- predict(ModelFit, - select(left_cells, -X1, -cell_type, -gt_cell_type, -iteration), + select(left_cells, -X1, -cell_type, -gt_cell_type, -iteration, -non_corrupted_gt_cell_type), type = "prob") # Get next set of cells diff --git a/pipeline/cell-type-assignment/predict-random-forest.py b/pipeline/cell-type-assignment/predict-random-forest.py index 7d5e7fb..bd664a1 100644 --- a/pipeline/cell-type-assignment/predict-random-forest.py +++ b/pipeline/cell-type-assignment/predict-random-forest.py @@ -14,9 +14,14 @@ predicted = model.predict(expression) +if 'cell_num' in dict(snakemake.wildcards).keys(): + cell_num = snakemake.wildcards['cell_num'] +else: + cell_num = 'NA' + output = pd.DataFrame({'cell_id': cell_ids, 'predicted_cell_type': predicted, - 'prediction_params': 'Random-Forest-knn-' + snakemake.wildcards['neighbors'] + '-res-' + snakemake.wildcards['res'] + '-cell_numbers-' + snakemake.wildcards['cell_num'] + '-randomSelection-' + snakemake.wildcards['rand'] + '-corrupted-' + snakemake.wildcards['corrupt'] + '-Init-' + snakemake.wildcards['initial'] + '-seed-' + snakemake.wildcards['s'], + 'prediction_params': 'Random-Forest-knn-' + snakemake.wildcards['neighbors'] + '-res-' + snakemake.wildcards['res'] + '-cell_numbers-' + cell_num + '-randomSelection-' + snakemake.wildcards['rand'] + '-corrupted-' + snakemake.wildcards['corrupt'] + '-Init-' + snakemake.wildcards['initial'] + '-seed-' + snakemake.wildcards['s'], 'selection_procedure': snakemake.wildcards['selection_procedure'] + '-strategy-' + snakemake.wildcards['strat'] + '-ALAlg-' + snakemake.wildcards['AL_alg'], 'modality': snakemake.wildcards['modality']}) @@ -28,4 +33,7 @@ if 'similarity' in dict(snakemake.wildcards).keys(): output['similarity'] = snakemake.wildcards['bal'] + '-' + snakemake.wildcards['similarity'] +if 'cell_selection' in dict(snakemake.wildcards).keys(): + output['pred_cells'] = snakemake.wildcards['cell_selection'] + output.to_csv(snakemake.output['predictions'], sep = '\t', index = False) \ No newline at end of file diff --git a/pipeline/cell-type-assignment/random-forest-train.py b/pipeline/cell-type-assignment/random-forest-train.py index 9f897b5..a15a2ea 100644 --- a/pipeline/cell-type-assignment/random-forest-train.py +++ b/pipeline/cell-type-assignment/random-forest-train.py @@ -14,19 +14,24 @@ expression = pd.read_csv(snakemake.input['train'], sep = "\t", header = 0) annotation = pd.read_csv(str(snakemake.input['annotation']), sep = '\t', header = 0) -expression = pd.merge(expression, annotation, on = 'cell_id') +if 'cell_selection' in dict(snakemake.wildcards).keys(): + expression = pd.merge(expression, annotation, on = 'cell_id') + X_train = expression.drop(['cell_type_y', 'entropy', 'labeling', 'cell_type_x', 'cell_id', 'corrupted_cell_type', 'iteration', 'method', 'cell_num', 'params'], axis = 1) + y_train = expression['cell_type_y'] +else: + expression = pd.merge(expression, annotation, on = 'cell_id') -if snakemake.wildcards['selection_procedure'] == 'Active-Learning_entropy' or snakemake.wildcards['selection_procedure'] == 'Active-Learning_maxp': - expression = expression.drop(['iteration', 'cell_num', 'corrupted_cell_type'], axis = 1) + if snakemake.wildcards['selection_procedure'] == 'Active-Learning_entropy' or snakemake.wildcards['selection_procedure'] == 'Active-Learning_maxp': + expression = expression.drop(['iteration', 'cell_num', 'corrupted_cell_type'], axis = 1) -if snakemake.wildcards['selection_procedure'] == 'random': - expression = expression.drop(['set','params'], axis = 1) + if snakemake.wildcards['selection_procedure'] == 'random': + expression = expression.drop(['set','params'], axis = 1) -if snakemake.wildcards['selection_procedure'] == 'Seurat-clustering': - expression = expression.drop(['params'], axis = 1) + if snakemake.wildcards['selection_procedure'] == 'NoMarkerSeurat-clustering' or snakemake.wildcards['selection_procedure'] == 'MarkerSeurat-clustering': + expression = expression.drop(['params'], axis = 1) -X_train = expression.drop(['cell_type_y', 'method', 'cell_type_x', 'cell_id'], axis = 1) -y_train = expression['cell_type_y'] + X_train = expression.drop(['cell_type_y', 'method', 'cell_type_x', 'cell_id'], axis = 1) + y_train = expression['cell_type_y'] # ## ML Pipeline start RSEED = 42 diff --git a/pipeline/cell-type-assignment/scmap.R b/pipeline/cell-type-assignment/scmap.R index a452905..1b5ddd4 100644 --- a/pipeline/cell-type-assignment/scmap.R +++ b/pipeline/cell-type-assignment/scmap.R @@ -66,6 +66,10 @@ if(!is.null(snakemake@wildcards[['similarity']])){ clustering_prediction$similarity <- paste0(snakemake@wildcards[['bal']], '-', snakemake@wildcards[['similarity']]) } +if(!is.null(snakemake@wildcards$cell_selection)){ + clustering_prediction$pred_cells <- snakemake@wildcards$cell_selection +} + write_tsv(clustering_prediction, snakemake@output[['cluster_predictions']]) @@ -111,5 +115,9 @@ if(!is.null(snakemake@wildcards[['similarity']])){ sc_prediction$similarity <- paste0(snakemake@wildcards[['bal']], '-', snakemake@wildcards[['similarity']]) } +if(!is.null(snakemake@wildcards$cell_selection)){ + sc_prediction$pred_cells <- snakemake@wildcards$cell_selection +} + write_tsv(sc_prediction, snakemake@output[['sc_predictions']]) diff --git a/pipeline/cell-type-assignment/simulate-active-learner.R b/pipeline/cell-type-assignment/simulate-active-learner.R index e4d5b16..88c6484 100644 --- a/pipeline/cell-type-assignment/simulate-active-learner.R +++ b/pipeline/cell-type-assignment/simulate-active-learner.R @@ -43,6 +43,7 @@ if(snakemake@wildcards[['initial']] == 'ranking'){ }else if(snakemake@wildcards[['initial']] == 'random'){ random_cell_idx <- sample(1:nrow(df_expression), 20) df_expression$cell_type[random_cell_idx] <- df_expression$gt_cell_type[random_cell_idx] + df_expression$iteration[random_cell_idx] <- 0 } if(!is.null(snakemake@wildcards[['similarity']])){ @@ -82,6 +83,8 @@ df_PCA <- bind_cols( iteration = df_expression$iteration) ) +table(df_PCA$cell_type) + for(i in 1:max_AL_iterations){ AL <- active_learning_wrapper(select(df_PCA, -gt_cell_type, -iteration), snakemake@wildcards[['AL_alg']], diff --git a/pipeline/cell-type-assignment/singleR.R b/pipeline/cell-type-assignment/singleR.R index a5bdbb7..eeb23ba 100644 --- a/pipeline/cell-type-assignment/singleR.R +++ b/pipeline/cell-type-assignment/singleR.R @@ -49,4 +49,8 @@ if(!is.null(snakemake@wildcards[['similarity']])){ result$similarity <- paste0(snakemake@wildcards[['bal']], '-', snakemake@wildcards[['similarity']]) } +if(!is.null(snakemake@wildcards$cell_selection)){ + result$pred_cells <- snakemake@wildcards$cell_selection +} + write_tsv(result, snakemake@output[['predictions']]) \ No newline at end of file diff --git a/pipeline/cell-type-assignment/subset-simulated-active-learner-n-cells.R b/pipeline/cell-type-assignment/subset-simulated-active-learner-n-cells.R index 56f162d..0364e8f 100644 --- a/pipeline/cell-type-assignment/subset-simulated-active-learner-n-cells.R +++ b/pipeline/cell-type-assignment/subset-simulated-active-learner-n-cells.R @@ -1,6 +1,7 @@ library(tidyverse) training <- read_tsv(snakemake@input[['assignment']]) %>% + filter(!is.na(iteration)) |> arrange(iteration) subset <- training[1:as.integer(snakemake@wildcards[['subset_val']]),] diff --git a/pipeline/cell-type-predictions.smk b/pipeline/cell-type-predictions.smk index 91f303d..f16e31b 100644 --- a/pipeline/cell-type-predictions.smk +++ b/pipeline/cell-type-predictions.smk @@ -5,7 +5,7 @@ def expand_predictions_by_mod(mod, train_test_seeds = train_test_seeds, cell_numbers = cell_numbers): rand_0 = [] - pred_rand_0 = [expand(output + 'rare-subtype-benchmarking/Init_{initial}-{modality}-sel_{selection_procedure}-strat-{strat}-ALAlg-{AL_alg}-rand_sel-{rand}-corr-{corrupt}-knn_neighbors-{neighbors}-resolution-{res}-{method}-predictions-seed-{s}-{cell_num}-cells.tsv', + pred_rand_0 = [expand(output + 'rare-subtype-benchmarking/Init_{initial}-{modality}-sel_{selection_procedure}-strat-{strat}-ALAlg-{AL_alg}-rand_sel-{rand}-corr-{corrupt}-knn_neighbors-{neighbors}-resolution-{res}-{method}-predictions-seed-{{s}}-{cell_num}-cells.tsv', modality = mod, initial = selection_expansion_dict[select]['initial'], selection_procedure = [select], @@ -16,7 +16,7 @@ def expand_predictions_by_mod(mod, neighbors = selection_expansion_dict[select]['neighbors'], res = selection_expansion_dict[select]['res'], method = evaluation_methods_dict[mod], - s = train_test_seeds, + #s = train_test_seeds, cell_num = cell_numbers) for select in selection_expansion_dict.keys()] @@ -24,7 +24,7 @@ def expand_predictions_by_mod(mod, rand_0.extend(i) corr_0 = [] - pred_corr_0 = [expand(output + 'rare-subtype-benchmarking/Init_{initial}-{modality}-sel_{selection_procedure}-strat-{strat}-ALAlg-{AL_alg}-rand_sel-{rand}-corr-{corrupt}-knn_neighbors-{neighbors}-resolution-{res}-{method}-predictions-seed-{s}-{cell_num}-cells.tsv', + pred_corr_0 = [expand(output + 'rare-subtype-benchmarking/Init_{initial}-{modality}-sel_{selection_procedure}-strat-{strat}-ALAlg-{AL_alg}-rand_sel-{rand}-corr-{corrupt}-knn_neighbors-{neighbors}-resolution-{res}-{method}-predictions-seed-{{s}}-{cell_num}-cells.tsv', modality = mod, initial = selection_expansion_dict[select]['initial'], selection_procedure = [select], @@ -35,7 +35,7 @@ def expand_predictions_by_mod(mod, neighbors = selection_expansion_dict[select]['neighbors'], res = selection_expansion_dict[select]['res'], method = evaluation_methods_dict[mod], - s = train_test_seeds, + #s = train_test_seeds, cell_num = cell_numbers) for select in selection_expansion_dict.keys()] @@ -44,12 +44,12 @@ def expand_predictions_by_mod(mod, return rand_0 + corr_0 -def get_labels(procedure, mod, imbalanced = False): +def get_labels(procedure, mod, imbalanced = False, doublet = False): if imbalanced: if procedure == "Active-Learning_entropy" or procedure == "Active-Learning_maxp": path = output + 'data/imbalance-{{similarity}}-{{bal}}/{modality}/{{selection_procedure}}/AL-batches-subset/Init-{{initial}}-strat-{{strat}}-ALAlg-{{AL_alg}}-rand_sel-{{rand}}-corr-{{corrupt}}-{modality}-knn_neighbors-{{neighbors}}-resolution-{{res}}-seed-{{s}}-{{cell_num}}_cells.tsv' else: - path = output + 'data/imbalance-{{similarity}}-{{bal}}/{modality}/{{selection_procedure}}/{{selection_procedure}}-NA-ALAlg-NA-rand_sel-{{rand}}-corr-{{corrupt}}-{modality}-knn_neighbors-{{neighbors}}-resolution-{{res}}-seed-{{s}}-{{cell_num}}_cells.tsv' + path = output + 'data/imbalance-{{similarity}}-{{bal}}/{modality}/{{selection_procedure}}/Init_{{initial}}-strat-{{strat}}-rand_sel-{{rand}}-corr-{{corrupt}}-knn_neighbors-{{neighbors}}-resolution-{{res}}-seed-{{s}}-{{cell_num}}_cells.tsv' path = expand(path, modality = mod) else: if procedure == "Active-Learning_entropy" or procedure == "Active-Learning_maxp": @@ -61,119 +61,114 @@ def get_labels(procedure, mod, imbalanced = False): return path cell_type_predictions = { - # 'scRNASeq': expand_predictions_by_mod("scRNASeq"), - # 'snRNASeq': expand_predictions_by_mod("snRNASeq"), - # 'CyTOF': expand_predictions_by_mod("CyTOF") + 'scRNASeq': expand_predictions_by_mod("scRNASeq"), + 'snRNASeq': expand_predictions_by_mod("snRNASeq"), + 'CyTOF': expand_predictions_by_mod("CyTOF") } -# rule train_and_predict_scmap: -# input: -# annotation = lambda wildcards: get_labels(wildcards.selection_procedure, wildcards.modality), -# train_data = 'data/{modality}/{modality}-train-seed-{s}.rds', -# test_data = 'data/{modality}/{modality}-test-seed-{s}.rds' -# output: -# cluster_predictions = output + 'rare-subtype-benchmarking/Init_{initial}-{modality}-sel_{selection_procedure}-strat-{strat}-ALAlg-{AL_alg}-rand_sel-{rand}-corr-{corrupt}-knn_neighbors-{neighbors}-resolution-{res}-scmap-cluster-predictions-seed-{s}-{cell_num}-cells.tsv', -# sc_predictions = output + 'rare-subtype-benchmarking/Init_{initial}-{modality}-sel_{selection_procedure}-strat-{strat}-ALAlg-{AL_alg}-rand_sel-{rand}-corr-{corrupt}-knn_neighbors-{neighbors}-resolution-{res}-scmap-sc-predictions-seed-{s}-{cell_num}-cells.tsv' -# script: -# 'cell-type-assignment/scmap.R' +rule train_and_predict_scmap: + input: + annotation = lambda wildcards: get_labels(wildcards.selection_procedure, wildcards.modality), + train_data = 'data/{modality}/{modality}-train-seed-{s}.rds', + test_data = 'data/{modality}/{modality}-test-seed-{s}.rds' + output: + cluster_predictions = output + 'rare-subtype-benchmarking/Init_{initial}-{modality}-sel_{selection_procedure}-strat-{strat}-ALAlg-{AL_alg}-rand_sel-{rand}-corr-{corrupt}-knn_neighbors-{neighbors}-resolution-{res}-scmap-cluster-predictions-seed-{s}-{cell_num}-cells.tsv', + sc_predictions = output + 'rare-subtype-benchmarking/Init_{initial}-{modality}-sel_{selection_procedure}-strat-{strat}-ALAlg-{AL_alg}-rand_sel-{rand}-corr-{corrupt}-knn_neighbors-{neighbors}-resolution-{res}-scmap-sc-predictions-seed-{s}-{cell_num}-cells.tsv' + script: + 'cell-type-assignment/scmap.R' -# rule train_and_predict_singleR: -# input: -# annotation = lambda wildcards: get_labels(wildcards.selection_procedure, wildcards.modality), -# train_data = 'data/{modality}/{modality}-train-seed-{s}.rds', -# test_data = 'data/{modality}/{modality}-test-seed-{s}.rds' -# output: -# predictions = output + 'rare-subtype-benchmarking/Init_{initial}-{modality}-sel_{selection_procedure}-strat-{strat}-ALAlg-{AL_alg}-rand_sel-{rand}-corr-{corrupt}-knn_neighbors-{neighbors}-resolution-{res}-singleR-predictions-seed-{s}-{cell_num}-cells.tsv' -# script: -# 'cell-type-assignment/singleR.R' +rule train_and_predict_singleR: + input: + annotation = lambda wildcards: get_labels(wildcards.selection_procedure, wildcards.modality), + train_data = 'data/{modality}/{modality}-train-seed-{s}.rds', + test_data = 'data/{modality}/{modality}-test-seed-{s}.rds' + output: + predictions = output + 'rare-subtype-benchmarking/Init_{initial}-{modality}-sel_{selection_procedure}-strat-{strat}-ALAlg-{AL_alg}-rand_sel-{rand}-corr-{corrupt}-knn_neighbors-{neighbors}-resolution-{res}-singleR-predictions-seed-{s}-{cell_num}-cells.tsv' + script: + 'cell-type-assignment/singleR.R' -# rule train_random_forest: -# input: -# train = 'data/{modality}/{modality}-expression-df-train-seed-{s}.tsv', -# annotation = lambda wildcards: get_labels(wildcards.selection_procedure, wildcards.modality) -# output: -# model = output + 'models/random-forest-Init_{initial}-{modality}-trained-on-{selection_procedure}-{strat}-ALAlg-{AL_alg}-rand_sel-{rand}-corr-{corrupt}-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}-{cell_num}-cells.pkl' -# resources: -# mem_mb=20000 -# log: -# output + 'logs/cell-type-predictions/random-forest-Init_{initial}-{modality}-trained-on-{selection_procedure}-{strat}-ALAlg-{AL_alg}-rand_sel-{rand}-corr-{corrupt}-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}-{cell_num}-cells.log' -# script: -# 'cell-type-assignment/random-forest-train.py' +rule train_random_forest: + input: + train = 'data/{modality}/{modality}-expression-df-train-seed-{s}.tsv', + annotation = lambda wildcards: get_labels(wildcards.selection_procedure, wildcards.modality) + output: + model = output + 'models/random-forest-Init_{initial}-{modality}-trained-on-{selection_procedure}-{strat}-ALAlg-{AL_alg}-rand_sel-{rand}-corr-{corrupt}-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}-{cell_num}-cells.pkl' + resources: + mem_mb=20000 + log: + output + 'logs/cell-type-predictions/random-forest-Init_{initial}-{modality}-trained-on-{selection_procedure}-{strat}-ALAlg-{AL_alg}-rand_sel-{rand}-corr-{corrupt}-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}-{cell_num}-cells.log' + script: + 'cell-type-assignment/random-forest-train.py' -# rule predict_random_forest: -# input: -# test = 'data/{modality}/{modality}-expression-df-test-seed-{s}.tsv', -# model = output + 'models/random-forest-Init_{initial}-{modality}-trained-on-{selection_procedure}-{strat}-ALAlg-{AL_alg}-rand_sel-{rand}-corr-{corrupt}-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}-{cell_num}-cells.pkl' -# resources: -# mem_mb=5000 -# output: -# predictions = output + 'rare-subtype-benchmarking/Init_{initial}-{modality}-sel_{selection_procedure}-strat-{strat}-ALAlg-{AL_alg}-rand_sel-{rand}-corr-{corrupt}-knn_neighbors-{neighbors}-resolution-{res}-Random-Forest-predictions-seed-{s}-{cell_num}-cells.tsv' -# script: -# 'cell-type-assignment/predict-random-forest.py' +rule predict_random_forest: + input: + test = 'data/{modality}/{modality}-expression-df-test-seed-{s}.tsv', + model = output + 'models/random-forest-Init_{initial}-{modality}-trained-on-{selection_procedure}-{strat}-ALAlg-{AL_alg}-rand_sel-{rand}-corr-{corrupt}-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}-{cell_num}-cells.pkl' + resources: + mem_mb=5000 + output: + predictions = output + 'rare-subtype-benchmarking/Init_{initial}-{modality}-sel_{selection_procedure}-strat-{strat}-ALAlg-{AL_alg}-rand_sel-{rand}-corr-{corrupt}-knn_neighbors-{neighbors}-resolution-{res}-Random-Forest-predictions-seed-{s}-{cell_num}-cells.tsv' + script: + 'cell-type-assignment/predict-random-forest.py' -# rule Seurat_clustering_scRNASeq: -# input: -# training_rds = 'data/scRNASeq/scRNASeq-train-seed-{s}.rds', -# markers = 'markers/scRNASeq.yml' -# params: -# positive_markers_diagnostic = output + 'figures/diagnostics/scRNASeq-Seurat-[cell_types]-positive-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf', -# negative_markers_diagnostic = output + 'figures/diagnostics/scRNASeq-Seurat-[cell_types]-negative-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf', -# mod = "scRNASeq" -# output: -# cluster_umap_pdf = output + 'figures/scRNASeq-Seurat-cluster-assignment-umap-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf', -# cell_type_umap_pdf = output + 'figures/scRNASeq-Seurat-cell-assignment-umap-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf', -# assignments = output + 'cluster-and-interpret/scRNASeq/scRNASeq-Seurat-assignments-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.tsv', -# diagnostics = expand(output + 'figures/diagnostics/scRNASeq-Seurat-{cell_types}-{pn}-knn_neighbors-{{neighbors}}-resolution-{{res}}-seed-{{s}}.pdf', -# cell_types = all_cell_types['scRNASeq'], pn = ['positive']), -# ground_truth_umap_pdf = output + 'figures/scRNASeq-Seurat-ground-truth-umap-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf' -# script: -# 'cell-type-assignment/Seurat.R' +rule Seurat_clustering_scRNASeq: + input: + training_rds = 'data/scRNASeq/scRNASeq-train-seed-{s}.rds', + markers = 'markers/scRNASeq.yml' + resources: + mem_mb=5000 + params: + positive_markers_diagnostic = output + 'figures/diagnostics/scRNASeq-{clusteringMarkers}-[cell_types]-positive-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf', + negative_markers_diagnostic = output + 'figures/diagnostics/scRNASeq-{clusteringMarkers}-[cell_types]-negative-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf', + mod = "scRNASeq" + output: + cluster_umap_pdf = output + 'figures/scRNASeq-{clusteringMarkers}-cluster-assignment-umap-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf', + cell_type_umap_pdf = output + 'figures/scRNASeq-{clusteringMarkers}-cell-assignment-umap-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf', + assignments = output + 'cluster-and-interpret/scRNASeq/scRNASeq-{clusteringMarkers}-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.tsv', + ground_truth_umap_pdf = output + 'figures/scRNASeq-{clusteringMarkers}-ground-truth-umap-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf' + script: + 'cell-type-assignment/Seurat.R' -# rule Seurat_clustering_snRNASeq: -# input: -# training_rds = 'data/snRNASeq/snRNASeq-train-seed-{s}.rds', -# markers = 'markers/snRNASeq.yml' -# params: -# positive_markers_diagnostic = output + 'figures/diagnostics/snRNASeq-Seurat-[cell_types]-positive-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf', -# negative_markers_diagnostic = output + 'figures/diagnostics/snRNASeq-Seurat-[cell_types]-negative-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf', -# mod = "snRNASeq" -# output: -# cluster_umap_pdf = output + 'figures/snRNASeq-Seurat-cluster-assignment-umap-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf', -# cell_type_umap_pdf = output + 'figures/snRNASeq-Seurat-cell-assignment-umap-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf', -# assignments = output + 'cluster-and-interpret/snRNASeq/snRNASeq-Seurat-assignments-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.tsv', -# diagnostics = expand(output + 'figures/diagnostics/snRNASeq-Seurat-{cell_types}-{pn}-knn_neighbors-{{neighbors}}-resolution-{{res}}-seed-{{s}}.pdf', -# cell_types = all_cell_types['snRNASeq'], pn = ['positive']), -# ground_truth_umap_pdf = output + 'figures/snRNASeq-Seurat-ground-truth-umap-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf' -# script: -# 'cell-type-assignment/Seurat.R' +rule Seurat_clustering_snRNASeq: + input: + training_rds = 'data/snRNASeq/snRNASeq-train-seed-{s}.rds', + markers = 'markers/snRNASeq.yml' + params: + positive_markers_diagnostic = output + 'figures/diagnostics/snRNASeq-{clusteringMarkers}-[cell_types]-positive-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf', + negative_markers_diagnostic = output + 'figures/diagnostics/snRNASeq-{clusteringMarkers}-[cell_types]-negative-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf', + mod = "snRNASeq" + output: + cluster_umap_pdf = output + 'figures/snRNASeq-{clusteringMarkers}-cluster-assignment-umap-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf', + cell_type_umap_pdf = output + 'figures/snRNASeq-{clusteringMarkers}-cell-assignment-umap-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf', + assignments = output + 'cluster-and-interpret/snRNASeq/snRNASeq-{clusteringMarkers}-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.tsv', + ground_truth_umap_pdf = output + 'figures/snRNASeq-{clusteringMarkers}-ground-truth-umap-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf' + script: + 'cell-type-assignment/Seurat.R' -# rule Seurat_clustering_CyTOF: -# input: -# training_rds = 'data/CyTOF/CyTOF-train-seed-{s}.rds', -# markers = 'markers/CyTOF.yml' -# params: -# positive_markers_diagnostic = output + 'figures/diagnostics/CyTOF-Seurat-[cell_types]-positive-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf', -# negative_markers_diagnostic = output + 'figures/diagnostics/CyTOF-Seurat-[cell_types]-negative-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf', -# mod = "CyTOF" -# output: -# cluster_umap_pdf = output + 'figures/CyTOF-Seurat-cluster-assignment-umap-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf', -# cell_type_umap_pdf = output + 'figures/CyTOF-Seurat-cell-assignment-umap-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf', -# assignments = output + 'cluster-and-interpret/CyTOF/CyTOF-Seurat-assignments-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.tsv', -# diagnostics = expand(output + 'figures/diagnostics/CyTOF-Seurat-{cell_types}-{pn}-knn_neighbors-{{neighbors}}-resolution-{{res}}-seed-{{s}}.pdf', -# cell_types = all_cell_types['CyTOF'], pn = ['positive']), -# ground_truth_umap_pdf = output + 'figures/CyTOF-Seurat-ground-truth-umap-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf' -# script: -# 'cell-type-assignment/Seurat.R' +rule Seurat_clustering_CyTOF: + input: + training_rds = 'data/CyTOF/CyTOF-train-seed-{s}.rds', + markers = 'markers/CyTOF.yml' + params: + positive_markers_diagnostic = output + 'figures/diagnostics/CyTOF-{clusteringMarkers}-[cell_types]-positive-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf', + negative_markers_diagnostic = output + 'figures/diagnostics/CyTOF-{clusteringMarkers}-[cell_types]-negative-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf', + mod = "CyTOF" + output: + cluster_umap_pdf = output + 'figures/CyTOF-{clusteringMarkers}-cluster-assignment-umap-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf', + cell_type_umap_pdf = output + 'figures/CyTOF-{clusteringMarkers}-cell-assignment-umap-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf', + assignments = output + 'cluster-and-interpret/CyTOF/CyTOF-{clusteringMarkers}-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.tsv', + ground_truth_umap_pdf = output + 'figures/CyTOF-{clusteringMarkers}-ground-truth-umap-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf' + script: + 'cell-type-assignment/Seurat.R' -# ## CHECKED -# rule CyTOF_LDA: -# input: -# training_rds = 'data/CyTOF/CyTOF-train-seed-{s}.rds', -# annotation_rds = 'data/CyTOF/CyTOF-test-seed-{s}.rds', -# labels = lambda wildcards: get_labels(wildcards.selection_procedure, 'CyTOF'), -# output: -# prediction = output + 'rare-subtype-benchmarking/Init_{initial}-CyTOF-sel_{selection_procedure}-strat-{strat}-ALAlg-{AL_alg}-rand_sel-{rand}-corr-{corrupt}-knn_neighbors-{neighbors}-resolution-{res}-CyTOF-LDA-predictions-seed-{s}-{cell_num}-cells.tsv' -# script: -# 'cell-type-assignment/CyTOFLDA.R' +rule CyTOF_LDA: + input: + training_rds = 'data/CyTOF/CyTOF-train-seed-{s}.rds', + annotation_rds = 'data/CyTOF/CyTOF-test-seed-{s}.rds', + labels = lambda wildcards: get_labels(wildcards.selection_procedure, 'CyTOF'), + output: + prediction = output + 'rare-subtype-benchmarking/Init_{initial}-CyTOF-sel_{selection_procedure}-strat-{strat}-ALAlg-{AL_alg}-rand_sel-{rand}-corr-{corrupt}-knn_neighbors-{neighbors}-resolution-{res}-CyTOF-LDA-predictions-seed-{s}-{cell_num}-cells.tsv' + script: + 'cell-type-assignment/CyTOFLDA.R'