From 33a3b3b6f381251eff2b40715c0d6093aed19b56 Mon Sep 17 00:00:00 2001
From: Michael-Geuenich <michaelgeuenich@yahoo.com>
Date: Mon, 29 May 2023 16:01:45 -0400
Subject: [PATCH] cell type assignment

---
 pipeline/cell-type-assignment/CyTOFLDA.R      |   4 +
 pipeline/cell-type-assignment/Seurat.R        |  35 ---
 .../active-learning-accuracy.R                |  12 +-
 .../predict-random-forest.py                  |  10 +-
 .../random-forest-train.py                    |  23 +-
 pipeline/cell-type-assignment/scmap.R         |   8 +
 .../simulate-active-learner.R                 |   3 +
 pipeline/cell-type-assignment/singleR.R       |   4 +
 .../subset-simulated-active-learner-n-cells.R |   1 +
 pipeline/cell-type-predictions.smk            | 217 +++++++++---------
 10 files changed, 157 insertions(+), 160 deletions(-)

diff --git a/pipeline/cell-type-assignment/CyTOFLDA.R b/pipeline/cell-type-assignment/CyTOFLDA.R
index cc25c32..5364090 100644
--- a/pipeline/cell-type-assignment/CyTOFLDA.R
+++ b/pipeline/cell-type-assignment/CyTOFLDA.R
@@ -72,4 +72,8 @@ if(is.null(snakemake@wildcards[['cell_selection']])){
   df_output$cell_selection <- snakemake@wildcards[['cell_selection']]
 }
 
+if(!is.null(snakemake@wildcards[['cell_selection']])){ # exact-name lookup; $ allows partial matching on lists
+  df_output$pred_cells <- snakemake@wildcards[['cell_selection']]
+}
+
 write_tsv(df_output, snakemake@output[['prediction']])
\ No newline at end of file
diff --git a/pipeline/cell-type-assignment/Seurat.R b/pipeline/cell-type-assignment/Seurat.R
index 1cf9e99..6bfa8b1 100644
--- a/pipeline/cell-type-assignment/Seurat.R
+++ b/pipeline/cell-type-assignment/Seurat.R
@@ -9,7 +9,6 @@ sce <- readRDS(snakemake@input[['training_rds']])
 markers <- read_yaml(snakemake@input[['markers']])$cell_types
 unique_markers <- unlist(markers) %>% unique()
 
-
 ### [PROCESS DATA] ###
 if(snakemake@params[['mod']] == 'scRNASeq' | snakemake@params[['mod']] == 'snRNASeq'){
   # Normalize scRNASeq data
@@ -101,40 +100,6 @@ assignments %>%
   select(cell_id, predicted_cell_type, prediction_params, cluster, modality) %>% 
   write_tsv(snakemake@output[['assignments']])
 
-### Create diagnostic plots
-# Positive markers
-# lapply(1:length(markers), function(x){
-#   cell_types <- names(markers[x]) %>% 
-#     gsub(" ", "-", .)
-  
-#   out_path <- snakemake@params[['positive_markers_diagnostic']] %>% 
-#     gsub("\\[", "{", .) %>% 
-#     gsub("\\]", "}", .)
-#   out <- glue(out_path)
-  
-#   pdf(out, width = 12)
-#   print(FeaturePlot(seu, features = markers[[x]]$positive))
-#   dev.off()
-# })
-
-# # Negative markers
-# lapply(1:length(markers), function(x){
-#   cell_types <- names(markers[x]) %>% 
-#     gsub(" ", "-", .)
-  
-#   out_path <- snakemake@params[['negative_markers_diagnostic']] %>% 
-#     gsub("\\[", "{", .) %>% 
-#     gsub("\\]", "}", .)
-#   out <- glue(out_path)
-  
-#   if(!is.null(markers[[x]]$negative)){
-#     pdf(out, width = 12)
-#     print(FeaturePlot(seu, features = markers[[x]]$negative))
-#     dev.off()
-#   }
-# })
-
-
 ### Cell type umap
 Idents(seu) <- assignments$predicted_cell_type
 
diff --git a/pipeline/cell-type-assignment/active-learning-accuracy.R b/pipeline/cell-type-assignment/active-learning-accuracy.R
index 99062ed..1a6b2e4 100644
--- a/pipeline/cell-type-assignment/active-learning-accuracy.R
+++ b/pipeline/cell-type-assignment/active-learning-accuracy.R
@@ -24,6 +24,7 @@ if(grepl("maxp", snakemake@wildcards$strat)){
 df_expression <- load_scs(sce)
 df_expression$cell_type <- NA
 df_expression$gt_cell_type <- sce$CellType
+df_expression$non_corrupted_gt_cell_type <- sce$CellType
 df_expression$iteration <- NA
 df_expression$corrupted <- NA
 
@@ -63,6 +64,7 @@ df_PCA <- bind_cols(
   df_PCA[,1:min(20, ncol(df_PCA))], 
   tibble(cell_type = df_expression$cell_type,
          gt_cell_type = df_expression$gt_cell_type,
+         non_corrupted_gt_cell_type = df_expression$non_corrupted_gt_cell_type,
          iteration = df_expression$iteration)
 )
 
@@ -74,24 +76,26 @@ f1_scores <- tibble(
 )
 
 for(i in 1:iters){
+  message("AL accuracy iteration ", i, " of ", iters) # labelled status line instead of a bare index
   annotated_cells <- df_PCA %>% 
     filter(!is.na(cell_type)) %>% 
     filter(cell_type != "Skipped", cell_type != "Unclear")
+  message("annotated cells (rows x cols): ", paste(dim(annotated_cells), collapse = " x "))
   
   left_cells <- df_PCA %>% 
     filter(is.na(cell_type))
   
-  ModelFit <- fit_AL_classifier(select(annotated_cells, -gt_cell_type, -iteration),
+  ModelFit <- fit_AL_classifier(select(annotated_cells, -gt_cell_type, -iteration, -non_corrupted_gt_cell_type),
                                 snakemake@wildcards$AL_alg)
   
   ### Calculate F1-score
   predicted_scores <- predict(ModelFit, 
-                              select(left_cells, -X1, -cell_type, -gt_cell_type, -iteration),
+                              select(left_cells, -X1, -cell_type, -gt_cell_type, -iteration, -non_corrupted_gt_cell_type),
                               type = "raw")
   
   preds <- tibble(
     cell_id = left_cells$X1,
-    annotated_cell_type = left_cells$gt_cell_type,
+    annotated_cell_type = left_cells$non_corrupted_gt_cell_type,
     predicted_cell_type = predicted_scores
   )
   
@@ -106,7 +110,7 @@ for(i in 1:iters){
   
   # Continue AL - predict probabilities
   predicted_scores <- predict(ModelFit, 
-                              select(left_cells, -X1, -cell_type, -gt_cell_type, -iteration),
+                              select(left_cells, -X1, -cell_type, -gt_cell_type, -iteration, -non_corrupted_gt_cell_type),
                               type = "prob")
   
   # Get next set of cells
diff --git a/pipeline/cell-type-assignment/predict-random-forest.py b/pipeline/cell-type-assignment/predict-random-forest.py
index 7d5e7fb..bd664a1 100644
--- a/pipeline/cell-type-assignment/predict-random-forest.py
+++ b/pipeline/cell-type-assignment/predict-random-forest.py
@@ -14,9 +14,14 @@
 
 predicted = model.predict(expression)
 
+# cell_num may be missing from the wildcards; fall back to the string 'NA'
+# so the prediction_params concatenation below still works.
+wildcard_names = dict(snakemake.wildcards).keys()
+cell_num = snakemake.wildcards['cell_num'] if 'cell_num' in wildcard_names else 'NA'
+
 output = pd.DataFrame({'cell_id': cell_ids,
                        'predicted_cell_type': predicted,
-                       'prediction_params': 'Random-Forest-knn-' + snakemake.wildcards['neighbors'] + '-res-' + snakemake.wildcards['res'] + '-cell_numbers-' + snakemake.wildcards['cell_num'] + '-randomSelection-' + snakemake.wildcards['rand'] + '-corrupted-' + snakemake.wildcards['corrupt'] + '-Init-' + snakemake.wildcards['initial'] + '-seed-' + snakemake.wildcards['s'],
+                       'prediction_params': 'Random-Forest-knn-' + snakemake.wildcards['neighbors'] + '-res-' + snakemake.wildcards['res'] + '-cell_numbers-' + cell_num + '-randomSelection-' + snakemake.wildcards['rand'] + '-corrupted-' + snakemake.wildcards['corrupt'] + '-Init-' + snakemake.wildcards['initial'] + '-seed-' + snakemake.wildcards['s'],
                        'selection_procedure': snakemake.wildcards['selection_procedure'] + '-strategy-' + snakemake.wildcards['strat'] + '-ALAlg-' + snakemake.wildcards['AL_alg'],
                        'modality': snakemake.wildcards['modality']})
 
@@ -28,4 +33,7 @@
 if 'similarity' in dict(snakemake.wildcards).keys():
     output['similarity'] = snakemake.wildcards['bal'] + '-' + snakemake.wildcards['similarity']
 
+if 'cell_selection' in dict(snakemake.wildcards): # membership on a dict tests its keys
+    output['pred_cells'] = snakemake.wildcards['cell_selection']
+
 output.to_csv(snakemake.output['predictions'], sep = '\t', index = False)
\ No newline at end of file
diff --git a/pipeline/cell-type-assignment/random-forest-train.py b/pipeline/cell-type-assignment/random-forest-train.py
index 9f897b5..a15a2ea 100644
--- a/pipeline/cell-type-assignment/random-forest-train.py
+++ b/pipeline/cell-type-assignment/random-forest-train.py
@@ -14,19 +14,24 @@
 expression = pd.read_csv(snakemake.input['train'], sep = "\t", header = 0)
 annotation = pd.read_csv(str(snakemake.input['annotation']), sep = '\t', header = 0)
 
-expression = pd.merge(expression, annotation, on = 'cell_id')
+# Attach the labels to the expression table once; both branches below start from this merged frame.
+expression = pd.merge(expression, annotation, on = 'cell_id')
+if 'cell_selection' in dict(snakemake.wildcards).keys():
+    X_train = expression.drop(['cell_type_y', 'entropy', 'labeling', 'cell_type_x', 'cell_id', 'corrupted_cell_type', 'iteration', 'method', 'cell_num', 'params'], axis = 1)
+    y_train = expression['cell_type_y']
+else:
 
-if snakemake.wildcards['selection_procedure'] == 'Active-Learning_entropy' or snakemake.wildcards['selection_procedure'] == 'Active-Learning_maxp':
-    expression = expression.drop(['iteration', 'cell_num', 'corrupted_cell_type'], axis = 1)
+    if snakemake.wildcards['selection_procedure'] in ('Active-Learning_entropy', 'Active-Learning_maxp'):
+        expression = expression.drop(['iteration', 'cell_num', 'corrupted_cell_type'], axis = 1)
 
-if snakemake.wildcards['selection_procedure'] == 'random':
-    expression = expression.drop(['set','params'], axis = 1)
+    if snakemake.wildcards['selection_procedure'] == 'random':
+        expression = expression.drop(['set','params'], axis = 1)
 
-if snakemake.wildcards['selection_procedure'] == 'Seurat-clustering':
-    expression = expression.drop(['params'], axis = 1)
+    if snakemake.wildcards['selection_procedure'] in ('NoMarkerSeurat-clustering', 'MarkerSeurat-clustering'):
+        expression = expression.drop(['params'], axis = 1)
 
-X_train = expression.drop(['cell_type_y', 'method', 'cell_type_x', 'cell_id'], axis = 1)
-y_train = expression['cell_type_y']
+    X_train = expression.drop(['cell_type_y', 'method', 'cell_type_x', 'cell_id'], axis = 1)
+    y_train = expression['cell_type_y']
 
 # ## ML Pipeline start
 RSEED = 42
diff --git a/pipeline/cell-type-assignment/scmap.R b/pipeline/cell-type-assignment/scmap.R
index a452905..1b5ddd4 100644
--- a/pipeline/cell-type-assignment/scmap.R
+++ b/pipeline/cell-type-assignment/scmap.R
@@ -66,6 +66,10 @@ if(!is.null(snakemake@wildcards[['similarity']])){
   clustering_prediction$similarity <- paste0(snakemake@wildcards[['bal']], '-', snakemake@wildcards[['similarity']])
 }
 
+if(!is.null(snakemake@wildcards[['cell_selection']])){ # exact-name lookup; $ allows partial matching on lists
+  clustering_prediction$pred_cells <- snakemake@wildcards[['cell_selection']]
+}
+
 
 write_tsv(clustering_prediction, snakemake@output[['cluster_predictions']])
 
@@ -111,5 +115,9 @@ if(!is.null(snakemake@wildcards[['similarity']])){
   sc_prediction$similarity <- paste0(snakemake@wildcards[['bal']], '-', snakemake@wildcards[['similarity']])
 }
 
+if(!is.null(snakemake@wildcards[['cell_selection']])){ # exact-name lookup; $ allows partial matching on lists
+  sc_prediction$pred_cells <- snakemake@wildcards[['cell_selection']]
+}
+
 write_tsv(sc_prediction, snakemake@output[['sc_predictions']])
 
diff --git a/pipeline/cell-type-assignment/simulate-active-learner.R b/pipeline/cell-type-assignment/simulate-active-learner.R
index e4d5b16..88c6484 100644
--- a/pipeline/cell-type-assignment/simulate-active-learner.R
+++ b/pipeline/cell-type-assignment/simulate-active-learner.R
@@ -43,6 +43,7 @@ if(snakemake@wildcards[['initial']] == 'ranking'){
 }else if(snakemake@wildcards[['initial']] == 'random'){
   random_cell_idx <- sample(1:nrow(df_expression), 20)
   df_expression$cell_type[random_cell_idx] <- df_expression$gt_cell_type[random_cell_idx]
+  df_expression$iteration[random_cell_idx] <- 0
 }
 
 if(!is.null(snakemake@wildcards[['similarity']])){
@@ -82,6 +83,8 @@ df_PCA <- bind_cols(
          iteration = df_expression$iteration)
 )
 
+print(table(df_PCA$cell_type)) # explicit print: label counts already assigned before the AL loop starts
+
 for(i in 1:max_AL_iterations){
   AL <- active_learning_wrapper(select(df_PCA, -gt_cell_type, -iteration), 
                                 snakemake@wildcards[['AL_alg']], 
diff --git a/pipeline/cell-type-assignment/singleR.R b/pipeline/cell-type-assignment/singleR.R
index a5bdbb7..eeb23ba 100644
--- a/pipeline/cell-type-assignment/singleR.R
+++ b/pipeline/cell-type-assignment/singleR.R
@@ -49,4 +49,8 @@ if(!is.null(snakemake@wildcards[['similarity']])){
   result$similarity <- paste0(snakemake@wildcards[['bal']], '-', snakemake@wildcards[['similarity']])
 }
 
+if(!is.null(snakemake@wildcards[['cell_selection']])){ # exact-name lookup; $ allows partial matching on lists
+  result$pred_cells <- snakemake@wildcards[['cell_selection']]
+}
+
 write_tsv(result, snakemake@output[['predictions']])
\ No newline at end of file
diff --git a/pipeline/cell-type-assignment/subset-simulated-active-learner-n-cells.R b/pipeline/cell-type-assignment/subset-simulated-active-learner-n-cells.R
index 56f162d..0364e8f 100644
--- a/pipeline/cell-type-assignment/subset-simulated-active-learner-n-cells.R
+++ b/pipeline/cell-type-assignment/subset-simulated-active-learner-n-cells.R
@@ -1,6 +1,7 @@
 library(tidyverse)
 
 training <- read_tsv(snakemake@input[['assignment']]) %>% 
+  filter(!is.na(iteration)) %>% # drop rows lacking an iteration so the first-n subset below only sees ordered rows
   arrange(iteration)
 
 subset <- training[1:as.integer(snakemake@wildcards[['subset_val']]),]
diff --git a/pipeline/cell-type-predictions.smk b/pipeline/cell-type-predictions.smk
index 91f303d..f16e31b 100644
--- a/pipeline/cell-type-predictions.smk
+++ b/pipeline/cell-type-predictions.smk
@@ -5,7 +5,7 @@ def expand_predictions_by_mod(mod,
                              train_test_seeds = train_test_seeds,
                              cell_numbers = cell_numbers):
     rand_0 = []
-    pred_rand_0 = [expand(output + 'rare-subtype-benchmarking/Init_{initial}-{modality}-sel_{selection_procedure}-strat-{strat}-ALAlg-{AL_alg}-rand_sel-{rand}-corr-{corrupt}-knn_neighbors-{neighbors}-resolution-{res}-{method}-predictions-seed-{s}-{cell_num}-cells.tsv',
+    pred_rand_0 = [expand(output + 'rare-subtype-benchmarking/Init_{initial}-{modality}-sel_{selection_procedure}-strat-{strat}-ALAlg-{AL_alg}-rand_sel-{rand}-corr-{corrupt}-knn_neighbors-{neighbors}-resolution-{res}-{method}-predictions-seed-{{s}}-{cell_num}-cells.tsv',
                         modality = mod,
                         initial = selection_expansion_dict[select]['initial'],
                         selection_procedure = [select], 
@@ -16,7 +16,7 @@ def expand_predictions_by_mod(mod,
                         neighbors = selection_expansion_dict[select]['neighbors'], 
                         res = selection_expansion_dict[select]['res'], 
                         method = evaluation_methods_dict[mod],
-                        s = train_test_seeds,
+                        #s = train_test_seeds,
                         cell_num = cell_numbers) 
                         for select in selection_expansion_dict.keys()]
     
@@ -24,7 +24,7 @@ def expand_predictions_by_mod(mod,
         rand_0.extend(i)
 
     corr_0 = []
-    pred_corr_0 = [expand(output + 'rare-subtype-benchmarking/Init_{initial}-{modality}-sel_{selection_procedure}-strat-{strat}-ALAlg-{AL_alg}-rand_sel-{rand}-corr-{corrupt}-knn_neighbors-{neighbors}-resolution-{res}-{method}-predictions-seed-{s}-{cell_num}-cells.tsv',
+    pred_corr_0 = [expand(output + 'rare-subtype-benchmarking/Init_{initial}-{modality}-sel_{selection_procedure}-strat-{strat}-ALAlg-{AL_alg}-rand_sel-{rand}-corr-{corrupt}-knn_neighbors-{neighbors}-resolution-{res}-{method}-predictions-seed-{{s}}-{cell_num}-cells.tsv',
                         modality = mod,
                         initial = selection_expansion_dict[select]['initial'],
                         selection_procedure = [select], 
@@ -35,7 +35,7 @@ def expand_predictions_by_mod(mod,
                         neighbors = selection_expansion_dict[select]['neighbors'], 
                         res = selection_expansion_dict[select]['res'], 
                         method = evaluation_methods_dict[mod],
-                        s = train_test_seeds,
+                        #s = train_test_seeds,
                         cell_num = cell_numbers) 
                         for select in selection_expansion_dict.keys()]
     
@@ -44,12 +44,12 @@ def expand_predictions_by_mod(mod,
 
     return rand_0 + corr_0
 
-def get_labels(procedure, mod, imbalanced = False):
+def get_labels(procedure, mod, imbalanced = False, doublet = False):
     if imbalanced:
         if procedure == "Active-Learning_entropy" or procedure == "Active-Learning_maxp":
             path = output + 'data/imbalance-{{similarity}}-{{bal}}/{modality}/{{selection_procedure}}/AL-batches-subset/Init-{{initial}}-strat-{{strat}}-ALAlg-{{AL_alg}}-rand_sel-{{rand}}-corr-{{corrupt}}-{modality}-knn_neighbors-{{neighbors}}-resolution-{{res}}-seed-{{s}}-{{cell_num}}_cells.tsv'
         else:
-            path = output + 'data/imbalance-{{similarity}}-{{bal}}/{modality}/{{selection_procedure}}/{{selection_procedure}}-NA-ALAlg-NA-rand_sel-{{rand}}-corr-{{corrupt}}-{modality}-knn_neighbors-{{neighbors}}-resolution-{{res}}-seed-{{s}}-{{cell_num}}_cells.tsv'
+            path = output + 'data/imbalance-{{similarity}}-{{bal}}/{modality}/{{selection_procedure}}/Init_{{initial}}-strat-{{strat}}-rand_sel-{{rand}}-corr-{{corrupt}}-knn_neighbors-{{neighbors}}-resolution-{{res}}-seed-{{s}}-{{cell_num}}_cells.tsv'
         path = expand(path, modality = mod)
     else:
         if procedure == "Active-Learning_entropy" or procedure == "Active-Learning_maxp":
@@ -61,119 +61,114 @@ def get_labels(procedure, mod, imbalanced = False):
     return path
 
 cell_type_predictions = {
-    # 'scRNASeq': expand_predictions_by_mod("scRNASeq"),
-    # 'snRNASeq': expand_predictions_by_mod("snRNASeq"),
-    # 'CyTOF': expand_predictions_by_mod("CyTOF")
+    'scRNASeq': expand_predictions_by_mod("scRNASeq"),
+    'snRNASeq': expand_predictions_by_mod("snRNASeq"),
+    'CyTOF': expand_predictions_by_mod("CyTOF")
 }
 
-# rule train_and_predict_scmap:
-#     input:
-#         annotation = lambda wildcards: get_labels(wildcards.selection_procedure, wildcards.modality),
-#         train_data = 'data/{modality}/{modality}-train-seed-{s}.rds',
-#         test_data = 'data/{modality}/{modality}-test-seed-{s}.rds'
-#     output:
-#         cluster_predictions = output + 'rare-subtype-benchmarking/Init_{initial}-{modality}-sel_{selection_procedure}-strat-{strat}-ALAlg-{AL_alg}-rand_sel-{rand}-corr-{corrupt}-knn_neighbors-{neighbors}-resolution-{res}-scmap-cluster-predictions-seed-{s}-{cell_num}-cells.tsv',
-#         sc_predictions = output + 'rare-subtype-benchmarking/Init_{initial}-{modality}-sel_{selection_procedure}-strat-{strat}-ALAlg-{AL_alg}-rand_sel-{rand}-corr-{corrupt}-knn_neighbors-{neighbors}-resolution-{res}-scmap-sc-predictions-seed-{s}-{cell_num}-cells.tsv'
-#     script:
-#         'cell-type-assignment/scmap.R'
+rule train_and_predict_scmap:
+    input:
+        annotation = lambda wildcards: get_labels(wildcards.selection_procedure, wildcards.modality),
+        train_data = 'data/{modality}/{modality}-train-seed-{s}.rds',
+        test_data = 'data/{modality}/{modality}-test-seed-{s}.rds'
+    output:
+        cluster_predictions = output + 'rare-subtype-benchmarking/Init_{initial}-{modality}-sel_{selection_procedure}-strat-{strat}-ALAlg-{AL_alg}-rand_sel-{rand}-corr-{corrupt}-knn_neighbors-{neighbors}-resolution-{res}-scmap-cluster-predictions-seed-{s}-{cell_num}-cells.tsv',
+        sc_predictions = output + 'rare-subtype-benchmarking/Init_{initial}-{modality}-sel_{selection_procedure}-strat-{strat}-ALAlg-{AL_alg}-rand_sel-{rand}-corr-{corrupt}-knn_neighbors-{neighbors}-resolution-{res}-scmap-sc-predictions-seed-{s}-{cell_num}-cells.tsv'
+    script:
+        'cell-type-assignment/scmap.R'
 
-# rule train_and_predict_singleR:
-#     input:
-#         annotation = lambda wildcards: get_labels(wildcards.selection_procedure, wildcards.modality),
-#         train_data = 'data/{modality}/{modality}-train-seed-{s}.rds',
-#         test_data = 'data/{modality}/{modality}-test-seed-{s}.rds'
-#     output:
-#         predictions = output + 'rare-subtype-benchmarking/Init_{initial}-{modality}-sel_{selection_procedure}-strat-{strat}-ALAlg-{AL_alg}-rand_sel-{rand}-corr-{corrupt}-knn_neighbors-{neighbors}-resolution-{res}-singleR-predictions-seed-{s}-{cell_num}-cells.tsv'
-#     script:
-#         'cell-type-assignment/singleR.R'
+rule train_and_predict_singleR:
+    input:
+        annotation = lambda wildcards: get_labels(wildcards.selection_procedure, wildcards.modality),
+        train_data = 'data/{modality}/{modality}-train-seed-{s}.rds',
+        test_data = 'data/{modality}/{modality}-test-seed-{s}.rds'
+    output:
+        predictions = output + 'rare-subtype-benchmarking/Init_{initial}-{modality}-sel_{selection_procedure}-strat-{strat}-ALAlg-{AL_alg}-rand_sel-{rand}-corr-{corrupt}-knn_neighbors-{neighbors}-resolution-{res}-singleR-predictions-seed-{s}-{cell_num}-cells.tsv'
+    script:
+        'cell-type-assignment/singleR.R'
 
-# rule train_random_forest:
-#     input:
-#         train = 'data/{modality}/{modality}-expression-df-train-seed-{s}.tsv',
-#         annotation = lambda wildcards: get_labels(wildcards.selection_procedure, wildcards.modality)
-#     output:
-#         model = output + 'models/random-forest-Init_{initial}-{modality}-trained-on-{selection_procedure}-{strat}-ALAlg-{AL_alg}-rand_sel-{rand}-corr-{corrupt}-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}-{cell_num}-cells.pkl'
-#     resources:
-#         mem_mb=20000
-#     log:
-#         output + 'logs/cell-type-predictions/random-forest-Init_{initial}-{modality}-trained-on-{selection_procedure}-{strat}-ALAlg-{AL_alg}-rand_sel-{rand}-corr-{corrupt}-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}-{cell_num}-cells.log'
-#     script:
-#         'cell-type-assignment/random-forest-train.py'
+rule train_random_forest:
+    input:
+        train = 'data/{modality}/{modality}-expression-df-train-seed-{s}.tsv',
+        annotation = lambda wildcards: get_labels(wildcards.selection_procedure, wildcards.modality)
+    output:
+        model = output + 'models/random-forest-Init_{initial}-{modality}-trained-on-{selection_procedure}-{strat}-ALAlg-{AL_alg}-rand_sel-{rand}-corr-{corrupt}-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}-{cell_num}-cells.pkl'
+    resources:
+        mem_mb=20000
+    log:
+        output + 'logs/cell-type-predictions/random-forest-Init_{initial}-{modality}-trained-on-{selection_procedure}-{strat}-ALAlg-{AL_alg}-rand_sel-{rand}-corr-{corrupt}-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}-{cell_num}-cells.log'
+    script:
+        'cell-type-assignment/random-forest-train.py'
 
-# rule predict_random_forest:
-#     input:
-#         test = 'data/{modality}/{modality}-expression-df-test-seed-{s}.tsv',
-#         model = output + 'models/random-forest-Init_{initial}-{modality}-trained-on-{selection_procedure}-{strat}-ALAlg-{AL_alg}-rand_sel-{rand}-corr-{corrupt}-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}-{cell_num}-cells.pkl'
-#     resources:
-#         mem_mb=5000
-#     output:
-#         predictions = output + 'rare-subtype-benchmarking/Init_{initial}-{modality}-sel_{selection_procedure}-strat-{strat}-ALAlg-{AL_alg}-rand_sel-{rand}-corr-{corrupt}-knn_neighbors-{neighbors}-resolution-{res}-Random-Forest-predictions-seed-{s}-{cell_num}-cells.tsv'
-#     script:
-#         'cell-type-assignment/predict-random-forest.py'
+rule predict_random_forest:
+    input:
+        test = 'data/{modality}/{modality}-expression-df-test-seed-{s}.tsv',
+        model = output + 'models/random-forest-Init_{initial}-{modality}-trained-on-{selection_procedure}-{strat}-ALAlg-{AL_alg}-rand_sel-{rand}-corr-{corrupt}-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}-{cell_num}-cells.pkl'
+    resources:
+        mem_mb=5000
+    output:
+        predictions = output + 'rare-subtype-benchmarking/Init_{initial}-{modality}-sel_{selection_procedure}-strat-{strat}-ALAlg-{AL_alg}-rand_sel-{rand}-corr-{corrupt}-knn_neighbors-{neighbors}-resolution-{res}-Random-Forest-predictions-seed-{s}-{cell_num}-cells.tsv'
+    script:
+        'cell-type-assignment/predict-random-forest.py'
 
-# rule Seurat_clustering_scRNASeq:
-#     input:
-#         training_rds = 'data/scRNASeq/scRNASeq-train-seed-{s}.rds',
-#         markers = 'markers/scRNASeq.yml'
-#     params:
-#         positive_markers_diagnostic = output + 'figures/diagnostics/scRNASeq-Seurat-[cell_types]-positive-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf',
-#         negative_markers_diagnostic = output + 'figures/diagnostics/scRNASeq-Seurat-[cell_types]-negative-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf',
-#         mod = "scRNASeq"
-#     output:
-#         cluster_umap_pdf = output + 'figures/scRNASeq-Seurat-cluster-assignment-umap-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf',
-#         cell_type_umap_pdf = output + 'figures/scRNASeq-Seurat-cell-assignment-umap-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf',
-#         assignments = output + 'cluster-and-interpret/scRNASeq/scRNASeq-Seurat-assignments-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.tsv',
-#         diagnostics = expand(output + 'figures/diagnostics/scRNASeq-Seurat-{cell_types}-{pn}-knn_neighbors-{{neighbors}}-resolution-{{res}}-seed-{{s}}.pdf', 
-#                              cell_types = all_cell_types['scRNASeq'], pn = ['positive']),
-#         ground_truth_umap_pdf = output + 'figures/scRNASeq-Seurat-ground-truth-umap-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf'
-#     script:
-#         'cell-type-assignment/Seurat.R'
+rule Seurat_clustering_scRNASeq:
+    input:
+        training_rds = 'data/scRNASeq/scRNASeq-train-seed-{s}.rds',
+        markers = 'markers/scRNASeq.yml'
+    resources:
+        mem_mb=5000
+    params:
+        positive_markers_diagnostic = output + 'figures/diagnostics/scRNASeq-{clusteringMarkers}-[cell_types]-positive-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf',
+        negative_markers_diagnostic = output + 'figures/diagnostics/scRNASeq-{clusteringMarkers}-[cell_types]-negative-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf',
+        mod = "scRNASeq"
+    output:
+        cluster_umap_pdf = output + 'figures/scRNASeq-{clusteringMarkers}-cluster-assignment-umap-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf',
+        cell_type_umap_pdf = output + 'figures/scRNASeq-{clusteringMarkers}-cell-assignment-umap-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf',
+        assignments = output + 'cluster-and-interpret/scRNASeq/scRNASeq-{clusteringMarkers}-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.tsv',
+        ground_truth_umap_pdf = output + 'figures/scRNASeq-{clusteringMarkers}-ground-truth-umap-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf'
+    script:
+        'cell-type-assignment/Seurat.R'
 
-# rule Seurat_clustering_snRNASeq:
-#     input:
-#         training_rds = 'data/snRNASeq/snRNASeq-train-seed-{s}.rds',
-#         markers = 'markers/snRNASeq.yml'
-#     params:
-#         positive_markers_diagnostic = output + 'figures/diagnostics/snRNASeq-Seurat-[cell_types]-positive-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf',
-#         negative_markers_diagnostic = output + 'figures/diagnostics/snRNASeq-Seurat-[cell_types]-negative-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf',
-#         mod = "snRNASeq"
-#     output:
-#         cluster_umap_pdf = output + 'figures/snRNASeq-Seurat-cluster-assignment-umap-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf',
-#         cell_type_umap_pdf = output + 'figures/snRNASeq-Seurat-cell-assignment-umap-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf',
-#         assignments = output + 'cluster-and-interpret/snRNASeq/snRNASeq-Seurat-assignments-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.tsv',
-#         diagnostics = expand(output + 'figures/diagnostics/snRNASeq-Seurat-{cell_types}-{pn}-knn_neighbors-{{neighbors}}-resolution-{{res}}-seed-{{s}}.pdf', 
-#                              cell_types = all_cell_types['snRNASeq'], pn = ['positive']),
-#         ground_truth_umap_pdf = output + 'figures/snRNASeq-Seurat-ground-truth-umap-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf'
-#     script:
-#         'cell-type-assignment/Seurat.R'
+rule Seurat_clustering_snRNASeq:
+    input:
+        training_rds = 'data/snRNASeq/snRNASeq-train-seed-{s}.rds',
+        markers = 'markers/snRNASeq.yml'
+    params:
+        positive_markers_diagnostic = output + 'figures/diagnostics/snRNASeq-{clusteringMarkers}-[cell_types]-positive-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf',
+        negative_markers_diagnostic = output + 'figures/diagnostics/snRNASeq-{clusteringMarkers}-[cell_types]-negative-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf',
+        mod = "snRNASeq"
+    output:
+        cluster_umap_pdf = output + 'figures/snRNASeq-{clusteringMarkers}-cluster-assignment-umap-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf',
+        cell_type_umap_pdf = output + 'figures/snRNASeq-{clusteringMarkers}-cell-assignment-umap-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf',
+        assignments = output + 'cluster-and-interpret/snRNASeq/snRNASeq-{clusteringMarkers}-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.tsv',
+        ground_truth_umap_pdf = output + 'figures/snRNASeq-{clusteringMarkers}-ground-truth-umap-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf'
+    script:
+        'cell-type-assignment/Seurat.R'
 
-# rule Seurat_clustering_CyTOF:
-#     input:
-#         training_rds = 'data/CyTOF/CyTOF-train-seed-{s}.rds',
-#         markers = 'markers/CyTOF.yml'
-#     params:
-#         positive_markers_diagnostic = output + 'figures/diagnostics/CyTOF-Seurat-[cell_types]-positive-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf',
-#         negative_markers_diagnostic = output + 'figures/diagnostics/CyTOF-Seurat-[cell_types]-negative-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf',
-#         mod = "CyTOF"
-#     output:
-#         cluster_umap_pdf = output + 'figures/CyTOF-Seurat-cluster-assignment-umap-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf',
-#         cell_type_umap_pdf = output + 'figures/CyTOF-Seurat-cell-assignment-umap-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf',
-#         assignments = output + 'cluster-and-interpret/CyTOF/CyTOF-Seurat-assignments-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.tsv',
-#         diagnostics = expand(output + 'figures/diagnostics/CyTOF-Seurat-{cell_types}-{pn}-knn_neighbors-{{neighbors}}-resolution-{{res}}-seed-{{s}}.pdf', 
-#             cell_types = all_cell_types['CyTOF'], pn = ['positive']),
-#         ground_truth_umap_pdf = output + 'figures/CyTOF-Seurat-ground-truth-umap-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf'
-#     script:
-#         'cell-type-assignment/Seurat.R'
+rule Seurat_clustering_CyTOF:
+    input:
+        training_rds = 'data/CyTOF/CyTOF-train-seed-{s}.rds',
+        markers = 'markers/CyTOF.yml'
+    params:
+        positive_markers_diagnostic = output + 'figures/diagnostics/CyTOF-{clusteringMarkers}-[cell_types]-positive-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf',
+        negative_markers_diagnostic = output + 'figures/diagnostics/CyTOF-{clusteringMarkers}-[cell_types]-negative-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf',
+        mod = "CyTOF"
+    output:
+        cluster_umap_pdf = output + 'figures/CyTOF-{clusteringMarkers}-cluster-assignment-umap-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf',
+        cell_type_umap_pdf = output + 'figures/CyTOF-{clusteringMarkers}-cell-assignment-umap-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf',
+        assignments = output + 'cluster-and-interpret/CyTOF/CyTOF-{clusteringMarkers}-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.tsv',
+        ground_truth_umap_pdf = output + 'figures/CyTOF-{clusteringMarkers}-ground-truth-umap-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.pdf'
+    script:
+        'cell-type-assignment/Seurat.R'
 
 
-# ## CHECKED
-# rule CyTOF_LDA:
-#     input:
-#         training_rds = 'data/CyTOF/CyTOF-train-seed-{s}.rds',
-#         annotation_rds = 'data/CyTOF/CyTOF-test-seed-{s}.rds',
-#         labels = lambda wildcards: get_labels(wildcards.selection_procedure, 'CyTOF'),
-#     output:
-#         prediction = output + 'rare-subtype-benchmarking/Init_{initial}-CyTOF-sel_{selection_procedure}-strat-{strat}-ALAlg-{AL_alg}-rand_sel-{rand}-corr-{corrupt}-knn_neighbors-{neighbors}-resolution-{res}-CyTOF-LDA-predictions-seed-{s}-{cell_num}-cells.tsv'
-#     script:
-#         'cell-type-assignment/CyTOFLDA.R'
+rule CyTOF_LDA:
+    input:
+        training_rds = 'data/CyTOF/CyTOF-train-seed-{s}.rds',
+        annotation_rds = 'data/CyTOF/CyTOF-test-seed-{s}.rds',
+        labels = lambda wildcards: get_labels(wildcards.selection_procedure, 'CyTOF'),
+    output:
+        prediction = output + 'rare-subtype-benchmarking/Init_{initial}-CyTOF-sel_{selection_procedure}-strat-{strat}-ALAlg-{AL_alg}-rand_sel-{rand}-corr-{corrupt}-knn_neighbors-{neighbors}-resolution-{res}-CyTOF-LDA-predictions-seed-{s}-{cell_num}-cells.tsv'
+    script:
+        'cell-type-assignment/CyTOFLDA.R'