expand rf/lr & rand/ranking figure to new datasets

camlab-bioml · Oct 23, 2023 · a1ac35a · a1ac35a
1 parent 204e58a
commit a1ac35a
Showing 1 changed file with 193 additions and 44 deletions.
diff --git a/pipeline/figures/compare-lr-rf-random-and-ranking.R b/pipeline/figures/compare-lr-rf-random-and-ranking.R
@@ -45,35 +45,53 @@ props <- bind_rows(
   get_proportion_rep(snakemake@input$cytof,
                      snakemake@input$cytof_AL_rand,
                      snakemake@input$cytof_AL_rank,
-                     "CyTOF"),
+                     "CyTOF\nBone marrow"),
   get_proportion_rep(snakemake@input$scrna,
                      snakemake@input$scrna_AL_rand,
                      snakemake@input$scrna_AL_rank,
-                     "scRNASeq"),
+                     "scRNASeq\nBreast cancer cell lines"),
   get_proportion_rep(snakemake@input$snrna,
                      snakemake@input$snrna_AL_rand,
                      snakemake@input$snrna_AL_rank,
-                     "snRNASeq")
+                     "snRNASeq\nPancreas cancer"),
+  get_proportion_rep(snakemake@input$snrna_lung,
+                     snakemake@input$snrna_lung_AL_rand,
+                     snakemake@input$snrna_lung_AL_rank,
+                     "scRNASeq\nLung cancer cell lines"),
+  get_proportion_rep(snakemake@input$liverAtlas,
+                     snakemake@input$liverAtlas_AL_rand,
+                     snakemake@input$liverAtlas_AL_rank,
+                     "scRNASeq\nLiver"),
+  get_proportion_rep(snakemake@input$tabulaVasc,
+                     snakemake@input$tabulaVasc_AL_rand,
+                     snakemake@input$tabulaVasc_AL_rank,
+                     "scRNASeq\nVasculature")
 )
 
 prop <- props |> 
-  mutate(cohort = factor(cohort, levels = c("scRNASeq", "snRNASeq", "CyTOF"))) |> 
   ggplot(aes(x = prop_sel, y = type, fill = type)) +
   geom_boxplot() +
   labs(x = "Proportion of cell types selected in initial training set of 20 cells",
        y = " \nInitial selection\nmethod") +
   scale_fill_manual(values = c("#8F3985", "#98DFEA")) +
-  facet_wrap(~cohort, nrow = 1) +
+  facet_wrap(~cohort, nrow = 2) +
   whatsthatcell_theme() + 
   theme(legend.position = "none")
 
+pdf(snakemake@output$rank_vs_rand_cell_num, height = 2.5, width = 7)
+  prop
+dev.off()
+
 
 ## Part B: heatmaps
 acc <- lapply(snakemake@input$accs, function(x){
   df <- read_tsv(x) |> 
     mutate(cohort = case_when(grepl("CyTOF", basename(x)) ~ "CyTOF",
                               grepl("snRNASeq", basename(x)) ~ "snRNASeq",
-                              grepl("scRNASeq", basename(x)) ~ "scRNASeq"))
+                              grepl("scRNASeq", basename(x)) ~ "scRNASeq",
+                              grepl("scRNALung", basename(x)) ~ "scRNALung",
+                              grepl("liverAtlas", basename(x)) ~ "liverAtlas",
+                              grepl("tabulaVasc", basename(x)) ~ "tabulaVasc"))
   df
 }) |> bind_rows() |>
   mutate(selection_procedure = case_when(selection_procedure == "NoMarkerSeurat_clustering" ~ "AR NoMarker",
@@ -87,7 +105,7 @@ acc <- lapply(snakemake@input$accs, function(x){
                                          TRUE ~ selection_procedure))
 
 
-create_heatmap <- function(acc, sel_cohort, comp, legend = FALSE){
+create_heatmap <- function(acc, sel_cohort, comp, title, legend = FALSE){
   subset_acc <- filter(acc, corrupted == 0) |> 
     filter(rand == 0 | is.na(rand)) |> 
     filter(.metric == 'f_meas' & cohort == sel_cohort) |> 
@@ -129,57 +147,95 @@ create_heatmap <- function(acc, sel_cohort, comp, legend = FALSE){
     column_to_rownames("cell_num") |> 
     as.matrix() |> 
     Heatmap(right_annotation = ha,
-            column_title = sel_cohort,
+            column_title = title,
             show_heatmap_legend = legend,
             row_order = c("100", "250", "500"),
             col = colorRamp2(seq(-0.2, 0.2, length = 3), c("blue", "#EEEEEE", "red")),
             name = "Improvement\nscore")
 }
 
-ranking_vs_random <- create_heatmap(acc, "scRNASeq", "random_vs_ranked") + 
-  create_heatmap(acc, "snRNASeq", "random_vs_ranked") +
-  create_heatmap(acc, "CyTOF", "random_vs_ranked", TRUE)
+### Ranking vs random
+row1 <- create_heatmap(acc, "scRNASeq", "random_vs_ranked", "scRNASeq\nBreast cancer cell lines") + 
+  create_heatmap(acc, "snRNASeq", "random_vs_ranked", "snRNASeq\nPancreas cancer") + 
+  create_heatmap(acc, "CyTOF", "random_vs_ranked", "CyTOF\nBone marrow", TRUE)
+
+row2 <- create_heatmap(acc, "scRNALung", "random_vs_ranked", "scRNASeq\nLung cancer cell lines") + 
+  create_heatmap(acc, "liverAtlas", "random_vs_ranked", "scRNASeq\nLiver") +
+  create_heatmap(acc, "tabulaVasc", "random_vs_ranked", "scRNASeq\nVasculature", TRUE)
 
-ranking <- draw(ranking_vs_random, column_title = "Selecting initial cells based on marker expression")
+row1 <- draw(row1, column_title = "Selecting initial cells based on marker expression")
+row2 <- draw(row2)
 
-pdf(snakemake@output$rank_rand_hm, height = 3.5, width = 7)
-  ranking
+pdf(snakemake@output$row_1_ranking_vs_random, height = 3.5, width = 7.5)
+  row1
 dev.off()
 
-lr_vs_rf <- create_heatmap(acc, "scRNASeq", "lr_vs_rf") + 
-  create_heatmap(acc, "snRNASeq", "lr_vs_rf") +
-  create_heatmap(acc, "CyTOF", "lr_vs_rf", TRUE)
+pdf(snakemake@output$row_2_ranking_vs_random, height = 3.4, width = 7.5)
+  row2
+dev.off()
 
-lr_vs_rf <- draw(lr_vs_rf, column_title = "F1-score improvement by random forest compared to logistic regression")
 
-pdf(snakemake@output$lr_rf_hm, height = 3.5, width = 7)
-  lr_vs_rf
-dev.off()
+### Logistic regression vs random forest
+lr_vs_rf_row1 <- create_heatmap(acc, "scRNASeq", "lr_vs_rf", "scRNASeq\nBreast cancer cell lines") + 
+  create_heatmap(acc, "snRNASeq", "lr_vs_rf", "snRNASeq\nPancreas cancer") +
+  create_heatmap(acc, "CyTOF", "lr_vs_rf", "CyTOF\nBone marrow", TRUE)
+lr_vs_rf_row1 <- draw(lr_vs_rf_row1,
+                      column_title = "F1-score improvement by random forest compared to logistic regression")
 
-## Full main figure
-ranking <- image_read_pdf(snakemake@output$rank_rand_hm)
-ranking <- ggplot() +
-  background_image(ranking)
+lr_vs_rf_row2 <- create_heatmap(acc, "scRNALung", "lr_vs_rf", "scRNASeq\nLung cancer cell lines") + 
+  create_heatmap(acc, "liverAtlas", "lr_vs_rf", "scRNASeq\nLiver") +
+  create_heatmap(acc, "tabulaVasc", "lr_vs_rf", "scRNASeq\nVasculature", TRUE)
+lr_vs_rf_row2 <- draw(lr_vs_rf_row2)
 
-lr_vs_rf <- image_read_pdf(snakemake@output$lr_rf_hm)
-lr_vs_rf <- ggplot() +
-  background_image(lr_vs_rf)
+pdf(snakemake@output$lr_rf_hm_1, height = 3.5, width = 7.5)
+  lr_vs_rf_row1
+dev.off()
 
-pdf(snakemake@output$main_fig, height = 8.8, width = 7)
-   (lr_vs_rf / wrap_elements(full = prop) / ranking) + plot_layout(heights = c(3, 1.2, 3)) + plot_annotation(tag_levels = "A")
+pdf(snakemake@output$lr_rf_hm_2, height = 3.4, width = 7.5)
+  lr_vs_rf_row2
 dev.off()
 
+
+
 ### Supplementary figures
 # LR vs RF
-rf_vs_rf_random <- filter(acc, corrupted == 0) |> 
+rf_vs_rf_random_1 <- filter(acc, corrupted == 0) |> 
+  filter(rand == 0 | is.na(rand)) |> 
+  filter(selection_procedure != "random" & !grepl("-AR", selection_procedure)) |> 
+  filter(initial == "random") |> 
+  filter(!grepl("-AR", selection_procedure)) |> 
+  filter(cohort == "CyTOF" | cohort == "scRNASeq" | cohort == "snRNASeq") |> 
+  mutate(AL_alg = case_when(AL_alg == "rf" ~ "RF",
+                            AL_alg == "multinom" ~ "LR"),
+         cohort = case_when(cohort == "CyTOF" ~ "CyTOF\nBone marrow",
+                            cohort == "scRNASeq" ~ "scRNASeq\nBreast cancer cell lines",
+                            cohort == "snRNASeq" ~ "snRNASeq\nPancreas cancer")) |> 
+  ggplot(aes(x = selection_procedure, y = .estimate, fill = AL_alg)) +
+  geom_boxplot() +
+  scale_fill_manual(values = al_colours()) +
+  labs(x = "Selection procedure", fill = "Active learning\nalgorithm", 
+       title = "Performance of selection methods comparing random forest and logistic regression active learning strategies",
+       subtitle = "Shown are results obtained when randomly selecting the initial set of cells") +
+  facet_grid(.metric ~ cohort + cell_num) +
+  whatsthatcell_theme() +
+  theme(axis.text.x = element_blank(),
+        axis.ticks.x = element_blank(),
+        axis.title.x = element_blank())
+
+rf_vs_rf_random_2 <- filter(acc, corrupted == 0) |> 
   filter(rand == 0 | is.na(rand)) |> 
   filter(selection_procedure != "random" & !grepl("-AR", selection_procedure)) |> 
   filter(initial == "random") |> 
   filter(!grepl("-AR", selection_procedure)) |> 
-  mutate(AL_alg = case_when(AL_alg == "rf" ~ "Random Forest",
-                            AL_alg == "multinom" ~ "Logistic Regression")) |> 
+  filter(cohort == "scRNALung" | cohort == "liverAtlas" | cohort == "tabulaVasc") |> 
+  mutate(AL_alg = case_when(AL_alg == "rf" ~ "RF",
+                            AL_alg == "multinom" ~ "LR"),
+         cohort = case_when(cohort == "scRNALung" ~ "scRNASeq\nLung cancer cell lines",
+                            cohort == "liverAtlas" ~ "scRNASeq\nLiver",
+                            cohort == "tabulaVasc" ~ "scRNASeq\nVasculature")) |> 
   ggplot(aes(x = selection_procedure, y = .estimate, fill = AL_alg)) +
   geom_boxplot() +
+  scale_fill_manual(values = al_colours()) +
   labs(x = "Selection procedure", fill = "Active learning\nalgorithm", 
        title = "Performance of selection methods comparing random forest and logistic regression active learning strategies",
        subtitle = "Shown are results obtained when randomly selecting the initial set of cells") +
@@ -189,37 +245,73 @@ rf_vs_rf_random <- filter(acc, corrupted == 0) |>
         axis.ticks.x = element_blank(),
         axis.title.x = element_blank())
 
-rf_vs_rf_ranking <- filter(acc, corrupted == 0) |> 
+rf_vs_rf_ranking_1 <- filter(acc, corrupted == 0) |> 
+  filter(rand == 0 | is.na(rand)) |> 
+  filter(selection_procedure != "random" & !grepl("-AR", selection_procedure)) |> 
+  filter(initial == "ranking") |> 
+  filter(!grepl("-AR", selection_procedure)) |> 
+  filter(cohort == "CyTOF" | cohort == "scRNASeq" | cohort == "snRNASeq") |> 
+  mutate(AL_alg = case_when(AL_alg == "rf" ~ "RF",
+                            AL_alg == "multinom" ~ "LR"),
+         cohort = case_when(cohort == "CyTOF" ~ "CyTOF\nBone marrow",
+                            cohort == "scRNASeq" ~ "scRNASeq\nBreast cancer cell lines",
+                            cohort == "snRNASeq" ~ "snRNASeq\nPancreas cancer")) |> 
+  ggplot(aes(x = selection_procedure, y = .estimate, fill = AL_alg)) +
+  geom_boxplot() +
+  scale_fill_manual(values = al_colours()) +
+  labs(x = "Selection procedure", fill = "Active learning\nalgorithm", 
+       title = "Performance of selection methods comparing random forest and logistic regression active learning strategies",
+       subtitle = "Shown are results obtained when ranking the initial set of cells") +
+  facet_grid(.metric ~ cohort + cell_num) +
+  whatsthatcell_theme() +
+  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust =1))
+
+
+rf_vs_rf_ranking_2 <- filter(acc, corrupted == 0) |> 
   filter(rand == 0 | is.na(rand)) |> 
   filter(selection_procedure != "random" & !grepl("-AR", selection_procedure)) |> 
   filter(initial == "ranking") |> 
   filter(!grepl("-AR", selection_procedure)) |> 
-  mutate(AL_alg = case_when(AL_alg == "rf" ~ "Random Forest",
-                            AL_alg == "multinom" ~ "Logistic Regression")) |> 
+  filter(cohort == "scRNALung" | cohort == "liverAtlas" | cohort == "tabulaVasc") |> 
+  mutate(AL_alg = case_when(AL_alg == "rf" ~ "RF",
+                            AL_alg == "multinom" ~ "LR"),
+         cohort = case_when(cohort == "scRNALung" ~ "scRNASeq\nLung cancer cell lines",
+                            cohort == "liverAtlas" ~ "scRNASeq\nLiver",
+                            cohort == "tabulaVasc" ~ "scRNASeq\nVasculature")) |> 
   ggplot(aes(x = selection_procedure, y = .estimate, fill = AL_alg)) +
   geom_boxplot() +
+  scale_fill_manual(values = al_colours()) +
   labs(x = "Selection procedure", fill = "Active learning\nalgorithm", 
        title = "Performance of selection methods comparing random forest and logistic regression active learning strategies",
        subtitle = "Shown are results obtained when ranking the initial set of cells") +
   facet_grid(.metric ~ cohort + cell_num) +
   whatsthatcell_theme() +
   theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust =1))
 
-pdf(snakemake@output$rf_lr_supp, height = 13, width = 12)
-  rf_vs_rf_random / rf_vs_rf_ranking + plot_layout(guides = "collect") + 
+pdf(snakemake@output$rf_lr_supp_1, height = 13, width = 15)
+  rf_vs_rf_random_1 / rf_vs_rf_ranking_1 + plot_layout(guides = "collect") + 
+    plot_annotation(tag_levels = "A")
+dev.off()
+
+pdf(snakemake@output$rf_lr_supp_2, height = 13, width = 15)
+  rf_vs_rf_random_2 / rf_vs_rf_ranking_2 + plot_layout(guides = "collect") + 
     plot_annotation(tag_levels = "A")
 dev.off()
 
 
 
 # random vs ranking
-rand_vs_rank_lr <- filter(acc, corrupted == 0) |> 
+rand_vs_rank_lr_1 <- filter(acc, corrupted == 0) |> 
   filter(rand == 0 | is.na(rand)) |> 
   filter(selection_procedure != "random" & !grepl("-AR", selection_procedure)) |> 
   filter(AL_alg == "multinom") |> 
   filter(!grepl("-AR", selection_procedure)) |> 
+  filter(cohort == "CyTOF" | cohort == "scRNASeq" | cohort == "snRNASeq") |> 
   mutate(initial = case_when(initial == "random" ~ "Random",
-                            initial == "ranking" ~ "Ranking")) |> 
+                            initial == "ranking" ~ "Ranking"),
+         cohort = case_when(cohort == "CyTOF" ~ "CyTOF\nBone marrow",
+                            cohort == "scRNASeq" ~ "scRNASeq\nBreast cancer cell lines",
+                            cohort == "snRNASeq" ~ "snRNASeq\nPancreas cancer")) |> 
   ggplot(aes(x = selection_procedure, y = .estimate, fill = initial)) +
   geom_boxplot() +
   labs(x = "Selection procedure", fill = "Initial selection", 
@@ -231,13 +323,40 @@ rand_vs_rank_lr <- filter(acc, corrupted == 0) |>
         axis.ticks.x = element_blank(),
         axis.title.x = element_blank())
 
-rand_vs_rank_rf <- filter(acc, corrupted == 0) |> 
+rand_vs_rank_lr_2 <- filter(acc, corrupted == 0) |> 
+  filter(rand == 0 | is.na(rand)) |> 
+  filter(selection_procedure != "random" & !grepl("-AR", selection_procedure)) |> 
+  filter(AL_alg == "multinom") |> 
+  filter(!grepl("-AR", selection_procedure)) |> 
+  filter(cohort == "scRNALung" | cohort == "liverAtlas" | cohort == "tabulaVasc") |> 
+  mutate(initial = case_when(initial == "random" ~ "Random",
+                             initial == "ranking" ~ "Ranking"),
+         cohort = case_when(cohort == "scRNALung" ~ "scRNASeq\nLung cancer cell lines",
+                            cohort == "liverAtlas" ~ "scRNASeq\nLiver",
+                            cohort == "tabulaVasc" ~ "scRNASeq\nVasculature")) |> 
+  ggplot(aes(x = selection_procedure, y = .estimate, fill = initial)) +
+  geom_boxplot() +
+  labs(x = "Selection procedure", fill = "Initial selection", 
+       title = "Performance of selection methods comparing random and ranking based initial cell selections",
+       subtitle = "Shown are results obtained using logistic regression as an active learning algorithm") +
+  facet_grid(.metric ~ cohort + cell_num) +
+  whatsthatcell_theme() +
+  theme(axis.text.x = element_blank(),
+        axis.ticks.x = element_blank(),
+        axis.title.x = element_blank())
+
+
+rand_vs_rank_rf_1 <- filter(acc, corrupted == 0) |> 
   filter(rand == 0 | is.na(rand)) |> 
   filter(selection_procedure != "random" & !grepl("-AR", selection_procedure)) |> 
   filter(AL_alg == "rf") |> 
   filter(!grepl("-AR", selection_procedure)) |> 
+  filter(cohort == "CyTOF" | cohort == "scRNASeq" | cohort == "snRNASeq") |> 
   mutate(initial = case_when(initial == "random" ~ "Random",
-                            initial == "ranking" ~ "Ranking")) |> 
+                            initial == "ranking" ~ "Ranking"),
+         cohort = case_when(cohort == "CyTOF" ~ "CyTOF\nBone marrow",
+                            cohort == "scRNASeq" ~ "scRNASeq\nBreast cancer cell lines",
+                            cohort == "snRNASeq" ~ "snRNASeq\nPancreas cancer")) |> 
   ggplot(aes(x = selection_procedure, y = .estimate, fill = initial)) +
   geom_boxplot() +
   labs(x = "Selection procedure", fill = "Initial selection", 
@@ -247,7 +366,37 @@ rand_vs_rank_rf <- filter(acc, corrupted == 0) |>
   whatsthatcell_theme() +
   theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust =1))
 
-pdf(snakemake@output$rand_rank_supp, height = 13, width = 12)
-  rand_vs_rank_lr / rand_vs_rank_rf + plot_layout(guides = "collect") + 
+
+rand_vs_rank_rf_2 <- filter(acc, corrupted == 0) |> 
+  filter(rand == 0 | is.na(rand)) |> 
+  filter(selection_procedure != "random" & !grepl("-AR", selection_procedure)) |> 
+  filter(AL_alg == "rf") |> 
+  filter(!grepl("-AR", selection_procedure)) |> 
+  filter(cohort == "scRNALung" | cohort == "liverAtlas" | cohort == "tabulaVasc") |> 
+  mutate(initial = case_when(initial == "random" ~ "Random",
+                             initial == "ranking" ~ "Ranking"),
+         cohort = case_when(cohort == "scRNALung" ~ "scRNASeq\nLung cancer cell lines",
+                            cohort == "liverAtlas" ~ "scRNASeq\nLiver",
+                            cohort == "tabulaVasc" ~ "scRNASeq\nVasculature")) |> 
+  ggplot(aes(x = selection_procedure, y = .estimate, fill = initial)) +
+  geom_boxplot() +
+  labs(x = "Selection procedure", fill = "Initial selection", 
+       title = "Performance of selection methods comparing random and ranking based initial cell selections",
+       subtitle = "Shown are results obtained using random forest as an active learning algorithm") +
+  facet_grid(.metric ~ cohort + cell_num) +
+  whatsthatcell_theme() +
+  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust =1))
+
+
+
+
+pdf(snakemake@output$rand_rank_supp_1, height = 13, width = 15)
+  rand_vs_rank_lr_1 / rand_vs_rank_rf_1 + plot_layout(guides = "collect") + 
+    plot_annotation(tag_levels = "A")
+dev.off()
+
+
+pdf(snakemake@output$rand_rank_supp_2, height = 13, width = 15)
+  rand_vs_rank_lr_2 / rand_vs_rank_rf_2 + plot_layout(guides = "collect") + 
     plot_annotation(tag_levels = "A")
 dev.off()