Skip to content

Commit

Permalink
expand rf/lr & rand/ranking figure to new datasets
Browse files Browse the repository at this point in the history
  • Loading branch information
Michael-Geuenich committed Oct 23, 2023
1 parent 204e58a commit a1ac35a
Showing 1 changed file with 193 additions and 44 deletions.
237 changes: 193 additions & 44 deletions pipeline/figures/compare-lr-rf-random-and-ranking.R
Original file line number Diff line number Diff line change
Expand Up @@ -45,35 +45,53 @@ props <- bind_rows(
get_proportion_rep(snakemake@input$cytof,
snakemake@input$cytof_AL_rand,
snakemake@input$cytof_AL_rank,
"CyTOF"),
"CyTOF\nBone marrow"),
get_proportion_rep(snakemake@input$scrna,
snakemake@input$scrna_AL_rand,
snakemake@input$scrna_AL_rank,
"scRNASeq"),
"scRNASeq\nBreast cancer cell lines"),
get_proportion_rep(snakemake@input$snrna,
snakemake@input$snrna_AL_rand,
snakemake@input$snrna_AL_rank,
"snRNASeq")
"snRNASeq\nPancreas cancer"),
get_proportion_rep(snakemake@input$snrna_lung,
snakemake@input$snrna_lung_AL_rand,
snakemake@input$snrna_lung_AL_rank,
"scRNASeq\nLung cancer cell lines"),
get_proportion_rep(snakemake@input$liverAtlas,
snakemake@input$liverAtlas_AL_rand,
snakemake@input$liverAtlas_AL_rank,
"scRNASeq\nLiver"),
get_proportion_rep(snakemake@input$tabulaVasc,
snakemake@input$tabulaVasc_AL_rand,
snakemake@input$tabulaVasc_AL_rank,
"scRNASeq\nVasculature")
)

prop <- props |>
mutate(cohort = factor(cohort, levels = c("scRNASeq", "snRNASeq", "CyTOF"))) |>
ggplot(aes(x = prop_sel, y = type, fill = type)) +
geom_boxplot() +
labs(x = "Proportion of cell types selected in initial training set of 20 cells",
y = " \nInitial selection\nmethod") +
scale_fill_manual(values = c("#8F3985", "#98DFEA")) +
facet_wrap(~cohort, nrow = 1) +
facet_wrap(~cohort, nrow = 2) +
whatsthatcell_theme() +
theme(legend.position = "none")

pdf(snakemake@output$rank_vs_rand_cell_num, height = 2.5, width = 7)
prop
dev.off()


## Part B: heatmaps
acc <- lapply(snakemake@input$accs, function(x){
df <- read_tsv(x) |>
mutate(cohort = case_when(grepl("CyTOF", basename(x)) ~ "CyTOF",
grepl("snRNASeq", basename(x)) ~ "snRNASeq",
grepl("scRNASeq", basename(x)) ~ "scRNASeq"))
grepl("scRNASeq", basename(x)) ~ "scRNASeq",
grepl("scRNALung", basename(x)) ~ "scRNALung",
grepl("liverAtlas", basename(x)) ~ "liverAtlas",
grepl("tabulaVasc", basename(x)) ~ "tabulaVasc"))
df
}) |> bind_rows() |>
mutate(selection_procedure = case_when(selection_procedure == "NoMarkerSeurat_clustering" ~ "AR NoMarker",
Expand All @@ -87,7 +105,7 @@ acc <- lapply(snakemake@input$accs, function(x){
TRUE ~ selection_procedure))


create_heatmap <- function(acc, sel_cohort, comp, legend = FALSE){
create_heatmap <- function(acc, sel_cohort, comp, title, legend = FALSE){
subset_acc <- filter(acc, corrupted == 0) |>
filter(rand == 0 | is.na(rand)) |>
filter(.metric == 'f_meas' & cohort == sel_cohort) |>
Expand Down Expand Up @@ -129,57 +147,95 @@ create_heatmap <- function(acc, sel_cohort, comp, legend = FALSE){
column_to_rownames("cell_num") |>
as.matrix() |>
Heatmap(right_annotation = ha,
column_title = sel_cohort,
column_title = title,
show_heatmap_legend = legend,
row_order = c("100", "250", "500"),
col = colorRamp2(seq(-0.2, 0.2, length = 3), c("blue", "#EEEEEE", "red")),
name = "Improvement\nscore")
}

ranking_vs_random <- create_heatmap(acc, "scRNASeq", "random_vs_ranked") +
create_heatmap(acc, "snRNASeq", "random_vs_ranked") +
create_heatmap(acc, "CyTOF", "random_vs_ranked", TRUE)
### Ranking vs random
row1 <- create_heatmap(acc, "scRNASeq", "random_vs_ranked", "scRNASeq\nBreast cancer cell lines") +
create_heatmap(acc, "snRNASeq", "random_vs_ranked", "snRNASeq\nPancreas cancer") +
create_heatmap(acc, "CyTOF", "random_vs_ranked", "CyTOF\nBone marrow", TRUE)

row2 <- create_heatmap(acc, "scRNALung", "random_vs_ranked", "scRNASeq\nLung cancer cell lines") +
create_heatmap(acc, "liverAtlas", "random_vs_ranked", "scRNASeq\nLiver") +
create_heatmap(acc, "tabulaVasc", "random_vs_ranked", "scRNASeq\nVasculature", TRUE)

ranking <- draw(ranking_vs_random, column_title = "Selecting initial cells based on marker expression")
row1 <- draw(row1, column_title = "Selecting initial cells based on marker expression")
row2 <- draw(row2)

pdf(snakemake@output$rank_rand_hm, height = 3.5, width = 7)
ranking
pdf(snakemake@output$row_1_ranking_vs_random, height = 3.5, width = 7.5)
row1
dev.off()

lr_vs_rf <- create_heatmap(acc, "scRNASeq", "lr_vs_rf") +
create_heatmap(acc, "snRNASeq", "lr_vs_rf") +
create_heatmap(acc, "CyTOF", "lr_vs_rf", TRUE)
pdf(snakemake@output$row_2_ranking_vs_random, height = 3.4, width = 7.5)
row2
dev.off()

lr_vs_rf <- draw(lr_vs_rf, column_title = "F1-score improvement by random forest compared to logistic regression")

pdf(snakemake@output$lr_rf_hm, height = 3.5, width = 7)
lr_vs_rf
dev.off()
### Logistic regression vs random forest
lr_vs_rf_row1 <- create_heatmap(acc, "scRNASeq", "lr_vs_rf", "scRNASeq\nBreast cancer cell lines") +
create_heatmap(acc, "snRNASeq", "lr_vs_rf", "snRNASeq\nPancreas cancer") +
create_heatmap(acc, "CyTOF", "lr_vs_rf", "CyTOF\nBone marrow", TRUE)
lr_vs_rf_row1 <- draw(lr_vs_rf_row1,
column_title = "F1-score improvement by random forest compared to logistic regression")

## Full main figure
ranking <- image_read_pdf(snakemake@output$rank_rand_hm)
ranking <- ggplot() +
background_image(ranking)
lr_vs_rf_row2 <- create_heatmap(acc, "scRNALung", "lr_vs_rf", "scRNASeq\nLung cancer cell lines") +
create_heatmap(acc, "liverAtlas", "lr_vs_rf", "scRNASeq\nLiver") +
create_heatmap(acc, "tabulaVasc", "lr_vs_rf", "scRNASeq\nVasculature", TRUE)
lr_vs_rf_row2 <- draw(lr_vs_rf_row2)

lr_vs_rf <- image_read_pdf(snakemake@output$lr_rf_hm)
lr_vs_rf <- ggplot() +
background_image(lr_vs_rf)
pdf(snakemake@output$lr_rf_hm_1, height = 3.5, width = 7.5)
lr_vs_rf_row1
dev.off()

pdf(snakemake@output$main_fig, height = 8.8, width = 7)
(lr_vs_rf / wrap_elements(full = prop) / ranking) + plot_layout(heights = c(3, 1.2, 3)) + plot_annotation(tag_levels = "A")
pdf(snakemake@output$lr_rf_hm_2, height = 3.4, width = 7.5)
lr_vs_rf_row2
dev.off()



### Supplementary figures
# LR vs RF
rf_vs_rf_random <- filter(acc, corrupted == 0) |>
rf_vs_rf_random_1 <- filter(acc, corrupted == 0) |>
filter(rand == 0 | is.na(rand)) |>
filter(selection_procedure != "random" & !grepl("-AR", selection_procedure)) |>
filter(initial == "random") |>
filter(!grepl("-AR", selection_procedure)) |>
filter(cohort == "CyTOF" | cohort == "scRNASeq" | cohort == "snRNASeq") |>
mutate(AL_alg = case_when(AL_alg == "rf" ~ "RF",
AL_alg == "multinom" ~ "LR"),
cohort = case_when(cohort == "CyTOF" ~ "CyTOF\nBone marrow",
cohort == "scRNASeq" ~ "scRNASeq\nBreast cancer cell lines",
cohort == "snRNASeq" ~ "snRNASeq\nPancreas cancer")) |>
ggplot(aes(x = selection_procedure, y = .estimate, fill = AL_alg)) +
geom_boxplot() +
scale_fill_manual(values = al_colours()) +
labs(x = "Selection procedure", fill = "Active learning\nalgorithm",
title = "Performance of selection methods comparing random forest and logistic regression active learning strategies",
subtitle = "Shown are results obtained when randomly selecting the initial set of cells") +
facet_grid(.metric ~ cohort + cell_num) +
whatsthatcell_theme() +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title.x = element_blank())

rf_vs_rf_random_2 <- filter(acc, corrupted == 0) |>
filter(rand == 0 | is.na(rand)) |>
filter(selection_procedure != "random" & !grepl("-AR", selection_procedure)) |>
filter(initial == "random") |>
filter(!grepl("-AR", selection_procedure)) |>
mutate(AL_alg = case_when(AL_alg == "rf" ~ "Random Forest",
AL_alg == "multinom" ~ "Logistic Regression")) |>
filter(cohort == "scRNALung" | cohort == "liverAtlas" | cohort == "tabulaVasc") |>
mutate(AL_alg = case_when(AL_alg == "rf" ~ "RF",
AL_alg == "multinom" ~ "LR"),
cohort = case_when(cohort == "scRNALung" ~ "scRNASeq\nLung cancer cell lines",
cohort == "liverAtlas" ~ "scRNASeq\nLiver",
cohort == "tabulaVasc" ~ "scRNASeq\nVasculature")) |>
ggplot(aes(x = selection_procedure, y = .estimate, fill = AL_alg)) +
geom_boxplot() +
scale_fill_manual(values = al_colours()) +
labs(x = "Selection procedure", fill = "Active learning\nalgorithm",
title = "Performance of selection methods comparing random forest and logistic regression active learning strategies",
subtitle = "Shown are results obtained when randomly selecting the initial set of cells") +
Expand All @@ -189,37 +245,73 @@ rf_vs_rf_random <- filter(acc, corrupted == 0) |>
axis.ticks.x = element_blank(),
axis.title.x = element_blank())

rf_vs_rf_ranking <- filter(acc, corrupted == 0) |>
rf_vs_rf_ranking_1 <- filter(acc, corrupted == 0) |>
filter(rand == 0 | is.na(rand)) |>
filter(selection_procedure != "random" & !grepl("-AR", selection_procedure)) |>
filter(initial == "ranking") |>
filter(!grepl("-AR", selection_procedure)) |>
filter(cohort == "CyTOF" | cohort == "scRNASeq" | cohort == "snRNASeq") |>
mutate(AL_alg = case_when(AL_alg == "rf" ~ "RF",
AL_alg == "multinom" ~ "LR"),
cohort = case_when(cohort == "CyTOF" ~ "CyTOF\nBone marrow",
cohort == "scRNASeq" ~ "scRNASeq\nBreast cancer cell lines",
cohort == "snRNASeq" ~ "snRNASeq\nPancreas cancer")) |>
ggplot(aes(x = selection_procedure, y = .estimate, fill = AL_alg)) +
geom_boxplot() +
scale_fill_manual(values = al_colours()) +
labs(x = "Selection procedure", fill = "Active learning\nalgorithm",
title = "Performance of selection methods comparing random forest and logistic regression active learning strategies",
subtitle = "Shown are results obtained when ranking the initial set of cells") +
facet_grid(.metric ~ cohort + cell_num) +
whatsthatcell_theme() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust =1))


rf_vs_rf_ranking_2 <- filter(acc, corrupted == 0) |>
filter(rand == 0 | is.na(rand)) |>
filter(selection_procedure != "random" & !grepl("-AR", selection_procedure)) |>
filter(initial == "ranking") |>
filter(!grepl("-AR", selection_procedure)) |>
mutate(AL_alg = case_when(AL_alg == "rf" ~ "Random Forest",
AL_alg == "multinom" ~ "Logistic Regression")) |>
filter(cohort == "scRNALung" | cohort == "liverAtlas" | cohort == "tabulaVasc") |>
mutate(AL_alg = case_when(AL_alg == "rf" ~ "RF",
AL_alg == "multinom" ~ "LR"),
cohort = case_when(cohort == "scRNALung" ~ "scRNASeq\nLung cancer cell lines",
cohort == "liverAtlas" ~ "scRNASeq\nLiver",
cohort == "tabulaVasc" ~ "scRNASeq\nVasculature")) |>
ggplot(aes(x = selection_procedure, y = .estimate, fill = AL_alg)) +
geom_boxplot() +
scale_fill_manual(values = al_colours()) +
labs(x = "Selection procedure", fill = "Active learning\nalgorithm",
title = "Performance of selection methods comparing random forest and logistic regression active learning strategies",
subtitle = "Shown are results obtained when ranking the initial set of cells") +
facet_grid(.metric ~ cohort + cell_num) +
whatsthatcell_theme() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust =1))

pdf(snakemake@output$rf_lr_supp, height = 13, width = 12)
rf_vs_rf_random / rf_vs_rf_ranking + plot_layout(guides = "collect") +
pdf(snakemake@output$rf_lr_supp_1, height = 13, width = 15)
rf_vs_rf_random_1 / rf_vs_rf_ranking_1 + plot_layout(guides = "collect") +
plot_annotation(tag_levels = "A")
dev.off()

pdf(snakemake@output$rf_lr_supp_2, height = 13, width = 15)
rf_vs_rf_random_2 / rf_vs_rf_ranking_2 + plot_layout(guides = "collect") +
plot_annotation(tag_levels = "A")
dev.off()



# random vs ranking
rand_vs_rank_lr <- filter(acc, corrupted == 0) |>
rand_vs_rank_lr_1 <- filter(acc, corrupted == 0) |>
filter(rand == 0 | is.na(rand)) |>
filter(selection_procedure != "random" & !grepl("-AR", selection_procedure)) |>
filter(AL_alg == "multinom") |>
filter(!grepl("-AR", selection_procedure)) |>
filter(cohort == "CyTOF" | cohort == "scRNASeq" | cohort == "snRNASeq") |>
mutate(initial = case_when(initial == "random" ~ "Random",
initial == "ranking" ~ "Ranking")) |>
initial == "ranking" ~ "Ranking"),
cohort = case_when(cohort == "CyTOF" ~ "CyTOF\nBone marrow",
cohort == "scRNASeq" ~ "scRNASeq\nBreast cancer cell lines",
cohort == "snRNASeq" ~ "snRNASeq\nPancreas cancer")) |>
ggplot(aes(x = selection_procedure, y = .estimate, fill = initial)) +
geom_boxplot() +
labs(x = "Selection procedure", fill = "Initial selection",
Expand All @@ -231,13 +323,40 @@ rand_vs_rank_lr <- filter(acc, corrupted == 0) |>
axis.ticks.x = element_blank(),
axis.title.x = element_blank())

rand_vs_rank_rf <- filter(acc, corrupted == 0) |>
rand_vs_rank_lr_2 <- filter(acc, corrupted == 0) |>
filter(rand == 0 | is.na(rand)) |>
filter(selection_procedure != "random" & !grepl("-AR", selection_procedure)) |>
filter(AL_alg == "multinom") |>
filter(!grepl("-AR", selection_procedure)) |>
filter(cohort == "scRNALung" | cohort == "liverAtlas" | cohort == "tabulaVasc") |>
mutate(initial = case_when(initial == "random" ~ "Random",
initial == "ranking" ~ "Ranking"),
cohort = case_when(cohort == "scRNALung" ~ "scRNASeq\nLung cancer cell lines",
cohort == "liverAtlas" ~ "scRNASeq\nLiver",
cohort == "tabulaVasc" ~ "scRNASeq\nVasculature")) |>
ggplot(aes(x = selection_procedure, y = .estimate, fill = initial)) +
geom_boxplot() +
labs(x = "Selection procedure", fill = "Initial selection",
title = "Performance of selection methods comparing random and ranking based initial cell selections",
subtitle = "Shown are results obtained using logistic regression as an active learning algorithm") +
facet_grid(.metric ~ cohort + cell_num) +
whatsthatcell_theme() +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title.x = element_blank())


rand_vs_rank_rf_1 <- filter(acc, corrupted == 0) |>
filter(rand == 0 | is.na(rand)) |>
filter(selection_procedure != "random" & !grepl("-AR", selection_procedure)) |>
filter(AL_alg == "rf") |>
filter(!grepl("-AR", selection_procedure)) |>
filter(cohort == "CyTOF" | cohort == "scRNASeq" | cohort == "snRNASeq") |>
mutate(initial = case_when(initial == "random" ~ "Random",
initial == "ranking" ~ "Ranking")) |>
initial == "ranking" ~ "Ranking"),
cohort = case_when(cohort == "CyTOF" ~ "CyTOF\nBone marrow",
cohort == "scRNASeq" ~ "scRNASeq\nBreast cancer cell lines",
cohort == "snRNASeq" ~ "snRNASeq\nPancreas cancer")) |>
ggplot(aes(x = selection_procedure, y = .estimate, fill = initial)) +
geom_boxplot() +
labs(x = "Selection procedure", fill = "Initial selection",
Expand All @@ -247,7 +366,37 @@ rand_vs_rank_rf <- filter(acc, corrupted == 0) |>
whatsthatcell_theme() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust =1))

pdf(snakemake@output$rand_rank_supp, height = 13, width = 12)
rand_vs_rank_lr / rand_vs_rank_rf + plot_layout(guides = "collect") +

rand_vs_rank_rf_2 <- filter(acc, corrupted == 0) |>
filter(rand == 0 | is.na(rand)) |>
filter(selection_procedure != "random" & !grepl("-AR", selection_procedure)) |>
filter(AL_alg == "rf") |>
filter(!grepl("-AR", selection_procedure)) |>
filter(cohort == "scRNALung" | cohort == "liverAtlas" | cohort == "tabulaVasc") |>
mutate(initial = case_when(initial == "random" ~ "Random",
initial == "ranking" ~ "Ranking"),
cohort = case_when(cohort == "scRNALung" ~ "scRNASeq\nLung cancer cell lines",
cohort == "liverAtlas" ~ "scRNASeq\nLiver",
cohort == "tabulaVasc" ~ "scRNASeq\nVasculature")) |>
ggplot(aes(x = selection_procedure, y = .estimate, fill = initial)) +
geom_boxplot() +
labs(x = "Selection procedure", fill = "Initial selection",
title = "Performance of selection methods comparing random and ranking based initial cell selections",
subtitle = "Shown are results obtained using random forest as an active learning algorithm") +
facet_grid(.metric ~ cohort + cell_num) +
whatsthatcell_theme() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust =1))




pdf(snakemake@output$rand_rank_supp_1, height = 13, width = 15)
rand_vs_rank_lr_1 / rand_vs_rank_rf_1 + plot_layout(guides = "collect") +
plot_annotation(tag_levels = "A")
dev.off()


pdf(snakemake@output$rand_rank_supp_2, height = 13, width = 15)
rand_vs_rank_lr_2 / rand_vs_rank_rf_2 + plot_layout(guides = "collect") +
plot_annotation(tag_levels = "A")
dev.off()

0 comments on commit a1ac35a

Please sign in to comment.