diff --git a/pipeline/process-data.smk b/pipeline/process-data.smk index c9b622c..b8c181d 100644 --- a/pipeline/process-data.smk +++ b/pipeline/process-data.smk @@ -1,22 +1,63 @@ +celllines = ["AU565", "HCC1937", "HCC38", "MDAMB468", "EFM19", "HCC1187", 'JIMT1', "MDAMB361", 'HCC1500', "HCC70", "CAL51", "MDAMB415", 'BT549', "BT483", "MDAMB436", "DU4475", "HS578T", "MCF7", "CAMA1", "BT20", "T47D", "EVSAT", "HDQP1", "BT474", "CAL851", "HCC1143", "MCF12A", "HCC1954", "KPL1", "ZR751", "MX1", "MDAMB453"] process_data_output = { 'train_test_split': expand('data/{modality}/{modality}-{split}-seed-{s}.rds', modality = modalities, split = data_splits, s = train_test_seeds), 'scRNA_expression_df': expand('data/{modality}/{modality}-expression-df-{split}-seed-{s}.tsv', modality = modalities, split = data_splits, s = train_test_seeds), - 'CyTOF_download': 'data/CyTOF/CyTOF-full.rds', - 'dataset_dimensionality': expand(output + 'reports/{modality}-dataset-dimensionality-seed-{s}.html', modality = modalities, s = train_test_seeds), - + 'celllines': 'data/scRNASeq/scRNASeq-cellLines-full.rds', + #'markers': expand(output + 'figures/scRNASeq-markers/{plot}.pdf', plot = ['heatmap', 'lfcs-dot-plot']), + # Random subsets 'sce_subset1': expand(output + 'data/{modality}/random/Init_NA-strat-NA-ALAlg-NA-rand_sel-0-corr-0-knn_neighbors-NA-resolution-NA-seed-{s}-{cell_num}_cells.rds', modality = modalities, cell_num = cell_numbers, s = train_test_seeds), 'gt_subset1': expand(output + 'data/{modality}/random/Init_NA-strat-NA-ALAlg-NA-rand_sel-0-corr-0-knn_neighbors-NA-resolution-NA-seed-{s}-{cell_num}_cells.tsv', modality = modalities, cell_num = cell_numbers, s = train_test_seeds), # Seurat clustering subsets - 'seu_sce': expand(output + 'data/{modality}/Seurat-clustering/Init_NA-strat-NA-ALAlg-NA-rand_sel-0-corr-0-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}-{cell_num}_cells.rds', - modality = modalities, neighbors = Seurat_neighbors, res = Seurat_resolution, cell_num = cell_numbers, s = train_test_seeds), - 'gt_seu': expand(output + 'data/{modality}/Seurat-clustering/Init_NA-strat-NA-ALAlg-NA-rand_sel-0-corr-0-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}-{cell_num}_cells.tsv', - modality = modalities, neighbors = Seurat_neighbors, res = Seurat_resolution, cell_num = cell_numbers, s = train_test_seeds), + 'seu_sce': expand(output + 'data/{modality}/{clusteringMarkers}/Init_NA-strat-NA-ALAlg-NA-rand_sel-0-corr-0-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}-{cell_num}_cells.rds', + modality = modalities, clusteringMarkers = ['NoMarkerSeurat-clustering', 'MarkerSeurat-clustering'], neighbors = Seurat_neighbors, res = Seurat_resolution, cell_num = cell_numbers, s = train_test_seeds), + 'gt_seu': expand(output + 'data/{modality}/{clusteringMarkers}/Init_NA-strat-NA-ALAlg-NA-rand_sel-0-corr-0-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}-{cell_num}_cells.tsv', + modality = modalities, clusteringMarkers = ['NoMarkerSeurat-clustering', 'MarkerSeurat-clustering'], neighbors = Seurat_neighbors, res = Seurat_resolution, cell_num = cell_numbers, s = train_test_seeds), } +# This set of rules is required to process the cell line data and the outputs were used to create a marker file +# rule process_cell_line_mats: +# input: +# mat = "data/scRNASeq/matrix.mtx.gz", +# barcodes = "data/scRNASeq/barcodes.tsv", +# features = "data/scRNASeq/features.tsv.gz" +# resources: mem_mb=50000 +# output: +# sce = temp('data/scRNASeq/{cell_line}-sce.rds') +# script: +# 'process-data/process-cell-line-data.R' + +# rule combine_cell_lines_into_sce: +# input: +# mat = expand('data/scRNASeq/{cell_line}-sce.rds', cell_line = celllines) +# resources: mem_mb=50000 +# output: +# detected = output + 'figures/scRNASeq-cell-lines-detected.pdf', +# dim_red = output + 'figures/scRNASeq-cell-lines-dimred.png', +# sce = 'data/scRNASeq/scRNASeq-cellLines-full.rds' +# script: +# 'process-data/create-sce.R' + +# rule subset_cellLines: +# input: +# sce = "data/scRNASeq/scRNASeq-cellLines-full.rds" +# output: +# sce = "data/scRNASeq/scRNASeq-full.rds" +# script: +# 'process-data/subset-celllines.R' + +# rule get_markers: +# input: +# sce = "data/scRNASeq/scRNASeq-full.rds" +# output: +# heatmap = output + 'figures/scRNASeq-markers/heatmap.pdf', +# lfc_point = output + 'figures/scRNASeq-markers/lfcs-dot-plot.pdf' +# script: +# 'process-data/scRNASeq-find-markers.R' + rule split_datasets: input: rds = 'data/{modality}/{modality}-full.rds' @@ -38,23 +79,6 @@ rule process_data_for_random_forest: script: 'process-data/Create-expression-df.R' -rule download_CyTOF: - output: - Levine_CyTOF = 'data/CyTOF/CyTOF-full.rds' - script: - 'process-data/download-and-process-CyTOF.R' - -rule determine_dataset_dimensionality: - input: - markers = 'markers/{modality}.yml', - training_rds = 'data/{modality}/{modality}-train-seed-{s}.rds' - resources: - mem_mb=5000 - output: - html = output + 'reports/{modality}-dataset-dimensionality-seed-{s}.html' - script: - 'process-data/determine-dataset-dimensionality.Rmd' - rule create_random_subsets: input: sce = 'data/{modality}/{modality}-train-seed-{s}.rds' @@ -64,12 +88,26 @@ rule create_random_subsets: script: 'process-data/select-random-subset.R' -rule create_clustering_subsets: +rule create_seurat_clustering_subsets_no_markers: + input: + seurat = output + 'cluster-and-interpret/{modality}/{modality}-NoMarkerSeurat-clustering-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.tsv', + sce = 'data/{modality}/{modality}-train-seed-{s}.rds' + params: + selection_type = "wout_markers" + output: + ground_truth = output + 'data/{modality}/NoMarkerSeurat-clustering/Init_NA-strat-NA-ALAlg-NA-rand_sel-0-corr-0-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}-{cell_num}_cells.tsv', + sce = output + 'data/{modality}/NoMarkerSeurat-clustering/Init_NA-strat-NA-ALAlg-NA-rand_sel-0-corr-0-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}-{cell_num}_cells.rds' + script: + 'process-data/select-cluster-subset.R' + +rule create_seurat_clustering_subsets_markers: input: - seurat = output + 'cluster-and-interpret/{modality}/{modality}-Seurat-assignments-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.tsv', + seurat = output + 'cluster-and-interpret/{modality}/{modality}-MarkerSeurat-clustering-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.tsv', sce = 'data/{modality}/{modality}-train-seed-{s}.rds' + params: + selection_type = "w_markers" output: - ground_truth = output + 'data/{modality}/Seurat-clustering/Init_NA-strat-NA-ALAlg-NA-rand_sel-0-corr-0-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}-{cell_num}_cells.tsv', - sce = output + 'data/{modality}/Seurat-clustering/Init_NA-strat-NA-ALAlg-NA-rand_sel-0-corr-0-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}-{cell_num}_cells.rds' + ground_truth = output + 'data/{modality}/MarkerSeurat-clustering/Init_NA-strat-NA-ALAlg-NA-rand_sel-0-corr-0-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}-{cell_num}_cells.tsv', + sce = output + 'data/{modality}/MarkerSeurat-clustering/Init_NA-strat-NA-ALAlg-NA-rand_sel-0-corr-0-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}-{cell_num}_cells.rds' script: 'process-data/select-cluster-subset.R' diff --git a/pipeline/process-data/create-sce.R b/pipeline/process-data/create-sce.R new file mode 100644 index 0000000..1088fd7 --- /dev/null +++ b/pipeline/process-data/create-sce.R @@ -0,0 +1,60 @@ +library(tidyverse) +library(SingleCellExperiment) +library(scater) +library(annotables) +library(patchwork) +library(Matrix) +set.seed(1) + +sces <- lapply(snakemake@input$mat, readRDS) +sce <- do.call('cbind', sces) + +sce$CellType <- str_split_fixed(colnames(sce), "_", 2)[,1] + +## More QC +grch38 <- select(grch38, symbol, chr) |> + filter(!grepl("GL|KI|CHR_", chr)) |> + filter(symbol %in% rownames(sce)) |> + distinct() |> + group_by(symbol) |> + slice_head(n = 1) + +gene_metadata <- tibble(symbol = rownames(sce)) |> + left_join(grch38) + +if(!all(gene_metadata$symbol == rownames(sce))){ + stop("Error: gene order is not the same") +} + +rowData(sce)$chr <- gene_metadata$chr + +sce <- logNormCounts(sce) + +head(rownames(sce)) +mt_genes <- which(rowData(sce)$chr == "MT") +ribo_genes <- grepl("^RP[LS]", rownames(sce)) +feature_ctrls <- list(mito = rownames(sce)[mt_genes], + ribo = rownames(sce)[ribo_genes]) +lapply(feature_ctrls, head) + +sce <- addPerCellQC(sce, subsets = feature_ctrls) + +pdf(snakemake@output$detected) + plotColData(sce, y = "subsets_mito_percent") +dev.off() + + +# run PCA and UMAP +sce <- runPCA(sce, n_dimred = 20) +sce <- runUMAP(sce, n_dimred = 20, ncomponents = 2) + +dim_reductions <- (plotReducedDim(sce, dimred = 'UMAP', colour_by = "detected") | + plotReducedDim(sce, dimred = 'UMAP', colour_by = 'CellType')) / + (plotReducedDim(sce, dimred = 'PCA', colour_by = 'detected') | + plotReducedDim(sce, dimred = 'PCA', colour_by = 'CellType')) + +png(snakemake@output$dim_red, width = 1500, height = 1500, res = 150) + dim_reductions +dev.off() + +saveRDS(sce, snakemake@output$sce) \ No newline at end of file diff --git a/pipeline/process-data/determine-dataset-dimensionality.Rmd b/pipeline/process-data/determine-dataset-dimensionality.Rmd deleted file mode 100644 index f80335f..0000000 --- a/pipeline/process-data/determine-dataset-dimensionality.Rmd +++ /dev/null @@ -1,76 +0,0 @@ ---- -title: "Determine Seurat parameters" -output: html_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -library(Seurat) -library(SingleCellExperiment) -library(yaml) -library(tidyverse) -``` - - -```{r message=F} -### [READ IN FILES] ### -sce <- readRDS(snakemake@input[['training_rds']]) -markers <- read_yaml(snakemake@input[['markers']])$cell_types -unique_markers <- unlist(markers) %>% unique() - - -### [PROCESS DATA] ### -if(length(names(assays(sce))) == 2){ - seu <- CreateSeuratObject(counts = assays(sce)$counts) - seu <- NormalizeData(seu) -} else{ - seu <- CreateSeuratObject(counts = assays(sce)$logcounts) -} - -seu <- FindVariableFeatures(seu, selection.method = 'vst', nfeatures = 2000) - -all.genes <- rownames(seu) -seu <- ScaleData(seu, features = all.genes) - - -seu <- RunPCA(seu, features = VariableFeatures(object = seu)) -``` - - -### Which PC's contain what marker genes? -```{r} -t <- print(seu[["pca"]], dims = 1:10, nfeatures = 10) - -``` - - -### Plot loadings -```{r fig.height=17, fig.width=10} -VizDimLoadings(seu, dims = 1:10, reduction = "pca") -``` - - -### Dim heatmaps -```{r fig.height=25, fig.width=9} -DimHeatmap(seu, dims = 1:27, cells = 500, balanced = TRUE) -``` - - -### JackStraw - -```{r} -seu <- JackStraw(seu, num.replicate = 100, dims = 38) -seu <- ScoreJackStraw(seu, dims = 1:38) -``` - -```{r} -JackStrawPlot(seu, dims = 1:38) -``` - -### Elbowplot -```{r} -ElbowPlot(seu, ndims = 38) -``` - - - diff --git a/pipeline/process-data/process-cell-line-data.R b/pipeline/process-data/process-cell-line-data.R new file mode 100644 index 0000000..1d74a7c --- /dev/null +++ b/pipeline/process-data/process-cell-line-data.R @@ -0,0 +1,49 @@ +library(DropletUtils) +library(Matrix) +library(tidyverse) +library(scater) +library(annotables) +library(SingleCellExperiment) + +matrix <- readMM(snakemake@input$mat) +barcodes <- read_tsv(snakemake@input$barcodes, col_names = FALSE) +features <- read_tsv(snakemake@input$features, col_names = FALSE) + +rownames(matrix) <- features$X1 +colnames(matrix) <- barcodes$X1 + +grch38_filtered <- grch38 |> + filter(!grepl("GL|KI|CHR_", chr) & biotype == "protein_coding") |> + filter(!grepl("^RP[L|S]|^MALAT1|^HSP|^FOS|^JUN", symbol)) +gene_map <- select(grch38_filtered, ensgene, symbol) |> deframe() + +# Remove any genes that don't have a hugo name +matrix <- matrix[rownames(matrix) %in% names(gene_map), ] + +# change rownames to hugo +rownames(matrix) <- gene_map[rownames(matrix)] + +sel_cells <- grepl(snakemake@wildcards$cell_line, colnames(matrix)) +subset_mat <- matrix[, sel_cells] + +subset_mat <- as.matrix(subset_mat) |> + as.data.frame() |> + rownames_to_column("gene") |> + pivot_longer(-gene, names_to = "cell_id", values_to = "counts") + +subset_mat <- subset_mat |> + group_by(gene, cell_id) %>% + summarise(counts = sum(counts)) |> + ungroup() + +subset_mat <- pivot_wider(subset_mat, values_from = "counts", names_from = "cell_id") |> + as.data.frame() |> + column_to_rownames("gene") |> + t() + +subset_mat <- as(subset_mat, "sparseMatrix") + +sce <- SingleCellExperiment(assays = list(counts = t(subset_mat))) + +saveRDS(sce, snakemake@output$sce) + diff --git a/pipeline/process-data/select-cluster-subset.R b/pipeline/process-data/select-cluster-subset.R index e2d68c8..16f0f66 100644 --- a/pipeline/process-data/select-cluster-subset.R +++ b/pipeline/process-data/select-cluster-subset.R @@ -1,50 +1,154 @@ library(tidyverse) library(splitstackshape) library(SingleCellExperiment) +library(fabricatr) set.seed(42) seu <- read_tsv(snakemake@input[['seurat']]) sce <- readRDS(snakemake@input[['sce']]) +sce_gt <- tibble(cell_id = colnames(sce), + cell_type = sce$CellType) + req_cells <- as.integer(snakemake@wildcards[['cell_num']]) n_cells <- 0 -while(n_cells < req_cells){ - pred_cell_types <- seu$predicted_cell_type %>% - unique() %>% - length() - - if(exists("sce_subset")){ - no_to_select <- ceiling((req_cells - ncol(sce_subset)) / pred_cell_types) - }else{ - no_to_select <- ceiling(req_cells / pred_cell_types) +if(snakemake@params$selection_type == "wout_markers"){ + sel_method <- "NoMarkerSeurat-clustering" + while(n_cells < req_cells){ + num_clusters <- seu$cluster %>% + unique() %>% + length() + + if(exists("sce_subset")){ + no_to_select <- ceiling((req_cells - ncol(sce_subset)) / num_clusters) + }else{ + no_to_select <- ceiling(req_cells / num_clusters) + } + + subset <- stratified(seu, "cluster", no_to_select) + + n_cells <- n_cells + nrow(subset) + oversampled <- n_cells - req_cells + while(oversampled > 0){ + rem_from_cluster <- subset |> + group_by(cluster) |> + tally() |> + filter(n == max(n)) |> + slice_head(n = 1) |> + pull(cluster) + print(paste("rem from clust", rem_from_cluster)) + + rem_cell_id <- subset |> + filter(cluster == rem_from_cluster) |> + slice_head(n = 1) |> + pull(cell_id) + + subset <- filter(subset, cell_id != rem_cell_id) + oversampled <- nrow(subset) - req_cells + } + + if(exists("sce_subset")){ + sce_subset <- cbind(sce_subset, sce[, colnames(sce) %in% subset$cell_id]) + }else{ + sce_subset <- sce[, colnames(sce) %in% subset$cell_id] + } + seu <- filter(seu, !(cell_id %in% colnames(sce_subset))) } +}else if(snakemake@params$selection_type == "w_markers"){ + sel_method <- "MarkerSeurat-clustering" + while(n_cells < req_cells){ + num_clusters <- seu$predicted_cell_type %>% + unique() %>% + length() + + if(exists("sce_subset")){ + no_to_select <- ceiling((req_cells - ncol(sce_subset)) / num_clusters) + }else{ + no_to_select <- ceiling(req_cells / num_clusters) + } + + subset <- stratified(seu, "predicted_cell_type", no_to_select) + + n_cells <- n_cells + nrow(subset) + oversampled <- n_cells - req_cells + while(oversampled > 0){ + rem_from_cluster <- subset |> + group_by(predicted_cell_type) |> + tally() |> + filter(n == max(n)) |> + slice_head(n = 1) |> + pull(predicted_cell_type) + + rem_cell_id <- subset |> + filter(predicted_cell_type == rem_from_cluster) |> + slice_head(n = 1) |> + pull(cell_id) + + subset <- filter(subset, cell_id != rem_cell_id) + oversampled <- nrow(subset) - req_cells + } + + if(exists("sce_subset")){ + sce_subset <- cbind(sce_subset, sce[, colnames(sce) %in% subset$cell_id]) + }else{ + sce_subset <- sce[, colnames(sce) %in% subset$cell_id] + } + seu <- filter(seu, !(cell_id %in% colnames(sce_subset))) + } +} + + +# sce_subset <- sce[, colnames(sce) %in% seu_subset$cell_id] + +# while(n_cells < req_cells){ +# pred_cell_types <- seu$cluster %>% +# unique() %>% +# length() - subset <- stratified(seu, "predicted_cell_type", no_to_select) |> - pull(cell_id) +# if(exists("sce_subset")){ +# no_to_select <- ceiling((req_cells - ncol(sce_subset)) / pred_cell_types) +# }else{ +# no_to_select <- ceiling(req_cells / pred_cell_types) +# } - n_cells <- n_cells + length(subset) - oversampled <- n_cells - req_cells - if(oversampled > 0){ - subset <- subset[-sample(1:length(subset), oversampled)] - } +# subset <- stratified(seu, "cluster", no_to_select) |> +# pull(cell_id) - if(exists("sce_subset")){ - sce_subset <- cbind(sce_subset, sce[, colnames(sce) %in% subset]) - }else{ - sce_subset <- sce[, colnames(sce) %in% subset] - } - seu <- filter(seu, !(cell_id %in% colnames(sce_subset))) +# n_cells <- n_cells + length(subset) +# oversampled <- n_cells - req_cells +# if(oversampled > 0){ +# subset <- subset[-sample(1:length(subset), oversampled)] +# } + +# if(exists("sce_subset")){ +# sce_subset <- cbind(sce_subset, sce[, colnames(sce) %in% subset]) +# }else{ +# sce_subset <- sce[, colnames(sce) %in% subset] +# } +# seu <- filter(seu, !(cell_id %in% colnames(sce_subset))) +# } + +# For the imbalanced analysis. Ensures there is at least one cell type of both types + +if(!is.null(snakemake@wildcards$similarity)){ + sce_subset <- sce_subset[, -c(1:2)] + + each_ct <- filter(sce_gt, !(cell_id %in% colnames(sce_subset))) |> + group_by(cell_type) |> + slice_head(n = 1) + + sce_subset <- cbind(sce_subset, sce[, colnames(sce) %in% each_ct$cell_id]) } -gt <- tibble(cell_id = colnames(sce_subset), - cell_type = sce_subset$CellType, - method = "Seurat-clustering", - params = paste0("knn-", snakemake@wildcards[['neighbors']], "-res-", - snakemake@wildcards[['res']], '-cell-num-', - snakemake@wildcards[['cell_num']])) +gt <- tibble(cell_id = colnames(sce_subset)) |> + left_join(sce_gt) |> + mutate(method = sel_method, + params = paste0("knn-", snakemake@wildcards[['neighbors']], "-res-", + snakemake@wildcards[['res']], '-cell-num-', + snakemake@wildcards[['cell_num']])) saveRDS(sce_subset, snakemake@output[['sce']]) diff --git a/pipeline/process-data/select-random-subset.R b/pipeline/process-data/select-random-subset.R index d547597..2f3f22c 100644 --- a/pipeline/process-data/select-random-subset.R +++ b/pipeline/process-data/select-random-subset.R @@ -2,10 +2,22 @@ library(tidyverse) library(SingleCellExperiment) sce <- readRDS(snakemake@input[['sce']]) +sce_gt <- tibble(cell_id = colnames(sce), + cell_type = sce$CellType) set.seed(1) subset1 <- sce[, sample(1:ncol(sce), as.integer(snakemake@wildcards[['cell_num']]))] +if(!is.null(snakemake@wildcards$similarity)){ + subset1 <- subset1[, -c(1:2)] + + each_ct <- filter(sce_gt, !(cell_id %in% colnames(subset1))) |> + group_by(cell_type) |> + slice_head(n = 1) + + subset1 <- cbind(subset1, sce[, colnames(sce) %in% each_ct$cell_id]) +} + gt_1 <- tibble(cell_id = colnames(subset1), cell_type = subset1$CellType, method = "random", diff --git a/pipeline/process-data/split-train-test.R b/pipeline/process-data/split-train-test.R index 115ec0f..1cdb3ae 100644 --- a/pipeline/process-data/split-train-test.R +++ b/pipeline/process-data/split-train-test.R @@ -15,6 +15,7 @@ if(snakemake@wildcards[['modality']] == "snRNASeq"){ } set.seed(as.integer(snakemake@wildcards[['s']])) +sce <- sce[, sample(1:ncol(sce))] train <- createDataPartition(sce$CellType, p = 0.5)$Resample1 train_sce <- sce[,train] diff --git a/pipeline/process-data/subset-celllines.R b/pipeline/process-data/subset-celllines.R new file mode 100644 index 0000000..1e98216 --- /dev/null +++ b/pipeline/process-data/subset-celllines.R @@ -0,0 +1,12 @@ +library(tidyverse) +library(SingleCellExperiment) + +sce <- readRDS(snakemake@input$sce) + +set.seed(1) + +cell_lines <- unique(sce$CellType)[sample(1:length(unique(sce$CellType)), 10)] + +sub_sce <- sce[, sce$CellType %in% cell_lines] + +saveRDS(sub_sce, snakemake@output$sce) \ No newline at end of file