update processing

camlab-bioml · May 29, 2023 · cb7eda8 · cb7eda8
1 parent ca20bc4
commit cb7eda8
Show file tree

Hide file tree

Showing 8 changed files with 332 additions and 132 deletions.
diff --git a/pipeline/process-data.smk b/pipeline/process-data.smk
@@ -1,22 +1,63 @@
 
+celllines = ["AU565", "HCC1937", "HCC38", "MDAMB468", "EFM19", "HCC1187", 'JIMT1', "MDAMB361", 'HCC1500', "HCC70", "CAL51", "MDAMB415", 'BT549', "BT483", "MDAMB436", "DU4475", "HS578T", "MCF7", "CAMA1", "BT20", "T47D", "EVSAT", "HDQP1", "BT474", "CAL851", "HCC1143", "MCF12A", "HCC1954", "KPL1", "ZR751", "MX1", "MDAMB453"]
 
 process_data_output = {
     'train_test_split': expand('data/{modality}/{modality}-{split}-seed-{s}.rds', modality = modalities, split = data_splits, s = train_test_seeds),
     'scRNA_expression_df': expand('data/{modality}/{modality}-expression-df-{split}-seed-{s}.tsv', modality = modalities, split = data_splits, s = train_test_seeds),
-    'CyTOF_download': 'data/CyTOF/CyTOF-full.rds',
-    'dataset_dimensionality': expand(output + 'reports/{modality}-dataset-dimensionality-seed-{s}.html', modality = modalities, s = train_test_seeds),
-
+    'celllines': 'data/scRNASeq/scRNASeq-cellLines-full.rds',
+    #'markers': expand(output + 'figures/scRNASeq-markers/{plot}.pdf', plot = ['heatmap', 'lfcs-dot-plot']),
+    
     # Random subsets
     'sce_subset1': expand(output + 'data/{modality}/random/Init_NA-strat-NA-ALAlg-NA-rand_sel-0-corr-0-knn_neighbors-NA-resolution-NA-seed-{s}-{cell_num}_cells.rds', modality = modalities, cell_num = cell_numbers, s = train_test_seeds),
     'gt_subset1': expand(output + 'data/{modality}/random/Init_NA-strat-NA-ALAlg-NA-rand_sel-0-corr-0-knn_neighbors-NA-resolution-NA-seed-{s}-{cell_num}_cells.tsv', modality = modalities, cell_num = cell_numbers, s = train_test_seeds),
 
     # Seurat clustering subsets
-    'seu_sce': expand(output + 'data/{modality}/Seurat-clustering/Init_NA-strat-NA-ALAlg-NA-rand_sel-0-corr-0-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}-{cell_num}_cells.rds', 
-        modality = modalities, neighbors = Seurat_neighbors, res = Seurat_resolution, cell_num = cell_numbers, s = train_test_seeds),
-    'gt_seu': expand(output + 'data/{modality}/Seurat-clustering/Init_NA-strat-NA-ALAlg-NA-rand_sel-0-corr-0-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}-{cell_num}_cells.tsv', 
-        modality = modalities, neighbors = Seurat_neighbors, res = Seurat_resolution, cell_num = cell_numbers, s = train_test_seeds),
+    'seu_sce': expand(output + 'data/{modality}/{clusteringMarkers}/Init_NA-strat-NA-ALAlg-NA-rand_sel-0-corr-0-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}-{cell_num}_cells.rds', 
+        modality = modalities, clusteringMarkers = ['NoMarkerSeurat-clustering', 'MarkerSeurat-clustering'], neighbors = Seurat_neighbors, res = Seurat_resolution, cell_num = cell_numbers, s = train_test_seeds),
+    'gt_seu': expand(output + 'data/{modality}/{clusteringMarkers}/Init_NA-strat-NA-ALAlg-NA-rand_sel-0-corr-0-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}-{cell_num}_cells.tsv', 
+        modality = modalities, clusteringMarkers = ['NoMarkerSeurat-clustering', 'MarkerSeurat-clustering'], neighbors = Seurat_neighbors, res = Seurat_resolution, cell_num = cell_numbers, s = train_test_seeds),
 }
 
+# This set of rules is required to process the cell line data and the outputs were used to create a marker file
+# rule process_cell_line_mats:
+#     input:
+#         mat = "data/scRNASeq/matrix.mtx.gz",
+#         barcodes = "data/scRNASeq/barcodes.tsv",
+#         features = "data/scRNASeq/features.tsv.gz"
+#     resources: mem_mb=50000
+#     output:
+#         sce = temp('data/scRNASeq/{cell_line}-sce.rds')
+#     script:
+#         'process-data/process-cell-line-data.R'
+
+# rule combine_cell_lines_into_sce:
+#     input:
+#         mat = expand('data/scRNASeq/{cell_line}-sce.rds', cell_line = celllines)
+#     resources: mem_mb=50000
+#     output:
+#         detected = output + 'figures/scRNASeq-cell-lines-detected.pdf',
+#         dim_red = output + 'figures/scRNASeq-cell-lines-dimred.png',
+#         sce = 'data/scRNASeq/scRNASeq-cellLines-full.rds'
+#     script:
+#         'process-data/create-sce.R'
+
+# rule subset_cellLines:
+#     input:
+#         sce = "data/scRNASeq/scRNASeq-cellLines-full.rds"
+#     output:
+#         sce = "data/scRNASeq/scRNASeq-full.rds"
+#     script:
+#         'process-data/subset-celllines.R'
+
+# rule get_markers:
+#     input:
+#         sce = "data/scRNASeq/scRNASeq-full.rds"
+#     output:
+#         heatmap = output + 'figures/scRNASeq-markers/heatmap.pdf',
+#         lfc_point = output + 'figures/scRNASeq-markers/lfcs-dot-plot.pdf'
+#     script:
+#         'process-data/scRNASeq-find-markers.R'
+
 rule split_datasets:
     input:
         rds = 'data/{modality}/{modality}-full.rds'
@@ -38,23 +79,6 @@ rule process_data_for_random_forest:
     script:
         'process-data/Create-expression-df.R'
 
-rule download_CyTOF:
-    output:
-        Levine_CyTOF = 'data/CyTOF/CyTOF-full.rds'
-    script:
-        'process-data/download-and-process-CyTOF.R'
-
-rule determine_dataset_dimensionality:
-    input:
-        markers = 'markers/{modality}.yml',
-        training_rds = 'data/{modality}/{modality}-train-seed-{s}.rds'
-    resources:
-        mem_mb=5000
-    output:
-        html = output + 'reports/{modality}-dataset-dimensionality-seed-{s}.html'
-    script:
-        'process-data/determine-dataset-dimensionality.Rmd'
-
 rule create_random_subsets:
     input:
         sce = 'data/{modality}/{modality}-train-seed-{s}.rds'
@@ -64,12 +88,26 @@ rule create_random_subsets:
     script:
         'process-data/select-random-subset.R'
 
-rule create_clustering_subsets:
+rule create_seurat_clustering_subsets_no_markers:
+    input:
+        seurat = output + 'cluster-and-interpret/{modality}/{modality}-NoMarkerSeurat-clustering-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.tsv',
+        sce = 'data/{modality}/{modality}-train-seed-{s}.rds'
+    params:
+        selection_type = "wout_markers"
+    output:
+        ground_truth = output + 'data/{modality}/NoMarkerSeurat-clustering/Init_NA-strat-NA-ALAlg-NA-rand_sel-0-corr-0-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}-{cell_num}_cells.tsv',
+        sce = output + 'data/{modality}/NoMarkerSeurat-clustering/Init_NA-strat-NA-ALAlg-NA-rand_sel-0-corr-0-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}-{cell_num}_cells.rds'
+    script:
+        'process-data/select-cluster-subset.R'
+
+rule create_seurat_clustering_subsets_markers:
     input:
-        seurat = output + 'cluster-and-interpret/{modality}/{modality}-Seurat-assignments-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.tsv',
+        seurat = output + 'cluster-and-interpret/{modality}/{modality}-MarkerSeurat-clustering-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.tsv',
         sce = 'data/{modality}/{modality}-train-seed-{s}.rds'
+    params:
+        selection_type = "w_markers"
     output:
-        ground_truth = output + 'data/{modality}/Seurat-clustering/Init_NA-strat-NA-ALAlg-NA-rand_sel-0-corr-0-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}-{cell_num}_cells.tsv',
-        sce = output + 'data/{modality}/Seurat-clustering/Init_NA-strat-NA-ALAlg-NA-rand_sel-0-corr-0-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}-{cell_num}_cells.rds'
+        ground_truth = output + 'data/{modality}/MarkerSeurat-clustering/Init_NA-strat-NA-ALAlg-NA-rand_sel-0-corr-0-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}-{cell_num}_cells.tsv',
+        sce = output + 'data/{modality}/MarkerSeurat-clustering/Init_NA-strat-NA-ALAlg-NA-rand_sel-0-corr-0-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}-{cell_num}_cells.rds'
     script:
         'process-data/select-cluster-subset.R'
diff --git a/pipeline/process-data/create-sce.R b/pipeline/process-data/create-sce.R
@@ -0,0 +1,60 @@
+library(tidyverse)
+library(SingleCellExperiment)
+library(scater)
+library(annotables)
+library(patchwork)
+library(Matrix)
+set.seed(1)
+
+sces <- lapply(snakemake@input$mat, readRDS)
+sce <- do.call('cbind', sces)
+
+sce$CellType <- str_split_fixed(colnames(sce), "_", 2)[,1]
+
+## More QC
+grch38 <- select(grch38, symbol, chr) |> 
+    filter(!grepl("GL|KI|CHR_", chr)) |>
+    filter(symbol %in% rownames(sce)) |>
+    distinct() |> 
+    group_by(symbol) |>
+    slice_head(n = 1)
+
+gene_metadata <- tibble(symbol = rownames(sce)) |>
+    left_join(grch38)
+
+if(!all(gene_metadata$symbol == rownames(sce))){
+    stop("Error: gene order is not the same")
+}
+
+rowData(sce)$chr <- gene_metadata$chr
+
+sce <- logNormCounts(sce)
+
+head(rownames(sce))
+mt_genes <- which(rowData(sce)$chr == "MT")
+ribo_genes <- grepl("^RP[LS]", rownames(sce))
+feature_ctrls <- list(mito = rownames(sce)[mt_genes],
+                      ribo = rownames(sce)[ribo_genes])
+lapply(feature_ctrls, head)
+
+sce <- addPerCellQC(sce, subsets = feature_ctrls)
+
+pdf(snakemake@output$detected)
+  plotColData(sce, y = "subsets_mito_percent")
+dev.off()
+
+
+# run PCA and UMAP
+sce <- runPCA(sce, n_dimred = 20)
+sce <- runUMAP(sce, n_dimred = 20, ncomponents = 2)
+
+dim_reductions <- (plotReducedDim(sce, dimred = 'UMAP', colour_by = "detected") | 
+  plotReducedDim(sce, dimred = 'UMAP', colour_by = 'CellType')) /
+  (plotReducedDim(sce, dimred = 'PCA', colour_by = 'detected') |
+  plotReducedDim(sce, dimred = 'PCA', colour_by = 'CellType'))
+
+png(snakemake@output$dim_red, width = 1500, height = 1500, res = 150)
+  dim_reductions
+dev.off()
+
+saveRDS(sce, snakemake@output$sce)
diff --git a/pipeline/process-data/determine-dataset-dimensionality.Rmd b/pipeline/process-data/determine-dataset-dimensionality.Rmd
diff --git a/pipeline/process-data/process-cell-line-data.R b/pipeline/process-data/process-cell-line-data.R
@@ -0,0 +1,49 @@
+library(DropletUtils)
+library(Matrix)
+library(tidyverse)
+library(scater)
+library(annotables)
+library(SingleCellExperiment)
+
+matrix <- readMM(snakemake@input$mat)
+barcodes <- read_tsv(snakemake@input$barcodes, col_names = FALSE)
+features <- read_tsv(snakemake@input$features, col_names = FALSE)
+
+rownames(matrix) <- features$X1
+colnames(matrix) <- barcodes$X1
+
+grch38_filtered <- grch38 |> 
+  filter(!grepl("GL|KI|CHR_", chr) & biotype == "protein_coding") |>
+  filter(!grepl("^RP[L|S]|^MALAT1|^HSP|^FOS|^JUN", symbol))
+gene_map <- select(grch38_filtered, ensgene, symbol) |> deframe()
+
+# Remove any genes that don't have a hugo name
+matrix <- matrix[rownames(matrix) %in% names(gene_map), ]
+
+# change rownames to hugo
+rownames(matrix) <- gene_map[rownames(matrix)]
+
+sel_cells <- grepl(snakemake@wildcards$cell_line, colnames(matrix))
+subset_mat <- matrix[, sel_cells]
+
+subset_mat <- as.matrix(subset_mat) |>
+  as.data.frame() |>
+  rownames_to_column("gene") |> 
+  pivot_longer(-gene, names_to = "cell_id", values_to = "counts")
+
+subset_mat <- subset_mat |>
+  group_by(gene, cell_id) %>% 
+  summarise(counts = sum(counts)) |> 
+  ungroup()
+
+subset_mat <- pivot_wider(subset_mat, values_from = "counts", names_from = "cell_id") |>
+  as.data.frame() |>
+  column_to_rownames("gene") |>
+  t()
+
+subset_mat <- as(subset_mat, "sparseMatrix")
+
+sce <- SingleCellExperiment(assays = list(counts = t(subset_mat)))
+
+saveRDS(sce, snakemake@output$sce)
+