Skip to content

Commit

Permalink
update processing
Browse files Browse the repository at this point in the history
  • Loading branch information
Michael-Geuenich committed May 29, 2023
1 parent ca20bc4 commit cb7eda8
Show file tree
Hide file tree
Showing 8 changed files with 332 additions and 132 deletions.
94 changes: 66 additions & 28 deletions pipeline/process-data.smk
Original file line number Diff line number Diff line change
@@ -1,22 +1,63 @@

celllines = ["AU565", "HCC1937", "HCC38", "MDAMB468", "EFM19", "HCC1187", 'JIMT1', "MDAMB361", 'HCC1500', "HCC70", "CAL51", "MDAMB415", 'BT549', "BT483", "MDAMB436", "DU4475", "HS578T", "MCF7", "CAMA1", "BT20", "T47D", "EVSAT", "HDQP1", "BT474", "CAL851", "HCC1143", "MCF12A", "HCC1954", "KPL1", "ZR751", "MX1", "MDAMB453"]

process_data_output = {
'train_test_split': expand('data/{modality}/{modality}-{split}-seed-{s}.rds', modality = modalities, split = data_splits, s = train_test_seeds),
'scRNA_expression_df': expand('data/{modality}/{modality}-expression-df-{split}-seed-{s}.tsv', modality = modalities, split = data_splits, s = train_test_seeds),
'CyTOF_download': 'data/CyTOF/CyTOF-full.rds',
'dataset_dimensionality': expand(output + 'reports/{modality}-dataset-dimensionality-seed-{s}.html', modality = modalities, s = train_test_seeds),

'celllines': 'data/scRNASeq/scRNASeq-cellLines-full.rds',
#'markers': expand(output + 'figures/scRNASeq-markers/{plot}.pdf', plot = ['heatmap', 'lfcs-dot-plot']),
# Random subsets
'sce_subset1': expand(output + 'data/{modality}/random/Init_NA-strat-NA-ALAlg-NA-rand_sel-0-corr-0-knn_neighbors-NA-resolution-NA-seed-{s}-{cell_num}_cells.rds', modality = modalities, cell_num = cell_numbers, s = train_test_seeds),
'gt_subset1': expand(output + 'data/{modality}/random/Init_NA-strat-NA-ALAlg-NA-rand_sel-0-corr-0-knn_neighbors-NA-resolution-NA-seed-{s}-{cell_num}_cells.tsv', modality = modalities, cell_num = cell_numbers, s = train_test_seeds),

# Seurat clustering subsets
'seu_sce': expand(output + 'data/{modality}/Seurat-clustering/Init_NA-strat-NA-ALAlg-NA-rand_sel-0-corr-0-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}-{cell_num}_cells.rds',
modality = modalities, neighbors = Seurat_neighbors, res = Seurat_resolution, cell_num = cell_numbers, s = train_test_seeds),
'gt_seu': expand(output + 'data/{modality}/Seurat-clustering/Init_NA-strat-NA-ALAlg-NA-rand_sel-0-corr-0-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}-{cell_num}_cells.tsv',
modality = modalities, neighbors = Seurat_neighbors, res = Seurat_resolution, cell_num = cell_numbers, s = train_test_seeds),
'seu_sce': expand(output + 'data/{modality}/{clusteringMarkers}/Init_NA-strat-NA-ALAlg-NA-rand_sel-0-corr-0-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}-{cell_num}_cells.rds',
modality = modalities, clusteringMarkers = ['NoMarkerSeurat-clustering', 'MarkerSeurat-clustering'], neighbors = Seurat_neighbors, res = Seurat_resolution, cell_num = cell_numbers, s = train_test_seeds),
'gt_seu': expand(output + 'data/{modality}/{clusteringMarkers}/Init_NA-strat-NA-ALAlg-NA-rand_sel-0-corr-0-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}-{cell_num}_cells.tsv',
modality = modalities, clusteringMarkers = ['NoMarkerSeurat-clustering', 'MarkerSeurat-clustering'], neighbors = Seurat_neighbors, res = Seurat_resolution, cell_num = cell_numbers, s = train_test_seeds),
}

# This set of rules is required to process the cell line data and the outputs were used to create a marker file
# rule process_cell_line_mats:
# input:
# mat = "data/scRNASeq/matrix.mtx.gz",
# barcodes = "data/scRNASeq/barcodes.tsv",
# features = "data/scRNASeq/features.tsv.gz"
# resources: mem_mb=50000
# output:
# sce = temp('data/scRNASeq/{cell_line}-sce.rds')
# script:
# 'process-data/process-cell-line-data.R'

# rule combine_cell_lines_into_sce:
# input:
# mat = expand('data/scRNASeq/{cell_line}-sce.rds', cell_line = celllines)
# resources: mem_mb=50000
# output:
# detected = output + 'figures/scRNASeq-cell-lines-detected.pdf',
# dim_red = output + 'figures/scRNASeq-cell-lines-dimred.png',
# sce = 'data/scRNASeq/scRNASeq-cellLines-full.rds'
# script:
# 'process-data/create-sce.R'

# rule subset_cellLines:
# input:
# sce = "data/scRNASeq/scRNASeq-cellLines-full.rds"
# output:
# sce = "data/scRNASeq/scRNASeq-full.rds"
# script:
# 'process-data/subset-celllines.R'

# rule get_markers:
# input:
# sce = "data/scRNASeq/scRNASeq-full.rds"
# output:
# heatmap = output + 'figures/scRNASeq-markers/heatmap.pdf',
# lfc_point = output + 'figures/scRNASeq-markers/lfcs-dot-plot.pdf'
# script:
# 'process-data/scRNASeq-find-markers.R'

rule split_datasets:
input:
rds = 'data/{modality}/{modality}-full.rds'
Expand All @@ -38,23 +79,6 @@ rule process_data_for_random_forest:
script:
'process-data/Create-expression-df.R'

rule download_CyTOF:
output:
Levine_CyTOF = 'data/CyTOF/CyTOF-full.rds'
script:
'process-data/download-and-process-CyTOF.R'

rule determine_dataset_dimensionality:
input:
markers = 'markers/{modality}.yml',
training_rds = 'data/{modality}/{modality}-train-seed-{s}.rds'
resources:
mem_mb=5000
output:
html = output + 'reports/{modality}-dataset-dimensionality-seed-{s}.html'
script:
'process-data/determine-dataset-dimensionality.Rmd'

rule create_random_subsets:
input:
sce = 'data/{modality}/{modality}-train-seed-{s}.rds'
Expand All @@ -64,12 +88,26 @@ rule create_random_subsets:
script:
'process-data/select-random-subset.R'

rule create_clustering_subsets:
rule create_seurat_clustering_subsets_no_markers:
input:
seurat = output + 'cluster-and-interpret/{modality}/{modality}-NoMarkerSeurat-clustering-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.tsv',
sce = 'data/{modality}/{modality}-train-seed-{s}.rds'
params:
selection_type = "wout_markers"
output:
ground_truth = output + 'data/{modality}/NoMarkerSeurat-clustering/Init_NA-strat-NA-ALAlg-NA-rand_sel-0-corr-0-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}-{cell_num}_cells.tsv',
sce = output + 'data/{modality}/NoMarkerSeurat-clustering/Init_NA-strat-NA-ALAlg-NA-rand_sel-0-corr-0-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}-{cell_num}_cells.rds'
script:
'process-data/select-cluster-subset.R'

rule create_seurat_clustering_subsets_markers:
input:
seurat = output + 'cluster-and-interpret/{modality}/{modality}-Seurat-assignments-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.tsv',
seurat = output + 'cluster-and-interpret/{modality}/{modality}-MarkerSeurat-clustering-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}.tsv',
sce = 'data/{modality}/{modality}-train-seed-{s}.rds'
params:
selection_type = "w_markers"
output:
ground_truth = output + 'data/{modality}/Seurat-clustering/Init_NA-strat-NA-ALAlg-NA-rand_sel-0-corr-0-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}-{cell_num}_cells.tsv',
sce = output + 'data/{modality}/Seurat-clustering/Init_NA-strat-NA-ALAlg-NA-rand_sel-0-corr-0-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}-{cell_num}_cells.rds'
ground_truth = output + 'data/{modality}/MarkerSeurat-clustering/Init_NA-strat-NA-ALAlg-NA-rand_sel-0-corr-0-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}-{cell_num}_cells.tsv',
sce = output + 'data/{modality}/MarkerSeurat-clustering/Init_NA-strat-NA-ALAlg-NA-rand_sel-0-corr-0-knn_neighbors-{neighbors}-resolution-{res}-seed-{s}-{cell_num}_cells.rds'
script:
'process-data/select-cluster-subset.R'
60 changes: 60 additions & 0 deletions pipeline/process-data/create-sce.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
library(tidyverse)
library(SingleCellExperiment)
library(scater)
library(annotables)
library(patchwork)
library(Matrix)
set.seed(1)

sces <- lapply(snakemake@input$mat, readRDS)
sce <- do.call('cbind', sces)

sce$CellType <- str_split_fixed(colnames(sce), "_", 2)[,1]

## More QC
grch38 <- select(grch38, symbol, chr) |>
filter(!grepl("GL|KI|CHR_", chr)) |>
filter(symbol %in% rownames(sce)) |>
distinct() |>
group_by(symbol) |>
slice_head(n = 1)

gene_metadata <- tibble(symbol = rownames(sce)) |>
left_join(grch38)

if(!all(gene_metadata$symbol == rownames(sce))){
stop("Error: gene order is not the same")
}

rowData(sce)$chr <- gene_metadata$chr

sce <- logNormCounts(sce)

head(rownames(sce))
mt_genes <- which(rowData(sce)$chr == "MT")
ribo_genes <- grepl("^RP[LS]", rownames(sce))
feature_ctrls <- list(mito = rownames(sce)[mt_genes],
ribo = rownames(sce)[ribo_genes])
lapply(feature_ctrls, head)

sce <- addPerCellQC(sce, subsets = feature_ctrls)

pdf(snakemake@output$detected)
plotColData(sce, y = "subsets_mito_percent")
dev.off()


# run PCA and UMAP
sce <- runPCA(sce, n_dimred = 20)
sce <- runUMAP(sce, n_dimred = 20, ncomponents = 2)

dim_reductions <- (plotReducedDim(sce, dimred = 'UMAP', colour_by = "detected") |
plotReducedDim(sce, dimred = 'UMAP', colour_by = 'CellType')) /
(plotReducedDim(sce, dimred = 'PCA', colour_by = 'detected') |
plotReducedDim(sce, dimred = 'PCA', colour_by = 'CellType'))

png(snakemake@output$dim_red, width = 1500, height = 1500, res = 150)
dim_reductions
dev.off()

saveRDS(sce, snakemake@output$sce)
76 changes: 0 additions & 76 deletions pipeline/process-data/determine-dataset-dimensionality.Rmd

This file was deleted.

49 changes: 49 additions & 0 deletions pipeline/process-data/process-cell-line-data.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
library(DropletUtils)
library(Matrix)
library(tidyverse)
library(scater)
library(annotables)
library(SingleCellExperiment)

matrix <- readMM(snakemake@input$mat)
barcodes <- read_tsv(snakemake@input$barcodes, col_names = FALSE)
features <- read_tsv(snakemake@input$features, col_names = FALSE)

rownames(matrix) <- features$X1
colnames(matrix) <- barcodes$X1

grch38_filtered <- grch38 |>
filter(!grepl("GL|KI|CHR_", chr) & biotype == "protein_coding") |>
filter(!grepl("^RP[L|S]|^MALAT1|^HSP|^FOS|^JUN", symbol))
gene_map <- select(grch38_filtered, ensgene, symbol) |> deframe()

# Remove any genes that don't have a hugo name
matrix <- matrix[rownames(matrix) %in% names(gene_map), ]

# change rownames to hugo
rownames(matrix) <- gene_map[rownames(matrix)]

sel_cells <- grepl(snakemake@wildcards$cell_line, colnames(matrix))
subset_mat <- matrix[, sel_cells]

subset_mat <- as.matrix(subset_mat) |>
as.data.frame() |>
rownames_to_column("gene") |>
pivot_longer(-gene, names_to = "cell_id", values_to = "counts")

subset_mat <- subset_mat |>
group_by(gene, cell_id) %>%
summarise(counts = sum(counts)) |>
ungroup()

subset_mat <- pivot_wider(subset_mat, values_from = "counts", names_from = "cell_id") |>
as.data.frame() |>
column_to_rownames("gene") |>
t()

subset_mat <- as(subset_mat, "sparseMatrix")

sce <- SingleCellExperiment(assays = list(counts = t(subset_mat)))

saveRDS(sce, snakemake@output$sce)

Loading

0 comments on commit cb7eda8

Please sign in to comment.