diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0e87b99 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +output/* +*.Rproj +data/* \ No newline at end of file diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..f53bbf7 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "utils"] + path = utils + url = https://github.com/tabdelaal/CyTOF-Linear-Classifier.git diff --git a/Pipfile b/Pipfile new file mode 100644 index 0000000..b9ba84f --- /dev/null +++ b/Pipfile @@ -0,0 +1,11 @@ +[[source]] +url = "https://pypi.org/simple" +verify_ssl = true +name = "pypi" + +[packages] + +[dev-packages] + +[requires] +python_version = "3.7" diff --git a/Snakefile b/Snakefile new file mode 100644 index 0000000..5ebe3d9 --- /dev/null +++ b/Snakefile @@ -0,0 +1,20 @@ + +import pandas as pd +import numpy as np + +configfile: 'config/config.yml' +output = 'output/' + config['version'] + '/' + +selection_procedures = ['random'] +annotators = ['test'] +modalities = ['scRNASeq'] +data_splits = ['train', 'test'] + +include: 'pipeline/process-data.smk' +include: 'pipeline/cell-type-predictions.smk' + + +rule all: + input: + process_data_output.values(), + cell_type_predictions.values() \ No newline at end of file diff --git a/config/config.yml b/config/config.yml new file mode 100644 index 0000000..e1c90db --- /dev/null +++ b/config/config.yml @@ -0,0 +1,2 @@ + +version: v1 \ No newline at end of file diff --git a/markers/scRNA.yml b/markers/scRNA.yml new file mode 100644 index 0000000..02b52c9 --- /dev/null +++ b/markers/scRNA.yml @@ -0,0 +1,204 @@ +cell_types: + CD4 T cell: + positive: + - CD3D + - CD3E + - CD3G + - TRAC + - CD4 + - TCF7 + - CD27 + - IL7R + negative: + - CD8A + - CD8B + - GNLY + - NKG7 + - CST7 + + Cytotoxic T cell: + positive: + - CD3D + - CD3E + - CD3G + - TRAC + - CD8A + - CD8B + - GZMK + - CCL5 + - NKG7 + negative: + - CD4 + - FCER1G + + B cell: + positive: + - CD19 + - MS4A1 + - CD79A + - CD79B + - MZB1 + - IGHD + - IGHM + + Natural Killer Cell: + positive: + - NCAM1 + - NKG7 + - KLRB1 + - KLRD1 + - KLRF1 + - KLRC1 + - KLRC2 + - KLRC3 + - KLRC4 + - FCGR3A + - FCGR3B + - ITGAL + - ITGAM + - FCER1G + negative: + - CD3D + - CD3E + - CD3G + - CD14 + - TRAC + + CD14 monocyte: + positive: + - VCAN + - FCN1 + - S100A8 + - S100A9 + - CD14 + - ITGAL + - ITGAM + - CSF3R + - CSF1R + - CX3CR1 + - TYROBP + - LYZ + - S100A12 + - FCN1 + - FCGR3A + - FCGR3B + - ITGAL + - ITGAM + - CSF3R + - CSF1R + - CX3CR1 + - CDKN1C + - MS4A7 + negative: + - FCGR3A + - FCGR3B + - CD3D + - CD3E + - CD3G + - TRAC + - NKG7 + - KLRB1 + - KLRD1 + - S100A8 + - S100A9 + - S100A12 + - CD14 + - CD3D + - CD3E + - CD3G + - TRAC + - NKG7 + - KLRB1 + - KLRD1 + + Dendritic cell: + positive: + - HLA-DPB1 + - HLA-DPA1 + - HLA-DQA1 + - ITGAX + - CD1C + - CD1E + - FCER1A + - CLEC10A + - FCGR2B + negative: + - CD3D + - CD3E + - CD3G + - NCAM1 + - CD19 + - CD14 + - MS4A1 + - CD79A + - CD79B + + Plasmacytoid dendritic cell: + positive: + - IL3RA + - GZMB + - JCHAIN + - IRF7 + - TCF4 + - LILRA4 + - CLEC4C + negative: + - ITGAX + - CD3D + - CD3E + - CD3G + - NCAM1 + - CD19 + - CD14 + - MS4A1 + - CD79A + - CD79B + + Plasma cell: + positive: + - CD38 + - XBP1 + - CD27 + - SLAMF7 + - IGHA1 + - IGHA2 + - IGHG1 + - IGHG2 + - IGHG3 + - IGHG4 + negative: + - CD19 + - MS4A1 + - CD3D + - CD3E + - CD3G + + Platelet: + positive: + - PF4 + - PPBP + - GP5 + - ITGA2B + - NRGN + - TUBB1 + - SPARC + - RGS18 + - MYL9 + - GNG11 + + + + + + + + + + + + + + + + + diff --git a/pipeline/cell-type-assignment/CyTOFLDA.R b/pipeline/cell-type-assignment/CyTOFLDA.R new file mode 100644 index 0000000..022bb2e --- /dev/null +++ b/pipeline/cell-type-assignment/CyTOFLDA.R @@ -0,0 +1,60 @@ +suppressPackageStartupMessages({ + library(scater) + library(SingleCellExperiment) + library(tidyverse) + source(file.path("CyTOF-Linear-Classifier/", 'CyTOF_LDAtrain.R')) + source(file.path("CyTOF-Linear-Classifier/", 'CyTOF_LDApredict.R')) +}) +library(devtools) +devtools::load_all("../taproom/") + +sce_train <- readRDS("data/training/zurich_subset1k.rds") +labs <- read_csv("data/zurich1_astir_assignments.csv") +cell_id <- labs$X1 +labs$X1 <- NULL +labs$cell_type <- get_celltypes(labs) +labs <- as.data.frame(labs) +rownames(labs) <- cell_id + + +labs <- select(labs, cell_type) + +sce_train$cell_type <- labs[colnames(sce_train), 'cell_type'] + +sce_annotate <- sce_train[,1:500] +sce_train <- sce_train[,501:1000] + + +data_train <- as.data.frame(t(logcounts(sce_train))) +data_train$cell_type <- sce_train$cell_type +data_annotate <- as.data.frame(t(logcounts(sce_annotate))) + +train_dir <- file.path(tempdir(), "train") +annotate_dir <- file.path(tempdir(), "annotate") + +dir.create(train_dir) +dir.create(annotate_dir) + +write.table(data_train, file = file.path(train_dir, "train.csv"), + col.names = FALSE, row.names = FALSE, sep = ',') +write.table(data_annotate, file = file.path(annotate_dir, "annotate.csv"), + col.names = FALSE, row.names = FALSE, sep = ',') + +LDA.Model <- CyTOF_LDAtrain(TrainingSamplesExt = train_dir, TrainingLabelsExt = '', + mode = 'CSV', RelevantMarkers = seq_len(nrow(sce_train)), + LabelIndex = ncol(data_train), Transformation = FALSE) + +predictions <- CyTOF_LDApredict(LDA.Model, TestingSamplesExt = annotate_dir, + mode = 'CSV', RejectionThreshold = 0) + +predictions <- unlist(Predictions) + +df_output <- tibble( + cell_id = data_annotate$cell_id, + cell_type = Predictions, + annotator = args$annotator, + cohort = args$cohort, + method = args$method +) + +write_tsv(df_output, args$output_assignments) \ No newline at end of file diff --git a/pipeline/cell-type-assignment/predict-random-forest.py b/pipeline/cell-type-assignment/predict-random-forest.py new file mode 100644 index 0000000..437b085 --- /dev/null +++ b/pipeline/cell-type-assignment/predict-random-forest.py @@ -0,0 +1,8 @@ +import numpy as np +import pandas as pd + +import joblib + + +## Load everything +model = joblib.load(open(snakemake.input['model'], 'rb')) \ No newline at end of file diff --git a/pipeline/cell-type-assignment/scmap.R b/pipeline/cell-type-assignment/scmap.R new file mode 100644 index 0000000..b900579 --- /dev/null +++ b/pipeline/cell-type-assignment/scmap.R @@ -0,0 +1,76 @@ +suppressPackageStartupMessages({ + library(SingleCellExperiment) + library(scmap) + library(tidyverse) +}) + +# Read in training data expression +train_sce <- readRDS(snakemake@input[['train_data']]) + +# Read in annotated labels +annotations <- read_tsv(snakemake@input[['annotation']]) %>% + as.data.frame() %>% + column_to_rownames('cell_id') + +# Add labels to expression +train_sce$cell_type1 <- annotations[colnames(train_sce),] + +# Process data +rowData(train_sce)$feature_symbol <- rownames(train_sce) +train_sce <- selectFeatures(train_sce, suppress_plot = TRUE) + +# Read in test set +annotate_sce <- readRDS(snakemake@input[['test_data']]) +rowData(annotate_sce)$feature_symbol <- rownames(annotate_sce) + +### Start training +train_sce <- indexCluster(train_sce) + + +# Map clusters +scmapCluster_results <- scmapCluster( + projection = annotate_sce, + index_list = list( + yan = metadata(train_sce)$scmap_cluster_index + ) +) + +clustering_prediction <- tibble(cell_id = colnames(annotate_sce), + predicted_cell_type = scmapCluster_results$combined_labs, + prediction_params = 'clusters', + selection_procedure = snakemake@wildcards[['selection_procedure']], + training_annotator = snakemake@wildcards[['annotator']], + modality = 'scRNASeq') + +write_tsv(clustering_prediction, snakemake@output[['cluster_predictions']]) + + +### Cell level +set.seed(1) + +train_sce <- indexCell(train_sce) + +scmapCell_results <- scmapCell( + annotate_sce, + list( + yan = metadata(train_sce)$scmap_cell_index + ) +) + + +scmapCell_clusters <- scmapCell2Cluster( + scmapCell_results, + list( + as.character(colData(train_sce)$cell_type1) + ) +) + +sc_prediction <- tibble(cell_id = colnames(annotate_sce), + predicted_cell_type = scmapCell_clusters$scmap_cluster_labs, + prediction_params = 'single-cell', + selection_procedure = snakemake@wildcards[['selection_procedure']], + training_annotator = snakemake@wildcards[['annotator']], + modality = 'scRNASeq') + +write_tsv(sc_prediction, snakemake@output[['sc_predictions']]) + diff --git a/pipeline/cell-type-assignment/singleR.R b/pipeline/cell-type-assignment/singleR.R new file mode 100644 index 0000000..18e04f3 --- /dev/null +++ b/pipeline/cell-type-assignment/singleR.R @@ -0,0 +1,27 @@ +suppressPackageStartupMessages({ + library(SingleR) + library(tidyverse) +}) + +train_sce <- readRDS(snakemake@input[['train_data']]) +assays(train_sce)$counts <- NULL + +train_labels <- read_tsv(snakemake@input[['annotation']]) %>% + as.data.frame() %>% + column_to_rownames('cell_id') + +train_sce$cell_type <- train_labels[colnames(train_sce),] + +annotate_sce <- readRDS(snakemake@input[['test_data']]) +assays(annotate_sce)$counts <- NULL + +pred <- SingleR(annotate_sce, train_sce, labels = train_sce$CellType) + +result <- tibble(cell_id = rownames(pred), + predicted_cell_type = pred$labels, + prediction_params = 'labels', + selection_procedure = snakemake@wildcards[['selection_procedure']], + training_annotator = snakemake@wildcards[['annotator']], + modality = 'scRNASeq') + +write_tsv(result, snakemake@output[['predictions']]) \ No newline at end of file diff --git a/pipeline/cell-type-predictions.smk b/pipeline/cell-type-predictions.smk new file mode 100644 index 0000000..83c42b2 --- /dev/null +++ b/pipeline/cell-type-predictions.smk @@ -0,0 +1,52 @@ + +cell_type_predictions = { + 'random_forest_models': expand(output + 'models/random-forest-{modality}-trained-on-{selection_procedure}-by-{annotator}.pkl', + modality = modalities, selection_procedure = selection_procedures, annotator = annotators), + 'scmap_cluster': expand(output + 'rare-subtype-benchmarking/scRNASeq-{selection_procedure}-annotation-{annotator}-scmap-cluster-predictions.tsv', + selection_procedure = selection_procedures, annotator = annotators), + 'scmap_sc': expand(output + 'rare-subtype-benchmarking/scRNASeq-{selection_procedure}-annotation-{annotator}-scmap-sc-predictions.tsv', + selection_procedure = selection_procedures, annotator = annotators), + 'singleR': expand(output + 'rare-subtype-benchmarking/scRNASeq-{selection_procedure}-annotation-{annotator}-singleR-predictions.tsv', + selection_procedure = selection_procedures, annotator = annotators) +} + +rule train_and_predict_scmap: + input: + annotation = 'data/scRNASeq/{selection_procedure}/{selection_procedure}-annotation-{annotator}.tsv', + train_data = 'data/scRNASeq/scRNASeq-train.rds', + test_data = 'data/scRNASeq/scRNASeq-test.rds' + output: + cluster_predictions = output + 'rare-subtype-benchmarking/scRNASeq-{selection_procedure}-annotation-{annotator}-scmap-cluster-predictions.tsv', + sc_predictions = output + 'rare-subtype-benchmarking/scRNASeq-{selection_procedure}-annotation-{annotator}-scmap-sc-predictions.tsv' + script: + 'cell-type-assignment/scmap.R' + +rule train_and_predict_singleR: + input: + annotation = 'data/scRNASeq/{selection_procedure}/{selection_procedure}-annotation-{annotator}.tsv', + train_data = 'data/scRNASeq/scRNASeq-train.rds', + test_data = 'data/scRNASeq/scRNASeq-test.rds' + output: + predictions = output + 'rare-subtype-benchmarking/scRNASeq-{selection_procedure}-annotation-{annotator}-singleR-predictions.tsv' + script: + 'cell-type-assignment/singleR.R' + +rule train_random_forest: + input: + train = 'data/{modality}/{modality}-expression-df-train.tsv', + annotation = 'data/{modality}/{selection_procedure}/{selection_procedure}-annotation-{annotator}.tsv' + output: + model = output + 'models/random-forest-{modality}-trained-on-{selection_procedure}-by-{annotator}.pkl' + log: + output + 'logs/cell-type-predictions/random-forest-{modality}-trained-on-{selection_procedure}-by-{annotator}.log' + script: + 'random-forest-train.py' + +rule predict_random_forest: + input: + test = 'data/{modality}/{modality}-expression-df-test.tsv', + model = output + 'models/random-forest-{modality}-trained-on-{selection_procedure}-by-{annotator}.pkl' + output: + predictions = output + 'rare-subtype-benchmarking/{modality}-{selection-procedure}-annotation-{annotator}-randomForest-predictions.tsv' + script: + 'random-forest-test.py' \ No newline at end of file diff --git a/pipeline/process-data.smk b/pipeline/process-data.smk new file mode 100644 index 0000000..5850d4f --- /dev/null +++ b/pipeline/process-data.smk @@ -0,0 +1,25 @@ + +process_data_output = { + 'train_test_split': expand('data/{modality}/{modality}-{split}.rds', modality = modalities, split = data_splits), + 'random_subset': expand('data/{modality}/random/random-{modality}-500-cells.rds', modality = modalities), + 'scRNA_expression_df': expand('data/{modality}/{modality}-expression-df-{split}.tsv', modality = modalities, split = data_splits) +} + +rule split_datasets: + input: + rds = 'data/scRNASeq/scRNASeq-full.rds' + output: + train = 'data/scRNASeq/scRNASeq-train.rds', + test = 'data/scRNASeq/scRNASeq-test.rds', + random_train = 'data/scRNASeq/random/random-scRNASeq-500-cells.rds', + script: + 'process-data/split-train-test.R' + + +rule process_data_for_random_forest: + input: + expression_sce = 'data/{modality}/{modality}-full.rds' + output: + expression_df = 'data/{modality}/{modality}-expression-df-{split}.tsv' + script: + 'process-data/Create-expression-df.R' diff --git a/pipeline/process-data/Create-expression-df.R b/pipeline/process-data/Create-expression-df.R new file mode 100644 index 0000000..f2c9b2c --- /dev/null +++ b/pipeline/process-data/Create-expression-df.R @@ -0,0 +1,14 @@ +library(tidyverse) +library(SingleCellExperiment) + +sce <- readRDS(snakemake@input[['expression_sce']]) + +expression <- assays(sce)$logcounts %>% + t() %>% as.matrix() %>% + as.data.frame() + +expression$cell_type <- sce[, rownames(expression)]$CellType +expression$cell_id <- rownames(expression) +rownames(expression) <- NULL + +write_tsv(expression, snakemake@output[['expression_df']]) \ No newline at end of file diff --git a/pipeline/process-data/split-train-test.R b/pipeline/process-data/split-train-test.R new file mode 100644 index 0000000..acd96ce --- /dev/null +++ b/pipeline/process-data/split-train-test.R @@ -0,0 +1,27 @@ +suppressPackageStartupMessages({ + library(tidyverse) + library(SingleCellExperiment) + library(caret) +}) + +sce <- readRDS(snakemake@input[['rds']]) + +set.seed(42) +train <- createDataPartition(sce$CellType, p = 0.5)$Resample1 + +train_sce <- sce[,train] +test_sce <- sce[,-train] + +saveRDS(train_sce, snakemake@output[['train']]) +saveRDS(test_sce, snakemake@output[['test']]) + +### random subset of 500 cells +cell_ids <- sample(1:ncol(train_sce), 500) +random_subset <- train_sce[,cell_ids] + +saveRDS(random_subset, snakemake@output[['random_train']]) + +test_labels <- tibble(cell_id = colnames(random_subset), + cell_type = random_subset$CellType) + +write_tsv(test_labels, 'data/scRNASeq/random/random-annotation-test.tsv') \ No newline at end of file diff --git a/pipeline/random-forest-train.py b/pipeline/random-forest-train.py new file mode 100644 index 0000000..5a41192 --- /dev/null +++ b/pipeline/random-forest-train.py @@ -0,0 +1,38 @@ +import numpy as np +import pandas as pd +import random + +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.ensemble import RandomForestClassifier +from sklearn.model_selection import GridSearchCV +from sklearn.decomposition import PCA +import joblib + + +# ## Data processing +expression = pd.read_csv(snakemake.input['train'], sep = "\t", header = 0) +X_train = expression.drop(['cell_type', 'cell_id'], axis = 1) +y_train = expression['cell_type'] + +# ## ML Pipeline start +RSEED = 42 + +steps = [('Scale', StandardScaler()), ('pca', PCA()), ('RandomForest', RandomForestClassifier())] +pipeline = Pipeline(steps) + +parameters = { + 'pca__n_components': [100, 450], + 'RandomForest__max_features': [2, 4, 6, 10], + 'RandomForest__n_estimators': [50, 100, 150] +} + +print("starting grid search") +grid = GridSearchCV(pipeline, param_grid=parameters, cv=10, return_train_score=True) + +grid.fit(X_train, y_train) + +joblib.dump(grid.best_estimator_, snakemake.output['model'], compress = 1) + +print(f"Training score: {grid.score(X_train, y_train)}") +print(f"The best parameters are: {grid.best_params_}") \ No newline at end of file diff --git a/utils b/utils new file mode 160000 index 0000000..19c2f8f --- /dev/null +++ b/utils @@ -0,0 +1 @@ +Subproject commit 19c2f8f524ab01d99a46e32989be385f5de20a8c