initial commit

camlab-bioml · Jul 12, 2021 · 8d168ee · 8d168ee
commit 8d168ee
Show file tree

Hide file tree

Showing 16 changed files with 571 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,3 @@
+output/*
+*.Rproj
+data/*
diff --git a/.gitmodules b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "utils"]
+	path = utils
+	url = https://github.com/tabdelaal/CyTOF-Linear-Classifier.git
diff --git a/Pipfile b/Pipfile
@@ -0,0 +1,11 @@
+[[source]]
+url = "https://pypi.org/simple"
+verify_ssl = true
+name = "pypi"
+
+[packages]
+
+[dev-packages]
+
+[requires]
+python_version = "3.7"
diff --git a/Snakefile b/Snakefile
@@ -0,0 +1,20 @@
+
+import pandas as pd
+import numpy as np
+
+configfile: 'config/config.yml'  # populates the `config` mapping read on the next line
+output = 'output/' + config['version'] + '/'  # output root, i.e. 'output/<version>/'
+
+selection_procedures = ['random']  # NOTE(review): these four lists are not used in this file; presumably consumed by the included .smk files - confirm
+annotators = ['test']
+modalities = ['scRNASeq']
+data_splits = ['train', 'test']
+
+include: 'pipeline/process-data.smk'  # included after the globals above are defined
+include: 'pipeline/cell-type-predictions.smk'
+
+
+rule all:  # first rule of the top-level Snakefile; requests every path held in the two dicts below
+    input:
+        process_data_output.values(),  # not defined in this file; presumably set in pipeline/process-data.smk - confirm
+        cell_type_predictions.values()  # not defined in this file; presumably set in pipeline/cell-type-predictions.smk - confirm
diff --git a/config/config.yml b/config/config.yml
@@ -0,0 +1,2 @@
+
+version: v1
diff --git a/markers/scRNA.yml b/markers/scRNA.yml
@@ -0,0 +1,204 @@
+cell_types:
+    CD4 T cell:
+        positive:
+            - CD3D
+            - CD3E
+            - CD3G
+            - TRAC
+            - CD4
+            - TCF7
+            - CD27
+            - IL7R
+        negative:
+            - CD8A
+            - CD8B
+            - GNLY
+            - NKG7
+            - CST7
+
+    Cytotoxic T cell:
+        positive:
+            - CD3D
+            - CD3E
+            - CD3G
+            - TRAC
+            - CD8A
+            - CD8B
+            - GZMK
+            - CCL5
+            - NKG7
+        negative:
+            - CD4
+            - FCER1G
+
+    B cell:
+        positive:
+            - CD19
+            - MS4A1
+            - CD79A
+            - CD79B
+            - MZB1
+            - IGHD
+            - IGHM
+
+    Natural Killer Cell:
+        positive:
+            - NCAM1
+            - NKG7
+            - KLRB1
+            - KLRD1
+            - KLRF1
+            - KLRC1
+            - KLRC2
+            - KLRC3
+            - KLRC4
+            - FCGR3A
+            - FCGR3B
+            - ITGAL
+            - ITGAM
+            - FCER1G
+        negative:
+            - CD3D
+            - CD3E
+            - CD3G
+            - CD14
+            - TRAC
+
+    CD14 monocyte:  # NOTE(review): the lists below look like two marker sets merged into one entry (a second monocyte subtype header may be missing) - confirm
+        positive:  # FCN1, ITGAL, ITGAM, CSF3R, CSF1R and CX3CR1 each appear twice in this list
+            - VCAN
+            - FCN1
+            - S100A8
+            - S100A9
+            - CD14
+            - ITGAL
+            - ITGAM
+            - CSF3R
+            - CSF1R
+            - CX3CR1
+            - TYROBP
+            - LYZ
+            - S100A12
+            - FCN1
+            - FCGR3A
+            - FCGR3B
+            - ITGAL
+            - ITGAM
+            - CSF3R
+            - CSF1R
+            - CX3CR1
+            - CDKN1C
+            - MS4A7
+        negative:  # FCGR3A, FCGR3B, S100A8, S100A9, S100A12 and CD14 are also listed as positive above; CD3D..KLRD1 appear twice here
+            - FCGR3A
+            - FCGR3B
+            - CD3D
+            - CD3E
+            - CD3G
+            - TRAC
+            - NKG7
+            - KLRB1
+            - KLRD1
+            - S100A8
+            - S100A9
+            - S100A12
+            - CD14
+            - CD3D
+            - CD3E
+            - CD3G
+            - TRAC
+            - NKG7
+            - KLRB1
+            - KLRD1
+
+    Dendritic cell:
+      positive:
+        - HLA-DPB1
+        - HLA-DPA1
+        - HLA-DQA1
+        - ITGAX
+        - CD1C
+        - CD1E
+        - FCER1A
+        - CLEC10A
+        - FCGR2B
+      negative:
+        - CD3D
+        - CD3E
+        - CD3G
+        - NCAM1
+        - CD19
+        - CD14
+        - MS4A1
+        - CD79A
+        - CD79B
+
+    Plasmacytoid dendritic cell:
+      positive:
+        - IL3RA
+        - GZMB
+        - JCHAIN
+        - IRF7
+        - TCF4
+        - LILRA4
+        - CLEC4C
+      negative:
+        - ITGAX
+        - CD3D
+        - CD3E
+        - CD3G
+        - NCAM1
+        - CD19
+        - CD14
+        - MS4A1
+        - CD79A
+        - CD79B
+
+    Plasma cell:
+      positive:
+        - CD38
+        - XBP1
+        - CD27
+        - SLAMF7
+        - IGHA1
+        - IGHA2
+        - IGHG1
+        - IGHG2
+        - IGHG3
+        - IGHG4
+      negative:
+        - CD19
+        - MS4A1
+        - CD3D
+        - CD3E
+        - CD3G
+
+    Platelet:
+      positive:
+        - PF4
+        - PPBP
+        - GP5
+        - ITGA2B
+        - NRGN
+        - TUBB1
+        - SPARC
+        - RGS18
+        - MYL9
+        - GNG11
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/pipeline/cell-type-assignment/CyTOFLDA.R b/pipeline/cell-type-assignment/CyTOFLDA.R
@@ -0,0 +1,60 @@
+# Train the CyTOF linear discriminant classifier (CyTOF_LDAtrain /
+# CyTOF_LDApredict) on one half of a labelled 1000-cell subset, predict cell
+# types for the other half, and write the assignments as a TSV.
+#
+# NOTE(review): the original referenced `args` without ever defining it. The
+# `--name value` command-line convention used here is an assumption; align it
+# with however the pipeline actually invokes this script. Expected call:
+#   Rscript CyTOFLDA.R --annotator A --cohort C --method M \
+#     --output_assignments out.tsv
+suppressPackageStartupMessages({
+  library(scater)
+  library(SingleCellExperiment)
+  library(tidyverse)
+  # .gitmodules checks the CyTOF-Linear-Classifier submodule out at `utils/`,
+  # so the classifier sources live there (paths are relative to the repo root,
+  # like the `data/` paths below).
+  source(file.path("utils", "CyTOF_LDAtrain.R"))
+  source(file.path("utils", "CyTOF_LDApredict.R"))
+})
+library(devtools)
+devtools::load_all("../taproom/")
+
+# Turn a flat `--name value --name value ...` vector into a named list and
+# fail early, listing every required name that was not supplied.
+parse_cli_args <- function(argv, required) {
+  if (length(argv) %% 2L != 0L) {
+    stop("Arguments must be given as `--name value` pairs.", call. = FALSE)
+  }
+  value_idx <- seq_len(length(argv) %/% 2L) * 2L
+  parsed <- as.list(argv[value_idx])
+  names(parsed) <- sub("^--", "", argv[value_idx - 1L])
+  absent <- setdiff(required, names(parsed))
+  if (length(absent) > 0L) {
+    stop("Missing required argument(s): ",
+         paste0("--", absent, collapse = ", "), call. = FALSE)
+  }
+  parsed
+}
+
+args <- parse_cli_args(
+  commandArgs(trailingOnly = TRUE),
+  required = c("annotator", "cohort", "method", "output_assignments")
+)
+
+sce_train <- readRDS("data/training/zurich_subset1k.rds")
+labs <- read_csv("data/zurich1_astir_assignments.csv")
+
+# The unnamed leading id column is called `X1` by readr < 2.0 and `...1` by
+# readr >= 2.0; taking it by position works with both.
+cell_id <- labs[[1]]
+labs <- labs[-1]
+labs$cell_type <- get_celltypes(labs)
+labs <- as.data.frame(labs)
+rownames(labs) <- cell_id
+
+# Look labels up by cell id so they follow the column order of the SCE.
+sce_train$cell_type <- labs[colnames(sce_train), "cell_type"]
+
+# The fixed 500/500 split below needs the full 1000-cell subset.
+stopifnot(ncol(sce_train) >= 1000L)
+sce_annotate <- sce_train[, 1:500]
+sce_train <- sce_train[, 501:1000]
+
+# Cells become rows and markers become columns; the label goes in the last
+# column so that `LabelIndex = ncol(data_train)` points at it.
+data_train <- as.data.frame(t(logcounts(sce_train)))
+data_train$cell_type <- sce_train$cell_type
+data_annotate <- as.data.frame(t(logcounts(sce_annotate)))
+
+# The classifier functions take directories of CSV files, so each table is
+# written to its own folder under the session temp dir.
+train_dir <- file.path(tempdir(), "train")
+annotate_dir <- file.path(tempdir(), "annotate")
+
+dir.create(train_dir, showWarnings = FALSE)
+dir.create(annotate_dir, showWarnings = FALSE)
+
+write.table(data_train, file = file.path(train_dir, "train.csv"),
+            col.names = FALSE, row.names = FALSE, sep = ",")
+write.table(data_annotate, file = file.path(annotate_dir, "annotate.csv"),
+            col.names = FALSE, row.names = FALSE, sep = ",")
+
+lda_model <- CyTOF_LDAtrain(TrainingSamplesExt = train_dir,
+                            TrainingLabelsExt = "",
+                            mode = "CSV",
+                            RelevantMarkers = seq_len(nrow(sce_train)),
+                            LabelIndex = ncol(data_train),
+                            Transformation = FALSE)
+
+predictions <- CyTOF_LDApredict(lda_model, TestingSamplesExt = annotate_dir,
+                                mode = "CSV", RejectionThreshold = 0)
+
+# The original read an undefined `Predictions` here and in the tibble below.
+predictions <- unlist(predictions)
+stopifnot(length(predictions) == ncol(sce_annotate))
+
+df_output <- tibble(
+  # `data_annotate` has no `cell_id` column (the ids are its row names), so
+  # the original silently produced a table without ids.
+  cell_id = colnames(sce_annotate),
+  cell_type = predictions,
+  annotator = args$annotator,
+  cohort = args$cohort,
+  method = args$method
+)
+
+write_tsv(df_output, args$output_assignments)
diff --git a/pipeline/cell-type-assignment/predict-random-forest.py b/pipeline/cell-type-assignment/predict-random-forest.py
@@ -0,0 +1,8 @@
+import numpy as np
+import pandas as pd 
+
+import joblib
+
+
+## Load everything 
+model = joblib.load(open(snakemake.input['model'], 'rb'))