Skip to content

Commit

Permalink
initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
Michael-Geuenich committed Jul 12, 2021
0 parents commit 8d168ee
Show file tree
Hide file tree
Showing 16 changed files with 571 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
output/*
*.Rproj
data/*
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[submodule "utils"]
path = utils
url = https://github.com/tabdelaal/CyTOF-Linear-Classifier.git
11 changes: 11 additions & 0 deletions Pipfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"

[packages]

[dev-packages]

[requires]
python_version = "3.7"
20 changes: 20 additions & 0 deletions Snakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@

import pandas as pd
import numpy as np

# Load the pipeline settings; Snakemake exposes the parsed YAML as `config`.
configfile: 'config/config.yml'
# Versioned output directory prefix built from the configured `version`
# (config/config.yml sets `version: v1`, giving 'output/v1/').
output = 'output/' + config['version'] + '/'

# Value lists for the workflow. They are not referenced again in this file;
# presumably the included rule files below consume them -- TODO confirm.
selection_procedures = ['random']
annotators = ['test']
modalities = ['scRNASeq']
data_splits = ['train', 'test']

# Rule files are included after the variables above are assigned.
include: 'pipeline/process-data.smk'
include: 'pipeline/cell-type-predictions.smk'


# Default target rule. `process_data_output` and `cell_type_predictions` are
# not defined in this file; presumably they are dicts of target paths created
# by the included rule files (only `.values()` is used here) -- TODO confirm.
rule all:
input:
process_data_output.values(),
cell_type_predictions.values()
2 changes: 2 additions & 0 deletions config/config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@

version: v1
204 changes: 204 additions & 0 deletions markers/scRNA.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
cell_types:
CD4 T cell:
positive:
- CD3D
- CD3E
- CD3G
- TRAC
- CD4
- TCF7
- CD27
- IL7R
negative:
- CD8A
- CD8B
- GNLY
- NKG7
- CST7

Cytotoxic T cell:
positive:
- CD3D
- CD3E
- CD3G
- TRAC
- CD8A
- CD8B
- GZMK
- CCL5
- NKG7
negative:
- CD4
- FCER1G

B cell:
positive:
- CD19
- MS4A1
- CD79A
- CD79B
- MZB1
- IGHD
- IGHM

Natural Killer Cell:
positive:
- NCAM1
- NKG7
- KLRB1
- KLRD1
- KLRF1
- KLRC1
- KLRC2
- KLRC3
- KLRC4
- FCGR3A
- FCGR3B
- ITGAL
- ITGAM
- FCER1G
negative:
- CD3D
- CD3E
- CD3G
- CD14
- TRAC

# Marker definition for monocytes. Entries that were listed twice within the
# same list have been removed; each marker now appears once per list.
#
# NOTE(review): FCGR3A, FCGR3B, S100A8, S100A9, S100A12 and CD14 are listed as
# BOTH positive and negative markers below. This looks like a CD14 monocyte
# entry and a CD16 monocyte entry merged under one name -- confirm the intent
# and either split the entry or drop the contradictory markers.
CD14 monocyte:
  positive:
    - VCAN
    - FCN1
    - S100A8
    - S100A9
    - CD14
    - ITGAL
    - ITGAM
    - CSF3R
    - CSF1R
    - CX3CR1
    - TYROBP
    - LYZ
    - S100A12
    - FCGR3A
    - FCGR3B
    - CDKN1C
    - MS4A7
  negative:
    - FCGR3A
    - FCGR3B
    - CD3D
    - CD3E
    - CD3G
    - TRAC
    - NKG7
    - KLRB1
    - KLRD1
    - S100A8
    - S100A9
    - S100A12
    - CD14

Dendritic cell:
positive:
- HLA-DPB1
- HLA-DPA1
- HLA-DQA1
- ITGAX
- CD1C
- CD1E
- FCER1A
- CLEC10A
- FCGR2B
negative:
- CD3D
- CD3E
- CD3G
- NCAM1
- CD19
- CD14
- MS4A1
- CD79A
- CD79B

Plasmacytoid dendritic cell:
positive:
- IL3RA
- GZMB
- JCHAIN
- IRF7
- TCF4
- LILRA4
- CLEC4C
negative:
- ITGAX
- CD3D
- CD3E
- CD3G
- NCAM1
- CD19
- CD14
- MS4A1
- CD79A
- CD79B

Plasma cell:
positive:
- CD38
- XBP1
- CD27
- SLAMF7
- IGHA1
- IGHA2
- IGHG1
- IGHG2
- IGHG3
- IGHG4
negative:
- CD19
- MS4A1
- CD3D
- CD3E
- CD3G

Platelet:
positive:
- PF4
- PPBP
- GP5
- ITGA2B
- NRGN
- TUBB1
- SPARC
- RGS18
- MYL9
- GNG11

















60 changes: 60 additions & 0 deletions pipeline/cell-type-assignment/CyTOFLDA.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Train the CyTOF linear discriminant classifier on one half of a labelled
# 1000-cell subset and predict cell types for the other half.
#
# Reads:  data/training/zurich_subset1k.rds
#         data/zurich1_astir_assignments.csv
# Writes: a TSV of predicted cell types to `args$output_assignments`.
suppressPackageStartupMessages({
  library(scater)
  library(SingleCellExperiment)
  library(tidyverse)
  # NOTE(review): .gitmodules checks this repository out at `utils/`, not
  # `CyTOF-Linear-Classifier/` -- confirm which directory is intended.
  source(file.path("CyTOF-Linear-Classifier/", 'CyTOF_LDAtrain.R'))
  source(file.path("CyTOF-Linear-Classifier/", 'CyTOF_LDApredict.R'))
})
library(devtools)
devtools::load_all("../taproom/")

# `args` is used when writing the results but is never created in this script.
# Without this check the name resolves to the base function `args()` and the
# script only fails after training, with "object of type 'closure' is not
# subsettable". Fail early with a readable message instead.
if (!exists("args", mode = "list")) {
  stop(
    "`args` must be a list providing `annotator`, `cohort`, `method` and ",
    "`output_assignments`",
    call. = FALSE
  )
}

sce_train <- readRDS("data/training/zurich_subset1k.rds")
labs <- read_csv("data/zurich1_astir_assignments.csv")

# The cell identifiers sit in the unnamed first column. readr names that
# column `X1` before version 2.0 and `...1` from 2.0 on, so address it by
# position instead of by name.
cell_id <- labs[[1]]
labs[[1]] <- NULL
labs$cell_type <- get_celltypes(labs)
labs <- as.data.frame(labs)
rownames(labs) <- cell_id

labs <- select(labs, cell_type)

# Attach the label of each cell, matched through the cell identifiers.
sce_train$cell_type <- labs[colnames(sce_train), 'cell_type']

# First 500 cells are annotated by the model, cells 501-1000 train it.
sce_annotate <- sce_train[, 1:500]
sce_train <- sce_train[, 501:1000]

# Cells in rows, features in columns; the label is appended as the last
# column of the training table only.
data_train <- as.data.frame(t(logcounts(sce_train)))
data_train$cell_type <- sce_train$cell_type
data_annotate <- as.data.frame(t(logcounts(sce_annotate)))

# The classifier is pointed at directories, so each table goes in its own.
train_dir <- file.path(tempdir(), "train")
annotate_dir <- file.path(tempdir(), "annotate")

# showWarnings = FALSE keeps a re-run in the same R session quiet.
dir.create(train_dir, showWarnings = FALSE)
dir.create(annotate_dir, showWarnings = FALSE)

write.table(data_train, file = file.path(train_dir, "train.csv"),
            col.names = FALSE, row.names = FALSE, sep = ',')
write.table(data_annotate, file = file.path(annotate_dir, "annotate.csv"),
            col.names = FALSE, row.names = FALSE, sep = ',')

# Every feature column is used as a marker; the label is the last column of
# train.csv because `cell_type` was appended after the expression values.
LDA.Model <- CyTOF_LDAtrain(TrainingSamplesExt = train_dir,
                            TrainingLabelsExt = '',
                            mode = 'CSV',
                            RelevantMarkers = seq_len(nrow(sce_train)),
                            LabelIndex = ncol(data_train),
                            Transformation = FALSE)

predictions <- CyTOF_LDApredict(LDA.Model, TestingSamplesExt = annotate_dir,
                                mode = 'CSV', RejectionThreshold = 0)

# Flatten the classifier output into a single vector. The result is stored in
# lower-case `predictions`; `Predictions` was never defined.
predictions <- unlist(predictions)

df_output <- tibble(
  # `data_annotate` has no `cell_id` column; the identifiers are the column
  # names of the annotated object, in the same order as annotate.csv rows.
  cell_id = colnames(sce_annotate),
  cell_type = predictions,
  annotator = args$annotator,
  cohort = args$cohort,
  method = args$method
)

write_tsv(df_output, args$output_assignments)
8 changes: 8 additions & 0 deletions pipeline/cell-type-assignment/predict-random-forest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import numpy as np
import pandas as pd

import joblib


## Load everything
# `snakemake` is not defined in this file; presumably it is injected when the
# script is run through a Snakemake `script:` directive -- TODO confirm.
# The context manager closes the file handle once the model has been read.
with open(snakemake.input['model'], 'rb') as model_file:
    model = joblib.load(model_file)
Loading

0 comments on commit 8d168ee

Please sign in to comment.