diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..0e87b99
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+output/*
+*.Rproj
+data/*
\ No newline at end of file
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..f53bbf7
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "utils"]
+	path = utils
+	url = https://github.com/tabdelaal/CyTOF-Linear-Classifier.git
diff --git a/Pipfile b/Pipfile
new file mode 100644
index 0000000..b9ba84f
--- /dev/null
+++ b/Pipfile
@@ -0,0 +1,11 @@
+[[source]]
+url = "https://pypi.org/simple"
+verify_ssl = true
+name = "pypi"
+
+[packages]
+
+[dev-packages]
+
+[requires]
+python_version = "3.7"
diff --git a/Snakefile b/Snakefile
new file mode 100644
index 0000000..5ebe3d9
--- /dev/null
+++ b/Snakefile
@@ -0,0 +1,20 @@
+
+import pandas as pd
+import numpy as np
+
+configfile: 'config/config.yml'
+output = 'output/' + config['version'] + '/'
+
+selection_procedures = ['random']
+annotators = ['test']
+modalities = ['scRNASeq']
+data_splits = ['train', 'test']
+
+include: 'pipeline/process-data.smk'
+include: 'pipeline/cell-type-predictions.smk'
+
+
+rule all:
+    input:
+        process_data_output.values(),
+        cell_type_predictions.values()
\ No newline at end of file
diff --git a/config/config.yml b/config/config.yml
new file mode 100644
index 0000000..e1c90db
--- /dev/null
+++ b/config/config.yml
@@ -0,0 +1,2 @@
+
+version: v1
\ No newline at end of file
diff --git a/markers/scRNA.yml b/markers/scRNA.yml
new file mode 100644
index 0000000..02b52c9
--- /dev/null
+++ b/markers/scRNA.yml
@@ -0,0 +1,204 @@
+cell_types:
+    CD4 T cell:
+        positive:
+            - CD3D
+            - CD3E
+            - CD3G
+            - TRAC
+            - CD4
+            - TCF7
+            - CD27
+            - IL7R
+        negative:
+            - CD8A
+            - CD8B
+            - GNLY
+            - NKG7
+            - CST7
+        
+    Cytotoxic T cell:
+        positive:
+            - CD3D
+            - CD3E
+            - CD3G
+            - TRAC
+            - CD8A
+            - CD8B
+            - GZMK
+            - CCL5
+            - NKG7
+        negative:
+            - CD4
+            - FCER1G
+    
+    B cell:
+        positive:
+            - CD19
+            - MS4A1
+            - CD79A
+            - CD79B
+            - MZB1
+            - IGHD
+            - IGHM
+    
+    Natural Killer Cell:
+        positive:
+            - NCAM1
+            - NKG7
+            - KLRB1
+            - KLRD1
+            - KLRF1
+            - KLRC1
+            - KLRC2
+            - KLRC3
+            - KLRC4
+            - FCGR3A
+            - FCGR3B
+            - ITGAL
+            - ITGAM
+            - FCER1G
+        negative:
+            - CD3D
+            - CD3E
+            - CD3G
+            - CD14
+            - TRAC
+
+    CD14 monocyte:
+        positive:
+            - VCAN
+            - FCN1
+            - S100A8
+            - S100A9
+            - CD14
+            - ITGAL
+            - ITGAM
+            - CSF3R
+            - CSF1R
+            - CX3CR1
+            - TYROBP
+            - LYZ
+            - S100A12
+            - FCN1
+            - FCGR3A
+            - FCGR3B
+            - ITGAL
+            - ITGAM
+            - CSF3R
+            - CSF1R
+            - CX3CR1
+            - CDKN1C
+            - MS4A7
+        negative:
+            - FCGR3A
+            - FCGR3B
+            - CD3D
+            - CD3E
+            - CD3G
+            - TRAC
+            - NKG7
+            - KLRB1
+            - KLRD1
+            - S100A8
+            - S100A9
+            - S100A12
+            - CD14
+            - CD3D
+            - CD3E
+            - CD3G
+            - TRAC
+            - NKG7
+            - KLRB1
+            - KLRD1
+
+    Dendritic cell:
+      positive:
+        - HLA-DPB1
+        - HLA-DPA1
+        - HLA-DQA1
+        - ITGAX
+        - CD1C
+        - CD1E
+        - FCER1A
+        - CLEC10A
+        - FCGR2B
+      negative:
+        - CD3D
+        - CD3E
+        - CD3G
+        - NCAM1
+        - CD19
+        - CD14
+        - MS4A1
+        - CD79A
+        - CD79B
+
+    Plasmacytoid dendritic cell:
+      positive:
+        - IL3RA
+        - GZMB
+        - JCHAIN
+        - IRF7
+        - TCF4
+        - LILRA4
+        - CLEC4C
+      negative:
+        - ITGAX
+        - CD3D
+        - CD3E
+        - CD3G
+        - NCAM1
+        - CD19
+        - CD14
+        - MS4A1
+        - CD79A
+        - CD79B
+
+    Plasma cell:
+      positive:
+        - CD38
+        - XBP1
+        - CD27
+        - SLAMF7
+        - IGHA1
+        - IGHA2
+        - IGHG1
+        - IGHG2
+        - IGHG3
+        - IGHG4
+      negative:
+        - CD19
+        - MS4A1
+        - CD3D
+        - CD3E
+        - CD3G
+
+    Platelet:
+      positive:
+        - PF4
+        - PPBP
+        - GP5
+        - ITGA2B
+        - NRGN
+        - TUBB1
+        - SPARC
+        - RGS18
+        - MYL9
+        - GNG11
+
+
+
+
+
+
+
+
+
+
+
+    
+
+
+
+
+
diff --git a/pipeline/cell-type-assignment/CyTOFLDA.R b/pipeline/cell-type-assignment/CyTOFLDA.R
new file mode 100644
index 0000000..022bb2e
--- /dev/null
+++ b/pipeline/cell-type-assignment/CyTOFLDA.R
@@ -0,0 +1,60 @@
+suppressPackageStartupMessages({
+  library(scater)
+  library(SingleCellExperiment)
+  library(tidyverse)
+  source(file.path("CyTOF-Linear-Classifier/", 'CyTOF_LDAtrain.R'))
+  source(file.path("CyTOF-Linear-Classifier/", 'CyTOF_LDApredict.R'))
+})
+library(devtools)
+devtools::load_all("../taproom/")
+
+sce_train <- readRDS("data/training/zurich_subset1k.rds")
+labs <- read_csv("data/zurich1_astir_assignments.csv")
+cell_id <- labs$X1
+labs$X1 <- NULL
+labs$cell_type <- get_celltypes(labs)
+labs <- as.data.frame(labs)
+rownames(labs) <- cell_id
+
+
+labs <- select(labs, cell_type)
+
+sce_train$cell_type <- labs[colnames(sce_train), 'cell_type']
+
+sce_annotate <- sce_train[,1:500]
+sce_train <- sce_train[,501:1000]
+
+
+data_train <- as.data.frame(t(logcounts(sce_train)))
+data_train$cell_type <- sce_train$cell_type
+data_annotate <- as.data.frame(t(logcounts(sce_annotate)))
+
+train_dir <- file.path(tempdir(), "train")
+annotate_dir <- file.path(tempdir(), "annotate")
+
+dir.create(train_dir)
+dir.create(annotate_dir)
+
+write.table(data_train, file = file.path(train_dir, "train.csv"), 
+            col.names = FALSE, row.names = FALSE, sep = ',')
+write.table(data_annotate, file = file.path(annotate_dir, "annotate.csv"),
+            col.names = FALSE, row.names = FALSE, sep = ',')
+
+LDA.Model <- CyTOF_LDAtrain(TrainingSamplesExt = train_dir, TrainingLabelsExt = '',
+                            mode = 'CSV', RelevantMarkers = seq_len(nrow(sce_train)),
+                            LabelIndex = ncol(data_train), Transformation = FALSE)
+
+predictions <- CyTOF_LDApredict(LDA.Model, TestingSamplesExt = annotate_dir,
+                                mode = 'CSV', RejectionThreshold = 0)
+
+predictions <- unlist(Predictions)
+
+df_output <- tibble(
+  cell_id = data_annotate$cell_id,
+  cell_type = Predictions,
+  annotator = args$annotator,
+  cohort = args$cohort,
+  method = args$method
+)
+
+write_tsv(df_output, args$output_assignments)
\ No newline at end of file
diff --git a/pipeline/cell-type-assignment/predict-random-forest.py b/pipeline/cell-type-assignment/predict-random-forest.py
new file mode 100644
index 0000000..437b085
--- /dev/null
+++ b/pipeline/cell-type-assignment/predict-random-forest.py
@@ -0,0 +1,8 @@
+import numpy as np
+import pandas as pd 
+
+import joblib
+
+
+## Load everything 
+model = joblib.load(open(snakemake.input['model'], 'rb'))
\ No newline at end of file
diff --git a/pipeline/cell-type-assignment/scmap.R b/pipeline/cell-type-assignment/scmap.R
new file mode 100644
index 0000000..b900579
--- /dev/null
+++ b/pipeline/cell-type-assignment/scmap.R
@@ -0,0 +1,76 @@
+suppressPackageStartupMessages({
+  library(SingleCellExperiment)
+  library(scmap)
+  library(tidyverse)
+})
+
+# Read in training data expression
+train_sce <- readRDS(snakemake@input[['train_data']])
+
+# Read in annotated labels
+annotations <- read_tsv(snakemake@input[['annotation']]) %>% 
+  as.data.frame() %>% 
+  column_to_rownames('cell_id')
+
+# Add labels to expression
+train_sce$cell_type1 <- annotations[colnames(train_sce),]
+
+# Process data
+rowData(train_sce)$feature_symbol <- rownames(train_sce)
+train_sce <- selectFeatures(train_sce, suppress_plot = TRUE)
+
+# Read in test set
+annotate_sce <- readRDS(snakemake@input[['test_data']])
+rowData(annotate_sce)$feature_symbol <- rownames(annotate_sce)
+
+### Start training
+train_sce <- indexCluster(train_sce)
+
+
+# Map clusters
+scmapCluster_results <- scmapCluster(
+  projection = annotate_sce, 
+  index_list = list(
+    yan = metadata(train_sce)$scmap_cluster_index
+  )
+)
+
+clustering_prediction <- tibble(cell_id = colnames(annotate_sce),
+                                predicted_cell_type = scmapCluster_results$combined_labs,
+                                prediction_params = 'clusters',
+                                selection_procedure = snakemake@wildcards[['selection_procedure']],
+                                training_annotator = snakemake@wildcards[['annotator']],
+                                modality = 'scRNASeq')
+
+write_tsv(clustering_prediction, snakemake@output[['cluster_predictions']])
+
+
+### Cell level
+set.seed(1)
+
+train_sce <- indexCell(train_sce)
+
+scmapCell_results <- scmapCell(
+  annotate_sce, 
+  list(
+    yan = metadata(train_sce)$scmap_cell_index
+  )
+)
+
+
+scmapCell_clusters <- scmapCell2Cluster(
+  scmapCell_results, 
+  list(
+    as.character(colData(train_sce)$cell_type1)
+  )
+)
+
+sc_prediction <- tibble(cell_id = colnames(annotate_sce),
+                        predicted_cell_type = scmapCell_clusters$scmap_cluster_labs,
+                        prediction_params = 'single-cell',
+                        selection_procedure = snakemake@wildcards[['selection_procedure']],
+                        training_annotator = snakemake@wildcards[['annotator']],
+                        modality = 'scRNASeq')
+
+write_tsv(sc_prediction, snakemake@output[['sc_predictions']])
+
diff --git a/pipeline/cell-type-assignment/singleR.R b/pipeline/cell-type-assignment/singleR.R
new file mode 100644
index 0000000..18e04f3
--- /dev/null
+++ b/pipeline/cell-type-assignment/singleR.R
@@ -0,0 +1,27 @@
+suppressPackageStartupMessages({
+  library(SingleR)
+  library(tidyverse)
+})
+
+train_sce <- readRDS(snakemake@input[['train_data']])
+assays(train_sce)$counts <- NULL
+
+train_labels <- read_tsv(snakemake@input[['annotation']]) %>% 
+  as.data.frame() %>% 
+  column_to_rownames('cell_id')
+
+train_sce$cell_type <- train_labels[colnames(train_sce),]
+
+annotate_sce <- readRDS(snakemake@input[['test_data']])
+assays(annotate_sce)$counts <- NULL
+
+pred <- SingleR(annotate_sce, train_sce, labels = train_sce$CellType)
+
+result <- tibble(cell_id = rownames(pred),
+                 predicted_cell_type = pred$labels,
+                 prediction_params = 'labels',
+                 selection_procedure = snakemake@wildcards[['selection_procedure']],
+                 training_annotator = snakemake@wildcards[['annotator']],
+                 modality = 'scRNASeq')
+
+write_tsv(result, snakemake@output[['predictions']])
\ No newline at end of file
diff --git a/pipeline/cell-type-predictions.smk b/pipeline/cell-type-predictions.smk
new file mode 100644
index 0000000..83c42b2
--- /dev/null
+++ b/pipeline/cell-type-predictions.smk
@@ -0,0 +1,52 @@
+
+cell_type_predictions = {
+    'random_forest_models': expand(output + 'models/random-forest-{modality}-trained-on-{selection_procedure}-by-{annotator}.pkl', 
+        modality = modalities, selection_procedure = selection_procedures, annotator = annotators),
+    'scmap_cluster': expand(output + 'rare-subtype-benchmarking/scRNASeq-{selection_procedure}-annotation-{annotator}-scmap-cluster-predictions.tsv',
+        selection_procedure = selection_procedures, annotator = annotators),
+    'scmap_sc': expand(output + 'rare-subtype-benchmarking/scRNASeq-{selection_procedure}-annotation-{annotator}-scmap-sc-predictions.tsv',
+        selection_procedure = selection_procedures, annotator = annotators),
+    'singleR': expand(output + 'rare-subtype-benchmarking/scRNASeq-{selection_procedure}-annotation-{annotator}-singleR-predictions.tsv',
+        selection_procedure = selection_procedures, annotator = annotators)
+}
+
+rule train_and_predict_scmap:
+    input:
+        annotation = 'data/scRNASeq/{selection_procedure}/{selection_procedure}-annotation-{annotator}.tsv',
+        train_data = 'data/scRNASeq/scRNASeq-train.rds',
+        test_data = 'data/scRNASeq/scRNASeq-test.rds'
+    output:
+        cluster_predictions = output + 'rare-subtype-benchmarking/scRNASeq-{selection_procedure}-annotation-{annotator}-scmap-cluster-predictions.tsv',
+        sc_predictions = output + 'rare-subtype-benchmarking/scRNASeq-{selection_procedure}-annotation-{annotator}-scmap-sc-predictions.tsv'
+    script:
+        'cell-type-assignment/scmap.R'
+
+rule train_and_predict_singleR:
+    input:
+        annotation = 'data/scRNASeq/{selection_procedure}/{selection_procedure}-annotation-{annotator}.tsv',
+        train_data = 'data/scRNASeq/scRNASeq-train.rds',
+        test_data = 'data/scRNASeq/scRNASeq-test.rds'
+    output:
+        predictions = output + 'rare-subtype-benchmarking/scRNASeq-{selection_procedure}-annotation-{annotator}-singleR-predictions.tsv'
+    script:
+        'cell-type-assignment/singleR.R'
+
+rule train_random_forest:
+    input:
+        train = 'data/{modality}/{modality}-expression-df-train.tsv',
+        annotation = 'data/{modality}/{selection_procedure}/{selection_procedure}-annotation-{annotator}.tsv'
+    output:
+        model = output + 'models/random-forest-{modality}-trained-on-{selection_procedure}-by-{annotator}.pkl'
+    log:
+        output + 'logs/cell-type-predictions/random-forest-{modality}-trained-on-{selection_procedure}-by-{annotator}.log'
+    script:
+        'random-forest-train.py'
+
+rule predict_random_forest:
+    input:
+        test = 'data/{modality}/{modality}-expression-df-test.tsv',
+        model = output + 'models/random-forest-{modality}-trained-on-{selection_procedure}-by-{annotator}.pkl'
+    output:
+        predictions = output + 'rare-subtype-benchmarking/{modality}-{selection-procedure}-annotation-{annotator}-randomForest-predictions.tsv'
+    script:
+        'random-forest-test.py'
\ No newline at end of file
diff --git a/pipeline/process-data.smk b/pipeline/process-data.smk
new file mode 100644
index 0000000..5850d4f
--- /dev/null
+++ b/pipeline/process-data.smk
@@ -0,0 +1,25 @@
+
+process_data_output = {
+    'train_test_split': expand('data/{modality}/{modality}-{split}.rds', modality = modalities, split = data_splits),
+    'random_subset': expand('data/{modality}/random/random-{modality}-500-cells.rds', modality = modalities),
+    'scRNA_expression_df': expand('data/{modality}/{modality}-expression-df-{split}.tsv', modality = modalities, split = data_splits)
+}
+
+rule split_datasets:
+    input:
+        rds = 'data/scRNASeq/scRNASeq-full.rds'
+    output:
+        train = 'data/scRNASeq/scRNASeq-train.rds',
+        test = 'data/scRNASeq/scRNASeq-test.rds',
+        random_train = 'data/scRNASeq/random/random-scRNASeq-500-cells.rds',
+    script:
+        'process-data/split-train-test.R'
+
+
+rule process_data_for_random_forest:
+    input:
+        expression_sce = 'data/{modality}/{modality}-full.rds'
+    output:
+        expression_df = 'data/{modality}/{modality}-expression-df-{split}.tsv'
+    script:
+        'process-data/Create-expression-df.R'
diff --git a/pipeline/process-data/Create-expression-df.R b/pipeline/process-data/Create-expression-df.R
new file mode 100644
index 0000000..f2c9b2c
--- /dev/null
+++ b/pipeline/process-data/Create-expression-df.R
@@ -0,0 +1,14 @@
+library(tidyverse)
+library(SingleCellExperiment)
+
+sce <- readRDS(snakemake@input[['expression_sce']])
+
+expression <- assays(sce)$logcounts %>% 
+  t() %>% as.matrix() %>% 
+  as.data.frame()
+
+expression$cell_type <- sce[, rownames(expression)]$CellType
+expression$cell_id <- rownames(expression)
+rownames(expression) <- NULL
+
+write_tsv(expression, snakemake@output[['expression_df']])
\ No newline at end of file
diff --git a/pipeline/process-data/split-train-test.R b/pipeline/process-data/split-train-test.R
new file mode 100644
index 0000000..acd96ce
--- /dev/null
+++ b/pipeline/process-data/split-train-test.R
@@ -0,0 +1,27 @@
+suppressPackageStartupMessages({
+  library(tidyverse)
+  library(SingleCellExperiment)
+  library(caret)
+})
+
+sce <- readRDS(snakemake@input[['rds']])
+
+set.seed(42)
+train <- createDataPartition(sce$CellType, p = 0.5)$Resample1
+
+train_sce <- sce[,train]
+test_sce <- sce[,-train]
+
+saveRDS(train_sce, snakemake@output[['train']])
+saveRDS(test_sce, snakemake@output[['test']])
+
+### random subset of 500 cells
+cell_ids <- sample(1:ncol(train_sce), 500)
+random_subset <- train_sce[,cell_ids]
+
+saveRDS(random_subset, snakemake@output[['random_train']])
+
+test_labels <- tibble(cell_id = colnames(random_subset), 
+                      cell_type = random_subset$CellType)
+
+write_tsv(test_labels, 'data/scRNASeq/random/random-annotation-test.tsv')
\ No newline at end of file
diff --git a/pipeline/random-forest-train.py b/pipeline/random-forest-train.py
new file mode 100644
index 0000000..5a41192
--- /dev/null
+++ b/pipeline/random-forest-train.py
@@ -0,0 +1,38 @@
+import numpy as np
+import pandas as pd
+import random
+
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import StandardScaler
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.model_selection import GridSearchCV
+from sklearn.decomposition import PCA
+import joblib
+
+
+# ## Data processing
+expression = pd.read_csv(snakemake.input['train'], sep = "\t", header = 0)
+X_train = expression.drop(['cell_type', 'cell_id'], axis = 1)
+y_train = expression['cell_type']
+
+# ## ML Pipeline start
+RSEED = 42
+
+steps = [('Scale', StandardScaler()), ('pca', PCA()), ('RandomForest', RandomForestClassifier())]
+pipeline = Pipeline(steps)
+
+parameters = {
+    'pca__n_components': [100, 450],
+    'RandomForest__max_features': [2, 4, 6, 10],
+    'RandomForest__n_estimators': [50, 100, 150]
+}
+
+print("starting grid search")
+grid = GridSearchCV(pipeline, param_grid=parameters, cv=10, return_train_score=True)
+
+grid.fit(X_train, y_train)
+
+joblib.dump(grid.best_estimator_, snakemake.output['model'], compress = 1)
+
+print(f"Training score: {grid.score(X_train, y_train)}")
+print(f"The best parameters are: {grid.best_params_}")
\ No newline at end of file
diff --git a/utils b/utils
new file mode 160000
index 0000000..19c2f8f
--- /dev/null
+++ b/utils
@@ -0,0 +1 @@
+Subproject commit 19c2f8f524ab01d99a46e32989be385f5de20a8c