Skip to content

Commit

Permalink
cell type assignment
Browse files Browse the repository at this point in the history
  • Loading branch information
Michael-Geuenich committed May 29, 2023
1 parent cb7eda8 commit 33a3b3b
Show file tree
Hide file tree
Showing 10 changed files with 157 additions and 160 deletions.
4 changes: 4 additions & 0 deletions pipeline/cell-type-assignment/CyTOFLDA.R
Original file line number Diff line number Diff line change
Expand Up @@ -72,4 +72,8 @@ if(is.null(snakemake@wildcards[['cell_selection']])){
df_output$cell_selection <- snakemake@wildcards[['cell_selection']]
}

# Tag every prediction row with the `cell_selection` wildcard when the rule
# defines one. `[[` is used rather than `$` because `$` partially matches list
# names, so a different wildcard whose name merely starts with
# "cell_selection" could be picked up by mistake; `[[` matches the name exactly
# and yields NULL when the wildcard is absent.
if(!is.null(snakemake@wildcards[['cell_selection']])){
  df_output$pred_cells <- snakemake@wildcards[['cell_selection']]
}

write_tsv(df_output, snakemake@output[['prediction']])
35 changes: 0 additions & 35 deletions pipeline/cell-type-assignment/Seurat.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ sce <- readRDS(snakemake@input[['training_rds']])
# Marker definitions: the `cell_types` entry of the markers YAML file.
markers <- read_yaml(snakemake@input[['markers']])$cell_types
# Flatten the nested marker lists into one vector and drop duplicates.
unique_markers <- unique(unlist(markers))


### [PROCESS DATA] ###
if(snakemake@params[['mod']] == 'scRNASeq' | snakemake@params[['mod']] == 'snRNASeq'){
# Normalize scRNASeq data
Expand Down Expand Up @@ -101,40 +100,6 @@ assignments %>%
select(cell_id, predicted_cell_type, prediction_params, cluster, modality) %>%
write_tsv(snakemake@output[['assignments']])

### Create diagnostic plots
# Positive markers
# lapply(1:length(markers), function(x){
# cell_types <- names(markers[x]) %>%
# gsub(" ", "-", .)

# out_path <- snakemake@params[['positive_markers_diagnostic']] %>%
# gsub("\\[", "{", .) %>%
# gsub("\\]", "}", .)
# out <- glue(out_path)

# pdf(out, width = 12)
# print(FeaturePlot(seu, features = markers[[x]]$positive))
# dev.off()
# })

# # Negative markers
# lapply(1:length(markers), function(x){
# cell_types <- names(markers[x]) %>%
# gsub(" ", "-", .)

# out_path <- snakemake@params[['negative_markers_diagnostic']] %>%
# gsub("\\[", "{", .) %>%
# gsub("\\]", "}", .)
# out <- glue(out_path)

# if(!is.null(markers[[x]]$negative)){
# pdf(out, width = 12)
# print(FeaturePlot(seu, features = markers[[x]]$negative))
# dev.off()
# }
# })


### Cell type umap
Idents(seu) <- assignments$predicted_cell_type

Expand Down
12 changes: 8 additions & 4 deletions pipeline/cell-type-assignment/active-learning-accuracy.R
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ if(grepl("maxp", snakemake@wildcards$strat)){
# Build the working table from `sce` via load_scs() (helper defined outside
# this view) and attach the bookkeeping columns used below.
df_expression <- load_scs(sce)
# Assigned label; starts out NA for every row.
df_expression$cell_type <- NA
# Both columns are initialised from the same sce$CellType values.
# NOTE(review): presumably gt_cell_type is overwritten for corrupted cells later
# in the script while non_corrupted_gt_cell_type keeps the original labels --
# confirm against the code outside this hunk.
df_expression$gt_cell_type <- sce$CellType
df_expression$non_corrupted_gt_cell_type <- sce$CellType
# Filled in later; NA until then.
df_expression$iteration <- NA
df_expression$corrupted <- NA

Expand Down Expand Up @@ -63,6 +64,7 @@ df_PCA <- bind_cols(
df_PCA[,1:min(20, ncol(df_PCA))],
tibble(cell_type = df_expression$cell_type,
gt_cell_type = df_expression$gt_cell_type,
non_corrupted_gt_cell_type = df_expression$non_corrupted_gt_cell_type,
iteration = df_expression$iteration)
)

Expand All @@ -74,24 +76,26 @@ f1_scores <- tibble(
)

for(i in 1:iters){
print(i)
annotated_cells <- df_PCA %>%
filter(!is.na(cell_type)) %>%
filter(cell_type != "Skipped", cell_type != "Unclear")
print(dim(annotated_cells))

left_cells <- df_PCA %>%
filter(is.na(cell_type))

ModelFit <- fit_AL_classifier(select(annotated_cells, -gt_cell_type, -iteration),
ModelFit <- fit_AL_classifier(select(annotated_cells, -gt_cell_type, -iteration, -non_corrupted_gt_cell_type),
snakemake@wildcards$AL_alg)

### Calculate F1-score
predicted_scores <- predict(ModelFit,
select(left_cells, -X1, -cell_type, -gt_cell_type, -iteration),
select(left_cells, -X1, -cell_type, -gt_cell_type, -iteration, -non_corrupted_gt_cell_type),
type = "raw")

preds <- tibble(
cell_id = left_cells$X1,
annotated_cell_type = left_cells$gt_cell_type,
annotated_cell_type = left_cells$non_corrupted_gt_cell_type,
predicted_cell_type = predicted_scores
)

Expand All @@ -106,7 +110,7 @@ for(i in 1:iters){

# Continue AL - predict probabilities
predicted_scores <- predict(ModelFit,
select(left_cells, -X1, -cell_type, -gt_cell_type, -iteration),
select(left_cells, -X1, -cell_type, -gt_cell_type, -iteration, -non_corrupted_gt_cell_type),
type = "prob")

# Get next set of cells
Expand Down
10 changes: 9 additions & 1 deletion pipeline/cell-type-assignment/predict-random-forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,14 @@

predicted = model.predict(expression)

# Use the `cell_num` wildcard when the rule defines it; otherwise fall back to
# the literal string 'NA' so the prediction_params label can still be built.
cell_num = dict(snakemake.wildcards).get('cell_num', 'NA')

output = pd.DataFrame({'cell_id': cell_ids,
'predicted_cell_type': predicted,
'prediction_params': 'Random-Forest-knn-' + snakemake.wildcards['neighbors'] + '-res-' + snakemake.wildcards['res'] + '-cell_numbers-' + snakemake.wildcards['cell_num'] + '-randomSelection-' + snakemake.wildcards['rand'] + '-corrupted-' + snakemake.wildcards['corrupt'] + '-Init-' + snakemake.wildcards['initial'] + '-seed-' + snakemake.wildcards['s'],
'prediction_params': 'Random-Forest-knn-' + snakemake.wildcards['neighbors'] + '-res-' + snakemake.wildcards['res'] + '-cell_numbers-' + cell_num + '-randomSelection-' + snakemake.wildcards['rand'] + '-corrupted-' + snakemake.wildcards['corrupt'] + '-Init-' + snakemake.wildcards['initial'] + '-seed-' + snakemake.wildcards['s'],
'selection_procedure': snakemake.wildcards['selection_procedure'] + '-strategy-' + snakemake.wildcards['strat'] + '-ALAlg-' + snakemake.wildcards['AL_alg'],
'modality': snakemake.wildcards['modality']})

Expand All @@ -28,4 +33,7 @@
# Optional columns: each is written only when the rule that invoked this
# script defines the corresponding wildcard.
wildcard_names = set(dict(snakemake.wildcards).keys())

if 'similarity' in wildcard_names:
    output['similarity'] = snakemake.wildcards['bal'] + '-' + snakemake.wildcards['similarity']

if 'cell_selection' in wildcard_names:
    output['pred_cells'] = snakemake.wildcards['cell_selection']

output.to_csv(snakemake.output['predictions'], sep = '\t', index = False)
23 changes: 14 additions & 9 deletions pipeline/cell-type-assignment/random-forest-train.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,19 +14,24 @@
expression = pd.read_csv(snakemake.input['train'], sep = "\t", header = 0)
annotation = pd.read_csv(str(snakemake.input['annotation']), sep = '\t', header = 0)

expression = pd.merge(expression, annotation, on = 'cell_id')
if 'cell_selection' in dict(snakemake.wildcards).keys():
expression = pd.merge(expression, annotation, on = 'cell_id')
X_train = expression.drop(['cell_type_y', 'entropy', 'labeling', 'cell_type_x', 'cell_id', 'corrupted_cell_type', 'iteration', 'method', 'cell_num', 'params'], axis = 1)
y_train = expression['cell_type_y']
else:
expression = pd.merge(expression, annotation, on = 'cell_id')

if snakemake.wildcards['selection_procedure'] == 'Active-Learning_entropy' or snakemake.wildcards['selection_procedure'] == 'Active-Learning_maxp':
expression = expression.drop(['iteration', 'cell_num', 'corrupted_cell_type'], axis = 1)
if snakemake.wildcards['selection_procedure'] == 'Active-Learning_entropy' or snakemake.wildcards['selection_procedure'] == 'Active-Learning_maxp':
expression = expression.drop(['iteration', 'cell_num', 'corrupted_cell_type'], axis = 1)

if snakemake.wildcards['selection_procedure'] == 'random':
expression = expression.drop(['set','params'], axis = 1)
if snakemake.wildcards['selection_procedure'] == 'random':
expression = expression.drop(['set','params'], axis = 1)

if snakemake.wildcards['selection_procedure'] == 'Seurat-clustering':
expression = expression.drop(['params'], axis = 1)
if snakemake.wildcards['selection_procedure'] == 'NoMarkerSeurat-clustering' or snakemake.wildcards['selection_procedure'] == 'MarkerSeurat-clustering':
expression = expression.drop(['params'], axis = 1)

X_train = expression.drop(['cell_type_y', 'method', 'cell_type_x', 'cell_id'], axis = 1)
y_train = expression['cell_type_y']
X_train = expression.drop(['cell_type_y', 'method', 'cell_type_x', 'cell_id'], axis = 1)
y_train = expression['cell_type_y']

# ## ML Pipeline start
RSEED = 42
Expand Down
8 changes: 8 additions & 0 deletions pipeline/cell-type-assignment/scmap.R
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,10 @@ if(!is.null(snakemake@wildcards[['similarity']])){
clustering_prediction$similarity <- paste0(snakemake@wildcards[['bal']], '-', snakemake@wildcards[['similarity']])
}

# Tag the cluster-level predictions with the `cell_selection` wildcard when the
# rule defines one. `[[` is used rather than `$` because `$` partially matches
# list names; `[[` matches exactly (as the `similarity` check in this script
# does) and yields NULL when the wildcard is absent.
if(!is.null(snakemake@wildcards[['cell_selection']])){
  clustering_prediction$pred_cells <- snakemake@wildcards[['cell_selection']]
}


write_tsv(clustering_prediction, snakemake@output[['cluster_predictions']])

Expand Down Expand Up @@ -111,5 +115,9 @@ if(!is.null(snakemake@wildcards[['similarity']])){
sc_prediction$similarity <- paste0(snakemake@wildcards[['bal']], '-', snakemake@wildcards[['similarity']])
}

# Tag the single-cell predictions with the `cell_selection` wildcard when the
# rule defines one. `[[` is used rather than `$` because `$` partially matches
# list names; `[[` matches exactly (as the `similarity` check in this script
# does) and yields NULL when the wildcard is absent.
if(!is.null(snakemake@wildcards[['cell_selection']])){
  sc_prediction$pred_cells <- snakemake@wildcards[['cell_selection']]
}

write_tsv(sc_prediction, snakemake@output[['sc_predictions']])

3 changes: 3 additions & 0 deletions pipeline/cell-type-assignment/simulate-active-learner.R
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ if(snakemake@wildcards[['initial']] == 'ranking'){
}else if(snakemake@wildcards[['initial']] == 'random'){
random_cell_idx <- sample(1:nrow(df_expression), 20)
df_expression$cell_type[random_cell_idx] <- df_expression$gt_cell_type[random_cell_idx]
df_expression$iteration[random_cell_idx] <- 0
}

if(!is.null(snakemake@wildcards[['similarity']])){
Expand Down Expand Up @@ -82,6 +83,8 @@ df_PCA <- bind_cols(
iteration = df_expression$iteration)
)

table(df_PCA$cell_type)

for(i in 1:max_AL_iterations){
AL <- active_learning_wrapper(select(df_PCA, -gt_cell_type, -iteration),
snakemake@wildcards[['AL_alg']],
Expand Down
4 changes: 4 additions & 0 deletions pipeline/cell-type-assignment/singleR.R
Original file line number Diff line number Diff line change
Expand Up @@ -49,4 +49,8 @@ if(!is.null(snakemake@wildcards[['similarity']])){
result$similarity <- paste0(snakemake@wildcards[['bal']], '-', snakemake@wildcards[['similarity']])
}

# Tag the predictions with the `cell_selection` wildcard when the rule defines
# one. `[[` is used rather than `$` because `$` partially matches list names;
# `[[` matches exactly (as the `similarity` check in this script does) and
# yields NULL when the wildcard is absent.
if(!is.null(snakemake@wildcards[['cell_selection']])){
  result$pred_cells <- snakemake@wildcards[['cell_selection']]
}

write_tsv(result, snakemake@output[['predictions']])
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
library(tidyverse)

# Read the assignment table, drop rows whose `iteration` is NA, and sort the
# remainder by ascending `iteration`. A single pipe operator is used
# throughout instead of mixing `%>%` and `|>`.
training <- read_tsv(snakemake@input[['assignment']]) %>%
  filter(!is.na(iteration)) %>%
  arrange(iteration)

# Keep the first `subset_val` rows. seq_len() is used instead of `1:n` because
# `1:0` evaluates to c(1, 0) and would silently return one row when the
# wildcard is 0; seq_len(0) correctly selects no rows.
subset <- training[seq_len(as.integer(snakemake@wildcards[['subset_val']])),]
Expand Down
Loading

0 comments on commit 33a3b3b

Please sign in to comment.