nf-core · LaurenceKuhl · Sep 4, 2024 · Aug 16, 2024 · Aug 23, 2024 · Aug 27, 2024
diff --git a/.editorconfig b/.editorconfig
@@ -31,3 +31,6 @@ indent_size = unset
 # ignore python and markdown
 [*.{py,md}]
 indent_style = unset
+
+[/assets/hgnc_complete_set.txt]
+trim_trailing_whitespace = unset
diff --git a/README.md b/README.md
@@ -67,9 +67,10 @@ For crispr screening:
    - ([`bowtie2`](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml))
 3. Optional: CNV correction and normalization with ([`CRISPRcleanR`](https://github.com/francescojm/CRISPRcleanR))
 4. Rank sgRNAs and genes ;
-   a. ([MAGeCK test](https://sourceforge.net/p/mageck/wiki/usage/#test))
-   b. ([MAGeCK mle](https://sourceforge.net/p/mageck/wiki/Home/#mle))
-   c. ([BAGEL2](https://github.com/hart-lab/bagel))
+   - ([MAGeCK test](https://sourceforge.net/p/mageck/wiki/usage/#test))
+   - ([MAGeCK mle](https://sourceforge.net/p/mageck/wiki/Home/#mle))
+   - ([BAGEL2](https://github.com/hart-lab/bagel))
+   - ([DrugZ](https://github.com/hart-lab/drugz))
 5. Visualize analysis
 
 ## Usage

diff --git a/assets/biogrid_hgncid_noduplicate_dropna.csv b/assets/biogrid_hgncid_noduplicate_dropna.csv
diff --git a/assets/hgnc_complete_set.txt b/assets/hgnc_complete_set.txt
diff --git a/conf/test_screening.config b/conf/test_screening.config
@@ -20,12 +20,14 @@ params {
     max_time   = '6.h'
 
     // Input data
-    input             = params.pipelines_testdata_base_path + "crisprseq/testdata/samplesheet_test.csv"
-    analysis          = 'screening'
-    crisprcleanr      = "Brunello_Library"
-    library           = params.pipelines_testdata_base_path + "crisprseq/testdata/brunello_target_sequence.txt"
-    contrasts         = params.pipelines_testdata_base_path + "crisprseq/testdata/rra_contrasts.txt"
-    drugz             = params.pipelines_testdata_base_path + "crisprseq/testdata/rra_contrasts.txt"
+    input                      = params.pipelines_testdata_base_path + "crisprseq/testdata/samplesheet_test.csv"
+    analysis                   = 'screening'
+    crisprcleanr               = "Brunello_Library"
+    library                    = params.pipelines_testdata_base_path + "crisprseq/testdata/brunello_target_sequence.txt"
+    contrasts                  = params.pipelines_testdata_base_path + "crisprseq/testdata/rra_contrasts.txt"
+    drugz                      = params.pipelines_testdata_base_path + "crisprseq/testdata/rra_contrasts.txt"
+    hit_selection_iteration_nb = 150
+    hitselection               = true
 }
 
 process {

diff --git a/docs/output/screening.md b/docs/output/screening.md
@@ -190,6 +190,12 @@ For further reading and documentation see the [cutadapt helper page](https://cut
   - `*.txt`: Pathway view for top enriched pathways.
   - `*.png`: Pathway view for top enriched pathways.
 
+### HitSelection
+
+- `HitSelection`
+  - `*.png` : -logP value vs gene rank plot to determine the rank threshols
+  - `*.txt` : Ranked -logP value and gene symbols table
+
 ## MultiQC
 
 <details markdown="1">

diff --git a/docs/usage/screening.md b/docs/usage/screening.md
@@ -134,6 +134,18 @@ The contrast from reference to treatment should be ; separated
 
 If you wish to remove specific genes before the drugZ analysis, you can use the `--drugz_remove_genes` option following a comma separated list of genes.
 
+### Running Hitselection
+
+Hitselection provides the user with a threshold and a set of genes that are likely to be closer to true positives by identifying the most interconnected subnetworks within the ranked gene list. This module is for now only developped for KO screens on Human data mapped to Entrez IDs.
+
+Hitselection is a script for identifying rank thresholds for CRISPR screen results based on using the connectivity of subgraphs of protein-protein interaction (PPI) networks. The script is based on R and is also an implementation of RNAiCut (Kaplow et al., 2009), a method for estimating thresholds in RNAi data. The principle behind Hitselection is that true positive hits are densely connected in the PPI networks. The script runs a simulation based on Poisson distribution of the ranked screen gene list to calculate the -logP value for comparing the interconnectivity of the real subnetwork and the degree match random subnetwork of each gene, one by one. The degree of the nodes is used as the interconnectivity metric.
+
+To run Hitselection, you can specify '--hitselection' and it will automatically run on the gene essentiality algorithms you have chosen. The outputs are a png file containing the -logP value vs gene rank plot and a txt file containing all the -logP values, edge and average edge values and ranked gene symbols.
+
+## :warning: The hitselection algorithm is for the moment developped only for KO screens and requires the library to map to genes with an EntrezID.
+
+## :warning: Please be advised that the Hitselection algorithm is time intensive and will make the pipeline run longer
+
 Note that the pipeline will create the following files in your working directory:
 
 ```bash

diff --git a/modules/local/hitselection.nf b/modules/local/hitselection.nf
@@ -0,0 +1,305 @@
+process HITSELECTION {
+    tag "${meta.treatment}_${meta.reference}"
+    label 'process_high'
+
+    conda "r-base=4.4.1,r-igraph=2.0.3,r-dplyr=1.1.4,r-tidyr=1.3.1,r-readr=2.1.5,r-ggplot2=3.5.1"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/mulled-v2-dcfb6eba6adda57b9d4a990b9096cb040320914f:588e2c290fce5c5c11ef2340b6184370efd2c628-0':
+        'biocontainers/mulled-v2-dcfb6eba6adda57b9d4a990b9096cb040320914f:588e2c290fce5c5c11ef2340b6184370efd2c628-0' }"
+
+    input:
+    tuple val(meta), path(per_gene_results)
+    path(biogrid)
+    path(hgnc)
+    val(hit_selection_iteration_nb)
+
+    output:
+    path("*_gene_conversion.txt"),        emit: gene_converted
+    path("*_hitselection.tsv"),     emit: hitselection
+    path("*_logpvalue_distribution.png"),         emit: plot
+    path "versions.yml",            emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+
+    """
+    #!/usr/bin/env Rscript
+    #### author: Metin Yazar
+    #### Released under the MIT license. See git repository (https://github.com/nf-core/crisprseq) for full license text.
+    ####
+
+    library(igraph)
+    library(dplyr)
+    library(ggplot2)
+    library(readr)
+
+    # Function to process gene symbols from the screen data
+    update_gene_columns <- function(gene_symbol) {
+        gene_symbol_str <- as.character(gene_symbol)
+        parts <- strsplit(gene_symbol_str, '-')[[1]]  # Split gene symbol by hyphen
+        if (length(parts) > 1 && nchar(tail(parts, n=1)) >= 2) {
+            # If there's a suffix with at least 2 characters, split it off as Gene_2
+            return(list(paste(parts[-length(parts)], collapse = '-'), tail(parts, n = 1)))
+        } else {
+            # If no valid suffix, return the full symbol and NA for Gene_2
+            return(list(gene_symbol_str, NA))
+        }
+    }
+
+    # Function to look up gene information based on gene symbols
+    lookup_gene_info <- function(gene_symbol, gene_symbol_2) {
+        result <- gene_lookup[[gene_symbol]]  # Try to find information for the main symbol
+        if (is.null(result)) {
+            result <- gene_lookup[[gene_symbol_2]]  # Try the second symbol (suffix) if the first one fails
+            if (is.null(result)) {
+                # If neither symbol is found, return a "No entry is found" message
+                result <- list('No entry is found', 'No entry is found')
+            }
+        }
+        return(result)
+    }
+
+    # Function to perform degree-conserved edge permutations
+    EdgePermutationDegreeConserved <- function(permutation, frequency, degree, hit.genes, graph) {
+        edges.permutation.degree.conserved <- rep(NA, permutation)  # Initialize result vector
+        if(length(hit.genes) > 0) {
+            for(j in 1:permutation) {
+                set.seed(j)  # Change the seed for each permutation
+                hit.genes.permuted.degree.conserved <- NULL
+                # Permute genes based on degree thresholds
+                if(sum(frequency[which(as.numeric(names(frequency)) > 500)]) > 0){
+                    sample.temp <- which(degree > 500)
+                    hit.genes.permuted.degree.conserved <- c(hit.genes.permuted.degree.conserved,
+                                                            names(sample(x = sample.temp, size = sum(frequency[which(as.numeric(names(frequency)) > 500)]), replace = FALSE)))
+                }
+                if(sum(frequency[which(as.numeric(names(frequency)) <= 500 & as.numeric(names(frequency)) > 100)]) > 0){
+                    sample.temp <- which(degree <= 500 & degree > 100)
+                    hit.genes.permuted.degree.conserved <- c(hit.genes.permuted.degree.conserved,
+                                                            names(sample(x = sample.temp, size = sum(frequency[which(as.numeric(names(frequency)) <= 500 & as.numeric(names(frequency)) > 100)]), replace = FALSE)))
+                }
+                if(sum(frequency[which(as.numeric(names(frequency)) <= 100 & as.numeric(names(frequency)) > 50)]) > 0){
+                    sample.temp <- which(degree <= 100 & degree > 50)
+                    hit.genes.permuted.degree.conserved <- c(hit.genes.permuted.degree.conserved,
+                                                            names(sample(x = sample.temp, size = sum(frequency[which(as.numeric(names(frequency)) <= 100 & as.numeric(names(frequency)) > 50)]), replace = FALSE)))
+                }
+                if(sum(frequency[which(as.numeric(names(frequency)) <= 50 & as.numeric(names(frequency)) > 30)]) > 0){
+                    sample.temp <- which(degree <= 50 & degree > 30)
+                    hit.genes.permuted.degree.conserved <- c(hit.genes.permuted.degree.conserved,
+                                                            names(sample(x = sample.temp, size = sum(frequency[which(as.numeric(names(frequency)) <= 50 & as.numeric(names(frequency)) > 30)]), replace = FALSE)))
+                }
+                if(sum(frequency[which(as.numeric(names(frequency)) <= 30 & as.numeric(names(frequency)) > 10)]) > 0){
+                    sample.temp <- which(degree <= 30 & degree > 10)
+                    hit.genes.permuted.degree.conserved <- c(hit.genes.permuted.degree.conserved,
+                                                            names(sample(x = sample.temp, size = sum(frequency[which(as.numeric(names(frequency)) <= 30 & as.numeric(names(frequency)) > 10)]), replace = FALSE)))
+                }
+                if(length(which(as.numeric(names(frequency)) <= 10)) > 0) {
+                    temp.frequency <- frequency[which(as.numeric(names(frequency)) <= 10)]
+                    for(k in 1:length(temp.frequency)){
+                        sample.temp <- which(degree == as.numeric(names(temp.frequency[k])))
+                        hit.genes.permuted.degree.conserved <- c(hit.genes.permuted.degree.conserved,
+                                                                names(sample(x = sample.temp, size = temp.frequency[k], replace = FALSE)))
+                    }
+                }
+                # Ensure the number of permuted genes matches the original hit genes
+                if(length(hit.genes) != length(hit.genes.permuted.degree.conserved)) {
+                    print("we have a problem")
+                }
+                # Count the number of edges in the permuted subgraph and store it
+                edges.permutation.degree.conserved[j] <- length(E(induced.subgraph(graph, hit.genes.permuted.degree.conserved)))
+            }
+        } else {
+            edges.permutation.degree.conserved[1:permutation] = 0  # If no hit genes, all edge counts are zero
+        }
+        return(edges.permutation.degree.conserved)
+    }
+
+    Find.Significance <- function(graph, hit.genes, degree, permutation)
+    {
+        subGraph <- induced.subgraph(graph, hit.genes)
+        edges <- length(E(subGraph))	# Compute number of edges in the network
+        frequency <- table(degree[which((names(degree) %in% hit.genes) == TRUE)])  # Find degrees in the original graph
+        edges.permutation.degree.conserved <- EdgePermutationDegreeConserved(permutation,frequency,degree,hit.genes,graph)
+        avg_edges_permutation_degree_conserved <- mean(edges.permutation.degree.conserved)
+        logp.val.degree.conserved = - ppois(edges-1, avg_edges_permutation_degree_conserved,lower.tail = FALSE,log.p = TRUE)
+        #logp.val.degree.conserved <-  -log10(p.val.degree.conserved)
+        no.nodes <- length(V(subGraph))
+        no.edges <- length(E(subGraph))
+        return(data.frame(logp.val.degree.conserved,edges,avg_edges_permutation_degree_conserved))
+    }
+
+    # Load necessary data
+    screen <- read_delim("${per_gene_results}", delim = '\t')  # Load gene screening results
+
+    beta_col <- grep("beta", colnames(screen), value = TRUE)
+
+    #For MAGeCK MLE output
+    if(length(beta_col) >= 1) {
+        screen\$Rank <- rank(screen[[beta_col]])
+        screen <- screen[order(screen\$Rank), ]
+    }
+
+    hgnc <- read_delim("${hgnc}", delim = '\t')  # Load HGNC gene data
+
+    # Select relevant columns from HGNC data
+    columns_to_include <- c('hgnc_id', 'symbol', 'prev_symbol', 'ensembl_gene_id', 'alias_symbol', 'entrez_id')
+    hgnc <- hgnc[columns_to_include]  # Keep only the necessary columns
+
+    # Convert HGNC data to a list of dictionaries, where each entry corresponds to a gene
+    hgnc_dict <- split(hgnc, seq(nrow(hgnc)))
+
+    # Apply the update_gene_columns function to each gene symbol in the screen data
+    updated_genes <- do.call(rbind, lapply(screen[[1]], update_gene_columns))
+    screen[[1]] <- updated_genes[, 1]  # Update the main gene symbol column
+    screen\$Gene_2 <- updated_genes[, 2]  # Add a new column for the suffixes (Gene_2)
+
+    # Create a lookup dictionary for gene information based on HGNC data
+    gene_lookup <- list()
+    for (entry in hgnc_dict) {
+        # Add main gene symbol to lookup table
+        gene_lookup[[entry\$symbol]] <- list(entry\$hgnc_id, entry\$ensembl_gene_id)
+        # Add previous symbols (if any) to lookup table
+        if (!is.na(entry\$prev_symbol)) {
+            prev_symbols <- unlist(strsplit(entry\$prev_symbol, split = "|", fixed = TRUE))
+            for (prev_symbol in prev_symbols) {
+                gene_lookup[[prev_symbol]] <- list(entry\$hgnc_id, entry\$ensembl_gene_id)
+            }
+        }
+        # Add alias symbols (if any) to lookup table
+        if (!is.na(entry\$alias_symbol)) {
+            alias_symbols <- unlist(strsplit(entry\$alias_symbol, split = "|", fixed = TRUE))
+            for (alias in alias_symbols) {
+                gene_lookup[[alias]] <- list(entry\$hgnc_id, entry\$ensembl_gene_id)
+            }
+        }
+    }
+
+
+    # Apply the lookup_gene_info function to each row in the screen data
+    info <- do.call(rbind, apply(screen, 1, function(row) lookup_gene_info(as.character(row[1]), as.character(row\$Gene_2))))
+    info <- as.data.frame(info)
+
+    # Add the HGNC and Ensembl gene IDs back into the screen data
+    screen\$hgnc_id <- as.character(info[, 1])
+    screen\$ensembl_gene_id <- as.character(info[, 2])
+
+    if(length(beta_col) >= 1) {
+        screen\$Gene <- as.character(screen\$Gene)
+    } else {
+        screen\$GENE <- as.character(screen\$GENE)
+    }
+
+    # Save the updated screen data with HGNC and Ensembl IDs to a file
+    if(length(beta_col) >= 1) {
+        filename <- '${meta.treatment}_vs_${meta.reference}_mle'
+    } else {
+        filename <- '${meta.treatment}_vs_${meta.reference}'
+    }
+
+    write_delim(as.data.frame(screen), paste0(filename, '_gene_conversion.txt'), delim = '\t')
+
+    # Select the HGNC IDs from the screen data and the first 1000 gene symbols
+    screen_genes <- screen\$hgnc_id
+    gene_symbols_1000 <- screen[[1]][1:${hit_selection_iteration_nb}]
+    # Load interaction data from BioGRID
+    interactions_df <- read.csv("${biogrid}")
+
+    # Create an undirected graph from the interaction data
+    g <- graph_from_data_frame(interactions_df[, c("hgnc_id_1", "hgnc_id_2")], directed=FALSE)
+
+    # Calculate the degree (number of connections) for each gene in the graph
+    degree <- degree(g)
+    names(degree) <- V(g)\$name
+
+    min <- 0
+    max <- ${hit_selection_iteration_nb}
+    steps <- max - min
+    hit.genes.last <- NULL
+    final_results <- list() # Initialize as an empty list
+    for(i in 1:steps) {
+        final_hit.genes <- intersect(screen_genes[c(1:i)], V(g)\$name)
+        hit.genes.last <- final_hit.genes
+        result <- Find.Significance(g, hit.genes.last, degree, permutation = 500)
+        final_results[[i]]  <- result
+    }
+
+    final_dataframe <- bind_rows(final_results)
+    final_dataframe\$gene_symbols <- gene_symbols_1000
+    write.table(final_dataframe, file=paste0(filename,"_hitselection.tsv"),row.names=FALSE,quote=FALSE,sep="\t")
+
+    max_value <- max(final_dataframe\$logp.val.degree.conserved)
+    max_index <- which.max(final_dataframe\$logp.val.degree.conserved)
+
+    # Add a column to the dataframe to indicate whether each point is the maximum or not
+    final_dataframe <- final_dataframe %>%
+    mutate(is_max = ifelse(logp.val.degree.conserved == max_value, "Maximum", "Other"))
+
+    # Create the plot
+
+    ggplot(final_dataframe, aes(x = seq_along(logp.val.degree.conserved), y = logp.val.degree.conserved)) +
+        geom_point(aes(color = is_max, size = is_max)) +
+        scale_color_manual(values = c("Maximum" = "red", "Other" = "black")) +
+        scale_size_manual(values = c("Maximum" = 3, "Other" = 1)) +  # Adjust sizes as needed
+        labs(title = "Log P-value (Degree Conserved) vs Index",
+        x = "Index",
+        y = "Log P-value (Degree Conserved)",
+        color = "Point Type",
+        size = "Point Size") +
+    theme(
+        panel.background = element_rect(fill = "white"),
+        plot.background = element_rect(fill = "white"),
+        legend.background = element_rect(fill = "white"),
+        axis.text = element_text(color = "black"),
+        axis.title = element_text(color = "black"),
+        plot.title = element_text(color = "black"),
+        legend.text = element_text(color = "black"),
+        legend.title = element_text(color = "black"),
+        legend.position = "top",
+        panel.grid.major = element_line(color = "gray80"),  # Major grid lines
+        panel.grid.minor = element_line(color = "gray90")   # Minor grid lines
+    )
+    ggsave(paste0(filename, "_logpvalue_distribution.png"))
+
+    version_file_path <- "versions.yml"
+    version_igraph <- paste(unlist(packageVersion("igraph")), collapse = ".")
+    version_dplyr <- paste(unlist(packageVersion("dplyr")), collapse = ".")
+    version_ggplot2 <- paste(unlist(packageVersion("ggplot2")), collapse = ".")
+
+    f <- file(version_file_path, "w")
+    writeLines('"${task.process}":', f)
+    writeLines("    igraph: ", f, sep = "")
+    writeLines(version_igraph, f)
+    writeLines("    dplyr: ", f, sep = "")
+    writeLines(version_dplyr, f)
+    writeLines("    ggplot2: ", f, sep = "")
+    writeLines(version_ggplot2, f)
+    close(f)
+    """
+
+    stub:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+
+    """
+    touch "${meta.treatment}_vs_${meta.reference}_hitselection.tsv"
+    touch "${meta.treatment}_vs_${meta.reference}_output_converted.txt"
+
+    version_file_path <- "versions.yml"
+    version_igraph <- paste(unlist(packageVersion("igraph")), collapse = ".")
+    version_dplyr <- paste(unlist(packageVersion("dplyr")), collapse = ".")
+    version_ggplot2 <- paste(unlist(packageVersion("ggplot2")), collapse = ".")
+
+    f <- file(version_file_path, "w")
+    writeLines('"${task.process}":', f)
+    writeLines("    igraph: ", f, sep = "")
+    writeLines(version_igraph, f)
+    writeLines("    dplyr: ", f, sep = "")
+    writeLines(version_dplyr, f)
+    writeLines("    ggplot2: ", f, sep = "")
+    writeLines(version_ggplot2, f)
+    close(f)
+    """
+}
diff --git a/nextflow.config b/nextflow.config
@@ -31,6 +31,8 @@ params {
     bagel_reference_nonessentials = 'https://raw.githubusercontent.com/hart-lab/bagel/master/NEGv1.txt'
     drugz                      = null
     drugz_remove_genes         = null
+    hitselection               = false
+    hit_selection_iteration_nb = 1000
     // Pipeline steps
     overrepresented            = false
     umi_clustering             = false