Merge pull request #191 from LaurenceKuhl/hit_selection

hit selection module
nf-core · Sep 4, 2024 · 5f241de · 5f241de
2 parents bdb7682 + 98d683e
commit 5f241de
Show file tree

Hide file tree

Showing 15 changed files with 128,807 additions and 11 deletions.
diff --git a/.editorconfig b/.editorconfig
@@ -31,3 +31,6 @@ indent_size = unset
 # ignore python and markdown
 [*.{py,md}]
 indent_style = unset
+
+[/assets/hgnc_complete_set.txt]
+trim_trailing_whitespace = unset
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Add module to classify samples by clonality ([#178](https://github.com/nf-core/crisprseq/pull/178))
 - Add DrugZ, a module for chemogenetic interaction ([#168](https://github.com/nf-core/crisprseq/pull/168))
+- Add Hitselection, a module for selecting a subset of more likely true positives in KO screens based on protein-protein interactions ([#191](https://github.com/nf-core/crisprseq/pull/191))
 
 ### Fixed
 

diff --git a/README.md b/README.md
@@ -67,10 +67,12 @@ For crispr screening:
    - ([`bowtie2`](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml))
 3. Optional: CNV correction and normalization with ([`CRISPRcleanR`](https://github.com/francescojm/CRISPRcleanR))
 4. Rank sgRNAs and genes ;
-   a. ([MAGeCK test](https://sourceforge.net/p/mageck/wiki/usage/#test))
-   b. ([MAGeCK mle](https://sourceforge.net/p/mageck/wiki/Home/#mle))
-   c. ([BAGEL2](https://github.com/hart-lab/bagel))
-5. Visualize analysis
+   - ([MAGeCK test](https://sourceforge.net/p/mageck/wiki/usage/#test))
+   - ([MAGeCK mle](https://sourceforge.net/p/mageck/wiki/Home/#mle))
+   - ([BAGEL2](https://github.com/hart-lab/bagel))
+   - ([DrugZ](https://github.com/hart-lab/drugz))
+5. Optional: hit selection on KO screens to extract a subset of genes that are more likely to be true positives
+6. Visualize analysis
 
 ## Usage
 

diff --git a/assets/biogrid_hgncid_noduplicate_dropna.csv b/assets/biogrid_hgncid_noduplicate_dropna.csv
diff --git a/assets/hgnc_complete_set.txt b/assets/hgnc_complete_set.txt
diff --git a/conf/modules.config b/conf/modules.config
@@ -144,6 +144,44 @@ process {
             ]
     }
 
+    withName: HITSELECTION {
+        containerOptions = ''
+        publishDir       = [
+            path: { "${params.outdir}/hitselection/drugz/" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+            ]
+    }
+
+    withName: HITSELECTION_MLE {
+        containerOptions = ''
+        publishDir       = [
+            path: { "${params.outdir}/hitselection/mle/" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+            ]
+    }
+
+
+    withName: HITSELECTION_BAGEL2 {
+        containerOptions = ''
+        publishDir       = [
+            path: { "${params.outdir}/hitselection/bagel2/" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+            ]
+    }
+
+    withName: HITSELECTION_RRA {
+        containerOptions = ''
+        publishDir       = [
+            path: { "${params.outdir}/hitselection/rra/" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+            ]
+    }
+
+
     withName: VENNDIAGRAM {
         publishDir       = [
             path: { "${params.outdir}/venndiagram/${meta.treatment}_vs_${meta.reference}/" },

diff --git a/conf/test_screening.config b/conf/test_screening.config
@@ -20,12 +20,14 @@ params {
     max_time   = '6.h'
 
     // Input data
-    input             = params.pipelines_testdata_base_path + "crisprseq/testdata/samplesheet_test.csv"
-    analysis          = 'screening'
-    crisprcleanr      = "Brunello_Library"
-    library           = params.pipelines_testdata_base_path + "crisprseq/testdata/brunello_target_sequence.txt"
-    contrasts         = params.pipelines_testdata_base_path + "crisprseq/testdata/rra_contrasts.txt"
-    drugz             = params.pipelines_testdata_base_path + "crisprseq/testdata/rra_contrasts.txt"
+    input                      = params.pipelines_testdata_base_path + "crisprseq/testdata/samplesheet_test.csv"
+    analysis                   = 'screening'
+    crisprcleanr               = "Brunello_Library"
+    library                    = params.pipelines_testdata_base_path + "crisprseq/testdata/brunello_target_sequence.txt"
+    contrasts                  = params.pipelines_testdata_base_path + "crisprseq/testdata/rra_contrasts.txt"
+    drugz                      = params.pipelines_testdata_base_path + "crisprseq/testdata/rra_contrasts.txt"
+    hit_selection_iteration_nb = 150
+    hitselection               = true
 }
 
 process {

diff --git a/conf/test_screening_rra.config b/conf/test_screening_rra.config
@@ -26,6 +26,8 @@ params {
     library           = params.pipelines_testdata_base_path + "crisprseq/testdata/brunello_target_sequence.txt"
     contrasts         = params.pipelines_testdata_base_path + "crisprseq/testdata/rra_contrasts.txt"
     rra               = true
+    hitselection      = true
+    hit_selection_iteration_nb = 150
 }
 
 process {

diff --git a/docs/output/screening.md b/docs/output/screening.md
@@ -190,6 +190,12 @@ For further reading and documentation see the [cutadapt helper page](https://cut
   - `*.txt`: Pathway view for top enriched pathways.
   - `*.png`: Pathway view for top enriched pathways.
 
+### HitSelection
+
+- `hitselection/<drugz|mle|bagel2|rra>/`
+  - `*.png` : -logP value vs gene rank plot to determine the rank thresholds
+  - `*.txt` : Ranked -logP value and gene symbols table
+
 ## MultiQC
 
 <details markdown="1">

diff --git a/docs/usage/screening.md b/docs/usage/screening.md
@@ -134,6 +134,18 @@ The contrast from reference to treatment should be ; separated
 
 If you wish to remove specific genes before the drugZ analysis, you can use the `--drugz_remove_genes` option following a comma separated list of genes.
 
+### Running Hitselection
+
+Hitselection provides the user with a rank threshold and a set of genes that are more likely to be true positives by identifying the most interconnected subnetworks within the ranked gene list. For now, this module is only developed for KO screens on human data mapped to Entrez IDs.
+
+Hitselection is a script for identifying rank thresholds for CRISPR screen results using the connectivity of subgraphs of protein-protein interaction (PPI) networks. The script is written in R and is an implementation of RNAiCut (Kaplow et al., 2009), a method for estimating thresholds in RNAi data. The principle behind Hitselection is that true positive hits are densely connected in PPI networks. The script runs a simulation based on a Poisson distribution of the ranked screen gene list to calculate, one gene at a time, the -logP value comparing the interconnectivity of the real subnetwork with that of a degree-matched random subnetwork. The degree of the nodes is used as the interconnectivity metric.
+
+To run Hitselection, specify `--hitselection` and it will automatically run on the results of the gene essentiality algorithms you have chosen. The outputs are a png file containing the -logP value vs gene rank plot and a txt file containing all the -logP values, edge and average edge values, and ranked gene symbols.
+
+## :warning: The Hitselection algorithm is currently developed only for KO screens and requires the library to map to genes with a _Homo sapiens_ Entrez ID.
+
+## :warning: Please be advised that the Hitselection algorithm is time-intensive and will make the pipeline run longer.
+
 Note that the pipeline will create the following files in your working directory:
 
 ```bash