Merge pull request #18 from cokelaer/main

implement bwa_split to analyse large fastq files
sequana · Feb 6, 2024 · 8d02c9d · 8d02c9d
2 parents 55c315b + 5ae8a48
commit 8d02c9d
Show file tree

Hide file tree

Showing 9 changed files with 248 additions and 81 deletions.
diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml
@@ -11,20 +11,20 @@ jobs:
     runs-on: ubuntu-20.04
     steps:
     - uses: actions/checkout@main
-    - name: Set up Python 3.7
-      uses: actions/setup-python@v1
+    - name: Set up Python 3.8
+      uses: actions/setup-python@v2
       with:
-        python-version: 3.7
+        python-version: 3.8
 
-    - name: Install package
+    - name: Install package 
       run: |
-          pip install build
+          pip install build poetry
 
     - name: Build source tarball
       run: |
           rm -rf dist;
-          python setup.py sdist
-
+          poetry build
+ 
     - name: Publish distribution to Test PyPI
       uses: pypa/gh-action-pypi-publish@release/v1
       with:

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,33 @@
+
+files: '\.(py|rst|sh)$'
+fail_fast: false
+
+repos:
+-   repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v3.2.0
+    hooks:
+    -   id: trailing-whitespace
+    -   id: end-of-file-fixer
+    -   id: check-yaml
+    #-   id: check-executables-have-shebangs
+    -   id: check-ast
+
+-   repo: https://github.com/pycqa/flake8
+    rev: 6.1.0
+    hooks:
+    -   id: flake8
+        args: ["-j8", "--ignore=E203,E501,W503,E722", "--max-line-length=120", "--exit-zero"]
+
+-   repo: https://github.com/psf/black
+    rev: 22.10.0
+    hooks:
+    -   id: black
+        args: ["--line-length=120"]
+        exclude: E501
+
+-   repo: https://github.com/pycqa/isort
+    rev: 5.12.0
+    hooks:
+      - id: isort
+        args: ["--profile", "black"] # solves conflicts between black and isort
+
diff --git a/README.rst b/README.rst
@@ -3,7 +3,7 @@
      :target: https://pypi.python.org/pypi/sequana_mapper
 
 .. image:: https://github.com/sequana/mapper/actions/workflows/main.yml/badge.svg
-   :target: https://github.com/sequana/mapper/actions/    
+   :target: https://github.com/sequana/mapper/actions/
 
 .. image:: https://img.shields.io/badge/python-3.8%20%7C%203.9%20%7C3.10-blue.svg
     :target: https://pypi.python.org/pypi/sequana
@@ -46,7 +46,7 @@ to execute the pipeline::
     cd mapper
     sh mapper.sh  # for a local run
 
-This launch a snakemake pipeline. If you are familiar with snakemake, you can 
+This launches a snakemake pipeline. If you are familiar with snakemake, you can
 retrieve the pipeline itself and its configuration files and then execute the pipeline yourself with specific parameters::
 
     snakemake -s mapper.rules -c config.yaml --cores 4 \
@@ -74,21 +74,21 @@ This pipelines requires the following executable(s):
 Details
 ~~~~~~~~~
 
-This pipeline runs **mapper** in parallel on the input fastq files (paired or not). 
-A brief sequana summary report is also produced. When using **--pacbio** option, 
+This pipeline runs **mapper** in parallel on the input fastq files (paired or not).
+A brief sequana summary report is also produced. When using **--pacbio** option,
 *-x map-pb* option is automatically added to the config.yaml file and the
-readtag is set to None. 
+readtag is set to None.
 
 The BAM files are filtered to remove unmapped reads to keep BAM files to minimal size. However,
-the multiqc and statistics to be found in  {sample}/bamtools_stats/ includes mapped and unmapped reads information. Each BAM file is stored in a directory named after the sample. 
+the multiqc and statistics to be found in  {sample}/bamtools_stats/ includes mapped and unmapped reads information. Each BAM file is stored in a directory named after the sample.
 
 
 
 Rules and configuration details
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 Here is the `latest documented configuration file <https://raw.githubusercontent.com/sequana/mapper/main/sequana_pipelines/mapper/config.yaml>`_
-to be used with the pipeline. Each rule used in the pipeline may have a section in the configuration file. 
+to be used with the pipeline. Each rule used in the pipeline may have a section in the configuration file.
 
 
 Changelog
@@ -97,6 +97,8 @@ Changelog
 ========= ======================================================================
 Version   Description
 ========= ======================================================================
+1.2.0     * Implement a bwa_split method to speed up mapping of very large
+            fastq files.
 1.1.0     * BAM files are now filtered to remove unmapped reads
           * set wrappers branch in config file and update pipeline.
           * refactorise to use click and new sequana-pipetools
@@ -105,25 +107,25 @@ Version   Description
 0.11.1    * Fix typo when setting coverage to True and allow untagged filenames
 0.11.0    * implement feature counts for capture-seq projects
 0.10.1    * remove getlogdir and getname
-0.10.0    * use new wrappers framework 
+0.10.0    * use new wrappers framework
 0.9.0     * fix issue with logger and increments requirements
-          * add new option --pacbio to automatically set the options for 
+          * add new option --pacbio to automatically set the options for
             pacbio data (-x map-pb and readtag set to None)
 0.8.13    * add the thread option in minimap2 case
 0.8.12    * factorise multiqc rule
 0.8.11    * Implement the --from-project option and new framework
           * custom HTML report
 0.8.10    * change samtools_depth rule and switched to bam2cov to cope with null
-            coverage 
+            coverage
 0.8.9     * fix requirements
 0.8.8     * fix pipeline rule for bigwig + renamed output_bigwig into
             create_bigwig; fix the multiqc config file
 0.8.7     * fix config file creation (for bigwig)
 0.8.6     * added bowtie2 mapper + bigwig as output, make coverage optional
 0.8.5     * create a sym link to the HTML report. Better post cleaning.
-0.8.4     * Fixing multiqc (synchronized with sequana updates) 
-0.8.3     * add sequana_coverage rule. 
-0.8.2     * add minimap2 mapper 
+0.8.4     * Fixing multiqc (synchronized with sequana updates)
+0.8.3     * add sequana_coverage rule.
+0.8.2     * add minimap2 mapper
 0.8.1     * fix bamtools stats rule to have different output name for multiqc
 0.8.0     **First release.**
 ========= ======================================================================
@@ -132,7 +134,6 @@ Version   Description
 Contribute & Code of Conduct
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-To contribute to this project, please take a look at the 
-`Contributing Guidelines <https://github.com/sequana/sequana/blob/main/CONTRIBUTING.rst>`_ first. Please note that this project is released with a 
+To contribute to this project, please take a look at the
+`Contributing Guidelines <https://github.com/sequana/sequana/blob/main/CONTRIBUTING.rst>`_ first. Please note that this project is released with a
 `Code of Conduct <https://github.com/sequana/sequana/blob/main/CONDUCT.md>`_. By contributing to this project, you agree to abide by its terms.
-
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "sequana-mapper"
-version = "1.1.0"
+version = "1.2.0"
 description = "A multi-sample mapper to map reads onto a reference"
 authors = ["Sequana Team"]
 license = "BSD-3"

diff --git a/sequana_pipelines/mapper/config.yaml b/sequana_pipelines/mapper/config.yaml
@@ -6,7 +6,7 @@
 # If input_directory provided, use it otherwise if input_pattern provided,
 # use it, otherwise use input_samples.
 # ============================================================================
-sequana_wrappers: v23.11.18
+sequana_wrappers: v24.1.14
 
 input_directory:
 input_readtag: _R[12]_
@@ -31,7 +31,8 @@ apptainers:
     minimap2: https://zenodo.org/record/7341710/files/sequana_tools_0.14.5.img
     multiqc: https://zenodo.org/record/10205070/files/multiqc_1.16.0.img
     samtools: https://zenodo.org/record/7341710/files/sequana_tools_0.14.5.img
-    sequana_coverage: https://zenodo.org/record/10209929/files/sequana_0.16.1.img
+    sequana_coverage: https://zenodo.org/record/10460105/files/sequana_0.16.5.img
+    seqkit: https://zenodo.org/record/7821924/files/seqkit_2.4.0.img
     subread: https://zenodo.org/record/7341710/files/sequana_tools_0.14.5.img
 
 
@@ -47,6 +48,18 @@ samtools_depth:
     resources:
         mem: 8G
 
+##############################################################################
+# BWA MEM indexing 
+#
+bwa_index:
+    options: ''
+    threads: 4
+    resources:
+        mem: 8G
+
+##############################################################################
+# BAM alignment indexing 
+#
 bam_indexing:
     resources:
         mem: 8G
@@ -68,25 +81,30 @@ bwa:
     resources:
         mem: 8G
 
-minimap2:
-    options: ''
+bwa_split:
+    nreads: 1000000
+    index_algorithm: is
+    options: -T 30 -M
     threads: 4
+    tmp_directory: ./tmp
     resources:
         mem: 8G
 
-bowtie2:
+
+
+minimap2:
     options: ''
     threads: 4
     resources:
         mem: 8G
 
-bowtie2_index:
+bowtie2:
     options: ''
     threads: 4
     resources:
         mem: 8G
 
-bwa_index:
+bowtie2_index:
     options: ''
     threads: 4
     resources:
@@ -158,7 +176,7 @@ sequana_coverage:
 ##       If you want your own multiqc, fill this entry
 multiqc:
     options: -p -f
-    modules: busco quast sequana_coverage prokka
+    modules: sequana_bamtools_stats sequana_coverage
     input_directory: .
     config_file: multiqc_config.yaml
     resources:

diff --git a/sequana_pipelines/mapper/main.py b/sequana_pipelines/mapper/main.py
@@ -10,30 +10,31 @@
 #  documentation: http://sequana.readthedocs.io
 #
 ##############################################################################
-import sys
 import os
+import sys
 
-import rich_click as click
 import click_completion
+import rich_click as click
 
 click_completion.init()
 
 NAME = "mapper"
 
-from sequana_pipetools.options import *
 from sequana_pipetools import SequanaManager
+from sequana_pipetools.options import *
 
 help = init_click(
     NAME,
     groups={
         "Pipeline Specific": [
-            "--mapper",
-            "--reference-file",
+            "--aligner-choice",
             "--annotation-file",
+            "--capture-annotation-file",
+            "--create-bigwig",
             "--do-coverage",
+            "--nanopore",
             "--pacbio",
-            "--create-bigwig",
-            "--capture-annotation-file",
+            "--reference-file",
         ],
     },
 )
@@ -45,7 +46,13 @@
 @include_options_from(ClickInputOptions)
 @include_options_from(ClickGeneralOptions)
 @click.option(
-    "--mapper", default="bwa", type=click.Choice(["bwa", "minimap2", "bowtie2"]), help="Choose one of the valid mapper"
+    "--aligner-choice",
+    "mapper",
+    default="bwa",
+    type=click.Choice(["bwa", "bwa_split", "minimap2", "bowtie2"]),
+    help="""Choose one of the valid mapper. bwa_split is experimental. it first split the fastq files in chunks of 1Mreads,
+aligns the reads with bwa and merge back the sub BAM files. Should be equivalent to using bwa but could be used on
+cluster to speed up analysis.""",
 )
 @click.option("--reference-file", required=True, help="You input reference file in fasta format")
 @click.option("--annotation-file", help="Used by the sequana_coverage tool if provided")
@@ -108,6 +115,20 @@ def main(**options):
         cfg.feature_counts.options = "-F SAF "
         cfg.feature_counts.gff = os.path.abspath(options.capture_annotation_file)
 
+    # Given the reference, let us compute its length and the index algorithm
+    from sequana import FastA
+
+    f = FastA(cfg.general.reference_file)
+    N = f.get_stats()["total_length"]
+
+    # seems to be a hardcoded value in bwa according to the documentation
+    if N >= 2000000000:
+        cfg["bwa"]["index_algorithm"] = "bwtsw"
+        cfg["bwa_split"]["index_algorithm"] = "bwtsw"
+    else:
+        cfg["bwa"]["index_algorithm"] = "is"
+        cfg["bwa_split"]["index_algorithm"] = "is"
+
     # finalise the command and save it; copy the snakemake. update the config
     # file and save it.
     manager.teardown()