diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 0000000..fcc1b34 --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,22 @@ +# Read the Docs configuration file for Sphinx projects +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +# Set the OS, Python version and other tools you might need +build: + os: "ubuntu-22.04" + tools: + python: "mambaforge-22.9" + +# Build documentation in the "docs/" directory with Sphinx +sphinx: + configuration: docs/conf.py + +# Build all formats +formats: all + +# enable conda environment +conda: + environment: environment.yml diff --git a/.readthedocs.yml b/.readthedocs.yml deleted file mode 100644 index 6b7f858..0000000 --- a/.readthedocs.yml +++ /dev/null @@ -1,18 +0,0 @@ -# .readthedocs.yml -# Read the Docs configuration file -# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details - -# Required -version: 2 - -# Build documentation in the docs/ directory with Sphinx -sphinx: - configuration: docs/conf.py - -# Build all formats -formats: all - -python: - version: 3.8 - install: - - requirements: requirements.txt diff --git a/.vscode/settings.json b/.vscode/settings.json index 896b0d7..90896b6 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -8,24 +8,45 @@ "bioinformatics", "bunop", "CACHEDIR", + "cineca", "conda", + "Cozzi", + "cozzip", "cpus", + "engelbart", + "fasta", "fastq", "fastqc", "freebayes", "IBBA", + "Iscr", + "lprod", "methylseq", + "mirdeep", "mkdir", "Nextflow", + "NFCORE", + "ntasks", + "outdir", + "paolo", + "println", "pypi", "pytest", "resequencing", "rnaseq", + "SAMPLESHEET", + "samtools", + "SBATCH", "sessionid", "slurm", + "sshfs", "subfolders", "subworkflow", - "Subworkflows" + "Subworkflows", + "testdata", + "TRIMGALORE", + "whitespaces", + "workdir" ], "esbonio.server.enabled": false, "esbonio.sphinx.buildDir": "${workspaceRoot}/docs/_build", diff --git a/docs/conf.py b/docs/conf.py index e2921a1..c722604 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -18,11 +18,11 @@ # -- Project information ----------------------------------------------------- project = 'BIOINFO Guidelines' -copyright = '2020-2022, Paolo Cozzi, ...' -author = 'Paolo Cozzi, ...' +copyright = '2020-2024, Paolo Cozzi' +author = 'Paolo Cozzi' # The full version, including alpha/beta/rc tags -release = 'v0.2.2' +release = 'v0.2.3' # -- General configuration --------------------------------------------------- diff --git a/docs/contributing.rst b/docs/contributing.rst index f1fd2bb..8aa89e0 100644 --- a/docs/contributing.rst +++ b/docs/contributing.rst @@ -22,23 +22,25 @@ are welcome. Before starting to work on documentation, please follow those steps 3. Start a new branch, preferably as indicating the issue number of the issue you have opened on original repository (for example, if you created the issue #3, you should use ``issue-3`` as branch name) -4. Install ``sphinx`` requirements by using:: +4. Install ``conda`` (or ``miniconda``) if you don't have it yet. You could find + it at `Anaconda website `__ +5. Install ``sphinx`` requirements by using:: - pip install -r requirements.txt + conda env create -f environment.yml Inside project directory. You could install them in a conda environment, if you prefer -5. Work on documentation by adding or modifying files according your need. Track +6. Work on documentation by adding or modifying files according your need. Track your modifications using git -6. 
Before submitting modifications, please check that documentation compiles +7. Before submitting modifications, please check that documentation compiles without errors and warnings by:: - $ cd docs - $ make clean - $ make html + $ cd docs + $ make clean + $ make html -7. If documentation works as you intended, you could push your work on your GitHub +8. If documentation works as you intended, you could push your work on your GitHub account and start *from here* a new pull request -8. Give some times to the maintainers to review your code. If they requests you some +9. Give some times to the maintainers to review your code. If they requests you some modifications, please work in the same branch you used before and send your revision by committing in the same branch. Don't create a new branch or a new pull request, if you push your modifications on the same branch with a open pull request it will diff --git a/docs/general/conda.rst b/docs/general/conda.rst index 2302dca..c2c791c 100644 --- a/docs/general/conda.rst +++ b/docs/general/conda.rst @@ -88,7 +88,7 @@ You can explore the conda environment available with:: $ conda env list # conda environments: # - R-3.6 /home/cozzip/.conda/envs/R-3.6 + R-4.3 /home/cozzip/.conda/envs/R-4.3 base * /usr/local/Miniconda3-py38_4.8.3-Linux-x86_64 nf-core /usr/local/Miniconda3-py38_4.8.3-Linux-x86_64/envs/nf-core @@ -102,7 +102,7 @@ in the bash prompt. You could enable a conda environment using ``conda activate``, for example:: - $ conda activate R-3.6 + $ conda activate R-4.3 You should see that the environment name near the bash prompt changed to the desidered environment. In order to exit the current environment (and return to your previous @@ -139,8 +139,8 @@ contains community packages, often more updated that the official channels. If y search or want to install a package in a different channel than the ``default``, you have to specify with the ``--channel`` option:: - $ conda search --channel R r-base=3.6 - $ conda create --channel R --name R-3.6 r-base=3.6 + $ conda search --channel R r-base=4.3 + $ conda create --channel R --name R-4.3 r-base=4.3 You can find more information on `Managing channels `__ in conda documentation. @@ -160,8 +160,8 @@ Export a conda environment You could export conda environment in a file. First, you have to activate the environment that you want to import, for example:: - $ conda activate R-3.6 - $ conda env export > R-3.6.yml + $ conda activate R-4.3 + $ conda env export > R-4.3.yml .. hint:: @@ -175,7 +175,7 @@ that you want to import, for example:: during environment import. For such cases, its better to export a conda environment without **build specifications**, like this:: - $ conda env export --no-builds > R-3.6.yml + $ conda env export --no-builds > R-4.3.yml This will track all your package version without the file hash stored in conda channels. This require more time when restoring an environment, however you will @@ -188,7 +188,48 @@ Import a conda environment You could create a new environment relying on the exported file, for example on a different machine:: - $ conda env create -f R-3.6.yml + $ conda env create -f R-4.3.yml + +Conda-pack +~~~~~~~~~~ + +Conda-pack is a tool which allows you to pack a conda environment in a single +file. This file can be moved to a different machine and unpacked in a different +location. This is useful when you want to move a conda environment to a different +machine without internet connection. 
You can install conda-pack with:: + + $ conda install conda-pack + +Then you can pack an environment with:: + + $ conda pack -n R-4.3 -o R-4.3.tar.gz + +.. hint:: + + ``conda-pack`` is already installed in our shared **core** environment using + the default ``base`` conda environment + +.. warning:: + + ``conda-pack`` will made a copy of all dependencies of your environment, thus + the resulting file could be very large. You will make not use of conda packages + caches, consider to use ``conda-pack`` only when is impossible to make an + environment using the standard conda commands. + +You can unpack the environment in a different location with:: + + $ mkdir R-4.3 + $ cd R-4.3 + $ tar -xzf ../R-4.3.tar.gz + $ source bin/activate + +.. hint:: + + If you unpack the environment in the conda environment folder (ie. ``$HOME.conda/envs``), + you can activate the environment without specifying the full path (using the + standard *conda activate* command, like ``conda activate R-4.3``), since conda + will search for environments in the default location. Remember that you have to + create the destination path, since the archive will not create it for you. Remove an environment ~~~~~~~~~~~~~~~~~~~~~ @@ -196,7 +237,7 @@ Remove an environment You can remove an environment by specifying its *name*: this environment shouldn't be active when removing:: - $ conda env remove --name R-3.6 + $ conda env remove --name R-4.3 Conda best practices -------------------- @@ -207,7 +248,7 @@ Specify package version if possible Specifying package version could save a lot of time, for example when you need to resolve dependencies with channels:: - $ conda create --channel conda-forge --channel R --name R-4.0 r-base=4.0 + $ conda create --channel conda-forge --channel R --name R-4.3 r-base=4.3 Clean up ~~~~~~~~ @@ -255,3 +296,43 @@ Remember that when defining environment variables as collection of paths, the de path should be *prepended* to current paths, in order to retrieve the desired files before the other positions. The current path should be updated and not replaced since it could contains useful information. + +.. warning:: + + It's a bad idea to set the ``$PATH`` environment variable using the *config API*, + since when disabling the conda environment, the ``$PATH`` will be unset, causing + your terminal not working correctly. If you need to add a path to ``$PATH``, you + need to manually edit the ``env_vars.sh`` files. Ensure to activate your desidered + environment (in order to resolve the ``$CONDA_PREFIX`` environment variable) and + then: + + .. code-block:: bash + + cd $CONDA_PREFIX + mkdir -p ./etc/conda/activate.d + mkdir -p ./etc/conda/deactivate.d + touch ./etc/conda/activate.d/env_vars.sh + touch ./etc/conda/deactivate.d/env_vars.sh + + Next, edit the ``./etc/conda/activate.d/env_vars.sh`` file and modify the ``$PATH`` + variable, for example: + + .. code-block:: bash + + #!/bin/sh + + export PATH="/home/core/software/sratoolkit/bin:$PATH" + + If you desire, you can restore the previous ``$PATH`` value by editing the + ``./etc/conda/deactivate.d/env_vars.sh`` file: + + .. code-block:: bash + + #!/bin/sh + + # remove a particular directory from $PATH (define a new $PATH without it) + # see: https://unix.stackexchange.com/a/496050 + export PATH=$(echo $PATH | tr ":" "\n" | grep -v '/home/core/software/sratoolkit/bin' | xargs | tr ' ' ':') + + See conda `Manaing environments `__ + for more information. 
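+
+.. hint::
+
+    For variables other than ``$PATH``, the *config API* is usually simpler than
+    editing the ``env_vars.sh`` scripts by hand. A minimal sketch (the variable
+    name and value below are just placeholders, not something required by conda):
+
+    .. code-block:: bash
+
+        # store a variable inside the currently active environment
+        conda env config vars set MY_TOOL_HOME=/home/core/software/mytool
+
+        # list the variables saved in the active environment
+        conda env config vars list
+
+        # re-activate the environment to make the new variable available
+        conda activate R-4.3
+
+    The variable will be set on ``conda activate`` and unset on ``conda deactivate``,
+    without any risk of clobbering ``$PATH``.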
diff --git a/docs/nextflow/customize.rst b/docs/nextflow/customize.rst index ec5781b..6e98b62 100644 --- a/docs/nextflow/customize.rst +++ b/docs/nextflow/customize.rst @@ -7,22 +7,210 @@ Customize a Pipeline Cloning a pipeline ------------------ -The easiest way to modifying an existing pipeline is to clone them from the github +The easiest way to modifying an existing pipeline is to clone it from the github repository:: - $ git clone https://github.com/nf-core/rnaseq + git clone https://github.com/nf-core/rnaseq .. hint:: nextflow itself can clone a pipeline like git does:: - $ nextflow clone nextflow-io/rnaseq + nextflow clone nf-core/rnaseq .. warning:: if you clone a pipeline with ``nextflow clone`` command, ensure that git *remotes* are correct and point to the repository location +Configuring a pipeline +---------------------- + +You can customize a pipeline by creating a custom configuration file. This could +be necessary if you need to lower the requirements of a pipeline, for example, +in order to run a pipeline with limited resources or to avoid to provide pipeline +parameters using the command line interface. You can also specify a custom +configuration file in order to run a pipeline with a different profile, for example +to enable different options required to a specific environment. A custom configuration +file has an higher priority than the default configuration file, but will have a lower +priority than the parameters provided with command line. For a complete list of +configuration options and priorities, please see the +`nextflow config `__ documentation. +Before starting with a new custom configuration file, you should take a look to +the default configuration file provided by the pipeline you are working on. For +a standard nextflow pipeline, the default configuration file is named ``nextflow.config`` +and is located on the root of the pipeline directory. In this file there are defined +the default parameters that affect pipeline execution. In a DSL2 pipeline, you can +also find the ``conf/base.config`` file, in which the requirements for each job +are defined. + +.. hint:: + + Is recommended by the community that the pipeline parameters, like the input files, + the reference database used or user defined values need to be provided by a *parameters* + file, which is defined as a JSON file and is specified with the ``-params-file`` + option. This let you to run a pipeline without + providing parameters using the command line interface. All the parameters which + cannot be specified using the command line interface (for example the amount of + memory required by a certain step) can be defined in the custom configuration file. + +.. warning:: + + Avoid to name your custom config file as ``nextflow.config``, since is a reserved + name for the default configuration file, which is loaded automatically by nextflow + if present in the pipeline directory. If you name your custom configuration file + with a different name, you can control when it's loaded using the ``-c`` or + ``-config`` option when running nextflow. + +Lowering pipeline requirements +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Nextflow let you to specify the amount of resources required by a pipeline step +using process `selectors `__ +in the configuration files. More precisely, in DSL2 pipelines, this requirements +are specified in ``conf/base.config`` file. 
There are mainly two types of selectors:
+``withName`` and ``withLabel``: the first lets you specify the requirements of a
+single process by its name, while the second applies to every process sharing the
+same label. To lower resource requirements, it's better to start by redefining the
+most used labels, like ``process_high`` and ``process_medium``, and then override
+single processes by name. Start with an empty configuration file and add a
+``process`` scope like this::
+
+    process {
+        withLabel:process_single {
+            memory = 1.G
+        }
+        withLabel:process_low {
+            memory = 4.G
+        }
+        withLabel:process_medium {
+            memory = 12.G
+        }
+        withLabel:process_high {
+            memory = 48.G
+        }
+    }
+
+You may want to explore the imported modules to understand which processes will
+be affected by which label.
+To take effect, this file needs to be provided to nextflow with the ``-c``
+or ``-config`` option:
+
+.. code-block:: bash
+
+    nextflow run -c custom.config ...
+
+.. hint::
+
+    Since these parameters will override the default ones, it's better to declare
+    only the minimal set of parameters required by your pipeline.
+
+You can also declare resources dynamically. For example, you can make use of the
+``check_max`` function, but then you need to define ``check_max`` in your custom
+configuration file::
+
+    process {
+        withLabel:process_medium {
+            cpus   = { check_max( 6     * task.attempt, 'cpus'   ) }
+            memory = { check_max( 12.GB * task.attempt, 'memory' ) }
+            time   = { check_max( 8.h   * task.attempt, 'time'   ) }
+        }
+    }
+
+    // Function to ensure that resource requirements don't go beyond
+    // a maximum limit
+    def check_max(obj, type) {
+        if (type == 'memory') {
+            try {
+                if (obj.compareTo(params.max_memory as nextflow.util.MemoryUnit) == 1)
+                    return params.max_memory as nextflow.util.MemoryUnit
+                else
+                    return obj
+            } catch (all) {
+                println "   ### ERROR ###   Max memory '${params.max_memory}' is not valid! Using default value: $obj"
+                return obj
+            }
+        } else if (type == 'time') {
+            try {
+                if (obj.compareTo(params.max_time as nextflow.util.Duration) == 1)
+                    return params.max_time as nextflow.util.Duration
+                else
+                    return obj
+            } catch (all) {
+                println "   ### ERROR ###   Max time '${params.max_time}' is not valid! Using default value: $obj"
+                return obj
+            }
+        } else if (type == 'cpus') {
+            try {
+                return Math.min( obj, params.max_cpus as int )
+            } catch (all) {
+                println "   ### ERROR ###   Max cpus '${params.max_cpus}' is not valid! Using default value: $obj"
+                return obj
+            }
+        }
+    }
+
+The ``--max_cpus``, ``--max_memory`` and ``--max_time`` parameters are the maximum
+allowed values for dynamic job requirements: by setting these parameters you can
+ensure that a *single job* will not allocate more resources than the ones you have
+declared. These parameters have no effect on the *global* resources used or on the
+number of jobs submitted.
+
+.. hint::
+
+    ``--max_cpus``, ``--max_memory`` and ``--max_time`` are parameters that can be
+    provided using the nextflow *params file* or the command line interface.
+
+Provide custom parameters to a process
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Some modules may require additional parameters in order to work correctly.
These parameters can be specified with the ``ext.args`` variable within
+the process scope of the custom configuration file, for example::
+
+    process {
+        withName:process_fastqc {
+            ext.args = '-t 4'
+        }
+    }
+
+When a process is composed of two (or more) tools, you can specify parameters for
+each tool independently using ``ext.args``, ``ext.args2``, ``ext.args3``:
+``ext.args`` will be used for the first tool, ``ext.args2`` for the second and
+so on. In a DSL2 pipeline, the custom variables for each process are defined in
+the ``conf/base.config`` file: take a look at this file to understand which
+variables are already set by default in your pipeline before adding new ones to
+a process.
+
+Create a custom profile
+~~~~~~~~~~~~~~~~~~~~~~~
+
+A profile is a set of parameters that can be used to run a pipeline in a specific
+environment. For example, you can define a profile to run a pipeline in a cluster
+environment, or to run a pipeline using a specific container engine. You can also
+define a profile to run a pipeline with a specific set of parameters, for example
+test data.
+A profile is defined in a configuration file and is selected with the ``-profile``
+option when running nextflow. A profile requires a name, which is used to identify
+it, and a set of parameters. For example, you can define a profile like this in
+your ``custom.config`` file::
+
+    profiles {
+        cineca {
+            process {
+                clusterOptions = { "--partition=g100_usr_prod --qos=normal" }
+            }
+        }
+    }
+
+In this example, each process will be submitted to the ``g100_usr_prod`` partition
+using the ``normal`` quality of service. These parameters depend on the environment
+in which the pipeline is supposed to run: in another environment they would not
+apply, so there's no need to use this specific profile there. You can then call
+your pipeline using the ``-profile`` option::
+
+    nextflow run -profile cineca,singularity ...
+
 Creating a new pipeline
 -----------------------
@@ -37,11 +225,11 @@ The minimal set of files required to have a pipeline is to have locally
 ``main.nf``, ``nextflow.config`` and ``modules.json`` inside your project folder.
 You should have also a ``modules`` directory inside your project::
 
-   $ mkdir -p my-new-pipeline/modules
-   $ cd my-new-pipeline
-   $ touch main.nf nextflow.config modules.json README.md
+   mkdir -p my-new-pipeline/modules
+   cd my-new-pipeline
+   touch main.nf nextflow.config modules.json README.md .nf-core.yml
 
-Next you have to edit modules.json in order to have minimal information::
+Next you have to edit ``modules.json`` in order to have minimal information::
 
     {
      "name": "",
@@ -61,7 +249,7 @@ pipelines using ``nf-core/tools``.
 
 You could also create a new pipeline using the ``nf-core`` template::
 
-   $ nf-core create
+   nf-core create
 
 This template is required if you want to submit your pipeline to the ``nf-core``
 community. Please see the `join the community `__
@@ -75,11 +263,12 @@ Browsing modules list
 
 You can get a list of modules by using ``nf-core/tools`` (see :ref:`here ` how
 you can install it)::
 
-   $ nf-core modules list remote
+   nf-core modules list remote
 
 You could also browse modules inside a different repository and branch, for example::
 
-   $ nf-core modules --github-repository cnr-ibba/nf-modules --branch master list remote
+   nf-core modules --github-repository https://github.com/cnr-ibba/nf-modules.git \
+      --branch master list remote
 
 ..
hint:: @@ -96,7 +285,7 @@ Adding a module to a pipeline You can download and add a module to your pipeline using ``nf-core/tools``:: - $ nf-core modules install --dir . fastqc + nf-core modules install --dir . fastqc .. note:: @@ -108,12 +297,67 @@ You can download and add a module to your pipeline using ``nf-core/tools``:: If you don't provide the module, ``nf-core`` will search and prompt for for a module in ``nf-core/modules`` GitHub repository +Add a simple workflow +~~~~~~~~~~~~~~~~~~~~~ + +In order to have a minimal pipeline, you need to add at least an unnamed workflow +to your pipeline. Moreover, you should declare the input channels and the modules +or the processes you plan to use. Suppose to create a minimal pipeline to do a *fastqc* +analysis on a set of reads. You can install the ``fastqc`` module as described +above and then add a workflow like this in your ``main.nf``:: + + // Declare syntax version + nextflow.enable.dsl=2 + + include { FASTQC } from './modules/nf-core/fastqc/main' + + workflow { + reads_ch = Channel.fromFilePairs(params.input, checkIfExists: true) + .map { it -> + [[id: it[1][0].baseName], it[1]] + } + // .view() + + FASTQC(reads_ch) + } + +In this case ``FASTQC`` expect to receive a channel with *meta* information, so +this is why we create an input channel and then we add *meta* relying on file names. +Please refer to the module ``main.nf`` file to understand how to call a module +and how to pass parameters to it. Next you will need also a minimal +``nextflow.config`` configuration file to run your pipeline, in order +to define where *softwares* could be found, and other useful options:: + + params { + input = null + } + + profiles { + docker { + docker.enabled = true + docker.userEmulation = true + } + } + + docker.registry = 'quay.io' + +Next, you can call your pipeline like this:: + + nextflow run main.nf -profile docker --input "data/*_{1,2}.fastq.gz" + +You can create different workflows and call them in your main workflow, or you +can install a subworkflow as like as you install a module. Also you can add +more options to your ``nextflow.config`` file, or define a custom profile +for modules, in order to provide more options to your pipeline. Please refer +to nextflow documentation to get more information on how to customize your +pipeline. + List all modules in a pipeline ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ You can have a full list of installed modules using:: - $ nf-core modules list local + nf-core modules list local .. _update-a-pipeline-module: @@ -122,14 +366,13 @@ Update a pipeline module You can update a module simple by calling:: - $ nf-core modules update fastqc + nf-core modules update fastqc .. hint:: Call ``nf-core modules update --help`` to get a list of the available options, for example, if you need to install a specific version of a module - Custom pipeline modules ----------------------- @@ -145,21 +388,7 @@ their `documentation `__ is a private - repository (at the moment). In order to browse private repositories with ``nf-core`` - script, you have to configure the `GitHub CLI auth `__:: - - $ gh auth login - - and provide here your credentials for **GitHub.com** (using ``https`` as protocol - an providing a *personal token* with ``repo``, ``read:org``, ``workflow`` scopes - at least). This *CLI* utility will write the ``$HOME/.config/gh/hosts.yml`` - file with your credentials (please, keep it private!!), which is a requirement - to satisfy in order to use ``nf-core`` with private repository modules. 
+ nf-core modules -g https://github.com/cnr-ibba/nf-modules.git list remote Add a custom module to a pipeline ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -167,7 +396,7 @@ Add a custom module to a pipeline To add a custom module to your pipeline, move into your pipeline folder and call ``nf-core install`` with your custom module repository as parameter, for example:: - $ nf-core modules --repository cnr-ibba/nf-modules install freebayes/single + nf-core modules --repository cnr-ibba/nf-modules install freebayes/single Create a new module ~~~~~~~~~~~~~~~~~~~ @@ -182,7 +411,7 @@ The command acts in the same way for both the two scenarios: relying on your pro ``nf-core modules`` will determine if your folder is a pipeline or a *modules* repository clone:: - $ nf-core modules create freebayes/single --author @bunop --label process_high --meta + nf-core modules create freebayes/single --author @bunop --label process_high --meta .. tip:: @@ -200,12 +429,13 @@ modules. The python package ``pytest-workflow`` is a requirement to make such te You need also to specify an environment between ``conda``, ``docker`` or ``singularity`` in order to perform test. Use tags to specify which tests need to be run:: - $ NF_CORE_MODULES_TEST=1 PROFILE=docker pytest --tag freebayes/single --symlink --keep-workflow-wd + NF_CORE_MODULES_TEST=1 PROFILE=docker pytest --symlink --keep-workflow-wd \ + --git-aware --tag freebayes/single You need to check also syntax with ``nf-core`` script by specify which tests to call using *tags*:: - $ nf-core modules lint freebayes/single + nf-core modules lint freebayes/single If you are successful in both tests, you have an higher chance that your tests will be executed without errors in GitHub workflow. @@ -284,24 +514,32 @@ implies different pipeline scripts with differ only for a few things, for exampl where the input files are. If you place your configuration files outside your main script, you can re-use the same parameters within different scripts and keep your main file unmodified: this keeps the stuff simple and let you to focus only -on important changes with your *CVS*. For example, you could define a ``custom.config`` -*JSON* in which specify your specific requirements:: +on important changes with your *CVS*. For example, you could define a +custom ``params.json`` *JSON* config file in which specify your +specific requirements:: - params { - // Input / Output parameters - readPaths = "$baseDir/fastq/*.fastq.gz" - outdir = "results" - - // reference genome - genome = "/path/to/genome.fasta" + { + "readPaths": "$baseDir/fastq/*.fastq.gz", + "outdir": "results", + "genome": "/path/to/genome.fasta" } -An then calling nextflow by providing your custom parameters:: +All the other parameters which cannot be specified using the command line interface +need to be provided in a *custom configuration* file using the standard nextflow +syntax:: + + profiles { + slurm { + process.executor = 'slurm' + process.queue = 'testing' + } + } - $ nextflow run -resume main.nf -c custom.config --profile singularity +Then, you can call nextflow by providing your custom parameters and configuration +file:: -Moreover, by writing specific configuration parameters let you to call a remote -pipeline with ``nextflow run`` without collect nextflow code in your analysis directory. + nextflow run -resume main.nf -params-file params.json \ + -config custom.config -profile singularity .. hint:: @@ -329,7 +567,7 @@ running as intended in the shortest time. 
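+
+If you don't already have a small dataset at hand, you can subsample your real
+reads and use the result as test data. A minimal sketch using ``seqtk`` (the file
+names and read counts below are just examples, and any subsampling tool will do):
+
+.. code-block:: bash
+
+    mkdir -p testdata
+
+    # keep 10000 read pairs, using the same seed (-s) on both files
+    # so that mates stay paired
+    seqtk sample -s100 sample_1.fastq.gz 10000 | gzip > testdata/sample_1.fastq.gz
+    seqtk sample -s100 sample_2.fastq.gz 10000 | gzip > testdata/sample_2.fastq.gz
+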
You should also consider to provide a ``test`` profile with the required parameters which let you to test your pipeline like this:: - $ nextflow run . -profile test,singularity + nextflow run . -profile test,singularity Where the ``test`` profile is specified in ``nextflow.config`` and refers to the *test dataset* you provide with your pipeline:: @@ -348,3 +586,11 @@ the *test dataset* you provide with your pipeline:: This type of test could be used even with CI system, like `GitHub workflow `__. + +Lower resources usage +~~~~~~~~~~~~~~~~~~~~~ + +You should consider to lower the resources required by your pipeline. This will +avoid the costs of allocating more resources than needed and will let you complete +your analysis in a shorter time when resources are limited. +Take a look at `Lowering pipeline requirements`_ documentation section. diff --git a/docs/nextflow/getting-started.rst b/docs/nextflow/getting-started.rst index 7e30708..f767b2c 100644 --- a/docs/nextflow/getting-started.rst +++ b/docs/nextflow/getting-started.rst @@ -96,13 +96,40 @@ but the recommended way is using pip:: $ source activate nf-core $ nf-core --help +.. _configuring_nextflow: + Configuring nextflow -~~~~~~~~~~~~~~~~~~~~ +-------------------- + +Nextflow can be customized in different ways: there are configuration files, +which can be used to customize a single pipeline execution, and environment +variables, which can be used to customize the nextflow runtime and the underlying +Java virtual machine. There's also a ``$HOME/.nextflow/config`` file which can +be used to customize the default configuration of nextflow, for example by limiting +resources usage:: + + executor { + name = 'slurm' + queueSize = 50 + submitRateLimit = '10 sec' + } + +In this way is possible to setup a default configuration for all your pipelines, +by limiting the job submission in order to avoid to overload the cluster scheduler. +Nextflow configuration files are stored in multiple locations, and are loaded in +different order. This means that you can have a default configuration file in +``$HOME/.nextflow/config`` and a pipeline specific configuration file in the +pipeline directory, and the latter will override the former. You could find more +information in the `nextflow documentation `__. +There are some tips for HPC users, please take a look at nextflow forum for +`5 Nextflow Tips for HPC Users `__ +and `Five more tips for Nextflow user on HPC `__ +articles. .. _set-singularity-cache: Setting ``NXF_SINGULARITY_CACHEDIR`` -"""""""""""""""""""""""""""""""""""" +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Using nextflow with singularity lets you to define a directory where remote Singularity images are stored. This could speed up **a lot** pipelines execution times, since images @@ -126,10 +153,65 @@ inside this directory When using a computing cluster it must be a shared folder accessible from all computing nodes. +.. _nextflow_environment_variables: + +Other nextflow environment variables +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +There are others environment variables which could be useful to set in order to +customize your nextflow experience. You could find a list of them in the +`nextflow documentation `__. +Here are a selection of them: + +.. list-table:: Nextflow environment variables + :header-rows: 1 + :widths: 25 50 25 + + * - Name + - Description + - Example + * - NXF_EXECUTOR + - Defines the default process executor + - ``slurm`` + * - NXF_OPTS + - | Provides extra options for the Java and Nextflow runtime. 
+ | It must be a blank separated list of ``-Dkey[=value]`` properties + - ``-Xms500M -Xmx2G`` + * - NXF_SINGULARITY_CACHEDIR + - | Directory where remote Singularity images are stored. + | When using a computing cluster it must be a shared + | folder accessible from all compute nodes. + - ``$WORK/nxf_singularity_cache`` + * - NXF_WORK + - | Directory where working files are stored + | (usually your scratch directory) + - ``"$CINECA_SCRATCH/nxf_work"`` + * - NXF_OFFLINE + - | When true disables the project automatic download and + | update from remote repositories (default: ``false``). + - ``true`` + * - NXF_ANSI_LOG + - | Enables/disables ANSI console output + | (default ``true`` when ANSI terminal is detected). + - ``false`` + +Those environment variables could be set in your ``$HOME/.profile`` (Debian) or +``$HOME/.bash_profile`` (Red-Hat) configuration files, for example: + +.. code-block:: bash + + # Nextflow custom environment variables + export NXF_EXECUTOR=slurm + export NXF_OPTS="-Xms500M -Xmx2G" + export NXF_SINGULARITY_CACHEDIR="$WORK/nxf_singularity_cache" + export NXF_WORK="$CINECA_SCRATCH/nxf_work" + export NXF_OFFLINE='true' + export NXF_ANSI_LOG='false' + .. _nextflow-private-repo: Access to private repositories -"""""""""""""""""""""""""""""" +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The file ``$HOME/.nextflow/scm`` can store the configuration required to access to private repository in GitHub, for example:: @@ -146,7 +228,7 @@ You could find more information in section of nextflow documentation. Access to private nextflow modules -"""""""""""""""""""""""""""""""""" +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ In order to get access to the private `nextflow-modules `__, you need to diff --git a/docs/nextflow/running.rst b/docs/nextflow/running.rst index a3f27c8..fc751ee 100644 --- a/docs/nextflow/running.rst +++ b/docs/nextflow/running.rst @@ -100,6 +100,14 @@ You could search for a specific pipeline by providing a name as an argument:: $ nf-core list rna +You can download a pipeline with its container dependencies. This will be helpful +when running nextflow in an environment without internet connection:: + + $ nf-core download nf-core/rnaseq -r 3.12.0 + +this command let the possibility to amend singularity images in your +``$NXF_SINGULARITY_CACHEDIR``, which means that images will not be placed in the +archive but in your local folder. The most interesting thing is the possibility to configure params with:: $ nf-core launch rnaseq @@ -134,6 +142,21 @@ Nextflow best-practices Here are some tips that could be useful while running nextflow. +Run a pipeline with test data +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +When you run a pipeline for the first time, it's better to use test data in order +to check if the pipeline is working as expected. All the community pipelines have +a ``-profile test`` option which will download a small dataset and run the pipeline +on it. For example, to run the ``nf-core/rnaseq`` pipeline with test data, you can +do:: + + $ nextflow run nf-core/rnaseq -profile test,singularity -resume + +This will also download the required dependencies (like the singularity images). +Next time you will run the pipeline, nextflow will use the cached images and will +not download them again. 
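+
+To make the most of this cache, it is worth pointing ``$NXF_SINGULARITY_CACHEDIR``
+to a shared location before the first run, so that the downloaded images can be
+re-used by other runs and pipelines (see :ref:`set-singularity-cache`). A minimal
+sketch, where the cache path is just an example:
+
+.. code-block:: bash
+
+    # use a shared folder for singularity images, then run the test profile
+    export NXF_SINGULARITY_CACHEDIR="/home/core/nxf_singularity_cache"
+    nextflow run nf-core/rnaseq -profile test,singularity -resume
+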
 Getting information from logs
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/docs/nextflow/trubleshooting.rst b/docs/nextflow/trubleshooting.rst
index 7e2df8b..709ec8b 100644
--- a/docs/nextflow/trubleshooting.rst
+++ b/docs/nextflow/trubleshooting.rst
@@ -183,3 +183,41 @@ this::
 This will download all the requirements and will put nextflow in your current directory.
 Change the nextflow default permissions to ``755`` and move such executable in a directory
 with a higher position in your ``$PATH`` environment, for example ``$HOME/bin``
+
+Cannot execute nextflow interactively
+-------------------------------------
+
+In HPC environments where resources on the login nodes are limited, nextflow cannot
+be executed interactively. In such cases, nextflow needs to be submitted to the job
+scheduler. For example, in a SLURM environment, you can define a nextflow job
+like this:
+
+.. code-block:: bash
+
+    #!/bin/bash
+    #SBATCH --nodes=1                    # 1 node
+    #SBATCH --ntasks-per-node=1          # 1 task per node
+    #SBATCH --cpus-per-task=2            # 2 CPUs per task
+    #SBATCH --time=4-00:00:00            # time limits: see queue and QoS
+    #SBATCH --mem=4G                     # 4GB to manage process
+    #SBATCH --error=nextflow.err         # standard error file
+    #SBATCH --output=nextflow.out        # standard output file
+    #SBATCH --job-name=nf-core-rnaseq    # job name
+    #SBATCH --account=IscrC_NF-PIPE      # account name
+    #SBATCH --partition=g100_usr_prod    # partition name (see https://wiki.u-gov.it/confluence/display/SCAIUS/UG3.3%3A+GALILEO100+UserGuide)
+    #SBATCH --qos=g100_qos_lprod         # quality of service (see https://wiki.u-gov.it/confluence/display/SCAIUS/UG3.3%3A+GALILEO100+UserGuide)
+
+    nextflow run nf-core/rnaseq -r 3.12.0 -profile "singularity,cineca" -resume -config custom.config -params-file rnaseq-nf-params.json
+
+Next, you will need to configure nextflow so that it does not work interactively and
+limits its resource usage. Take a look at the :ref:`environment variables <nextflow_environment_variables>`
+and :ref:`Configuring nextflow <configuring_nextflow>` sections of this guide.
+
+Terminating nextflow execution
+------------------------------
+
+If you need to terminate a nextflow execution, you can send a ``SIGTERM`` signal,
+for example with ``Ctrl+C``. This will terminate all running processes and will
+shut down the pipeline execution, removing the temporary *lock* files. If a running
+process cannot be terminated by nextflow itself, you will need to terminate it
+manually, for example using ``scancel`` in a SLURM environment or by killing the
+process if you are running nextflow with a local executor.
diff --git a/docs/terminal/R.rst b/docs/terminal/R.rst
index 555ff1a..7e7931f 100644
--- a/docs/terminal/R.rst
+++ b/docs/terminal/R.rst
@@ -49,10 +49,10 @@ session. However if you plan to manage packages installation by yourself, if you
 require the most updated packages or your packages need to be installed from source
 since they are not included in conda repositories, the most effective way is to
 install as little as possible from conda and then install your required packages from sources.
-To create a new environment in ``conda`` with the latest version (*4.2.0* at the
+To create a new environment in ``conda`` with the latest version (*4.3.1* at the
 moment) you can do like this::
 
-   conda create --channel R --name R-4.2 r-base=4.2
+   conda create --channel R --name R-4.3 r-base=4.3
 
 .. hint::
 
@@ -95,7 +95,7 @@ in which we add some dependencies require to compile these packages.
Here is how it looks like the singularity ``.def`` file:: Bootstrap: docker - From: rocker/r-base:4.2.0 + From: rocker/r-base:4.3.1 Stage: build %post diff --git a/docs/terminal/ssh.rst b/docs/terminal/ssh.rst index 387bf10..2cd613f 100644 --- a/docs/terminal/ssh.rst +++ b/docs/terminal/ssh.rst @@ -111,6 +111,17 @@ required when using your key pairs. You could reply with no arguments (simply pr Your identification has been saved in /.ssh/id_rsa Your public key has been saved in /.ssh/id_rsa.pub +.. hint:: + + The default paths for public/private rsa key pairs are ``/.ssh/id_rsa`` + and ``/.ssh/id_rsa.pub``. We suggest to keep the default paths for + simplicity. If you specify custom paths or files for key pairs, you will need + to specify the private key path when using SSH, for example to connect to a + remote server with :ref:`ssh ` + when copying files using :ref:`rsync ` with ``ssh`` + remote protocol or using :ref:`scp ` and to mount remote + folders using :ref:`sshfs ` + In case you have already generated a key pair with the same file name, you are prompted if you want to overwrite your key pair:: @@ -236,6 +247,8 @@ need to be placed inside your ``$HOME/.ssh/authorized_keys`` file on remote host ├── authorized_keys └── known_hosts +.. _ssh_folder_permissions: + Moreover, in order to connect, those files need to be accessed only by your user (with the ``700`` and ``600`` ``chmod`` permissions for directory and files respectively):: @@ -335,7 +348,7 @@ option:: Closing a connection """""""""""""""""""" -To exit from the remote terminal and logount from the remote server, simply type:: +To exit from the remote terminal and logout from the remote server, simply type:: $ exit @@ -351,7 +364,7 @@ You could also choose to override global configuration by specifing the same par in the specific remote section. The ``$HOME/.ssh/config`` could be structured like this:: - # these settings are applied everytime you start a ssh connection + # these settings are applied every time you start a ssh connection ServerAliveInterval=60 ServerAliveCountMax=20 ConnectTimeout=60 @@ -369,7 +382,7 @@ this:: IdentityFile /path/to/your/private/id_rsa The ``IdentityFile`` could be used to define your private key location, in order -to not provide your identity file everytime you start a new connection, +to not provide your identity file every time you start a new connection, ``ServerAliveInterval``, ``ServerAliveCountMax`` and ``ConnectTimeout`` are respectively timers which regulate the timeouts when connecting and in sending messages between client and servers. They could be useful when connecting using a unreliable network. @@ -437,6 +450,8 @@ file is updated or not). SCP """ +.. _copy-files-with-scp: + ``scp`` works like linux ``cp`` but support remote origin or destination. Simple prefix your source or destination path with ``@`` as you do when connecting using OpenSSH, for example to copy recursively from a remote folder in @@ -462,7 +477,7 @@ If you want to copy a local folder into a remote folder, simply add the timings: your copied file will have the created/modified time when the copy occurs, and you can't define the most updated file simply relying on date. Moreover, if you remote copy a folder using ``scp``, you will copy the whole directory content, - indipendently if destination files are already present or aren't changed. This + independently if destination files are already present or aren't changed. 
This need to be taken into consideration for example if there are network issues during copying and you need to executing the same command again: for those reasons, ``rsync`` is the recommended way to copy or backup files using ``OpenSSH``. @@ -473,7 +488,7 @@ Rsync .. _copy-files-with-rsync: ``rsync`` is the recommended way to backup or copy files from/to remote services: -it checks contents in destination folder in order to save time and bandwith by copying +it checks contents in destination folder in order to save time and bandwidth by copying only new or modified files. Command is similar to scp, however there are additional parameters that need to be mastered in order to take full advantage of ``rsync``. For example, to copy files from local to remote your could do like this:: @@ -603,7 +618,7 @@ this:: in the previous case, the default values of ``ServerAliveInterval`` and ``ServerAliveCountMax`` are replaced by these new ones, which will be applied only when connecting to ``localhost`` - (for example, when you use *tunnels* to reach remote *ports* through a firewalled network). +(for example, when you use *tunnels* to reach remote *ports* through a firewalled network). ``Host`` syntax supports wildcards, like ``192.168.1.*`` or ``*.ibba.cnr.it``: in these cases, configurations will be applied on all SSH session matching these patterns. @@ -616,9 +631,86 @@ these cases, configurations will be applied on all SSH session matching these pa Please consider to raise up this parameters accordingly your needs but not exceed reasonable times. +SSH Muliplexing +~~~~~~~~~~~~~~~ + +.. epigraph:: + + Multiplexing is the ability to send more than one signal over a single line or + connection. In OpenSSH, multiplexing can re-use an existing outgoing + TCP connection for multiple concurrent SSH sessions to a remote SSH server, + avoiding the overhead of creating a new TCP connection and reauthenticating + each time. + +When using multiplexing you will connect once and then all the other connections +to the same resource will re-use the already defined connection, thus avoiding +the creation of new TCP connection and the negotiation of a secure connection. +This will help a lot when using ``ssh-agent`` or others authentication softwares +like the `step client `__: +you will need to authenticate in a terminal and then you can login from different +terminals or applications like `VSCode `__. +Activities that repeatedly open new connections can be significantly sped up +using multiplexing. In order to use SSH with multiplexing, add this to your +``$HOME/.ssh/config`` file:: + + # inspired from https://superuser.com/a/879696 + Host + HostName + User + ControlPath ~/.ssh/controlmasters/%r@%h:%p + ControlMaster auto + ControlPersist 10m + +The ``Host`` directive can accept the full hostname, the IP address or an alias +for your connection. If you specify an alias, you will need to specify the +full domain name or ip address with the ``HostName`` directive. The +``ControlMaster auto`` directive will create a new multiplex connection if doesn't +exists, will reuse the connection if already established and will remove the +connection after a certain amount of time defined by the ``ControlPersist`` directive. +Next you will need to ensure that the ``ControlPath`` path (without the ``%r@%h:%p`` +token) is present, for example with: + +.. code-block:: bash + + mkdir ~/.ssh/controlmasters + +.. hint:: + + Mind to :ref:`ssh folder permissions ` when dealing + with ssh folder. 
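+
+    For example, a quick way to create the directory with safe permissions
+    (the same ``700`` mode used for the rest of your ``~/.ssh`` folder):
+
+    .. code-block:: bash
+
+        mkdir -p ~/.ssh/controlmasters
+        chmod 700 ~/.ssh/controlmasters
+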
+ +After that you can connect to the remote server using SSH as usual. + +.. hint:: + + You can check connection status using: + + .. code-block:: bash + + ssh -O check @ + + ## Or if you defined a connection alias, like the example + + ssh -O check + +.. warning:: + + If you lose the network connection, the multiplexed connection is not more valid. + You will need to manually remove the ``ControlPath`` file or terminate the + multiplexed connection with: + + .. code-block:: bash + + ssh -O stop @ # Or ssh -O stop + +For more information, see `OpenSSH/Cookbook/Multiplexing `__ +and `How To Reuse SSH Connection... `__ + Mount remote folders using SSH ------------------------------ +.. _mount_using_sshfs: + It is possible to mount a remote folder in your local environment using ``sshfs``. Briefly, this utility lets you to mount a remote folder into your local environment using ``SSH`` as protocol. This has the benefit that you could see the remote @@ -650,6 +742,11 @@ After that, you could mount the remote folder with:: ``ssh`` credentials (this because your local user could be different from your remote user required to create/access files remotely). +.. hint:: + + there's also the `-o follow_symlinks` option, which is useful when mounting + a folder with symlinks pointing outside the mounted folder + If you need to unmount a folder:: $ fusermount -u /mnt/core diff --git a/environment.yml b/environment.yml index 999d6ad..e66b9ab 100644 --- a/environment.yml +++ b/environment.yml @@ -4,47 +4,54 @@ channels: dependencies: - _libgcc_mutex=0.1=main - _openmp_mutex=5.1=1_gnu - - ca-certificates=2022.4.26=h06a4308_0 + - alabaster=0.7.12=pyhd3eb1b0_0 + - babel=2.11.0=py38h06a4308_0 + - brotli-python=1.0.9=py38h6a678d5_7 + - ca-certificates=2023.08.22=h06a4308_0 + - certifi=2023.7.22=py38h06a4308_0 + - cffi=1.15.1=py38h5eee18b_3 + - charset-normalizer=2.0.4=pyhd3eb1b0_0 + - colorama=0.4.6=py38h06a4308_0 + - cryptography=41.0.3=py38hdda0065_0 + - docutils=0.18.1=py38h06a4308_3 + - idna=3.4=py38h06a4308_0 + - imagesize=1.4.1=py38h06a4308_0 + - importlib-metadata=6.0.0=py38h06a4308_0 + - jinja2=3.1.2=py38h06a4308_0 - ld_impl_linux-64=2.38=h1181459_1 - - libffi=3.3=he6710b0_2 + - libffi=3.4.4=h6a678d5_0 - libgcc-ng=11.2.0=h1234567_1 - libgomp=11.2.0=h1234567_1 - libstdcxx-ng=11.2.0=h1234567_1 - - ncurses=6.3=h5eee18b_3 - - openssl=1.1.1q=h7f8727e_0 - - pip=22.1.2=py39h06a4308_0 - - python=3.9.12=h12debd9_1 - - readline=8.1.2=h7f8727e_1 - - setuptools=61.2.0=py39h06a4308_0 - - sqlite=3.38.5=hc218d9a_0 + - markupsafe=2.1.1=py38h7f8727e_0 + - ncurses=6.4=h6a678d5_0 + - openssl=3.0.12=h7f8727e_0 + - packaging=23.1=py38h06a4308_0 + - pip=23.3=py38h06a4308_0 + - pycparser=2.21=pyhd3eb1b0_0 + - pygments=2.15.1=py38h06a4308_1 + - pyopenssl=23.2.0=py38h06a4308_0 + - pysocks=1.7.1=py38h06a4308_0 + - python=3.8.18=h955ad1f_0 + - pytz=2023.3.post1=py38h06a4308_0 + - readline=8.2=h5eee18b_0 + - requests=2.31.0=py38h06a4308_0 + - setuptools=68.0.0=py38h06a4308_0 + - snowballstemmer=2.2.0=pyhd3eb1b0_0 + - sphinx=5.0.2=py38h06a4308_0 + - sphinx-rtd-theme=1.1.1=pyhd3eb1b0_0 + - sphinx_rtd_theme=1.1.1=py38h06a4308_0 + - sphinxcontrib-applehelp=1.0.2=pyhd3eb1b0_0 + - sphinxcontrib-devhelp=1.0.2=pyhd3eb1b0_0 + - sphinxcontrib-htmlhelp=2.0.0=pyhd3eb1b0_0 + - sphinxcontrib-jsmath=1.0.1=pyhd3eb1b0_0 + - sphinxcontrib-qthelp=1.0.3=pyhd3eb1b0_0 + - sphinxcontrib-serializinghtml=1.1.5=pyhd3eb1b0_0 + - sqlite=3.41.2=h5eee18b_0 - tk=8.6.12=h1ccaba5_0 - - tzdata=2022a=hda174b7_0 - - wheel=0.37.1=pyhd3eb1b0_0 - - 
xz=5.2.5=h7f8727e_1 - - zlib=1.2.12=h7f8727e_2 - - pip: - - alabaster==0.7.12 - - babel==2.9.0 - - certifi==2021.10.8 - - chardet==4.0.0 - - docutils==0.16 - - idna==2.10 - - imagesize==1.2.0 - - jinja2==2.11.2 - - markupsafe==1.1.1 - - packaging==20.8 - - pygments==2.7.3 - - pyparsing==2.4.7 - - pytz==2020.4 - - requests==2.25.1 - - snowballstemmer==2.0.0 - - sphinx==4.0.2 - - sphinx-rtd-theme==0.5.2 - - sphinxcontrib-applehelp==1.0.2 - - sphinxcontrib-devhelp==1.0.2 - - sphinxcontrib-htmlhelp==1.0.3 - - sphinxcontrib-jsmath==1.0.1 - - sphinxcontrib-qthelp==1.0.3 - - sphinxcontrib-serializinghtml==1.1.4 - - urllib3==1.26.2 + - urllib3=1.26.18=py38h06a4308_0 + - wheel=0.41.2=py38h06a4308_0 + - xz=5.4.2=h5eee18b_0 + - zipp=3.11.0=py38h06a4308_0 + - zlib=1.2.13=h5eee18b_0 prefix: /home/paolo/.conda/envs/BIOINFO-guidelines diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 09919e8..0000000 --- a/requirements.txt +++ /dev/null @@ -1,24 +0,0 @@ -alabaster==0.7.12 -Babel==2.9.0 -certifi==2021.10.8 -chardet==4.0.0 -docutils==0.16 -idna==2.10 -imagesize==1.2.0 -Jinja2==2.11.2 -MarkupSafe==1.1.1 -packaging==20.8 -Pygments==2.7.3 -pyparsing==2.4.7 -pytz==2020.4 -requests==2.25.1 -snowballstemmer==2.0.0 -Sphinx==4.0.2 -sphinx-rtd-theme==0.5.2 -sphinxcontrib-applehelp==1.0.2 -sphinxcontrib-devhelp==1.0.2 -sphinxcontrib-htmlhelp==1.0.3 -sphinxcontrib-jsmath==1.0.1 -sphinxcontrib-qthelp==1.0.3 -sphinxcontrib-serializinghtml==1.1.4 -urllib3==1.26.2