nf-core · julietmWM · Apr 9, 2025 · Apr 17, 2025 · Apr 17, 2025 · Apr 22, 2025
diff --git a/aws_batch.config b/aws_batch.config
@@ -0,0 +1,36 @@
+/*
+========================================================================================
+    wmg_nextflow/masterworkflow Nextflow AWS Batch config file
+========================================================================================
+    Default config options for AWS Batch
+----------------------------------------------------------------------------------------
+*/
+
+
+params {
+    // AWS Batch job queue and region; reused by the process and aws scopes below
+    awsqueue = 'nextflow-with-dockerhub-aws-batch-large'
+    awsregion = 'us-west-2'
+    // Run label used to build the default output location
+    run = 'default'
+    // Max resource options
+    max_memory                 = '256.GB'
+    max_cpus                   = 256
+    max_time                   = '240.h'
+    outdir = "s3://watchmaker-lts/nanoseq/${params.run}/"
+}
+
+process {
+    executor = 'awsbatch'
+    // Take the queue from params so it is defined in one place and can be overridden
+    queue = params.awsqueue
+}
+
+aws {
+    batch {
+        cliPath = '/home/ec2-user/miniconda/bin/aws'
+    }
+    region = params.awsregion
+}
+
+workDir = "s3://watchmaker-lts/nanoseq/work/"
diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
@@ -49,19 +49,19 @@ def read_head(handle, num_lines=10):
 def check_samplesheet(file_in, updated_path, file_out):
     """
     This function checks that the samplesheet follows the following structure:
-    group,replicate,barcode,input_file,fasta,gtf
-    MCF7,1,,MCF7_directcDNA_replicate1.fastq.gz,genome.fa,
-    MCF7,2,,MCF7_directcDNA_replicate3.fastq.gz,genome.fa,genome.gtf
-    K562,1,,K562_directcDNA_replicate1.fastq.gz,genome.fa,
-    K562,2,,K562_directcDNA_replicate4.fastq.gz,,transcripts.fa
+    group,replicate,barcode,input_file,fasta,gtf,restrander_config
+    MCF7,1,,MCF7_directcDNA_replicate1.fastq.gz,genome.fa,,restrander_config.json
+    MCF7,2,,MCF7_directcDNA_replicate3.fastq.gz,genome.fa,genome.gtf,restrander_config.json
+    K562,1,,K562_directcDNA_replicate1.fastq.gz,genome.fa,,
+    K562,2,,K562_directcDNA_replicate4.fastq.gz,,transcripts.fa,
     """
 
     input_extensions = []
     sample_info_dict = {}
     with open(file_in, "r") as fin:
         ## Check header
         MIN_COLS = 3
-        HEADER = ["group", "replicate", "barcode", "input_file", "fasta", "gtf"]
+        HEADER = ["group", "replicate", "barcode", "input_file", "fasta", "gtf", "restrander_config"]
         header = fin.readline().strip().split(",")
         if header[: len(HEADER)] != HEADER:
             print("ERROR: Please check samplesheet header -> {} != {}".format(",".join(header), ",".join(HEADER)))
@@ -80,7 +80,7 @@ def check_samplesheet(file_in, updated_path, file_out):
                 print_error("Invalid number of populated columns (minimum = {})!".format(MIN_COLS), "Line", line)
 
             ## Check group name entries
-            group, replicate, barcode, input_file, fasta, gtf = lspl[: len(HEADER)]
+            group, replicate, barcode, input_file, fasta, gtf, restrander_config = lspl[: len(HEADER)]
             if group:
                 if group.find(" ") != -1:
                     print_error("Group entry contains spaces!", "Line", line)
@@ -177,8 +177,8 @@ def check_samplesheet(file_in, updated_path, file_out):
                 #    is_transcripts = '1'
                 #    genome = transcriptome
 
-            ## Create sample mapping dictionary = {group: {replicate : [ barcode, input_file, genome, gtf, is_transcripts, nanopolish_fast5 ]}}
-            sample_info = [barcode, input_file, fasta, gtf, is_transcripts, nanopolish_fast5]
+            ## Create sample mapping dictionary = {group: {replicate : [ barcode, input_file, genome, gtf, is_transcripts, nanopolish_fast5, restrander_config ]}}
+            sample_info = [barcode, input_file, fasta, gtf, is_transcripts, nanopolish_fast5, restrander_config]
             if group not in sample_info_dict:
                 sample_info_dict[group] = {}
             if replicate not in sample_info_dict[group]:
@@ -200,7 +200,7 @@ def check_samplesheet(file_in, updated_path, file_out):
         make_dir(out_dir)
         with open(file_out, "w") as fout:
             fout.write(
-                ",".join(["sample", "barcode", "input_file", "fasta", "gtf", "is_transcripts", "nanopolish_fast5"])
+                ",".join(["sample", "barcode", "input_file", "fasta", "gtf", "is_transcripts", "nanopolish_fast5", "restrander_config"])
                 + "\n"
             )
             for sample in sorted(sample_info_dict.keys()):

diff --git a/conf/modules.config b/conf/modules.config
@@ -45,6 +45,16 @@ process {
             saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
         ]
     }
+
+    // Publish RESTRANDER outputs to <outdir>/restrander; saveAs returns null for versions.yml so it is not published
+    withName: RESTRANDER {
+        publishDir = [
+            path: { "${params.outdir}/restrander" },
+            mode: 'copy',
+            enabled: true,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+        ]
+    }
 }
 
 if (!params.skip_demultiplexing) {
@@ -467,6 +477,8 @@ if (params.call_variants) {
                 ]
             }
         }
+
+
     }
     if (params.structural_variant_caller == 'sniffles') {
         process {
@@ -535,6 +547,17 @@ if (params.call_variants) {
 }
 
 if (!params.skip_quantification) {
+    process {
+        withName: RSEQC_GENEBODYCOVERAGE { // publish under <outdir>/rseqc; versions.yml is not published
+            publishDir = [
+                path: { "${params.outdir}/rseqc" },
+                mode: 'copy',
+                enabled: true,
+                saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+            ]
+        }
+    }
+
     if (params.quantification_method == "bambu") {
         process {
             withName: BAMBU {

diff --git a/docs/output.md b/docs/output.md
@@ -46,6 +46,23 @@ _Documentation_:
 _Description_:
 If you would like to run NanoLyse on the raw FASTQ files you can provide `--run_nanolyse` when running the pipeline. By default, the pipeline will filter lambda phage reads. However, you can provide your own FASTA file of "contaminants" with `--nanolyse_fasta`. The filtered FASTQ files will contain raw reads without the specified reference sequences (default: lambda phage sequences).
 
+## cDNA Read Orientation
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `restrander/<SAMPLE>_restrander.fq.gz`: FASTQ file of the stranded reads. The reverse strand reads are replaced with their reverse-complements, ensuring that all reads in the output have the same orientation as the original transcripts.
+- `restrander/<SAMPLE>-unknowns.*_restrander.fq.gz`: FASTQ file of the reads whose strand could not be inferred.
+- `restrander/<SAMPLE>.restrander.json`: Restrander output statistics - includes artefact and strand statistics.
+
+</details>
+
+_Documentation_:
+[Restrander](https://github.com/mritchielab/restrander)
+
+_Description_:
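+Restrander is a program designed for orienting and quality-checking cDNA sequencing reads. It runs automatically for each sample when the protocol is cDNA and a Restrander config file is provided for that sample in the `restrander_config` column of the sample sheet; samples with a blank `restrander_config` are skipped.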
+
 ## Read QC
 
 <details markdown="1">

diff --git a/docs/usage.md b/docs/usage.md
@@ -10,12 +10,13 @@ You will need to create a file with information about the samples in your experi
 
 | Column       | Description                                                                                                                                                                                                                                                                               |
 | ------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `group`      | Group identifier for sample. This will be identical for replicate samples from the same experimental group.                                                                                                                                                                               |
-| `replicate`  | Integer representing replicate number. Must start from `1..<number of replicates>`.                                                                                                                                                                                                       |
-| `barcode`    | Barcode identifier attributed to that sample during multiplexing. Must be an integer.                                                                                                                                                                                                     |
-| `input_file` | Full path to FastQ file if previously demultiplexed, BAM file if previously aligned, or a path to a directory with subdirectories containing fastq or fast5 files. FastQ file has to be zipped and have the extension ".fastq.gz" or ".fq.gz". BAM file has to have the extension ".bam". |
-| `fasta`      | Genome fasta file or transcriptome fasta file for alignment. This can either be a local path, or the appropriate key for a genome available in [iGenomes config file](../conf/igenomes.config). Must have the extension ".fasta", ".fasta.gz", ".fa" or ".fa.gz".                         |
-| `gtf`        | Annotation gtf file for transcript discovery and quantification and RNA modification detection. This can either be blank or a local path. Must have the extension ".gtf".                                                                                                                 |
+| `group`             | Group identifier for sample. This will be identical for replicate samples from the same experimental group.                                                                                                                                                                               |
+| `replicate`         | Integer representing replicate number. Must start from `1..<number of replicates>`.                                                                                                                                                                                                       |
+| `barcode`           | Barcode identifier attributed to that sample during multiplexing. Must be an integer.                                                                                                                                                                                                     |
+| `input_file`        | Full path to FastQ file if previously demultiplexed, BAM file if previously aligned, or a path to a directory with subdirectories containing fastq or fast5 files. FastQ file has to be zipped and have the extension ".fastq.gz" or ".fq.gz". BAM file has to have the extension ".bam". |
+| `fasta`             | Genome fasta file or transcriptome fasta file for alignment. This can either be a local path, or the appropriate key for a genome available in [iGenomes config file](../conf/igenomes.config). Must have the extension ".fasta", ".fasta.gz", ".fa" or ".fa.gz".                         |
+| `gtf`               | Annotation gtf file for transcript discovery and quantification and RNA modification detection. This can either be blank or a local path. Must have the extension ".gtf".                                                                                                                 |
+| `restrander_config` | Restrander .json config file that provides the template-switching oligo (TSO) and reverse transcription primer (RTP) sequences. Different configurations are used for different library preparation protocols. This can either be blank or a file path. If blank, Restrander will not run for the sample.                               |
 
 ### Skip demultiplexing
 
@@ -26,13 +27,13 @@ As shown in the examples below, the accepted samplesheet format is different dep
 ##### Example `samplesheet.csv` for non-demultiplexed fastq inputs
 
 ```bash
-group,replicate,barcode,input_file,fasta,gtf
-WT_MOUSE,1,1,,mm10,
-WT_HUMAN,1,2,,hg19,
-WT_POMBE,1,3,,/path/to/local/genome.fa,
-WT_DENOVO,1,4,,,/path/to/local/transcriptome.fa
-WT_LOCAL,2,5,,/path/to/local/genome.fa,/path/to/local/transcriptome.gtf
-WT_UNKNOWN,3,6,,,
+group,replicate,barcode,input_file,fasta,gtf,restrander_config
+WT_MOUSE,1,1,,mm10,,
+WT_HUMAN,1,2,,hg19,,
+WT_POMBE,1,3,,/path/to/local/genome.fa,,
+WT_DENOVO,1,4,,,/path/to/local/transcriptome.fa,
+WT_LOCAL,2,5,,/path/to/local/genome.fa,/path/to/local/transcriptome.gtf,
+WT_UNKNOWN,3,6,,,,
 ```
 
 ##### Example command for non-demultiplexed fastq inputs
@@ -52,11 +53,11 @@ nextflow run nf-core/nanoseq \
 ##### Example `samplesheet.csv` for demultiplexed fastq inputs
 
 ```bash
-group,replicate,barcode,input_file,fasta,gtf
-WT,1,,SAM101A1.fastq.gz,hg19,
-WT,2,,SAM101A2.fastq.gz,hg19,
-KO,1,,SAM101A3.fastq.gz,hg19,
-KO,2,,SAM101A4.fastq.gz,hg19,
+group,replicate,barcode,input_file,fasta,gtf,restrander_config
+WT,1,,SAM101A1.fastq.gz,hg19,,
+WT,2,,SAM101A2.fastq.gz,hg19,,
+KO,1,,SAM101A3.fastq.gz,hg19,,
+KO,2,,SAM101A4.fastq.gz,hg19,,
 ```
 
 ##### Example command for demultiplexed fastq inputs
@@ -74,11 +75,11 @@ nextflow run nf-core/nanoseq \
 ##### Example `samplesheet.csv` for BAM inputs
 
 ```bash
-group,replicate,barcode,input_file,fasta,gtf
-WT,1,,SAM101A1.bam,hg19,
-WT,2,,SAM101A2.bam,hg19,
-KO,1,,SAM101A3.bam,hg19,
-KO,2,,SAM101A4.bam,hg19,
+group,replicate,barcode,input_file,fasta,gtf,restrander_config
+WT,1,,SAM101A1.bam,hg19,,
+WT,2,,SAM101A2.bam,hg19,,
+KO,1,,SAM101A3.bam,hg19,,
+KO,2,,SAM101A4.bam,hg19,,
 ```
 
 ##### Example command for BAM inputs
@@ -97,11 +98,11 @@ nextflow run nf-core/nanoseq \
 ##### Example `samplesheet.csv` for FAST5 and FASTQ input directories
 
 ```bash
-group,replicate,barcode,input_file,fasta,gtf
-WT,1,,/full/path/to/SAM101A1/,hg19.fasta,hg19.gtf
-WT,2,,/full/path/to/SAM101A2/,hg19.fasta,hg19.gtf
-KO,1,,/full/path/to/SAM101A3/,hg19.fasta,hg19.gtf
-KO,2,,/full/path/to/SAM101A4/,hg19.fasta,hg19.gtf
+group,replicate,barcode,input_file,fasta,gtf,restrander_config
+WT,1,,/full/path/to/SAM101A1/,hg19.fasta,hg19.gtf,
+WT,2,,/full/path/to/SAM101A2/,hg19.fasta,hg19.gtf,
+KO,1,,/full/path/to/SAM101A3/,hg19.fasta,hg19.gtf,
+KO,2,,/full/path/to/SAM101A4/,hg19.fasta,hg19.gtf,
 ```
 
 ##### Each of the FAST5 and FASTQ input directory should have the following structure:
@@ -128,6 +129,20 @@ nextflow run nf-core/nanoseq \
     -profile <docker/singularity/institute>
 ```
 
+### Using Restrander
+
+Restrander is a program used for orienting and quality-checking cDNA sequencing reads. Restrander will automatically run if the protocol is cDNA and a Restrander config file is present in the sample sheet. Examples of Restrander configuration files for several protocols can be found in the [README](https://github.com/jakob-schuster/restrander-vignette?tab=readme-ov-file#configuration-files) for the Restrander vignette. The sample sheet can have a mix of samples with and without Restrander config files.
+
+##### Example `samplesheet.csv` for using Restrander
+
+```bash
+group,replicate,barcode,input_file,fasta,gtf,restrander_config
+WT,1,1,/full/path/to/SAM101A1/,hg19,hg19.gtf,
+WT,2,2,/full/path/to/SAM101A2/,hg19,hg19.gtf,
+KO,1,3,/full/path/to/SAM101A3/,hg19,hg19.gtf,PCB109.json
+KO,2,4,/full/path/to/SAM101A4/,hg19,hg19.gtf,PCB109.json
+```
+
 ## Running the pipeline
 
 The typical command for running the pipeline is as follows:

diff --git a/modules/local/restrander.nf b/modules/local/restrander.nf
@@ -0,0 +1,39 @@
+// Run Restrander on a reads file using a per-sample Restrander config.
+//
+// Input : tuple of the sample meta map, a reads file and a Restrander config file.
+// Output: reads    - files matching *_restrander.fq.gz written by Restrander
+//         metrics  - Restrander's stdout, captured as <prefix>.restrander.json
+//         versions - versions.yml
+process RESTRANDER {
+    tag "$meta.id"
+    label 'process_medium'
+
+    container '912684371407.dkr.ecr.us-west-2.amazonaws.com/restrander:1.2'
+
+    input:
+    tuple val(meta), path(reads), path(input_config)
+
+    output:
+    tuple val(meta), path("*_restrander.fq.gz") , emit: reads
+    tuple val(meta), path("*.restrander.json")  , emit: metrics
+    path "versions.yml"                         , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    // Output names are built from ext.prefix when set, otherwise from the base name of the reads file
+    def prefix = task.ext.prefix ?: reads.getBaseName()
+    // NOTE(review): the version written below is hard-coded and differs from the container tag (1.2) - confirm
+    """
+    /restrander \\
+        ${reads} \\
+        ${prefix}_restrander.fq.gz \\
+        ${input_config} > ${prefix}.restrander.json
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        restrander: v1.0.1
+    END_VERSIONS
+    """
+}
diff --git a/modules/local/rseqc_genebodycoverage.nf b/modules/local/rseqc_genebodycoverage.nf
@@ -0,0 +1,40 @@
+// Run RSeQC's geneBody_coverage.py on a BAM file.
+//
+// Input : tuple of a BAM file, a file staged as `bai` (presumably the BAM index;
+//         it is not referenced in the command - TODO confirm) and the file passed to --refgene.
+// Output: pdf        - *.pdf files left in the task directory
+//         rna_txt_ch - *.geneBodyCoverage.txt; versions - versions.yml
+process RSEQC_GENEBODYCOVERAGE {
+    label 'process_high'
+    container "912684371407.dkr.ecr.us-west-2.amazonaws.com/quay.io/biocontainers/rseqc:3.0.1--py37h516909a_1"
+
+    input:
+    tuple path(bam), path(bai), path(bed12)
+
+    output:
+    path("*.pdf")                  , emit: pdf
+    path("*.geneBodyCoverage.txt") , emit: rna_txt_ch
+    path("versions.yml")           , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    // Output prefix is the BAM file name with a trailing ".bam" removed
+    def out_prefix = bam.getName().replaceAll(/\.bam$/, '')
+
+    """
+    geneBody_coverage.py \\
+        ${args} \\
+        --refgene=${bed12} \\
+        --input=${bam}  \\
+        --minimum_length=100 \\
+        --out-prefix=${out_prefix}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        rseqc: \$(geneBody_coverage.py --version | sed -e "s/geneBody_coverage.py //g")
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/fastqc/main.nf b/modules/nf-core/fastqc/main.nf
diff --git a/modules/nf-core/nanoplot/main.nf b/modules/nf-core/nanoplot/main.nf