diff --git a/aws_batch.config b/aws_batch.config
new file mode 100644
index 00000000..e2eb8984
--- /dev/null
+++ b/aws_batch.config
@@ -0,0 +1,36 @@
+/*
+========================================================================================
+ wmg_nextflow/masterworkflow Nextflow AWS Batch config file
+========================================================================================
+ Default config options for AWS Batch
+----------------------------------------------------------------------------------------
+*/
+
+
+params {
+ awsqueue = 'nextflow-with-dockerhub-aws-batch-large'
+ awsregion = 'us-west-2'
+ run = 'default'
+ // Max resource options
+ max_memory = '256.GB'
+ max_cpus = 256
+ max_time = '240.h'
+ outdir = "s3://watchmaker-lts/nanoseq/${params.run}/"
+}
+
+
+
+
+process {
+ executor = 'awsbatch'
+ queue = 'nextflow-with-dockerhub-aws-batch-large'
+}
+
+aws {
+ batch {
+ cliPath = '/home/ec2-user/miniconda/bin/aws'
+ }
+ region = 'us-west-2'
+}
+
+workDir = "s3://watchmaker-lts/nanoseq/work/"
diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
index a10f4479..32a54035 100755
--- a/bin/check_samplesheet.py
+++ b/bin/check_samplesheet.py
@@ -49,11 +49,11 @@ def read_head(handle, num_lines=10):
def check_samplesheet(file_in, updated_path, file_out):
"""
This function checks that the samplesheet follows the following structure:
- group,replicate,barcode,input_file,fasta,gtf
- MCF7,1,,MCF7_directcDNA_replicate1.fastq.gz,genome.fa,
- MCF7,2,,MCF7_directcDNA_replicate3.fastq.gz,genome.fa,genome.gtf
- K562,1,,K562_directcDNA_replicate1.fastq.gz,genome.fa,
- K562,2,,K562_directcDNA_replicate4.fastq.gz,,transcripts.fa
+ group,replicate,barcode,input_file,fasta,gtf,restrander_config
+ MCF7,1,,MCF7_directcDNA_replicate1.fastq.gz,genome.fa,,restrander_config.json
+ MCF7,2,,MCF7_directcDNA_replicate3.fastq.gz,genome.fa,genome.gtf, restrander_config.json
+ K562,1,,K562_directcDNA_replicate1.fastq.gz,genome.fa,,
+ K562,2,,K562_directcDNA_replicate4.fastq.gz,,transcripts.fa,
"""
input_extensions = []
@@ -61,7 +61,7 @@ def check_samplesheet(file_in, updated_path, file_out):
with open(file_in, "r") as fin:
## Check header
MIN_COLS = 3
- HEADER = ["group", "replicate", "barcode", "input_file", "fasta", "gtf"]
+ HEADER = ["group", "replicate", "barcode", "input_file", "fasta", "gtf", "restrander_config"]
header = fin.readline().strip().split(",")
if header[: len(HEADER)] != HEADER:
print("ERROR: Please check samplesheet header -> {} != {}".format(",".join(header), ",".join(HEADER)))
@@ -80,7 +80,7 @@ def check_samplesheet(file_in, updated_path, file_out):
print_error("Invalid number of populated columns (minimum = {})!".format(MIN_COLS), "Line", line)
## Check group name entries
- group, replicate, barcode, input_file, fasta, gtf = lspl[: len(HEADER)]
+ group, replicate, barcode, input_file, fasta, gtf, restrander_config = lspl[: len(HEADER)]
if group:
if group.find(" ") != -1:
print_error("Group entry contains spaces!", "Line", line)
@@ -177,8 +177,8 @@ def check_samplesheet(file_in, updated_path, file_out):
# is_transcripts = '1'
# genome = transcriptome
- ## Create sample mapping dictionary = {group: {replicate : [ barcode, input_file, genome, gtf, is_transcripts, nanopolish_fast5 ]}}
- sample_info = [barcode, input_file, fasta, gtf, is_transcripts, nanopolish_fast5]
+ ## Create sample mapping dictionary = {group: {replicate : [ barcode, input_file, genome, gtf, is_transcripts, nanopolish_fast5, restrander_config ]}}
+ sample_info = [barcode, input_file, fasta, gtf, is_transcripts, nanopolish_fast5, restrander_config]
if group not in sample_info_dict:
sample_info_dict[group] = {}
if replicate not in sample_info_dict[group]:
@@ -200,7 +200,7 @@ def check_samplesheet(file_in, updated_path, file_out):
make_dir(out_dir)
with open(file_out, "w") as fout:
fout.write(
- ",".join(["sample", "barcode", "input_file", "fasta", "gtf", "is_transcripts", "nanopolish_fast5"])
+ ",".join(["sample", "barcode", "input_file", "fasta", "gtf", "is_transcripts", "nanopolish_fast5", "restrander_config"])
+ "\n"
)
for sample in sorted(sample_info_dict.keys()):
diff --git a/conf/modules.config b/conf/modules.config
index ba13b442..40a066c6 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -45,6 +45,16 @@ process {
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}
+
+ // Publish dir for RESTRANDER
+ withName: RESTRANDER {
+ publishDir = [
+ path: { "${params.outdir}/restrander" },
+ mode: 'copy',
+ enabled: true,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
}
if (!params.skip_demultiplexing) {
@@ -467,6 +477,8 @@ if (params.call_variants) {
]
}
}
+
+
}
if (params.structural_variant_caller == 'sniffles') {
process {
@@ -535,6 +547,17 @@ if (params.call_variants) {
}
if (!params.skip_quantification) {
+ process {
+ withName: RSEQC_GENEBODYCOVERAGE {
+ publishDir = [
+ path: { "${params.outdir}/rseqc" },
+ mode: 'copy',
+ enabled: true,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+ }
+
if (params.quantification_method == "bambu") {
process {
withName: BAMBU {
diff --git a/docs/output.md b/docs/output.md
index 42a1a00c..5cb7af6b 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -46,6 +46,23 @@ _Documentation_:
_Description_:
If you would like to run NanoLyse on the raw FASTQ files you can provide `--run_nanolyse` when running the pipeline. By default, the pipeline will filter lambda phage reads. However, you can provide your own FASTA file of "contaminants" with `--nanolyse_fasta`. The filtered FASTQ files will contain raw reads without the specified reference sequences (default: lambda phage sequences).
+## cDNA Read Orientation
+
+
+Output files
+
+- `restrander/_restrander.fq.gz`: FASTQ file of the stranded reads. The reverse strand reads are replaced with their reverse-complements, ensuring that all reads in the output have the same orientation as the original transcripts.
+- `restrander/-unknowns.*_restrander.fq.gz`: FASTQ file of the reads whose strand could not be inferred.
+- `restrander/.restrander.json`: Restrander output statistics - includes artefact and strand statistics.
+
+
+
+_Documentation_:
+[Restrander](https://github.com/mritchielab/restrander)
+
+_Description_:
+Restrander is a program designed for orienting and quality-checking cDNA sequencing reads. Restrander will run automatically if the protocol is cDNA and a Restrander config file is present in the sample sheet.
+
## Read QC
diff --git a/docs/usage.md b/docs/usage.md
index 97dd0a5a..2de4586e 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -10,12 +10,13 @@ You will need to create a file with information about the samples in your experi
| Column | Description |
| ------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `group` | Group identifier for sample. This will be identical for replicate samples from the same experimental group. |
-| `replicate` | Integer representing replicate number. Must start from `1..`. |
-| `barcode` | Barcode identifier attributed to that sample during multiplexing. Must be an integer. |
-| `input_file` | Full path to FastQ file if previously demultiplexed, BAM file if previously aligned, or a path to a directory with subdirectories containing fastq or fast5 files. FastQ file has to be zipped and have the extension ".fastq.gz" or ".fq.gz". BAM file has to have the extension ".bam". |
-| `fasta` | Genome fasta file or transcriptome fasta file for alignment. This can either be a local path, or the appropriate key for a genome available in [iGenomes config file](../conf/igenomes.config). Must have the extension ".fasta", ".fasta.gz", ".fa" or ".fa.gz". |
-| `gtf` | Annotation gtf file for transcript discovery and quantification and RNA modification detection. This can either be blank or a local path. Must have the extension ".gtf". |
+| `group` | Group identifier for sample. This will be identical for replicate samples from the same experimental group. |
+| `replicate` | Integer representing replicate number. Must start from `1..`. |
+| `barcode` | Barcode identifier attributed to that sample during multiplexing. Must be an integer. |
+| `input_file` | Full path to FastQ file if previously demultiplexed, BAM file if previously aligned, or a path to a directory with subdirectories containing fastq or fast5 files. FastQ file has to be zipped and have the extension ".fastq.gz" or ".fq.gz". BAM file has to have the extension ".bam". |
+| `fasta` | Genome fasta file or transcriptome fasta file for alignment. This can either be a local path, or the appropriate key for a genome available in [iGenomes config file](../conf/igenomes.config). Must have the extension ".fasta", ".fasta.gz", ".fa" or ".fa.gz". |
+| `gtf` | Annotation gtf file for transcript discovery and quantification and RNA modification detection. This can either be blank or a local path. Must have the extension ".gtf". |
+| `restrander_config` | Restrander .json config file that provides the template-switching oligo (TSO) and reverse transcription primer (RTP) sequences. Different configurations are used for different library preparation protocols. This can either be blank or a file path. If blank, Restrander will not run for the sample. |
### Skip demultiplexing
@@ -26,13 +27,13 @@ As shown in the examples below, the accepted samplesheet format is different dep
##### Example `samplesheet.csv` for non-demultiplexed fastq inputs
```bash
-group,replicate,barcode,input_file,fasta,gtf
-WT_MOUSE,1,1,,mm10,
-WT_HUMAN,1,2,,hg19,
-WT_POMBE,1,3,,/path/to/local/genome.fa,
-WT_DENOVO,1,4,,,/path/to/local/transcriptome.fa
-WT_LOCAL,2,5,,/path/to/local/genome.fa,/path/to/local/transcriptome.gtf
-WT_UNKNOWN,3,6,,,
+group,replicate,barcode,input_file,fasta,gtf,restrander_config
+WT_MOUSE,1,1,,mm10,,
+WT_HUMAN,1,2,,hg19,,
+WT_POMBE,1,3,,/path/to/local/genome.fa,,
+WT_DENOVO,1,4,,,/path/to/local/transcriptome.fa,
+WT_LOCAL,2,5,,/path/to/local/genome.fa,/path/to/local/transcriptome.gtf,
+WT_UNKNOWN,3,6,,,,
```
##### Example command for non-demultiplexed fastq inputs
@@ -52,11 +53,11 @@ nextflow run nf-core/nanoseq \
##### Example `samplesheet.csv` for demultiplexed fastq inputs
```bash
-group,replicate,barcode,input_file,fasta,gtf
-WT,1,,SAM101A1.fastq.gz,hg19,
-WT,2,,SAM101A2.fastq.gz,hg19,
-KO,1,,SAM101A3.fastq.gz,hg19,
-KO,2,,SAM101A4.fastq.gz,hg19,
+group,replicate,barcode,input_file,fasta,gtf,restrander_config
+WT,1,,SAM101A1.fastq.gz,hg19,,
+WT,2,,SAM101A2.fastq.gz,hg19,,
+KO,1,,SAM101A3.fastq.gz,hg19,,
+KO,2,,SAM101A4.fastq.gz,hg19,,
```
##### Example command for demultiplexed fastq inputs
@@ -74,11 +75,11 @@ nextflow run nf-core/nanoseq \
##### Example `samplesheet.csv` for BAM inputs
```bash
-group,replicate,barcode,input_file,fasta,gtf
-WT,1,,SAM101A1.bam,hg19,
-WT,2,,SAM101A2.bam,hg19,
-KO,1,,SAM101A3.bam,hg19,
-KO,2,,SAM101A4.bam,hg19,
+group,replicate,barcode,input_file,fasta,gtf,restrander_config
+WT,1,,SAM101A1.bam,hg19,,
+WT,2,,SAM101A2.bam,hg19,,
+KO,1,,SAM101A3.bam,hg19,,
+KO,2,,SAM101A4.bam,hg19,,
```
##### Example command for BAM inputs
@@ -97,11 +98,11 @@ nextflow run nf-core/nanoseq \
##### Example `samplesheet.csv` for FAST5 and FASTQ input directories
```bash
-group,replicate,barcode,input_file,fasta,gtf
-WT,1,,/full/path/to/SAM101A1/,hg19.fasta,hg19.gtf
-WT,2,,/full/path/to/SAM101A2/,hg19.fasta,hg19.gtf
-KO,1,,/full/path/to/SAM101A3/,hg19.fasta,hg19.gtf
-KO,2,,/full/path/to/SAM101A4/,hg19.fasta,hg19.gtf
+group,replicate,barcode,input_file,fasta,gtf,restrander_config
+WT,1,,/full/path/to/SAM101A1/,hg19.fasta,hg19.gtf,
+WT,2,,/full/path/to/SAM101A2/,hg19.fasta,hg19.gtf,
+KO,1,,/full/path/to/SAM101A3/,hg19.fasta,hg19.gtf,
+KO,2,,/full/path/to/SAM101A4/,hg19.fasta,hg19.gtf,
```
##### Each of the FAST5 and FASTQ input directory should have the following structure:
@@ -128,6 +129,20 @@ nextflow run nf-core/nanoseq \
-profile
```
+### Using Restrander
+
+Restrander is a program used for orienting and quality-checking cDNA sequencing reads. Restrander will automatically run if the protocol is cDNA and a Restrander config file is present in the sample sheet. Examples of Restrander configuration files for several protocols can be found in the [README](https://github.com/jakob-schuster/restrander-vignette?tab=readme-ov-file#configuration-files) for the Restrander vignette. The sample sheet can have a mix of samples with and without Restrander config files.
+
+##### Example `samplesheet.csv` for using Restrander
+
+```bash
+group,replicate,barcode,input_file,fasta,gtf,restrander_config
+WT,1,1,/full/path/to/SAM101A1/,hg19,hg19.gtf,
+WT,2,2,/full/path/to/SAM101A2/,hg19,hg19.gtf,
+KO,1,3,/full/path/to/SAM101A3/,hg19,hg19.gtf,PCB109.json
+KO,2,4,/full/path/to/SAM101A4/,hg19,hg19.gtf,PCB109.json
+```
+
## Running the pipeline
The typical command for running the pipeline is as follows:
diff --git a/modules/local/restrander.nf b/modules/local/restrander.nf
new file mode 100644
index 00000000..0ea848d3
--- /dev/null
+++ b/modules/local/restrander.nf
@@ -0,0 +1,31 @@
+process RESTRANDER {
+ tag "$meta.id"
+ label 'process_medium'
+
+ container "${'912684371407.dkr.ecr.us-west-2.amazonaws.com/restrander:1.2'}"
+
+ input:
+ tuple val(meta), path(reads), path(input_config)
+
+ output:
+ tuple val(meta), path("*_restrander.fq.gz") , emit: reads
+ tuple val(meta), path("*.restrander.json") , emit: metrics
+ path "versions.yml" , emit: versions
+
+ when:
+ task.ext.when == null || task.ext.when
+
+ script:
+ def prefix = task.ext.prefix ?: reads.getBaseName()
+ """
+ /restrander \\
+ ${reads} \\
+ ${prefix}_restrander.fq.gz \\
+ ${input_config} > ${prefix}.restrander.json
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ restrander: v1.0.1
+ END_VERSIONS
+ """
+}
diff --git a/modules/local/rseqc_genebodycoverage.nf b/modules/local/rseqc_genebodycoverage.nf
new file mode 100644
index 00000000..c911a823
--- /dev/null
+++ b/modules/local/rseqc_genebodycoverage.nf
@@ -0,0 +1,33 @@
+process RSEQC_GENEBODYCOVERAGE {
+ label 'process_high'
+ container "912684371407.dkr.ecr.us-west-2.amazonaws.com/quay.io/biocontainers/rseqc:3.0.1--py37h516909a_1"
+
+ input:
+ tuple path(bam), path(bai), path(bed12)
+
+ output:
+ path("*.pdf") , emit: pdf
+ path("*.geneBodyCoverage.txt") , emit: rna_txt_ch
+ path("versions.yml") , emit: versions
+
+ when:
+ task.ext.when == null || task.ext.when
+
+ script:
+ def args = task.ext.args ?: ''
+ def name = bam.getName().replaceAll(/\.bam$/, '')
+
+ """
+ geneBody_coverage.py \\
+ $args \\
+ --refgene=$bed12 \\
+ --input=$bam \\
+ --minimum_length=100 \\
+ --out-prefix=${name}
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ rseqc: \$(geneBody_coverage.py --version | sed -e "s/geneBody_coverage.py //g")
+ END_VERSIONS
+ """
+}
diff --git a/modules/nf-core/fastqc/main.nf b/modules/nf-core/fastqc/main.nf
index 9ae58381..e4d47b97 100644
--- a/modules/nf-core/fastqc/main.nf
+++ b/modules/nf-core/fastqc/main.nf
@@ -1,6 +1,6 @@
process FASTQC {
tag "$meta.id"
- label 'process_medium'
+ label 'process_high'
conda "bioconda::fastqc=0.11.9"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
diff --git a/modules/nf-core/nanoplot/main.nf b/modules/nf-core/nanoplot/main.nf
index ca0d8454..7ba90a8b 100644
--- a/modules/nf-core/nanoplot/main.nf
+++ b/modules/nf-core/nanoplot/main.nf
@@ -22,12 +22,16 @@ process NANOPLOT {
script:
def args = task.ext.args ?: ''
- def input_file = ("$ontfile".endsWith(".fastq.gz")) ? "--fastq ${ontfile}" :
+ def input_file = ("$ontfile".endsWith(".fastq.gz") || "$ontfile".endsWith(".fq.gz") || "$ontfile".endsWith(".fastq") || "$ontfile".endsWith(".fq")) ? "--fastq ${ontfile}" :
("$ontfile".endsWith(".txt")) ? "--summary ${ontfile}" : ''
"""
NanoPlot \\
$args \\
+ --store \\
+ --raw \\
-t $task.cpus \\
+ -p $meta.id \\
+ --tsv_stats \\
$input_file
cat <<-END_VERSIONS > versions.yml
"${task.process}":
diff --git a/modules/nf-core/seqtk/main.nf b/modules/nf-core/seqtk/main.nf
new file mode 100644
index 00000000..74c1163d
--- /dev/null
+++ b/modules/nf-core/seqtk/main.nf
@@ -0,0 +1,58 @@
+process SEQTK_SAMPLE {
+ tag "$meta.id"
+ label 'process_single'
+
+ conda "${moduleDir}/environment.yml"
+ container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://depot.galaxyproject.org/singularity/seqtk:1.4--he4a0461_1' :
+ 'quay.io/biocontainers/seqtk:1.4--he4a0461_1' }"
+
+ input:
+ tuple val(meta), path(reads), val(sample_size)
+
+ output:
+ tuple val(meta), path("*.fastq.gz"), emit: reads
+ path "versions.yml" , emit: versions
+
+ when:
+ task.ext.when == null || task.ext.when
+
+ script:
+ def args = task.ext.args ?: ''
+ def prefix = task.ext.prefix ?: "${meta.id}"
+ if (!(args ==~ /.*\ -s\ ?[0-9]+.*/)) {
+ args += " -s100"
+ }
+ if ( !sample_size ) {
+ error "SEQTK/SAMPLE must have a sample_size value included"
+ }
+ """
+ printf "%s\\n" $reads | while read f;
+ do
+ seqtk \\
+ sample \\
+ $args \\
+ \$f \\
+ $sample_size \\
+ | gzip --no-name > ${prefix}_\$(basename \$f)
+ done
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ seqtk: \$(echo \$(seqtk 2>&1) | sed 's/^.*Version: //; s/ .*\$//')
+ END_VERSIONS
+ """
+
+ stub:
+ def prefix = task.ext.prefix ?: "${meta.id}"
+
+ """
+ echo "" | gzip > ${prefix}.fastq.gz
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ seqtk: \$(echo \$(seqtk 2>&1) | sed 's/^.*Version: //; s/ .*\$//')
+ END_VERSIONS
+ """
+
+}
\ No newline at end of file
diff --git a/subworkflows/local/align_minimap2.nf b/subworkflows/local/align_minimap2.nf
index 70693a8a..ed0a94b8 100644
--- a/subworkflows/local/align_minimap2.nf
+++ b/subworkflows/local/align_minimap2.nf
@@ -21,8 +21,8 @@ workflow ALIGN_MINIMAP2 {
ch_index
.cross(ch_fastq) { it -> it[-1] }
.flatten()
- .collate(13)
- .map { it -> [ it[7], it[8], it[0], it[1], it[2], it[3], it[4], it[5] ] } // [ sample, fastq, fasta, sizes, gtf, bed, is_transcripts, index ]
+ .collate(14)
+ .map { it -> [ it[7], it[8], it[0], it[1], it[2], it[3], it[4], it[5] ] } // [ meta, fastq, fasta, sizes, gtf, bed, is_transcripts, index ]
.set { ch_index }
/*
diff --git a/subworkflows/local/bedtools_ucsc_bigbed.nf b/subworkflows/local/bedtools_ucsc_bigbed.nf
index 0a8d94b3..33216e77 100644
--- a/subworkflows/local/bedtools_ucsc_bigbed.nf
+++ b/subworkflows/local/bedtools_ucsc_bigbed.nf
@@ -26,6 +26,7 @@ workflow BEDTOOLS_UCSC_BIGBED {
emit:
bedtools_version
+ ch_bed12
ch_bigbed
bed12tobigbed_version
}
diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index f38041df..ad8f9440 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -27,8 +27,11 @@ workflow INPUT_CHECK {
// Function to resolve fasta and gtf file if using iGenomes
// Returns [ sample, input_file, barcode, fasta, gtf, is_transcripts, annotation_str, nanopolish_fast5 ]
def get_sample_info(LinkedHashMap sample, LinkedHashMap genomeMap) {
+ print(sample)
def meta = [:]
meta.id = sample.sample
+ meta.restrander_config = sample.restrander_config
+
// Resolve fasta and gtf file if using iGenomes
def fasta = false
diff --git a/subworkflows/local/prepare_genome.nf b/subworkflows/local/prepare_genome.nf
index e6047cc0..189a9642 100644
--- a/subworkflows/local/prepare_genome.nf
+++ b/subworkflows/local/prepare_genome.nf
@@ -11,10 +11,10 @@ workflow PREPARE_GENOME {
ch_fastq
main:
- // Get unique list of all fasta files
+ // Get unique list of all fasta files - reference FASTA is at position 2
ch_fastq
.filter { it[2] }
- .map { it -> [ it[2], it[5].toString() ] } // [ fasta, annotation_str ]
+ .map { it -> [ it[2], it[6].toString() ] } // [ fasta, annotation_str ]
.unique()
.set { ch_fastq_sizes }
@@ -25,10 +25,10 @@ workflow PREPARE_GENOME {
ch_chrom_sizes = GET_CHROM_SIZES.out.sizes
samtools_version = GET_CHROM_SIZES.out.versions
- // Get unique list of all gtf files
+ // Get unique list of all gtf files - GTF is at position 3 in the tuple
ch_fastq
.filter { it[3] }
- .map { it -> [ it[3], it[5] ] } // [ gtf, annotation_str ]
+ .map { it -> [ it[3], it[6] ] } // [ gtf, annotation_str ]
.unique()
.set { ch_fastq_gtf }
@@ -44,8 +44,8 @@ workflow PREPARE_GENOME {
.map { it -> [ it[1], it[2], it[0] ] }
.cross(ch_fastq) { it -> it[-1] }
.flatten()
- .collate(9)
- .map { it -> [ it[5], it[0], it[6], it[1], it[7], it[8] ]} // [ fasta, sizes, gtf, bed, is_transcripts, annotation_str ]
+ .collate(10)
+ .map { it -> [ it[5], it[0], it[6], it[1], it[8], it[9] ]} // [ fasta, sizes, gtf, bed, is_transcripts, annotation_str ]
.unique()
.set { ch_fasta_index }
@@ -54,7 +54,7 @@ workflow PREPARE_GENOME {
*/
ch_fastq
.filter { it[2] }
- .map { it -> [ it[0], it[2] ] } // [ gtf, annotation_str ]
+ .map { it -> [ it[0], it[2] ] } // [ meta, fasta ]
.unique()
.set { ch_fasta }
diff --git a/workflows/nanoseq.nf b/workflows/nanoseq.nf
index ba1c377e..e20583e1 100644
--- a/workflows/nanoseq.nf
+++ b/workflows/nanoseq.nf
@@ -89,6 +89,12 @@ if (!params.skip_quantification) {
}
}
+if (params.downsample_depth) {
+ if (params.downsample_depth < 1 ) {
+ exit 1, "Invalid downsampling value: ${params.downsample_depth}. Must be greater than 0."
+ }
+}
+
////////////////////////////////////////////////////
/* -- CONFIG FILES -- */
////////////////////////////////////////////////////
@@ -105,12 +111,15 @@ include { GET_NANOLYSE_FASTA } from '../modules/local/get_nanolyse_fasta'
include { QCAT } from '../modules/local/qcat'
include { BAM_RENAME } from '../modules/local/bam_rename'
include { BAMBU } from '../modules/local/bambu'
+include { RSEQC_GENEBODYCOVERAGE} from '../modules/local/rseqc_genebodycoverage'
include { MULTIQC } from '../modules/local/multiqc'
+include { RESTRANDER } from '../modules/local/restrander'
/*
* SUBWORKFLOW: Consisting of a mix of local and nf-core/modules
*/
+
include { INPUT_CHECK } from '../subworkflows/local/input_check'
include { PREPARE_GENOME } from '../subworkflows/local/prepare_genome'
include { QCFASTQ_NANOPLOT_FASTQC } from '../subworkflows/local/qcfastq_nanoplot_fastqc'
@@ -134,6 +143,7 @@ include { RNA_FUSIONS_JAFFAL } from '../subworkflows/local/rna_fus
* MODULE: Installed directly from nf-core/modules
*/
include { NANOLYSE } from '../modules/nf-core/nanolyse/main'
+include { SEQTK_SAMPLE } from '../modules/nf-core/seqtk/main'
include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main'
/*
@@ -209,6 +219,23 @@ workflow NANOSEQ{
ch_fastq = Channel.empty()
}
}
+ // check if we need to do downsampling on ch_fastq if so then do it and update
+
+ if (params.downsample_depth) {
+
+ ch_fastq
+ .map { it -> [ it[0], it[1], params.downsample_depth ] }
+ .set { ch_for_seqtk }
+
+ SEQTK_SAMPLE( ch_for_seqtk )
+ ch_software_versions = ch_software_versions.mix(SEQTK_SAMPLE.out.versions)
+
+ SEQTK_SAMPLE.out.reads.join(ch_fastq).set{ joined_seqtk}
+
+ joined_seqtk
+ .map { it -> [ it[0], it[1], it[3], it[4], it[5], it[6] ] }
+ .set { ch_fastq }
+ }
if (params.run_nanolyse) {
ch_fastq
@@ -230,6 +257,7 @@ workflow NANOSEQ{
* MODULE: DNA contaminant removal using NanoLyse
*/
NANOLYSE ( ch_fastq_nanolyse, ch_nanolyse_fasta )
+
NANOLYSE.out.fastq
.join( ch_sample )
.map { it -> [ it[0], it[1], it[3], it[4], it[5], it[6] ]}
@@ -237,6 +265,47 @@ workflow NANOSEQ{
ch_software_versions = ch_software_versions.mix(NANOLYSE.out.versions.first().ifEmpty(null))
}
+ if (params.protocol == 'cDNA'){
+
+ // split the fastq channel into two branches - samples with and without restrander_config
+ ch_fastq.branch{
+ config_provided: it[0].restrander_config != null && it[0].restrander_config != ''
+ no_config: it[0].restrander_config == null || it[0].restrander_config == ''
+ }.set { ch_fastq_branch }
+
+ // only run Restrander on the branch with config provided
+ ch_fastq_branch.config_provided
+ .map { it -> [ it[0], it[1], it[0].restrander_config] }
+ .set { ch_fastq_restrander }
+
+ /*
+ * MODULE: Orientate and quality check cDNA reads with Restrander
+ */
+ RESTRANDER ( ch_fastq_restrander )
+
+ RESTRANDER.out.reads
+ .join(ch_fastq_branch.config_provided, by: 0)
+ .map { tuple ->
+ def meta = tuple[0]
+ def restranded_files = tuple[1]
+ def main_restranded_file = restranded_files[1]
+
+
+ [ meta, main_restranded_file, tuple[3], tuple[4], tuple[5], tuple[6] ]
+ }
+ // merge the restranded files with the rest of the fastq files
+ .mix(ch_fastq_branch.no_config)
+ .set { ch_fastq }
+
+ ch_software_versions = ch_software_versions.mix(RESTRANDER.out.versions.first().ifEmpty(null))
+ }
+
+ // Extract the GTF file from combined string, add it as own element in the channel
+ ch_fastq
+ .map { it -> [ it[0], it[1], it[2], it[5].toString().split(';')[1],
+ it[3], it[4], it[5] ] }
+ .set { ch_fastq }
+
ch_fastqc_multiqc = Channel.empty()
if (!params.skip_qc) {
@@ -361,6 +430,9 @@ workflow NANOSEQ{
// MULTIPLE_CONDITIONS = ch_sample.map { it -> it[0].split('_')[0..-2].join('_') }.unique().count().val > 1
ch_r_version = Channel.empty()
+
+
+
if (params.quantification_method == 'bambu') {
ch_sample
.map { it -> [ it[2], it[3] ]}
@@ -387,6 +459,13 @@ workflow NANOSEQ{
ch_featurecounts_gene_multiqc = QUANTIFY_STRINGTIE_FEATURECOUNTS.out.featurecounts_gene_multiqc.ifEmpty([])
ch_featurecounts_transcript_multiqc = QUANTIFY_STRINGTIE_FEATURECOUNTS.out.featurecounts_transcript_multiqc.ifEmpty([])
}
+
+ RSEQC_GENEBODYCOVERAGE (
+ ch_view_sortbam
+ .join( BEDTOOLS_UCSC_BIGBED.out.ch_bed12 )
+ .map { it -> [ it[3], it[4], it[6] ] }
+ )
+
if (!params.skip_differential_analysis) {
/*