diff --git a/aws_batch.config b/aws_batch.config new file mode 100644 index 00000000..e2eb8984 --- /dev/null +++ b/aws_batch.config @@ -0,0 +1,36 @@ +/* +======================================================================================== + wmg_nextflow/masterworkflow Nextflow AWS Batch config file +======================================================================================== + Default config options for AWS Batch +---------------------------------------------------------------------------------------- +*/ + + +params { + awsqueue = 'nextflow-with-dockerhub-aws-batch-large' + awsregion = 'us-west-2' + run = 'default' + // Max resource options + max_memory = '256.GB' + max_cpus = 256 + max_time = '240.h' + outdir = "s3://watchmaker-lts/nanoseq/${params.run}/" +} + + + + +process { + executor = 'awsbatch' + queue = 'nextflow-with-dockerhub-aws-batch-large' +} + +aws { + batch { + cliPath = '/home/ec2-user/miniconda/bin/aws' + } + region = 'us-west-2' +} + +workDir = "s3://watchmaker-lts/nanoseq/work/" diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index a10f4479..32a54035 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -49,11 +49,11 @@ def read_head(handle, num_lines=10): def check_samplesheet(file_in, updated_path, file_out): """ This function checks that the samplesheet follows the following structure: - group,replicate,barcode,input_file,fasta,gtf - MCF7,1,,MCF7_directcDNA_replicate1.fastq.gz,genome.fa, - MCF7,2,,MCF7_directcDNA_replicate3.fastq.gz,genome.fa,genome.gtf - K562,1,,K562_directcDNA_replicate1.fastq.gz,genome.fa, - K562,2,,K562_directcDNA_replicate4.fastq.gz,,transcripts.fa + group,replicate,barcode,input_file,fasta,gtf,restrander_config + MCF7,1,,MCF7_directcDNA_replicate1.fastq.gz,genome.fa,,restrander_config.json + MCF7,2,,MCF7_directcDNA_replicate3.fastq.gz,genome.fa,genome.gtf, restrander_config.json + K562,1,,K562_directcDNA_replicate1.fastq.gz,genome.fa,, + K562,2,,K562_directcDNA_replicate4.fastq.gz,,transcripts.fa, """ input_extensions = [] @@ -61,7 +61,7 @@ def check_samplesheet(file_in, updated_path, file_out): with open(file_in, "r") as fin: ## Check header MIN_COLS = 3 - HEADER = ["group", "replicate", "barcode", "input_file", "fasta", "gtf"] + HEADER = ["group", "replicate", "barcode", "input_file", "fasta", "gtf", "restrander_config"] header = fin.readline().strip().split(",") if header[: len(HEADER)] != HEADER: print("ERROR: Please check samplesheet header -> {} != {}".format(",".join(header), ",".join(HEADER))) @@ -80,7 +80,7 @@ def check_samplesheet(file_in, updated_path, file_out): print_error("Invalid number of populated columns (minimum = {})!".format(MIN_COLS), "Line", line) ## Check group name entries - group, replicate, barcode, input_file, fasta, gtf = lspl[: len(HEADER)] + group, replicate, barcode, input_file, fasta, gtf, restrander_config = lspl[: len(HEADER)] if group: if group.find(" ") != -1: print_error("Group entry contains spaces!", "Line", line) @@ -177,8 +177,8 @@ def check_samplesheet(file_in, updated_path, file_out): # is_transcripts = '1' # genome = transcriptome - ## Create sample mapping dictionary = {group: {replicate : [ barcode, input_file, genome, gtf, is_transcripts, nanopolish_fast5 ]}} - sample_info = [barcode, input_file, fasta, gtf, is_transcripts, nanopolish_fast5] + ## Create sample mapping dictionary = {group: {replicate : [ barcode, input_file, genome, gtf, is_transcripts, nanopolish_fast5, restrander_config ]}} + sample_info = [barcode, input_file, fasta, gtf, is_transcripts, nanopolish_fast5, restrander_config] if group not in sample_info_dict: sample_info_dict[group] = {} if replicate not in sample_info_dict[group]: @@ -200,7 +200,7 @@ def check_samplesheet(file_in, updated_path, file_out): make_dir(out_dir) with open(file_out, "w") as fout: fout.write( - ",".join(["sample", "barcode", "input_file", "fasta", "gtf", "is_transcripts", "nanopolish_fast5"]) + ",".join(["sample", "barcode", "input_file", "fasta", "gtf", "is_transcripts", "nanopolish_fast5", "restrander_config"]) + "\n" ) for sample in sorted(sample_info_dict.keys()): diff --git a/conf/modules.config b/conf/modules.config index ba13b442..40a066c6 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -45,6 +45,16 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } + + // Publish dir for RESTRANDER + withName: RESTRANDER { + publishDir = [ + path: { "${params.outdir}/restrander" }, + mode: 'copy', + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } } if (!params.skip_demultiplexing) { @@ -467,6 +477,8 @@ if (params.call_variants) { ] } } + + } if (params.structural_variant_caller == 'sniffles') { process { @@ -535,6 +547,17 @@ if (params.call_variants) { } if (!params.skip_quantification) { + process { + withName: RSEQC_GENEBODYCOVERAGE { + publishDir = [ + path: { "${params.outdir}/rseqc" }, + mode: 'copy', + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + } + if (params.quantification_method == "bambu") { process { withName: BAMBU { diff --git a/docs/output.md b/docs/output.md index 42a1a00c..5cb7af6b 100644 --- a/docs/output.md +++ b/docs/output.md @@ -46,6 +46,23 @@ _Documentation_: _Description_: If you would like to run NanoLyse on the raw FASTQ files you can provide `--run_nanolyse` when running the pipeline. By default, the pipeline will filter lambda phage reads. However, you can provide your own FASTA file of "contaminants" with `--nanolyse_fasta`. The filtered FASTQ files will contain raw reads without the specified reference sequences (default: lambda phage sequences). +## cDNA Read Orientation + +
+Output files + +- `restrander/_restrander.fq.gz`: FASTQ file of the stranded reads. The reverse strand reads are replaced with their reverse-complements, ensuring that all reads in the output have the same orientation as the original transcripts. +- `restrander/-unknowns.*_restrander.fq.gz`: FASTQ file of the reads whose strand could not be inferred. +- `restrander/.restrander.json`: Restrander output statistics - includes artefact and strand statistics. + +
+ +_Documentation_: +[Restrander](https://github.com/mritchielab/restrander) + +_Description_: +Restrander is a program designed for orienting and quality-checking cDNA sequencing reads. Restrander will run automatically if the protocol is cDNA and a Restrander config file is present in the sample sheet. + ## Read QC
diff --git a/docs/usage.md b/docs/usage.md index 97dd0a5a..2de4586e 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -10,12 +10,13 @@ You will need to create a file with information about the samples in your experi | Column | Description | | ------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `group` | Group identifier for sample. This will be identical for replicate samples from the same experimental group. | -| `replicate` | Integer representing replicate number. Must start from `1..`. | -| `barcode` | Barcode identifier attributed to that sample during multiplexing. Must be an integer. | -| `input_file` | Full path to FastQ file if previously demultiplexed, BAM file if previously aligned, or a path to a directory with subdirectories containing fastq or fast5 files. FastQ file has to be zipped and have the extension ".fastq.gz" or ".fq.gz". BAM file has to have the extension ".bam". | -| `fasta` | Genome fasta file or transcriptome fasta file for alignment. This can either be a local path, or the appropriate key for a genome available in [iGenomes config file](../conf/igenomes.config). Must have the extension ".fasta", ".fasta.gz", ".fa" or ".fa.gz". | -| `gtf` | Annotation gtf file for transcript discovery and quantification and RNA modification detection. This can either be blank or a local path. Must have the extension ".gtf". | +| `group` | Group identifier for sample. This will be identical for replicate samples from the same experimental group. | +| `replicate` | Integer representing replicate number. Must start from `1..`. | +| `barcode` | Barcode identifier attributed to that sample during multiplexing. Must be an integer. | +| `input_file` | Full path to FastQ file if previously demultiplexed, BAM file if previously aligned, or a path to a directory with subdirectories containing fastq or fast5 files. FastQ file has to be zipped and have the extension ".fastq.gz" or ".fq.gz". BAM file has to have the extension ".bam". | +| `fasta` | Genome fasta file or transcriptome fasta file for alignment. This can either be a local path, or the appropriate key for a genome available in [iGenomes config file](../conf/igenomes.config). Must have the extension ".fasta", ".fasta.gz", ".fa" or ".fa.gz". | +| `gtf` | Annotation gtf file for transcript discovery and quantification and RNA modification detection. This can either be blank or a local path. Must have the extension ".gtf". | +| `restrander_config` | Restrander .json config file that provides the template-switching oligo (TSO) and reverse transcription primer (RTP) sequences. Different configurations are used for different library preparation protocols. This can either be blank or a file path. If blank, Restrander will not run for the sample. | ### Skip demultiplexing @@ -26,13 +27,13 @@ As shown in the examples below, the accepted samplesheet format is different dep ##### Example `samplesheet.csv` for non-demultiplexed fastq inputs ```bash -group,replicate,barcode,input_file,fasta,gtf -WT_MOUSE,1,1,,mm10, -WT_HUMAN,1,2,,hg19, -WT_POMBE,1,3,,/path/to/local/genome.fa, -WT_DENOVO,1,4,,,/path/to/local/transcriptome.fa -WT_LOCAL,2,5,,/path/to/local/genome.fa,/path/to/local/transcriptome.gtf -WT_UNKNOWN,3,6,,, +group,replicate,barcode,input_file,fasta,gtf,restrander_config +WT_MOUSE,1,1,,mm10,, +WT_HUMAN,1,2,,hg19,, +WT_POMBE,1,3,,/path/to/local/genome.fa,, +WT_DENOVO,1,4,,,/path/to/local/transcriptome.fa, +WT_LOCAL,2,5,,/path/to/local/genome.fa,/path/to/local/transcriptome.gtf, +WT_UNKNOWN,3,6,,,, ``` ##### Example command for non-demultiplexed fastq inputs @@ -52,11 +53,11 @@ nextflow run nf-core/nanoseq \ ##### Example `samplesheet.csv` for demultiplexed fastq inputs ```bash -group,replicate,barcode,input_file,fasta,gtf -WT,1,,SAM101A1.fastq.gz,hg19, -WT,2,,SAM101A2.fastq.gz,hg19, -KO,1,,SAM101A3.fastq.gz,hg19, -KO,2,,SAM101A4.fastq.gz,hg19, +group,replicate,barcode,input_file,fasta,gtf,restrander_config +WT,1,,SAM101A1.fastq.gz,hg19,, +WT,2,,SAM101A2.fastq.gz,hg19,, +KO,1,,SAM101A3.fastq.gz,hg19,, +KO,2,,SAM101A4.fastq.gz,hg19,, ``` ##### Example command for demultiplexed fastq inputs @@ -74,11 +75,11 @@ nextflow run nf-core/nanoseq \ ##### Example `samplesheet.csv` for BAM inputs ```bash -group,replicate,barcode,input_file,fasta,gtf -WT,1,,SAM101A1.bam,hg19, -WT,2,,SAM101A2.bam,hg19, -KO,1,,SAM101A3.bam,hg19, -KO,2,,SAM101A4.bam,hg19, +group,replicate,barcode,input_file,fasta,gtf,restrander_config +WT,1,,SAM101A1.bam,hg19,, +WT,2,,SAM101A2.bam,hg19,, +KO,1,,SAM101A3.bam,hg19,, +KO,2,,SAM101A4.bam,hg19,, ``` ##### Example command for BAM inputs @@ -97,11 +98,11 @@ nextflow run nf-core/nanoseq \ ##### Example `samplesheet.csv` for FAST5 and FASTQ input directories ```bash -group,replicate,barcode,input_file,fasta,gtf -WT,1,,/full/path/to/SAM101A1/,hg19.fasta,hg19.gtf -WT,2,,/full/path/to/SAM101A2/,hg19.fasta,hg19.gtf -KO,1,,/full/path/to/SAM101A3/,hg19.fasta,hg19.gtf -KO,2,,/full/path/to/SAM101A4/,hg19.fasta,hg19.gtf +group,replicate,barcode,input_file,fasta,gtf,restrander_config +WT,1,,/full/path/to/SAM101A1/,hg19.fasta,hg19.gtf, +WT,2,,/full/path/to/SAM101A2/,hg19.fasta,hg19.gtf, +KO,1,,/full/path/to/SAM101A3/,hg19.fasta,hg19.gtf, +KO,2,,/full/path/to/SAM101A4/,hg19.fasta,hg19.gtf, ``` ##### Each of the FAST5 and FASTQ input directory should have the following structure: @@ -128,6 +129,20 @@ nextflow run nf-core/nanoseq \ -profile ``` +### Using Restrander + +Restrander is a program used for orienting and quality-checking cDNA sequencing reads. Restrander will automatically run if the protocol is cDNA and a Restrander config file is present in the sample sheet. Examples of Restrander configuration files for several protocols can be found in the [README](https://github.com/jakob-schuster/restrander-vignette?tab=readme-ov-file#configuration-files) for the Restrander vignette. The sample sheet can have a mix of samples with and without Restrander config files. + +##### Example `samplesheet.csv` for using Restrander + +```bash +group,replicate,barcode,input_file,fasta,gtf,restrander_config +WT,1,1,/full/path/to/SAM101A1/,hg19,hg19.gtf, +WT,2,2,/full/path/to/SAM101A2/,hg19,hg19.gtf, +KO,1,3,/full/path/to/SAM101A3/,hg19,hg19.gtf,PCB109.json +KO,2,4,/full/path/to/SAM101A4/,hg19,hg19.gtf,PCB109.json +``` + ## Running the pipeline The typical command for running the pipeline is as follows: diff --git a/modules/local/restrander.nf b/modules/local/restrander.nf new file mode 100644 index 00000000..0ea848d3 --- /dev/null +++ b/modules/local/restrander.nf @@ -0,0 +1,31 @@ +process RESTRANDER { + tag "$meta.id" + label 'process_medium' + + container "${'912684371407.dkr.ecr.us-west-2.amazonaws.com/restrander:1.2'}" + + input: + tuple val(meta), path(reads), path(input_config) + + output: + tuple val(meta), path("*_restrander.fq.gz") , emit: reads + tuple val(meta), path("*.restrander.json") , emit: metrics + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: reads.getBaseName() + """ + /restrander \\ + ${reads} \\ + ${prefix}_restrander.fq.gz \\ + ${input_config} > ${prefix}.restrander.json + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + restrander: v1.0.1 + END_VERSIONS + """ +} diff --git a/modules/local/rseqc_genebodycoverage.nf b/modules/local/rseqc_genebodycoverage.nf new file mode 100644 index 00000000..c911a823 --- /dev/null +++ b/modules/local/rseqc_genebodycoverage.nf @@ -0,0 +1,33 @@ +process RSEQC_GENEBODYCOVERAGE { + label 'process_high' + container "912684371407.dkr.ecr.us-west-2.amazonaws.com/quay.io/biocontainers/rseqc:3.0.1--py37h516909a_1" + + input: + tuple path(bam), path(bai), path(bed12) + + output: + path("*.pdf") , emit: pdf + path("*.geneBodyCoverage.txt") , emit: rna_txt_ch + path("versions.yml") , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def name = bam.getName().replaceAll(/\.bam$/, '') + + """ + geneBody_coverage.py \\ + $args \\ + --refgene=$bed12 \\ + --input=$bam \\ + --minimum_length=100 \\ + --out-prefix=${name} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + rseqc: \$(geneBody_coverage.py --version | sed -e "s/geneBody_coverage.py //g") + END_VERSIONS + """ +} diff --git a/modules/nf-core/fastqc/main.nf b/modules/nf-core/fastqc/main.nf index 9ae58381..e4d47b97 100644 --- a/modules/nf-core/fastqc/main.nf +++ b/modules/nf-core/fastqc/main.nf @@ -1,6 +1,6 @@ process FASTQC { tag "$meta.id" - label 'process_medium' + label 'process_high' conda "bioconda::fastqc=0.11.9" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? diff --git a/modules/nf-core/nanoplot/main.nf b/modules/nf-core/nanoplot/main.nf index ca0d8454..7ba90a8b 100644 --- a/modules/nf-core/nanoplot/main.nf +++ b/modules/nf-core/nanoplot/main.nf @@ -22,12 +22,16 @@ process NANOPLOT { script: def args = task.ext.args ?: '' - def input_file = ("$ontfile".endsWith(".fastq.gz")) ? "--fastq ${ontfile}" : + def input_file = ("$ontfile".endsWith(".fastq.gz") || "$ontfile".endsWith(".fq.gz") || "$ontfile".endsWith(".fastq") || "$ontfile".endsWith(".fq")) ? "--fastq ${ontfile}" : ("$ontfile".endsWith(".txt")) ? "--summary ${ontfile}" : '' """ NanoPlot \\ $args \\ + --store \\ + --raw \\ -t $task.cpus \\ + -p $meta.id \\ + --tsv_stats \\ $input_file cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/seqtk/main.nf b/modules/nf-core/seqtk/main.nf new file mode 100644 index 00000000..74c1163d --- /dev/null +++ b/modules/nf-core/seqtk/main.nf @@ -0,0 +1,58 @@ +process SEQTK_SAMPLE { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/seqtk:1.4--he4a0461_1' : + 'quay.io/biocontainers/seqtk:1.4--he4a0461_1' }" + + input: + tuple val(meta), path(reads), val(sample_size) + + output: + tuple val(meta), path("*.fastq.gz"), emit: reads + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + if (!(args ==~ /.*\ -s\ ?[0-9]+.*/)) { + args += " -s100" + } + if ( !sample_size ) { + error "SEQTK/SAMPLE must have a sample_size value included" + } + """ + printf "%s\\n" $reads | while read f; + do + seqtk \\ + sample \\ + $args \\ + \$f \\ + $sample_size \\ + | gzip --no-name > ${prefix}_\$(basename \$f) + done + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + seqtk: \$(echo \$(seqtk 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + echo "" | gzip > ${prefix}.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + seqtk: \$(echo \$(seqtk 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ + +} \ No newline at end of file diff --git a/subworkflows/local/align_minimap2.nf b/subworkflows/local/align_minimap2.nf index 70693a8a..ed0a94b8 100644 --- a/subworkflows/local/align_minimap2.nf +++ b/subworkflows/local/align_minimap2.nf @@ -21,8 +21,8 @@ workflow ALIGN_MINIMAP2 { ch_index .cross(ch_fastq) { it -> it[-1] } .flatten() - .collate(13) - .map { it -> [ it[7], it[8], it[0], it[1], it[2], it[3], it[4], it[5] ] } // [ sample, fastq, fasta, sizes, gtf, bed, is_transcripts, index ] + .collate(14) + .map { it -> [ it[7], it[8], it[0], it[1], it[2], it[3], it[4], it[5] ] } // [ meta, fastq, fasta, sizes, gtf, bed, is_transcripts, index ] .set { ch_index } /* diff --git a/subworkflows/local/bedtools_ucsc_bigbed.nf b/subworkflows/local/bedtools_ucsc_bigbed.nf index 0a8d94b3..33216e77 100644 --- a/subworkflows/local/bedtools_ucsc_bigbed.nf +++ b/subworkflows/local/bedtools_ucsc_bigbed.nf @@ -26,6 +26,7 @@ workflow BEDTOOLS_UCSC_BIGBED { emit: bedtools_version + ch_bed12 ch_bigbed bed12tobigbed_version } diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index f38041df..ad8f9440 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -27,8 +27,11 @@ workflow INPUT_CHECK { // Function to resolve fasta and gtf file if using iGenomes // Returns [ sample, input_file, barcode, fasta, gtf, is_transcripts, annotation_str, nanopolish_fast5 ] def get_sample_info(LinkedHashMap sample, LinkedHashMap genomeMap) { + print(sample) def meta = [:] meta.id = sample.sample + meta.restrander_config = sample.restrander_config + // Resolve fasta and gtf file if using iGenomes def fasta = false diff --git a/subworkflows/local/prepare_genome.nf b/subworkflows/local/prepare_genome.nf index e6047cc0..189a9642 100644 --- a/subworkflows/local/prepare_genome.nf +++ b/subworkflows/local/prepare_genome.nf @@ -11,10 +11,10 @@ workflow PREPARE_GENOME { ch_fastq main: - // Get unique list of all fasta files + // Get unique list of all fasta files - reference FASTA is at position 2 ch_fastq .filter { it[2] } - .map { it -> [ it[2], it[5].toString() ] } // [ fasta, annotation_str ] + .map { it -> [ it[2], it[6].toString() ] } // [ fasta, annotation_str ] .unique() .set { ch_fastq_sizes } @@ -25,10 +25,10 @@ workflow PREPARE_GENOME { ch_chrom_sizes = GET_CHROM_SIZES.out.sizes samtools_version = GET_CHROM_SIZES.out.versions - // Get unique list of all gtf files + // Get unique list of all gtf files - GTF is at position 3 in the tuple ch_fastq .filter { it[3] } - .map { it -> [ it[3], it[5] ] } // [ gtf, annotation_str ] + .map { it -> [ it[3], it[6] ] } // [ gtf, annotation_str ] .unique() .set { ch_fastq_gtf } @@ -44,8 +44,8 @@ workflow PREPARE_GENOME { .map { it -> [ it[1], it[2], it[0] ] } .cross(ch_fastq) { it -> it[-1] } .flatten() - .collate(9) - .map { it -> [ it[5], it[0], it[6], it[1], it[7], it[8] ]} // [ fasta, sizes, gtf, bed, is_transcripts, annotation_str ] + .collate(10) + .map { it -> [ it[5], it[0], it[6], it[1], it[8], it[9] ]} // [ fasta, sizes, gtf, bed, is_transcripts, annotation_str ] .unique() .set { ch_fasta_index } @@ -54,7 +54,7 @@ workflow PREPARE_GENOME { */ ch_fastq .filter { it[2] } - .map { it -> [ it[0], it[2] ] } // [ gtf, annotation_str ] + .map { it -> [ it[0], it[2] ] } // [ meta, fasta ] .unique() .set { ch_fasta } diff --git a/workflows/nanoseq.nf b/workflows/nanoseq.nf index ba1c377e..e20583e1 100644 --- a/workflows/nanoseq.nf +++ b/workflows/nanoseq.nf @@ -89,6 +89,12 @@ if (!params.skip_quantification) { } } +if (params.downsample_depth) { + if (params.downsample_depth < 1 ) { + exit 1, "Invalid downsampling value: ${params.downsample_depth}. Must be greater than 0." + } +} + //////////////////////////////////////////////////// /* -- CONFIG FILES -- */ //////////////////////////////////////////////////// @@ -105,12 +111,15 @@ include { GET_NANOLYSE_FASTA } from '../modules/local/get_nanolyse_fasta' include { QCAT } from '../modules/local/qcat' include { BAM_RENAME } from '../modules/local/bam_rename' include { BAMBU } from '../modules/local/bambu' +include { RSEQC_GENEBODYCOVERAGE} from '../modules/local/rseqc_genebodycoverage' include { MULTIQC } from '../modules/local/multiqc' +include { RESTRANDER } from '../modules/local/restrander' /* * SUBWORKFLOW: Consisting of a mix of local and nf-core/modules */ + include { INPUT_CHECK } from '../subworkflows/local/input_check' include { PREPARE_GENOME } from '../subworkflows/local/prepare_genome' include { QCFASTQ_NANOPLOT_FASTQC } from '../subworkflows/local/qcfastq_nanoplot_fastqc' @@ -134,6 +143,7 @@ include { RNA_FUSIONS_JAFFAL } from '../subworkflows/local/rna_fus * MODULE: Installed directly from nf-core/modules */ include { NANOLYSE } from '../modules/nf-core/nanolyse/main' +include { SEQTK_SAMPLE } from '../modules/nf-core/seqtk/main' include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' /* @@ -209,6 +219,23 @@ workflow NANOSEQ{ ch_fastq = Channel.empty() } } + // check if we need to do downsampling on ch_fastq if so then do it and update + + if (params.downsample_depth) { + + ch_fastq + .map { it -> [ it[0], it[1], params.downsample_depth ] } + .set { ch_for_seqtk } + + SEQTK_SAMPLE( ch_for_seqtk ) + ch_software_versions = ch_software_versions.mix(SEQTK_SAMPLE.out.versions) + + SEQTK_SAMPLE.out.reads.join(ch_fastq).set{ joined_seqtk} + + joined_seqtk + .map { it -> [ it[0], it[1], it[3], it[4], it[5], it[6] ] } + .set { ch_fastq } + } if (params.run_nanolyse) { ch_fastq @@ -230,6 +257,7 @@ workflow NANOSEQ{ * MODULE: DNA contaminant removal using NanoLyse */ NANOLYSE ( ch_fastq_nanolyse, ch_nanolyse_fasta ) + NANOLYSE.out.fastq .join( ch_sample ) .map { it -> [ it[0], it[1], it[3], it[4], it[5], it[6] ]} @@ -237,6 +265,47 @@ workflow NANOSEQ{ ch_software_versions = ch_software_versions.mix(NANOLYSE.out.versions.first().ifEmpty(null)) } + if (params.protocol == 'cDNA'){ + + // split the fastq channel into two branches - samples with and without restrander_config + ch_fastq.branch{ + config_provided: it[0].restrander_config != null && it[0].restrander_config != '' + no_config: it[0].restrander_config == null || it[0].restrander_config == '' + }.set { ch_fastq_branch } + + // only run Restrander on the branch with config provided + ch_fastq_branch.config_provided + .map { it -> [ it[0], it[1], it[0].restrander_config] } + .set { ch_fastq_restrander } + + /* + * MODULE: Orientate and quality check cDNA reads with Restrander + */ + RESTRANDER ( ch_fastq_restrander ) + + RESTRANDER.out.reads + .join(ch_fastq_branch.config_provided, by: 0) + .map { tuple -> + def meta = tuple[0] + def restranded_files = tuple[1] + def main_restranded_file = restranded_files[1] + + + [ meta, main_restranded_file, tuple[3], tuple[4], tuple[5], tuple[6] ] + } + // merge the restranded files with the rest of the fastq files + .mix(ch_fastq_branch.no_config) + .set { ch_fastq } + + ch_software_versions = ch_software_versions.mix(RESTRANDER.out.versions.first().ifEmpty(null)) + } + + // Extract the GTF file from combined string, add it as own element in the channel + ch_fastq + .map { it -> [ it[0], it[1], it[2], it[5].toString().split(';')[1], + it[3], it[4], it[5] ] } + .set { ch_fastq } + ch_fastqc_multiqc = Channel.empty() if (!params.skip_qc) { @@ -361,6 +430,9 @@ workflow NANOSEQ{ // MULTIPLE_CONDITIONS = ch_sample.map { it -> it[0].split('_')[0..-2].join('_') }.unique().count().val > 1 ch_r_version = Channel.empty() + + + if (params.quantification_method == 'bambu') { ch_sample .map { it -> [ it[2], it[3] ]} @@ -387,6 +459,13 @@ workflow NANOSEQ{ ch_featurecounts_gene_multiqc = QUANTIFY_STRINGTIE_FEATURECOUNTS.out.featurecounts_gene_multiqc.ifEmpty([]) ch_featurecounts_transcript_multiqc = QUANTIFY_STRINGTIE_FEATURECOUNTS.out.featurecounts_transcript_multiqc.ifEmpty([]) } + + RSEQC_GENEBODYCOVERAGE ( + ch_view_sortbam + .join( BEDTOOLS_UCSC_BIGBED.out.ch_bed12 ) + .map { it -> [ it[3], it[4], it[6] ] } + ) + if (!params.skip_differential_analysis) { /*