diff --git a/conf/modules.config b/conf/modules.config
index eaa35f259..600bd6c30 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -146,7 +146,7 @@ process {

 if (!params.skip_bbsplit && params.bbsplit_fasta_list) {
     process {
-        withName: '.*:PREPARE_GENOME:BBMAP_BBSPLIT' {
+        withName: '.*:UNCOMPRESS_GENOME:BBMAP_BBSPLIT' {
             ext.args = 'build=1'
             publishDir = [
                 path: { "${params.outdir}/genome/index" },
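For reference, withName selectors of this form are matched against a process's fully qualified execution path, i.e. the chain of workflow and subworkflow names under which the process is actually invoked; a selector naming a subworkflow that never calls the process matches nothing and its options are silently ignored. A minimal sketch of the mechanism, using hypothetical names (OUTER, INNER, TOOL) and placeholder options rather than this pipeline's real configuration:

// Hypothetical sketch: this block only applies if TOOL really executes as
// ...:INNER:TOOL (for example OUTER:INNER:TOOL) in the resolved workflow graph.
process {
    withName: '.*:INNER:TOOL' {
        ext.args   = '--example-flag'             // placeholder arguments, not from the pipeline
        publishDir = [
            path: { "${params.outdir}/example" }  // further publishDir options omitted in this sketch
        ]
    }
}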
diff --git a/subworkflows/local/prepare_genome.nf b/subworkflows/local/prepare_genome.nf
index b83126cd7..f6f9a25bc 100644
--- a/subworkflows/local/prepare_genome.nf
+++ b/subworkflows/local/prepare_genome.nf
@@ -1,25 +1,11 @@
 //
-// Uncompress and prepare reference genome files
+// Prepare reference genome files
 //

-include { GUNZIP as GUNZIP_FASTA } from '../../modules/nf-core/gunzip/main'
-include { GUNZIP as GUNZIP_GTF } from '../../modules/nf-core/gunzip/main'
-include { GUNZIP as GUNZIP_GFF } from '../../modules/nf-core/gunzip/main'
-include { GUNZIP as GUNZIP_GENE_BED } from '../../modules/nf-core/gunzip/main'
-include { GUNZIP as GUNZIP_TRANSCRIPT_FASTA } from '../../modules/nf-core/gunzip/main'
-include { GUNZIP as GUNZIP_ADDITIONAL_FASTA } from '../../modules/nf-core/gunzip/main'
-
-include { UNTAR as UNTAR_BBSPLIT_INDEX } from '../../modules/nf-core/untar/main'
-include { UNTAR as UNTAR_STAR_INDEX } from '../../modules/nf-core/untar/main'
-include { UNTAR as UNTAR_RSEM_INDEX } from '../../modules/nf-core/untar/main'
-include { UNTAR as UNTAR_HISAT2_INDEX } from '../../modules/nf-core/untar/main'
-include { UNTAR as UNTAR_SALMON_INDEX } from '../../modules/nf-core/untar/main'
-
 include { CUSTOM_GETCHROMSIZES } from '../../modules/nf-core/custom/getchromsizes/main'
 include { GFFREAD } from '../../modules/nf-core/gffread/main'
 include { BBMAP_BBSPLIT } from '../../modules/nf-core/bbmap/bbsplit/main'
 include { STAR_GENOMEGENERATE } from '../../modules/nf-core/star/genomegenerate/main'
-include { HISAT2_EXTRACTSPLICESITES } from '../../modules/nf-core/hisat2/extractsplicesites/main'
 include { HISAT2_BUILD } from '../../modules/nf-core/hisat2/build/main'
 include { SALMON_INDEX } from '../../modules/nf-core/salmon/index/main'
 include { RSEM_PREPAREREFERENCE as RSEM_PREPAREREFERENCE_GENOME } from '../../modules/nf-core/rsem/preparereference/main'
@@ -33,101 +19,52 @@ include { STAR_GENOMEGENERATE_IGENOMES } from '../../modules/local/star_
 workflow PREPARE_GENOME {
     take:
-    fasta // file: /path/to/genome.fasta
-    gtf // file: /path/to/genome.gtf
-    gff // file: /path/to/genome.gff
-    additional_fasta // file: /path/to/additional.fasta
-    transcript_fasta // file: /path/to/transcript.fasta
-    gene_bed // file: /path/to/gene.bed
-    splicesites // file: /path/to/splicesites.txt
-    bbsplit_fasta_list // file: /path/to/bbsplit_fasta_list.txt
-    star_index // directory: /path/to/star/index/
-    rsem_index // directory: /path/to/rsem/index/
-    salmon_index // directory: /path/to/salmon/index/
-    hisat2_index // directory: /path/to/hisat2/index/
-    bbsplit_index // directory: /path/to/rsem/index/
-    gencode // boolean: whether the genome is from GENCODE
-    is_aws_igenome // boolean: whether the genome files are from AWS iGenomes
-    biotype // string: if additional fasta file is provided biotype value to use when appending entries to GTF file
-    prepare_tool_indices // list: tools to prepare indices for
+    ch_fasta // file: /path/to/genome.fasta
+    ch_gtf // file: /path/to/genome.gtf
+    ch_gff // file: /path/to/genome.gff
+    ch_additional_fasta // file: /path/to/additional.fasta
+    ch_transcript_fasta // file: /path/to/transcript.fasta
+    ch_gene_bed // file: /path/to/gene.bed
+    ch_splicesites // file: /path/to/splicesites.txt
+    bbsplit_fasta_list // file: /path/to/bbsplit_fasta_list.txt
+    star_index // directory: /path/to/star/index/
+    rsem_index // directory: /path/to/rsem/index/
+    salmon_index // directory: /path/to/salmon/index/
+    hisat2_index // directory: /path/to/hisat2/index/
+    bbsplit_index // directory: /path/to/bbsplit/index/
+    gencode // boolean: whether the genome is from GENCODE
+    is_aws_igenome // boolean: whether the genome files are from AWS iGenomes
+    biotype // string: if additional fasta file is provided biotype value to use when appending entries to GTF file
+    prepare_tool_indices // list: tools to prepare indices for

     main:
     ch_versions = Channel.empty()

     //
-    // Uncompress genome fasta file if required
-    //
-    if (fasta.endsWith('.gz')) {
-        ch_fasta = GUNZIP_FASTA ( [ [:], fasta ] ).gunzip.map { it[1] }
-        ch_versions = ch_versions.mix(GUNZIP_FASTA.out.versions)
-    } else {
-        ch_fasta = Channel.value(file(fasta))
-    }
-
+    // Create GTF annotation from GFF3 if required
     //
-    // Uncompress GTF annotation file or create from GFF3 if required
-    //
-    if (gtf) {
-        if (gtf.endsWith('.gz')) {
-            ch_gtf = GUNZIP_GTF ( [ [:], gtf ] ).gunzip.map { it[1] }
-            ch_versions = ch_versions.mix(GUNZIP_GTF.out.versions)
-        } else {
-            ch_gtf = Channel.value(file(gtf))
-        }
-    } else if (gff) {
-        if (gff.endsWith('.gz')) {
-            ch_gff = GUNZIP_GFF ( [ [:], gff ] ).gunzip.map { it[1] }
-            ch_versions = ch_versions.mix(GUNZIP_GFF.out.versions)
-        } else {
-            ch_gff = Channel.value(file(gff))
-        }
+    if (!ch_gtf && ch_gff) {
         ch_gtf = GFFREAD ( ch_gff ).gtf
-        ch_versions = ch_versions.mix(GFFREAD.out.versions)
+        ch_versions = ch_versions.mix(GFFREAD.out.versions)
     }

     //
-    // Uncompress additional fasta file and concatenate with reference fasta and gtf files
+    // Concatenate additional fasta file with reference fasta and gtf files
     //
-    if (additional_fasta) {
-        if (additional_fasta.endsWith('.gz')) {
-            ch_add_fasta = GUNZIP_ADDITIONAL_FASTA ( [ [:], additional_fasta ] ).gunzip.map { it[1] }
-            ch_versions = ch_versions.mix(GUNZIP_ADDITIONAL_FASTA.out.versions)
-        } else {
-            ch_add_fasta = Channel.value(file(additional_fasta))
-        }
-        CAT_ADDITIONAL_FASTA ( ch_fasta, ch_gtf, ch_add_fasta, biotype )
+    if (ch_additional_fasta) {
+        CAT_ADDITIONAL_FASTA ( ch_fasta, ch_gtf, ch_additional_fasta, biotype )
+        ch_fasta = CAT_ADDITIONAL_FASTA.out.fasta
         ch_gtf = CAT_ADDITIONAL_FASTA.out.gtf
         ch_versions = ch_versions.mix(CAT_ADDITIONAL_FASTA.out.versions)
     }

     //
-    // Uncompress gene BED annotation file or create from GTF if required
-    //
-    if (gene_bed) {
-        if (gene_bed.endsWith('.gz')) {
-            ch_gene_bed = GUNZIP_GENE_BED ( [ [:], gene_bed ] ).gunzip.map { it[1] }
-            ch_versions = ch_versions.mix(GUNZIP_GENE_BED.out.versions)
-        } else {
-            ch_gene_bed = Channel.value(file(gene_bed))
-        }
-    } else {
-        ch_gene_bed = GTF2BED ( ch_gtf ).bed
-        ch_versions = ch_versions.mix(GTF2BED.out.versions)
-    }
-
-    //
-    // Uncompress transcript fasta file / create if required
+    // Create transcript fasta file if required
     //
-    if (transcript_fasta) {
-        if (transcript_fasta.endsWith('.gz')) {
-            ch_transcript_fasta = GUNZIP_TRANSCRIPT_FASTA ( [ [:], transcript_fasta ] ).gunzip.map { it[1] }
-            ch_versions = ch_versions.mix(GUNZIP_TRANSCRIPT_FASTA.out.versions)
-        } else {
-            ch_transcript_fasta = Channel.value(file(transcript_fasta))
-        }
-        if (gencode) {
+    if (ch_transcript_fasta) {
+        if (gencode) {
             PREPROCESS_TRANSCRIPTS_FASTA_GENCODE ( ch_transcript_fasta )
             ch_transcript_fasta = PREPROCESS_TRANSCRIPTS_FASTA_GENCODE.out.fasta
             ch_versions = ch_versions.mix(PREPROCESS_TRANSCRIPTS_FASTA_GENCODE.out.versions)
@@ -139,6 +76,14 @@ workflow PREPARE_GENOME {
         ch_versions = ch_versions.mix(MAKE_TRANSCRIPTS_FASTA.out.versions)
     }

+    //
+    // Create gene BED annotation file from GTF if required
+    //
+    if (!ch_gene_bed) {
+        ch_gene_bed = GTF2BED ( ch_gtf ).bed
+        ch_versions = ch_versions.mix(GTF2BED.out.versions)
+    }
+
     //
     // Create chromosome sizes file
     //
@@ -148,52 +93,33 @@ workflow PREPARE_GENOME {
     ch_versions = ch_versions.mix(CUSTOM_GETCHROMSIZES.out.versions)

     //
-    // Uncompress BBSplit index or generate from scratch if required
+    // Generate BBSplit index from scratch if required
     //
     ch_bbsplit_index = Channel.empty()
-    if ('bbsplit' in prepare_tool_indices) {
-        if (bbsplit_index) {
-            if (bbsplit_index.endsWith('.tar.gz')) {
-                ch_bbsplit_index = UNTAR_BBSPLIT_INDEX ( [ [:], bbsplit_index ] ).untar.map { it[1] }
-                ch_versions = ch_versions.mix(UNTAR_BBSPLIT_INDEX.out.versions)
-            } else {
-                ch_bbsplit_index = Channel.value(file(bbsplit_index))
-            }
-        } else {
-            Channel
-                .from(file(bbsplit_fasta_list))
-                .splitCsv() // Read in 2 column csv file: short_name,path_to_fasta
-                .flatMap { id, fasta -> [ [ 'id', id ], [ 'fasta', file(fasta, checkIfExists: true) ] ] } // Flatten entries to be able to groupTuple by a common key
-                .groupTuple()
-                .map { it -> it[1] } // Get rid of keys and keep grouped values
-                .collect { [ it ] } // Collect entries as a list to pass as "tuple val(short_names), path(path_to_fasta)" to module
-                .set { ch_bbsplit_fasta_list }
-
-            ch_bbsplit_index = BBMAP_BBSPLIT ( [ [:], [] ], [], ch_fasta, ch_bbsplit_fasta_list, true ).index
-            ch_versions = ch_versions.mix(BBMAP_BBSPLIT.out.versions)
-        }
+    if ('bbsplit' in prepare_tool_indices && !bbsplit_index) {
+        ch_bbsplit_fasta_list = Channel
+            .from(file(bbsplit_fasta_list))
+            .splitCsv() // Read in 2 column csv file: short_name,path_to_fasta
+            .flatMap { id, fasta -> [ [ 'id', id ], [ 'fasta', file(fasta, checkIfExists: true) ] ] } // Flatten entries to be able to groupTuple by a common key
+            .groupTuple()
+            .map { it -> it[1] } // Get rid of keys and keep grouped values
+            .collect { [ it ] } // Collect entries as a list to pass as "tuple val(short_names), path(path_to_fasta)" to module
+
+        ch_bbsplit_index = BBMAP_BBSPLIT ( [ [:], [] ], [], ch_fasta, ch_bbsplit_fasta_list, true ).index
+        ch_versions = ch_versions.mix(BBMAP_BBSPLIT.out.versions)
     }

     //
     // Uncompress STAR index or generate from scratch if required
     //
     ch_star_index = Channel.empty()
-    if ('star_salmon' in prepare_tool_indices) {
-        if (star_index) {
-            if (star_index.endsWith('.tar.gz')) {
-                ch_star_index = UNTAR_STAR_INDEX ( [ [:], star_index ] ).untar.map { it[1] }
-                ch_versions = ch_versions.mix(UNTAR_STAR_INDEX.out.versions)
-            } else {
-                ch_star_index = Channel.value(file(star_index))
-            }
+    if ('star_salmon' in prepare_tool_indices && !star_index) {
+        if (is_aws_igenome) {
+            ch_star_index = STAR_GENOMEGENERATE_IGENOMES ( ch_fasta, ch_gtf ).index
+            ch_versions = ch_versions.mix(STAR_GENOMEGENERATE_IGENOMES.out.versions)
         } else {
-            if (is_aws_igenome) {
-                ch_star_index = STAR_GENOMEGENERATE_IGENOMES ( ch_fasta, ch_gtf ).index
-                ch_versions = ch_versions.mix(STAR_GENOMEGENERATE_IGENOMES.out.versions)
-            } else {
-                ch_star_index = STAR_GENOMEGENERATE ( ch_fasta, ch_gtf ).index
-                ch_versions = ch_versions.mix(STAR_GENOMEGENERATE.out.versions)
-            }
+            ch_star_index = STAR_GENOMEGENERATE ( ch_fasta, ch_gtf ).index
+            ch_versions = ch_versions.mix(STAR_GENOMEGENERATE.out.versions)
         }
     }

@@ -201,61 +127,27 @@
     // Uncompress RSEM index or generate from scratch if required
     //
     ch_rsem_index = Channel.empty()
-    if ('star_rsem' in prepare_tool_indices) {
-        if (rsem_index) {
-            if (rsem_index.endsWith('.tar.gz')) {
-                ch_rsem_index = UNTAR_RSEM_INDEX ( [ [:], rsem_index ] ).untar.map { it[1] }
-                ch_versions = ch_versions.mix(UNTAR_RSEM_INDEX.out.versions)
-            } else {
-                ch_rsem_index = Channel.value(file(rsem_index))
-            }
-        } else {
-            ch_rsem_index = RSEM_PREPAREREFERENCE_GENOME ( ch_fasta, ch_gtf ).index
-            ch_versions = ch_versions.mix(RSEM_PREPAREREFERENCE_GENOME.out.versions)
-        }
+    if ('star_rsem' in prepare_tool_indices && !rsem_index) {
+        ch_rsem_index = RSEM_PREPAREREFERENCE_GENOME ( ch_fasta, ch_gtf ).index
+        ch_versions = ch_versions.mix(RSEM_PREPAREREFERENCE_GENOME.out.versions)
     }

     //
     // Uncompress HISAT2 index or generate from scratch if required
     //
-    ch_splicesites = Channel.empty()
     ch_hisat2_index = Channel.empty()
-    if ('hisat2' in prepare_tool_indices) {
-        if (!splicesites) {
-            ch_splicesites = HISAT2_EXTRACTSPLICESITES ( ch_gtf.map { [ [:], it ] } ).txt.map { it[1] }
-            ch_versions = ch_versions.mix(HISAT2_EXTRACTSPLICESITES.out.versions)
-        } else {
-            ch_splicesites = Channel.value(file(splicesites))
-        }
-        if (hisat2_index) {
-            if (hisat2_index.endsWith('.tar.gz')) {
-                ch_hisat2_index = UNTAR_HISAT2_INDEX ( [ [:], hisat2_index ] ).untar.map { it[1] }
-                ch_versions = ch_versions.mix(UNTAR_HISAT2_INDEX.out.versions)
-            } else {
-                ch_hisat2_index = Channel.value(file(hisat2_index))
-            }
-        } else {
-            ch_hisat2_index = HISAT2_BUILD ( ch_fasta.map { [ [:], it ] }, ch_gtf.map { [ [:], it ] }, ch_splicesites.map { [ [:], it ] } ).index.map { it[1] }
-            ch_versions = ch_versions.mix(HISAT2_BUILD.out.versions)
-        }
+    if ('hisat2' in prepare_tool_indices && !hisat2_index) {
+        ch_hisat2_index = HISAT2_BUILD ( ch_fasta.map { [ [:], it ] }, ch_gtf.map { [ [:], it ] }, ch_splicesites.map { [ [:], it ] } ).index.map { it[1] }
+        ch_versions = ch_versions.mix(HISAT2_BUILD.out.versions)
     }

     //
     // Uncompress Salmon index or generate from scratch if required
     //
     ch_salmon_index = Channel.empty()
-    if (salmon_index) {
-        if (salmon_index.endsWith('.tar.gz')) {
-            ch_salmon_index = UNTAR_SALMON_INDEX ( [ [:], salmon_index ] ).untar.map { it[1] }
-            ch_versions = ch_versions.mix(UNTAR_SALMON_INDEX.out.versions)
-        } else {
-            ch_salmon_index = Channel.value(file(salmon_index))
-        }
-    } else {
-        if ('salmon' in prepare_tool_indices) {
-            ch_salmon_index = SALMON_INDEX ( ch_fasta, ch_transcript_fasta ).index
-            ch_versions = ch_versions.mix(SALMON_INDEX.out.versions)
-        }
+    if ('salmon' in prepare_tool_indices && !salmon_index) {
+        ch_salmon_index = SALMON_INDEX ( ch_fasta, ch_transcript_fasta ).index
+        ch_versions = ch_versions.mix(SALMON_INDEX.out.versions)
     }

     emit:
@@ -265,7 +157,6 @@ workflow PREPARE_GENOME {
     gene_bed = ch_gene_bed // channel: path(gene.bed)
     transcript_fasta = ch_transcript_fasta // channel: path(transcript.fasta)
     chrom_sizes = ch_chrom_sizes // channel: path(genome.sizes)
-    splicesites = ch_splicesites // channel: path(genome.splicesites.txt)
     bbsplit_index = ch_bbsplit_index // channel: path(bbsplit/index/)
     star_index = ch_star_index // channel: path(star/index/)
     rsem_index = ch_rsem_index // channel: path(rsem/index/)
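Since the CSV-to-channel reshaping kept above is the least obvious part of PREPARE_GENOME, here is a standalone sketch of what those operators do, using a hypothetical two-entry bbsplit_fasta_list.csv; the shapes in the comments follow from the documented behaviour of splitCsv, flatMap, groupTuple, map and collect, not from output of this pipeline:

// Hypothetical input file 'bbsplit_fasta_list.csv':
//   human,/refs/human.fa
//   mouse,/refs/mouse.fa
Channel
    .from(file('bbsplit_fasta_list.csv'))
    .splitCsv()                // emits: [ human, /refs/human.fa ], [ mouse, /refs/mouse.fa ]
    .flatMap { id, fasta ->    // emits: [ id, human ], [ fasta, human.fa ], [ id, mouse ], [ fasta, mouse.fa ]
        [ [ 'id', id ], [ 'fasta', file(fasta) ] ]
    }
    .groupTuple()              // emits: [ id, [ human, mouse ] ], [ fasta, [ human.fa, mouse.fa ] ]
    .map { it -> it[1] }       // emits: [ human, mouse ], [ human.fa, mouse.fa ]
    .collect { [ it ] }        // emits one value: [ [ [ human, mouse ] ], [ [ human.fa, mouse.fa ] ] ]
    .view()                    // shaped for the "tuple val(short_names), path(path_to_fasta)" input of BBMAP_BBSPLIT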
diff --git a/subworkflows/local/uncompress_genome.nf b/subworkflows/local/uncompress_genome.nf
new file mode 100644
index 000000000..570aa0250
--- /dev/null
+++ b/subworkflows/local/uncompress_genome.nf
@@ -0,0 +1,163 @@
+//
+// Uncompress reference genome files
+//
+
+include { GUNZIP as GUNZIP_FASTA } from '../../modules/nf-core/gunzip/main'
+include { GUNZIP as GUNZIP_GTF } from '../../modules/nf-core/gunzip/main'
+include { GUNZIP as GUNZIP_GFF } from '../../modules/nf-core/gunzip/main'
+include { GUNZIP as GUNZIP_GENE_BED } from '../../modules/nf-core/gunzip/main'
+include { GUNZIP as GUNZIP_TRANSCRIPT_FASTA } from '../../modules/nf-core/gunzip/main'
+include { GUNZIP as GUNZIP_ADDITIONAL_FASTA } from '../../modules/nf-core/gunzip/main'
+
+include { HISAT2_EXTRACTSPLICESITES } from '../../modules/nf-core/hisat2/extractsplicesites/main'
+
+include { UNTAR as UNTAR_BBSPLIT_INDEX } from '../../modules/nf-core/untar/main'
+include { UNTAR as UNTAR_STAR_INDEX } from '../../modules/nf-core/untar/main'
+include { UNTAR as UNTAR_RSEM_INDEX } from '../../modules/nf-core/untar/main'
+include { UNTAR as UNTAR_HISAT2_INDEX } from '../../modules/nf-core/untar/main'
+include { UNTAR as UNTAR_SALMON_INDEX } from '../../modules/nf-core/untar/main'
+
+workflow UNCOMPRESS_GENOME {
+    take:
+    fasta // file: /path/to/genome.fasta
+    gtf // file: /path/to/genome.gtf
+    gff // file: /path/to/genome.gff
+    additional_fasta // file: /path/to/additional.fasta
+    transcript_fasta // file: /path/to/transcript.fasta
+    gene_bed // file: /path/to/gene.bed
+    splicesites // file: /path/to/splicesites.txt
+    star_index // directory: /path/to/star/index/
+    rsem_index // directory: /path/to/rsem/index/
+    salmon_index // directory: /path/to/salmon/index/
+    hisat2_index // directory: /path/to/hisat2/index/
+    bbsplit_index // directory: /path/to/bbsplit/index/
+    prepare_tool_indices // list: tools to uncompress indices for
+
+    main:
+
+    ch_versions = Channel.empty()
+
+    //
+    // Uncompress genome fasta file if required
+    //
+    ch_fasta = Channel.empty()
+    if (fasta) {
+        ch_fasta = GUNZIP_FASTA ( [ [:], fasta ] ).gunzip.map { it[1] }
+        ch_versions = ch_versions.mix(GUNZIP_FASTA.out.versions)
+    }
+
+    //
+    // Uncompress GTF annotation file
+    //
+    ch_gtf = Channel.empty()
+    if (gtf) {
+        ch_gtf = GUNZIP_GTF ( [ [:], gtf ] ).gunzip.map { it[1] }
+        ch_versions = ch_versions.mix(GUNZIP_GTF.out.versions)
+    }
+
+    //
+    // Uncompress GFF annotation file
+    //
+    ch_gff = Channel.empty()
+    if (gff) {
+        ch_gff = GUNZIP_GFF ( [ [:], gff ] ).gunzip.map { it[1] }
+        ch_versions = ch_versions.mix(GUNZIP_GFF.out.versions)
+    }
+
+    //
+    // Uncompress additional fasta file
+    //
+    ch_additional_fasta = Channel.empty()
+    if (additional_fasta) {
+        ch_additional_fasta = GUNZIP_ADDITIONAL_FASTA ( [ [:], additional_fasta ] ).gunzip.map { it[1] }
+        ch_versions = ch_versions.mix(GUNZIP_ADDITIONAL_FASTA.out.versions)
+    }
+
+    //
+    // Uncompress transcript fasta file
+    //
+    ch_transcript_fasta = Channel.empty()
+    if (transcript_fasta) {
+        ch_transcript_fasta = GUNZIP_TRANSCRIPT_FASTA ( [ [:], transcript_fasta ] ).gunzip.map { it[1] }
+        ch_versions = ch_versions.mix(GUNZIP_TRANSCRIPT_FASTA.out.versions)
+    }
+
+    //
+    // Uncompress gene BED annotation file
+    //
+    ch_gene_bed = Channel.empty()
+    if (gene_bed) {
+        ch_gene_bed = GUNZIP_GENE_BED ( [ [:], gene_bed ] ).gunzip.map { it[1] }
+        ch_versions = ch_versions.mix(GUNZIP_GENE_BED.out.versions)
+    }
+
+    //
+    // Extract Splice Sites
+    //
+    ch_splicesites = Channel.empty()
+    if ('hisat2' in prepare_tool_indices && !splicesites) {
+        ch_splicesites = HISAT2_EXTRACTSPLICESITES ( ch_gtf.map { [ [:], it ] } ).txt.map { it[1] }
+        ch_versions = ch_versions.mix(HISAT2_EXTRACTSPLICESITES.out.versions)
+    }
+
+    //
+    // Uncompress STAR index
+    //
+    ch_star_index = Channel.empty()
+    if ('star_salmon' in prepare_tool_indices && star_index) {
+        ch_star_index = UNTAR_STAR_INDEX ( [ [:], star_index ] ).untar.map { it[1] }
+        ch_versions = ch_versions.mix(UNTAR_STAR_INDEX.out.versions)
+    }
+
+    //
+    // Uncompress RSEM index
+    //
+    ch_rsem_index = Channel.empty()
+    if ('star_rsem' in prepare_tool_indices && rsem_index) {
+        ch_rsem_index = UNTAR_RSEM_INDEX ( [ [:], rsem_index ] ).untar.map { it[1] }
+        ch_versions = ch_versions.mix(UNTAR_RSEM_INDEX.out.versions)
+    }
+
+    //
+    // Uncompress Salmon index
+    //
+    ch_salmon_index = Channel.empty()
+    if ('salmon' in prepare_tool_indices && salmon_index) {
+        ch_salmon_index = UNTAR_SALMON_INDEX ( [ [:], salmon_index ] ).untar.map { it[1] }
+        ch_versions = ch_versions.mix(UNTAR_SALMON_INDEX.out.versions)
+    }
+
+    //
+    // Uncompress HISAT2 index
+    //
+    ch_hisat2_index = Channel.empty()
+    if ('hisat2' in prepare_tool_indices && hisat2_index) {
+        ch_hisat2_index = UNTAR_HISAT2_INDEX ( [ [:], hisat2_index ] ).untar.map { it[1] }
+        ch_versions = ch_versions.mix(UNTAR_HISAT2_INDEX.out.versions)
+    }
+
+    //
+    // Uncompress BBSplit index
+    //
+    ch_bbsplit_index = Channel.empty()
+    if ('bbsplit' in prepare_tool_indices && bbsplit_index) {
+        ch_bbsplit_index = UNTAR_BBSPLIT_INDEX ( [ [:], bbsplit_index ] ).untar.map { it[1] }
+        ch_versions = ch_versions.mix(UNTAR_BBSPLIT_INDEX.out.versions)
+    }
+
+    emit:
+    fasta = ch_fasta // channel: path(genome.fasta)
+    gtf = ch_gtf // channel: path(genome.gtf)
+    gff = ch_gff // channel: path(genome.gff)
+    gene_bed = ch_gene_bed // channel: path(gene.bed)
+    splicesites = ch_splicesites // channel: path(genome.splicesites.txt)
+    additional_fasta = ch_additional_fasta // channel: path(additional.fasta)
+    transcript_fasta = ch_transcript_fasta // channel: path(transcript.fasta)
+    bbsplit_index = ch_bbsplit_index // channel: path(bbsplit/index/)
+    star_index = ch_star_index // channel: path(star/index/)
+    rsem_index = ch_rsem_index // channel: path(rsem/index/)
+    hisat2_index = ch_hisat2_index // channel: path(hisat2/index/)
+    salmon_index = ch_salmon_index // channel: path(salmon/index/)
+
+    versions = ch_versions.ifEmpty(null) // channel: [ versions.yml ]
+}
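All of the GUNZIP_*/UNTAR_* calls in UNCOMPRESS_GENOME follow the same nf-core module convention: the module takes a [ meta, archive ] tuple and emits [ meta, uncompressed ], so an empty map is passed in place of meta (reference files have no per-sample context) and the meta map is stripped off again on the way out. A minimal sketch of the pattern, with params used purely as placeholders:

// Wrap the input with an empty meta map, then keep only element 1 (the file or directory) of the emitted tuple.
ch_fasta      = GUNZIP_FASTA ( [ [:], params.fasta ] ).gunzip.map { it[1] }          // [ [:], genome.fasta ] -> genome.fasta
ch_star_index = UNTAR_STAR_INDEX ( [ [:], params.star_index ] ).untar.map { it[1] }  // [ [:], star/ ] -> star/
ch_versions   = ch_versions.mix(GUNZIP_FASTA.out.versions)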
diff --git a/workflows/rnaseq.nf b/workflows/rnaseq.nf
index 72b7619a4..98963a4ea 100755
--- a/workflows/rnaseq.nf
+++ b/workflows/rnaseq.nf
@@ -97,9 +97,10 @@ include { UMITOOLS_PREPAREFORRSEM as UMITOOLS_PREPAREFORSALMON } from '../module
 //
 // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules
 //
-include { PREPARE_GENOME } from '../subworkflows/local/prepare_genome'
-include { ALIGN_STAR } from '../subworkflows/local/align_star'
-include { QUANTIFY_RSEM } from '../subworkflows/local/quantify_rsem'
+include { UNCOMPRESS_GENOME } from '../subworkflows/local/uncompress_genome'
+include { PREPARE_GENOME } from '../subworkflows/local/prepare_genome'
+include { ALIGN_STAR } from '../subworkflows/local/align_star'
+include { QUANTIFY_RSEM } from '../subworkflows/local/quantify_rsem'
 include { QUANTIFY_SALMON as QUANTIFY_STAR_SALMON } from '../subworkflows/local/quantify_salmon'
 include { QUANTIFY_SALMON as QUANTIFY_SALMON } from '../subworkflows/local/quantify_salmon'
@@ -154,63 +155,106 @@ workflow RNASEQ {
     ch_versions = Channel.empty()

     //
-    // SUBWORKFLOW: Uncompress and prepare reference genome files
+    // SUBWORKFLOW: Uncompress reference genome files
+    //
+    UNCOMPRESS_GENOME (
+        (params.fasta && params.fasta.endsWith('.gz') ? params.fasta : ''),
+        (params.gtf && params.gtf.endsWith('.gz') ? params.gtf : ''),
+        (params.gff && params.gff.endsWith('.gz') ? params.gff : ''),
+        (params.additional_fasta && params.additional_fasta.endsWith('.gz') ? params.additional_fasta : ''),
+        (params.transcript_fasta && params.transcript_fasta.endsWith('.gz') ? params.transcript_fasta : ''),
+        (params.gene_bed && params.gene_bed.endsWith('.gz') ? params.gene_bed : ''),
+        (params.splicesites ?: ''),
+        (params.star_index && params.star_index.endsWith('.tar.gz') ? params.star_index : ''),
+        (params.rsem_index && params.rsem_index.endsWith('.tar.gz') ? params.rsem_index : ''),
+        (params.salmon_index && params.salmon_index.endsWith('.tar.gz') ? params.salmon_index : ''),
+        (params.hisat2_index && params.hisat2_index.endsWith('.tar.gz') ? params.hisat2_index : ''),
+        (params.bbsplit_index && params.bbsplit_index.endsWith('.tar.gz') ? params.bbsplit_index : ''),
+        prepareToolIndices
+    )
+    ch_versions = ch_versions.mix(UNCOMPRESS_GENOME.out.versions)
+
+    ch_fasta = params.fasta ? params.fasta.endsWith('.gz') ? UNCOMPRESS_GENOME.out.fasta : Channel.value(file(params.fasta)) : Channel.empty()
+    ch_gtf = params.gtf ? params.gtf.endsWith('.gz') ? UNCOMPRESS_GENOME.out.gtf : Channel.value(file(params.gtf)) : Channel.empty()
+    ch_gff = params.gff ? params.gff.endsWith('.gz') ? UNCOMPRESS_GENOME.out.gff : Channel.value(file(params.gff)) : Channel.empty()
+
+    ch_splicesites = params.splicesites ? Channel.value(file(params.splicesites)) : UNCOMPRESS_GENOME.out.splicesites
+    ch_gene_bed = params.gene_bed ? params.gene_bed.endsWith('.gz') ? UNCOMPRESS_GENOME.out.gene_bed : Channel.value(file(params.gene_bed)) : Channel.empty()
+
+    ch_additional_fasta = params.additional_fasta ? params.additional_fasta.endsWith('.gz') ?
+        UNCOMPRESS_GENOME.out.additional_fasta :
+        Channel.value(file(params.additional_fasta)) :
+        Channel.empty()
+
+    ch_transcript_fasta = params.transcript_fasta ? params.transcript_fasta.endsWith('.gz') ?
+        UNCOMPRESS_GENOME.out.transcript_fasta :
+        Channel.value(file(params.transcript_fasta)) :
+        Channel.empty()
+
+
+    ch_star_index = params.star_index ? params.star_index.endsWith('.tar.gz') ? UNCOMPRESS_GENOME.out.star_index : Channel.value(file(params.star_index)) : Channel.empty()
+    ch_rsem_index = params.rsem_index ? params.rsem_index.endsWith('.tar.gz') ? UNCOMPRESS_GENOME.out.rsem_index : Channel.value(file(params.rsem_index)) : Channel.empty()
+    ch_salmon_index = params.salmon_index ? params.salmon_index.endsWith('.tar.gz') ? UNCOMPRESS_GENOME.out.salmon_index : Channel.value(file(params.salmon_index)) : Channel.empty()
+    ch_hisat2_index = params.hisat2_index ? params.hisat2_index.endsWith('.tar.gz') ? UNCOMPRESS_GENOME.out.hisat2_index : Channel.value(file(params.hisat2_index)) : Channel.empty()
+    ch_bbsplit_index = params.bbsplit_index ? params.bbsplit_index.endsWith('.tar.gz') ? UNCOMPRESS_GENOME.out.bbsplit_index : Channel.value(file(params.bbsplit_index)) : Channel.empty()
+
+    //
+    // SUBWORKFLOW: Prepare reference genome files
     //
     def biotype = params.gencode ? "gene_type" : params.featurecounts_group_type
     PREPARE_GENOME (
-        params.fasta,
-        params.gtf,
-        params.gff,
-        params.additional_fasta,
-        params.transcript_fasta,
-        params.gene_bed,
-        params.splicesites,
+        ch_fasta,
+        ch_gtf,
+        ch_gff,
+        ch_additional_fasta,
+        ch_transcript_fasta,
+        ch_gene_bed,
+        ch_splicesites,
         params.bbsplit_fasta_list,
-        params.star_index,
-        params.rsem_index,
-        params.salmon_index,
-        params.hisat2_index,
-        params.bbsplit_index,
+        ch_star_index,
+        ch_rsem_index,
+        ch_salmon_index,
+        ch_hisat2_index,
+        ch_bbsplit_index,
         params.gencode,
         is_aws_igenome,
         biotype,
         prepareToolIndices
     )
+
+    ch_fasta = PREPARE_GENOME.out.fasta
+    ch_fai = PREPARE_GENOME.out.fai
+    ch_chrom_sizes = PREPARE_GENOME.out.chrom_sizes
+    ch_gtf = params.gtf ? ch_gtf : PREPARE_GENOME.out.gtf
+    ch_transcript_fasta = params.transcript_fasta ? ch_transcript_fasta : PREPARE_GENOME.out.transcript_fasta
+    ch_bbsplit_index = params.bbsplit_index ? ch_bbsplit_index : PREPARE_GENOME.out.bbsplit_index
+    ch_star_index = params.star_index ? ch_star_index : PREPARE_GENOME.out.star_index
+    ch_rsem_index = params.rsem_index ? ch_rsem_index : PREPARE_GENOME.out.rsem_index
+    ch_hisat2_index = params.hisat2_index ? ch_hisat2_index : PREPARE_GENOME.out.hisat2_index
+    ch_salmon_index = params.salmon_index ? ch_salmon_index : PREPARE_GENOME.out.salmon_index
+
     ch_versions = ch_versions.mix(PREPARE_GENOME.out.versions)

     // Check if contigs in genome fasta file > 512 Mbp
     if (!params.skip_alignment && !params.bam_csi_index) {
-        PREPARE_GENOME
-            .out
-            .fai
-            .map { WorkflowRnaseq.checkMaxContigSize(it, log) }
+        ch_fai.map { WorkflowRnaseq.checkMaxContigSize(it, log) }
     }

     //
     // Create input channel from input file provided through params.input
     //
-    Channel
-        .fromSamplesheet("input")
-        .map {
-            meta, fastq_1, fastq_2 ->
-                if (!fastq_2) {
-                    return [ meta.id, meta + [ single_end:true ], [ fastq_1 ] ]
-                } else {
-                    return [ meta.id, meta + [ single_end:false ], [ fastq_1, fastq_2 ] ]
-                }
-        }
-        .groupTuple()
-        .map {
-            WorkflowRnaseq.validateInput(it)
-        }
-        .branch {
-            meta, fastqs ->
-                single  : fastqs.size() == 1
-                    return [ meta, fastqs.flatten() ]
-                multiple: fastqs.size() > 1
-                    return [ meta, fastqs.flatten() ]
+    ch_fastq = Channel.fromSamplesheet("input").map{ meta, fastq_1, fastq_2 ->
+        if (!fastq_2) {
+            return [ meta.id, meta + [ single_end:true ], [ fastq_1 ] ]
+        } else {
+            return [ meta.id, meta + [ single_end:false ], [ fastq_1, fastq_2 ] ]
         }
-        .set { ch_fastq }
+    }.groupTuple().map {WorkflowRnaseq.validateInput(it)}.branch{meta, fastqs ->
+        single  : fastqs.size() == 1
+            return [ meta, fastqs.flatten() ]
+        multiple: fastqs.size() > 1
+            return [ meta, fastqs.flatten() ]
+    }

     //
     // MODULE: Concatenate FastQ files from same sample if required
@@ -221,6 +265,7 @@ workflow RNASEQ {
         .reads
         .mix(ch_fastq.single)
         .set { ch_cat_fastq }
+
     ch_versions = ch_versions.mix(CAT_FASTQ.out.versions.first().ifEmpty(null))

     // Branch FastQ channels if 'auto' specified to infer strandedness
@@ -238,18 +283,16 @@ workflow RNASEQ {
     // SUBWORKFLOW: Sub-sample FastQ files and pseudo-align with Salmon to auto-infer strandedness
     //
     // Return empty channel if ch_strand_fastq.auto_strand is empty so salmon index isn't created
-    PREPARE_GENOME.out.fasta
-        .combine(ch_strand_fastq.auto_strand)
+    ch_genome_fasta = ch_fasta.combine(ch_strand_fastq.auto_strand)
         .map { it.first() }
         .first()
-        .set { ch_genome_fasta }

     FASTQ_SUBSAMPLE_FQ_SALMON (
         ch_strand_fastq.auto_strand,
         ch_genome_fasta,
-        PREPARE_GENOME.out.transcript_fasta,
-        PREPARE_GENOME.out.gtf,
-        PREPARE_GENOME.out.salmon_index,
+        ch_transcript_fasta,
+        ch_gtf,
+        ch_salmon_index,
         !params.salmon_index && !('salmon' in prepareToolIndices)
     )
     ch_versions = ch_versions.mix(FASTQ_SUBSAMPLE_FQ_SALMON.out.versions)
@@ -340,7 +383,7 @@ workflow RNASEQ {
     if (!params.skip_bbsplit) {
         BBMAP_BBSPLIT (
             ch_filtered_reads,
-            PREPARE_GENOME.out.bbsplit_index,
+            ch_bbsplit_index,
             [],
            [ [], [] ],
            false
@@ -382,13 +425,13 @@ workflow RNASEQ {
     if (!params.skip_alignment && params.aligner == 'star_salmon') {
         ALIGN_STAR (
             ch_filtered_reads,
-            PREPARE_GENOME.out.star_index,
-            PREPARE_GENOME.out.gtf,
+            ch_star_index,
+            ch_gtf,
             params.star_ignore_sjdbgtf,
             '',
             params.seq_center ?: '',
             is_aws_igenome,
-            PREPARE_GENOME.out.fasta.map { [ [:], it ] }
+            ch_fasta.map { [ [:], it ] }
         )
         ch_genome_bam = ALIGN_STAR.out.bam
         ch_genome_bam_index = ALIGN_STAR.out.bai
@@ -424,7 +467,7 @@ workflow RNASEQ {
         // Co-ordinate sort, index and run stats on transcriptome BAM
         BAM_SORT_STATS_SAMTOOLS (
             ch_transcriptome_bam,
-            PREPARE_GENOME.out.fasta.map { [ [:], it ] }
+            ch_fasta.map { [ [:], it ] }
         )
         ch_transcriptome_sorted_bam = BAM_SORT_STATS_SAMTOOLS.out.bam
         ch_transcriptome_sorted_bai = BAM_SORT_STATS_SAMTOOLS.out.bai
@@ -472,8 +515,8 @@ workflow RNASEQ {
         QUANTIFY_STAR_SALMON (
             ch_transcriptome_bam,
             ch_dummy_file,
-            PREPARE_GENOME.out.transcript_fasta,
-            PREPARE_GENOME.out.gtf,
+            ch_transcript_fasta,
+            ch_gtf,
             true,
             params.salmon_quant_libtype ?: ''
         )
@@ -498,8 +541,8 @@ workflow RNASEQ {
     if (!params.skip_alignment && params.aligner == 'star_rsem') {
         QUANTIFY_RSEM (
             ch_filtered_reads,
-            PREPARE_GENOME.out.rsem_index,
-            PREPARE_GENOME.out.fasta.map { [ [:], it ] }
+            ch_rsem_index,
+            ch_fasta.map { [ [:], it ] }
         )
         ch_genome_bam = QUANTIFY_RSEM.out.bam
         ch_genome_bam_index = QUANTIFY_RSEM.out.bai
@@ -532,9 +575,9 @@ workflow RNASEQ {
     if (!params.skip_alignment && params.aligner == 'hisat2') {
         FASTQ_ALIGN_HISAT2 (
             ch_filtered_reads,
-            PREPARE_GENOME.out.hisat2_index.map { [ [:], it ] },
-            PREPARE_GENOME.out.splicesites.map { [ [:], it ] },
-            PREPARE_GENOME.out.fasta.map { [ [:], it ] }
+            ch_hisat2_index.map { [ [:], it ] },
+            ch_splicesites.map { [ [:], it ] },
+            ch_fasta.map { [ [:], it ] }
         )
         ch_genome_bam = FASTQ_ALIGN_HISAT2.out.bam
         ch_genome_bam_index = FASTQ_ALIGN_HISAT2.out.bai
@@ -627,8 +670,8 @@ workflow RNASEQ {
     if (!params.skip_alignment && !params.skip_markduplicates && !params.with_umi) {
         BAM_MARKDUPLICATES_PICARD (
             ch_genome_bam,
-            PREPARE_GENOME.out.fasta.map { [ [:], it ] },
-            PREPARE_GENOME.out.fai.map { [ [:], it ] }
+            ch_fasta.map { [ [:], it ] },
+            ch_fai.map { [ [:], it ] }
         )
         ch_genome_bam = BAM_MARKDUPLICATES_PICARD.out.bam
         ch_genome_bam_index = BAM_MARKDUPLICATES_PICARD.out.bai
@@ -648,7 +691,7 @@ workflow RNASEQ {
     if (!params.skip_alignment && !params.skip_stringtie) {
         STRINGTIE_STRINGTIE (
             ch_genome_bam,
-            PREPARE_GENOME.out.gtf
+            ch_gtf
         )
         ch_versions = ch_versions.mix(STRINGTIE_STRINGTIE.out.versions.first())
     }
@@ -659,15 +702,11 @@
     ch_featurecounts_multiqc = Channel.empty()
     if (!params.skip_alignment && !params.skip_qc && !params.skip_biotype_qc && biotype) {
-        PREPARE_GENOME
-            .out
-            .gtf
-            .map { WorkflowRnaseq.biotypeInGtf(it, biotype, log) }
-            .set { biotype_in_gtf }
+        biotype_in_gtf = ch_gtf.map { WorkflowRnaseq.biotypeInGtf(it, biotype, log) }

         // Prevent any samples from running if GTF file doesn't have a valid biotype
         ch_genome_bam
-            .combine(PREPARE_GENOME.out.gtf)
+            .combine(ch_gtf)
             .combine(biotype_in_gtf)
             .filter { it[-1] }
            .map { it[0..
0) {
             BAM_RSEQC (
                 ch_genome_bam.join(ch_genome_bam_index, by: [0]),
-                PREPARE_GENOME.out.gene_bed,
+                ch_gene_bed,
                 rseqc_modules
             )
             ch_bamstat_multiqc = BAM_RSEQC.out.bamstat_txt
@@ -796,9 +835,9 @@ workflow RNASEQ {
     if (!params.skip_pseudo_alignment && params.pseudo_aligner == 'salmon') {
         QUANTIFY_SALMON (
             ch_filtered_reads,
-            PREPARE_GENOME.out.salmon_index,
+            ch_salmon_index,
             ch_dummy_file,
-            PREPARE_GENOME.out.gtf,
+            ch_gtf,
             false,
             params.salmon_quant_libtype ?: ''
         )
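The channel-routing expressions added to workflows/rnaseq.nf rely on Groovy's right-associative ternary operator; spelled out with explicit parentheses, the star_index case above is equivalent to the following sketch:

// Equivalent, parenthesised form of the nested ternary used for ch_star_index.
ch_star_index = params.star_index
    ? ( params.star_index.endsWith('.tar.gz')
        ? UNCOMPRESS_GENOME.out.star_index           // compressed archive: use the untarred channel
        : Channel.value(file(params.star_index)) )   // plain directory: stage it directly
    : Channel.empty()                                // not provided: PREPARE_GENOME builds the index instead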