From ab9a367d453b14655910fb38bee7512f1675e521 Mon Sep 17 00:00:00 2001 From: Ian Light <ilight1542@gmail.com> Date: Fri, 13 Sep 2024 12:17:12 +0000 Subject: [PATCH 01/19] partial fix for krakenuniq, error within kraken call for PE (thinks fasta?) --- subworkflows/local/bamfiltering.nf | 2 +- subworkflows/local/metagenomics_profiling.nf | 22 ++++++++++++++++--- .../local/utils_nfcore_eager_pipeline/main.nf | 1 - 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/subworkflows/local/bamfiltering.nf b/subworkflows/local/bamfiltering.nf index b3a98f416..b7906365c 100644 --- a/subworkflows/local/bamfiltering.nf +++ b/subworkflows/local/bamfiltering.nf @@ -93,7 +93,7 @@ workflow FILTER_BAM { .map { meta, fastqs -> def meta_new = meta.clone() - meta_new['single_end'] = true + meta_new['single_end_clone'] = true [ meta_new, fastqs.flatten() ] } CAT_FASTQ_UNMAPPED ( ch_paired_fastq_for_cat ) diff --git a/subworkflows/local/metagenomics_profiling.nf b/subworkflows/local/metagenomics_profiling.nf index f21cb5590..f89dc3e66 100644 --- a/subworkflows/local/metagenomics_profiling.nf +++ b/subworkflows/local/metagenomics_profiling.nf @@ -142,15 +142,31 @@ workflow METAGENOMICS_PROFILING { ch_krakenuniq_input = ch_reads .map{ meta, file -> [ - ['single_end':true], + ['single_end':meta['single_end']], file ] } .groupTuple(by:0) + .map { meta, files -> + [ + meta, files.flatten() + ]}.view() + + ch_krakenuniq_input = ch_krakenuniq_input.combine(ch_database) + .multiMap{ + meta, files, database -> + meta_files_input: [meta, files] + database: database + } +// NOTE to self: it will try to submit all reads to the process at once, +// then any PE reads will be lists within the list of read inputs with the single single_end_clone meta +// eg [single_end_clone:true], [/workspace/eager/work/24/66b0b0000323ab315c1e4d499a8b53/JK2802_JK2802_AGAATAACCTACCA_Mammoth_MT_Krause.merged.fastq.gz, 
/workspace/eager/work/03/4fcf3b2285211d76322257a51ace29/JK2782_JK2782_TGGCCGATCAACGA_BAM_Mammoth_MT_Krause.merged.fastq.gz, [/workspace/eager/work/eb/2d257c378f932115ea34e338bee444/JK2782_JK2782_TGGCCGATCAACGA_Mammoth_MT_Krause_1.merged.fastq.gz, /workspace/eager/work/eb/2d257c378f932115ea34e338bee444/JK2782_JK2782_TGGCCGATCAACGA_Mammoth_MT_Krause_2.merged.fastq.gz]]] +// +// so solution must split the inputs to krakenuniq by the PE and SE reads and then remerge the channel for outputting KRAKENUNIQ_PRELOADEDKRAKENUNIQ ( - ch_krakenuniq_input, - ch_database, + ch_krakenuniq_input.meta_files_input, + ch_krakenuniq_input.database, params.metagenomics_krakenuniq_ramchunksize, params.metagenomics_kraken2_savereads, true, // save read assignments diff --git a/subworkflows/local/utils_nfcore_eager_pipeline/main.nf b/subworkflows/local/utils_nfcore_eager_pipeline/main.nf index ac01b35c2..cdc9e3abe 100644 --- a/subworkflows/local/utils_nfcore_eager_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_eager_pipeline/main.nf @@ -225,7 +225,6 @@ def validateInputParameters() { if ( params.metagenomics_complexity_tool == 'prinseq' && params.metagenomics_prinseq_mode == 'entropy' && params.metagenomics_prinseq_dustscore != 0.5 ) { if (params.metagenomics_complexity_entropy == 0.3) { exit 1, ("[nf-core/eager] ERROR: Metagenomics: You picked PRINSEQ++ with 'entropy' mode but provided a dust score. Please specify an entropy filter threshold using the --metagenomics_complexity_entropy flag") } } - if ( params.run_metagenomics && params.preprocessing_skippairmerging ) { exit 1, ("[nf-core/eager] ERROR: Metagenomics: Currently no support for unmerged paired end reads inputs into Metagenomics subworkflow. 
Please rerun without --preprocessing_skippairmerging.") } if ( params.metagenomics_run_postprocessing && params.metagenomics_profiling_tool == 'malt' && From 6924787b8055c87fc03a85b1cfbe2a99f13604ff Mon Sep 17 00:00:00 2001 From: Ian Light <ilight1542@gmail.com> Date: Fri, 4 Oct 2024 09:55:28 +0000 Subject: [PATCH 02/19] notes for fixes going forward --- docs/development/metagenomics_paired_end.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 docs/development/metagenomics_paired_end.md diff --git a/docs/development/metagenomics_paired_end.md b/docs/development/metagenomics_paired_end.md new file mode 100644 index 000000000..650a87cc5 --- /dev/null +++ b/docs/development/metagenomics_paired_end.md @@ -0,0 +1,18 @@ +## investigation notes for updating code to allow for PE inputs into metagenomics profiling (eg for kraken, malt) + +see +https://github.com/nf-core/eager/issues/945 + +current issue is that the reads that go into mapping are not by default extracted as singletons and non-singletons, so we lose that information +Then downstream the inputs into the krakenuniq module (even if split correctly with meta vars) don't have the correct headers to parse the PE nature of the reads (since they have all been concatenated anyways, and just were ORIGINALLY PE) + +So: needs to be fixed up higher (eg in bamfiltering.nf, likely with a new adjustment to the SAMTOOLS_FASTQ_UNMAPPED, SAMTOOLS_FASTQ_MAPPED, and SAMTOOLS_VIEW_BAM_FILTERING modules ) + +ISSUE FOUND: while the outputting of PE reads is OK in bamfiltering.nf (fastq_mapped & fastq_unmapped) when overlap merging is not done cat_fastq weirdly merges singletons to one PE file and other to the other PE file, so then everything gets fucked up +""" +cat input1/JK2782_JK2782_TGGCCGATCAACGA_Mammoth_MT_Krause_unmapped_other.fastq.gz input3/JK2782_JK2782_TGGCCGATCAACGA_Mammoth_MT_Krause_unmapped_1.fastq.gz > JK2782_JK2782_TGGCCGATCAACGA_Mammoth_MT_Krause_1.merged.fastq.gz +cat 
input2/JK2782_JK2782_TGGCCGATCAACGA_Mammoth_MT_Krause_unmapped_singleton.fastq.gz input4/JK2782_JK2782_TGGCCGATCAACGA_Mammoth_MT_Krause_unmapped_2.fastq.gz > JK2782_JK2782_TGGCCGATCAACGA_Mammoth_MT_Krause_2.merged.fastq.gz +""" + +Decision is needed on what behavior is wanted for unmapped singletons, other. and then likely remove the call to cat_fastq for PE reads +Possibly just split to also have the singletons parsed separately? From 376a639923ba51f439d05e114c1c605338529d69 Mon Sep 17 00:00:00 2001 From: Ian Light <ilight1542@gmail.com> Date: Fri, 18 Oct 2024 11:38:07 +0000 Subject: [PATCH 03/19] added warning, and correct parsing for input into metagenomic screening for PE vs SE --- subworkflows/local/bamfiltering.nf | 32 ++++++++----------- .../local/utils_nfcore_eager_pipeline/main.nf | 1 + 2 files changed, 15 insertions(+), 18 deletions(-) diff --git a/subworkflows/local/bamfiltering.nf b/subworkflows/local/bamfiltering.nf index b7906365c..0b7ce8484 100644 --- a/subworkflows/local/bamfiltering.nf +++ b/subworkflows/local/bamfiltering.nf @@ -86,39 +86,35 @@ workflow FILTER_BAM { } if ( ( params.run_metagenomics && params.metagenomics_input == 'unmapped' ) && params.preprocessing_skippairmerging ) { - ch_paired_fastq_for_cat = SAMTOOLS_FASTQ_UNMAPPED.out.fastq + ch_paired_fastq_for_cat = SAMTOOLS_FASTQ_UNMAPPED.out.fastq.filter { !it[0].single_end } + + ch_single_fastq_for_cat = SAMTOOLS_FASTQ_UNMAPPED.out.fastq .mix(SAMTOOLS_FASTQ_UNMAPPED.out.singleton) .mix(SAMTOOLS_FASTQ_UNMAPPED.out.other) .groupTuple() - .map { - meta, fastqs -> - def meta_new = meta.clone() - meta_new['single_end_clone'] = true - [ meta_new, fastqs.flatten() ] - } - CAT_FASTQ_UNMAPPED ( ch_paired_fastq_for_cat ) + .filter{ it[0].single_end } + + CAT_FASTQ_UNMAPPED ( ch_single_fastq_for_cat ) } // TODO: see request https://github.com/nf-core/eager/issues/945 if ( ( params.run_metagenomics && ( params.metagenomics_input == 'mapped' || params.metagenomics_input == 'all' ) ) && 
params.preprocessing_skippairmerging ) { - ch_paired_fastq_for_cat = SAMTOOLS_FASTQ_MAPPED.out.fastq + ch_paired_fastq_for_cat = SAMTOOLS_FASTQ_UNMAPPED.out.fastq.filter { !it[0].single_end } + + ch_single_fastq_for_cat = SAMTOOLS_FASTQ_MAPPED.out.fastq .mix(SAMTOOLS_FASTQ_MAPPED.out.singleton) .mix(SAMTOOLS_FASTQ_MAPPED.out.other) .groupTuple() - .map { - meta, fastqs -> - def meta_new = meta.clone() - meta_new['single_end'] = true - [ meta_new, fastqs.flatten() ] - } - CAT_FASTQ_MAPPED ( ch_paired_fastq_for_cat ) + .filter{ it[0].single_end } + + CAT_FASTQ_MAPPED ( ch_single_fastq_for_cat ) } // Routing for metagenomic screening -> first accounting for paired-end mapping, then merged mapping, then no metagenomics if ( ( params.run_metagenomics && params.metagenomics_input == 'unmapped' ) && params.preprocessing_skippairmerging ) { - ch_fastq_for_metagenomics = CAT_FASTQ_UNMAPPED.out.reads + ch_fastq_for_metagenomics = CAT_FASTQ_UNMAPPED.out.reads.mix(ch_paired_fastq_for_cat) } else if ( ( params.run_metagenomics && ( params.metagenomics_input == 'mapped' || params.metagenomics_input == 'all' ) ) && params.preprocessing_skippairmerging ) { - ch_fastq_for_metagenomics = CAT_FASTQ_MAPPED.out.reads + ch_fastq_for_metagenomics = CAT_FASTQ_MAPPED.out.reads.mix(ch_paired_fastq_for_cat) } else if ( params.run_metagenomics && params.metagenomics_input == 'unmapped' ) { ch_fastq_for_metagenomics = SAMTOOLS_FASTQ_UNMAPPED.out.other } else if ( params.run_metagenomics && ( params.metagenomics_input == 'mapped' || params.metagenomics_input == 'all' )) { diff --git a/subworkflows/local/utils_nfcore_eager_pipeline/main.nf b/subworkflows/local/utils_nfcore_eager_pipeline/main.nf index cdc9e3abe..1c0818cbf 100644 --- a/subworkflows/local/utils_nfcore_eager_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_eager_pipeline/main.nf @@ -233,6 +233,7 @@ def validateInputParameters() { !params.metagenomics_maltextract_ncbidir ) ){ exit 1, ("[nf-core/eager] ERROR: Metagenomics: You 
picked MALT with postprocessing but didnt provided required input files. Please provide the --metagenomics_maltextract_taxonlist and --metagenomics_maltextract_ncbidir flags") } + if ( params.run_metagenomics && params.preprocessing_skippairmerging ) { log.warn("[nf-core/eager] WARNING: --preprocessing_skippairmerging selected with running metagenomics! All singletons from paired end samples will be discarded prior to input for metagenomics screening!") } if ( params.run_genotyping && ! params.genotyping_tool ) { exit 1, ("[nf-core/eager] ERROR: --run_genotyping was specified, but no --genotyping_tool was specified.") } if ( params.run_genotyping && ! params.genotyping_source ) { exit 1, ("[nf-core/eager] ERROR: --run_genotyping was specified, but no --genotyping_source was specified.") } if ( params.genotyping_source == 'trimmed' && ! params.run_trim_bam ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'trimmed' unless BAM trimming is turned on with `--run_trim_bam`.") } From d8c8c30fc7eea3a49a184f545923b9286fe35716 Mon Sep 17 00:00:00 2001 From: Ian Light <ilight1542@gmail.com> Date: Fri, 25 Oct 2024 08:43:53 +0000 Subject: [PATCH 04/19] full implementation of paired end metagenomics krakenuniq --- conf/modules.config | 1 + modules.json | 2 +- .../preloadedkrakenuniq/environment.yml | 5 + .../krakenuniq/preloadedkrakenuniq/main.nf | 112 +++++++------ .../krakenuniq/preloadedkrakenuniq/meta.yml | 147 +++++++++++------- subworkflows/local/metagenomics_profiling.nf | 8 +- 6 files changed, 164 insertions(+), 111 deletions(-) create mode 100644 modules/nf-core/krakenuniq/preloadedkrakenuniq/environment.yml diff --git a/conf/modules.config b/conf/modules.config index 8c1d91620..82b09f21f 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -989,6 +989,7 @@ process { mode: params.publish_dir_mode, pattern: '*.{txt,fastq.gz}' ] + ext.prefix = { "${meta.single_end}" } } withName: METAPHLAN_METAPHLAN { diff --git a/modules.json b/modules.json 
index a2fc85bbb..156449adb 100644 --- a/modules.json +++ b/modules.json @@ -187,7 +187,7 @@ }, "krakenuniq/preloadedkrakenuniq": { "branch": "master", - "git_sha": "a6eb17f65b3ee5761c25c075a6166c9f76733cee", + "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", "installed_by": ["modules"] }, "malt/run": { diff --git a/modules/nf-core/krakenuniq/preloadedkrakenuniq/environment.yml b/modules/nf-core/krakenuniq/preloadedkrakenuniq/environment.yml new file mode 100644 index 000000000..bbf85c335 --- /dev/null +++ b/modules/nf-core/krakenuniq/preloadedkrakenuniq/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::krakenuniq=1.0.4 diff --git a/modules/nf-core/krakenuniq/preloadedkrakenuniq/main.nf b/modules/nf-core/krakenuniq/preloadedkrakenuniq/main.nf index 0cb402f77..d24f75d29 100644 --- a/modules/nf-core/krakenuniq/preloadedkrakenuniq/main.nf +++ b/modules/nf-core/krakenuniq/preloadedkrakenuniq/main.nf @@ -2,49 +2,51 @@ process KRAKENUNIQ_PRELOADEDKRAKENUNIQ { tag "$meta.id" label 'process_high' - conda "bioconda::krakenuniq=1.0.2" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/krakenuniq:1.0.2--pl5321h19e8d03_0': - 'quay.io/biocontainers/krakenuniq:1.0.2--pl5321h19e8d03_0' }" + 'https://depot.galaxyproject.org/singularity/krakenuniq:1.0.4--pl5321h6dccd9a_2': + 'biocontainers/krakenuniq:1.0.4--pl5321h6dccd9a_2' }" input: - tuple val(meta), path(fastqs) - path db + tuple val(meta), path(sequences) + val sequence_type + path db val ram_chunk_size - val save_output_fastqs + val save_output_reads val report_file val save_output output: - tuple val(meta), path('*.classified{.,_}*') , optional:true, emit: classified_reads_fastq - tuple val(meta), path('*.unclassified{.,_}*') , optional:true, emit: unclassified_reads_fastq - tuple val(meta), path('*classified.txt') , optional:true, emit: classified_assignment - tuple val(meta), path('*report.txt') , emit: report - - path "versions.yml" , emit: versions + tuple val(meta), path("*.classified.${sequence_type}.gz") , optional:true, emit: classified_reads + tuple val(meta), path("*.unclassified.${sequence_type}.gz"), optional:true, emit: unclassified_reads + tuple val(meta), path('*.krakenuniq.classified.txt') , optional:true, emit: classified_assignment + tuple val(meta), path('*.krakenuniq.report.txt') , emit: report + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when script: + assert sequence_type in ['fasta', 'fastq'] + def args = task.ext.args ?: '' def args2 = task.ext.args ?: '' - def classified = meta.single_end ? '"\${PREFIX}.classified.fastq"' : '"\${PREFIX}.classified#.fastq"' - def unclassified = meta.single_end ? '"\${PREFIX}.unclassified.fastq"' : '"\${PREFIX}.unclassified#.fastq"' - def classified_option = save_output_fastqs ? "--classified-out ${classified}" : '' - def unclassified_option = save_output_fastqs ? "--unclassified-out ${unclassified}" : '' + classified = meta.single_end ? 
"\${PREFIX}.classified.${sequence_type}" : "\${PREFIX}.merged.classified.${sequence_type}" + unclassified = meta.single_end ? "\${PREFIX}.unclassified.${sequence_type}" : "\${PREFIX}.merged.unclassified.${sequence_type}" + classified_option = save_output_reads ? "--classified-out \"${classified}\"" : '' + unclassified_option = save_output_reads ? "--unclassified-out \"${unclassified}\"" : '' def output_option = save_output ? '--output "\${PREFIX}.krakenuniq.classified.txt"' : '' def report = report_file ? '--report-file "\${PREFIX}.krakenuniq.report.txt"' : '' - def compress_reads_command = save_output_fastqs ? 'gzip --no-name *.fastq' : '' + compress_reads_command = save_output_reads ? "find . -name '*.${sequence_type}' -print0 | xargs -0 -t -P ${task.cpus} -I % gzip --no-name %" : '' if (meta.single_end) { """ krakenuniq \\ + $args \\ --db $db \\ --preload \\ --preload-size $ram_chunk_size \\ - --threads $task.cpus \\ - $args + --threads $task.cpus strip_suffix() { local result=\$1 @@ -52,7 +54,7 @@ process KRAKENUNIQ_PRELOADEDKRAKENUNIQ { echo "\${result%%.*}" } - printf "%s\\n" ${fastqs} | while read FASTQ; do \\ + printf "%s\\n" ${sequences} | while read FASTQ; do \\ PREFIX="\$(strip_suffix "\${FASTQ}")" krakenuniq \\ @@ -62,7 +64,6 @@ process KRAKENUNIQ_PRELOADEDKRAKENUNIQ { $output_option \\ $unclassified_option \\ $classified_option \\ - $output_option \\ $args2 \\ "\${FASTQ}" done @@ -77,11 +78,11 @@ process KRAKENUNIQ_PRELOADEDKRAKENUNIQ { } else { """ krakenuniq \\ + $args \\ --db $db \\ --preload \\ --preload-size $ram_chunk_size \\ - --threads $task.cpus \\ - $args + --threads $task.cpus strip_suffix() { local result @@ -91,7 +92,7 @@ process KRAKENUNIQ_PRELOADEDKRAKENUNIQ { echo "\${result%.}" } - printf "%s %s\\n" ${fastqs} | while read FASTQ; do \\ + printf "%s %s\\n" ${sequences} | while read FASTQ; do \\ read -r -a FASTQ <<< "\${FASTQ}" PREFIX="\$(printf "%s\\n" "\${FASTQ[@]}" | sed -e 'N;s/^\\(.*\\).*\\n\\1.*\$/\\1\\n\\1/;D' | strip_suffix)" @@ 
-102,7 +103,6 @@ process KRAKENUNIQ_PRELOADEDKRAKENUNIQ { $output_option \\ $unclassified_option \\ $classified_option \\ - $output_option \\ --paired \\ $args2 \\ "\${FASTQ[@]}" @@ -118,24 +118,26 @@ process KRAKENUNIQ_PRELOADEDKRAKENUNIQ { } stub: + assert sequence_type in ['fasta', 'fastq'] + def args = task.ext.args ?: '' def args2 = task.ext.args ?: '' - def classified = meta.single_end ? '"\${PREFIX}.classified.fastq"' : '"\${PREFIX}.classified#.fastq"' - def unclassified = meta.single_end ? '"\${PREFIX}.unclassified.fastq"' : '"\${PREFIX}.unclassified#.fastq"' - def classified_option = save_output_fastqs ? "--classified-out ${classified}" : '' - def unclassified_option = save_output_fastqs ? "--unclassified-out ${unclassified}" : '' + classified = meta.single_end ? "\${PREFIX}.classified.${sequence_type}" : "\${PREFIX}.merged.classified.${sequence_type}" + unclassified = meta.single_end ? "\${PREFIX}.unclassified.${sequence_type}" : "\${PREFIX}.merged.unclassified.${sequence_type}" + classified_option = save_output_reads ? "--classified-out \"${classified}\"" : '' + unclassified_option = save_output_reads ? "--unclassified-out \"${unclassified}\"" : '' def output_option = save_output ? '--output "\${PREFIX}.krakenuniq.classified.txt"' : '' def report = report_file ? '--report-file "\${PREFIX}.krakenuniq.report.txt"' : '' - def compress_reads_command = save_output_fastqs ? 'gzip --no-name *.fastq' : '' + compress_reads_command = save_output_reads ? "find . 
-name '*.${sequence_type}' -print0 | xargs -0 -t -P ${task.cpus} -I % gzip --no-name %" : '' if (meta.single_end) { """ echo krakenuniq \\ + $args \\ --db $db \\ --preload \\ --preload-size $ram_chunk_size \\ - --threads $task.cpus \\ - $args + --threads $task.cpus strip_suffix() { local result=\$1 @@ -143,7 +145,15 @@ process KRAKENUNIQ_PRELOADEDKRAKENUNIQ { echo "\${result%%.*}" } - printf "%s\\n" ${fastqs} | while read FASTQ; do \\ + create_file() { + echo '<3 nf-core' > "\$1" + } + + create_gzip_file() { + echo '<3 nf-core' | gzip -n > "\$1" + } + + printf "%s\\n" ${sequences} | while read FASTQ; do \\ echo "\${FASTQ}" PREFIX="\$(strip_suffix "\${FASTQ}")" echo "\${PREFIX}" @@ -155,17 +165,16 @@ process KRAKENUNIQ_PRELOADEDKRAKENUNIQ { $output_option \\ $unclassified_option \\ $classified_option \\ - $output_option \\ $args2 \\ "\${FASTQ}" - touch "\${PREFIX}.classified.fastq.gz" - touch "\${PREFIX}.krakenuniq.classified.txt" - touch "\${PREFIX}.krakenuniq.report.txt" - touch "\${PREFIX}.unclassified.fastq.gz" + create_file "\${PREFIX}.krakenuniq.classified.txt" + create_file "\${PREFIX}.krakenuniq.report.txt" + create_gzip_file "\${PREFIX}.classified.${sequence_type}.gz" + create_gzip_file "\${PREFIX}.unclassified.${sequence_type}.gz" done - echo $compress_reads_command + echo "$compress_reads_command" cat <<-END_VERSIONS > versions.yml "${task.process}": @@ -175,11 +184,11 @@ process KRAKENUNIQ_PRELOADEDKRAKENUNIQ { } else { """ echo krakenuniq \\ + $args \\ --db $db \\ --preload \\ --preload-size $ram_chunk_size \\ - --threads $task.cpus \\ - $args + --threads $task.cpus strip_suffix() { local result @@ -189,7 +198,15 @@ process KRAKENUNIQ_PRELOADEDKRAKENUNIQ { echo "\${result%.}" } - printf "%s %s\\n" ${fastqs} | while read FASTQ; do \\ + create_file() { + echo '<3 nf-core' > "\$1" + } + + create_gzip_file() { + echo '<3 nf-core' | gzip -n > "\$1" + } + + printf "%s %s\\n" ${sequences} | while read FASTQ; do \\ read -r -a FASTQ <<< "\${FASTQ}" echo 
"\${FASTQ[@]}" PREFIX="\$(printf "%s\\n" "\${FASTQ[@]}" | sed -e 'N;s/^\\(.*\\).*\\n\\1.*\$/\\1\\n\\1/;D' | strip_suffix)" @@ -202,18 +219,17 @@ process KRAKENUNIQ_PRELOADEDKRAKENUNIQ { $output_option \\ $unclassified_option \\ $classified_option \\ - $output_option \\ --paired \\ $args2 \\ "\${FASTQ[@]}" - touch "\${PREFIX}.classified_1.fastq.gz" "\${PREFIX}.classified_2.fastq.gz" - touch "\${PREFIX}.krakenuniq.classified.txt" - touch "\${PREFIX}.krakenuniq.report.txt" - touch "\${PREFIX}.unclassified_1.fastq.gz" "\${PREFIX}.unclassified_2.fastq.gz" + create_file "\${PREFIX}.krakenuniq.classified.txt" + create_file "\${PREFIX}.krakenuniq.report.txt" + create_gzip_file "\${PREFIX}.merged.classified.${sequence_type}.gz" + create_gzip_file "\${PREFIX}.merged.unclassified.${sequence_type}.gz" done - echo $compress_reads_command + echo "$compress_reads_command" cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/krakenuniq/preloadedkrakenuniq/meta.yml b/modules/nf-core/krakenuniq/preloadedkrakenuniq/meta.yml index 4ac645c55..1af2350d6 100644 --- a/modules/nf-core/krakenuniq/preloadedkrakenuniq/meta.yml +++ b/modules/nf-core/krakenuniq/preloadedkrakenuniq/meta.yml @@ -8,71 +8,106 @@ keywords: - db tools: - "krakenuniq": - description: "Metagenomics classifier with unique k-mer counting for more specific results" + description: "Metagenomics classifier with unique k-mer counting for more specific + results" homepage: https://github.com/fbreitwieser/krakenuniq documentation: https://github.com/fbreitwieser/krakenuniq doi: 10.1186/s13059-018-1568-0 licence: ["MIT"] - + identifier: biotools:KrakenUniq input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. 
[ id:'test', single_end:false ] - - fastqs: - type: file - description: List of input FastQ files - - db: - type: directory - description: KrakenUniq database - - ram_chunk_size: - type: val - description: Amount of maximum amount of RAM each chunk of database that should be loaded at any one time - pattern: "*GB" - - save_output_fastqs: - type: boolean - description: | - If true, optional commands are added to save classified and unclassified reads - as fastq files - - save_reads_assignment: - type: boolean - description: | - If true, an optional command is added to save a file reporting the taxonomic - classification of each input read + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - sequences: + type: file + description: List of input files containing sequences. All of them must be either + in FASTA or FASTQ format. + - - sequence_type: + type: string + description: Format of all given sequencing files as literal string, either + 'fasta' or 'fastq'. + pattern: "{fasta,fastq}" + - - db: + type: directory + description: KrakenUniq database + - - ram_chunk_size: + type: string + description: Amount of maximum amount of RAM each chunk of database that should + be loaded at any one time + pattern: "*GB" + - - save_output_reads: + type: boolean + description: | + Optionally, commands are added to save classified and unclassified reads + as FASTQ or FASTA files depending on the input format. When the input + is paired-end, the single output FASTQ contains merged reads. + - - report_file: + type: boolean + description: Whether to generate a report of relative abundances. + - - save_output: + type: boolean + description: Whether to save a file reporting the taxonomic classification of + each input read. output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. 
[ id:'test', single_end:false ] - - classified_reads_fastq: - type: file - description: | - Reads classified as belonging to any of the taxa - on the KrakenUniq database. - pattern: "*.fastq.gz" - - unclassified_reads_fastq: - type: file - description: | - Reads not classified to any of the taxa - on the KrakenUniq database. - pattern: "*.fastq.gz" + - classified_reads: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.classified.${sequence_type}.gz": + type: file + description: | + Reads classified as belonging to any of the taxa + in the KrakenUniq reference database. + pattern: "*.classified.{fastq,fasta}.gz" + - unclassified_reads: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.unclassified.${sequence_type}.gz": + type: file + description: | + Reads not classified to any of the taxa + in the KrakenUniq reference database. + pattern: "*.unclassified.{fastq,fasta}.gz" - classified_assignment: - type: file - description: | - KrakenUniq output file indicating the taxonomic assignment of - each input read ## DOUBLE CHECK!! + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.krakenuniq.classified.txt": + type: file + description: | + KrakenUniq output file indicating the taxonomic assignment of + each input read ## DOUBLE CHECK!! + pattern: "*.krakenuniq.classified.txt" - report: - type: file - description: | - KrakenUniq report containing stats about classified - and not classifed reads. - pattern: "*.report.txt" + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.krakenuniq.report.txt": + type: file + description: | + KrakenUniq report containing statistics about classified + and unclassified reads. 
+ pattern: "*.krakenuniq.report.txt" - versions: - type: file - description: File containing software versions - pattern: "versions.yml" + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" authors: - "@mjamy" - "@Midnighter" +maintainers: + - "@mjamy" + - "@Midnighter" diff --git a/subworkflows/local/metagenomics_profiling.nf b/subworkflows/local/metagenomics_profiling.nf index f89dc3e66..d7f74c392 100644 --- a/subworkflows/local/metagenomics_profiling.nf +++ b/subworkflows/local/metagenomics_profiling.nf @@ -142,7 +142,7 @@ workflow METAGENOMICS_PROFILING { ch_krakenuniq_input = ch_reads .map{ meta, file -> [ - ['single_end':meta['single_end']], + ['single_end':meta['single_end']], // retain single_end vs paired_end bools for input splitting file ] } @@ -158,14 +158,10 @@ workflow METAGENOMICS_PROFILING { meta_files_input: [meta, files] database: database } -// NOTE to self: it will try to submit all reads to the process at once, -// then any PE reads will be lists within the list of read inputs with the single single_end_clone meta -// eg [single_end_clone:true], [/workspace/eager/work/24/66b0b0000323ab315c1e4d499a8b53/JK2802_JK2802_AGAATAACCTACCA_Mammoth_MT_Krause.merged.fastq.gz, /workspace/eager/work/03/4fcf3b2285211d76322257a51ace29/JK2782_JK2782_TGGCCGATCAACGA_BAM_Mammoth_MT_Krause.merged.fastq.gz, [/workspace/eager/work/eb/2d257c378f932115ea34e338bee444/JK2782_JK2782_TGGCCGATCAACGA_Mammoth_MT_Krause_1.merged.fastq.gz, /workspace/eager/work/eb/2d257c378f932115ea34e338bee444/JK2782_JK2782_TGGCCGATCAACGA_Mammoth_MT_Krause_2.merged.fastq.gz]]] -// -// so solution must split the inputs to krakenuniq by the PE and SE reads and then remerge the channel for outputting KRAKENUNIQ_PRELOADEDKRAKENUNIQ ( ch_krakenuniq_input.meta_files_input, + 'fastq', // only fastq files can get to the input channel ch_krakenuniq_input.database, params.metagenomics_krakenuniq_ramchunksize, params.metagenomics_kraken2_savereads, 
From b10d16b3057370e0ec0d6e997b58870e7b5dfb06 Mon Sep 17 00:00:00 2001 From: Ian Light <ilight1542@gmail.com> Date: Fri, 25 Oct 2024 09:58:17 +0000 Subject: [PATCH 05/19] updated warn and comments --- subworkflows/local/metagenomics_profiling.nf | 2 +- subworkflows/local/utils_nfcore_eager_pipeline/main.nf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/subworkflows/local/metagenomics_profiling.nf b/subworkflows/local/metagenomics_profiling.nf index d7f74c392..8d9291f8b 100644 --- a/subworkflows/local/metagenomics_profiling.nf +++ b/subworkflows/local/metagenomics_profiling.nf @@ -137,7 +137,7 @@ workflow METAGENOMICS_PROFILING { } else if ( params.metagenomics_profiling_tool == 'krakenuniq' ) { - // run krakenuniq once for all samples + // run krakenuniq once for all samples, unless non-merged PE vs SE data ch_krakenuniq_input = ch_reads .map{ meta, file -> diff --git a/subworkflows/local/utils_nfcore_eager_pipeline/main.nf b/subworkflows/local/utils_nfcore_eager_pipeline/main.nf index 1c0818cbf..8effb2353 100644 --- a/subworkflows/local/utils_nfcore_eager_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_eager_pipeline/main.nf @@ -233,7 +233,7 @@ def validateInputParameters() { !params.metagenomics_maltextract_ncbidir ) ){ exit 1, ("[nf-core/eager] ERROR: Metagenomics: You picked MALT with postprocessing but didnt provided required input files. Please provide the --metagenomics_maltextract_taxonlist and --metagenomics_maltextract_ncbidir flags") } - if ( params.run_metagenomics && params.preprocessing_skippairmerging ) { log.warn("[nf-core/eager] WARNING: --preprocessing_skippairmerging selected with running metagenomics! All singletons from paired end samples will be discarded prior to input for metagenomics screening!") } + if ( params.run_metagenomics && params.preprocessing_skippairmerging ) { log.warn("[nf-core/eager] WARNING: --preprocessing_skippairmerging selected in combination with metagenomics! 
All singletons from paired end samples will be discarded prior to input for metagenomics screening! This may be inappropriate for malt and metaphlan, which do not utilize paired end information!") } if ( params.run_genotyping && ! params.genotyping_tool ) { exit 1, ("[nf-core/eager] ERROR: --run_genotyping was specified, but no --genotyping_tool was specified.") } if ( params.run_genotyping && ! params.genotyping_source ) { exit 1, ("[nf-core/eager] ERROR: --run_genotyping was specified, but no --genotyping_source was specified.") } if ( params.genotyping_source == 'trimmed' && ! params.run_trim_bam ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'trimmed' unless BAM trimming is turned on with `--run_trim_bam`.") } From b8239c7fd4518fcc2ad2aa419a8cc5bc4b1a231c Mon Sep 17 00:00:00 2001 From: Ian Light <ilight1542@gmail.com> Date: Fri, 15 Nov 2024 10:35:38 +0000 Subject: [PATCH 06/19] updated error catching, tested profiling with paired end inputs --- subworkflows/local/utils_nfcore_eager_pipeline/main.nf | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/subworkflows/local/utils_nfcore_eager_pipeline/main.nf b/subworkflows/local/utils_nfcore_eager_pipeline/main.nf index 8effb2353..c7e6daf74 100644 --- a/subworkflows/local/utils_nfcore_eager_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_eager_pipeline/main.nf @@ -233,9 +233,10 @@ def validateInputParameters() { !params.metagenomics_maltextract_ncbidir ) ){ exit 1, ("[nf-core/eager] ERROR: Metagenomics: You picked MALT with postprocessing but didnt provided required input files. Please provide the --metagenomics_maltextract_taxonlist and --metagenomics_maltextract_ncbidir flags") } - if ( params.run_metagenomics && params.preprocessing_skippairmerging ) { log.warn("[nf-core/eager] WARNING: --preprocessing_skippairmerging selected in combination with metagenomics! All singletons from paired end samples will be discarded prior to input for metagenomics screening! 
This may be inappropriate for malt and metaphlan, which do not utilize paired end information!") } - if ( params.run_genotyping && ! params.genotyping_tool ) { exit 1, ("[nf-core/eager] ERROR: --run_genotyping was specified, but no --genotyping_tool was specified.") } - if ( params.run_genotyping && ! params.genotyping_source ) { exit 1, ("[nf-core/eager] ERROR: --run_genotyping was specified, but no --genotyping_source was specified.") } + if ( params.run_metagenomics && params.metagenomics_profiling_tool == 'metaphlan' && params.preprocessing_skippairmerging ) { log.warn("[nf-core/eager] WARNING: --preprocessing_skippairmerging selected in combination with MetPhlAn4 for metagenomics! All singletons from paired end samples will be discarded prior to input for metagenomics screening! This may be inappropriate for metaphlan, which does not utilize paired-end information!") } + if ( params.run_metagenomics && params.metagenomics_profiling_tool == 'malt' && params.preprocessing_skippairmerging ) { exit 1, ("[nf-core/eager] ERROR: --preprocessing_skippairmerging selected in combination with MALT for metagenomics! MALT cannot accept separated read pair information, please remove --preprocessing_skippairmerging parameter.") } + if ( params.run_genotyping && ! params.genotyping_tool ) { exit 1, ("[nf-core/eager] ERROR: --run_genotyping was specified, but no --genotyping_tool was specified.") } + if ( params.run_genotyping && ! params.genotyping_source ) { exit 1, ("[nf-core/eager] ERROR: --run_genotyping was specified, but no --genotyping_source was specified.") } if ( params.genotyping_source == 'trimmed' && ! params.run_trim_bam ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'trimmed' unless BAM trimming is turned on with `--run_trim_bam`.") } if ( params.genotyping_source == 'pmd' && ! 
params.run_pmd_filtering ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'pmd' unless PMD-filtering is ran.") } if ( params.genotyping_source == 'rescaled' && ! params.run_mapdamage_rescaling ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'rescaled' unless aDNA damage rescaling is ran.") } From a9179535845828bac98a1051629626679e4734d0 Mon Sep 17 00:00:00 2001 From: Ian Light <ilight1542@gmail.com> Date: Fri, 15 Nov 2024 11:08:21 +0000 Subject: [PATCH 07/19] added tags for log, updated warns/errors --- conf/modules.config | 1 + subworkflows/local/bamfiltering.nf | 1 - subworkflows/local/utils_nfcore_eager_pipeline/main.nf | 4 ++-- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 82b09f21f..1be7b8540 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -984,6 +984,7 @@ process { } withName: KRAKENUNIQ_PRELOADEDKRAKENUNIQ { + tag = { "${meta.single_end}" } publishDir = [ path: { "${params.outdir}/metagenomics/profiling/krakenuniq/" }, mode: params.publish_dir_mode, diff --git a/subworkflows/local/bamfiltering.nf b/subworkflows/local/bamfiltering.nf index 0b7ce8484..88b5626eb 100644 --- a/subworkflows/local/bamfiltering.nf +++ b/subworkflows/local/bamfiltering.nf @@ -97,7 +97,6 @@ workflow FILTER_BAM { CAT_FASTQ_UNMAPPED ( ch_single_fastq_for_cat ) } - // TODO: see request https://github.com/nf-core/eager/issues/945 if ( ( params.run_metagenomics && ( params.metagenomics_input == 'mapped' || params.metagenomics_input == 'all' ) ) && params.preprocessing_skippairmerging ) { ch_paired_fastq_for_cat = SAMTOOLS_FASTQ_UNMAPPED.out.fastq.filter { !it[0].single_end } diff --git a/subworkflows/local/utils_nfcore_eager_pipeline/main.nf b/subworkflows/local/utils_nfcore_eager_pipeline/main.nf index c7e6daf74..2964373ba 100644 --- a/subworkflows/local/utils_nfcore_eager_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_eager_pipeline/main.nf @@ -233,8 +233,8 @@ def 
validateInputParameters() { !params.metagenomics_maltextract_ncbidir ) ){ exit 1, ("[nf-core/eager] ERROR: Metagenomics: You picked MALT with postprocessing but didnt provided required input files. Please provide the --metagenomics_maltextract_taxonlist and --metagenomics_maltextract_ncbidir flags") } - if ( params.run_metagenomics && params.metagenomics_profiling_tool == 'metaphlan' && params.preprocessing_skippairmerging ) { log.warn("[nf-core/eager] WARNING: --preprocessing_skippairmerging selected in combination with MetPhlAn4 for metagenomics! All singletons from paired end samples will be discarded prior to input for metagenomics screening! This may be inappropriate for metaphlan, which does not utilize paired-end information!") } - if ( params.run_metagenomics && params.metagenomics_profiling_tool == 'malt' && params.preprocessing_skippairmerging ) { exit 1, ("[nf-core/eager] ERROR: --preprocessing_skippairmerging selected in combination with MALT for metagenomics! MALT cannot accept separated read pair information, please remove --preprocessing_skippairmerging parameter.") } + if ( params.run_metagenomics && params.preprocessing_skippairmerging ) { log.warn("[nf-core/eager] WARNING: --preprocessing_skippairmerging selected in combination for metagenomics! All singletons from paired end samples will be discarded prior to input for metagenomics screening! Additionally, this may be inappropriate for metaphlan, which does not utilize paired-end information!") } + if ( params.run_metagenomics && params.preprocessing_skippairmerging && params.metagenomics_profiling_tool == 'malt' ) { exit 1, ("[nf-core/eager] ERROR: --preprocessing_skippairmerging selected in combination with MALT for metagenomics! MALT cannot accept separated read pair information, please remove --preprocessing_skippairmerging parameter.") } if ( params.run_genotyping && ! 
params.genotyping_tool ) { exit 1, ("[nf-core/eager] ERROR: --run_genotyping was specified, but no --genotyping_tool was specified.") } if ( params.run_genotyping && ! params.genotyping_source ) { exit 1, ("[nf-core/eager] ERROR: --run_genotyping was specified, but no --genotyping_source was specified.") } if ( params.genotyping_source == 'trimmed' && ! params.run_trim_bam ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'trimmed' unless BAM trimming is turned on with `--run_trim_bam`.") } From 49f36d2c6066bb3c3f884799528e979fcc5b7c1a Mon Sep 17 00:00:00 2001 From: Ian Light <ilight1542@gmail.com> Date: Fri, 24 Jan 2025 11:03:45 +0000 Subject: [PATCH 08/19] samtools fastq map name with all when bamfiltering merging all files (clarity fix) --- conf/modules.config | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index 1be7b8540..7defa5ab5 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -399,7 +399,7 @@ process { ext.args = [ params.metagenomics_input == 'all' ? '' : '-F 4', ].join(' ').trim() - ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}_mapped" } + ext.prefix = { params.metagenomics_input == 'all' ? "${meta.sample_id}_${meta.library_id}_${meta.reference}_all" : "${meta.sample_id}_${meta.library_id}_${meta.reference}_mapped" } publishDir = [ path: { "${params.outdir}/bam_filtering/" }, mode: params.publish_dir_mode, @@ -972,6 +972,7 @@ process { } withName: KRAKEN2_KRAKEN2 { + tag = { "${meta.sample_id}|single_end_mode_${meta.single_end}" } ext.args = [ params.metagenomics_kraken2_saveminimizers ? 
"--report-minimizer-data" : "" ].join(' ').trim() From c38886cbd9c2c41f416b5c180960fc0495e3bf34 Mon Sep 17 00:00:00 2001 From: Ian Light <ilight1542@gmail.com> Date: Fri, 24 Jan 2025 11:04:59 +0000 Subject: [PATCH 09/19] adjusted fastq generation for input into metagenomics - bugfix --- subworkflows/local/bamfiltering.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/subworkflows/local/bamfiltering.nf b/subworkflows/local/bamfiltering.nf index 88b5626eb..13741fef9 100644 --- a/subworkflows/local/bamfiltering.nf +++ b/subworkflows/local/bamfiltering.nf @@ -80,7 +80,7 @@ workflow FILTER_BAM { // Solution to the Andrades Valtueña-Light Problem: mapped bam for metagenomics (with options for quality- and length filtered) - if ( params.bamfiltering_generatemappedfastq || ( params.run_metagenomics && ( params.metagenomics_input == 'mapped' || params.metagenomics_input == 'all' ) ) ) { + if ( params.bamfiltering_generatemappedfastq || ( params.run_metagenomics && ( params.metagenomics_input == 'mapped' || params.metagenomics_input == 'all' ) ) ) { SAMTOOLS_FASTQ_MAPPED ( bam.map{[ it[0], it[1] ]}, false ) ch_versions = ch_versions.mix( SAMTOOLS_FASTQ_MAPPED.out.versions.first() ) } @@ -98,7 +98,7 @@ workflow FILTER_BAM { } if ( ( params.run_metagenomics && ( params.metagenomics_input == 'mapped' || params.metagenomics_input == 'all' ) ) && params.preprocessing_skippairmerging ) { - ch_paired_fastq_for_cat = SAMTOOLS_FASTQ_UNMAPPED.out.fastq.filter { !it[0].single_end } + ch_paired_fastq_for_cat = SAMTOOLS_FASTQ_MAPPED.out.fastq.filter { !it[0].single_end } ch_single_fastq_for_cat = SAMTOOLS_FASTQ_MAPPED.out.fastq .mix(SAMTOOLS_FASTQ_MAPPED.out.singleton) From 0bf439bc1f7e38b48447e9dd0c1e5e056c2d95c7 Mon Sep 17 00:00:00 2001 From: Ian Light <ilight1542@gmail.com> Date: Fri, 24 Jan 2025 11:05:17 +0000 Subject: [PATCH 10/19] reduced unnecssary words --- subworkflows/local/utils_nfcore_eager_pipeline/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/subworkflows/local/utils_nfcore_eager_pipeline/main.nf b/subworkflows/local/utils_nfcore_eager_pipeline/main.nf index 2964373ba..d81dbddbb 100644 --- a/subworkflows/local/utils_nfcore_eager_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_eager_pipeline/main.nf @@ -233,7 +233,7 @@ def validateInputParameters() { !params.metagenomics_maltextract_ncbidir ) ){ exit 1, ("[nf-core/eager] ERROR: Metagenomics: You picked MALT with postprocessing but didnt provided required input files. Please provide the --metagenomics_maltextract_taxonlist and --metagenomics_maltextract_ncbidir flags") } - if ( params.run_metagenomics && params.preprocessing_skippairmerging ) { log.warn("[nf-core/eager] WARNING: --preprocessing_skippairmerging selected in combination for metagenomics! All singletons from paired end samples will be discarded prior to input for metagenomics screening! Additionally, this may be inappropriate for metaphlan, which does not utilize paired-end information!") } + if ( params.run_metagenomics && params.preprocessing_skippairmerging ) { log.warn("[nf-core/eager] WARNING: --preprocessing_skippairmerging selected in combination for metagenomics! All singletons from paired end samples will be discarded prior to input for metagenomics screening! This may be inappropriate for metaphlan, which does not utilize paired-end information!") } if ( params.run_metagenomics && params.preprocessing_skippairmerging && params.metagenomics_profiling_tool == 'malt' ) { exit 1, ("[nf-core/eager] ERROR: --preprocessing_skippairmerging selected in combination with MALT for metagenomics! MALT cannot accept separated read pair information, please remove --preprocessing_skippairmerging parameter.") } if ( params.run_genotyping && ! params.genotyping_tool ) { exit 1, ("[nf-core/eager] ERROR: --run_genotyping was specified, but no --genotyping_tool was specified.") } if ( params.run_genotyping && ! 
params.genotyping_source ) { exit 1, ("[nf-core/eager] ERROR: --run_genotyping was specified, but no --genotyping_source was specified.") } From ed3d93c2de51677306fb7ee69157bc0bf516c476 Mon Sep 17 00:00:00 2001 From: Ian Light <ilight1542@gmail.com> Date: Fri, 31 Jan 2025 10:01:55 +0000 Subject: [PATCH 11/19] removed view and added useful module tag for runtime --- conf/modules.config | 2 +- subworkflows/local/metagenomics_profiling.nf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 2dc654387..a604e2d21 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -993,7 +993,7 @@ process { } withName: KRAKENUNIQ_PRELOADEDKRAKENUNIQ { - tag = { "${meta.single_end}" } + tag = { ${meta.single_end} ? "single_end" : "paried_end" } publishDir = [ path: { "${params.outdir}/metagenomics/profiling/krakenuniq/" }, mode: params.publish_dir_mode, diff --git a/subworkflows/local/metagenomics_profiling.nf b/subworkflows/local/metagenomics_profiling.nf index 8d9291f8b..77bdc8ee1 100644 --- a/subworkflows/local/metagenomics_profiling.nf +++ b/subworkflows/local/metagenomics_profiling.nf @@ -150,7 +150,7 @@ workflow METAGENOMICS_PROFILING { .map { meta, files -> [ meta, files.flatten() - ]}.view() + ]} ch_krakenuniq_input = ch_krakenuniq_input.combine(ch_database) .multiMap{ From a946fca12b065a57109dc3d8396e9dad70036b7e Mon Sep 17 00:00:00 2001 From: Ian Light <ilight1542@gmail.com> Date: Fri, 21 Feb 2025 11:36:08 +0000 Subject: [PATCH 12/19] adjustment needed for krakenuniq --- subworkflows/local/utils_nfcore_eager_pipeline/main.nf | 4 ---- workflows/eager.nf | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/subworkflows/local/utils_nfcore_eager_pipeline/main.nf b/subworkflows/local/utils_nfcore_eager_pipeline/main.nf index 75b3171a0..755795025 100644 --- a/subworkflows/local/utils_nfcore_eager_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_eager_pipeline/main.nf @@ -118,10 +118,6 @@ 
workflow PIPELINE_INITIALISATION { } [ meta, r1, r2, bam ] } - .groupTuple() - .map { samplesheet -> - validateInputSamplesheet(samplesheet) - } // - Only single-ended specified for BAM files ch_samplesheet_for_branch.bam diff --git a/workflows/eager.nf b/workflows/eager.nf index b01f769e0..9dbf9b547 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -27,7 +27,7 @@ include { MAP } from '../subwork include { FILTER_BAM } from '../subworkflows/local/bamfiltering.nf' include { DEDUPLICATE } from '../subworkflows/local/deduplicate' include { MANIPULATE_DAMAGE } from '../subworkflows/local/manipulate_damage' -include { METAGENOMICS_COMPLEXITYFILTER } from '../subworkflows/local/metagenomics_complexityfilter' +include { METAGENOMICS } from '../subworkflows/local/metagenomics' include { ESTIMATE_CONTAMINATION } from '../subworkflows/local/estimate_contamination' include { CALCULATE_DAMAGE } from '../subworkflows/local/calculate_damage' include { RUN_SEXDETERRMINE } from '../subworkflows/local/run_sex_determination' From b2ef998adde1244f0b74eedfdc92baf6f447dec6 Mon Sep 17 00:00:00 2001 From: Ian Light <ilight1542@gmail.com> Date: Fri, 21 Feb 2025 11:36:33 +0000 Subject: [PATCH 13/19] for manual tests --- docs/development/manual_tests.md | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/docs/development/manual_tests.md b/docs/development/manual_tests.md index 71e558e96..2bec03850 100644 --- a/docs/development/manual_tests.md +++ b/docs/development/manual_tests.md @@ -721,7 +721,7 @@ HOP001 ERR8958750 0 4 paired double half /workspace/eager/testing/test_data/ERR8 HOP001 ERR8958751 0 2 paired double half /workspace/eager/testing/test_data/ERR8958751_1.fastq.gz_reduced.fastq.gz /workspace/eager/testing/test_data/ERR8958751_2.fastq.gz_reduced.fastq.gz NA NA HOP001 ERR8958752 0 2 paired double half /workspace/eager/testing/test_data/ERR8958752_1.fastq.gz_reduced.fastq.gz 
/workspace/eager/testing/test_data/ERR8958752_2.fastq.gz_reduced.fastq.gz NA NA HOP001 ERR8958753 0 2 paired double half /workspace/eager/testing/test_data/ERR8958753_1.fastq.gz_reduced.fastq.gz /workspace/eager/testing/test_data/ERR8958753_2.fastq.gz_reduced.fastq.gz NA NA -HOP001 ERR8958754 0 2 paired double none /workspace/eager/testing/test_data/ERR8958754_1.fastq.gz_reduced.fastq.gz /workspace/eager/testing/test_data/ERR8958754_2.fastq.gz_reduced.fastq.gz NA NA" | sed 's/ /\t/g' > test.tsv +HOP001 ERR8958754 0 2 paired double none /workspace/eager/testing/test_data/ERR8958754_1.fastq.gz_reduced.fastq.gz /workspace/eager/testing/test_data/ERR8958754_2.fastq.gz_reduced.fastq.gz NA NA" | sed 's/NA/ /g' | sed 's/ /\t/g' > test.tsv nextflow run ../main.nf -profile docker \ --input test.tsv \ @@ -738,6 +738,16 @@ nextflow run ../main.nf -profile docker \ --metagenomics_malt_group_size 3 ``` +# kraken2 + +nextflow run main.nf -profile docker \ + --input testing/test.tsv \ + --outdir ./out \ + --run_metagenomics \ + --metagenomics_profiling_tool kraken2 \ + --metagenomics_profiling_database /workspace/eager/testing/eager_test.tar.gz \ + --preprocessing_skippairmerging + ## Mapping statistics ### ENDOSPY From 5c77fa89805a600fe7240822cdbfbf94015ed946 Mon Sep 17 00:00:00 2001 From: Ian Light <ilight1542@gmail.com> Date: Fri, 21 Feb 2025 11:37:13 +0000 Subject: [PATCH 14/19] debugging view commands --- subworkflows/local/metagenomics_profiling.nf | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/subworkflows/local/metagenomics_profiling.nf b/subworkflows/local/metagenomics_profiling.nf index 77bdc8ee1..918ff6273 100644 --- a/subworkflows/local/metagenomics_profiling.nf +++ b/subworkflows/local/metagenomics_profiling.nf @@ -50,6 +50,8 @@ workflow METAGENOMICS_PROFILING { // for each tool and make liberal use of multiMap to keep reads/database // channel element order in sync with each other + ch_reads.view() + if ( params.metagenomics_profiling_tool == 'malt' ) { // 
Optional parallel run of malt available: @@ -152,6 +154,8 @@ workflow METAGENOMICS_PROFILING { meta, files.flatten() ]} + ch_krakenuniq_input.view() + ch_krakenuniq_input = ch_krakenuniq_input.combine(ch_database) .multiMap{ meta, files, database -> From 9ac4c42aaaf60af990de074f3eafefb13a53c757 Mon Sep 17 00:00:00 2001 From: Ian Light <ilight1542@gmail.com> Date: Fri, 28 Feb 2025 09:46:37 +0000 Subject: [PATCH 15/19] adjusted tag for correct parsing SEvsPE krakenuniq --- conf/modules.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index a604e2d21..7732f77f0 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -993,7 +993,7 @@ process { } withName: KRAKENUNIQ_PRELOADEDKRAKENUNIQ { - tag = { ${meta.single_end} ? "single_end" : "paried_end" } + tag = { "single_end_mode_${meta.single_end}" } publishDir = [ path: { "${params.outdir}/metagenomics/profiling/krakenuniq/" }, mode: params.publish_dir_mode, From c1bab1df83df4b9ef28bb115bb83ea7d6337340c Mon Sep 17 00:00:00 2001 From: Ian Light <ilight1542@gmail.com> Date: Fri, 28 Feb 2025 09:46:49 +0000 Subject: [PATCH 16/19] removed print statemtns --- subworkflows/local/metagenomics_profiling.nf | 4 ---- 1 file changed, 4 deletions(-) diff --git a/subworkflows/local/metagenomics_profiling.nf b/subworkflows/local/metagenomics_profiling.nf index 918ff6273..77bdc8ee1 100644 --- a/subworkflows/local/metagenomics_profiling.nf +++ b/subworkflows/local/metagenomics_profiling.nf @@ -50,8 +50,6 @@ workflow METAGENOMICS_PROFILING { // for each tool and make liberal use of multiMap to keep reads/database // channel element order in sync with each other - ch_reads.view() - if ( params.metagenomics_profiling_tool == 'malt' ) { // Optional parallel run of malt available: @@ -154,8 +152,6 @@ workflow METAGENOMICS_PROFILING { meta, files.flatten() ]} - ch_krakenuniq_input.view() - ch_krakenuniq_input = ch_krakenuniq_input.combine(ch_database) .multiMap{ meta, files, 
database -> From 7c30833561ed6c9c26b92c62cdc933f8f563f7e2 Mon Sep 17 00:00:00 2001 From: Ian Light <ilight1542@gmail.com> Date: Fri, 7 Mar 2025 11:08:26 +0000 Subject: [PATCH 17/19] adjusted multiple module imports into single module import for bamfiltering --> fastq for metagenomics. improves clarity (IMO) --- conf/modules.config | 27 +++-------- subworkflows/local/bamfiltering.nf | 75 ++++++++++-------------------- 2 files changed, 31 insertions(+), 71 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index ce45a4c3f..4e1d3cee1 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -447,29 +447,14 @@ process { ] } - withName: SAMTOOLS_FASTQ_MAPPED { + withName: SAMTOOLS_FASTQ_METAGENOMICS { tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } ext.args = [ - params.metagenomics_input == 'all' ? '' : '-F 4', + params.metagenomics_input == 'mapped' ? '-F 4': '', + params.metagenomics_input == 'unmapped' ? '-f 4': '', + // 'all' is left then with NO -F or -f flag, therefore all reads get sent to fastq ].join(' ').trim() - ext.prefix = { params.metagenomics_input == 'all' ? 
"${meta.sample_id}_${meta.library_id}_${meta.reference}_all" : "${meta.sample_id}_${meta.library_id}_${meta.reference}_mapped" } - publishDir = [ - [ - // data - path: { "${params.outdir}/read_filtering/fastq/data/" }, - mode: params.publish_dir_mode, - pattern: '*.fastq.gz', - enabled: params.bamfiltering_generatemappedfastq - ] - ] - } - - withName: SAMTOOLS_FASTQ_UNMAPPED { - tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } - ext.args = [ - '-f 4', - ].join(' ').trim() - ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}_unmapped" } + ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}_metagenomics_fastq_${params.metagenomics_input}" } publishDir = [ [ // data @@ -481,7 +466,7 @@ process { ] } - withName: 'CAT_FASTQ_UNMAPPED|CAT_FASTQ_MAPPED' { + withName: 'CAT_FASTQ_METAGENOMICS' { tag = { "${meta.sample_id}_${meta.library_id}_${meta.reference}" } ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}" } publishDir = [ diff --git a/subworkflows/local/bamfiltering.nf b/subworkflows/local/bamfiltering.nf index 13741fef9..01cf7cde1 100644 --- a/subworkflows/local/bamfiltering.nf +++ b/subworkflows/local/bamfiltering.nf @@ -9,6 +9,8 @@ include { SAMTOOLS_INDEX as SAMTOOLS_FILTER_INDEX } from '../../modules/ include { SAMTOOLS_FLAGSTAT as SAMTOOLS_FLAGSTAT_FILTERED } from '../../modules/nf-core/samtools/flagstat/main' include { SAMTOOLS_FASTQ as SAMTOOLS_FASTQ_UNMAPPED } from '../../modules/nf-core/samtools/fastq/main' include { SAMTOOLS_FASTQ as SAMTOOLS_FASTQ_MAPPED } from '../../modules/nf-core/samtools/fastq/main' +include { SAMTOOLS_FASTQ as SAMTOOLS_FASTQ_METAGENOMICS } from '../../modules/nf-core/samtools/fastq/main' +include { CAT_FASTQ as CAT_FASTQ_METAGENOMICS } from '../../modules/nf-core/cat/fastq' include { CAT_FASTQ as CAT_FASTQ_UNMAPPED } from '../../modules/nf-core/cat/fastq' include { CAT_FASTQ as CAT_FASTQ_MAPPED } from '../../modules/nf-core/cat/fastq' @@ -69,56 +71,29 @@ 
workflow FILTER_BAM { // // Metagenomics FASTQ generation for metagenomics (or just generation) // - FASTQ generation is now separate from BAM filtering - - // no length/quality filtering applies to metagenomic bam - // - - // Generate unmapped bam (no additional filtering) if the unmapped bam OR unmapped for metagneomics selected - if ( params.bamfiltering_generateunmappedfastq || ( params.run_metagenomics && params.metagenomics_input == 'unmapped' ) ) { - SAMTOOLS_FASTQ_UNMAPPED ( bam.map{[ it[0], it[1] ]}, false ) - ch_versions = ch_versions.mix( SAMTOOLS_FASTQ_UNMAPPED.out.versions.first() ) - } - - // Solution to the Andrades Valtueña-Light Problem: mapped bam for metagenomics (with options for quality- and length filtered) - - if ( params.bamfiltering_generatemappedfastq || ( params.run_metagenomics && ( params.metagenomics_input == 'mapped' || params.metagenomics_input == 'all' ) ) ) { - SAMTOOLS_FASTQ_MAPPED ( bam.map{[ it[0], it[1] ]}, false ) - ch_versions = ch_versions.mix( SAMTOOLS_FASTQ_MAPPED.out.versions.first() ) - } - - if ( ( params.run_metagenomics && params.metagenomics_input == 'unmapped' ) && params.preprocessing_skippairmerging ) { - ch_paired_fastq_for_cat = SAMTOOLS_FASTQ_UNMAPPED.out.fastq.filter { !it[0].single_end } - - ch_single_fastq_for_cat = SAMTOOLS_FASTQ_UNMAPPED.out.fastq - .mix(SAMTOOLS_FASTQ_UNMAPPED.out.singleton) - .mix(SAMTOOLS_FASTQ_UNMAPPED.out.other) - .groupTuple() - .filter{ it[0].single_end } - - CAT_FASTQ_UNMAPPED ( ch_single_fastq_for_cat ) - } - - if ( ( params.run_metagenomics && ( params.metagenomics_input == 'mapped' || params.metagenomics_input == 'all' ) ) && params.preprocessing_skippairmerging ) { - ch_paired_fastq_for_cat = SAMTOOLS_FASTQ_MAPPED.out.fastq.filter { !it[0].single_end } - - ch_single_fastq_for_cat = SAMTOOLS_FASTQ_MAPPED.out.fastq - .mix(SAMTOOLS_FASTQ_MAPPED.out.singleton) - .mix(SAMTOOLS_FASTQ_MAPPED.out.other) - .groupTuple() - .filter{ it[0].single_end } - - CAT_FASTQ_MAPPED ( 
ch_single_fastq_for_cat ) - } - - // Routing for metagenomic screening -> first accounting for paired-end mapping, then merged mapping, then no metagenomics - if ( ( params.run_metagenomics && params.metagenomics_input == 'unmapped' ) && params.preprocessing_skippairmerging ) { - ch_fastq_for_metagenomics = CAT_FASTQ_UNMAPPED.out.reads.mix(ch_paired_fastq_for_cat) - } else if ( ( params.run_metagenomics && ( params.metagenomics_input == 'mapped' || params.metagenomics_input == 'all' ) ) && params.preprocessing_skippairmerging ) { - ch_fastq_for_metagenomics = CAT_FASTQ_MAPPED.out.reads.mix(ch_paired_fastq_for_cat) - } else if ( params.run_metagenomics && params.metagenomics_input == 'unmapped' ) { - ch_fastq_for_metagenomics = SAMTOOLS_FASTQ_UNMAPPED.out.other - } else if ( params.run_metagenomics && ( params.metagenomics_input == 'mapped' || params.metagenomics_input == 'all' )) { - ch_fastq_for_metagenomics = SAMTOOLS_FASTQ_MAPPED.out.other - } else if ( !params.run_metagenomics ) { + // No length/quality filtering applies to metagenomic bam files (could be extension) + // All bam -> fastq filtering options (-F 4, -f 4 or none) will be dealt with within modules.config + + SAMTOOLS_FASTQ_METAGENOMICS ( bam.map{[ it[0], it[1] ]}, false ) + ch_versions = ch_versions.mix( SAMTOOLS_FASTQ_METAGENOMICS.out.versions.first() ) + + ch_paired_fastq_for_cat_metagenomics = SAMTOOLS_FASTQ_METAGENOMICS.out.fastq.filter { !it[0].single_end } + ch_single_fastq_for_cat_metagenomics = SAMTOOLS_FASTQ_METAGENOMICS.out.fastq + .mix(SAMTOOLS_FASTQ_METAGENOMICS.out.singleton) + .mix(SAMTOOLS_FASTQ_METAGENOMICS.out.other) + .groupTuple() + .filter{ it[0].single_end } + CAT_FASTQ_METAGENOMICS ( ch_single_fastq_for_cat_metagenomics ) + + if ( params.run_metagenomics ) { + if ( params.preprocessing_skippairmerging ) { + // separate libraries that are SE (all merged reads) from PE (separate forward & reverse reads with NO singletons) data + ch_fastq_for_metagenomics = 
CAT_FASTQ_METAGENOMICS.out.reads.mix( ch_paired_fastq_for_cat_metagenomics ) + } + else { + ch_fastq_for_metagenomics = SAMTOOLS_FASTQ_METAGENOMICS.out.other + } + } else { ch_fastq_for_metagenomics = Channel.empty() } From dc329288c20d9d0caab9e2b8f679b5419e8da8f7 Mon Sep 17 00:00:00 2001 From: Ian Light <ilight1542@gmail.com> Date: Fri, 14 Mar 2025 10:50:11 +0000 Subject: [PATCH 18/19] separated bamfiltering -> fastq from metagenomics fastq generation. should allow for #945 extension --- conf/modules.config | 24 +++++--- nextflow.config | 6 +- nextflow_schema.json | 22 ++++---- subworkflows/local/bamfiltering.nf | 56 ++++++++++--------- .../local/utils_nfcore_eager_pipeline/main.nf | 19 ++++--- 5 files changed, 72 insertions(+), 55 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 4e1d3cee1..38b803ce1 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -406,7 +406,7 @@ process { tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } ext.args = [ "-q ${params.bamfiltering_mappingquality}", - params.bamfiltering_retainunmappedgenomicbam ? '' : "-F ${params.bamfilter_genomicbamfilterflag}", + params.bamfiltering_retainunmappedgenomicbam ? 
'' : "-F ${params.bamfiltering_genomicbamfilterflag}", ].join(' ').trim() ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}_filtered" } publishDir = [ @@ -447,6 +447,20 @@ process { ] } + withName: SAMTOOLS_FASTQ_SAVEBAMFILTERINGREADS { + tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } + ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}_bamfiltering_fastq" } + publishDir = [ + [ + // data + path: { "${params.outdir}/read_filtering/fastq/data/" }, + mode: params.publish_dir_mode, + pattern: '*.fastq.gz', + enabled: params.bamfiltering_generatefastq + ] + ] + } + withName: SAMTOOLS_FASTQ_METAGENOMICS { tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } ext.args = [ @@ -461,7 +475,7 @@ process { path: { "${params.outdir}/read_filtering/fastq/data/" }, mode: params.publish_dir_mode, pattern: '*.fastq.gz', - enabled: params.bamfiltering_generateunmappedfastq + enabled: params.metagenomics_input_savefastq ] ] } @@ -471,11 +485,7 @@ process { ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}" } publishDir = [ [ - // data - path: { "${params.outdir}/read_filtering/fastq/data/" }, - mode: params.publish_dir_mode, - pattern: '*.fastq.gz', - enabled: params.preprocessing_savepreprocessedreads + enabled: false // NO publishing of concatenated fastq files for metagenomics, only outputs from SAMTOOLS_FASTQ_METAGENOMICS ] ] } diff --git a/nextflow.config b/nextflow.config index a5d4f6949..7f841067f 100644 --- a/nextflow.config +++ b/nextflow.config @@ -124,15 +124,15 @@ params { run_bamfiltering = false bamfiltering_minreadlength = 0 bamfiltering_mappingquality = 0 - bamfilter_genomicbamfilterflag = 4 + bamfiltering_genomicbamfilterflag = 4 bamfiltering_retainunmappedgenomicbam = false // downstream genomics only - bamfiltering_generateunmappedfastq = false - bamfiltering_generatemappedfastq = false bamfiltering_savefilteredbams = false // can include unmapped reads if 
--bamfiltering_retainunmappedgenomicbam specified + bamfiltering_generatefastq = false // can include unmapped reads if --bamfiltering_retainunmappedgenomicbam specified // Metagenomic Screening run_metagenomics = false metagenomics_input = 'unmapped' // mapped, all, unmapped -> mapped vs all specified in SAMTOOLS_FASTQ_MAPPED in modules.conf, unmapped hardcoded SAMTOOLS_FASTQ_UNMAPPED + metagenomics_input_savefastq = false run_metagenomics_complexityfiltering = false metagenomics_complexity_tool = 'bbduk' metagenomics_complexity_savefastq = false diff --git a/nextflow_schema.json b/nextflow_schema.json index 0535d72ca..fecd8c6ad 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -603,7 +603,7 @@ "help_text": "Specify a mapping quality threshold for mapped reads to be kept for downstream analysis.\n\nBy default all reads are retained and this option is therefore set to 0 to ensure no quality filtering is performed.\n\nNote that by default the output BAM files of this step are _not_ stored in the results directory (as it is assumed that deduplicated BAM files are preferred). See `--bamfiltering_savefilteredbams` if you wish to save these.\n\n> Modifies samtools view parameter: `-q`", "fa_icon": "fas fa-filter" }, - "bamfilter_genomicbamfilterflag": { + "bamfiltering_genomicbamfilterflag": { "type": "integer", "default": 4, "fa_icon": "fas fa-flag", @@ -616,16 +616,10 @@ "help_text": "Specify to retain unmapped reads (optionally also length filtered) in the genomic BAM for downstream analysis. 
By default, the pipeline only keeps mapped reads for downstream analysis.\n\nThis is also turned on if `--metagenomics_input` is set to `all`.\n\n> ⚠️ This will likely slow down run time of downstream pipeline steps!\n\n> Modifies tool parameter(s):\n> - samtools view: `-f 4` / `-F 4`", "fa_icon": "fas fa-piggy-bank" }, - "bamfiltering_generateunmappedfastq": { + "bamfiltering_generatefastq": { "type": "boolean", - "description": "Specify to generate FASTQ files containing only unmapped reads from the aligner generated BAM files.", - "help_text": "Specify to turn on the generation and saving of FASTQs of only the unmapped reads from the mapping step in the results directory.\n\nThis can be useful if you wish to do other analysis of the unmapped reads independently of the pipeline.\n\nNote: the reads in these FASTQ files have _not_ undergone length of quality filtering\n\n> Modifies samtools fastq parameter: `-f 4`", - "fa_icon": "fas fa-file-alt" - }, - "bamfiltering_generatemappedfastq": { - "type": "boolean", - "description": "Specify to generate FASTQ files containing only mapped reads from the aligner generated BAM files.", - "help_text": "Specify to turn on the generation and saving of FASTQs of only the mapped reads from the mapping step in the results directory.\n\nThis can be useful if you wish to do other analysis of the mapped reads independently of the pipeline, such as remapping with different parameters (whereby only including mapped reads will speed up computation time during the re-mapping due to reduced input data).\n\nNote the reads in these FASTQ files have _not_ undergone length of quality filtering\n\n> Modifies samtools fastq parameter: `-F 4`", + "description": "Specify to generate FASTQ files from the filtered BAM files.", + "help_text": "Specify to turn on the generation and saving of FASTQs from the mapping step (after optional filtering) in the results directory.\n\nThis can be useful if you wish to do other analysis of the reads 
independently of the pipeline, such as remapping with different parameters (whereby only including mapped reads will speed up computation time during the re-mapping due to reduced input data).\n\nNote the reads in these FASTQ files __may have__ undergone length or quality filtering based on the following parameters `bamfiltering_retainunmappedgenomicbam`, `bamfiltering_genomicbamfilterflag`, `bamfiltering_mappingquality`, `bamfiltering_minreadlength`.\n\nOnly allowed with option `run_bamfiltering` turned on!", "fa_icon": "far fa-file-alt" }, "bamfiltering_savefilteredbams": { @@ -655,7 +649,13 @@ "description": "Specify which type of reads to use for metagenomic screening.", "enum": ["unmapped", "mapped", "all"], "fa_icon": "fas fa-hand-pointer", - "help_text": "Specify to select which mapped reads will be sent for metagenomic analysis.\n\nThis influences which reads are sent to this step, whether you want unmapped reads (used in most cases, as 'host reads' can often be contaminants in microbial genomes), mapped reads (e.g, when doing competitive against a genomic reference of multiple genomes and which to apply LCA correction) or all reads.\n\n> ⚠️ If you skip paired-end merging, all reads will be screened as independent reads - not as pairs! - as all FASTQ files from BAM filtering are merged into one. 
This merged file is _not_ saved in results directory.\n\n> Modifies samtools fastq parameters: `-f 4` / `-F 4`" + "help_text": "Specify to select which mapped reads will be sent for metagenomic analysis OR saved in combination with `bamfiltering_generatedfastq`.\n\nThis influences which reads are sent to this step, whether you want unmapped reads (used in most cases, as 'host reads' can often be contaminants in microbial genomes), mapped reads (e.g, when doing competitive against a genomic reference of multiple genomes and which to apply LCA correction) or all reads.\n\n> ⚠️ If you skip paired-end merging, all reads will be screened as independent reads - not as pairs! - as all FASTQ files from BAM filtering are merged into one. This merged file is _not_ saved in results directory.\n\n> Modifies samtools fastq parameters: `-f 4` / `-F 4`" + }, + "metagenomics_input_savefastq": { + "type": "boolean", + "description": "Specify to turn on saving of input for metagenomics.", + "fa_icon": "fas fa-power-off", + "help_text": "Specify to turn on the saving of input files used for metagenomics screening subworkflow of the pipeline.\n\nThe reads saved will be PRIOR to any complexity filtering steps part of the metagenomics pipeline. Please see `metagenomics_complexity_savefastq` if you are interesting in saving only post-complexity filtered reads." }, "run_metagenomics_complexityfiltering": { "type": "boolean", diff --git a/subworkflows/local/bamfiltering.nf b/subworkflows/local/bamfiltering.nf index 01cf7cde1..becad2825 100644 --- a/subworkflows/local/bamfiltering.nf +++ b/subworkflows/local/bamfiltering.nf @@ -2,17 +2,14 @@ // Filter BAMs for mapping quality, length, unmapped etc. 
// -include { FILTER_BAM_FRAGMENT_LENGTH } from '../../modules/local/filter_bam_fragment_length' -include { SAMTOOLS_VIEW as SAMTOOLS_VIEW_BAM_FILTERING } from '../../modules/nf-core/samtools/view/main' -include { SAMTOOLS_INDEX as SAMTOOLS_LENGTH_FILTER_INDEX } from '../../modules/nf-core/samtools/index/main' -include { SAMTOOLS_INDEX as SAMTOOLS_FILTER_INDEX } from '../../modules/nf-core/samtools/index/main' -include { SAMTOOLS_FLAGSTAT as SAMTOOLS_FLAGSTAT_FILTERED } from '../../modules/nf-core/samtools/flagstat/main' -include { SAMTOOLS_FASTQ as SAMTOOLS_FASTQ_UNMAPPED } from '../../modules/nf-core/samtools/fastq/main' -include { SAMTOOLS_FASTQ as SAMTOOLS_FASTQ_MAPPED } from '../../modules/nf-core/samtools/fastq/main' -include { SAMTOOLS_FASTQ as SAMTOOLS_FASTQ_METAGENOMICS } from '../../modules/nf-core/samtools/fastq/main' -include { CAT_FASTQ as CAT_FASTQ_METAGENOMICS } from '../../modules/nf-core/cat/fastq' -include { CAT_FASTQ as CAT_FASTQ_UNMAPPED } from '../../modules/nf-core/cat/fastq' -include { CAT_FASTQ as CAT_FASTQ_MAPPED } from '../../modules/nf-core/cat/fastq' +include { FILTER_BAM_FRAGMENT_LENGTH } from '../../modules/local/filter_bam_fragment_length' +include { SAMTOOLS_VIEW as SAMTOOLS_VIEW_BAM_FILTERING } from '../../modules/nf-core/samtools/view/main' +include { SAMTOOLS_INDEX as SAMTOOLS_LENGTH_FILTER_INDEX } from '../../modules/nf-core/samtools/index/main' +include { SAMTOOLS_INDEX as SAMTOOLS_FILTER_INDEX } from '../../modules/nf-core/samtools/index/main' +include { SAMTOOLS_FLAGSTAT as SAMTOOLS_FLAGSTAT_FILTERED } from '../../modules/nf-core/samtools/flagstat/main' +include { SAMTOOLS_FASTQ as SAMTOOLS_FASTQ_METAGENOMICS } from '../../modules/nf-core/samtools/fastq/main' +include { CAT_FASTQ as CAT_FASTQ_METAGENOMICS } from '../../modules/nf-core/cat/fastq' +include { SAMTOOLS_FASTQ as SAMTOOLS_FASTQ_SAVEBAMFILTERINGREADS } from '../../modules/nf-core/samtools/fastq/main' workflow FILTER_BAM { @@ -68,27 +65,36 @@ workflow FILTER_BAM { 
ch_flagstats_file = ch_flagstats_file.mix( SAMTOOLS_FLAGSTAT_FILTERED.out.flagstat ) } - // - // Metagenomics FASTQ generation for metagenomics (or just generation) + // FASTQ generation for saving reads (INDEPENDENT of metagenomics) + // Output of reads is determined by the parameters set for bamfiltering above (see SAMTOOLS_VIEW_BAM_FILTERING module config) + // Extension based on #945 possible here during hackathon + // Only possible with run_bamfiltering parameter set + if ( params.bamfiltering_generatefastq ) { + SAMTOOLS_FASTQ_SAVEBAMFILTERINGREADS ( SAMTOOLS_VIEW_BAM_FILTERING.out.bam, false ) + ch_versions = ch_versions.mix ( SAMTOOLS_FASTQ_SAVEBAMFILTERINGREADS.out.versions.first() ) + } + + // FASTQ generation for metagenomics (OR just generation for saving mapped/unmapped reads) // - FASTQ generation is now separate from BAM filtering - // No length/quality filtering applies to metagenomic bam files (could be extension) // All bam -> fastq filtering options (-F 4, -f 4 or none) will be dealt with within modules.config - SAMTOOLS_FASTQ_METAGENOMICS ( bam.map{[ it[0], it[1] ]}, false ) - ch_versions = ch_versions.mix( SAMTOOLS_FASTQ_METAGENOMICS.out.versions.first() ) - - ch_paired_fastq_for_cat_metagenomics = SAMTOOLS_FASTQ_METAGENOMICS.out.fastq.filter { !it[0].single_end } - ch_single_fastq_for_cat_metagenomics = SAMTOOLS_FASTQ_METAGENOMICS.out.fastq - .mix(SAMTOOLS_FASTQ_METAGENOMICS.out.singleton) - .mix(SAMTOOLS_FASTQ_METAGENOMICS.out.other) - .groupTuple() - .filter{ it[0].single_end } - CAT_FASTQ_METAGENOMICS ( ch_single_fastq_for_cat_metagenomics ) - if ( params.run_metagenomics ) { + // Execute fastq generation on original bam mapping (independent of above bamfiltering) + SAMTOOLS_FASTQ_METAGENOMICS ( bam.map{[ it[0], it[1] ]}, false ) + ch_versions = ch_versions.mix( SAMTOOLS_FASTQ_METAGENOMICS.out.versions.first() ) + if ( params.preprocessing_skippairmerging ) { - // separate libraries that are SE (all merged reads) from PE 
(separate forward & reverse reads with NO singletons) data + // Splitting of paired vs single end data + ch_paired_fastq_for_cat_metagenomics = SAMTOOLS_FASTQ_METAGENOMICS.out.fastq.filter { !it[0].single_end } + ch_single_fastq_for_cat_metagenomics = SAMTOOLS_FASTQ_METAGENOMICS.out.fastq + .mix(SAMTOOLS_FASTQ_METAGENOMICS.out.singleton) + .mix(SAMTOOLS_FASTQ_METAGENOMICS.out.other) + .groupTuple() + .filter{ it[0].single_end } + CAT_FASTQ_METAGENOMICS ( ch_single_fastq_for_cat_metagenomics ) ch_fastq_for_metagenomics = CAT_FASTQ_METAGENOMICS.out.reads.mix( ch_paired_fastq_for_cat_metagenomics ) + ch_versions = ch_versions.mix( CAT_FASTQ_METAGENOMICS.out.versions.first() ) } else { ch_fastq_for_metagenomics = SAMTOOLS_FASTQ_METAGENOMICS.out.other diff --git a/subworkflows/local/utils_nfcore_eager_pipeline/main.nf b/subworkflows/local/utils_nfcore_eager_pipeline/main.nf index 755795025..a92c8f89c 100644 --- a/subworkflows/local/utils_nfcore_eager_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_eager_pipeline/main.nf @@ -208,15 +208,16 @@ workflow PIPELINE_COMPLETION { // def validateInputParameters() { genomeExistsError() - if ( !params.fasta && !params.fasta_sheet ) { exit 1, "[nf-core/eager] ERROR: Neither FASTA file --fasta nor reference sheet --fasta_sheet have been provided."} - if ( params.fasta && params.fasta_sheet ) { exit 1, "[nf-core/eager] ERROR: A FASTA file --fasta and a reference sheet --fasta_sheet have been provided. These parameters are mutually exclusive."} - if ( params.preprocessing_adapterlist && params.preprocessing_skipadaptertrim ) { log.warn("[nf-core/eager] WARNING: --preprocessing_skipadaptertrim will override --preprocessing_adapterlist. Adapter trimming will be skipped!") } - if ( params.deduplication_tool == 'dedup' && ! params.preprocessing_excludeunmerged ) { exit 1, "[nf-core/eager] ERROR: Dedup can only be used on collapsed (i.e. merged) PE reads. 
For all other cases, please set --deduplication_tool to 'markduplicates'."} - if ( params.bamfiltering_retainunmappedgenomicbam && params.bamfiltering_mappingquality > 0 ) { exit 1, ("[nf-core/eager] ERROR: You cannot both retain unmapped reads and perform quality filtering, as unmapped reads have a mapping quality of 0. Pick one or the other functionality.") } - if ( params.genotyping_source == 'trimmed' && ! params.run_trim_bam ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'trimmed' unless BAM trimming is turned on with `--run_trim_bam`.") } - if ( params.genotyping_source == 'pmd' && ! params.run_pmd_filtering ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'pmd' unless PMD-filtering is ran.") } - if ( params.genotyping_source == 'rescaled' && ! params.run_mapdamage_rescaling ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'rescaled' unless aDNA damage rescaling is ran.") } - if ( params.metagenomics_complexity_tool == 'prinseq' && params.metagenomics_prinseq_mode == 'dust' && params.metagenomics_complexity_entropy != 0.3 ) { + if ( !params.fasta && !params.fasta_sheet ) { exit 1, "[nf-core/eager] ERROR: Neither FASTA file --fasta nor reference sheet --fasta_sheet have been provided."} + if ( params.fasta && params.fasta_sheet ) { exit 1, "[nf-core/eager] ERROR: A FASTA file --fasta and a reference sheet --fasta_sheet have been provided. These parameters are mutually exclusive."} + if ( params.preprocessing_adapterlist && params.preprocessing_skipadaptertrim ) { log.warn("[nf-core/eager] WARNING: --preprocessing_skipadaptertrim will override --preprocessing_adapterlist. Adapter trimming will be skipped!") } + if ( params.deduplication_tool == 'dedup' && ! params.preprocessing_excludeunmerged ) { exit 1, "[nf-core/eager] ERROR: Dedup can only be used on collapsed (i.e. merged) PE reads. 
For all other cases, please set --deduplication_tool to 'markduplicates'."} + if ( params.bamfiltering_retainunmappedgenomicbam && params.bamfiltering_mappingquality > 0 ) { exit 1, ("[nf-core/eager] ERROR: You cannot both retain unmapped reads and perform quality filtering, as unmapped reads have a mapping quality of 0. Pick one or the other functionality.") } + if ( params.bamfiltering_generatefastq && ! params.run_bamfiltering ) { exit 1, ("[nf-core/eager] ERROR: --bamfiltering_generatefastq will NOT generate a fastq file unless BAM filtering is turned on with `--run_bamfiltering`") } + if ( params.genotyping_source == 'trimmed' && ! params.run_trim_bam ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'trimmed' unless BAM trimming is turned on with `--run_trim_bam`.") } + if ( params.genotyping_source == 'pmd' && ! params.run_pmd_filtering ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'pmd' unless PMD-filtering is ran.") } + if ( params.genotyping_source == 'rescaled' && ! params.run_mapdamage_rescaling ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'rescaled' unless aDNA damage rescaling is ran.") } + if ( params.metagenomics_complexity_tool == 'prinseq' && params.metagenomics_prinseq_mode == 'dust' && params.metagenomics_complexity_entropy != 0.3 ) { if (params.metagenomics_prinseq_dustscore == 0.5) { exit 1, ("[nf-core/eager] ERROR: Metagenomics: You picked PRINSEQ++ with 'dust' mode but provided an entropy score. 
Please specify a dust filter threshold using the --metagenomics_prinseq_dustscore flag") } } if ( params.metagenomics_complexity_tool == 'prinseq' && params.metagenomics_prinseq_mode == 'entropy' && params.metagenomics_prinseq_dustscore != 0.5 ) { From 9c46dff7875d5259e3b638ea58c7b5a64add3968 Mon Sep 17 00:00:00 2001 From: Ian Light <ilight1542@gmail.com> Date: Fri, 14 Mar 2025 10:51:54 +0000 Subject: [PATCH 19/19] removal of notes from refactoring tracking issues later resolved --- docs/development/metagenomics_paired_end.md | 18 ------------------ 1 file changed, 18 deletions(-) delete mode 100644 docs/development/metagenomics_paired_end.md diff --git a/docs/development/metagenomics_paired_end.md b/docs/development/metagenomics_paired_end.md deleted file mode 100644 index 650a87cc5..000000000 --- a/docs/development/metagenomics_paired_end.md +++ /dev/null @@ -1,18 +0,0 @@ -## investigation notes for updating code to allow for PE inputs into metagenomics profiling (eg for kraken, malt) - -see -https://github.com/nf-core/eager/issues/945 - -current issue is that the reads that go into mapping are not by default extracted as singletons and non-singletons, so we lose that information -Then downstream the inputs into the krakenuniq module (even if split correctly with meta vars) don't have the correct headers to parse the PE nature of the reads (since they have all been concatenated anyways, and just were ORIGINALLY PE) - -So: needs to be fixed up higher (eg in bamfiltering.nf, likely with a new adjustment to the SAMTOOLS_FASTQ_UNMAPPED, SAMTOOLS_FASTQ_MAPPED, and SAMTOOLS_VIEW_BAM_FILTERING modules ) - -ISSUE FOUND: while the outputting of PE reads is OK in bamfiltering.nf (fastq_mapped & fastq_unmapped) when overlap merging is not done cat_fastq weirdly merges singletons to one PE file and other to the other PE file, so then everything gets fucked up -""" -cat input1/JK2782_JK2782_TGGCCGATCAACGA_Mammoth_MT_Krause_unmapped_other.fastq.gz 
input3/JK2782_JK2782_TGGCCGATCAACGA_Mammoth_MT_Krause_unmapped_1.fastq.gz > JK2782_JK2782_TGGCCGATCAACGA_Mammoth_MT_Krause_1.merged.fastq.gz -cat input2/JK2782_JK2782_TGGCCGATCAACGA_Mammoth_MT_Krause_unmapped_singleton.fastq.gz input4/JK2782_JK2782_TGGCCGATCAACGA_Mammoth_MT_Krause_unmapped_2.fastq.gz > JK2782_JK2782_TGGCCGATCAACGA_Mammoth_MT_Krause_2.merged.fastq.gz -""" - -Decision is needed on what behavior is wanted for unmapped singletons, other. and then likely remove the call to cat_fastq for PE reads -Possibly just split to also have the singletons parsed separately?