Merge pull request #160 from Daniel-VM/dev
 Fix memory issues, handle KmerFinder results, and manage empty fasta files
Daniel-VM committed Sep 3, 2024
2 parents 9e85db0 + fcf0ab9 commit c66630f
Showing 4 changed files with 9 additions and 5 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
```diff
@@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### `Fixed`
 
+- [#160](https://github.com/nf-core/bacass/pull/160) Fixed memory issues in KmerFinder, fixed handling of no species detected, and fixed handling of empty fasta files in the prokka/bakkta channel.
 - [#157](https://github.com/nf-core/bacass/pull/157) Fixed corrupted zenodo URL of Kmerfinder database.
 - [#154](https://github.com/nf-core/bacass/pull/154) Fixed kmerfinder script and increase resources to prevent memory issues.
 - [#153](https://github.com/nf-core/bacass/pull/153) Update `.nf-core.yml` to fix files_unchanged section for accurate linting checks.
```
1 change: 0 additions & 1 deletion conf/modules.config
```diff
@@ -293,7 +293,6 @@ if (!params.skip_kmerfinder) {
     withName: '.*:.*:KMERFINDER_SUBWORKFLOW:KMERFINDER' {
         errorStrategy = { task.exitStatus in [1, 137, 139] ? 'retry' : 'finish'}
         maxRetries = 3
-        memory = { task.memory * task.attempt }
 
         ext.args = ''
         publishDir = [
```
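The removed directive multiplied `task.memory` by the attempt number inside its own definition. For comparison, a common way to scale resources across retries in a Nextflow config is to bind the directive to a fixed base value so each attempt requests a predictable amount. The sketch below is an illustration only; the base value and simplified process selector are assumptions, not taken from this repository:

```nextflow
// Hypothetical sketch of retry-based memory scaling in a Nextflow config.
// The 12.GB base and the process name are assumptions for illustration.
process {
    withName: 'KMERFINDER' {
        errorStrategy = { task.exitStatus in [1, 137, 139] ? 'retry' : 'finish' }
        maxRetries    = 3
        memory        = { 12.GB * task.attempt }  // 12 GB, then 24 GB, then 36 GB
    }
}
```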
8 changes: 6 additions & 2 deletions subworkflows/local/kmerfinder_subworkflow.nf
```diff
@@ -53,15 +53,19 @@ workflow KMERFINDER_SUBWORKFLOW {
             .join(consensus, by:0)
             .map{
                 meta, report_json, report_txt, fasta ->
-                    specie = report_json.splitJson(path:"kmerfinder.results.species_hits").value.get(0)["Species"]
+                    species_hits = report_json.splitJson(path:"kmerfinder.results.species_hits").value
+                    def specie = species_hits.size() > 0 ? species_hits.get(0)["Species"] : "Unknown Species"
+
                     return tuple(specie, meta, report_txt, fasta)
             }
             .groupTuple(by:0) // Group by the "Species" field
             .set { ch_reports_byreference }
 
         // SUBWORKFLOW: For each species target, this subworkflow collects reference genome assemblies ('GCF*') and subsequently downloads the best matching reference assembly.
         FIND_DOWNLOAD_REFERENCE (
-            ch_reports_byreference.map{ specie, meta, report_txt, fasta-> tuple(specie, report_txt) },
+            ch_reports_byreference
+                .map{ specie, meta, report_txt, fasta-> tuple(specie, report_txt) }
+                .filter{ specie, report_txt -> specie != "Unknown Species" },
             ch_ncbi_assembly_metadata
         )
         ch_versions = ch_versions.mix(FIND_DOWNLOAD_REFERENCE.out.versions)
```
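The original code called `.get(0)` directly on the parsed `species_hits` list, which throws when KmerFinder reports no species for a sample. The patch replaces the direct access with a size check and an `"Unknown Species"` fallback, and then filters those samples out before reference download. A minimal standalone sketch of the guard, with hypothetical data:

```nextflow
// Hypothetical input: KmerFinder produced no species hits for this sample.
def species_hits = []
// species_hits.get(0)["Species"] would throw here; the guard falls back instead:
def specie = species_hits.size() > 0 ? species_hits.get(0)["Species"] : "Unknown Species"
assert specie == "Unknown Species"
```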
4 changes: 2 additions & 2 deletions workflows/bacass.nf
```diff
@@ -460,7 +460,7 @@ workflow BACASS {
         ch_versions = ch_versions.mix( GUNZIP.out.versions )
 
         PROKKA (
-            ch_to_prokka,
+            ch_to_prokka.filter{ meta, fasta -> !fasta.isEmpty() },
             [],
             []
         )
@@ -479,7 +479,7 @@
         ch_versions = ch_versions.mix( GUNZIP.out.versions )
 
         BAKTA_DBDOWNLOAD_RUN (
-            ch_to_bakta,
+            ch_to_bakta.filter{ meta, fasta -> !fasta.isEmpty() },
             params.baktadb,
             params.baktadb_download
         )
```
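Both annotators get the same guard: zero-byte assembly files are dropped from the channel before Prokka or Bakta runs, so neither tool receives an empty fasta. A standalone sketch of the pattern, with hypothetical sample names and paths:

```nextflow
// Hypothetical standalone example of the filter used for PROKKA and BAKTA above.
// In Nextflow, isEmpty() on a file object is true for zero-byte files.
ch_assemblies = Channel.of(
    [ [id: 'sampleA'], file('results/sampleA.fasta') ],
    [ [id: 'sampleB'], file('results/sampleB.fasta') ]
)
ch_assemblies
    .filter { meta, fasta -> !fasta.isEmpty() }      // drop empty assemblies
    .view { meta, fasta -> "annotating ${meta.id}" } // only non-empty ones remain
```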
