galaxyproject · mvdbeek · Dec 5, 2025 · Oct 9, 2025 · Oct 9, 2025 · Nov 13, 2025
diff --git a/...icrobiome/host-contamination-removal/host-contamination-removal-long-reads/.dockstore.yml b/...icrobiome/host-contamination-removal/host-contamination-removal-long-reads/.dockstore.yml
@@ -0,0 +1,13 @@
+version: 1.2  # left unquoted on purpose: quoting would turn the number into a string
+workflows:
+- name: main
+  subclass: Galaxy
+  publish: true
+  primaryDescriptorPath: /host-or-contamination-removal-on-long-reads.ga
+  testParameterFiles:
+  - /host-or-contamination-removal-on-long-reads-tests.yml
+  authors:
+  - name: Paul Zierep
+    orcid: '0000-0003-2982-388X'  # quoted: an identifier, never a number or date
+  - name: Bérénice Batut
+    orcid: '0000-0001-9852-1987'
diff --git a/...e/host-contamination-removal/host-contamination-removal-long-reads/CHANGELOG.md b/...e/host-contamination-removal/host-contamination-removal-long-reads/CHANGELOG.md
@@ -0,0 +1,5 @@
+# Changelog
+
+## [0.1] 2025-12-03
+
+First release.
diff --git a/...iome/host-contamination-removal/host-contamination-removal-long-reads/README.md b/...iome/host-contamination-removal/host-contamination-removal-long-reads/README.md
@@ -0,0 +1,25 @@
+# Host or Contamination removal on long-reads
+
+Extracted microbiome DNA or RNA is usually contaminated by host and human DNA or RNA (but also by other contaminants). It is important to remove all host/contamination sequences and to retain only microbiome sequences, both to speed up further steps and to avoid host/contamination sequences compromising the analysis.
+
+This workflow takes long-read (e.g. Nanopore or PacBio) fastq(.gz) files and executes the following steps:
+1. Mapping of the reads against a reference genome of the host or contaminant (e.g. human) using **Minimap2**
+2. Filtering of the generated BAM using **BAMtools** and **Samtools** to keep only the reads that do not align
+3. Generation of mapping statistics using **QualiMap**
+4. Aggregation of the mapping statistics using **MultiQC**
+
+## Input Datasets
+
+- A list of datasets corresponding to reads in `fastqsanger` or `fastqsanger.gz` format.
+- Reference genome
+- Profile for mapping
+
+## Output Datasets
+
+- A list of datasets corresponding to unmapped reads in `fastqsanger` or `fastqsanger.gz` format.
+- A list of QualiMap reports, one per sample, that can be used as input for additional MultiQC runs
+- MultiQC report of the mapping statistics in HTML
+
+## When to use this workflow
+
+Use this workflow for **long-read sequencing data** (e.g., Nanopore, PacBio). For short-read Illumina data, see the [Host or Contamination removal on short-reads](../host-contamination-removal-short-reads/) workflow.
diff --git a/...st-contamination-removal-long-reads/host-or-contamination-removal-on-long-reads-tests.yml b/...st-contamination-removal-long-reads/host-or-contamination-removal-on-long-reads-tests.yml
@@ -0,0 +1,183 @@
+- doc: Test outline for host-or-contamination-removal-on-long-reads
+  job:
+    'Long-reads':  # list collection with one fastqsanger.gz file per sample
+      class: Collection
+      collection_type: list
+      elements:
+      - identifier: Spike3bBarcode10
+        class: File
+        filetype: fastqsanger.gz
+        location: https://zenodo.org/record/12190648/files/collection_of_all_samples_Spike3bBarcode10.fastq.gz
+      - identifier: Spike3bBarcode12
+        class: File
+        filetype: fastqsanger.gz
+        location: https://zenodo.org/record/12190648/files/collection_of_all_samples_Spike3bBarcode12.fastq.gz
+    'Host/Contaminant Reference Genome (long-reads)': apiMel3
+    'Profile of preset options for the mapping (long-read)': map-pb  # NOTE(review): map-pb looks like the PacBio preset while the README mentions Nanopore input; confirm map-ont was not intended
+  outputs:
+    QualiMap Statistics:
+      element_tests:
+        Spike3bBarcode10:
+          elements:  # every "asserts" is a list: a mapping cannot hold two has_text keys, only the last would survive
+            genome_results:
+              asserts:
+              - that: has_text
+                text: "Spike3bBarcode10"
+              - that: has_text
+                text: "586,300,787 bp"
+            coverage_across_reference:
+              asserts:
+              - that: has_text
+                text: "#Position (bp)"
+              - that: has_n_lines
+                n: 416  # has_n_lines takes "n"; a "value" attribute is not one of its parameters
+            coverage_histogram:
+              asserts:
+              - that: has_text
+                text: "Number of genomic locations"
+              - that: has_n_lines
+                n: 10
+            genome_fraction_coverage:
+              asserts:
+              - that: has_text
+                text: "#Coverage (X)"
+              - that: has_n_lines
+                n: 51
+            duplication_rate_histogram:
+              asserts:
+              - that: has_text
+                text: "#Duplication rate"
+              - that: has_text
+                text: "17.0"
+            homopolymer_indels:
+              asserts:
+              - that: has_text
+                text: "#Type of indel"
+              - that: has_text
+                text: "polyN"
+            insert_size_across_reference:
+              asserts:
+              - that: has_size
+                value: 0  # asserts the file is empty
+            insert_size_histogram:
+              asserts:
+              - that: has_size
+                value: 0
+            mapped_reads_clipping_profile:
+              asserts:
+              - that: has_text
+                text: "#Read position (bp)"
+              - that: has_text
+                text: "38.123"
+            mapped_reads_gc-content_distribution:
+              asserts:
+              - that: has_text
+                text: "#GC Content (%)"
+              - that: has_n_lines
+                n: 100
+            mapped_reads_nucleotide_content:
+              asserts:
+              - that: has_text
+                text: "6.25"
+            mapping_quality_across_reference:
+              asserts:
+              - that: has_text
+                text: "#Position (bp)"
+              - that: has_n_lines
+                n: 416
+            mapping_quality_histogram:
+              asserts:
+              - that: has_text
+                text: "#Mapping quality"
+              - that: has_n_lines
+                n: 13
+        Spike3bBarcode12:
+          elements:  # every "asserts" is a list: a mapping cannot hold two has_text keys, only the last would survive
+            genome_results:
+              asserts:
+              - that: has_text
+                text: "Spike3bBarcode12"
+              - that: has_text
+                text: "586,300,787 bp"
+            coverage_across_reference:
+              asserts:
+              - that: has_text
+                text: "#Position (bp)"
+              - that: has_n_lines
+                n: 416  # has_n_lines takes "n"; a "value" attribute is not one of its parameters
+            coverage_histogram:
+              asserts:
+              - that: has_text
+                text: "Number of genomic locations"
+              - that: has_n_lines
+                n: 6
+            genome_fraction_coverage:
+              asserts:
+              - that: has_text
+                text: "#Coverage (X)"
+              - that: has_n_lines
+                n: 51
+            duplication_rate_histogram:
+              asserts:
+              - that: has_text
+                text: "#Duplication rate"
+              - that: has_text
+                text: "8.0"
+            homopolymer_indels:
+              asserts:
+              - that: has_text
+                text: "#Type of indel"
+              - that: has_text
+                text: "polyN"
+            insert_size_across_reference:
+              asserts:
+              - that: has_size
+                value: 0  # asserts the file is empty
+            insert_size_histogram:
+              asserts:
+              - that: has_size
+                value: 0
+            mapped_reads_clipping_profile:
+              asserts:
+              - that: has_text
+                text: "#Read position (bp)"
+              - that: has_text
+                text: "0.03930972"
+            mapped_reads_gc-content_distribution:
+              asserts:
+              - that: has_text
+                text: "#GC Content (%)"
+              - that: has_n_lines
+                n: 100
+            mapped_reads_nucleotide_content:
+              asserts:
+              - that: has_text
+                text: "16.0"
+            mapping_quality_across_reference:
+              asserts:
+              - that: has_text
+                text: "#Position (bp)"
+              - that: has_n_lines
+                n: 416
+            mapping_quality_histogram:
+              asserts:
+              - that: has_text
+                text: "#Mapping quality"
+              - that: has_n_lines
+                n: 4
+    MultiQC HTML Report:
+      asserts:  # list form so both sample names are checked; as mapping keys the second has_text replaced the first
+      - that: has_text
+        text: "Spike3bBarcode10"
+      - that: has_text
+        text: "Spike3bBarcode12"
+    Reads without Host or Contamination:
+      element_tests:
+        Spike3bBarcode10:
+          asserts:
+          - that: has_text
+            text: '@0a0c4d2c-291f-46a4-87d5-625efbfed6a0'  # presumably a FASTQ read header; quoted because of the leading "@"
+        Spike3bBarcode12:
+          asserts:
+          - that: has_text
+            text: '@0a0c4e88-893a-4284-9119-ab4274e05445'