nf-core · grst · Mar 31, 2025 · Mar 4, 2025 · Mar 5, 2025 · Mar 5, 2025
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -38,7 +38,7 @@ jobs:
         NXF_VER:
           - "24.04.2"
           - "latest-everything"
-        profile: ["simpleaf", "cellranger", "cellrangermulti", "kallisto", "star"]
+        profile: ["simpleaf", "cellranger", "cellrangermulti", "kallisto", "star", "cellrangerarc"]
 
     steps:
       - name: Disk space cleanup

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Update nextflow_schema.json
 - Fix simpleaf protocol name for 10xv4 ([#452](https://github.com/nf-core/scrnaseq/pull/452))
+- Fix the workflow for cellranger-arc alignment and add a new test with a 10x multiome dataset ([#441](https://github.com/nf-core/scrnaseq/pull/441))
 
 ## v4.0.0 - 2025-03-10
 

diff --git a/assets/cellrangerarc_samplesheet.csv b/assets/cellrangerarc_samplesheet.csv
@@ -0,0 +1,3 @@
+sample,fastq_1,fastq_2,fastq_barcode,sample_type
+10k_PBMC,"https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/10xgenomics/cellranger-arc/10x_pbmc_multiome/fastqs/10k_PBMC_Multiome_nextgem_Chromium_X_gex_chr21_subsample_S2_L001_R1_001.fastq.gz","https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/10xgenomics/cellranger-arc/10x_pbmc_multiome/fastqs/10k_PBMC_Multiome_nextgem_Chromium_X_gex_chr21_subsample_S2_L001_R2_001.fastq.gz",,gex
+10k_PBMC,"https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/10xgenomics/cellranger-arc/10x_pbmc_multiome/fastqs/10k_PBMC_Multiome_nextgem_Chromium_X_atac_chr21_subsample_S2_L001_R1_001.fastq.gz","https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/10xgenomics/cellranger-arc/10x_pbmc_multiome/fastqs/10k_PBMC_Multiome_nextgem_Chromium_X_atac_chr21_subsample_S2_L001_R2_001.fastq.gz","https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/10xgenomics/cellranger-arc/10x_pbmc_multiome/fastqs/10k_PBMC_Multiome_nextgem_Chromium_X_atac_chr21_subsample_S2_L001_R3_001.fastq.gz",atac
diff --git a/assets/schema_input.json b/assets/schema_input.json
@@ -27,6 +27,10 @@
                 "pattern": "^\\S+\\.f(ast)?q\\.gz$",
                 "errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'"
             },
+            "fastq_barcode": {
+                "type": "string",
+                "meta": ["fastq_barcode"]
+            },
             "expected_cells": {
                 "type": "integer",
                 "errorMessage": "Expected cells must be an Integer",

diff --git a/subworkflows/local/align_cellrangerarc.nf b/subworkflows/local/align_cellrangerarc.nf
@@ -50,7 +50,29 @@ workflow CELLRANGERARC_ALIGN {
         )
         ch_versions = ch_versions.mix(CELLRANGERARC_COUNT.out.versions)
 
+        // Parse the output channels to obtain filtered and raw matrices
+        ch_matrices_filtered = parse_demultiplexed_output_channels( CELLRANGERARC_COUNT.out.outs, "filtered_feature_bc_matrix" )
+        ch_matrices_raw      = parse_demultiplexed_output_channels( CELLRANGERARC_COUNT.out.outs, "raw_feature_bc_matrix"      )
+
     emit:
         ch_versions
-        cellranger_arc_out  = CELLRANGERARC_COUNT.out.outs
+        cellrangerarc_out          = CELLRANGERARC_COUNT.out.outs
+        cellrangerarc_mtx_filtered = ch_matrices_filtered
+        cellrangerarc_mtx_raw      = ch_matrices_raw
+}
+
+// Select, for every channel item, the files whose path contains the given pattern
+def parse_demultiplexed_output_channels(in_ch, pattern) {
+
+    // The mapped channel is returned directly; each item becomes [ meta, matching files ]
+    return in_ch.map { meta, mtx_files ->
+        // Tag a copy of the metadata with the matrix type (raw/filtered) implied by the pattern
+        def tagged_meta = meta.clone()
+        def matrix_type = pattern.contains('raw_') ? 'raw' : 'filtered'
+        tagged_meta.input_type = matrix_type
+        // Keep the files whose string form contains the pattern, in their original order
+        def matching_files = mtx_files.findAll { mtx_file -> mtx_file.toString().contains("${pattern}") }
+        [ tagged_meta, matching_files ]
+    }
+
 }
diff --git a/subworkflows/local/utils_nfcore_scrnaseq_pipeline/main.nf b/subworkflows/local/utils_nfcore_scrnaseq_pipeline/main.nf
@@ -92,6 +92,24 @@ workflow PIPELINE_INITIALISATION {
                     return [ meta, fastqs.flatten() ]
             }
             .set { ch_samplesheet }
+    } else if (params.aligner == 'cellrangerarc') { // the cellrangerarc sub-workflow expects each channel item to have a [ meta, sample types, subsamples, fastqs ] structure
+        Channel
+            .fromList(samplesheetToList(params.input, "${projectDir}/assets/schema_input.json"))
+            .map { meta, fastq_1, fastq_2 ->
+                if (!fastq_2 || (meta.sample_type == "atac" && !meta.fastq_barcode)) {
+                    error("Please check input samplesheet -> cellrangerarc requires both paired-end reads and barcode fastq files: ${meta.id}")
+                }
+                if (meta.sample_type == "atac") {
+                    return [ meta.id, meta + [ single_end:false ], [ fastq_1, fastq_2, file(meta.fastq_barcode, checkIfExists: true) ] ]
+                } else {
+                    return [ meta.id, meta + [ single_end:false ], [ fastq_1, fastq_2 ] ]
+                }
+            }
+            .groupTuple()
+            .map {
+                cellrangerarcStructure(it)
+            }
+            .set { ch_samplesheet }
     } else {
         Channel
             .fromList(samplesheetToList(params.input, "${projectDir}/assets/schema_input.json"))
@@ -194,6 +212,44 @@ def validateInputSamplesheet(input) {
     return [ metas[0], fastqs ]
 }
 //
+// Reshape one grouped samplesheet entry [ id, metas, fastqs ] into [ meta, sample_types, subsamples, fastqs ]
+//
+def cellrangerarcStructure(input) {
+    // input[0] is the grouping key; metas and fastqs hold one element per samplesheet row of the sample
+    def (metas, fastqs) = input[1..2]
+
+    // Check that multiple runs of the same sample are of the same datatype i.e. single-end / paired-end
+    def endedness_ok = metas.collect{ meta -> meta.single_end }.unique().size() == 1
+    if (!endedness_ok) {
+        error("Please check input samplesheet -> Multiple runs of a sample must be of the same datatype i.e. single-end or paired-end: ${metas[0].id}")
+    }
+
+    // Collect the sample_type of every row once (kept in row order, parallel to subsamples below)
+    def sampletypes = metas.collect { meta -> meta.sample_type }
+
+    // Validate that the property "sample_type" is present and has valid values
+    def valid_sample_types = ["gex", "atac"]
+    if (!sampletypes.every { it in valid_sample_types }) {
+        error("Please check input samplesheet -> The property 'sample_type' is required and can only be 'gex' or 'atac': ${metas[0].id}")
+    }
+
+    // Define a new common meta for all the fastqs in this channel instance (copied from the first row)
+    def sampleMeta = metas[0].clone()
+    sampleMeta.remove("sample_type")
+    sampleMeta.remove("feature_type")
+
+    // Derive the sample-name prefix of each row from the name of its first FASTQ file
+    def subsamples = fastqs.collect { fastq ->
+        def match = (fastq[0].baseName =~ /^(.*?)_S\d+_L\d+_R\d+_\d+\.fastq(\.gz)?$/)
+        if (!match) {
+            error("Filename does not follow the expected FASTQ filename convention (SampleName_S1_L001_R1_001.fastq.gz): ${fastq[0]}")
+        }
+        return match[0][1]
+    }
+
+    return [ sampleMeta, sampletypes, subsamples, fastqs.flatten() ]
+}
+//
 // Get attribute from genome config file e.g. fasta
 //
 def getGenomeAttribute(attribute) {

diff --git a/tests/main_pipeline_cellrangerarc.nf.test b/tests/main_pipeline_cellrangerarc.nf.test
@@ -0,0 +1,87 @@
+nextflow_pipeline {
+
+    name "Test Workflow main.nf"
+    script "main.nf"
+
+    test("test-dataset_cellrangerarc_aligner") {
+
+        when {
+            // Only the cellrangerarc-specific parameters are set here; the rest is taken from shared config
+            params {
+                aligner                   = 'cellrangerarc'
+                outdir                    = "${outputDir}/results_cellrangerarc"
+                input                     = "${baseDir}/assets/cellrangerarc_samplesheet.csv"
+                fasta                     = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/chr21/sequence/genome.fasta'
+                gtf                       = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/chr21/sequence/chr21_gencode.gtf'
+                protocol                  = 'auto'
+                skip_cellbender           = true
+            }
+        }
+
+        then {
+
+            assertAll(
+
+                //
+                // General assertions
+                //
+
+                // Did the workflow finish successfully?
+                {assert workflow.success},
+
+                // How many tasks were executed?
+                {assert workflow.trace.tasks().size() == 14},
+
+                // How many entries does each output directory contain?
+                {assert path("${outputDir}/results_cellrangerarc").list().size() == 4},
+                {assert path("${outputDir}/results_cellrangerarc/cellrangerarc").list().size() == 4},
+                {assert path("${outputDir}/results_cellrangerarc/cellrangerarc/mtx_conversions").list().size() == 7},
+                {assert path("${outputDir}/results_cellrangerarc/cellrangerarc/count").list().size() == 3},
+                {assert path("${outputDir}/results_cellrangerarc/fastqc").list().size() == 10},
+                {assert path("${outputDir}/results_cellrangerarc/multiqc").list().size() == 3},
+
+                //
+                // Check that the per-sample and combined .h5ad files were produced (existence only)
+                //
+                {assert new File( "${outputDir}/results_cellrangerarc/cellrangerarc/mtx_conversions/10k_PBMC/10k_PBMC_raw_matrix.h5ad" ).exists()},
+                {assert new File( "${outputDir}/results_cellrangerarc/cellrangerarc/mtx_conversions/10k_PBMC/10k_PBMC_filtered_matrix.h5ad" ).exists()},
+                {assert new File( "${outputDir}/results_cellrangerarc/cellrangerarc/mtx_conversions/combined_raw_matrix.h5ad" ).exists()},
+                {assert new File( "${outputDir}/results_cellrangerarc/cellrangerarc/mtx_conversions/combined_filtered_matrix.h5ad" ).exists()},
+
+                //
+                // Check that file contents match the stored snapshot (the order of the paths below matters)
+                //
+                {assert snapshot(
+                    // barcodes.tsv.gz files (filtered, then raw)
+                    path( "${outputDir}/results_cellrangerarc/cellrangerarc/count/10k_PBMC/outs/filtered_feature_bc_matrix/barcodes.tsv.gz" ),
+                    path( "${outputDir}/results_cellrangerarc/cellrangerarc/count/10k_PBMC/outs/raw_feature_bc_matrix/barcodes.tsv.gz"      ),
+
+                    // features.tsv.gz files (filtered, then raw)
+                    path( "${outputDir}/results_cellrangerarc/cellrangerarc/count/10k_PBMC/outs/filtered_feature_bc_matrix/features.tsv.gz" ),
+                    path( "${outputDir}/results_cellrangerarc/cellrangerarc/count/10k_PBMC/outs/raw_feature_bc_matrix/features.tsv.gz"      ),
+
+                    // matrix.mtx.gz files (filtered, then raw)
+                    path( "${outputDir}/results_cellrangerarc/cellrangerarc/count/10k_PBMC/outs/filtered_feature_bc_matrix/matrix.mtx.gz"   ),
+                    path( "${outputDir}/results_cellrangerarc/cellrangerarc/count/10k_PBMC/outs/raw_feature_bc_matrix/matrix.mtx.gz"        ),
+
+                    // summary.csv file from the count outputs
+                    path( "${outputDir}/results_cellrangerarc/cellrangerarc/count/10k_PBMC/outs/summary.csv"                                ),
+
+                    // .rds files (.sce.rds and .seurat.rds, per-sample and combined)
+                    path( "${outputDir}/results_cellrangerarc/cellrangerarc/mtx_conversions/10k_PBMC/10k_PBMC_raw_matrix.sce.rds"           ),
+                    path( "${outputDir}/results_cellrangerarc/cellrangerarc/mtx_conversions/10k_PBMC/10k_PBMC_filtered_matrix.sce.rds"      ),
+                    path( "${outputDir}/results_cellrangerarc/cellrangerarc/mtx_conversions/10k_PBMC/10k_PBMC_raw_matrix.seurat.rds"        ),
+                    path( "${outputDir}/results_cellrangerarc/cellrangerarc/mtx_conversions/10k_PBMC/10k_PBMC_filtered_matrix.seurat.rds"   ),
+                    path( "${outputDir}/results_cellrangerarc/cellrangerarc/mtx_conversions/combined_raw_matrix.sce.rds"                    ),
+                    path( "${outputDir}/results_cellrangerarc/cellrangerarc/mtx_conversions/combined_filtered_matrix.sce.rds"               ),
+                    path( "${outputDir}/results_cellrangerarc/cellrangerarc/mtx_conversions/combined_raw_matrix.seurat.rds"                 ),
+                    path( "${outputDir}/results_cellrangerarc/cellrangerarc/mtx_conversions/combined_filtered_matrix.seurat.rds"            ),
+
+                ).match()}
+
+            ) // end of assertAll()
+
+        }
+    }
+
+}
diff --git a/tests/main_pipeline_cellrangerarc.nf.test.snap b/tests/main_pipeline_cellrangerarc.nf.test.snap
@@ -0,0 +1,26 @@
+{
+    "test-dataset_cellrangerarc_aligner": {
+        "content": [
+            "barcodes.tsv.gz:md5,b5499384bbd9ecbd448c90dd73d9e84c",
+            "barcodes.tsv.gz:md5,dc8ef24c54122529bdb2c7cd5969c805",
+            "features.tsv.gz:md5,456af0e5fa4a7bcb3968400f300d12e9",
+            "features.tsv.gz:md5,456af0e5fa4a7bcb3968400f300d12e9",
+            "matrix.mtx.gz:md5,83c4b3e84668282fbe55d02d84256b3f",
+            "matrix.mtx.gz:md5,4b3aa16b720d414f5df50c533052d360",
+            "summary.csv:md5,90da383cd94786c0a3810b903c8de8f6",
+            "10k_PBMC_raw_matrix.sce.rds:md5,196adfcb230f9ba9bb7d0aba36e7e261",
+            "10k_PBMC_filtered_matrix.sce.rds:md5,359160d80821c5dbf1ebc7a88dbe8400",
+            "10k_PBMC_raw_matrix.seurat.rds:md5,8ff4d68d88a6a7e3528147e06aeaa4dd",
+            "10k_PBMC_filtered_matrix.seurat.rds:md5,ca44f68f2ea255bfd556c71a6b91423e",
+            "combined_raw_matrix.sce.rds:md5,4846edfc332c65c129b554c27a357dc7",
+            "combined_filtered_matrix.sce.rds:md5,4c1dee683e3d0e602556a69b9b64b3a5",
+            "combined_raw_matrix.seurat.rds:md5,8900e93d58f613d56f02a4243769498a",
+            "combined_filtered_matrix.seurat.rds:md5,91f83560d80b7b1ab238161d73c64831"
+        ],
+        "meta": {
+            "nf-test": "0.9.2",
+            "nextflow": "24.10.5"
+        },
+        "timestamp": "2025-03-26T16:19:15.398105567"
+    }
+}
diff --git a/workflows/scrnaseq.nf b/workflows/scrnaseq.nf
@@ -205,7 +205,7 @@ workflow SCRNASEQ {
             ch_cellrangerarc_config
         )
         ch_versions = ch_versions.mix(CELLRANGERARC_ALIGN.out.ch_versions)
-        ch_mtx_matrices = ch_mtx_matrices.mix(CELLRANGERARC_ALIGN.out.cellranger_arc_out)
+        ch_mtx_matrices = ch_mtx_matrices.mix( CELLRANGERARC_ALIGN.out.cellrangerarc_mtx_raw, CELLRANGERARC_ALIGN.out.cellrangerarc_mtx_filtered )
     }
 
     // Run cellrangermulti pipeline