Merge pull request #62 from biodiversitycellatlas/dev

bonitavw · web-flow · commit 739cb4a6bf53 · 2026-03-06T14:23:08.000+01:00
Added new mapping_software option "alevin_subsampled_starsolo"
diff --git a/bin/generate_dashboard.py b/bin/generate_dashboard.py
@@ -33,7 +33,7 @@ def parse_args() -> argparse.Namespace:
     # Core Metadata / Dashboard Headers
     parser.add_argument("--project", default="Biodiversity Cell Atlas", help="Project Name")
     parser.add_argument("--pipeline", default="bca-preprocessing", help="Pipeline Name")
-    parser.add_argument("--version", default="0.2.0", help="Pipeline Version")
+    parser.add_argument("--version", default="0.2.1", help="Pipeline Version")
     parser.add_argument("--commit", default="unknown", help="Git Commit Hash")
 
     # Data Inputs
diff --git a/docs/CONFIGURATION_PARAMETERS.md b/docs/CONFIGURATION_PARAMETERS.md
@@ -6,7 +6,7 @@ Within each custom configuration file the following variables can be defined:
 ## Table of Contents
 
 1. [Base Variables](#base-variables)
-2. [Mapping Variables](#mapping-variables)
+2. [STAR Variables](#star-variables)
 3. [FeatureCounts Variables](#featurecounts-variables)
 4. [Gene Extension Variables](#gene-extension-variables)
 5. [Taxonomic-classification Variables](#taxonomic-classification-variables)
@@ -28,16 +28,17 @@ Within each custom configuration file the following variables can be defined:
 | `ref_gtf`              | __Required__      | Path to the GTF/GFF file formatted for STARsolo. |
 | `ref_gtf_alt`          | Optional          | Path to the GTF/GFF file formatted specifically for analysis with Parse Biosciences / CellRanger pipeline. Defaults to the same path as `ref_gtf`. |
 | `run_method`             | Optional          | Method of running the pre-processing pipeline, demonstrated in the [pipeline diagram](img/Preprocs_Pipeline.png), currently either `"standard"` or `"geneext_only"`. Default is set to `"standard"`. |
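+| `mapping_software`     | Optional          | Software used to map reads. Must be one of `"starsolo"`, `"alevin"`, `"both"` (alias: `"alevin_starsolo"`; runs Alevin and STARsolo on the full dataset) or `"alevin_subsampled_starsolo"` (runs Alevin on the full dataset and STARsolo on a subsample of `subsample_nreads` reads). Default is `"starsolo"`. |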
 | `perform_demultiplexing` | Optional        | Boolean flag to enable or disable demultiplexing of the FASTQ files, where applicable. Default is `true`. |
 | `seqspec_file`         | Optional          | Path to the seqspec file. |
+| `subsample_nreads`     | Optional          | Number of reads to subsample from the FASTQ files before mapping with STARsolo. Only used when `mapping_software` is set to `"alevin_subsampled_starsolo"`. Default is `100000000` reads. |
 
 
 
-## Mapping Variables
+## STAR Variables
 
 | Variable               | Required/Optional | Description |
 |------------------------|-------------------|-------------|
-| `mapping_software`     | Optional          | Software used to map reads (must be one of the following: `"starsolo"`, `"alevin"` or `"both"`). Default set to `"starsolo"`. |
 | `star_index`           | Optional          | Path to the pre-generated STAR index. By default the STAR index is created within the pipeline.|
 | `star_genomeSAindexNbases` | Optional         | Lenght of the SA pre-indexing string in STAR. See [protocol-specific defaults](../conf/seqtech_parameters.config) set in the seqtech_paramaters.config file. |
 | `star_genomeSAsparseD`    | Optional       | Suffix array sparsity in STAR.  See [protocol-specific defaults](../conf/seqtech_parameters.config) set in the seqtech_paramaters.config file. |
@@ -76,7 +77,8 @@ Within each custom configuration file the following variables can be defined:
 
 | Variable               | Required/Optional | Description |
 |------------------------|-------------------|-------------|
-| `perform_10x_saturate`      | Optional          | Boolean flag to enable or disable the 10x_saturate step after mapping. Default is `true`. |
+| `perform_10x_saturate` | Optional          | Boolean flag to enable or disable the 10x_saturate step after mapping. Default is `true`. |
+| `saturation_target`    | Optional          | The saturation target fraction used to predict the input reads needed. Default set to `0.7`. |
 
 
 
@@ -94,6 +96,7 @@ Within each custom configuration file the following variables can be defined:
 | Variable               | Required/Optional | Description |
 |------------------------|-------------------|-------------|
 | `perform_cellbender`   | Optional          | Boolean flag to enable or disable removal of ambient RNA using CellBender. Default is `false`. |
+| `cellbender_extraargs` | Optional          | Provide extra arguments to the CellBender function as a string. Refer to the [CellBender manual](https://cellbender.readthedocs.io/en/latest/reference/index.html) for options. |
 
 
 
diff --git a/main.nf b/main.nf
@@ -72,7 +72,7 @@ workflow BCA_PREPROCESSING {
             filter_out = filtering_workflow(QC_mapping_workflow.out.starsolo_genefull50_raw)
 
             reporting_workflow(
-                samplesheet,
+                preprocessing_workflow.out.merged_samplesheet,
                 SAVE_RUN_CONFIG.out.samplesheet,
                 SAVE_RUN_CONFIG.out.run_config,
                 QC_mapping_workflow.out.star_final_log,
diff --git a/modules/local/tools/featurecounts/main.nf b/modules/local/tools/featurecounts/main.nf
@@ -7,7 +7,7 @@ process CALC_MT_RRNA {
     container "oras://community.wave.seqera.io/library/samtools_subread:f5fd17c543add0fd"
 
     input:
-    tuple val(meta), path(mapping_files)
+    tuple val(meta), path(bam_file)
     file(bam_index)
 
     output:
@@ -17,12 +17,10 @@ process CALC_MT_RRNA {
     """
     echo "\n\n==================  CALCULATION rRNA & mtDNA =================="
     echo "Sample ID: ${meta}"
-
-    bam_file=\$(ls ${meta.id}_Aligned.sortedByCoord.out.bam | head -n 1)
-    echo "BAM file: \${bam_file}"
+    echo "BAM file: ${bam_file}"
 
     calculate_rrna_mtdna.sh \\
-        \${bam_file} \\
+        ${bam_file} \\
         ${meta.id}_mt_rrna_metrics.txt \\
         ${params.ref_gtf} \\
         ${params.grep_rrna} \\
diff --git a/modules/local/tools/geneext/main.nf b/modules/local/tools/geneext/main.nf
@@ -7,7 +7,7 @@ process GENE_EXT {
     conda "${moduleDir}/environment.yml"
 
     input:
-    tuple val(meta), path(mapping_files)
+    tuple val(meta), path(bam_file)
     file(bam_index)
 
     output:
@@ -18,29 +18,27 @@ process GENE_EXT {
     echo "\n\n==================  GENE EXTENSION =================="
     echo "Sample ID: ${meta}"
     echo "BAM index: ${bam_index}"
+    echo "BAM file: ${bam_file}"
     echo "Original GTF: ${params.ref_gtf}"
 
     # Remove temporary directory if it exists
     if [ -d "tmp" ]; then rm -r tmp; fi
 
     # Extract file extension
     extension=\$(echo "${params.ref_gtf}" | awk -F. '{print \$NF}')
-
     if [ \$extension == "gff" ];
     then
         gtf_output="${meta.id}_geneext.gff"
     else
         gtf_output="${meta.id}_geneext.gtf"
     fi
     echo \${gtf_output}
-    bam_file=\$(ls *_Aligned.sortedByCoord.out.bam | head -n 1)
 
     # Run GeneExt
     python ${projectDir}/submodules/GeneExt/geneext.py \\
         -g ${params.ref_gtf} \\
-        -b \${bam_file} \\
+        -b ${bam_file} \\
         -o \${gtf_output} \\
         -j 4
-
     """
 }
diff --git a/modules/local/tools/seqtk/environment.yml b/modules/local/tools/seqtk/environment.yml
@@ -0,0 +1,7 @@
+channels:
+  - conda-forge
+  - bioconda
+
+dependencies:
+  - seqtk >=1.4
+  - pigz  # NOTE(review): the process script in main.nf currently compresses with gzip; pigz is only referenced in a commented-out line
diff --git a/modules/local/tools/seqtk/main.nf b/modules/local/tools/seqtk/main.nf
@@ -0,0 +1,40 @@
+process SUBSAMPLE_FASTQS { // Subsample one sample's FASTQ files to params.subsample_nreads reads with `seqtk sample`
+    tag "${meta.id}"
+    label 'process_low'
+
+    conda "${moduleDir}/environment.yml" // environment next to this module (lists seqtk and pigz)
+
+    input:
+    tuple val(meta), path(fastq_cDNA), path(fastq_BC_UMI), path(fastq_indices), path(input_file) // input_file is not read by the script; it is only forwarded to the output tuple
+
+    output:
+    tuple val(meta), path("${meta.id}_subsampled_cDNA.fastq.gz"), path("${meta.id}_subsampled_BC_UMI.fastq.gz"), path("${meta.id}_subsampled_indices/"), path(input_file) // same element order as the input; the index FASTQs are replaced by a directory (left empty when no index FASTQs are given)
+
+    script: // every file is sampled with the same fixed seed and read count (presumably to keep cDNA/BC_UMI mates in sync - confirm against seqtk docs); output is compressed with gzip
+    """
+    echo "================== SUBSAMPLE FASTQs =================="
+    echo "Sample ID: ${meta.id}"
+    echo "Subsampling to ${params.subsample_nreads} reads"
+
+    SEED=100
+    NREADS=${params.subsample_nreads}
+    # COMPRESSOR="pigz -p ${task.cpus}"
+    COMPRESSOR="gzip"
+
+    echo "[1/2] Subsampling paired reads..."
+    seqtk sample -s\$SEED ${fastq_cDNA} \$NREADS | \$COMPRESSOR > ${meta.id}_subsampled_cDNA.fastq.gz
+    seqtk sample -s\$SEED ${fastq_BC_UMI} \$NREADS | \$COMPRESSOR > ${meta.id}_subsampled_BC_UMI.fastq.gz
+
+    mkdir -p ${meta.id}_subsampled_indices
+
+    if [ -n "${fastq_indices}" ]; then
+        echo "[2/2] Subsampling index reads..."
+        for idx in ${fastq_indices}; do
+            base=\$(basename \$idx .fastq.gz)
+            seqtk sample -s\$SEED \$idx \$NREADS | \$COMPRESSOR > ${meta.id}_subsampled_indices/${meta.id}_subsampled_\${base}.fastq.gz
+        done
+    else
+        echo "No index FASTQs provided. Skipping index subsampling."
+    fi
+    """
+}
diff --git a/nextflow.config b/nextflow.config
@@ -12,7 +12,8 @@ nextflow.enable.dsl = 2
 params {
 
     // Input parameters
-    run_method =            "standard"
+    run_method =            "standard"          // "standard", "geneext_only", "external_pipeline_only"
+    mapping_software =      "starsolo"          // "starsolo", "alevin", "both"/"alevin_starsolo", "alevin_subsampled_starsolo"
     input =                 null
     outdir =                "../outdir"
     protocol =              null
@@ -29,8 +30,14 @@ params {
     perform_geneext =       false
     geneext_gtf =           null
 
+    // Subsampling parameters
+    subsample_nreads =      100000000
+
+    // 10x Genomics specific parameters
     perform_10x_saturate =  true
     saturation_target =     0.7
+
+    // Demultiplexing parameters
     perform_demultiplexing = true
     bcl2fastq =             null
 
@@ -39,10 +46,6 @@ params {
     fastp_length_required = null
     fastp_qualified_quality_phred = null
 
-    // Mapping parameters (values are set in conf/seqtech_parameters.config)
-    mapping_software =      "starsolo"          // "starsolo", "alevin" or "both"
-    mt_contig =             "chrM M MT"
-
     // STAR specific parameters
     star_index =            null
     star_genomeSAindexNbases = null
@@ -66,6 +69,7 @@ params {
     // Feature counting parameters
     perform_featurecounts = false
     grep_rrna =             "rRNA"
+    mt_contig =             "chrM M MT"
 
     // Taxonomic classification parameters
     perform_kraken =        false
@@ -265,7 +269,7 @@ manifest {
     homePage        = 'https://github.com/biodiversitycellatlas/bca_preprocessing'
     description     = 'Biodiversity Cell Atlas Pre-processing Pipeline'
     mainScript      = 'main.nf'
-    version         = '0.2.0'
+    version         = '0.2.1'
     nextflowVersion = '>=21.04.0'
 }
 
diff --git a/subworkflows/local/mapping/mapping_starsolo.nf b/subworkflows/local/mapping/mapping_starsolo.nf
@@ -125,33 +125,33 @@ workflow mapping_starsolo_workflow {
             // Calculate percentages mitochondrial DNA and ribosomal RNA
             if (params.perform_featurecounts) {
                 // Join STARsolo files with samtools index
-                STARSOLO_ALIGN.out.starsolo_files
+                ch_starsolo_bam
                     .join(SAMTOOLS_INDEX.out.bam_index)
-                    .multiMap { meta, star_files, bai ->
-                        star_ch: [meta, star_files]
+                    .multiMap { meta, bam_file, bai ->
+                        bam_ch: [meta, bam_file]
                         bai_ch:  [meta, bai]
                     }
                     .set { ch_fc_inputs }
 
                 // Run featureCounts to calculate mtDNA and rRNA percentages and capture output
-                CALC_MT_RRNA(ch_fc_inputs.star_ch, ch_fc_inputs.bai_ch)
+                CALC_MT_RRNA(ch_fc_inputs.bam_ch, ch_fc_inputs.bai_ch)
                 ch_featurecounts = CALC_MT_RRNA.out
             }
 
             // Gene Extension
             if (params.perform_geneext || params.run_method == "geneext_only") {
 
                 // Join inputs for GENE_EXT
-                STARSOLO_ALIGN.out.starsolo_files
+                ch_starsolo_bam
                     .join(SAMTOOLS_INDEX.out.bam_index)
-                    .multiMap { meta, star_files, bai ->
-                        star_ch: [meta, star_files]
+                    .multiMap { meta, bam_file, bai ->
+                        bam_ch: [meta, bam_file]
                         bai_ch:  [meta, bai]
                     }
                     .set { ch_geneext_inputs }
 
                 // Run gene extension using GeneExt
-                GENE_EXT(ch_geneext_inputs.star_ch, ch_geneext_inputs.bai_ch)
+                GENE_EXT(ch_geneext_inputs.bam_ch, ch_geneext_inputs.bai_ch)
 
                 // Remap STARsolo with extended GTF if run_method is not "geneext_only"
                 if (params.run_method != "geneext_only") {
diff --git a/workflows/mapping_workflow.nf b/workflows/mapping_workflow.nf
@@ -11,6 +11,7 @@ include { mapping_starsolo_workflow } from '../subworkflows/local/mapping/mappin
 include { mapping_alevin_workflow   } from '../subworkflows/local/mapping/mapping_alevin'
 
 include { FASTQC                    } from '../modules/local/tools/fastqc/main'
+include { SUBSAMPLE_FASTQS          } from '../modules/local/tools/seqtk/main'
 
 
 /*
@@ -72,10 +73,20 @@ workflow QC_mapping_workflow {
             ch_alevin_quant_json = mapping_alevin_workflow.out.af_quant_json
             ch_alevin_cell_meta = mapping_alevin_workflow.out.af_cell_meta
 
-        } else if (params.mapping_software == "both") {
-            mapping_starsolo_workflow(data_output, bc_whitelist)
+        } else if (params.mapping_software == "both" || params.mapping_software == "alevin_subsampled_starsolo" || params.mapping_software == "alevin_starsolo") {
+
             mapping_alevin_workflow(data_output, bc_whitelist)
 
+            // If 'alevin_subsampled_starsolo' is selected, run STARsolo on a subsampled dataset
+            if (params.mapping_software == "alevin_subsampled_starsolo") {
+                SUBSAMPLE_FASTQS(data_output)
+                mapping_starsolo_workflow(SUBSAMPLE_FASTQS.out, bc_whitelist)
+
+            // If 'both'/'alevin_starsolo' is selected, run STARsolo on the full dataset
+            } else if (params.mapping_software == "both" || params.mapping_software == "alevin_starsolo") {
+                mapping_starsolo_workflow(data_output, bc_whitelist)
+            }
+
             ch_mapping_files         = mapping_alevin_workflow.out.mapping_files.mix(mapping_starsolo_workflow.out.mapping_files)
             ch_starsolo_bam          = mapping_starsolo_workflow.out.starsolo_bam
             ch_star_solodir          =  mapping_starsolo_workflow.out.star_solodir
diff --git a/workflows/preprocessing_workflow.nf b/workflows/preprocessing_workflow.nf
@@ -77,6 +77,7 @@ workflow preprocessing_workflow {
         }
 
     emit:
+        merged_samplesheet = merged_samplesheet
         data_output     = data_output_ch
         bc_whitelist    = bc_whitelist_ch
 }