Skip to content

Commit 739cb4a

Browse files
authored
Merge pull request #62 from biodiversitycellatlas/dev
Added new mapping_software option "alevin_subsampled_starsolo"
2 parents 1f99bc8 + 37cfdb9 commit 739cb4a

11 files changed

Lines changed: 94 additions & 32 deletions

File tree

bin/generate_dashboard.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def parse_args() -> argparse.Namespace:
3333
# Core Metadata / Dashboard Headers
3434
parser.add_argument("--project", default="Biodiversity Cell Atlas", help="Project Name")
3535
parser.add_argument("--pipeline", default="bca-preprocessing", help="Pipeline Name")
36-
parser.add_argument("--version", default="0.2.0", help="Pipeline Version")
36+
parser.add_argument("--version", default="0.2.1", help="Pipeline Version")
3737
parser.add_argument("--commit", default="unknown", help="Git Commit Hash")
3838

3939
# Data Inputs

docs/CONFIGURATION_PARAMETERS.md

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ Within each custom configuration file the following variables can be defined:
66
## Table of Contents
77

88
1. [Base Variables](#base-variables)
9-
2. [Mapping Variables](#mapping-variables)
9+
2. [STAR Variables](#star-variables)
1010
3. [FeatureCounts Variables](#featurecounts-variables)
1111
4. [Gene Extension Variables](#gene-extension-variables)
1212
5. [Taxonomic-classification Variables](#taxonomic-classification-variables)
@@ -28,16 +28,17 @@ Within each custom configuration file the following variables can be defined:
2828
| `ref_gtf` | __Required__ | Path to the GTF/GFF file formatted for STARsolo. |
2929
| `ref_gtf_alt` | Optional | Path to the GTF/GFF file formatted specifically for analysis with Parse Biosciences / CellRanger pipeline. Defaults to the same path as `ref_gtf`. |
3030
| `run_method` | Optional | Method of running the pre-processing pipeline, demonstrated in the [pipeline diagram](img/Preprocs_Pipeline.png), currently either `"standard"` or `"geneext_only"`. Default is set to `"standard"`. |
31+
| `mapping_software` | Optional | Software used to map reads (must be one of the following: `"starsolo"`, `"alevin"`, `"both"/"alevin_starsolo"` or `"alevin_subsampled_starsolo"`). Default set to `"starsolo"`. |
3132
| `perform_demultiplexing` | Optional | Boolean flag to enable or disable demultiplexing of the FASTQ files, where applicable. Default is `true`. |
3233
| `seqspec_file` | Optional | Path to the seqspec file. |
34+
| `subsample_nreads` | Optional | The size (number of reads) of the subset used to map to STARsolo, in case the parameter `mapping_software = alevin_subsampled_starsolo`. Default set to `100000000` reads. |
3335

3436

3537

36-
## Mapping Variables
38+
## STAR Variables
3739

3840
| Variable | Required/Optional | Description |
3941
|------------------------|-------------------|-------------|
40-
| `mapping_software` | Optional | Software used to map reads (must be one of the following: `"starsolo"`, `"alevin"` or `"both"`). Default set to `"starsolo"`. |
4142
| `star_index` | Optional | Path to the pre-generated STAR index. By default the STAR index is created within the pipeline.|
4243
| `star_genomeSAindexNbases` | Optional | Lenght of the SA pre-indexing string in STAR. See [protocol-specific defaults](../conf/seqtech_parameters.config) set in the seqtech_paramaters.config file. |
4344
| `star_genomeSAsparseD` | Optional | Suffix array sparsity in STAR. See [protocol-specific defaults](../conf/seqtech_parameters.config) set in the seqtech_paramaters.config file. |
@@ -76,7 +77,8 @@ Within each custom configuration file the following variables can be defined:
7677

7778
| Variable | Required/Optional | Description |
7879
|------------------------|-------------------|-------------|
79-
| `perform_10x_saturate` | Optional | Boolean flag to enable or disable the 10x_saturate step after mapping. Default is `true`. |
80+
| `perform_10x_saturate` | Optional | Boolean flag to enable or disable the 10x_saturate step after mapping. Default is `true`. |
81+
| `saturation_target` | Optional | The saturation target fraction used to predict the input reads needed. Default set to `0.7`. |
8082

8183

8284

@@ -94,6 +96,7 @@ Within each custom configuration file the following variables can be defined:
9496
| Variable | Required/Optional | Description |
9597
|------------------------|-------------------|-------------|
9698
| `perform_cellbender` | Optional | Boolean flag to enable or disable removal of ambient RNA using CellBender. Default is `false`. |
99+
| `cellbender_extraargs` | Optional | Provide extra arguments to the CellBender function as a string. Refer to the [CellBender manual](https://cellbender.readthedocs.io/en/latest/reference/index.html) for options. |
97100

98101

99102

main.nf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ workflow BCA_PREPROCESSING {
7272
filter_out = filtering_workflow(QC_mapping_workflow.out.starsolo_genefull50_raw)
7373

7474
reporting_workflow(
75-
samplesheet,
75+
preprocessing_workflow.out.merged_samplesheet,
7676
SAVE_RUN_CONFIG.out.samplesheet,
7777
SAVE_RUN_CONFIG.out.run_config,
7878
QC_mapping_workflow.out.star_final_log,

modules/local/tools/featurecounts/main.nf

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ process CALC_MT_RRNA {
77
container "oras://community.wave.seqera.io/library/samtools_subread:f5fd17c543add0fd"
88

99
input:
10-
tuple val(meta), path(mapping_files)
10+
tuple val(meta), path(bam_file)
1111
file(bam_index)
1212

1313
output:
@@ -17,12 +17,10 @@ process CALC_MT_RRNA {
1717
"""
1818
echo "\n\n================== CALCULATION rRNA & mtDNA =================="
1919
echo "Sample ID: ${meta}"
20-
21-
bam_file=\$(ls ${meta.id}_Aligned.sortedByCoord.out.bam | head -n 1)
22-
echo "BAM file: \${bam_file}"
20+
echo "BAM file: ${bam_file}"
2321
2422
calculate_rrna_mtdna.sh \\
25-
\${bam_file} \\
23+
${bam_file} \\
2624
${meta.id}_mt_rrna_metrics.txt \\
2725
${params.ref_gtf} \\
2826
${params.grep_rrna} \\

modules/local/tools/geneext/main.nf

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ process GENE_EXT {
77
conda "${moduleDir}/environment.yml"
88

99
input:
10-
tuple val(meta), path(mapping_files)
10+
tuple val(meta), path(bam_file)
1111
file(bam_index)
1212

1313
output:
@@ -18,29 +18,27 @@ process GENE_EXT {
1818
echo "\n\n================== GENE EXTENSION =================="
1919
echo "Sample ID: ${meta}"
2020
echo "BAM index: ${bam_index}"
21+
echo "BAM file: ${bam_file}"
2122
echo "Original GTF: ${params.ref_gtf}"
2223
2324
# Remove temporary directory if it exists
2425
if [ -d "tmp" ]; then rm -r tmp; fi
2526
2627
# Extract file extension
2728
extension=\$(echo "${params.ref_gtf}" | awk -F. '{print \$NF}')
28-
2929
if [ \$extension == "gff" ];
3030
then
3131
gtf_output="${meta.id}_geneext.gff"
3232
else
3333
gtf_output="${meta.id}_geneext.gtf"
3434
fi
3535
echo \${gtf_output}
36-
bam_file=\$(ls *_Aligned.sortedByCoord.out.bam | head -n 1)
3736
3837
# Run GeneExt
3938
python ${projectDir}/submodules/GeneExt/geneext.py \\
4039
-g ${params.ref_gtf} \\
41-
-b \${bam_file} \\
40+
-b ${bam_file} \\
4241
-o \${gtf_output} \\
4342
-j 4
44-
4543
"""
4644
}
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
channels:
2+
- conda-forge
3+
- bioconda
4+
5+
dependencies:
6+
- seqtk >=1.4
7+
- pigz

modules/local/tools/seqtk/main.nf

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
process SUBSAMPLE_FASTQS {
2+
tag "${meta.id}"
3+
label 'process_low'
4+
5+
conda "${moduleDir}/environment.yml"
6+
7+
input:
8+
tuple val(meta), path(fastq_cDNA), path(fastq_BC_UMI), path(fastq_indices), path(input_file)
9+
10+
output:
11+
tuple val(meta), path("${meta.id}_subsampled_cDNA.fastq.gz"), path("${meta.id}_subsampled_BC_UMI.fastq.gz"), path("${meta.id}_subsampled_indices/"), path(input_file)
12+
13+
script:
14+
"""
15+
echo "================== SUBSAMPLE FASTQs =================="
16+
echo "Sample ID: ${meta.id}"
17+
echo "Subsampling to ${params.subsample_nreads} reads"
18+
19+
SEED=100
20+
NREADS=${params.subsample_nreads}
21+
# COMPRESSOR="pigz -p ${task.cpus}"
22+
COMPRESSOR="gzip"
23+
24+
echo "[1/2] Subsampling paired reads..."
25+
seqtk sample -s\$SEED ${fastq_cDNA} \$NREADS | \$COMPRESSOR > ${meta.id}_subsampled_cDNA.fastq.gz
26+
seqtk sample -s\$SEED ${fastq_BC_UMI} \$NREADS | \$COMPRESSOR > ${meta.id}_subsampled_BC_UMI.fastq.gz
27+
28+
mkdir -p ${meta.id}_subsampled_indices
29+
30+
if [ -n "${fastq_indices}" ]; then
31+
echo "[2/2] Subsampling index reads..."
32+
for idx in ${fastq_indices}; do
33+
base=\$(basename \$idx .fastq.gz)
34+
seqtk sample -s\$SEED \$idx \$NREADS | \$COMPRESSOR > ${meta.id}_subsampled_indices/${meta.id}_subsampled_\${base}.fastq.gz
35+
done
36+
else
37+
echo "No index FASTQs provided. Skipping index subsampling."
38+
fi
39+
"""
40+
}

nextflow.config

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,8 @@ nextflow.enable.dsl = 2
1212
params {
1313

1414
// Input parameters
15-
run_method = "standard"
15+
run_method = "standard" // "standard", "geneext_only", "exteral_pipeline_only"
16+
mapping_software = "starsolo" // "starsolo", "alevin", "both"/"alevin_starsolo", "alevin_subsampled_starsolo"
1617
input = null
1718
outdir = "../outdir"
1819
protocol = null
@@ -29,8 +30,14 @@ params {
2930
perform_geneext = false
3031
geneext_gtf = null
3132

33+
// Subsampling parameters
34+
subsample_nreads = 100000000
35+
36+
// 10x Genomics specific parameters
3237
perform_10x_saturate = true
3338
saturation_target = 0.7
39+
40+
// Demultiplexing parameters
3441
perform_demultiplexing = true
3542
bcl2fastq = null
3643

@@ -39,10 +46,6 @@ params {
3946
fastp_length_required = null
4047
fastp_qualified_quality_phred = null
4148

42-
// Mapping parameters (values are set in conf/seqtech_parameters.config)
43-
mapping_software = "starsolo" // "starsolo", "alevin" or "both"
44-
mt_contig = "chrM M MT"
45-
4649
// STAR specific parameters
4750
star_index = null
4851
star_genomeSAindexNbases = null
@@ -66,6 +69,7 @@ params {
6669
// Feature counting parameters
6770
perform_featurecounts = false
6871
grep_rrna = "rRNA"
72+
mt_contig = "chrM M MT"
6973

7074
// Taxonomic classification parameters
7175
perform_kraken = false
@@ -265,7 +269,7 @@ manifest {
265269
homePage = 'https://github.com/biodiversitycellatlas/bca_preprocessing'
266270
description = 'Biodiversity Cell Atlas Pre-processing Pipeline'
267271
mainScript = 'main.nf'
268-
version = '0.2.0'
272+
version = '0.2.1'
269273
nextflowVersion = '>=21.04.0'
270274
}
271275

subworkflows/local/mapping/mapping_starsolo.nf

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -125,33 +125,33 @@ workflow mapping_starsolo_workflow {
125125
// Calculate percentages mitochondrial DNA and ribosomal RNA
126126
if (params.perform_featurecounts) {
127127
// Join STARsolo files with samtools index
128-
STARSOLO_ALIGN.out.starsolo_files
128+
ch_starsolo_bam
129129
.join(SAMTOOLS_INDEX.out.bam_index)
130-
.multiMap { meta, star_files, bai ->
131-
star_ch: [meta, star_files]
130+
.multiMap { meta, bam_file, bai ->
131+
bam_ch: [meta, bam_file]
132132
bai_ch: [meta, bai]
133133
}
134134
.set { ch_fc_inputs }
135135

136136
// Run featureCounts to calculate mtDNA and rRNA percentages and capture output
137-
CALC_MT_RRNA(ch_fc_inputs.star_ch, ch_fc_inputs.bai_ch)
137+
CALC_MT_RRNA(ch_fc_inputs.bam_ch, ch_fc_inputs.bai_ch)
138138
ch_featurecounts = CALC_MT_RRNA.out
139139
}
140140

141141
// Gene Extension
142142
if (params.perform_geneext || params.run_method == "geneext_only") {
143143

144144
// Join inputs for GENE_EXT
145-
STARSOLO_ALIGN.out.starsolo_files
145+
ch_starsolo_bam
146146
.join(SAMTOOLS_INDEX.out.bam_index)
147-
.multiMap { meta, star_files, bai ->
148-
star_ch: [meta, star_files]
147+
.multiMap { meta, bam_file, bai ->
148+
bam_ch: [meta, bam_file]
149149
bai_ch: [meta, bai]
150150
}
151151
.set { ch_geneext_inputs }
152152

153153
// Run gene extension using GeneExt
154-
GENE_EXT(ch_geneext_inputs.star_ch, ch_geneext_inputs.bai_ch)
154+
GENE_EXT(ch_geneext_inputs.bam_ch, ch_geneext_inputs.bai_ch)
155155

156156
// Remap STARsolo with extended GTF if run_method is not "geneext_only"
157157
if (params.run_method != "geneext_only") {

workflows/mapping_workflow.nf

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ include { mapping_starsolo_workflow } from '../subworkflows/local/mapping/mappin
1111
include { mapping_alevin_workflow } from '../subworkflows/local/mapping/mapping_alevin'
1212

1313
include { FASTQC } from '../modules/local/tools/fastqc/main'
14+
include { SUBSAMPLE_FASTQS } from '../modules/local/tools/seqtk/main'
1415

1516

1617
/*
@@ -72,10 +73,20 @@ workflow QC_mapping_workflow {
7273
ch_alevin_quant_json = mapping_alevin_workflow.out.af_quant_json
7374
ch_alevin_cell_meta = mapping_alevin_workflow.out.af_cell_meta
7475

75-
} else if (params.mapping_software == "both") {
76-
mapping_starsolo_workflow(data_output, bc_whitelist)
76+
} else if (params.mapping_software == "both" || params.mapping_software == "alevin_subsampled_starsolo" || params.mapping_software == "alevin_starsolo") {
77+
7778
mapping_alevin_workflow(data_output, bc_whitelist)
7879

80+
// If 'alevin_subsampled_starsolo' is selected, run STARsolo on a subsampled dataset
81+
if (params.mapping_software == "alevin_subsampled_starsolo") {
82+
SUBSAMPLE_FASTQS(data_output)
83+
mapping_starsolo_workflow(SUBSAMPLE_FASTQS.out, bc_whitelist)
84+
85+
// If 'both'/'alevin_starsolo' is selected, run STARsolo on the full dataset
86+
} else if (params.mapping_software == "both" || params.mapping_software == "alevin_starsolo") {
87+
mapping_starsolo_workflow(data_output, bc_whitelist)
88+
}
89+
7990
ch_mapping_files = mapping_alevin_workflow.out.mapping_files.mix(mapping_starsolo_workflow.out.mapping_files)
8091
ch_starsolo_bam = mapping_starsolo_workflow.out.starsolo_bam
8192
ch_star_solodir = mapping_starsolo_workflow.out.star_solodir

0 commit comments

Comments
 (0)