zellerlab · cschu · Nov 10, 2021 · Nov 10, 2021 · Nov 10, 2021 · Nov 10, 2021
diff --git a/README.md b/README.md
@@ -8,13 +8,25 @@
 Other dependencies:
 * bbmap 
 * fastqc
+* seqtk
 * figaro
 * R v4+ with dada2, devtools, tidyverse, and cowplot installed
+* MapSeq
 
 For convenience, `gaga2` comes with a Singularity container with all dependencies installed. 
 
 ```singularity pull oras://ghcr.io/zellerlab/gaga2:latest```
 
+#### MapSeq
+While most dependencies can be installed via conda, `MapSeq` can not. If you don't want / can't use the `gaga2` singularity container, you need to make `gaga2` aware of your own MapSeq installation. In this case, please modify the following entries in the `run.config`:
+* `mapseq_bin` should be the path to your `mapseq` binary; if `mapseq` is in your `$PATH`, you can leave this entry unchanged.
+* `mapseq_db_path` is the `dirname` of the path to your `mapseq` database, i.e. for `/path/to/mapseq_db_folder/` it is `/path/to/`.
+* `mapseq_db_name` is the `basename` of the path to your `mapseq` database, i.e. for `/path/to/mapseq_db_folder/` it is `mapseq_db_folder`.
+
+#### Custom MapSeq databases.
+
+If you use the MapSeq as provided by the gaga2 container, you can modify the `mapseq_db_path` and `mapseq_db_name` parameters to point at a custom database. This custom database will be used in addition to MapSeq's own database and is currently limited to a single taxonomy. Please note, that the database files need to be named `<mapseq_db_name>.fasta` and `<mapseq_db_name>.tax`.
+
 
 ## Usage instructions
 
@@ -28,22 +40,22 @@ They should, but don't have to, be arranged in a sample-based directory structur
 ```
 <project_dir> (aka "input_dir")
      |___ <sample_1>
-     |        |____ <sample_1_forward_reads>
-     |        |____ <sample_2_reverse_reads>
+     |        |____ <sample_1_forward_reads>_R?1.{fastq,fq,fastq.gz,fq.gz}
+     |        |____ <sample_2_reverse_reads>_R?2.{fastq,fq,fastq.gz,fq.gz}
      |
      |___ <sample_2>
      |        |____ <empty samples will be ignored>
      |        
      |___ <sample_n>
-              |____ <sample_n_forward_reads>
-              |____ <sample_n_reverse_reads>
+              |____ <sample_n_forward_reads>_R?1.{fastq,fq,fastq.gz,fq.gz}
+              |____ <sample_n_reverse_reads>_R?2.{fastq,fq,fastq.gz,fq.gz}
 ```
 
 A flat directory structure (with all read files in the same directory) or a deeply-branched (with read files scattered over multiple levels) should also work. 
 
 If `gaga2` preprocesses the reads, it will automatically use `_R1/2` endings internally.
 
-* If input reads have already been preprocessed, you can set the `--preprocessed` flag. In this case, `gaga2` will do no preprocessing at all and instruct `dada2` to perform no trimming. Otherwie, `gaga2` will assess the read lengths for uniformity. If read lengths differ within and between samples, preprocessing with `figaro` is not possible and `dada2` will be run without trimming. 
+* If input reads have already been preprocessed, you can set the `--preprocessed` flag. In this case, `gaga2` will do no preprocessing at all and instruct `dada2` to perform no trimming. Otherwise, `gaga2` will preprocess the reads with `bbduk`. If primer sequences are given via `--primers <fwd_primer>[,<rev_primer>]`, `bbduk` will attempt to remove them + any up/downstream adapter remains. Otherwise, `bbduk` will attempt to remove adapter remains only. Reads are then cut to a uniform length (all forward reads across all samples and all reverse reads across all samples need to have the length, but forward and reverse reads may differ in length.) The latter treatment is a requirement for `figaro`, which will then take the primer lengths (supplied via `--left_primer` and `--right_primer`) into account when determining the optimal cut sites.
 
 * Samples with less than `110` reads after `dada2` preprocessing, will be discarded.
 
@@ -61,6 +73,8 @@ before.
 
 In addition, you should obtain a copy of the `run.config` from the `gaga2` github repo and modify it according to your environment.
 
+Additional arguments can be set via the command line (e.g. `--input_dir /path/to/input_dir`) or in the params section of the `run.config` file (recommended for documentation and reproducibility purposes.)
+
 #### Mandatory arguments
 * `--input_dir` is the project directory mentioned above.
 * `--output_dir` will be created automatically.
@@ -71,6 +85,10 @@ In addition, you should obtain a copy of the `run.config` from the `gaga2` githu
 * `--min_overlap` of read pairs is `20bp` by default
 * `--primers <comma-separated-list-of-primer-sequences>` or `--left_primer`, and `--right_primer` If primer sequences are provided via `--primers`, `gaga2` will remove primers and upstream sequences (using `bbduk`), such as adapters based on the primer sequences. If non-zero primer lengths are provided instead (via `--left_primer` and `--right_primer`), `figaro` will take those into account when determining the best trim positions.
 * `--preprocessed` will prevent any further preprocessing by `gaga2` - this flag should only be used if the read data is reliably clean.
+* `--long_reads` should be only set if individual reads (i.e., forward and/or reverse) are known to be of equal length or longer than the amplicon. This will trigger primer/adapter removal (if enabled) on the 3'-ends in order to get rid of readthroughs.
+* `--qc_minlen` sets the minimum length for reads to be retained after preprocessing.
+* `--dada2_chimera_method` sets the method with which `dada2` will remove chimeras (default: `"consensus"`, alternative: `"pool"`)
+* `--dada2_chimera_min_fold_parent_over_abundance` sets the `minFoldParentOverAbundance` parameter for `removeBimeraDeNovo` (default: `2`)
 
 
 ### internal beta-testing instructions

diff --git a/assets/adapters.fa b/assets/adapters.fa
@@ -1,3 +1,11 @@
+>AmpliconPCR16S_primer_fwd
+TCGTCGGCAGCGTCAGATGTGTATAAGAGACAGCCTACGGGNGGCWGCAG
+>AmpliconPCR16S_primer_rev
+GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAGGACTACHVGGGTATCTAATCC
+>Overhang_fwd
+TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG
+>Overhand_rev
+GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG
 >Reverse_adapter
 AGATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG
 >TruSeq_Universal_Adapter

diff --git a/config/run.config b/config/run.config
@@ -1,29 +1,46 @@
 params {
-	publish_mode = "symlink"
+	publish_mode = "copy"
 
+	// default figaro parameters
+	// overlap between read1 and read2
 	min_overlap = 20
+	// length of left primer
 	left_primer = 0
+	// length of right primer
 	right_primer = 0
 
-	/*
-		bbduk qc parameters
-		s. https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/bbduk-guide/
-		qtrim=rl trimq=3 : gentle quality trimming (only discard bases < phred 3; phred 2 = junk marker) on either side (rl) of the read
-		maq=25 : discard reads below average quality of pred 25
-		minlen=45 : discard reads < 45bp
-		ref=?? ktrim=r k=23 mink=11 hdist=1 tpe tbo : right-side k-mer based adapter clipping with 1 mismatch allowed, try overlap-detection (tbo), and trim pairs to same length (tpe) upon adapter detection
-		ftm=5 : get rid off (n*5)+1st base (last sequencing cycle illumina garbage)
-		entropy=0.5 entropywindow=50 entropyk=5 : discard low complexity sequences
-	*/
-
-	qc_params_primers = "qtrim=rl trimq=3 ktrim=l k=14 mink=1 hdist=1 cu=t"
-	qc_params_adapters = "qtrim=rl trimq=3 ktrim=l k=23 mink=1 hdist=1 tpe tbo cu=t"
+	// only keep reads of at least length = qc_minlen
 	qc_minlen = 100
 
+	// If only primer lengths are supplied, figaro/dada2 will take care of primer removal.
+	// Otherwise, if primer sequences are supplied,
+	// primer + adapter removal is a two-step process:
+	// 1. gentle quality trimming (< phred 3) + remove left primer on 5'-R1 and potentially on 3'-R2 (rc)
+	// 2. remove right primer on 5'-R2 and potentially on 3'-R1 (rc)
+
+	// Primer removal is highly dataset-specific, you might have to play with the settings below:
+	// also refer to: https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/bbduk-guide/
+	// cu=t : allow degenerate primer sequences
+	// qtrim=rl trimq=3 : gentle quality trimming (< phred 3) on both sides
+	// ktrim=(r|l) : clip adapters from right xor left end -- DO NOT MODIFY.
+	// restrictleft|restrictright : only take into account the first / last N bases for adapter clipping
+	// k=9 hdist=1: adapter/primer k-mers of length 9 have to match with at most one mismatch
+	// mink=1: at the ends of reads, perfect (mismatch=0) adapter/primer k-mer matches of length 1 are allowed (similar to cutadapt)
+  // -- to allow mismatches, set hdist2 to a positive, non-zero integer
+
+	p5_primer_params = "cu=t qtrim=rl ktrim=l trimq=3 k=9 mink=1 hdist=1 restrictleft=50"
+	p3_primer_params = "cu=t ktrim=r k=9 mink=1 hdist=1 restrictright=50"
+
+	// default settings for mapseq (when served from gaga2-singularity container)
+	// only edit these three, if you're using a local mapseq installation
 	mapseq_bin = "mapseq"
-	mapseq_db_path = projectDir
-	mapseq_db_name = ""
+	//mapseq_db_path = projectDir  // dirname of /path/to/mapseq_folder (i.e. /path/to)
+	//mapseq_db_name = "" // basename of /path/to/mapseq_folder (i.e. mapseq_folder)
+	mapseq_db_path = "/g/scb/zeller/schudoma/mapseq_db/zeller_db"
+	mapseq_db_name = "zeller_db"
 
+
+	// parameters for dada2 chimera removal -- potentially useful for troubleshooting
 	dada2_chimera_method = "consensus" // can be "consensus" (default dada2 since 1.4) or "pool"
 	dada2_chimera_min_fold_parent_over_abundance = 2
 }
@@ -47,16 +64,16 @@ process {
 		executor = "slurm"
 		errorStrategy = {task.attempt <= 3 ? "retry" : "ignore"}
 		memory = '16.GB'
-		time = '4d'
+		time = '14d'
 		maxRetries = 3
 		cpus = 8
 	}
 	withLabel: dada2 {
 		container = "oras://ghcr.io/zellerlab/gaga2:latest"
 		executor = "slurm"
 		errorStrategy = {task.attempt <= 3 ? "retry" : "ignore"}
-		memory = '16.GB'
-		time = '4d'
+		memory = {16.GB * task.attempt}
+		time = '14d'
 		maxRetries = 3
 		cpus = 8
 	}
@@ -66,13 +83,13 @@ process {
         errorStrategy = {task.attempt <= 3 ? "retry" : "ignore"}
         cpus = 4
         memory = {8.GB * task.attempt}
-        time = '2h'
+        time = '24h'
         maxRetries = 3
     }
 	withName: fastqc {
 		container = "oras://ghcr.io/zellerlab/gaga2:latest"
 		executor = "slurm"
-        errorStrategy = {task.attempt <= 3 ? "retry" : "ignore"}
+    errorStrategy = {task.attempt <= 3 ? "retry" : "ignore"}
 		cpus = 2
 		memory = {4.GB * task.attempt}
 		time = '7d'