Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add new phoenix qc entrypoint #93

Open
wants to merge 2 commits into
base: v2.0.0
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
161 changes: 161 additions & 0 deletions bin/check_samplesheet_qc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
#!/usr/bin/env python

# This script is based on the example at: https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv

import os
import sys
import errno
import argparse
import gzip
import re

def parse_args(args=None):
    """Parse command-line arguments for the samplesheet checker.

    Args:
        args: Optional list of argument strings (defaults to ``sys.argv[1:]``).

    Returns:
        argparse.Namespace with ``FILE_IN`` and ``FILE_OUT`` attributes.
    """
    parser = argparse.ArgumentParser(
        description="Reformat cdcgov/phoenix samplesheet file and check its contents.",
        epilog="Example usage: python check_samplesheet.py <FILE_IN> <FILE_OUT>",
    )
    parser.add_argument("FILE_IN", help="Input samplesheet file.")
    parser.add_argument("FILE_OUT", help="Output file.")
    return parser.parse_args(args)


def make_dir(path):
    """Create *path* (including parents) if it does not already exist.

    An empty *path* is a no-op. Errors other than "already exists"
    (permissions, read-only filesystem, ...) still propagate.
    """
    if path:
        # exist_ok=True replaces the old try/except errno.EEXIST dance
        # with the idiomatic stdlib equivalent (same observable behavior).
        os.makedirs(path, exist_ok=True)


def print_error(error, context="Line", context_str=""):
    """Print a samplesheet validation error and exit with status 1.

    Args:
        error: Description of the problem.
        context: Label for the offending location (e.g. "Line").
        context_str: The offending text itself; shown stripped.
    """
    message = "ERROR: Please check samplesheet -> {}".format(error)
    if context != "" and context_str != "":
        message = "ERROR: Please check samplesheet -> {}\n{}: '{}'".format(
            error, context.strip(), context_str.strip()
        )
    print(message)
    sys.exit(1)


def check_samplesheet(file_in, file_out):
    """
    Check that the samplesheet follows the expected structure and write a
    validated copy (with a ``single_end`` column inserted) to *file_out*.

    Expected input structure::

        sample,fastq_1,fastq_2,fastp_pass_json,fastp_failed_json,spades,mlst,quast,amrfinderplus
        SAMPLE_PE,SAMPLE_PE_RUN1_1.fastq.gz,SAMPLE_PE_RUN1_2.fastq.gz,passed_fastp.json,failed.fastp.json,spades.fasta,mlst.tsv,quast.tsv,amrfinderplus.tsv

    sample             A sample name for the input
    fastq_1            R1 of reads run through Fastp
    fastq_2            R2 of reads run through Fastp
    fastp_pass_json    JSON output from initial Fastp run
    fastp_failed_json  JSON output from rerun of Fastp on failed reads
    spades             Assembly created by SPAdes
    mlst               TSV output from mlst tool
    quast              TSV report generated from quast
    amrfinderplus      TSV report generated from amrfinderplus

    Exits with status 1 (via print_error) on any validation failure.
    """
    HEADER = [
        "sample", "fastq_1", "fastq_2", "fastp_pass_json", "fastp_failed_json",
        "spades", "mlst", "quast", "amrfinderplus",
    ]
    MIN_COLS = 2  # minimum number of *populated* columns per data row

    sample_mapping_dict = {}
    with open(file_in, "r") as fin:

        ## Check header
        header = [x.strip('"') for x in fin.readline().strip().split(",")]
        if header[: len(HEADER)] != HEADER:
            print("ERROR: Please check samplesheet header -> {} != {}".format(",".join(header), ",".join(HEADER)))
            sys.exit(1)

        ## Check sample entries
        for line in fin:
            lspl = [x.strip().strip('"') for x in line.strip().split(",")]

            # Check valid number of columns per row
            if len(lspl) < len(HEADER):
                print_error(
                    "Invalid number of columns (minimum = {})!".format(len(HEADER)),
                    "Line",
                    line,
                )
            num_cols = len([x for x in lspl if x])
            if num_cols < MIN_COLS:
                print_error(
                    "Invalid number of populated columns (minimum = {})!".format(MIN_COLS),
                    "Line",
                    line,
                )

            ## Check sample name entries
            (
                sample, fastq_1, fastq_2, fastp_pass_json, fastp_failed_json,
                spades, mlst, quast, amrfinderplus,
            ) = lspl[: len(HEADER)]
            sample = sample.replace(" ", "_")
            if not sample:
                print_error("Sample entry has not been specified!", "Line", line)

            ## Check FastQ file extension; uncompressed FastQ files are
            ## gzipped on the spot so downstream tools see .gz inputs.
            for fastq in [fastq_1, fastq_2]:
                if fastq:
                    if " " in fastq:
                        print_error("FastQ file contains spaces!", "Line", line)
                    if not fastq.endswith(".fastq.gz") and not fastq.endswith(".fq.gz"):
                        with open(fastq, "rb") as f_in, gzip.open(fastq + ".gz", "wb") as f_out:
                            f_out.writelines(f_in)
                        # FIX: this message was previously given print_error-style
                        # positional arguments to print(), producing garbled output.
                        print(
                            "FastQ file does not have extension '.fastq.gz' or "
                            "'.fq.gz'! Zipping file.\nLine: '{}'".format(line.strip())
                        )

            ## Auto-detect paired-end/single-end ("0" = paired, "1" = single)
            if sample and fastq_1 and fastq_2:  ## Paired-end short reads
                single_end = "0"
            elif sample and fastq_1 and not fastq_2:  ## Single-end short reads
                single_end = "1"
            else:
                print_error("Invalid combination of columns provided!", "Line", line)
            sample_info = [
                single_end, fastq_1, fastq_2, fastp_pass_json, fastp_failed_json,
                spades, mlst, quast, amrfinderplus,
            ]

            ## Create sample mapping dictionary = { sample: [ [single_end, fastq_1, ...], ... ] }
            ## Identical rows for the same sample are rejected as duplicates.
            if sample not in sample_mapping_dict:
                sample_mapping_dict[sample] = [sample_info]
            elif sample_info in sample_mapping_dict[sample]:
                print_error("Samplesheet contains duplicate rows!", "Line", line)
            else:
                sample_mapping_dict[sample].append(sample_info)

    ## Write validated samplesheet with appropriate columns
    if len(sample_mapping_dict) > 0:
        out_dir = os.path.dirname(file_out)
        if out_dir:  # make_dir("") is a no-op anyway; skip the call entirely
            make_dir(out_dir)
        with open(file_out, "w") as fout:
            fout.write(
                ",".join(
                    ["sample", "single_end", "fastq_1", "fastq_2", "fastp_pass_json",
                     "fastp_failed_json", "spades", "mlst", "quast", "amrfinderplus"]
                )
                + "\n"
            )
            for sample in sorted(sample_mapping_dict.keys()):

                ## Check that multiple runs of the same sample are of the same datatype
                if not all(x[0] == sample_mapping_dict[sample][0][0] for x in sample_mapping_dict[sample]):
                    print_error(
                        "Multiple runs of a sample must be of the same datatype!",
                        "Sample: {}".format(sample),
                    )

                for val in sample_mapping_dict[sample]:
                    ## Rewrite plain FastQ names to the gzipped copies created above.
                    ## FIX: the '.' in these patterns is now escaped so the regex
                    ## only matches a literal extension (previously ".fastq$"
                    ## matched any character before "fastq").
                    for idx in (1, 2):
                        if val[idx] and not val[idx].endswith(".gz"):
                            val[idx] = re.sub(r"\.fastq$", ".fastq.gz", val[idx])
                            val[idx] = re.sub(r"\.fq$", ".fq.gz", val[idx])
                    fout.write(",".join([sample] + val) + "\n")
    else:
        print_error("No entries to process!", "Samplesheet: {}".format(file_in))


def main(args=None):
    """Script entry point: parse arguments, then validate the samplesheet."""
    parsed = parse_args(args)
    check_samplesheet(parsed.FILE_IN, parsed.FILE_OUT)


if __name__ == "__main__":
    sys.exit(main())
9 changes: 9 additions & 0 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ WorkflowMain.initialise(workflow, params, log)
*/

include { PHOENIX_EXTERNAL } from './workflows/phoenix'
include { PHOENIX_QC_EXTERNAL } from './workflows/phoenix_qc'
include { PHOENIX_EXQC } from './workflows/cdc_phoenix'
include { SRA_PHOENIX } from './workflows/sra_phoenix'
include { SCAFFOLD_EXTERNAL } from './workflows/scaffolds'
Expand All @@ -43,6 +44,14 @@ workflow PHOENIX {
gamma_ar = PHOENIX_EXTERNAL.out.gamma_ar
}

//
// WORKFLOW: Run QC steps from main cdcgov/phoenix analysis pipeline
//
workflow PHOENIX_QC {
    main:
        // Delegates all QC work to the external workflow included from
        // ./workflows/phoenix_qc; takes no inputs and emits nothing here.
        PHOENIX_QC_EXTERNAL ()
}

//
// WORKFLOW: Run internal version of cdcgov/phoenix analysis pipeline that includes BUSCO, SRST2 and KRAKEN_ASMBLED
//
Expand Down
28 changes: 28 additions & 0 deletions modules/local/samplesheet_qc_check.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
// Validate/reformat the QC samplesheet by running the bundled
// check_samplesheet_qc.py; emits the validated CSV plus a versions.yml.
process SAMPLESHEET_QC_CHECK {
    tag "$samplesheet"  // label task in logs with the samplesheet name
    label 'process_low' // lightweight: one short python invocation
    container 'quay.io/jvhagey/phoenix:base_v1.0.0'

    /*container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/python:3.8.3' :
        'quay.io/biocontainers/python:3.8.3' }"*/

    input:
    path samplesheet

    output:
    path '*.valid.csv' , emit: csv      // validated, reformatted samplesheet
    path "versions.yml", emit: versions // python version provenance

    script: // This script is bundled with the pipeline, in cdcgov/phoenix/bin/
    """
    check_samplesheet_qc.py \\
        $samplesheet \\
        samplesheet.valid.csv

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        python: \$(python --version | sed 's/Python //g')
    END_VERSIONS
    """
}
110 changes: 110 additions & 0 deletions subworkflows/local/input_qc_check.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
//
// Check input samplesheet and get read channels
//

include { SAMPLESHEET_QC_CHECK } from '../../modules/local/samplesheet_qc_check'

// Validate the QC samplesheet, then fan each CSV row out into one channel
// per upstream-tool output so downstream QC steps receive tuples shaped
// like the ones the main pipeline's processes emit.
workflow INPUT_QC_CHECK {
    take:
    samplesheet // file: /path/to/samplesheet.csv

    main:
    // Run the python validator, then parse each validated CSV row into
    // [ meta, reads, fastp_pass_json, fastp_failed_json, spades, mlst, quast, amrfinderplus ]
    SAMPLESHEET_QC_CHECK ( samplesheet )
        .csv
        .splitCsv ( header:true, sep:',' )
        .map { create_qc_channels(it) }
        .set { ch_samples }

    // NOTE(review): every remap below hard-codes single_end:true in the meta,
    // even though create_qc_channels records the real meta.single_end — confirm
    // this is intentional for the downstream QC processes.

    // Create channels to match upstream processes
    // FASTP_TRIMD.out.reads -> tuple val(meta), path('*.trim.fastq.gz'), optional:true, emit: reads
    ch_samples.map{meta, reads, fastp_pass_json, fastp_failed_json, spades, mlst, quast, amrfinderplus ->
        [ [id:meta.id, single_end:true], reads]
    }.set { ch_reads }

    // GATHERING_READ_QC_STATS: tuple val(meta), path(fastp_trimd_json), path(fastp_singles_json)
    ch_samples.map{meta, reads, fastp_pass_json, fastp_failed_json, spades, mlst, quast, amrfinderplus ->
        [ [id:meta.id, single_end:true], fastp_pass_json, fastp_failed_json]
    }.set { ch_fastp_json }

    // SPADES_WF.out.spades_ch -> SPADES.out.scaffolds.map{meta, scaffolds -> [ [id:meta.id, single_end:true], scaffolds]}
    ch_samples.map{meta, reads, fastp_pass_json, fastp_failed_json, spades, mlst, quast, amrfinderplus ->
        [ [id:meta.id, single_end:true], spades]
    }.set { ch_spades }

    // MLST.out.tsv -> tuple val(meta), path("*.tsv"), emit: tsv
    ch_samples.map{meta, reads, fastp_pass_json, fastp_failed_json, spades, mlst, quast, amrfinderplus ->
        [ [id:meta.id, single_end:true], mlst]
    }.set { ch_mlst }

    // QUAST.out.report_tsv -> tuple val(meta), path("*.tsv"), emit: tsv
    ch_samples.map{meta, reads, fastp_pass_json, fastp_failed_json, spades, mlst, quast, amrfinderplus ->
        [ [id:meta.id, single_end:true], quast]
    }.set { ch_quast }

    // AMRFINDERPLUS_RUN.out.report -> tuple val(meta), path("${meta.id}_all_genes.tsv"), emit: report
    ch_samples.map{meta, reads, fastp_pass_json, fastp_failed_json, spades, mlst, quast, amrfinderplus ->
        [ [id:meta.id, single_end:true], amrfinderplus]
    }.set { ch_amrfinderplus }

    emit:
    reads             = ch_reads             // per-sample trimmed reads
    fastp_json        = ch_fastp_json        // passed + failed Fastp JSON pair
    spades            = ch_spades            // SPAdes assembly
    mlst              = ch_mlst              // MLST TSV
    quast             = ch_quast             // QUAST report TSV
    amrfinderplus     = ch_amrfinderplus     // AMRFinder+ report TSV
    valid_samplesheet = SAMPLESHEET_QC_CHECK.out.csv
    versions          = SAMPLESHEET_QC_CHECK.out.versions // channel: [ versions.yml ]
}

// Function to get list of [ meta, [ fastq_1, fastq_2 ] ]
def create_qc_channels(LinkedHashMap row) {
def meta = [:]
meta.id = row.sample
meta.single_end = row.single_end.toBoolean()
missing_input = false

def array = []
if (!file(row.fastq_1).exists()) {
exit 1, "ERROR: Please check input samplesheet -> Read 1 FastQ file does not exist!\n${row.fastq_1}"
}

if (!meta.single_end) {
if (!file(row.fastq_2).exists()) {
exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${row.fastq_2}"
}
}

// Check remaining files
if (!file(row.fastp_pass_json).exists()) {
exit 1, "ERROR: Please check input samplesheet -> Fastp passed reads JSON file does not exist!\n${row.fastp_pass_json}"
}

if (!file(row.fastp_failed_json).exists()) {
exit 1, "ERROR: Please check input samplesheet -> Fastp failed reads JSON file does not exist!\n${row.fastp_failed_json}"
}

if (!file(row.spades).exists()) {
exit 1, "ERROR: Please check input samplesheet -> SPAdes assembly file does not exist!\n${row.spades}"
}

if (!file(row.mlst).exists()) {
exit 1, "ERROR: Please check input samplesheet -> MLST TSV report file does not exist!\n${row.mlst}"
}

if (!file(row.quast).exists()) {
exit 1, "ERROR: Please check input samplesheet -> QUAST TSV report file does not exist!\n${row.quast}"
}

if (!file(row.amrfinderplus).exists()) {
exit 1, "ERROR: Please check input samplesheet -> AMRFinder+ report file does not exist!\n${row.amrfinderplus}"
}

if (meta.single_end) {
array = [ meta, [ file(row.fastq_1) ], file(row.fastp_pass_json), file(row.fastp_failed_json), file(row.spades), file(row.mlst), file(row.quast), file(row.amrfinderplus) ]
} else {
array = [ meta, [ file(row.fastq_1), file(row.fastq_2) ], file(row.fastp_pass_json), file(row.fastp_failed_json), file(row.spades), file(row.mlst), file(row.quast), file(row.amrfinderplus) ]
}

return array
}
1 change: 0 additions & 1 deletion workflows/phoenix_qc.nf
Original file line number Diff line number Diff line change
Expand Up @@ -302,4 +302,3 @@ workflow.onComplete {
========================================================================================
THE END
========================================================================================
*/