Merge pull request #74 from kbessonov1984/master

ecTyper version 0.9.0 with improved reporting and O-antigen call rate
phac-nml · Oct 5, 2019 · bcc0e6a · bcc0e6a
2 parents 754ff4b + ec69b3e
commit bcc0e6a
Show file tree

Hide file tree

Showing 25 changed files with 303,848 additions and 105,174 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -1,6 +1,6 @@
 language: python
 python:
-  - "3.5"
+  - "3.6"
 before_install:
   - sudo apt-get update
 install:
@@ -14,7 +14,7 @@ install:
   - conda info -a
   - conda config --add channels conda-forge
   - conda config --add channels bioconda
-  - conda create -q -n test-environment python=3.5 samtools=1.8 pandas=0.23.1 bowtie2=2.3.4.1 mash=2.0 bcftools=1.8 biopython=1.70 blast=2.7.1 seqtk=1.2 pytest=3.5
+  - conda create -q -n test-environment python=3.6 samtools=1.8 pandas=0.23.1 bowtie2=2.3.4.1 mash=2.0 bcftools=1.8 biopython=1.70 blast=2.7.1 seqtk=1.2 pytest=3.5 requests=2.22.0 portalocker=1.5.1
   - conda activate test-environment
   - python setup.py install
 

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -0,0 +1,25 @@
+**v0.8.1**
+* supports E.coli species detection via 10 short 1000 bp sequences based on E.coli core genomes
+* contains signatures for the 178 O and 53 H antigen types
+* non-E.coli isolates are identified using mash screen function against entire RefSeq database of all genomes
+
+
+**v0.9.0**
+* improved O-antigen serotyping coverage of complex samples that lack some O-antigen signatures. 
+* fall back to single O-antigen signature detection when both signature pairs are not found and species is E.coli
+* improved O-antigen identification favoring presence of both alleles (e.g. wzx and wzy) to support the final call. 
+The sum of scores for both alleles of the same antigen is used in ranking now
+* verification for E.albertii species against RefSeq genomes
+* addition of Quality Control flags in the output (as an extra column in the `results.tsv`) for ease of results interpretation
+* species reporting to better resolve E.coli vs Shigella vs E.albertii cases and contamination
+* serotype prediction for all Escherichia genus to ease classification of cryptic spqeices
+* automatic download of the RefSeq sketch and associated meta data every 6 months for improved species identification
+* improved species identification for the FASTQ files. All raw reads are used for species identification
+* better complex cases handling and error recovery in cases of poor reference allele coverage. 
+* serotype reporting based on multiple evidences including reference allele database and closest reference genome
+* query length coverage default threshold lowered from 50% to 10% for truncated alleles. This greatly improved sensitivity.
+Did not found any noticeable negative effect on specificity and accuracy based on EnteroBase, PNC2018-2019 and internal datasets
+* users is warned about potential false postive result in case of non-E.coli species or assembly errors
+* wrote additional unit tests
+
+
diff --git a/README.md b/README.md
@@ -14,6 +14,8 @@
 - biopython 1.70
 - blast 2.7.1
 - seqtk 1.2
+- requests >=2.0
+- portalocker 1.5.1
 
 # Installation
 1. Get `miniconda` if you do not already have `miniconda` or `anaconda`:

diff --git a/ectyper/Data/assembly_summary_refseq.txt b/ectyper/Data/assembly_summary_refseq.txt
diff --git a/ectyper/Data/ectyper_new.json b/ectyper/Data/ectyper_new.json
diff --git a/ectyper/__init__.py b/ectyper/__init__.py
@@ -1 +1 @@
-__version__ = "0.8.1"
+__version__ = "0.9.0"
diff --git a/ectyper/commandLineOptions.py b/ectyper/commandLineOptions.py
@@ -29,7 +29,7 @@ def check_percentage(value):
     parser = argparse.ArgumentParser(
         description='ectyper v{} Prediction of Escherichia coli serotype from '
                     'raw reads'
-                    ' or assembled genome sequences'.format(__version__)
+                    ' or assembled genome sequences. The default settings are recommended.'.format(__version__)
     )
 
     parser.add_argument(
@@ -67,8 +67,8 @@ def check_percentage(value):
         "-l",
         "--percentLength",
         type=check_percentage,
-        help="Percent length required for an allele match [default 50]",
-        default=50
+        help="Percent length required for an allele match [default 10]",
+        default=10
     )
 
     parser.add_argument(
@@ -103,6 +103,13 @@ def check_percentage(value):
              "the output"
     )
 
+    parser.add_argument(
+        "-D",
+        "--debug",
+        action="store_true",
+        help="Print more detailed log including debug level messages"
+    )
+
     if args is None:
         return parser.parse_args()
     else:

diff --git a/ectyper/ectyper.py b/ectyper/ectyper.py
@@ -2,7 +2,7 @@
 """
     Predictive serotyping for _E. coli_.
 """
-import os
+import os, time
 import tempfile
 import datetime
 import json
@@ -17,6 +17,7 @@
 
 
 # setup the application logging
+
 LOG = loggingFunctions.create_logger()
 
 
@@ -26,20 +27,33 @@ def run_program():
     Creates all required files and controls function execution.
     :return: success or failure
     """
-
     args = commandLineOptions.parse_command_line()
     output_directory = create_output_directory(args.output)
 
     # Create a file handler for log messages in the output directory
     fh = logging.FileHandler(os.path.join(output_directory, 'ectyper.log'), 'w', 'utf-8')
-    fh.setLevel(logging.DEBUG)
+
+    if args.debug:
+        fh.setLevel(logging.DEBUG)
+        LOG.setLevel(logging.DEBUG)
+    else:
+        fh.setLevel(logging.INFO)
+        LOG.setLevel(logging.INFO)
+
+
     LOG.addHandler(fh)
 
-    LOG.debug(args)
+
     LOG.info("Starting ectyper v{}".format(__version__))
     LOG.info("Output_directory is {}".format(output_directory))
+    LOG.info("Command-line arguments {}".format(args))
 
+    #init RefSeq database for species identification
+    if speciesIdentification.get_refseq_mash() == False:
+        LOG.critical("MASH RefSeq sketch does not exists and was not able to be downloaded. Aborting run ...")
+        exit("No MASH RefSeq sketch file found in the default location")
     # Initialize ectyper directory for the scope of this program
+
     with tempfile.TemporaryDirectory(dir=output_directory) as temp_dir:
         LOG.info("Gathering genome files")
         raw_genome_files = genomeFunctions.get_files_as_list(args.input)
@@ -48,52 +62,58 @@ def run_program():
         # 'fasta'=[], 'fastq'=[], 'other'=[]
         raw_files_dict = genomeFunctions.identify_raw_files(raw_genome_files,
                                                             args)
-
         alleles_fasta = create_alleles_fasta_file(temp_dir)
+
         combined_fasta = \
             genomeFunctions.create_combined_alleles_and_markers_file(
                 alleles_fasta, temp_dir)
+
+        #print(combined_fasta);
+        time.sleep(100)
         bowtie_base = genomeFunctions.create_bowtie_base(temp_dir,
                                                          combined_fasta) if \
                                                          raw_files_dict['fastq'] else None
 
         # Assemble any fastq files, get final fasta list
         LOG.info("Assembling final list of fasta files")
-        all_fasta_files = genomeFunctions.assemble_fastq(raw_files_dict,
+        all_fastafastq_files_dict = genomeFunctions.assemble_fastq(raw_files_dict,
                                                          temp_dir,
                                                          combined_fasta,
                                                          bowtie_base,
                                                          args)
 
         # Verify we have at least one fasta file. Optionally species ID.
         # Get a tuple of ecoli and other genomes
-        (ecoli_genomes, other_genomes_dict) = speciesIdentification.verify_ecoli(
-            all_fasta_files,
+        (ecoli_genomes_dict, other_genomes_dict) = speciesIdentification.verify_ecoli(
+            all_fastafastq_files_dict,
             raw_files_dict['other'],
             args,
             temp_dir)
 
-        LOG.info("Standardizing the genome headers based on file names")
-        final_fasta_files = genomeFunctions.get_genome_names_from_files(
-            ecoli_genomes,
-            temp_dir,
-            args
-            )
 
-        # Main prediction function
-        predictions_dict = run_prediction(final_fasta_files,
+        LOG.info("Standardizing the E.coli genome headers based on file names") #e.g. lcl|Escherichia_O26H11|17
+        #final_fasta_files \
+        predictions_dict={}
+        if ecoli_genomes_dict:
+            ecoli_genomes_dict = genomeFunctions.get_genome_names_from_files(
+                ecoli_genomes_dict,
+                temp_dir,
+                args
+                )
+            # Main prediction function
+            predictions_dict = run_prediction(ecoli_genomes_dict, #final_fasta_files
                                           args,
                                           alleles_fasta,
                                           temp_dir)
 
-        # Add empty rows for genomes without a blast result
+        # Add empty rows for genomes without a blast result or non-E.coli samples that did not undergo typing
         final_predictions = predictionFunctions.add_non_predicted(
             raw_genome_files, predictions_dict, other_genomes_dict)
 
         # Store most recent result in working directory
         LOG.info("Reporting results:\n")
 
-        predictionFunctions.report_result(final_predictions,
+        predictionFunctions.report_result(final_predictions, output_directory,
                                           os.path.join(output_directory,
                                                        'output.tsv'))
         LOG.info("\nECTyper has finished successfully.")
@@ -119,14 +139,14 @@ def create_output_directory(output_dir):
         ])
         out_dir = os.path.join(definitions.WORKPLACE_DIR, date_dir)
     else:
+        #print(output_dir)
         if os.path.isabs(output_dir):
             out_dir = output_dir
         else:
             out_dir = os.path.join(definitions.WORKPLACE_DIR, output_dir)
 
     if not os.path.exists(out_dir):
         os.makedirs(out_dir)
-
     return out_dir
 
 
@@ -138,6 +158,7 @@ def create_alleles_fasta_file(temp_dir):
     :temp_dir: temporary directory for length of program run
     :return: the filepath for alleles.fasta
     """
+
     output_file = os.path.join(temp_dir, 'alleles.fasta')
 
     with open(definitions.SEROTYPE_ALLELE_JSON, 'r') as jsonfh:
@@ -153,7 +174,7 @@ def create_alleles_fasta_file(temp_dir):
     return output_file
 
 
-def run_prediction(genome_files, args, alleles_fasta, temp_dir):
+def run_prediction(genome_files_dict, args, alleles_fasta, temp_dir):
     """
     Serotype prediction of all the input files, which have now been properly
     converted to fasta if required, and their headers standardized
@@ -166,6 +187,7 @@ def run_prediction(genome_files, args, alleles_fasta, temp_dir):
     :return: predictions_dict
     """
 
+    genome_files = [genome_files_dict[k]["modheaderfile"] for k in genome_files_dict.keys()]
     # Divide genome files into groups and create databases for each set
     per_core = int(len(genome_files) / args.cores) + 1
     group_size = 50 if per_core > 50 else per_core
@@ -184,6 +206,14 @@ def run_prediction(genome_files, args, alleles_fasta, temp_dir):
         # merge the per-database predictions with the final predictions dict
         for r in results:
             predictions_dict = {**r, **predictions_dict}
+
+    for genome_name in predictions_dict.keys():
+        try:
+            predictions_dict[genome_name]["species"] = genome_files_dict[genome_name]["species"]
+            predictions_dict[genome_name]["error"] = genome_files_dict[genome_name]["error"]
+        except KeyError as e:
+            predictions_dict[genome_name]["error"] = "Error: "+str(e)+" in "+genome_name
+
     return predictions_dict
 
 

diff --git a/ectyper/genomeFunctions.py b/ectyper/genomeFunctions.py
@@ -16,6 +16,7 @@
 LOG = logging.getLogger(__name__)
 
 
+
 def get_files_as_list(file_or_directory):
     """
     Creates a list of files from either the given file, or all files within the
@@ -88,30 +89,36 @@ def get_file_format(file):
     return 'other'
 
 
-def get_genome_names_from_files(files, temp_dir, args):
+def get_genome_names_from_files(files_dict, temp_dir, args):
     """
     For each file:
     Uses the name of the file for the genome name, creates a temporary file
     using
-    >lcl|filename as the name in the fasta header.
+    >lcl|filename as the name in the fasta header.  e.g. lcl|Escherichia_O26H11|17
     :param files: All the fasta files for analyses
     :param temp_dir: The ectyper temp directory
     :param args: Commandline arguments
     :return: List of files with fasta headers modified for filename
     """
-
-    modified_genomes = []
+    files=[]
+    for sample in files_dict.keys(): #{'Escherichia_O26H11': {'species': 'Escherichia coli', 'filepath': '/Data/Escherichia_O26H11.fasta'}}
+       files.append(files_dict[sample]["filepath"])
+    #modified_genomes = []
     partial_ghw = partial(genome_header_wrapper, temp_dir=temp_dir)
 
     with Pool(processes=args.cores) as pool:
-        results = pool.map(partial_ghw, files)
+        (results)= pool.map(partial_ghw, files)
+        #print(results)
 
         for r in results:
-            modified_genomes.append(r)
+            sample=r["samplename"]
+            files_dict[sample]["modheaderfile"] = r["newfile"]
+            #modified_genomes.append(r)
 
+    modified_genomes = [files_dict[samples]["modheaderfile"] for samples in files_dict.keys()]
     LOG.debug(("Modified genomes: {}".format(modified_genomes)))
-    return modified_genomes
-
+    #return modified_genomes
+    return files_dict
 
 def genome_header_wrapper(file, temp_dir):
     """
@@ -125,7 +132,7 @@ def genome_header_wrapper(file, temp_dir):
     # get only the name of the file for use in the fasta header
     file_base_name = os.path.basename(file)
     file_path_name = os.path.splitext(file_base_name)[0]
-    n_name = file_path_name.replace(' ', '_')
+    n_name = file_path_name.replace(' ', '_') #sample name
 
     # create a new file for the updated fasta headers
     new_file = tempfile.NamedTemporaryFile(dir=temp_dir, delete=False).name
@@ -137,7 +144,7 @@ def genome_header_wrapper(file, temp_dir):
                     ">lcl|" + n_name + "|" + record.description + "\n")
                 outfile.write(str(record.seq) + "\n")
 
-    return new_file
+    return {"oldfile":file,"newfile":new_file, "samplename":n_name}
 
 
 def create_bowtie_base(temp_dir, reference):
@@ -251,7 +258,7 @@ def assemble_reads(reads, bowtie_base, combined_fasta, temp_dir):
     with open(output_fasta, 'wb') as ofh:
         ofh.write(to_fasta_output.stdout)
 
-    return output_fasta
+    return {"fastq_file":reads,"fasta_file":output_fasta}
 
 
 def get_file_format_tuple(file):
@@ -316,14 +323,16 @@ def assemble_fastq(raw_files_dict, temp_dir, combined_fasta, bowtie_base, args):
                   combined_fasta=combined_fasta,
                   temp_dir=temp_dir)
 
-    all_fasta_files = raw_files_dict['fasta']
+    all_fasta_files_dict = dict.fromkeys(raw_files_dict['fasta']) #add assembled genomes as new keys
     with Pool(processes=args.cores) as pool:
-        results = pool.map(par, raw_files_dict['fastq'])
+        iterator = pool.map(par, raw_files_dict['fastq'])
+        for item in iterator:
+            all_fasta_files_dict[item["fasta_file"]]=item["fastq_file"]
 
-        for r in results:
-            all_fasta_files.append(r)
+   #     for r in results:
+   #         all_fasta_files.append(r)
 
-    return all_fasta_files
+    return all_fasta_files_dict
 
 
 def create_combined_alleles_and_markers_file(alleles_fasta, temp_dir):
@@ -340,6 +349,7 @@ def create_combined_alleles_and_markers_file(alleles_fasta, temp_dir):
     combined_file = os.path.join(temp_dir, 'combined_ident_serotype.fasta')
     LOG.info("Creating combined serotype and identification fasta file")
 
+    #print(definitions.ECOLI_MARKERS)
     with open(combined_file, 'w') as ofh:
         with open(definitions.ECOLI_MARKERS, 'r') as mfh:
             ofh.write(mfh.read())

diff --git a/ectyper/loggingFunctions.py b/ectyper/loggingFunctions.py
@@ -15,7 +15,9 @@ def create_logger():
     # define a Handler which writes INFO messages or higher to the sys.stderr
     console = logging.StreamHandler()
     console.setFormatter(formatter)
-    console.setLevel(logging.INFO)
+
+
+    #console.setLevel(logging.INFO)
     log.addHandler(console)
 
     return log