Skip to content

Commit

Permalink
Merge pull request #74 from kbessonov1984/master
Browse files Browse the repository at this point in the history
ecTyper version 0.9.0 with improved reporting and O-antigen call rate
  • Loading branch information
kbessonov1984 authored Oct 5, 2019
2 parents 754ff4b + ec69b3e commit bcc0e6a
Show file tree
Hide file tree
Showing 25 changed files with 303,848 additions and 105,174 deletions.
4 changes: 2 additions & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
language: python
python:
- "3.5"
- "3.6"
before_install:
- sudo apt-get update
install:
Expand All @@ -14,7 +14,7 @@ install:
- conda info -a
- conda config --add channels conda-forge
- conda config --add channels bioconda
- conda create -q -n test-environment python=3.5 samtools=1.8 pandas=0.23.1 bowtie2=2.3.4.1 mash=2.0 bcftools=1.8 biopython=1.70 blast=2.7.1 seqtk=1.2 pytest=3.5
- conda create -q -n test-environment python=3.6 samtools=1.8 pandas=0.23.1 bowtie2=2.3.4.1 mash=2.0 bcftools=1.8 biopython=1.70 blast=2.7.1 seqtk=1.2 pytest=3.5 requests=2.22.0 portalocker=1.5.1
- conda activate test-environment
- python setup.py install

Expand Down
25 changes: 25 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
**v0.8.1**
* supports E.coli species detection via 10 short 1000 bp sequences based on E.coli core genomes
* contains signatures for the 178 O and 53 H antigen types
* non-E.coli isolates are identified using mash screen function against entire RefSeq database of all genomes


**v0.9.0**
* improved O-antigen serotyping coverage of complex samples that lack some O-antigen signatures.
* fall back to single O-antigen signature detection when both signature pairs are not found and species is E.coli
* improved O-antigen identification favoring presence of both alleles (e.g. wzx and wzy) to support the final call.
The sum of scores for both alleles of the same antigen is used in ranking now
* verification for E.albertii species against RefSeq genomes
* addition of Quality Control flags in the output (as an extra column in the `results.tsv`) for ease of results interpretation
* species reporting to better resolve E.coli vs Shigella vs E.albertii cases and contamination
* serotype prediction for all Escherichia genus to ease classification of cryptic spqeices
* automatic download of the RefSeq sketch and associated meta data every 6 months for improved species identification
* improved species identification for the FASTQ files. All raw reads are used for species identification
* better complex cases handling and error recovery in cases of poor reference allele coverage.
* serotype reporting based on multiple evidences including reference allele database and closest reference genome
* query length coverage default threshold lowered from 50% to 10% for truncated alleles. This greatly improved sensitivity.
Did not found any noticeable negative effect on specificity and accuracy based on EnteroBase, PNC2018-2019 and internal datasets
* users is warned about potential false postive result in case of non-E.coli species or assembly errors
* wrote additional unit tests


2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
- biopython 1.70
- blast 2.7.1
- seqtk 1.2
- requests >=2.0
- portalocker 1.5.1

# Installation
1. Get `miniconda` if you do not already have `miniconda` or `anaconda`:
Expand Down
105,020 changes: 0 additions & 105,020 deletions ectyper/Data/assembly_summary_refseq.txt

This file was deleted.

10,555 changes: 10,554 additions & 1 deletion ectyper/Data/ectyper_new.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion ectyper/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.8.1"
__version__ = "0.9.0"
13 changes: 10 additions & 3 deletions ectyper/commandLineOptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def check_percentage(value):
parser = argparse.ArgumentParser(
description='ectyper v{} Prediction of Escherichia coli serotype from '
'raw reads'
' or assembled genome sequences'.format(__version__)
' or assembled genome sequences. The default settings are recommended.'.format(__version__)
)

parser.add_argument(
Expand Down Expand Up @@ -67,8 +67,8 @@ def check_percentage(value):
"-l",
"--percentLength",
type=check_percentage,
help="Percent length required for an allele match [default 50]",
default=50
help="Percent length required for an allele match [default 10]",
default=10
)

parser.add_argument(
Expand Down Expand Up @@ -103,6 +103,13 @@ def check_percentage(value):
"the output"
)

parser.add_argument(
"-D",
"--debug",
action="store_true",
help="Print more detailed log including debug level messages"
)

if args is None:
return parser.parse_args()
else:
Expand Down
70 changes: 50 additions & 20 deletions ectyper/ectyper.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"""
Predictive serotyping for _E. coli_.
"""
import os
import os, time
import tempfile
import datetime
import json
Expand All @@ -17,6 +17,7 @@


# setup the application logging

LOG = loggingFunctions.create_logger()


Expand All @@ -26,20 +27,33 @@ def run_program():
Creates all required files and controls function execution.
:return: success or failure
"""

args = commandLineOptions.parse_command_line()
output_directory = create_output_directory(args.output)

# Create a file handler for log messages in the output directory
fh = logging.FileHandler(os.path.join(output_directory, 'ectyper.log'), 'w', 'utf-8')
fh.setLevel(logging.DEBUG)

if args.debug:
fh.setLevel(logging.DEBUG)
LOG.setLevel(logging.DEBUG)
else:
fh.setLevel(logging.INFO)
LOG.setLevel(logging.INFO)


LOG.addHandler(fh)

LOG.debug(args)

LOG.info("Starting ectyper v{}".format(__version__))
LOG.info("Output_directory is {}".format(output_directory))
LOG.info("Command-line arguments {}".format(args))

#init RefSeq database for species identification
if speciesIdentification.get_refseq_mash() == False:
LOG.critical("MASH RefSeq sketch does not exists and was not able to be downloaded. Aborting run ...")
exit("No MASH RefSeq sketch file found in the default location")
# Initialize ectyper directory for the scope of this program

with tempfile.TemporaryDirectory(dir=output_directory) as temp_dir:
LOG.info("Gathering genome files")
raw_genome_files = genomeFunctions.get_files_as_list(args.input)
Expand All @@ -48,52 +62,58 @@ def run_program():
# 'fasta'=[], 'fastq'=[], 'other'=[]
raw_files_dict = genomeFunctions.identify_raw_files(raw_genome_files,
args)

alleles_fasta = create_alleles_fasta_file(temp_dir)

combined_fasta = \
genomeFunctions.create_combined_alleles_and_markers_file(
alleles_fasta, temp_dir)

#print(combined_fasta);
time.sleep(100)
bowtie_base = genomeFunctions.create_bowtie_base(temp_dir,
combined_fasta) if \
raw_files_dict['fastq'] else None

# Assemble any fastq files, get final fasta list
LOG.info("Assembling final list of fasta files")
all_fasta_files = genomeFunctions.assemble_fastq(raw_files_dict,
all_fastafastq_files_dict = genomeFunctions.assemble_fastq(raw_files_dict,
temp_dir,
combined_fasta,
bowtie_base,
args)

# Verify we have at least one fasta file. Optionally species ID.
# Get a tuple of ecoli and other genomes
(ecoli_genomes, other_genomes_dict) = speciesIdentification.verify_ecoli(
all_fasta_files,
(ecoli_genomes_dict, other_genomes_dict) = speciesIdentification.verify_ecoli(
all_fastafastq_files_dict,
raw_files_dict['other'],
args,
temp_dir)

LOG.info("Standardizing the genome headers based on file names")
final_fasta_files = genomeFunctions.get_genome_names_from_files(
ecoli_genomes,
temp_dir,
args
)

# Main prediction function
predictions_dict = run_prediction(final_fasta_files,
LOG.info("Standardizing the E.coli genome headers based on file names") #e.g. lcl|Escherichia_O26H11|17
#final_fasta_files \
predictions_dict={}
if ecoli_genomes_dict:
ecoli_genomes_dict = genomeFunctions.get_genome_names_from_files(
ecoli_genomes_dict,
temp_dir,
args
)
# Main prediction function
predictions_dict = run_prediction(ecoli_genomes_dict, #final_fasta_files
args,
alleles_fasta,
temp_dir)

# Add empty rows for genomes without a blast result
# Add empty rows for genomes without a blast result or non-E.coli samples that did not undergo typing
final_predictions = predictionFunctions.add_non_predicted(
raw_genome_files, predictions_dict, other_genomes_dict)

# Store most recent result in working directory
LOG.info("Reporting results:\n")

predictionFunctions.report_result(final_predictions,
predictionFunctions.report_result(final_predictions, output_directory,
os.path.join(output_directory,
'output.tsv'))
LOG.info("\nECTyper has finished successfully.")
Expand All @@ -119,14 +139,14 @@ def create_output_directory(output_dir):
])
out_dir = os.path.join(definitions.WORKPLACE_DIR, date_dir)
else:
#print(output_dir)
if os.path.isabs(output_dir):
out_dir = output_dir
else:
out_dir = os.path.join(definitions.WORKPLACE_DIR, output_dir)

if not os.path.exists(out_dir):
os.makedirs(out_dir)

return out_dir


Expand All @@ -138,6 +158,7 @@ def create_alleles_fasta_file(temp_dir):
:temp_dir: temporary directory for length of program run
:return: the filepath for alleles.fasta
"""

output_file = os.path.join(temp_dir, 'alleles.fasta')

with open(definitions.SEROTYPE_ALLELE_JSON, 'r') as jsonfh:
Expand All @@ -153,7 +174,7 @@ def create_alleles_fasta_file(temp_dir):
return output_file


def run_prediction(genome_files, args, alleles_fasta, temp_dir):
def run_prediction(genome_files_dict, args, alleles_fasta, temp_dir):
"""
Serotype prediction of all the input files, which have now been properly
converted to fasta if required, and their headers standardized
Expand All @@ -166,6 +187,7 @@ def run_prediction(genome_files, args, alleles_fasta, temp_dir):
:return: predictions_dict
"""

genome_files = [genome_files_dict[k]["modheaderfile"] for k in genome_files_dict.keys()]
# Divide genome files into groups and create databases for each set
per_core = int(len(genome_files) / args.cores) + 1
group_size = 50 if per_core > 50 else per_core
Expand All @@ -184,6 +206,14 @@ def run_prediction(genome_files, args, alleles_fasta, temp_dir):
# merge the per-database predictions with the final predictions dict
for r in results:
predictions_dict = {**r, **predictions_dict}

for genome_name in predictions_dict.keys():
try:
predictions_dict[genome_name]["species"] = genome_files_dict[genome_name]["species"]
predictions_dict[genome_name]["error"] = genome_files_dict[genome_name]["error"]
except KeyError as e:
predictions_dict[genome_name]["error"] = "Error: "+str(e)+" in "+genome_name

return predictions_dict


Expand Down
42 changes: 26 additions & 16 deletions ectyper/genomeFunctions.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
LOG = logging.getLogger(__name__)



def get_files_as_list(file_or_directory):
"""
Creates a list of files from either the given file, or all files within the
Expand Down Expand Up @@ -88,30 +89,36 @@ def get_file_format(file):
return 'other'


def get_genome_names_from_files(files, temp_dir, args):
def get_genome_names_from_files(files_dict, temp_dir, args):
"""
For each file:
Uses the name of the file for the genome name, creates a temporary file
using
>lcl|filename as the name in the fasta header.
>lcl|filename as the name in the fasta header. e.g. lcl|Escherichia_O26H11|17
:param files: All the fasta files for analyses
:param temp_dir: The ectyper temp directory
:param args: Commandline arguments
:return: List of files with fasta headers modified for filename
"""

modified_genomes = []
files=[]
for sample in files_dict.keys(): #{'Escherichia_O26H11': {'species': 'Escherichia coli', 'filepath': '/Data/Escherichia_O26H11.fasta'}}
files.append(files_dict[sample]["filepath"])
#modified_genomes = []
partial_ghw = partial(genome_header_wrapper, temp_dir=temp_dir)

with Pool(processes=args.cores) as pool:
results = pool.map(partial_ghw, files)
(results)= pool.map(partial_ghw, files)
#print(results)

for r in results:
modified_genomes.append(r)
sample=r["samplename"]
files_dict[sample]["modheaderfile"] = r["newfile"]
#modified_genomes.append(r)

modified_genomes = [files_dict[samples]["modheaderfile"] for samples in files_dict.keys()]
LOG.debug(("Modified genomes: {}".format(modified_genomes)))
return modified_genomes

#return modified_genomes
return files_dict

def genome_header_wrapper(file, temp_dir):
"""
Expand All @@ -125,7 +132,7 @@ def genome_header_wrapper(file, temp_dir):
# get only the name of the file for use in the fasta header
file_base_name = os.path.basename(file)
file_path_name = os.path.splitext(file_base_name)[0]
n_name = file_path_name.replace(' ', '_')
n_name = file_path_name.replace(' ', '_') #sample name

# create a new file for the updated fasta headers
new_file = tempfile.NamedTemporaryFile(dir=temp_dir, delete=False).name
Expand All @@ -137,7 +144,7 @@ def genome_header_wrapper(file, temp_dir):
">lcl|" + n_name + "|" + record.description + "\n")
outfile.write(str(record.seq) + "\n")

return new_file
return {"oldfile":file,"newfile":new_file, "samplename":n_name}


def create_bowtie_base(temp_dir, reference):
Expand Down Expand Up @@ -251,7 +258,7 @@ def assemble_reads(reads, bowtie_base, combined_fasta, temp_dir):
with open(output_fasta, 'wb') as ofh:
ofh.write(to_fasta_output.stdout)

return output_fasta
return {"fastq_file":reads,"fasta_file":output_fasta}


def get_file_format_tuple(file):
Expand Down Expand Up @@ -316,14 +323,16 @@ def assemble_fastq(raw_files_dict, temp_dir, combined_fasta, bowtie_base, args):
combined_fasta=combined_fasta,
temp_dir=temp_dir)

all_fasta_files = raw_files_dict['fasta']
all_fasta_files_dict = dict.fromkeys(raw_files_dict['fasta']) #add assembled genomes as new keys
with Pool(processes=args.cores) as pool:
results = pool.map(par, raw_files_dict['fastq'])
iterator = pool.map(par, raw_files_dict['fastq'])
for item in iterator:
all_fasta_files_dict[item["fasta_file"]]=item["fastq_file"]

for r in results:
all_fasta_files.append(r)
# for r in results:
# all_fasta_files.append(r)

return all_fasta_files
return all_fasta_files_dict


def create_combined_alleles_and_markers_file(alleles_fasta, temp_dir):
Expand All @@ -340,6 +349,7 @@ def create_combined_alleles_and_markers_file(alleles_fasta, temp_dir):
combined_file = os.path.join(temp_dir, 'combined_ident_serotype.fasta')
LOG.info("Creating combined serotype and identification fasta file")

#print(definitions.ECOLI_MARKERS)
with open(combined_file, 'w') as ofh:
with open(definitions.ECOLI_MARKERS, 'r') as mfh:
ofh.write(mfh.read())
Expand Down
4 changes: 3 additions & 1 deletion ectyper/loggingFunctions.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@ def create_logger():
# define a Handler which writes INFO messages or higher to the sys.stderr
console = logging.StreamHandler()
console.setFormatter(formatter)
console.setLevel(logging.INFO)


#console.setLevel(logging.INFO)
log.addHandler(console)

return log
Expand Down
Loading

0 comments on commit bcc0e6a

Please sign in to comment.