Skip to content

Commit

Permalink
Fixed a batch mode error where one of the files failing the FASTA test caused…
Browse files Browse the repository at this point in the history
… the other files in the pool not to be analyzed.
  • Loading branch information
kbessonov1984 committed Jul 31, 2024
1 parent 6cca035 commit 41a5093
Show file tree
Hide file tree
Showing 6 changed files with 13 additions and 17 deletions.
5 changes: 2 additions & 3 deletions ectyper/commandLineOptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,9 @@ def checkdbversion():
)

parser.add_argument(
"-d",
"--maxdepth",
"--maxdirdepth",
help="Maximum number of directories to descend when searching an input directory of files",
default=1e6,
default=0,
type=int,
required=False
)
Expand Down
3 changes: 1 addition & 2 deletions ectyper/ectyper.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ def run_program():
os.makedirs(temp_dir, exist_ok=True)

LOG.info("Gathering genome files list ...")
input_files_list = genomeFunctions.get_files_as_list(args.input, args.maxdepth)
input_files_list = genomeFunctions.get_files_as_list(args.input, args.maxdirdepth)
raw_genome_files = decompress_gunzip_files(input_files_list, temp_dir)

LOG.info(f"Identifying genome file types on {len(raw_genome_files)} inputs ...")
Expand Down Expand Up @@ -157,7 +157,6 @@ def run_program():
raw_files_dict['filesnotfound'],
args)



LOG.info("Standardizing the E.coli genome headers based on file names")

Expand Down
2 changes: 1 addition & 1 deletion ectyper/genomeFunctions.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def get_files_as_list(files_or_directories, max_depth_level):
LOG.info(f"Directory level exceeded ({dir_level_current} > {max_depth_level}), skipping {file_or_directory} ...")
continue

# if single directory is specified
# if directory is specified
if os.path.isdir(file_or_directory):
LOG.info(f"Gathering genomes from directory {file_or_directory} at level {dir_level_current} ...")
# Create a list containing the file names
Expand Down
4 changes: 2 additions & 2 deletions ectyper/predictionFunctions.py
Original file line number Diff line number Diff line change
Expand Up @@ -868,7 +868,7 @@ def add_non_predicted(all_genomes_list, predictions_dict, other_dict, filesnotfo
:param predictions_data_frame: the Dict containing the ectyper predictions
:return: modified prediction file
"""

# genome names are given without the filename extension
for g in all_genomes_list:
gname = os.path.splitext(os.path.split(g)[1])[0]
Expand All @@ -887,7 +887,7 @@ def add_non_predicted(all_genomes_list, predictions_dict, other_dict, filesnotfo
}
else:
predictions_dict[gname] = {
'error': "No O and H antigen determinant E.coli genes were found. Try running with --verify parameter",
'error': f"No O and H antigen determinant E.coli genes were found in {gname}",
'species': ecoli_dict[gname]["species"]
}

Expand Down
14 changes: 7 additions & 7 deletions ectyper/speciesIdentification.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,29 +266,29 @@ def verify_ecoli_and_inputs(fasta_fastq_files_dict, ofiles, filesnotfound, args)
filesnotfound_dict = {}

fasta_files = fasta_fastq_files_dict.keys()

for fasta in fasta_files:
sampleName = getSampleName(fasta)
speciesname = "-"

if is_valid_fasta_file(fasta, sampleName) == False:
failverifyerrormessage = f"Sample {sampleName} FASTA file ({fasta}) is empty. This could happen when FASTA file generated from FASTQ input lacks raw reads mapping to O- and H- antigens database or input FASTA is empty/corrupted. Please check sequence input file of {sampleName}"
other_files_dict[sampleName] = {"species":speciesname,"filepath":fasta,"error":failverifyerrormessage}
return ecoli_files_dict, other_files_dict, filesnotfound_dict

if sampleName in ecoli_files_dict or sampleName in other_files_dict:
error_msg = "Duplicated parsed filenames found ('{}'). Offending file paths {}. Only unique file names are supported in batch mode".format(
sampleName, [file for file in fasta_files if sampleName in file]
)
LOG.error(error_msg)
raise ValueError(error_msg)

if is_valid_fasta_file(fasta, sampleName) == False:
failverifyerrormessage = f"Sample {sampleName} FASTA file ({fasta}) is invalid/empty. This could happen when FASTA file generated from FASTQ input lacks raw reads mapping to O- and H- antigens database or input FASTA is empty/corrupted. Please check sequence input file of {sampleName}"


# Always run species identification, regardless of the --verify parameter. Run the prediction on the FASTQ file when one is available, for better accuracy.
if fasta_fastq_files_dict[fasta]:
fastq_file = fasta_fastq_files_dict[fasta]
speciesname = get_species(fastq_file, args, args.cores)
else:
speciesname = get_species(fasta, args, args.cores)

if args.verify:
failverifyerrormessage = "Sample identified as " + speciesname + ": serotyping results are only available for E.coli samples." \
"If sure that sample is E.coli run without --verify parameter."
Expand All @@ -311,5 +311,5 @@ def verify_ecoli_and_inputs(fasta_fastq_files_dict, ofiles, filesnotfound, args)
for file in filesnotfound:
sampleName = getSampleName(file)
filesnotfound_dict[sampleName]={"error":"File {} not found!".format(file)}

return ecoli_files_dict, other_files_dict,filesnotfound_dict
2 changes: 0 additions & 2 deletions ectyper/subprocess_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,5 @@ def run_subprocess(cmd, input_data=None, un=False, ignorereturncode=False):
else:
LOG.error("Error in subprocess. The following command failed: {}".format(cmd))
LOG.error("Subprocess failed with error: \"{}\"".format(comp_proc.stderr.decode("utf-8")))
#LOG.critical("ectyper has stopped")
return comp_proc
#raise Exception(f"subprocess failure while running {cmd} command")

0 comments on commit 41a5093

Please sign in to comment.