diff --git a/README.md b/README.md index 9688700..c7fe8fe 100644 --- a/README.md +++ b/README.md @@ -108,6 +108,8 @@ There are 3 output files: The first two files can have the same query sequence in several lines (query sequence/reference source) while the `consensus_annotation.tsv` will only have one line per query sequence (consensus/query). +Mantis can additionally output in gff format and also a kegg module matrix completeness. Please see [Output](https://github.com/PedroMTQ/mantis/wiki/Output) for more details. + # Further details * [Configuration](https://github.com/PedroMTQ/mantis/wiki/Configuration) diff --git a/__main__.py b/__main__.py index 390d359..de83616 100644 --- a/__main__.py +++ b/__main__.py @@ -83,6 +83,9 @@ help='[optional]\tgenerate KEGG modules completeness matrix.') parser.add_argument('-vkm', '--verbose_kegg_matrix', action='store_true', help='[optional]\tgenerate KEGG modules completeness matrix in verbose mode. Verbose mode gives, in addition to the default matrix, complete module name and missing KOs; it also exports a summary figure.') + parser.add_argument('-gff', '--output_gff', action='store_true', + help='[optional]\tgenerate GFF-formatted output files.') + parser.add_argument('-fo', '--force_output', action='store_true', help='[optional]\tIf you would like to force the output to the folder you specified. This may result in errrors!') #setup databases @@ -138,6 +141,7 @@ no_unifunc = args.no_unifunc kegg_matrix = args.kegg_matrix verbose_kegg_matrix = args.verbose_kegg_matrix + output_gff = args.output_gff force_output = args.force_output default_workers = args.default_workers chunk_size = args.chunk_size @@ -178,6 +182,7 @@ no_unifunc=no_unifunc, kegg_matrix=kegg_matrix, verbose_kegg_matrix=verbose_kegg_matrix, + output_gff=output_gff, default_workers=default_workers, chunk_size=chunk_size, time_limit=time_limit, diff --git a/source/Exceptions.py b/source/Exceptions.py index 25afa6b..b049bd8 100644 --- a/source/Exceptions.py +++ b/source/Exceptions.py @@ -7,6 +7,7 @@ BadNumberWorkers='You should not be seeing this, please contact the developer. Invalid number of workers in ' ConnectionError='Could not connect to url:\n' InvalidTranslation='Invalid residues for translation. Please make sure you provided a CDS of DNA or RNA in the target file:\n' +InvalidGFFVersion='No valid GFF version found!' class RequirementsNotMet(Exception): def __init__(self, *args): diff --git a/source/MANTIS.py b/source/MANTIS.py index a60aebe..d45cff9 100644 --- a/source/MANTIS.py +++ b/source/MANTIS.py @@ -31,6 +31,7 @@ def run_mantis(target_path, no_unifunc=False, kegg_matrix=False, verbose_kegg_matrix=False, + output_gff=False, verbose=True, default_workers=None, chunk_size=None, @@ -71,6 +72,7 @@ def run_mantis(target_path, no_unifunc=no_unifunc, kegg_matrix=kegg_matrix, verbose_kegg_matrix=verbose_kegg_matrix, + output_gff=output_gff, verbose=verbose, default_workers=default_workers, chunk_size=chunk_size, @@ -137,6 +139,7 @@ def __init__(self, no_unifunc=False, kegg_matrix=False, verbose_kegg_matrix=False, + output_gff=False, verbose=True, default_workers=None, chunk_size=None, @@ -188,6 +191,7 @@ def __init__(self, self.no_unifunc = no_unifunc self.kegg_matrix = kegg_matrix self.verbose_kegg_matrix = verbose_kegg_matrix + self.output_gff = output_gff if self.verbose_kegg_matrix: self.kegg_matrix=True self.default_workers = default_workers self.user_memory = user_memory @@ -245,6 +249,7 @@ def __str__(self): 'Algorithm:\t\t\t' + str(self.domain_algorithm) + '\n' if self.domain_algorithm else '', #'Formula:\t\t\t' + str(self.best_combo_formula) + '\n' if self.best_combo_formula else '', #'Sorting type:\t\t\t' + str(self.sorting_type) + '\n' if self.sorting_type else '', + 'Outputting GFF:\t\t\t' + str(self.output_gff) + '\n' if self.output_gff else '', 'Skip consensus:\t\t' + str(self.skip_consensus) + '\n' if self.skip_consensus else '', 'Skip memory management:\t\t' + str(self.skip_managed_memory) + '\n' if self.skip_managed_memory else '', 'Skip consensus expansion:\t' + str(self.no_consensus_expansion) + '\n' if self.no_consensus_expansion else '', @@ -432,6 +437,7 @@ def remove_uncompressed_files(self): @timeit_class def run_mantis_test(self): self.mantis_paths['custom']=MANTIS_FOLDER + 'tests/test_hmm/' + self.output_gff=True Path(self.output_folder).mkdir(parents=True, exist_ok=True) self.generate_fastas_to_annotate() self.generate_translated_sample() diff --git a/source/MANTIS_Consensus.py b/source/MANTIS_Consensus.py index 30701a5..03d3762 100644 --- a/source/MANTIS_Consensus.py +++ b/source/MANTIS_Consensus.py @@ -99,6 +99,8 @@ def read_interpreted_annotation(self, interpreted_annotation_tsv): dict_annotations[query]['ref_files'][ref_file][ref_hit] = {'identifiers': set(), 'description': set(), 'evalue': float(evalue), 'bitscore': float(bitscore), + 'direction': direction, + 'ref_hit_accession': ref_hit_accession, 'query_start': int(query_start), 'query_end': int(query_end), 'ref_start': int(ref_start), @@ -119,6 +121,79 @@ def read_interpreted_annotation(self, interpreted_annotation_tsv): self.add_from_go_obo(dict_annotations[query]['ref_files']) return dict_annotations + def generate_gff_line_consensus(self,query, dict_annotations, is_essential,consensus_hits, total_hits, ref_files_consensus,ref_names_consensus): + #https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md + #verified with http://genometools.org/cgi-bin/gff3validator.cgi + for ref_file in dict_annotations: + ref_annotations=dict_annotations[ref_file] + for ref_id in ref_annotations: + ref_id_annotations=dict_annotations[ref_file][ref_id] + query_start= str(ref_id_annotations['query_start']) + query_end= str(ref_id_annotations['query_end']) + evalue= str(ref_id_annotations['evalue']) + direction= ref_id_annotations['direction'] + ref_start= ref_id_annotations['ref_start'] + ref_end= ref_id_annotations['ref_end'] + ref_len= ref_id_annotations['ref_len'] + hit_accession= ref_id_annotations['ref_hit_accession'] + hit_identifiers=ref_id_annotations['identifiers'] + hit_description=ref_id_annotations['description'] + gff_line = '\t'.join([query, 'Mantis', 'CDS', query_start, query_end, evalue, direction, '0']) + '\t' + if hit_accession != '-': + attributes = f'Name={ref_id};Target={ref_id} {ref_start} {ref_end};Alias={hit_accession}' + else: + attributes = f'Name={ref_id};Target={ref_id} {ref_start} {ref_end}' + notes = f'Note=ref_file:{ref_file},ref_len:{ref_len}' + descriptions=[] + for d in hit_description: + descriptions.append(f'description:{d}') + if descriptions: + notes+=','+','.join(descriptions) + if is_essential: + notes+=f',is_essential_gene:True' + + dbxref = [] + ontology_terms = [] + for i in hit_identifiers: + if not i.startswith('go:'): + dbxref.append(i) + else: + ontology_terms.append(i) + all_annotations=None + if dbxref and ontology_terms: + all_annotations = 'Dbxref=' + ','.join(dbxref) + ';' + 'Ontology_term=' + ','.join(ontology_terms) + elif dbxref and not ontology_terms: + all_annotations = 'Dbxref=' + ','.join(dbxref) + elif not dbxref and ontology_terms: + all_annotations = 'Ontology_term=' + ','.join(ontology_terms) + if all_annotations: + gff_line += ';'.join([attributes, notes, all_annotations]) + else: + gff_line += ';'.join([attributes, notes]) + + yield gff_line + + + + def yield_seq_regions(self,interpreted_annotation_tsv): + already_added=set() + with open(interpreted_annotation_tsv) as file: + line = file.readline() + line = file.readline() + dict_annotations = {} + while line: + line = line.strip('\n') + line = line.strip() + line = line.split('\t') + query, ref_file, ref_hit, ref_hit_accession, evalue, bitscore, direction, query_len, query_start, query_end, ref_start, ref_end, ref_len = line[0:13] + if query not in already_added: + already_added.add(query) + yield f'##sequence-region {query} 1 {query_len}\n' + line=file.readline() + + + + def generate_consensus(self, interpreted_annotation_tsv, stdout_file_path=None): dict_annotations = self.read_interpreted_annotation(interpreted_annotation_tsv) self.query_match_hits = {} @@ -145,7 +220,12 @@ def generate_consensus(self, interpreted_annotation_tsv, stdout_file_path=None): consensus_line = self.generate_consensus_line(query, dict_annotations[query]['ref_files'], is_essential, consensus_hits, total_hits, ref_files_consensus, ref_names_consensus) - yield consensus_line + gff_lines=[] + if self.output_gff: + gff_lines = self.generate_gff_line_consensus(query, dict_annotations[query]['ref_files'], is_essential, + consensus_hits, total_hits, ref_files_consensus, + ref_names_consensus) + yield consensus_line,gff_lines # same method as non_overlapping hits from MANTIS_Processor , with some minor alterations to account for consensus # @timeit_function @@ -561,8 +641,7 @@ def remove_redundant_descriptions(self, all_descriptions): already_added.add(test) return res - def generate_consensus_line(self, query, query_dict, is_essential, consensus_hits, total_hits, ref_files_consensus, - ref_names_consensus): + def generate_consensus_line(self, query, query_dict, is_essential, consensus_hits, total_hits, ref_files_consensus,ref_names_consensus): ref_hits = ';'.join(ref_names_consensus) ref_files = ';'.join(ref_files_consensus) # consensus_coverage is a str with consensus sources/all sources, better as a str instead of float as its easier to understand @@ -610,13 +689,38 @@ def generate_consensus_output(self, interpreted_annotation_tsv, consensus_annota '|', 'Links', ] - with open(consensus_annotation_tsv, 'w+') as file: - first_line = '\t'.join(first_line) - file.write(first_line + '\n') - consensus_annotation = self.generate_consensus(interpreted_annotation_tsv,stdout_file_path=stdout_file_path) - for consensus_query in consensus_annotation: - line = '\t'.join(consensus_query) - file.write(line + '\n') + first_line = '\t'.join(first_line) + output_file=open(consensus_annotation_tsv, 'w+') + output_file.write(first_line + '\n') + + gff_file=None + gff_file_path = consensus_annotation_tsv.replace('.tsv', '.gff') + if self.output_gff: + gff_file=open(gff_file_path,'w+') + gff_file.write('##gff-version 3' + '\n') + + + + consensus_annotation = self.generate_consensus(interpreted_annotation_tsv,stdout_file_path=stdout_file_path) + if self.output_gff: + seq_regions=self.yield_seq_regions(interpreted_annotation_tsv) + for sr in seq_regions: + gff_file.write(sr) + + for consensus_query,gff_lines in consensus_annotation: + line = '\t'.join(consensus_query) + output_file.write(line + '\n') + if self.output_gff: + for gff_l in gff_lines: + gff_file.write(gff_l+'\n') + + + if gff_file: + gff_file.close() + output_file.close() + + + diff --git a/source/MANTIS_MP.py b/source/MANTIS_MP.py index 43e18ad..43807d9 100644 --- a/source/MANTIS_MP.py +++ b/source/MANTIS_MP.py @@ -576,7 +576,7 @@ def worker_interpret_output(self, queue, master_pid): if record is None: break output_annotation_tsv, current_chunk_dir = record interpreted_annotation_tsv = f'{current_chunk_dir}integrated_annotation.tsv' - self.generate_interpreted_output(output_annotation_tsv, interpreted_annotation_tsv) + self.generate_integrated_output(output_annotation_tsv, interpreted_annotation_tsv) ###Generate consensus output diff --git a/source/MANTIS_Metadata.py b/source/MANTIS_Metadata.py index 606cf88..e3767aa 100644 --- a/source/MANTIS_Metadata.py +++ b/source/MANTIS_Metadata.py @@ -166,14 +166,11 @@ def get_hit_links(self, dict_hits, ref_file): elif ref_file=='tcdb': self.get_link_compiled_metadata(dict_hits=dict_hits, ref_file_path=self.mantis_paths['tcdb'] + 'metadata.tsv') else: - custom_ref_path=self.get_target_custom_ref_paths(ref_file, folder=False) + custom_ref_path=self.get_target_custom_ref_paths(ref_file, folder=True) if custom_ref_path: - if custom_ref_path.endswith('.hmm'): - file_path = custom_ref_path.replace('.hmm', '.tsv') - elif custom_ref_path.endswith('.dmnd'): - file_path = custom_ref_path.replace('.dmnd', '.tsv') - if file_exists(file_path): - self.get_link_compiled_metadata(dict_hits=dict_hits, ref_file_path=file_path) + file_path = add_slash(custom_ref_path)+'metadata.tsv' + if file_exists(file_path): + self.get_link_compiled_metadata(dict_hits=dict_hits, ref_file_path=file_path) for hit in dict_hits: self.get_common_links(hit, dict_hits[hit]) if 'accession' in dict_hits[hit]['link']: self.get_common_links(dict_hits[hit]['link']['accession'], dict_hits[hit]) @@ -261,7 +258,49 @@ def read_and_interpret_output_annotation(self, output_annotation_tsv): ) return res - def generate_interpreted_output(self, output_annotation_tsv, interpreted_annotation_tsv): + def generate_gff_line_integrated(self,integrated_line): + #https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md + #verified with http://genometools.org/cgi-bin/gff3validator.cgi + line_split=integrated_line.index('|') + line_data,annotation_data=integrated_line[:line_split],integrated_line[line_split+1:] + query, ref_file, hit, hit_accession, evalue, bitscore, direction, query_len, query_start, query_end, ref_start, ref_end, ref_len = line_data + #attributes of gff line: + + if hit_accession!='-': + attributes=f'Name={hit};Target={hit} {ref_start} {ref_end};Alias={hit_accession}' + else: + attributes=f'Name={hit};Target={hit} {ref_start} {ref_end}' + notes=f'Note=ref_file:{ref_file},ref_len:{ref_len}' + dbxref=[] + ontology_terms=[] + descriptions = [] + + for i in annotation_data: + if not i.startswith('go:'): + dbxref.append(i) + elif i.startswith('description:'): + descriptions.append(i) + else: + ontology_terms.append(i) + if descriptions: + notes += ',' + ','.join(descriptions) + all_annotations=None + if dbxref and ontology_terms: + all_annotations='Dbxref='+','.join(dbxref)+';'+'Ontology_term='+','.join(ontology_terms) + elif dbxref and not ontology_terms: + all_annotations='Dbxref='+','.join(dbxref) + elif not dbxref and ontology_terms: + all_annotations='Ontology_term='+','.join(ontology_terms) + gff_line='\t'.join([query,'Mantis','CDS',query_start,query_end,evalue,direction,'0'])+'\t' + if all_annotations: + gff_line+=';'.join([attributes,notes,all_annotations]) + else: + gff_line+=';'.join([attributes,notes]) + sequence_region_line=f'##sequence-region {query} 1 {query_len}' + return query,sequence_region_line,gff_line + + + def generate_integrated_output(self, output_annotation_tsv, interpreted_annotation_tsv): first_line = ['Query', 'Ref_file', 'Ref_hit', @@ -277,14 +316,40 @@ def generate_interpreted_output(self, output_annotation_tsv, interpreted_annotat 'Ref_length', '|', 'Links'] - with open(interpreted_annotation_tsv, 'w+') as file: - first_line = '\t'.join(first_line) - file.write(first_line + '\n') - output_annotation = self.read_and_interpret_output_annotation(output_annotation_tsv) - for line in range(len(output_annotation)): - if output_annotation[line]: - out_line = '\t'.join(output_annotation[line]) - file.write(out_line + '\n') + first_line = '\t'.join(first_line) + output_file=open(interpreted_annotation_tsv, 'w+') + output_file.write(first_line + '\n') + + gff_file=None + gff_file_path = interpreted_annotation_tsv.replace('.tsv', '.gff') + gff_dict = {} + if self.output_gff: + gff_file=open(gff_file_path,'w+') + gff_file.write('##gff-version 3' + '\n') + + #generating output + output_annotation = self.read_and_interpret_output_annotation(output_annotation_tsv) + for line in range(len(output_annotation)): + current_output_line=output_annotation[line] + if current_output_line: + out_line = '\t'.join(current_output_line) + output_file.write(out_line + '\n') + + if gff_file: + seq_id,sequence_region_line,gff_line=self.generate_gff_line_integrated(current_output_line) + if seq_id not in gff_dict: + gff_dict[seq_id]={'seq_region':sequence_region_line,'lines':[]} + gff_dict[seq_id]['lines'].append(gff_line) + #writing gff + for seq_id in gff_dict: + gff_file.write(gff_dict[seq_id]['seq_region']+'\n') + for seq_id in gff_dict: + for annot_line in gff_dict[seq_id]['lines']: + gff_file.write(annot_line+'\n') + + if gff_file: + gff_file.close() + output_file.close() @@ -420,6 +485,7 @@ def export_sample_kos(self,samples_info): line=f'{sample_name}\t{kos}\n' file.write(line) + def generate_matrix(self): sample_paths = [] figure_info={} diff --git a/source/MANTIS_Processor.py b/source/MANTIS_Processor.py index 9b8c0b6..951270b 100644 --- a/source/MANTIS_Processor.py +++ b/source/MANTIS_Processor.py @@ -503,12 +503,12 @@ def read_dmndout(self, output_path, count_seqs_chunk, count_residues_original_fi #diamond's evalue is not scaled, so we need to multiply by residues count evalue = self.recalculate_evalue(evalue, self.diamond_db_size, count_residues_original_file) #hmmer's coordinates on the seq are always the same, regardless of direction - direction='Forward' + direction='+' if query_startseq1 homoserine dehydrogenase +atgaacgcgctcgccgctgacattcctgcgctgggcgtgggacgcctggcgctgctcggc +accggcacggtgggctcggccttcgtgcagcgctaccaggcgttgcaggcacgcgggctg +gaactgcccagcgtgcagtggttggccaattcgcgtactgcactggcgatcgatcgtgat +ctggcgctgccgctggaactggcacgacgcgcaccgcgtgacggcctcagctcgccaccg +tgggccagtgccgaagggctggagcgtggcgacgtagtggtcgatgccaccgccagtgaa +gacgtcgccgcgcgtcacgtgcagtggctggcgcgcggtgtgcacgtggtgactgccaac +aagctgggccgtggcgcgcagctggcacgcgcacaggccattgccgagagctgtgccgac +agcggtgcgcgttatggcgacagcgccaccgtgggcgctggcctgccgctgctgagcagc +ttgcgcgcgctggtggccggtggtgaccacatccatgccatcgagggcgtgctgtccggt +tcgctggcgtggctgttccatcgctatgacggccaagcgccgttctcggcggcggtgcgc +gaagcgctggcggcgggctacaccgaacccgacccacgcctggacctgtccggcgaggat +gtgcgccgcaagctgctgatcctggcccgcagcagcgggctggcgctggatgcttcgcag +gtgcaggtggattcgctggtacccgaaacactggcagcgctgccgctggaggctgcggtg +gctgcgctggaacagctggatgcgccgctgcaggcgcgctggcagcaggctcgcgacaat +ggtcgtgtgctgcgtttcgtcggccgcgttgatggcgagggtgcgcaggtcggcctgcgc +gagttgccggccgatcatccgctggcgcagggtgcgggcaccgacaaccgcgtggccatc +cacagcgatcgctaccgccgccagcccttgttgatccaggggccaggtgcgggtgcggaa +gtcactgcggccgcgttgctggatgacgtgctgcggatcgtaagttga + +>seq2 glucose-6-phosphate 1-dehydrogenase +atggcagagcaggtggccctgagccggacccaggtgtgcgggatcctgcgggaagagctt +ttccagggcgatgccttccatcagtcggatacacacatattcatcatcatgggtgcatcg +ggtgacctggccaagaagaagatctaccccaccatctggtggctgttccgggatggcctt +ctgcccgaaaacaccttcatcgtgggctatgcccgttcccgcctcacagtggctgacatc +cgcaaacagagtgagcccttcttcaaggccaccccagaggagaagctcaagctggaggac +ttctttgcccgcaactcctatgtggctggccagtacgatgatgcagcctcctaccagcgc +ctcaacagccacatgaatgccctccacctggggtcacaggccaaccgcctcttctacctg +gccttgcccccgaccgtctacgaggccgtcaccaagaacattcacgagtcctgcatgagc +cagataggctggaaccgcatcatcgtggagaagcccttcgggagggacctgcagagctct +gaccggctgtccaaccacatctcctccctgttccgtgaggaccagatctaccgcatcgac +cactacctgggcaaggagatggtgcagaacctcatggtgctgagatttgccaacaggatc +ttcggccccatctggaaccgggacaacatcgcctgcgttatcctcaccttcaaggagccc +tttggcactgagggtcgcgggggctatttcgatgaatttgggatcatccgggacgtgatg +cagaaccacctactgcagatgctgtgtctggtggccatggagaagcccgcctccaccaac +tcagatgacgtccgtgatgagaaggtcaaggtgttgaaatgcatctcagaggtgcaggcc +aacaatgtggtcctgggccagtacgtggggaaccccgatggagagggcgaggccaccaaa +gggtacctggacgaccccacggtgccccgcgggtccaccaccgccacttttgcagccgtc +gtcctctatgtggagaatgagaggtgggatggggtgcccttcatcctgcgctgcggcaag +gccctgaacgagcgcaaggccgaggtgaggctgcagttccatgatgtggccggcgacatc +ttccaccagcagtgcaagcgcaacgagctggtgatccgcgtgcagcccaacgaggccgtg +tacaccaagatgatgaccaagaagccgggcatgttcttcaaccccgaggagtcggagctg +gacctgacctacggcaacagatacaagaacgtgaagctccctgacgcctatgagcgcctc +atcctggacgtcttctgcgggagccagatgcacttcgtgcgcagcgacgagctccgtgag +gcctggcgtattttcaccccactgctgcaccagattgagctggagaagcccaagcccatc +ccctatatttatggcagccgaggccccacggaggcagacgagctgatgaagagagtgggt +ttccagtatgagggcacctacaagtgggtgaacccccacaagctctga \ No newline at end of file