added gff output

Former-commit-id: 3b4b6b4
PedroMTQ · Oct 12, 2021 · 6403c66 · 6403c66
1 parent d7e4250
commit 6403c66
Show file tree

Hide file tree

Showing 11 changed files with 305 additions and 39 deletions.
diff --git a/README.md b/README.md
@@ -108,6 +108,8 @@ There are 3 output files:
 
 The first two files can have the same query sequence in several lines (query sequence/reference source) while the `consensus_annotation.tsv` will only have one line per query sequence (consensus/query).
 
+Mantis can additionally output in gff format and also a kegg module matrix completeness. Please see [Output](https://github.com/PedroMTQ/mantis/wiki/Output) for more details.
+
 # Further details
 
 * [Configuration](https://github.com/PedroMTQ/mantis/wiki/Configuration)  

diff --git a/__main__.py b/__main__.py
@@ -83,6 +83,9 @@
                         help='[optional]\tgenerate KEGG modules completeness matrix.')
     parser.add_argument('-vkm', '--verbose_kegg_matrix', action='store_true',
                         help='[optional]\tgenerate KEGG modules completeness matrix in verbose mode. Verbose mode gives, in addition to the default matrix, complete module name and missing KOs; it also exports a summary figure.')
+    parser.add_argument('-gff', '--output_gff', action='store_true',
+                        help='[optional]\tgenerate GFF-formatted output files.')
+
     parser.add_argument('-fo', '--force_output', action='store_true',
                         help='[optional]\tIf you would like to force the output to the folder you specified. This may result in errrors!')
     #setup databases
@@ -138,6 +141,7 @@
         no_unifunc = args.no_unifunc
         kegg_matrix = args.kegg_matrix
         verbose_kegg_matrix = args.verbose_kegg_matrix
+        output_gff = args.output_gff
         force_output = args.force_output
         default_workers = args.default_workers
         chunk_size = args.chunk_size
@@ -178,6 +182,7 @@
                            no_unifunc=no_unifunc,
                            kegg_matrix=kegg_matrix,
                            verbose_kegg_matrix=verbose_kegg_matrix,
+                           output_gff=output_gff,
                            default_workers=default_workers,
                            chunk_size=chunk_size,
                            time_limit=time_limit,

diff --git a/source/Exceptions.py b/source/Exceptions.py
@@ -7,6 +7,7 @@
 BadNumberWorkers='You should not be seeing this, please contact the developer. Invalid number of workers in '
 ConnectionError='Could not connect to url:\n'
 InvalidTranslation='Invalid residues for translation. Please make sure you provided a CDS of DNA or RNA in the target file:\n'
+InvalidGFFVersion='No valid GFF version found!'
 
 class RequirementsNotMet(Exception):
     def __init__(self, *args):

diff --git a/source/MANTIS.py b/source/MANTIS.py
@@ -31,6 +31,7 @@ def run_mantis(target_path,
                no_unifunc=False,
                kegg_matrix=False,
                verbose_kegg_matrix=False,
+               output_gff=False,
                verbose=True,
                default_workers=None,
                chunk_size=None,
@@ -71,6 +72,7 @@ def run_mantis(target_path,
         no_unifunc=no_unifunc,
         kegg_matrix=kegg_matrix,
         verbose_kegg_matrix=verbose_kegg_matrix,
+        output_gff=output_gff,
         verbose=verbose,
         default_workers=default_workers,
         chunk_size=chunk_size,
@@ -137,6 +139,7 @@ def __init__(self,
                  no_unifunc=False,
                  kegg_matrix=False,
                  verbose_kegg_matrix=False,
+                 output_gff=False,
                  verbose=True,
                  default_workers=None,
                  chunk_size=None,
@@ -188,6 +191,7 @@ def __init__(self,
         self.no_unifunc = no_unifunc
         self.kegg_matrix = kegg_matrix
         self.verbose_kegg_matrix = verbose_kegg_matrix
+        self.output_gff = output_gff
         if self.verbose_kegg_matrix: self.kegg_matrix=True
         self.default_workers = default_workers
         self.user_memory = user_memory
@@ -245,6 +249,7 @@ def __str__(self):
             'Algorithm:\t\t\t' + str(self.domain_algorithm) + '\n' if self.domain_algorithm else '',
             #'Formula:\t\t\t' + str(self.best_combo_formula) + '\n' if self.best_combo_formula else '',
             #'Sorting type:\t\t\t' + str(self.sorting_type) + '\n' if self.sorting_type else '',
+            'Outputting GFF:\t\t\t' + str(self.output_gff) + '\n' if self.output_gff else '',
             'Skip consensus:\t\t' + str(self.skip_consensus) + '\n' if self.skip_consensus else '',
             'Skip memory management:\t\t' + str(self.skip_managed_memory) + '\n' if self.skip_managed_memory else '',
             'Skip consensus expansion:\t' + str(self.no_consensus_expansion) + '\n' if self.no_consensus_expansion else '',
@@ -432,6 +437,7 @@ def remove_uncompressed_files(self):
     @timeit_class
     def run_mantis_test(self):
         self.mantis_paths['custom']=MANTIS_FOLDER + 'tests/test_hmm/'
+        self.output_gff=True
         Path(self.output_folder).mkdir(parents=True, exist_ok=True)
         self.generate_fastas_to_annotate()
         self.generate_translated_sample()

diff --git a/source/MANTIS_Consensus.py b/source/MANTIS_Consensus.py
@@ -99,6 +99,8 @@ def read_interpreted_annotation(self, interpreted_annotation_tsv):
                 dict_annotations[query]['ref_files'][ref_file][ref_hit] = {'identifiers': set(), 'description': set(),
                                                                            'evalue': float(evalue),
                                                                            'bitscore': float(bitscore),
+                                                                           'direction': direction,
+                                                                           'ref_hit_accession': ref_hit_accession,
                                                                            'query_start': int(query_start),
                                                                            'query_end': int(query_end),
                                                                            'ref_start': int(ref_start),
@@ -119,6 +121,79 @@ def read_interpreted_annotation(self, interpreted_annotation_tsv):
             self.add_from_go_obo(dict_annotations[query]['ref_files'])
         return dict_annotations
 
+    def generate_gff_line_consensus(self,query, dict_annotations, is_essential,consensus_hits, total_hits, ref_files_consensus,ref_names_consensus):
+        #https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md
+        #verified with http://genometools.org/cgi-bin/gff3validator.cgi
+        for ref_file in dict_annotations:
+            ref_annotations=dict_annotations[ref_file]
+            for ref_id in ref_annotations:
+                ref_id_annotations=dict_annotations[ref_file][ref_id]
+                query_start= str(ref_id_annotations['query_start'])
+                query_end= str(ref_id_annotations['query_end'])
+                evalue= str(ref_id_annotations['evalue'])
+                direction= ref_id_annotations['direction']
+                ref_start= ref_id_annotations['ref_start']
+                ref_end= ref_id_annotations['ref_end']
+                ref_len= ref_id_annotations['ref_len']
+                hit_accession= ref_id_annotations['ref_hit_accession']
+                hit_identifiers=ref_id_annotations['identifiers']
+                hit_description=ref_id_annotations['description']
+                gff_line = '\t'.join([query, 'Mantis', 'CDS', query_start, query_end, evalue, direction, '0']) + '\t'
+                if hit_accession != '-':
+                    attributes = f'Name={ref_id};Target={ref_id} {ref_start} {ref_end};Alias={hit_accession}'
+                else:
+                    attributes = f'Name={ref_id};Target={ref_id} {ref_start} {ref_end}'
+                notes = f'Note=ref_file:{ref_file},ref_len:{ref_len}'
+                descriptions=[]
+                for d in hit_description:
+                    descriptions.append(f'description:{d}')
+                if descriptions:
+                    notes+=','+','.join(descriptions)
+                if is_essential:
+                    notes+=f',is_essential_gene:True'
+
+                dbxref = []
+                ontology_terms = []
+                for i in hit_identifiers:
+                    if not i.startswith('go:'):
+                        dbxref.append(i)
+                    else:
+                        ontology_terms.append(i)
+                all_annotations=None
+                if dbxref and ontology_terms:
+                    all_annotations = 'Dbxref=' + ','.join(dbxref) + ';' + 'Ontology_term=' + ','.join(ontology_terms)
+                elif dbxref and not ontology_terms:
+                    all_annotations = 'Dbxref=' + ','.join(dbxref)
+                elif not dbxref and ontology_terms:
+                    all_annotations = 'Ontology_term=' + ','.join(ontology_terms)
+                if all_annotations:
+                    gff_line += ';'.join([attributes, notes, all_annotations])
+                else:
+                    gff_line += ';'.join([attributes, notes])
+
+                yield gff_line
+
+
+
+    def yield_seq_regions(self,interpreted_annotation_tsv):
+        already_added=set()
+        with open(interpreted_annotation_tsv) as file:
+            line = file.readline()
+            line = file.readline()
+            dict_annotations = {}
+            while line:
+                line = line.strip('\n')
+                line = line.strip()
+                line = line.split('\t')
+                query, ref_file, ref_hit, ref_hit_accession, evalue, bitscore, direction, query_len, query_start, query_end, ref_start, ref_end, ref_len = line[0:13]
+                if query not in already_added:
+                    already_added.add(query)
+                    yield f'##sequence-region {query} 1 {query_len}\n'
+                line=file.readline()
+
+
+
+
     def generate_consensus(self, interpreted_annotation_tsv, stdout_file_path=None):
         dict_annotations = self.read_interpreted_annotation(interpreted_annotation_tsv)
         self.query_match_hits = {}
@@ -145,7 +220,12 @@ def generate_consensus(self, interpreted_annotation_tsv, stdout_file_path=None):
             consensus_line = self.generate_consensus_line(query, dict_annotations[query]['ref_files'], is_essential,
                                                           consensus_hits, total_hits, ref_files_consensus,
                                                           ref_names_consensus)
-            yield consensus_line
+            gff_lines=[]
+            if self.output_gff:
+                gff_lines = self.generate_gff_line_consensus(query, dict_annotations[query]['ref_files'], is_essential,
+                                                          consensus_hits, total_hits, ref_files_consensus,
+                                                          ref_names_consensus)
+            yield consensus_line,gff_lines
 
     # same method as non_overlapping hits from MANTIS_Processor , with some minor alterations to account for consensus
     # @timeit_function
@@ -561,8 +641,7 @@ def remove_redundant_descriptions(self, all_descriptions):
                 already_added.add(test)
         return res
 
-    def generate_consensus_line(self, query, query_dict, is_essential, consensus_hits, total_hits, ref_files_consensus,
-                                ref_names_consensus):
+    def generate_consensus_line(self, query, query_dict, is_essential, consensus_hits, total_hits, ref_files_consensus,ref_names_consensus):
         ref_hits = ';'.join(ref_names_consensus)
         ref_files = ';'.join(ref_files_consensus)
         # consensus_coverage is a str with consensus sources/all sources, better as a str instead of float as its easier to understand
@@ -610,13 +689,38 @@ def generate_consensus_output(self, interpreted_annotation_tsv, consensus_annota
                       '|',
                       'Links',
                       ]
-        with open(consensus_annotation_tsv, 'w+') as file:
-            first_line = '\t'.join(first_line)
-            file.write(first_line + '\n')
-            consensus_annotation = self.generate_consensus(interpreted_annotation_tsv,stdout_file_path=stdout_file_path)
-            for consensus_query in consensus_annotation:
-                line = '\t'.join(consensus_query)
-                file.write(line + '\n')
+        first_line = '\t'.join(first_line)
+        output_file=open(consensus_annotation_tsv, 'w+')
+        output_file.write(first_line + '\n')
+
+        gff_file=None
+        gff_file_path = consensus_annotation_tsv.replace('.tsv', '.gff')
+        if self.output_gff:
+            gff_file=open(gff_file_path,'w+')
+            gff_file.write('##gff-version 3' + '\n')
+
+
+
+        consensus_annotation = self.generate_consensus(interpreted_annotation_tsv,stdout_file_path=stdout_file_path)
+        if self.output_gff:
+            seq_regions=self.yield_seq_regions(interpreted_annotation_tsv)
+            for sr in seq_regions:
+                gff_file.write(sr)
+
+        for consensus_query,gff_lines in consensus_annotation:
+            line = '\t'.join(consensus_query)
+            output_file.write(line + '\n')
+            if self.output_gff:
+                for gff_l in gff_lines:
+                    gff_file.write(gff_l+'\n')
+
+
+        if gff_file:
+            gff_file.close()
+        output_file.close()
+
+
+
 
 
 

diff --git a/source/MANTIS_MP.py b/source/MANTIS_MP.py
@@ -576,7 +576,7 @@ def worker_interpret_output(self, queue, master_pid):
             if record is None: break
             output_annotation_tsv, current_chunk_dir = record
             interpreted_annotation_tsv = f'{current_chunk_dir}integrated_annotation.tsv'
-            self.generate_interpreted_output(output_annotation_tsv, interpreted_annotation_tsv)
+            self.generate_integrated_output(output_annotation_tsv, interpreted_annotation_tsv)
 
     ###Generate consensus output
 

diff --git a/source/MANTIS_Metadata.py b/source/MANTIS_Metadata.py
@@ -166,14 +166,11 @@ def get_hit_links(self, dict_hits, ref_file):
         elif ref_file=='tcdb':
             self.get_link_compiled_metadata(dict_hits=dict_hits, ref_file_path=self.mantis_paths['tcdb'] + 'metadata.tsv')
         else:
-            custom_ref_path=self.get_target_custom_ref_paths(ref_file, folder=False)
+            custom_ref_path=self.get_target_custom_ref_paths(ref_file, folder=True)
             if custom_ref_path:
-                if custom_ref_path.endswith('.hmm'):
-                    file_path = custom_ref_path.replace('.hmm', '.tsv')
-                elif custom_ref_path.endswith('.dmnd'):
-                    file_path = custom_ref_path.replace('.dmnd', '.tsv')
-            if file_exists(file_path):
-                self.get_link_compiled_metadata(dict_hits=dict_hits, ref_file_path=file_path)
+                file_path = add_slash(custom_ref_path)+'metadata.tsv'
+                if file_exists(file_path):
+                    self.get_link_compiled_metadata(dict_hits=dict_hits, ref_file_path=file_path)
         for hit in dict_hits:
             self.get_common_links(hit, dict_hits[hit])
             if 'accession' in dict_hits[hit]['link']:   self.get_common_links(dict_hits[hit]['link']['accession'], dict_hits[hit])
@@ -261,7 +258,49 @@ def read_and_interpret_output_annotation(self, output_annotation_tsv):
                                                                )
         return res
 
-    def generate_interpreted_output(self, output_annotation_tsv, interpreted_annotation_tsv):
+    def generate_gff_line_integrated(self,integrated_line):
+        #https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md
+        #verified with http://genometools.org/cgi-bin/gff3validator.cgi
+        line_split=integrated_line.index('|')
+        line_data,annotation_data=integrated_line[:line_split],integrated_line[line_split+1:]
+        query, ref_file, hit, hit_accession, evalue, bitscore, direction, query_len, query_start, query_end, ref_start, ref_end, ref_len = line_data
+        #attributes of gff line:
+
+        if hit_accession!='-':
+            attributes=f'Name={hit};Target={hit} {ref_start} {ref_end};Alias={hit_accession}'
+        else:
+            attributes=f'Name={hit};Target={hit} {ref_start} {ref_end}'
+        notes=f'Note=ref_file:{ref_file},ref_len:{ref_len}'
+        dbxref=[]
+        ontology_terms=[]
+        descriptions = []
+
+        for i in annotation_data:
+            if not i.startswith('go:'):
+                dbxref.append(i)
+            elif i.startswith('description:'):
+                descriptions.append(i)
+            else:
+                ontology_terms.append(i)
+        if descriptions:
+            notes += ',' + ','.join(descriptions)
+        all_annotations=None
+        if dbxref and ontology_terms:
+            all_annotations='Dbxref='+','.join(dbxref)+';'+'Ontology_term='+','.join(ontology_terms)
+        elif dbxref and not ontology_terms:
+            all_annotations='Dbxref='+','.join(dbxref)
+        elif not dbxref and ontology_terms:
+            all_annotations='Ontology_term='+','.join(ontology_terms)
+        gff_line='\t'.join([query,'Mantis','CDS',query_start,query_end,evalue,direction,'0'])+'\t'
+        if all_annotations:
+            gff_line+=';'.join([attributes,notes,all_annotations])
+        else:
+            gff_line+=';'.join([attributes,notes])
+        sequence_region_line=f'##sequence-region {query} 1 {query_len}'
+        return query,sequence_region_line,gff_line
+
+
+    def generate_integrated_output(self, output_annotation_tsv, interpreted_annotation_tsv):
         first_line = ['Query',
                       'Ref_file',
                       'Ref_hit',
@@ -277,14 +316,40 @@ def generate_interpreted_output(self, output_annotation_tsv, interpreted_annotat
                       'Ref_length',
                       '|',
                       'Links']
-        with open(interpreted_annotation_tsv, 'w+') as file:
-            first_line = '\t'.join(first_line)
-            file.write(first_line + '\n')
-            output_annotation = self.read_and_interpret_output_annotation(output_annotation_tsv)
-            for line in range(len(output_annotation)):
-                if output_annotation[line]:
-                    out_line = '\t'.join(output_annotation[line])
-                    file.write(out_line + '\n')
+        first_line = '\t'.join(first_line)
+        output_file=open(interpreted_annotation_tsv, 'w+')
+        output_file.write(first_line + '\n')
+
+        gff_file=None
+        gff_file_path = interpreted_annotation_tsv.replace('.tsv', '.gff')
+        gff_dict = {}
+        if self.output_gff:
+            gff_file=open(gff_file_path,'w+')
+            gff_file.write('##gff-version 3' + '\n')
+
+        #generating output
+        output_annotation = self.read_and_interpret_output_annotation(output_annotation_tsv)
+        for line in range(len(output_annotation)):
+            current_output_line=output_annotation[line]
+            if current_output_line:
+                out_line = '\t'.join(current_output_line)
+                output_file.write(out_line + '\n')
+
+                if gff_file:
+                    seq_id,sequence_region_line,gff_line=self.generate_gff_line_integrated(current_output_line)
+                    if seq_id not in gff_dict:
+                        gff_dict[seq_id]={'seq_region':sequence_region_line,'lines':[]}
+                    gff_dict[seq_id]['lines'].append(gff_line)
+        #writing gff
+        for seq_id in gff_dict:
+            gff_file.write(gff_dict[seq_id]['seq_region']+'\n')
+        for seq_id in gff_dict:
+            for annot_line in gff_dict[seq_id]['lines']:
+                gff_file.write(annot_line+'\n')
+
+        if gff_file:
+            gff_file.close()
+        output_file.close()
 
 
 
@@ -420,6 +485,7 @@ def export_sample_kos(self,samples_info):
                 line=f'{sample_name}\t{kos}\n'
                 file.write(line)
 
+
     def generate_matrix(self):
         sample_paths = []
         figure_info={}