Skip to content

Commit

Permalink
added gff output
Browse files Browse the repository at this point in the history
Former-commit-id: 3b4b6b4
  • Loading branch information
PedroMTQ committed Oct 12, 2021
1 parent d7e4250 commit 6403c66
Show file tree
Hide file tree
Showing 11 changed files with 305 additions and 39 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,8 @@ There are 3 output files:

The first two files can have the same query sequence in several lines (query sequence/reference source) while the `consensus_annotation.tsv` will only have one line per query sequence (consensus/query).

Mantis can additionally output in gff format and also a kegg module matrix completeness. Please see [Output](https://github.com/PedroMTQ/mantis/wiki/Output) for more details.

# Further details

* [Configuration](https://github.com/PedroMTQ/mantis/wiki/Configuration)
Expand Down
5 changes: 5 additions & 0 deletions __main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,9 @@
help='[optional]\tgenerate KEGG modules completeness matrix.')
parser.add_argument('-vkm', '--verbose_kegg_matrix', action='store_true',
help='[optional]\tgenerate KEGG modules completeness matrix in verbose mode. Verbose mode gives, in addition to the default matrix, complete module name and missing KOs; it also exports a summary figure.')
parser.add_argument('-gff', '--output_gff', action='store_true',
help='[optional]\tgenerate GFF-formatted output files.')

parser.add_argument('-fo', '--force_output', action='store_true',
help='[optional]\tIf you would like to force the output to the folder you specified. This may result in errrors!')
#setup databases
Expand Down Expand Up @@ -138,6 +141,7 @@
no_unifunc = args.no_unifunc
kegg_matrix = args.kegg_matrix
verbose_kegg_matrix = args.verbose_kegg_matrix
output_gff = args.output_gff
force_output = args.force_output
default_workers = args.default_workers
chunk_size = args.chunk_size
Expand Down Expand Up @@ -178,6 +182,7 @@
no_unifunc=no_unifunc,
kegg_matrix=kegg_matrix,
verbose_kegg_matrix=verbose_kegg_matrix,
output_gff=output_gff,
default_workers=default_workers,
chunk_size=chunk_size,
time_limit=time_limit,
Expand Down
1 change: 1 addition & 0 deletions source/Exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
BadNumberWorkers='You should not be seeing this, please contact the developer. Invalid number of workers in '
ConnectionError='Could not connect to url:\n'
InvalidTranslation='Invalid residues for translation. Please make sure you provided a CDS of DNA or RNA in the target file:\n'
InvalidGFFVersion='No valid GFF version found!'

class RequirementsNotMet(Exception):
def __init__(self, *args):
Expand Down
6 changes: 6 additions & 0 deletions source/MANTIS.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ def run_mantis(target_path,
no_unifunc=False,
kegg_matrix=False,
verbose_kegg_matrix=False,
output_gff=False,
verbose=True,
default_workers=None,
chunk_size=None,
Expand Down Expand Up @@ -71,6 +72,7 @@ def run_mantis(target_path,
no_unifunc=no_unifunc,
kegg_matrix=kegg_matrix,
verbose_kegg_matrix=verbose_kegg_matrix,
output_gff=output_gff,
verbose=verbose,
default_workers=default_workers,
chunk_size=chunk_size,
Expand Down Expand Up @@ -137,6 +139,7 @@ def __init__(self,
no_unifunc=False,
kegg_matrix=False,
verbose_kegg_matrix=False,
output_gff=False,
verbose=True,
default_workers=None,
chunk_size=None,
Expand Down Expand Up @@ -188,6 +191,7 @@ def __init__(self,
self.no_unifunc = no_unifunc
self.kegg_matrix = kegg_matrix
self.verbose_kegg_matrix = verbose_kegg_matrix
self.output_gff = output_gff
if self.verbose_kegg_matrix: self.kegg_matrix=True
self.default_workers = default_workers
self.user_memory = user_memory
Expand Down Expand Up @@ -245,6 +249,7 @@ def __str__(self):
'Algorithm:\t\t\t' + str(self.domain_algorithm) + '\n' if self.domain_algorithm else '',
#'Formula:\t\t\t' + str(self.best_combo_formula) + '\n' if self.best_combo_formula else '',
#'Sorting type:\t\t\t' + str(self.sorting_type) + '\n' if self.sorting_type else '',
'Outputting GFF:\t\t\t' + str(self.output_gff) + '\n' if self.output_gff else '',
'Skip consensus:\t\t' + str(self.skip_consensus) + '\n' if self.skip_consensus else '',
'Skip memory management:\t\t' + str(self.skip_managed_memory) + '\n' if self.skip_managed_memory else '',
'Skip consensus expansion:\t' + str(self.no_consensus_expansion) + '\n' if self.no_consensus_expansion else '',
Expand Down Expand Up @@ -432,6 +437,7 @@ def remove_uncompressed_files(self):
@timeit_class
def run_mantis_test(self):
self.mantis_paths['custom']=MANTIS_FOLDER + 'tests/test_hmm/'
self.output_gff=True
Path(self.output_folder).mkdir(parents=True, exist_ok=True)
self.generate_fastas_to_annotate()
self.generate_translated_sample()
Expand Down
124 changes: 114 additions & 10 deletions source/MANTIS_Consensus.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,8 @@ def read_interpreted_annotation(self, interpreted_annotation_tsv):
dict_annotations[query]['ref_files'][ref_file][ref_hit] = {'identifiers': set(), 'description': set(),
'evalue': float(evalue),
'bitscore': float(bitscore),
'direction': direction,
'ref_hit_accession': ref_hit_accession,
'query_start': int(query_start),
'query_end': int(query_end),
'ref_start': int(ref_start),
Expand All @@ -119,6 +121,79 @@ def read_interpreted_annotation(self, interpreted_annotation_tsv):
self.add_from_go_obo(dict_annotations[query]['ref_files'])
return dict_annotations

def generate_gff_line_consensus(self,query, dict_annotations, is_essential,consensus_hits, total_hits, ref_files_consensus,ref_names_consensus):
#https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md
#verified with http://genometools.org/cgi-bin/gff3validator.cgi
for ref_file in dict_annotations:
ref_annotations=dict_annotations[ref_file]
for ref_id in ref_annotations:
ref_id_annotations=dict_annotations[ref_file][ref_id]
query_start= str(ref_id_annotations['query_start'])
query_end= str(ref_id_annotations['query_end'])
evalue= str(ref_id_annotations['evalue'])
direction= ref_id_annotations['direction']
ref_start= ref_id_annotations['ref_start']
ref_end= ref_id_annotations['ref_end']
ref_len= ref_id_annotations['ref_len']
hit_accession= ref_id_annotations['ref_hit_accession']
hit_identifiers=ref_id_annotations['identifiers']
hit_description=ref_id_annotations['description']
gff_line = '\t'.join([query, 'Mantis', 'CDS', query_start, query_end, evalue, direction, '0']) + '\t'
if hit_accession != '-':
attributes = f'Name={ref_id};Target={ref_id} {ref_start} {ref_end};Alias={hit_accession}'
else:
attributes = f'Name={ref_id};Target={ref_id} {ref_start} {ref_end}'
notes = f'Note=ref_file:{ref_file},ref_len:{ref_len}'
descriptions=[]
for d in hit_description:
descriptions.append(f'description:{d}')
if descriptions:
notes+=','+','.join(descriptions)
if is_essential:
notes+=f',is_essential_gene:True'

dbxref = []
ontology_terms = []
for i in hit_identifiers:
if not i.startswith('go:'):
dbxref.append(i)
else:
ontology_terms.append(i)
all_annotations=None
if dbxref and ontology_terms:
all_annotations = 'Dbxref=' + ','.join(dbxref) + ';' + 'Ontology_term=' + ','.join(ontology_terms)
elif dbxref and not ontology_terms:
all_annotations = 'Dbxref=' + ','.join(dbxref)
elif not dbxref and ontology_terms:
all_annotations = 'Ontology_term=' + ','.join(ontology_terms)
if all_annotations:
gff_line += ';'.join([attributes, notes, all_annotations])
else:
gff_line += ';'.join([attributes, notes])

yield gff_line



def yield_seq_regions(self,interpreted_annotation_tsv):
already_added=set()
with open(interpreted_annotation_tsv) as file:
line = file.readline()
line = file.readline()
dict_annotations = {}
while line:
line = line.strip('\n')
line = line.strip()
line = line.split('\t')
query, ref_file, ref_hit, ref_hit_accession, evalue, bitscore, direction, query_len, query_start, query_end, ref_start, ref_end, ref_len = line[0:13]
if query not in already_added:
already_added.add(query)
yield f'##sequence-region {query} 1 {query_len}\n'
line=file.readline()




def generate_consensus(self, interpreted_annotation_tsv, stdout_file_path=None):
dict_annotations = self.read_interpreted_annotation(interpreted_annotation_tsv)
self.query_match_hits = {}
Expand All @@ -145,7 +220,12 @@ def generate_consensus(self, interpreted_annotation_tsv, stdout_file_path=None):
consensus_line = self.generate_consensus_line(query, dict_annotations[query]['ref_files'], is_essential,
consensus_hits, total_hits, ref_files_consensus,
ref_names_consensus)
yield consensus_line
gff_lines=[]
if self.output_gff:
gff_lines = self.generate_gff_line_consensus(query, dict_annotations[query]['ref_files'], is_essential,
consensus_hits, total_hits, ref_files_consensus,
ref_names_consensus)
yield consensus_line,gff_lines

# same method as non_overlapping hits from MANTIS_Processor , with some minor alterations to account for consensus
# @timeit_function
Expand Down Expand Up @@ -561,8 +641,7 @@ def remove_redundant_descriptions(self, all_descriptions):
already_added.add(test)
return res

def generate_consensus_line(self, query, query_dict, is_essential, consensus_hits, total_hits, ref_files_consensus,
ref_names_consensus):
def generate_consensus_line(self, query, query_dict, is_essential, consensus_hits, total_hits, ref_files_consensus,ref_names_consensus):
ref_hits = ';'.join(ref_names_consensus)
ref_files = ';'.join(ref_files_consensus)
# consensus_coverage is a str with consensus sources/all sources, better as a str instead of float as its easier to understand
Expand Down Expand Up @@ -610,13 +689,38 @@ def generate_consensus_output(self, interpreted_annotation_tsv, consensus_annota
'|',
'Links',
]
with open(consensus_annotation_tsv, 'w+') as file:
first_line = '\t'.join(first_line)
file.write(first_line + '\n')
consensus_annotation = self.generate_consensus(interpreted_annotation_tsv,stdout_file_path=stdout_file_path)
for consensus_query in consensus_annotation:
line = '\t'.join(consensus_query)
file.write(line + '\n')
first_line = '\t'.join(first_line)
output_file=open(consensus_annotation_tsv, 'w+')
output_file.write(first_line + '\n')

gff_file=None
gff_file_path = consensus_annotation_tsv.replace('.tsv', '.gff')
if self.output_gff:
gff_file=open(gff_file_path,'w+')
gff_file.write('##gff-version 3' + '\n')



consensus_annotation = self.generate_consensus(interpreted_annotation_tsv,stdout_file_path=stdout_file_path)
if self.output_gff:
seq_regions=self.yield_seq_regions(interpreted_annotation_tsv)
for sr in seq_regions:
gff_file.write(sr)

for consensus_query,gff_lines in consensus_annotation:
line = '\t'.join(consensus_query)
output_file.write(line + '\n')
if self.output_gff:
for gff_l in gff_lines:
gff_file.write(gff_l+'\n')


if gff_file:
gff_file.close()
output_file.close()






Expand Down
2 changes: 1 addition & 1 deletion source/MANTIS_MP.py
Original file line number Diff line number Diff line change
Expand Up @@ -576,7 +576,7 @@ def worker_interpret_output(self, queue, master_pid):
if record is None: break
output_annotation_tsv, current_chunk_dir = record
interpreted_annotation_tsv = f'{current_chunk_dir}integrated_annotation.tsv'
self.generate_interpreted_output(output_annotation_tsv, interpreted_annotation_tsv)
self.generate_integrated_output(output_annotation_tsv, interpreted_annotation_tsv)

###Generate consensus output

Expand Down
98 changes: 82 additions & 16 deletions source/MANTIS_Metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,14 +166,11 @@ def get_hit_links(self, dict_hits, ref_file):
elif ref_file=='tcdb':
self.get_link_compiled_metadata(dict_hits=dict_hits, ref_file_path=self.mantis_paths['tcdb'] + 'metadata.tsv')
else:
custom_ref_path=self.get_target_custom_ref_paths(ref_file, folder=False)
custom_ref_path=self.get_target_custom_ref_paths(ref_file, folder=True)
if custom_ref_path:
if custom_ref_path.endswith('.hmm'):
file_path = custom_ref_path.replace('.hmm', '.tsv')
elif custom_ref_path.endswith('.dmnd'):
file_path = custom_ref_path.replace('.dmnd', '.tsv')
if file_exists(file_path):
self.get_link_compiled_metadata(dict_hits=dict_hits, ref_file_path=file_path)
file_path = add_slash(custom_ref_path)+'metadata.tsv'
if file_exists(file_path):
self.get_link_compiled_metadata(dict_hits=dict_hits, ref_file_path=file_path)
for hit in dict_hits:
self.get_common_links(hit, dict_hits[hit])
if 'accession' in dict_hits[hit]['link']: self.get_common_links(dict_hits[hit]['link']['accession'], dict_hits[hit])
Expand Down Expand Up @@ -261,7 +258,49 @@ def read_and_interpret_output_annotation(self, output_annotation_tsv):
)
return res

def generate_interpreted_output(self, output_annotation_tsv, interpreted_annotation_tsv):
def generate_gff_line_integrated(self,integrated_line):
#https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md
#verified with http://genometools.org/cgi-bin/gff3validator.cgi
line_split=integrated_line.index('|')
line_data,annotation_data=integrated_line[:line_split],integrated_line[line_split+1:]
query, ref_file, hit, hit_accession, evalue, bitscore, direction, query_len, query_start, query_end, ref_start, ref_end, ref_len = line_data
#attributes of gff line:

if hit_accession!='-':
attributes=f'Name={hit};Target={hit} {ref_start} {ref_end};Alias={hit_accession}'
else:
attributes=f'Name={hit};Target={hit} {ref_start} {ref_end}'
notes=f'Note=ref_file:{ref_file},ref_len:{ref_len}'
dbxref=[]
ontology_terms=[]
descriptions = []

for i in annotation_data:
if not i.startswith('go:'):
dbxref.append(i)
elif i.startswith('description:'):
descriptions.append(i)
else:
ontology_terms.append(i)
if descriptions:
notes += ',' + ','.join(descriptions)
all_annotations=None
if dbxref and ontology_terms:
all_annotations='Dbxref='+','.join(dbxref)+';'+'Ontology_term='+','.join(ontology_terms)
elif dbxref and not ontology_terms:
all_annotations='Dbxref='+','.join(dbxref)
elif not dbxref and ontology_terms:
all_annotations='Ontology_term='+','.join(ontology_terms)
gff_line='\t'.join([query,'Mantis','CDS',query_start,query_end,evalue,direction,'0'])+'\t'
if all_annotations:
gff_line+=';'.join([attributes,notes,all_annotations])
else:
gff_line+=';'.join([attributes,notes])
sequence_region_line=f'##sequence-region {query} 1 {query_len}'
return query,sequence_region_line,gff_line


def generate_integrated_output(self, output_annotation_tsv, interpreted_annotation_tsv):
first_line = ['Query',
'Ref_file',
'Ref_hit',
Expand All @@ -277,14 +316,40 @@ def generate_interpreted_output(self, output_annotation_tsv, interpreted_annotat
'Ref_length',
'|',
'Links']
with open(interpreted_annotation_tsv, 'w+') as file:
first_line = '\t'.join(first_line)
file.write(first_line + '\n')
output_annotation = self.read_and_interpret_output_annotation(output_annotation_tsv)
for line in range(len(output_annotation)):
if output_annotation[line]:
out_line = '\t'.join(output_annotation[line])
file.write(out_line + '\n')
first_line = '\t'.join(first_line)
output_file=open(interpreted_annotation_tsv, 'w+')
output_file.write(first_line + '\n')

gff_file=None
gff_file_path = interpreted_annotation_tsv.replace('.tsv', '.gff')
gff_dict = {}
if self.output_gff:
gff_file=open(gff_file_path,'w+')
gff_file.write('##gff-version 3' + '\n')

#generating output
output_annotation = self.read_and_interpret_output_annotation(output_annotation_tsv)
for line in range(len(output_annotation)):
current_output_line=output_annotation[line]
if current_output_line:
out_line = '\t'.join(current_output_line)
output_file.write(out_line + '\n')

if gff_file:
seq_id,sequence_region_line,gff_line=self.generate_gff_line_integrated(current_output_line)
if seq_id not in gff_dict:
gff_dict[seq_id]={'seq_region':sequence_region_line,'lines':[]}
gff_dict[seq_id]['lines'].append(gff_line)
#writing gff
for seq_id in gff_dict:
gff_file.write(gff_dict[seq_id]['seq_region']+'\n')
for seq_id in gff_dict:
for annot_line in gff_dict[seq_id]['lines']:
gff_file.write(annot_line+'\n')

if gff_file:
gff_file.close()
output_file.close()



Expand Down Expand Up @@ -420,6 +485,7 @@ def export_sample_kos(self,samples_info):
line=f'{sample_name}\t{kos}\n'
file.write(line)


def generate_matrix(self):
sample_paths = []
figure_info={}
Expand Down
Loading

0 comments on commit 6403c66

Please sign in to comment.