From 4fb350cd14f984394957063a7ee89834bcf77fc2 Mon Sep 17 00:00:00 2001
From: Pedro Q
Date: Wed, 23 Feb 2022 12:26:57 +0100
Subject: [PATCH] added sorting output

---
 mantis/Consensus.py | 77 +++++++++++++++++++++++++++++++++++++++++++--
 mantis/Metadata.py  |  4 ++-
 2 files changed, 78 insertions(+), 3 deletions(-)

diff --git a/mantis/Consensus.py b/mantis/Consensus.py
index fec38c5..4087396 100644
--- a/mantis/Consensus.py
+++ b/mantis/Consensus.py
@@ -640,7 +640,7 @@ def generate_consensus_line(self, query, query_dict, is_essential, consensus_hit
             identifiers = query_dict[ref_file][ref_hit_name]['identifiers']
             all_identifiers.update(identifiers)
             all_descriptions.update(description)
-        res = list(row_start)
+        res = []
         sorted_identifiers = sorted(all_identifiers)
         for link in sorted_identifiers:
             if 'enzyme_ec' in link:
@@ -662,6 +662,8 @@
         for link in clean_descriptions:
             if 'description:' + link not in res:
                 res.append('description:' + link)
+        res=sorted(res)
+        res=row_start+res
         return res
 
     def generate_consensus_output(self, interpreted_annotation_tsv, consensus_annotation_tsv, stdout_file_path=None):
@@ -705,6 +707,77 @@
 
 
 
+###################
+    def get_best_hits_approximation(self, query_hits, sorting_class,sorting_type):
+        '''
+        this is just a lazy implementation for when the amount of hits is too big and/or the hits are too small
+        even with multiprocessing and cython we may run into computationally unfeasible calculations
+        when this happens, we generate a straightforward "best hit"
+        Best hit will take the lowest evalue hit and add the next lowest evalue hit (without overlapping) until we reach a point where this cycle can't be repeated.
+        This doesn't effectively calculate the "best hit", just a biased approximation (biased since we start with the lowest evalue as the root)
+        Still, it is a pretty good approximation anyhow
+        '''
+        if sorting_class=='consensus': query_hits = self.sort_scaled_hits(query_hits,sorting_type=sorting_type)
+        else: query_hits = self.sort_hits(query_hits, sorting_class,sorting_type=sorting_type)
+        combo = []
+        while query_hits:
+            next_hit = query_hits.pop(0)
+            if sorting_class == 'consensus':
+                if not self.is_overlap_Consensus(combo, next_hit):
+                    combo.append(next_hit)
+            elif sorting_class == 'processor':
+                if not self.is_overlap(combo, next_hit):
+                    combo.append(next_hit)
+        return combo
+
+
+    def get_min_max_dfs(self, cython_possible_combos, conversion_dict, sorting_class):
+        min_val = None
+        max_val = None
+        for len_combo in cython_possible_combos:
+            for cython_combo in cython_possible_combos[len_combo]:
+                combo = self.cython_to_query_hits(cython_combo, conversion_dict)
+                for hit in combo:
+                    if sorting_class == 'processor':
+                        hit_info = hit
+                    elif sorting_class == 'consensus':
+                        hmm_file, hmm_hit, hit_info = hit
+                    # we don't consider 0 for the scaling, 0 will always be scaled to max/1
+                    if hit_info[self.sorting_type]:
+                        current_val = log10(hit_info[self.sorting_type])
+                        if min_val is None: min_val = current_val
+                        if max_val is None: max_val = current_val
+                        if current_val > max_val: max_val = current_val
+                        if current_val < min_val: min_val = current_val
+        # lower is best
+        if self.sorting_type == 'evalue':
+            return max_val, min_val
+        # higher is best
+        elif self.sorting_type == 'bitscore':
+            return min_val, max_val
+
+    def cython_to_query_hits(self, cython_hits, conversion_dict):
+        res = []
+        for hit in cython_hits:
+            res.append(conversion_dict[hit[0]])
+        return res
+
 if __name__ == '__main__':
     m = Consensus()
-
+    m.output_gff=False
+    m.domain_algorithm='heuristic'
+    m.sorting_type='bitscore'
+    m.time_limit=60
+    m.mantis_ref_weights={'else':0.7}
+    m.overlap_value=0.1
+    m.best_combo_formula=1
+    m.no_consensus_expansion=False
+    m.minimum_consensus_overlap=0.7
+    m.no_unifunc=False
+    f1 = '/home/pedroq/Desktop/test_valentina/mantis_da_01/Bebs_MG_PBGL_ESB_S44/run01/'
+    f2 = '/home/pedroq/Desktop/test_valentina/mantis_da_02/Bebs_MG_PBGL_ESB_S44/run01/'
+    #m.generate_consensus_output(f'/home/pedroq/Desktop/test_valentina/test1/integrated_annotation.tsv',f'/home/pedroq/Desktop/test_valentina/test1/consensus_annotation.tsv')
+    #print('######################')
+    m.generate_consensus_output(f'{f1}integrated_annotation.tsv',f'/home/pedroq/Desktop/test_valentina/test1/consensus_annotation.tsv')
+    m.generate_consensus_output(f'{f2}integrated_annotation.tsv',f'/home/pedroq/Desktop/test_valentina/test2/consensus_annotation.tsv')
diff --git a/mantis/Metadata.py b/mantis/Metadata.py
index 554d309..701c487 100644
--- a/mantis/Metadata.py
+++ b/mantis/Metadata.py
@@ -154,7 +154,7 @@ def generate_interpreted_line(self, query, ref_file, link, evalue, bitscore, dir
         if 'accession' in temp_link:
             hit_accession = temp_link.pop('accession')
         row_start = [query, ref_file, hit, hit_accession, evalue, bitscore,direction, query_len, query_start, query_end, ref_start, ref_end, ref_len, '|']
-        res = list(row_start)
+        res = []
         sorted_keys = sorted(temp_link.keys())
         if 'enzyme_ec' in sorted_keys:
             sorted_keys.remove('enzyme_ec')
@@ -172,6 +172,8 @@ def generate_interpreted_line(self, query, ref_file, link, evalue, bitscore, dir
             if isinstance(temp_link[link_key], str):
                 temp_link[link_key] = [temp_link[link_key]]
             for inner_l in temp_link[link_key]:
                 res.append(link_key + ':' + inner_l)
+        res=sorted(res)
+        res=row_start+res
         return res

     def read_and_interpret_output_annotation(self, output_annotation_tsv):
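
Note (editor's addition, not part of the patch): in both files the change keeps the fixed columns from row_start in their original positions and only sorts the annotation links that follow the '|' separator before prepending row_start. A minimal standalone Python sketch of that assembly, using hypothetical column and link values rather than real Mantis output:

# Editor's sketch (not part of the patch): illustrates the new output ordering.
# row_start and links below are hypothetical values, not real Mantis output.

def build_output_line(row_start, links):
    # previously: res = list(row_start), with links appended in collection order
    # now: links are sorted first, then the fixed row_start columns are prepended
    res = sorted(links)
    res = row_start + res
    return res

if __name__ == '__main__':
    row_start = ['query_1', 'ref_db', 'hit_1', 'ACC123', '1e-30', '250.0', '|']
    links = ['kegg_ko:K00001', 'description:alcohol dehydrogenase', 'enzyme_ec:1.1.1.1']
    print('\t'.join(build_output_line(row_start, links)))
    # query_1  ref_db  hit_1  ACC123  1e-30  250.0  |  description:alcohol dehydrogenase  enzyme_ec:1.1.1.1  kegg_ko:K00001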