changed km to actually represent kegg matrix completeness; added verbose mode

Former-commit-id: 231a762
PedroMTQ · Aug 29, 2021 · 6533ab7 · 6533ab7
1 parent 545f513
commit 6533ab7
Show file tree

Hide file tree

Showing 8 changed files with 356 additions and 56 deletions.
diff --git a/Resources/KEGG/get_pickle.py b/Resources/KEGG/get_pickle.py
@@ -4,10 +4,171 @@
 import json
 import re
 import os
+import copy
+
+
+#this class is based on https://github.com/merenlab/anvio
+class anvio_kegg_paths():
+    """Unrolls a KEGG module definition string into all of its alternative paths.
+
+    In a definition, spaces separate consecutive steps, commas separate alternatives, parentheses
+    group sub-definitions and '+'/'-' join the components of a protein complex.
+    """
+
+    def _split_step(self, step, delim):
+        """Splits a step on delim, ignoring one pair of surrounding parentheses when possible.
+        The surrounding parentheses are removed first; if that leaves the parentheses unbalanced
+        (e.g. "(A,B) (C,D)"), the step is split as it is instead (the only way to figure this out is
+        to try it because regex cannot handle nested parentheses).
+        PARAMETERS
+        ==========
+        step : str
+            step definition to split
+        delim : str or list of str
+            delimiter(s) to split on
+        RETURNS
+        =======
+        splits : list
+            strings that were split from step
+        RAISES
+        ======
+        ValueError
+            if the parentheses of the step are unbalanced either way
+        """
+
+        splits = False
+        if step.startswith("(") and step.endswith(")"):
+            splits = self.split_by_delim_not_within_parens(step[1:-1], delim)
+        if splits is False:  # if it doesn't work, try without removing surrounding parentheses
+            splits = self.split_by_delim_not_within_parens(step, delim)
+        if splits is False:
+            raise ValueError(f"Unbalanced parentheses in module definition step: {step!r}")
+        return splits
+
+    def _is_atomic_step(self, s):
+        """Returns True if s is a single KO, module number, '--', or nonessential KO starting with '-'."""
+
+        return (len(s) == 6 and s[0] in ("K", "M")) or s == "--" or (len(s) == 7 and s[0] == "-")
+
+    def _unroll_complex(self, s):
+        """Returns every alternative string of the protein complex s, keeping its +/- structure the same."""
+
+        complex_components, delimiters = self.split_by_delim_not_within_parens(s, ["+", "-"], return_delims=True)
+        complex_strs = [""]
+        for i, c in enumerate(complex_components):
+            if c.startswith("("):
+                alts = self.split_path(c)
+                for a in alts:
+                    # a component of a complex has to unroll into exactly one step per alternative
+                    if len(a) != 1:
+                        raise ValueError(f"Cannot unroll component {c!r} of protein complex {s!r} into single steps")
+                complex_strs = [cs + a[0] for a in alts for cs in complex_strs]
+            else:
+                complex_strs = [cs + c for cs in complex_strs]
+
+            if i < len(delimiters):
+                complex_strs = [cs + delimiters[i] for cs in complex_strs]
+        return complex_strs
+
+    def split_path(self, step):
+        """This function handles compound steps that should be split into multiple alternative paths.
+        It first splits the input step into substeps (on commas that are not within parentheses), and then
+        since each substep could be its own mini-definition, it recursively calls the definition unrolling
+        function to parse it. The list of all alternative paths that can be made from this step is returned.
+        Raises ValueError if the parentheses of the step are unbalanced.
+        """
+
+        alt_path_list = []
+        for substep in self._split_step(step, ","):
+            alt_path_list.extend(self.recursive_definition_unroller(substep))
+        return alt_path_list
+
+    def split_by_delim_not_within_parens(self, d, delims, return_delims=False):
+        """Takes a string, and splits it on the given delimiter(s) as long as the delimiter is not within parentheses.
+        This function exists because regular expressions don't handle nested parentheses very well. It is used in the
+        recursive module definition unrolling functions to split module steps, but it is generically written in case
+        it could have other uses in the future.
+        The function can also be used to detect a closing parenthesis that has no matching opening one (it will
+        return False instead of the list of splits in this situation, even if return_delims is True)
+        PARAMETERS
+        ==========
+        d : str
+            string to split
+        delims : str or list of str
+            a single delimiter, or a list of delimiters, to split on
+        return_delims : boolean
+            if this is true then the list of delimiters found between each split is also returned
+        RETURNS
+        =======
+        If a closing parenthesis is found before its opening one, this function returns False. Otherwise:
+        splits : list
+            strings that were split from d
+        delim_list : list
+            delimiters that were found between each split (only returned if return_delims is True)
+        """
+
+        parens_level = 0
+        last_split_index = 0
+        splits = []
+        delim_list = []
+        for i, char in enumerate(d):
+            # only split if not within parentheses
+            if char in delims and parens_level == 0:
+                splits.append(d[last_split_index:i])
+                delim_list.append(char)
+                last_split_index = i + 1  # we add 1 here to skip the delimiter
+            elif char == "(":
+                parens_level += 1
+            elif char == ")":
+                parens_level -= 1
+
+            # if parentheses become unbalanced, return False to indicate this
+            if parens_level < 0:
+                return False
+        splits.append(d[last_split_index:])
+
+        if return_delims:
+            return splits, delim_list
+        return splits
+
+    def recursive_definition_unroller(self, step):
+        """This function recursively splits a module definition into its components.
+        First, the definition is split into its component steps (separated by spaces).
+        Each step is either an atomic step (a single KO, module number, '--', or nonessential KO starting with '-'),
+        a protein complex, or a compound step.
+        Atomic steps are used to extend each path that has been found so far. Protein complexes are split into
+        their respective components, which may be split further by the split_path() function to find all possible
+        alternative complexes, before being used to extend each path. Compound steps are split and recursively processed
+        by the split_path() function before the resulting downstream paths are used to extend each path.
+        PARAMETERS
+        ==========
+        step : str
+            step definition to split into component steps as necessary
+        RETURNS
+        =======
+        paths_list : list
+            all paths that the input step has been unrolled into
+        RAISES
+        ======
+        ValueError
+            if the parentheses of the definition are unbalanced, or a protein complex cannot be unrolled
+        """
+
+        split_steps = self.split_by_delim_not_within_parens(step, " ")
+        if split_steps is False:
+            raise ValueError(f"Unbalanced parentheses in module definition: {step!r}")
+
+        paths_list = [[]]  # list to save all paths, with initial empty path list to extend from
+        for s in split_steps:
+            # repeated, leading or trailing spaces produce empty steps, which carry no information
+            if not s:
+                continue
+
+            # base case: step is a ko, mnum, non-essential step, or '--'
+            if self._is_atomic_step(s):
+                for p in paths_list:
+                    p.append(s)
+                continue
+
+            comma_substeps = self._split_step(s, ",")
+            space_substeps = self._split_step(s, " ")
+            if len(comma_substeps) == 1 and len(space_substeps) == 1:
+                # complex case: no commas OR spaces outside parentheses so this is a protein complex rather than a compound step
+                downstream_paths = [[cs] for cs in self._unroll_complex(s)]
+            else:
+                # compound step case:
+                downstream_paths = self.split_path(s)
+
+            # every path found so far is continued by every alternative of the current step
+            paths_list = [p + a for a in downstream_paths for p in paths_list]
+
+        return paths_list
+
 
-#from https://www.genome.jp/kegg-bin/show_brite?ko00002.keg
-kegg_module='modules.json'
-pickle_path='modules.pickle'
 
 def save_metrics(pickle_path,to_pickle):
     with open(pickle_path, 'wb') as handle:
@@ -19,17 +180,36 @@ def load_metrics(pickle_path):
             pickled_results= pickle.load(handle)
             return pickled_results
 
-def find_ko(string_to_search):
-    res=set()
-    #I could do upper and lower case but since it's only one letter, it's not very safe...
-    pattern = re.compile('(?<![A-Za-z])K\d{4,}')
-    search= re.finditer(pattern,string_to_search)
-    for i in search:
-        res.add(i.group())
+
+def remove_non_essential_kos(ko_str):
+    """Strips the non-essential KOs (a '-' followed by 'K' and 5 digits) from every step of a path.
+
+    ko_str: iterable of step strings, e.g. one path returned by recursive_definition_unroller
+    returns: list with one string per input step, in the same order; a step made only of
+             non-essential KOs becomes an empty string
+    """
+    # raw string, otherwise '\d' is an invalid escape sequence in the string literal
+    re_pattern=re.compile(r'-K\d{5}')
+    return [re_pattern.sub('',step) for step in ko_str]
+
+def get_sets_module(string_to_search):
+    """Extracts the module definition from a piece of HTML and returns one set of KOs per alternative path.
+
+    string_to_search: HTML text; the definition is taken as the text between the last 'hidden">'
+                      and the first '<br>' that follows it, with '<wbr>' tags removed
+    returns: list of sets, one per path of the unrolled definition; each set holds the '+'-separated
+             components of every step of the path, after removing the non-essential KOs
+    """
+    module_str=string_to_search.split('hidden">')[-1].split('<br>')[0].strip().replace('<wbr>','')
+    # the unroller is created here instead of read from a global, since the global anvio_instance
+    # is only bound when this file is executed as a script
+    all_paths=anvio_kegg_paths().recursive_definition_unroller(module_str)
+    res=[]
+    for path in all_paths:
+        temp=set()
+        for step in remove_non_essential_kos(path):
+            # a step made only of non-essential KOs is an empty string by now, and '' is not a KO
+            temp.update(step_module for step_module in step.split('+') if step_module)
+        res.append(temp)
     return res
 
+
+
 def get_ko_from_module(module_id):
     url = 'https://www.genome.jp/dbget-bin/www_bget?md:' + module_id
+    print(f'Getting module {module_id}')
     webpage = None
     c = 0
     while not webpage and c <= 10:
@@ -42,8 +222,8 @@ def get_ko_from_module(module_id):
         webpage=webpage[start:]
         end=re.search('</div></div></td></tr>',webpage).span()[0]
         webpage=webpage[:end]
-        ko_set=find_ko(webpage)
-    return ko_set
+        ko_str=get_sets_module(webpage)
+    return ko_str
 
 
 
@@ -59,6 +239,27 @@ def read_modules(file_path):
             if sub_path_name not in tree_modules[main_path_name]: tree_modules[main_path_name][sub_path_name] = {}
             modules=sub_path['children']
             for module in modules:
-                module_name=module['name'].split()[0]
-                tree_modules[main_path_name][sub_path_name][module_name]=get_ko_from_module(module_name)
-    return tree_modules
+                module_name=module['name'].split('[')[0]
+                module_id=module_name.split()[0]
+                module_name=module_name.replace(module_id,'').strip()
+                tree_modules[main_path_name][sub_path_name][module_id]=[module_name,get_ko_from_module(module_id)]
+
+    return tree_modules
+
+
+
+
+
+
+def main(kegg_module,pickle_path):
+    """Reads the modules tree from the kegg_module file and pickles it into pickle_path."""
+    save_metrics(pickle_path,read_modules(kegg_module))
+
+
+
+if __name__ == '__main__':
+    # from https://www.genome.jp/kegg-bin/show_brite?ko00002.keg
+    # both paths are relative, so they are resolved against the current working directory
+    kegg_module = 'modules.json'
+    pickle_path = 'modules.pickle'
+    # module-level name: it is only bound when this file is executed as a script
+    anvio_instance = anvio_kegg_paths()
+    main(kegg_module,pickle_path)
diff --git a/Resources/KEGG/modules.pickle b/Resources/KEGG/modules.pickle
diff --git a/__main__.py b/__main__.py
@@ -1,18 +1,24 @@
-import argparse
-import os
-from datetime import datetime
-import sys
-import uuid
-
-from source.MANTIS import run_mantis, run_mantis_test,print_citation_mantis
-from source.MANTIS_NLP import test_nlp
-from source.MANTIS_Assembler import add_slash, \
-    get_path_level, \
-    check_installation, \
-    extract_nog_metadata, \
-    setup_databases, \
-    merge_hmm_folder
-from source.utils import MANTIS_FOLDER
+try:
+    import argparse
+    import os
+    from datetime import datetime
+    import sys
+    import uuid
+
+    from source.MANTIS import run_mantis, run_mantis_test,print_citation_mantis
+    from source.MANTIS_NLP import test_nlp
+    from source.MANTIS_Assembler import add_slash, \
+        get_path_level, \
+        check_installation, \
+        extract_nog_metadata, \
+        setup_databases, \
+        merge_hmm_folder
+    from source.utils import MANTIS_FOLDER
+except ImportError as e:
+    # os is imported here as well, so that it is bound even if the failing import came before it
+    import os
+    import signal
+    master_pid = os.getpid()
+    # report which import failed, and flush since SIGKILL ends the process before buffers are written
+    print('Import Error!', e, flush=True)
+    # NOTE(review): killing our own process skips any cleanup - presumably deliberate; confirm.
+    # signal.SIGKILL is documented as Unix-only.
+    os.kill(master_pid, signal.SIGKILL)
 
 
 if __name__ == '__main__':
@@ -75,6 +81,8 @@
                         help='[optional]\tdo not expand hits during consensus generation.')
     parser.add_argument('-km', '--kegg_matrix', action='store_true',
                         help='[optional]\tgenerate KEGG modules completeness matrix.')
+    parser.add_argument('-vkm', '--verbose_kegg_matrix', action='store_true',
+                        help='[optional]\tgenerate KEGG modules completeness matrix in verbose mode. Verbose mode gives, in addition to the default matrix, complete module name and missing KOs; it also exports a summary figure.')
     parser.add_argument('-fo', '--force_output', action='store_true',
                         help='[optional]\tIf you would like to force the output to the folder you specified. This may result in errrors!')
     #setup databases
@@ -126,6 +134,7 @@
         no_consensus_expansion = args.no_consensus_expansion
         no_unifunc = args.no_unifunc
         kegg_matrix = args.kegg_matrix
+        verbose_kegg_matrix = args.verbose_kegg_matrix
         force_output = args.force_output
         default_workers = args.default_workers
         chunk_size = args.chunk_size
@@ -164,6 +173,7 @@
                            no_consensus_expansion=no_consensus_expansion,
                            no_unifunc=no_unifunc,
                            kegg_matrix=kegg_matrix,
+                           verbose_kegg_matrix=verbose_kegg_matrix,
                            default_workers=default_workers,
                            chunk_size=chunk_size,
                            time_limit=time_limit,

diff --git a/source/Exceptions.py b/source/Exceptions.py
@@ -1,6 +1,7 @@
 RequirementsNotMet='Installation check not passed! Make sure you\'ve setup the databases and your system meets all the requirements!'
 NoValidFiles='No valid files to annotate'
 InvalidTargetFile='You did not insert a valid target file!\n'
+InvalidFastaFormat='Fasta format is not valid!\n'
 InstallationCheckNotPassed='Installation check not passed! Make sure you\'ve setup the databases and your system meets all the requirements!'
 CythonNotCompiled= 'Cython has not been correctly compiled! Please go to mantis/source/ and run python utils.py'
 BadNumberWorkers='You should not be seeing this, please contact the developer. Invalid number of workers in '

diff --git a/source/MANTIS.py b/source/MANTIS.py
@@ -29,6 +29,7 @@ def run_mantis(target_path,
                no_consensus_expansion=False,
                no_unifunc=False,
                kegg_matrix=False,
+               verbose_kegg_matrix=False,
                verbose=True,
                default_workers=None,
                chunk_size=None,
@@ -67,6 +68,7 @@ def run_mantis(target_path,
         no_consensus_expansion=no_consensus_expansion,
         no_unifunc=no_unifunc,
         kegg_matrix=kegg_matrix,
+        verbose_kegg_matrix=verbose_kegg_matrix,
         verbose=verbose,
         default_workers=default_workers,
         chunk_size=chunk_size,
@@ -131,6 +133,7 @@ def __init__(self,
                  no_consensus_expansion=False,
                  no_unifunc=False,
                  kegg_matrix=False,
+                 verbose_kegg_matrix=False,
                  verbose=True,
                  default_workers=None,
                  chunk_size=None,
@@ -179,6 +182,8 @@ def __init__(self,
         self.no_consensus_expansion = no_consensus_expansion
         self.no_unifunc = no_unifunc
         self.kegg_matrix = kegg_matrix
+        self.verbose_kegg_matrix = verbose_kegg_matrix
+        if self.verbose_kegg_matrix: self.kegg_matrix=True
         self.default_workers = default_workers
         self.user_memory = user_memory
         # chunk size is highly relevant in the execution time
@@ -212,6 +217,14 @@ def print_available_hardware(self):
         print(f'Workers per core: {WORKER_PER_CORE}')
 
     def __str__(self):
+        if self.kegg_matrix and not self.verbose_kegg_matrix:
+            kegg_matrix_str='Generate KEGG modules matrix:\t' + str(self.kegg_matrix) + '\n'
+        elif self.kegg_matrix and self.verbose_kegg_matrix:
+            kegg_matrix_str='Generate KEGG modules matrix in verbose mode:\t' + str(self.verbose_kegg_matrix) + '\n'
+        else:
+            kegg_matrix_str=''
+
+
         output_list = [
             'Output folder:\t\t\t' + str(self.output_folder) + '\n' if self.output_folder else '',
             'Mantis config:\t\t\t' + str(self.mantis_config) + '\n' if self.mantis_config else '',
@@ -230,7 +243,7 @@ def __str__(self):
             'Skip memory management:\t\t' + str(self.skip_managed_memory) + '\n' if self.skip_managed_memory else '',
             'Skip consensus expansion:\t' + str(self.no_consensus_expansion) + '\n' if self.no_consensus_expansion else '',
             'Skip text similarity analysis:\t' + str(self.no_unifunc) + '\n' if self.no_unifunc else '',
-            'Generate KEGG modules matrix:\t' + str(self.kegg_matrix) + '\n' if self.kegg_matrix else '',
+            kegg_matrix_str,
             '------------------------------------------']
         return 'User configuration:' + '\n' + '------------------------------------------' + '\n' + ''.join(output_list)
 
@@ -503,5 +516,4 @@ def run_mantis(self):
 
 if __name__ == '__main__':
     m = MANTIS()
-    essential_genes = m.get_essential_genes_list()
-    print(essential_genes)
+