Skip to content

Commit

Permalink
changed km to actually represent kegg matrix completeness; added verb…
Browse files Browse the repository at this point in the history
…ose mode

Former-commit-id: 231a762
  • Loading branch information
PedroMTQ committed Aug 29, 2021
1 parent 545f513 commit 6533ab7
Show file tree
Hide file tree
Showing 8 changed files with 356 additions and 56 deletions.
231 changes: 216 additions & 15 deletions Resources/KEGG/get_pickle.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,171 @@
import json
import re
import os
import copy


#this class is based on https://github.com/merenlab/anvio
class anvio_kegg_paths():
    """Unroll KEGG module definition strings into explicit alternative paths.

    The parsing logic is based on anvi'o (https://github.com/merenlab/anvio).
    In the definition syntax handled here, spaces separate consecutive steps,
    commas separate alternatives, parentheses group sub-definitions and '+'/'-'
    join the components of a protein complex. The class keeps no instance state.
    """

    def _split_outside_parens(self, step, delim):
        """Split `step` on `delim`, looking through one pair of enclosing parentheses when possible.

        Stripping the outer parentheses can unbalance the string (e.g. "(a,b)+(c,d)"),
        and the only way to find out is to try it; in that case the unstripped string
        is split instead.

        RAISES
        ======
        ValueError
            if the parentheses of `step` are unbalanced
        """
        substeps = False
        if step.startswith("(") and step.endswith(")"):
            substeps = self.split_by_delim_not_within_parens(step[1:-1], delim)
        if not substeps:  # if it doesn't work, try without removing surrounding parentheses
            substeps = self.split_by_delim_not_within_parens(step, delim)
        if substeps is False:
            raise ValueError(f"Unbalanced parentheses in module definition step: {step!r}")
        return substeps

    def split_path(self, step):
        """Split a compound step into all of the alternative paths it describes.

        The step is first split on the commas that are not nested inside parentheses.
        Each substep can be a mini-definition of its own, so it is unrolled recursively.

        PARAMETERS
        ==========
        step : str
            compound step, optionally wrapped in one pair of parentheses
        RETURNS
        =======
        alt_path_list : list
            every alternative path (a list of steps) that can be made from this step
        RAISES
        ======
        ValueError
            if the parentheses of `step` are unbalanced
        """
        alt_path_list = []
        for s in self._split_outside_parens(step, ","):
            alt_path_list.extend(self.recursive_definition_unroller(s))
        return alt_path_list

    def split_by_delim_not_within_parens(self, d, delims, return_delims=False):
        """Split a string on the given delimiter(s) wherever the delimiter is not within parentheses.

        This function exists because regular expressions don't handle nested parentheses very well.
        It is used by the module definition unrolling functions to split module steps.
        It can also detect a closing parenthesis that has no matching opening one: in that
        situation it returns False instead of the list of splits.

        PARAMETERS
        ==========
        d : str
            string to split
        delims : str or list of str
            a single delimiter, or a list of delimiters, to split on
        return_delims : boolean
            if this is true then the list of delimiters found between each split is also returned
        RETURNS
        =======
        False if a ')' appears before its matching '('. Otherwise:
        splits : list
            strings that were split from d
        delim_list : list
            delimiters that were found between each split (only returned if return_delims is True)
        """
        parens_level = 0
        last_split_index = 0
        splits = []
        delim_list = []
        for i, char in enumerate(d):
            # only split if not within parentheses
            if char in delims and parens_level == 0:
                splits.append(d[last_split_index:i])
                delim_list.append(char)
                last_split_index = i + 1  # resume right after the delimiter
            elif char == "(":
                parens_level += 1
            elif char == ")":
                parens_level -= 1

            # a ')' without a matching '(' means the string is unbalanced
            if parens_level < 0:
                return False
        splits.append(d[last_split_index:])

        if return_delims:
            return splits, delim_list
        return splits

    def _unroll_complex(self, s):
        """Return every alternative spelling of protein complex `s`, keeping its +/- structure."""
        split_result = self.split_by_delim_not_within_parens(s, ["+", "-"], return_delims=True)
        if split_result is False:
            raise ValueError(f"Unbalanced parentheses in protein complex: {s!r}")
        complex_components, delimiters = split_result

        complex_strs = [""]
        for i, c in enumerate(complex_components):
            # startswith() instead of c[0]: a complex that begins with a delimiter
            # (e.g. "-(K00001,K00002)") yields an empty first component
            if c.startswith("("):
                new_complex_strs = []
                for a in self.split_path(c):
                    # each alternative inside a complex must be one single component
                    if len(a) != 1:
                        raise ValueError(
                            f"Cannot unroll protein complex {s!r}: alternative {a!r} "
                            f"is not a single component")
                    new_complex_strs.extend(cs + a[0] for cs in complex_strs)
                complex_strs = new_complex_strs
            else:
                complex_strs = [cs + c for cs in complex_strs]

            # put back the delimiter that followed this component
            if i < len(delimiters):
                complex_strs = [cs + delimiters[i] for cs in complex_strs]
        return complex_strs

    def recursive_definition_unroller(self,step):
        """Recursively split a module definition into all of the paths it describes.

        The definition is first split into its component steps (separated by spaces).
        Each step is either an atomic step (a single KO, module number, '--', or nonessential
        KO starting with '-'), a protein complex, or a compound step.
        Atomic steps extend every path found so far. Protein complexes are rebuilt component by
        component (parenthesised components are expanded by split_path() into every alternative
        complex) before extending each path. Compound steps are split and recursively processed
        by split_path(), and each resulting path extends every path found so far.
        Empty steps (produced by repeated or surrounding spaces) are ignored.

        PARAMETERS
        ==========
        step : str
            step definition to split into component steps as necessary
        RETURNS
        =======
        paths_list : list
            all paths that the input step has been unrolled into
        RAISES
        ======
        ValueError
            if parentheses are unbalanced, or an alternative inside a protein complex
            is made of more than one step
        """
        split_steps = self.split_by_delim_not_within_parens(step, " ")
        if split_steps is False:
            raise ValueError(f"Unbalanced parentheses in module definition: {step!r}")

        paths_list = [[]]  # list to save all paths, with initial empty path list to extend from
        for s in split_steps:
            if not s:
                continue

            # base case: step is a ko, mnum, non-essential step, or '--'
            is_atomic = (
                (len(s) == 6 and s[0] in ("K", "M"))
                or s == "--"
                or (len(s) == 7 and s[0] == "-")
            )
            if is_atomic:
                for p in paths_list:
                    p.append(s)
                continue

            comma_substeps = self._split_outside_parens(s, ",")
            space_substeps = self._split_outside_parens(s, " ")

            if len(comma_substeps) == 1 and len(space_substeps) == 1:
                # complex case: no commas OR spaces outside parentheses,
                # so this is a protein complex rather than a compound step
                extensions = [[cs] for cs in self._unroll_complex(s)]
            else:
                # compound step case
                extensions = self.split_path(s)

            # every existing path is continued by every alternative
            paths_list = [p + ext for ext in extensions for p in paths_list]

        return paths_list


# Module-level file names.
# kegg_module: JSON file obtained from https://www.genome.jp/kegg-bin/show_brite?ko00002.keg
# pickle_path: file name for the pickled result
# NOTE(review): presumably these are the values handed to main(); confirm against the caller.
kegg_module='modules.json'
pickle_path='modules.pickle'

def save_metrics(pickle_path,to_pickle):
with open(pickle_path, 'wb') as handle:
Expand All @@ -19,17 +180,36 @@ def load_metrics(pickle_path):
pickled_results= pickle.load(handle)
return pickled_results

def find_ko(string_to_search):
    """Return the set of KO identifiers found in `string_to_search`.

    A KO identifier is an upper-case 'K' followed by at least four digits and
    not directly preceded by another ASCII letter.
    """
    # Only upper-case 'K' is matched: the prefix is a single letter, so also
    # accepting lower case would make false positives too likely.
    pattern = re.compile(r'(?<![A-Za-z])K\d{4,}')
    return {match.group() for match in pattern.finditer(string_to_search)}

def remove_non_essential_kos(ko_str):
    """Strip non-essential KOs from every step of a path.

    A non-essential KO is written as '-' followed by 'K' and five digits.

    ko_str: iterable of step strings (despite the name, not a single string).
    Returns a new list with one string per input step, in the same order; a step
    made only of non-essential KOs becomes an empty string.
    """
    # raw string: '\d' in a normal string literal is an invalid escape sequence
    re_pattern = re.compile(r'-K\d{5}')
    res = []
    for step in ko_str:
        temp = step
        for non_essential in re_pattern.findall(step):
            temp = temp.replace(non_essential, '')
        res.append(temp)
    return res

def get_sets_module(string_to_search):
    """Extract a module definition from an HTML fragment and return its KO sets.

    The definition is taken as the text after the last 'hidden">' marker and
    before the following '<br>', with '<wbr>' tags removed. It is unrolled into
    every alternative path; for each path the non-essential KOs are dropped and
    protein complexes are split on '+'.

    Returns a list with one set of step identifiers per alternative path.
    """
    module_str=string_to_search.split('hidden">')[-1].split('<br>')[0].strip().replace('<wbr>','')
    # Use a local unroller rather than the global `anvio_instance`, which is only
    # created when this file is run as a script (the class keeps no state).
    unroller = anvio_kegg_paths()
    res=[]
    for path in unroller.recursive_definition_unroller(module_str):
        required=set()
        for step in remove_non_essential_kos(path):
            # a step made only of non-essential KOs is left empty; an empty
            # identifier could never be matched, so it is not added to the set
            required.update(ko for ko in step.split('+') if ko)
        res.append(required)
    return res



def get_ko_from_module(module_id):
url = 'https://www.genome.jp/dbget-bin/www_bget?md:' + module_id
print(f'Getting module {module_id}')
webpage = None
c = 0
while not webpage and c <= 10:
Expand All @@ -42,8 +222,8 @@ def get_ko_from_module(module_id):
webpage=webpage[start:]
end=re.search('</div></div></td></tr>',webpage).span()[0]
webpage=webpage[:end]
ko_set=find_ko(webpage)
return ko_set
ko_str=get_sets_module(webpage)
return ko_str



Expand All @@ -59,6 +239,27 @@ def read_modules(file_path):
if sub_path_name not in tree_modules[main_path_name]: tree_modules[main_path_name][sub_path_name] = {}
modules=sub_path['children']
for module in modules:
module_name=module['name'].split()[0]
tree_modules[main_path_name][sub_path_name][module_name]=get_ko_from_module(module_name)
return tree_modules
module_name=module['name'].split('[')[0]
module_id=module_name.split()[0]
module_name=module_name.replace(module_id,'').strip()
tree_modules[main_path_name][sub_path_name][module_id]=[module_name,get_ko_from_module(module_id)]

return tree_modules






def main(kegg_module,pickle_path):
    """Read the modules from `kegg_module` and save the result to `pickle_path`."""
    save_metrics(pickle_path, read_modules(kegg_module))



if __name__ == '__main__':
    # get_sets_module() looks this instance up as a module-level global
    anvio_instance = anvio_kegg_paths()
    # input JSON comes from https://www.genome.jp/kegg-bin/show_brite?ko00002.keg
    kegg_module, pickle_path = 'modules.json', 'modules.pickle'
    main(kegg_module, pickle_path)
Binary file modified Resources/KEGG/modules.pickle
Binary file not shown.
40 changes: 25 additions & 15 deletions __main__.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,24 @@
import argparse
import os
from datetime import datetime
import sys
import uuid

from source.MANTIS import run_mantis, run_mantis_test,print_citation_mantis
from source.MANTIS_NLP import test_nlp
from source.MANTIS_Assembler import add_slash, \
get_path_level, \
check_installation, \
extract_nog_metadata, \
setup_databases, \
merge_hmm_folder
from source.utils import MANTIS_FOLDER
try:
    import argparse
    import os
    from datetime import datetime
    import sys
    import uuid

    from source.MANTIS import run_mantis, run_mantis_test,print_citation_mantis
    from source.MANTIS_NLP import test_nlp
    from source.MANTIS_Assembler import add_slash, \
        get_path_level, \
        check_installation, \
        extract_nog_metadata, \
        setup_databases, \
        merge_hmm_folder
    from source.utils import MANTIS_FOLDER
except ImportError as e:
    # Import what the handler needs here: `os` is bound inside the try block and
    # may not have been reached when the failing import came first.
    import os
    import signal
    master_pid = os.getpid()
    print('Import Error!')
    # report which import failed instead of discarding the exception
    print(e)
    # kill this process outright, as the original handler does
    os.kill(master_pid, signal.SIGKILL)


if __name__ == '__main__':
Expand Down Expand Up @@ -75,6 +81,8 @@
help='[optional]\tdo not expand hits during consensus generation.')
parser.add_argument('-km', '--kegg_matrix', action='store_true',
help='[optional]\tgenerate KEGG modules completeness matrix.')
parser.add_argument('-vkm', '--verbose_kegg_matrix', action='store_true',
help='[optional]\tgenerate KEGG modules completeness matrix in verbose mode. Verbose mode gives, in addition to the default matrix, complete module name and missing KOs; it also exports a summary figure.')
parser.add_argument('-fo', '--force_output', action='store_true',
help='[optional]\tIf you would like to force the output to the folder you specified. This may result in errrors!')
#setup databases
Expand Down Expand Up @@ -126,6 +134,7 @@
no_consensus_expansion = args.no_consensus_expansion
no_unifunc = args.no_unifunc
kegg_matrix = args.kegg_matrix
verbose_kegg_matrix = args.verbose_kegg_matrix
force_output = args.force_output
default_workers = args.default_workers
chunk_size = args.chunk_size
Expand Down Expand Up @@ -164,6 +173,7 @@
no_consensus_expansion=no_consensus_expansion,
no_unifunc=no_unifunc,
kegg_matrix=kegg_matrix,
verbose_kegg_matrix=verbose_kegg_matrix,
default_workers=default_workers,
chunk_size=chunk_size,
time_limit=time_limit,
Expand Down
1 change: 1 addition & 0 deletions source/Exceptions.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Message strings, one module-level constant per situation.
# NOTE(review): presumably used as error/exception text elsewhere in the project; confirm against callers.
# RequirementsNotMet and InstallationCheckNotPassed currently hold the same text.
# BadNumberWorkers ends with 'in ', so the caller presumably appends the location; verify.
RequirementsNotMet='Installation check not passed! Make sure you\'ve setup the databases and your system meets all the requirements!'
NoValidFiles='No valid files to annotate'
InvalidTargetFile='You did not insert a valid target file!\n'
InvalidFastaFormat='Fasta format is not valid!\n'
InstallationCheckNotPassed='Installation check not passed! Make sure you\'ve setup the databases and your system meets all the requirements!'
CythonNotCompiled= 'Cython has not been correctly compiled! Please go to mantis/source/ and run python utils.py'
BadNumberWorkers='You should not be seeing this, please contact the developer. Invalid number of workers in '
Expand Down
18 changes: 15 additions & 3 deletions source/MANTIS.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ def run_mantis(target_path,
no_consensus_expansion=False,
no_unifunc=False,
kegg_matrix=False,
verbose_kegg_matrix=False,
verbose=True,
default_workers=None,
chunk_size=None,
Expand Down Expand Up @@ -67,6 +68,7 @@ def run_mantis(target_path,
no_consensus_expansion=no_consensus_expansion,
no_unifunc=no_unifunc,
kegg_matrix=kegg_matrix,
verbose_kegg_matrix=verbose_kegg_matrix,
verbose=verbose,
default_workers=default_workers,
chunk_size=chunk_size,
Expand Down Expand Up @@ -131,6 +133,7 @@ def __init__(self,
no_consensus_expansion=False,
no_unifunc=False,
kegg_matrix=False,
verbose_kegg_matrix=False,
verbose=True,
default_workers=None,
chunk_size=None,
Expand Down Expand Up @@ -179,6 +182,8 @@ def __init__(self,
self.no_consensus_expansion = no_consensus_expansion
self.no_unifunc = no_unifunc
self.kegg_matrix = kegg_matrix
self.verbose_kegg_matrix = verbose_kegg_matrix
if self.verbose_kegg_matrix: self.kegg_matrix=True
self.default_workers = default_workers
self.user_memory = user_memory
# chunk size is highly relevant in the execution time
Expand Down Expand Up @@ -212,6 +217,14 @@ def print_available_hardware(self):
print(f'Workers per core: {WORKER_PER_CORE}')

def __str__(self):
if self.kegg_matrix and not self.verbose_kegg_matrix:
kegg_matrix_str='Generate KEGG modules matrix:\t' + str(self.kegg_matrix) + '\n'
elif self.kegg_matrix and self.verbose_kegg_matrix:
kegg_matrix_str='Generate KEGG modules matrix in verbose mode:\t' + str(self.verbose_kegg_matrix) + '\n'
else:
kegg_matrix_str=''


output_list = [
'Output folder:\t\t\t' + str(self.output_folder) + '\n' if self.output_folder else '',
'Mantis config:\t\t\t' + str(self.mantis_config) + '\n' if self.mantis_config else '',
Expand All @@ -230,7 +243,7 @@ def __str__(self):
'Skip memory management:\t\t' + str(self.skip_managed_memory) + '\n' if self.skip_managed_memory else '',
'Skip consensus expansion:\t' + str(self.no_consensus_expansion) + '\n' if self.no_consensus_expansion else '',
'Skip text similarity analysis:\t' + str(self.no_unifunc) + '\n' if self.no_unifunc else '',
'Generate KEGG modules matrix:\t' + str(self.kegg_matrix) + '\n' if self.kegg_matrix else '',
kegg_matrix_str,
'------------------------------------------']
return 'User configuration:' + '\n' + '------------------------------------------' + '\n' + ''.join(output_list)

Expand Down Expand Up @@ -503,5 +516,4 @@ def run_mantis(self):

if __name__ == '__main__':
m = MANTIS()
essential_genes = m.get_essential_genes_list()
print(essential_genes)

Loading

0 comments on commit 6533ab7

Please sign in to comment.