From 1ede47ac25337e2e225c7d42f95fc6509bd370f2 Mon Sep 17 00:00:00 2001 From: Feriolet Date: Sat, 11 May 2024 22:00:49 +0800 Subject: [PATCH 01/22] Implement Saturated Ring Sampling Conformation Feature --- easydock/preparation_for_docking.py | 203 ++++++++++++++++++++++++++-- easydock/vina_dock.py | 96 +++++++------ 2 files changed, 245 insertions(+), 54 deletions(-) diff --git a/easydock/preparation_for_docking.py b/easydock/preparation_for_docking.py index 3a27d17..7005951 100644 --- a/easydock/preparation_for_docking.py +++ b/easydock/preparation_for_docking.py @@ -8,7 +8,9 @@ RDKitMolCreate) from rdkit import Chem from rdkit.Chem import AllChem - +import numpy as np +from sklearn.cluster import AgglomerativeClustering +from rdkit.Chem.rdMolAlign import AlignMolConformers def cpu_type(x): return max(1, min(int(x), cpu_count())) @@ -35,13 +37,18 @@ def mol_from_smi_or_molblock(ligand_string): def mk_prepare_ligand(mol, verbose=False): + pdbqt_string_list = [] preparator = MoleculePreparation(hydrate=False, flexible_amides=False, rigid_macrocycles=True, min_ring_size=7, max_ring_size=33) try: - mol_setups = preparator.prepare(mol) - for setup in mol_setups: - pdbqt_string, is_ok, error_msg = PDBQTWriterLegacy.write_string(setup) - if verbose: - print(f"{setup}") + cids = [x.GetId() for x in mol.GetConformers()] + for cid in cids: + mol_setups = preparator.prepare(mol, conformer_id=cid) + + for setup in mol_setups: + pdbqt_string, is_ok, error_msg = PDBQTWriterLegacy.write_string(setup) + pdbqt_string_list.append(pdbqt_string) + if verbose: + print(f"{setup}") except Exception: sys.stderr.write( "Warning. Incorrect mol object to convert to pdbqt. Continue. \n" @@ -49,29 +56,201 @@ def mk_prepare_ligand(mol, verbose=False): traceback.print_exc() pdbqt_string = None - return pdbqt_string + return pdbqt_string_list + + +def GetConformerRMSFromAtomIds(mol, confId1, confId2, atomIds=None, prealigned=False): + """ Returns the RMS between two conformations based on the atomIds passed as the input. + By default, the conformers will be aligned to the first conformer + before the RMS calculation and, as a side-effect, the second will be left + in the aligned state. + Arguments: + - mol: the molecule + - confId1: the id of the first conformer + - confId2: the id of the second conformer + - atomIds: (optional) list of atom ids to use a points for + alingment **AND RMSD** calculation- defaults to all atoms + - prealigned: (optional) by default the conformers are assumed + be unaligned and the second conformer be aligned + to the first + + """ + # align the conformers if necessary + # Note: the reference conformer is always the first one + if not prealigned: + if atomIds: + AlignMolConformers(mol, confIds=[confId1, confId2], atomIds=atomIds) + else: + AlignMolConformers(mol, confIds=[confId1, confId2]) + + # calculate the RMS between the two conformations + conf1 = mol.GetConformer(id=confId1) + conf2 = mol.GetConformer(id=confId2) + ssr = 0 + for i in [id for id in atomIds]: + d = conf1.GetAtomPosition(i).Distance(conf2.GetAtomPosition(i)) + ssr += d * d + ssr /= mol.GetNumAtoms() + return np.sqrt(ssr) + +def GetConformerRMSMatrixForSaturatedRingMolecule(mol, atomIds=None, prealigned=False): + """ Returns the RMS matrix of the conformers of a molecule based on the alignment and rmsd of saturated ring. + The function calculates the mean of the RMSD of each saturated ring with the GetConformerRMSFromAtomIds. + The alignment is done per ring (for example, three alignments are done for three saturated ring) + + By default, the conformers will be aligned to the first conformer + before the RMS calculation and, as a side-effect, the second will be left + in the aligned state. + + As a side-effect, the conformers will be aligned to the first + conformer (i.e. the reference) and will left in the aligned state. + + Arguments: + - mol: the molecule + - atomIds: (optional) list of atom ids to use a points for + alingment - defaults to all atoms + - prealigned: (optional) by default the conformers are assumed + be unaligned and will therefore be aligned to the + first conformer + + Note that the returned RMS matrix is symmetrical, i.e. it is the + lower half of the matrix, e.g. for 5 conformers:: + + rmsmatrix = [ a, + b, c, + d, e, f, + g, h, i, j] + + where a is the RMS between conformers 0 and 1, b is the RMS between + conformers 0 and 2, etc. + This way it can be directly used as distance matrix in e.g. Butina + clustering. -def mol_embedding_3d(mol, seed=43): + """ - def gen_conf(mole, useRandomCoords, randomSeed): + cmat_list = [] + for atom_id_in_ring in atomIds: + # if necessary, align the conformers + # Note: the reference conformer is always the first one + rmsvals = [] + confIds = [conf.GetId() for conf in mol.GetConformers()] + if not prealigned: + if atom_id_in_ring: + AlignMolConformers(mol, atomIds=atom_id_in_ring, RMSlist=rmsvals) + else: + AlignMolConformers(mol, RMSlist=rmsvals) + else: # already prealigned + for i in range(1, len(confIds)): + rmsvals.append(GetConformerRMSFromAtomIds(mol, confIds[0], confIds[i], atomIds=atom_id_in_ring, prealigned=prealigned)) + # loop over the conformations (except the reference one) + cmat_per_ring = [] + for i in range(1, len(confIds)): + cmat_per_ring.append(rmsvals[i - 1]) + for j in range(1, i): + cmat_per_ring.append(GetConformerRMSFromAtomIds(mol, confIds[i], confIds[j], atomIds=atom_id_in_ring, prealigned=True)) + + cmat_list.append(np.array(cmat_per_ring)) + + cmat_list_array = np.array(cmat_list) + + return list(np.mean(cmat_list_array, axis=0)) + + +def mol_embedding_3d(mol: Chem.Mol, seed: int=43): + + def find_saturated_ring(mol: Chem.Mol): + atom_list = mol.GetAtoms() + ssr = Chem.GetSymmSSSR(mol) + saturated_ring_list = [] + for ring in ssr: + is_atom_saturated_array = np.array([atom_list[atom_id].GetHybridization() == Chem.HybridizationType.SP3 for atom_id in ring]) + is_ring_unsaturated = np.any(np.nonzero(is_atom_saturated_array==0)) + if is_ring_unsaturated: + continue + + saturated_ring_list.append(ring) + + return saturated_ring_list + + def gen_conf(mole: Chem.Mol, useRandomCoords: bool, randomSeed: int, has_saturated_ring: bool): params = AllChem.ETKDGv3() params.useRandomCoords = useRandomCoords params.randomSeed = randomSeed - conf_stat = AllChem.EmbedMolecule(mole, params) + if has_saturated_ring: + #10 is used as default according to the C language documentation iirc, but I have to specify the numbers. + conf_stat = AllChem.EmbedMultipleConfs(mole, 10, params) + else: + conf_stat = AllChem.EmbedMolecule(mole, params) return mole, conf_stat + + def remove_confs_rms(mol, saturated_ring_list, rms=0.25, keep_nconf=None): + """ + The function uses AgglomerativeClustering to select conformers. + + :param mol: input molecule with multiple conformers + :param rms: discard conformers which are closer than given value to a kept conformer + :param keep_nconf: keep at most the given number of conformers. This parameter has precedence over rms + :return: + """ + + def gen_ids(ids): + for i in range(1, len(ids)): + for j in range(0, i): + yield j, i + + if keep_nconf and mol.GetNumConformers() <= keep_nconf: + return mol + + if mol.GetNumConformers() <= 1: + return mol + + mol_tmp = Chem.RemoveHs(mol) # calc rms for heavy atoms only + rms_ = GetConformerRMSMatrixForSaturatedRingMolecule(mol_tmp, atomIds=saturated_ring_list, prealigned=False) + + cids = [c.GetId() for c in mol_tmp.GetConformers()] + arr = np.zeros((len(cids), len(cids))) + for (i, j), v in zip(gen_ids(cids), rms_): + arr[i, j] = v + arr[j, i] = v + if keep_nconf: + cl = AgglomerativeClustering(n_clusters=keep_nconf, linkage='complete', metric='precomputed').fit(arr) + else: + cl = AgglomerativeClustering(n_clusters=None, linkage='complete', metric='precomputed', distance_threshold=rms).fit(arr) + keep_ids = [] + for i in set(cl.labels_): + ids = np.where(cl.labels_ == i)[0] + j = arr[np.ix_(ids, ids)].mean(axis=0).argmin() + keep_ids.append(cids[j]) + remove_ids = set(cids) - set(keep_ids) + + for cid in sorted(remove_ids, reverse=True): + mol.RemoveConformer(cid) + + return mol + + if not isinstance(mol, Chem.Mol): return None + + saturated_ring = find_saturated_ring(mol) + has_saturated_ring = (len(saturated_ring)>0) + mol = Chem.AddHs(mol, addCoords=True) if not mol_is_3d(mol): # only for non 3D input structures - mol, conf_stat = gen_conf(mol, useRandomCoords=False, randomSeed=seed) + mol, conf_stat = gen_conf(mol, useRandomCoords=False, randomSeed=seed, has_saturated_ring=has_saturated_ring) if conf_stat == -1: # if molecule is big enough and rdkit cannot generate a conformation - use params.useRandomCoords = True - mol, conf_stat = gen_conf(mol, useRandomCoords=True, randomSeed=seed) + mol, conf_stat = gen_conf(mol, useRandomCoords=True, randomSeed=seed, has_saturated_ring=has_saturated_ring) if conf_stat == -1: return None AllChem.UFFOptimizeMolecule(mol, maxIters=100) + print(f"[For Testing Only] {mol.GetProp('_Name')} has {len(saturated_ring)} saturated ring") + print(f"[For Testing Only] Before removing conformation: {mol.GetProp('_Name')} has {mol.GetNumConformers()} conf") + mol = remove_confs_rms(mol, saturated_ring) + print(f"[For Testing Only] After removing conformation: {mol.GetProp('_Name')} has {mol.GetNumConformers()} conf") + return mol diff --git a/easydock/vina_dock.py b/easydock/vina_dock.py index 4bbe042..ffc8508 100755 --- a/easydock/vina_dock.py +++ b/easydock/vina_dock.py @@ -75,51 +75,63 @@ def mol_dock(mol, config): config = __parse_config(config) + + mol_id = mol.GetProp('_Name') - ligand_pdbqt = ligand_preparation(mol, boron_replacement=True) - if ligand_pdbqt is None: + ligand_pdbqt_list = ligand_preparation(mol, boron_replacement=True) + + if ligand_pdbqt_list is None: return mol_id, None - output_fd, output_fname = tempfile.mkstemp(suffix='_output.json', text=True) - ligand_fd, ligand_fname = tempfile.mkstemp(suffix='_ligand.pdbqt', text=True) - - try: - with open(ligand_fname, 'wt') as f: - f.write(ligand_pdbqt) - - p = os.path.realpath(__file__) - python_exec = sys.executable - cmd = f'{python_exec} {os.path.dirname(p)}/vina_dock_cli.py -l {ligand_fname} -p {config["protein"]} ' \ - f'-o {output_fname} --center {" ".join(map(str, config["center"]))} ' \ - f'--box_size {" ".join(map(str, config["box_size"]))} ' \ - f'-e {config["exhaustiveness"]} --seed {config["seed"]} --nposes {config["n_poses"]} -c {config["ncpu"]}' - start_time = timeit.default_timer() - subprocess.run(cmd, shell=True, check=True, capture_output=True, text=True) # this will trigger CalledProcessError and skip next lines) - dock_time = round(timeit.default_timer() - start_time, 1) - - with open(output_fname) as f: - res = f.read() - if res: - res = json.loads(res) - mol_block = pdbqt2molblock(res['poses'].split('MODEL')[1], mol, mol_id) - output = {'docking_score': res['docking_score'], - 'pdb_block': res['poses'], - 'mol_block': mol_block, - 'dock_time': dock_time} - - except subprocess.CalledProcessError as e: - sys.stderr.write(f'Error caused by docking of {mol_id}\n') - sys.stderr.write(str(e) + '\n') - sys.stderr.write('STDERR output:\n') - sys.stderr.write(e.stderr + '\n') - sys.stderr.flush() - output = None - - finally: - os.close(output_fd) - os.close(ligand_fd) - os.unlink(ligand_fname) - os.unlink(output_fname) + dock_output_conformer_list = [] + start_time = timeit.default_timer() + for ligand_pdbqt in ligand_pdbqt_list: + output_fd, output_fname = tempfile.mkstemp(suffix='_output.json', text=True) + ligand_fd, ligand_fname = tempfile.mkstemp(suffix='_ligand.pdbqt', text=True) + + try: + with open(ligand_fname, 'wt') as f: + f.write(ligand_pdbqt) + + p = os.path.realpath(__file__) + python_exec = sys.executable + cmd = f'{python_exec} {os.path.dirname(p)}/vina_dock_cli.py -l {ligand_fname} -p {config["protein"]} ' \ + f'-o {output_fname} --center {" ".join(map(str, config["center"]))} ' \ + f'--box_size {" ".join(map(str, config["box_size"]))} ' \ + f'-e {config["exhaustiveness"]} --seed {config["seed"]} --nposes {config["n_poses"]} -c {config["ncpu"]}' + subprocess.run(cmd, shell=True, check=True, capture_output=True, text=True) # this will trigger CalledProcessError and skip next lines) + + with open(output_fname) as f: + res = f.read() + if res: + res = json.loads(res) + mol_block = pdbqt2molblock(res['poses'].split('MODEL')[1], mol, mol_id) + output = {'docking_score': res['docking_score'], + 'pdb_block': res['poses'], + 'mol_block': mol_block} + + dock_output_conformer_list.append(output) + + except subprocess.CalledProcessError as e: + sys.stderr.write(f'Error caused by docking of {mol_id}\n') + sys.stderr.write(str(e) + '\n') + sys.stderr.write('STDERR output:\n') + sys.stderr.write(e.stderr + '\n') + sys.stderr.flush() + output = None + + finally: + os.close(output_fd) + os.close(ligand_fd) + os.unlink(ligand_fname) + os.unlink(output_fname) + + dock_time = round(timeit.default_timer() - start_time, 1) + print(f'[For Testing Only Sanity check]: There are {len(dock_output_conformer_list)} {mol_id} conformers that has been docked') + print(f'\n') + docking_score_list = [float(conformer_output['docking_score']) for conformer_output in dock_output_conformer_list] + output = dock_output_conformer_list[docking_score_list.index(min(docking_score_list))] + output['dock_time'] = dock_time return mol_id, output From 99c5fc889d47b3ab37f8c2f17be949073311850c Mon Sep 17 00:00:00 2001 From: Feriolet Date: Sat, 11 May 2024 22:30:54 +0800 Subject: [PATCH 02/22] Change [id for id in atomIds] to atomIds in GetConformerRMSFromAtomIds --- easydock/preparation_for_docking.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/easydock/preparation_for_docking.py b/easydock/preparation_for_docking.py index 7005951..9efea4a 100644 --- a/easydock/preparation_for_docking.py +++ b/easydock/preparation_for_docking.py @@ -88,7 +88,7 @@ def GetConformerRMSFromAtomIds(mol, confId1, confId2, atomIds=None, prealigned=F conf1 = mol.GetConformer(id=confId1) conf2 = mol.GetConformer(id=confId2) ssr = 0 - for i in [id for id in atomIds]: + for i in atomIds: d = conf1.GetAtomPosition(i).Distance(conf2.GetAtomPosition(i)) ssr += d * d ssr /= mol.GetNumAtoms() From e474dc4425a720e1441604c60476403ea4c2f308 Mon Sep 17 00:00:00 2001 From: Feriolet Date: Sat, 11 May 2024 22:39:52 +0800 Subject: [PATCH 03/22] Type hints for created functions --- easydock/preparation_for_docking.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/easydock/preparation_for_docking.py b/easydock/preparation_for_docking.py index 9efea4a..61992a4 100644 --- a/easydock/preparation_for_docking.py +++ b/easydock/preparation_for_docking.py @@ -11,6 +11,7 @@ import numpy as np from sklearn.cluster import AgglomerativeClustering from rdkit.Chem.rdMolAlign import AlignMolConformers +from typing import Optional def cpu_type(x): return max(1, min(int(x), cpu_count())) @@ -59,7 +60,7 @@ def mk_prepare_ligand(mol, verbose=False): return pdbqt_string_list -def GetConformerRMSFromAtomIds(mol, confId1, confId2, atomIds=None, prealigned=False): +def GetConformerRMSFromAtomIds(mol: Chem.Mol, confId1: int, confId2: int, atomIds: Optional[bool | list[int]]=None, prealigned: bool=False): """ Returns the RMS between two conformations based on the atomIds passed as the input. By default, the conformers will be aligned to the first conformer before the RMS calculation and, as a side-effect, the second will be left @@ -94,7 +95,7 @@ def GetConformerRMSFromAtomIds(mol, confId1, confId2, atomIds=None, prealigned=F ssr /= mol.GetNumAtoms() return np.sqrt(ssr) -def GetConformerRMSMatrixForSaturatedRingMolecule(mol, atomIds=None, prealigned=False): +def GetConformerRMSMatrixForSaturatedRingMolecule(mol: Chem.Mol, atomIds:list[list[int]]=None, prealigned: bool=False): """ Returns the RMS matrix of the conformers of a molecule based on the alignment and rmsd of saturated ring. The function calculates the mean of the RMSD of each saturated ring with the GetConformerRMSFromAtomIds. The alignment is done per ring (for example, three alignments are done for three saturated ring) @@ -184,7 +185,7 @@ def gen_conf(mole: Chem.Mol, useRandomCoords: bool, randomSeed: int, has_saturat conf_stat = AllChem.EmbedMolecule(mole, params) return mole, conf_stat - def remove_confs_rms(mol, saturated_ring_list, rms=0.25, keep_nconf=None): + def remove_confs_rms(mol: Chem.Mol, saturated_ring_list: list[list[int]], rms: float=0.25, keep_nconf: Optional[bool | int]=None): """ The function uses AgglomerativeClustering to select conformers. From d763851fbd578fe705043f704dbb327e4497f67c Mon Sep 17 00:00:00 2001 From: Feriolet Date: Sat, 11 May 2024 22:45:00 +0800 Subject: [PATCH 04/22] Type hints for created functions --- easydock/preparation_for_docking.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/easydock/preparation_for_docking.py b/easydock/preparation_for_docking.py index 61992a4..d17c6b0 100644 --- a/easydock/preparation_for_docking.py +++ b/easydock/preparation_for_docking.py @@ -11,7 +11,7 @@ import numpy as np from sklearn.cluster import AgglomerativeClustering from rdkit.Chem.rdMolAlign import AlignMolConformers -from typing import Optional +from typing import Optional, Iterator def cpu_type(x): return max(1, min(int(x), cpu_count())) @@ -60,7 +60,7 @@ def mk_prepare_ligand(mol, verbose=False): return pdbqt_string_list -def GetConformerRMSFromAtomIds(mol: Chem.Mol, confId1: int, confId2: int, atomIds: Optional[bool | list[int]]=None, prealigned: bool=False): +def GetConformerRMSFromAtomIds(mol: Chem.Mol, confId1: int, confId2: int, atomIds: Optional[bool | list[int]]=None, prealigned: bool=False) -> float: """ Returns the RMS between two conformations based on the atomIds passed as the input. By default, the conformers will be aligned to the first conformer before the RMS calculation and, as a side-effect, the second will be left @@ -95,7 +95,7 @@ def GetConformerRMSFromAtomIds(mol: Chem.Mol, confId1: int, confId2: int, atomId ssr /= mol.GetNumAtoms() return np.sqrt(ssr) -def GetConformerRMSMatrixForSaturatedRingMolecule(mol: Chem.Mol, atomIds:list[list[int]]=None, prealigned: bool=False): +def GetConformerRMSMatrixForSaturatedRingMolecule(mol: Chem.Mol, atomIds:list[list[int]]=None, prealigned: bool=False) -> list[float]: """ Returns the RMS matrix of the conformers of a molecule based on the alignment and rmsd of saturated ring. The function calculates the mean of the RMSD of each saturated ring with the GetConformerRMSFromAtomIds. The alignment is done per ring (for example, three alignments are done for three saturated ring) @@ -158,7 +158,7 @@ def GetConformerRMSMatrixForSaturatedRingMolecule(mol: Chem.Mol, atomIds:list[li return list(np.mean(cmat_list_array, axis=0)) -def mol_embedding_3d(mol: Chem.Mol, seed: int=43): +def mol_embedding_3d(mol: Chem.Mol, seed: int=43) -> Chem.Mol: def find_saturated_ring(mol: Chem.Mol): atom_list = mol.GetAtoms() @@ -174,7 +174,7 @@ def find_saturated_ring(mol: Chem.Mol): return saturated_ring_list - def gen_conf(mole: Chem.Mol, useRandomCoords: bool, randomSeed: int, has_saturated_ring: bool): + def gen_conf(mole: Chem.Mol, useRandomCoords: bool, randomSeed: int, has_saturated_ring: bool) -> tuple[Chem.Mol, float]: params = AllChem.ETKDGv3() params.useRandomCoords = useRandomCoords params.randomSeed = randomSeed @@ -185,7 +185,7 @@ def gen_conf(mole: Chem.Mol, useRandomCoords: bool, randomSeed: int, has_saturat conf_stat = AllChem.EmbedMolecule(mole, params) return mole, conf_stat - def remove_confs_rms(mol: Chem.Mol, saturated_ring_list: list[list[int]], rms: float=0.25, keep_nconf: Optional[bool | int]=None): + def remove_confs_rms(mol: Chem.Mol, saturated_ring_list: list[list[int]], rms: float=0.25, keep_nconf: Optional[bool | int]=None) -> Chem.Mol: """ The function uses AgglomerativeClustering to select conformers. @@ -195,7 +195,7 @@ def remove_confs_rms(mol: Chem.Mol, saturated_ring_list: list[list[int]], rms: f :return: """ - def gen_ids(ids): + def gen_ids(ids: int) -> Iterator[int, int]: for i in range(1, len(ids)): for j in range(0, i): yield j, i From 9fb8c35b7ca07c5d3636610970065e5f31664a38 Mon Sep 17 00:00:00 2001 From: Feriolet Date: Sat, 11 May 2024 22:46:20 +0800 Subject: [PATCH 05/22] Type hints for created functions --- easydock/preparation_for_docking.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/easydock/preparation_for_docking.py b/easydock/preparation_for_docking.py index d17c6b0..c40906e 100644 --- a/easydock/preparation_for_docking.py +++ b/easydock/preparation_for_docking.py @@ -160,7 +160,7 @@ def GetConformerRMSMatrixForSaturatedRingMolecule(mol: Chem.Mol, atomIds:list[li def mol_embedding_3d(mol: Chem.Mol, seed: int=43) -> Chem.Mol: - def find_saturated_ring(mol: Chem.Mol): + def find_saturated_ring(mol: Chem.Mol) -> list[list[int]]: atom_list = mol.GetAtoms() ssr = Chem.GetSymmSSSR(mol) saturated_ring_list = [] From 4ff112259971edea6a74091a5614b3a6e6184210 Mon Sep 17 00:00:00 2001 From: Feriolet Date: Sat, 11 May 2024 23:06:08 +0800 Subject: [PATCH 06/22] Explicitly print error message if PDBQTWriterLegacy can't generate pdbqt --- easydock/preparation_for_docking.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/easydock/preparation_for_docking.py b/easydock/preparation_for_docking.py index c40906e..6ba1c50 100644 --- a/easydock/preparation_for_docking.py +++ b/easydock/preparation_for_docking.py @@ -47,6 +47,8 @@ def mk_prepare_ligand(mol, verbose=False): for setup in mol_setups: pdbqt_string, is_ok, error_msg = PDBQTWriterLegacy.write_string(setup) + if not is_ok: + print(f"{mol.GetProp('_Name')} has error in converting to pdbqt: {error_msg}") pdbqt_string_list.append(pdbqt_string) if verbose: print(f"{setup}") From 8671e46cfad26a08af27c35bd105b29d2c10b535 Mon Sep 17 00:00:00 2001 From: Feriolet Date: Mon, 13 May 2024 19:10:08 +0800 Subject: [PATCH 07/22] Fix code according to reviews --- easydock/preparation_for_docking.py | 74 ++++++++++++++++++++--------- easydock/vina_dock.py | 5 +- 2 files changed, 55 insertions(+), 24 deletions(-) diff --git a/easydock/preparation_for_docking.py b/easydock/preparation_for_docking.py index 6ba1c50..d1e056f 100644 --- a/easydock/preparation_for_docking.py +++ b/easydock/preparation_for_docking.py @@ -49,6 +49,8 @@ def mk_prepare_ligand(mol, verbose=False): pdbqt_string, is_ok, error_msg = PDBQTWriterLegacy.write_string(setup) if not is_ok: print(f"{mol.GetProp('_Name')} has error in converting to pdbqt: {error_msg}") + return None + pdbqt_string_list.append(pdbqt_string) if verbose: print(f"{setup}") @@ -57,12 +59,12 @@ def mk_prepare_ligand(mol, verbose=False): "Warning. Incorrect mol object to convert to pdbqt. Continue. \n" ) traceback.print_exc() - pdbqt_string = None + pdbqt_string_list = None return pdbqt_string_list -def GetConformerRMSFromAtomIds(mol: Chem.Mol, confId1: int, confId2: int, atomIds: Optional[bool | list[int]]=None, prealigned: bool=False) -> float: +def GetConformerRMSFromAtomIds(mol: Chem.Mol, confId1: int, confId2: int, atomIds: Optional[list[int]]=None, prealigned: bool=False) -> float: """ Returns the RMS between two conformations based on the atomIds passed as the input. By default, the conformers will be aligned to the first conformer before the RMS calculation and, as a side-effect, the second will be left @@ -91,10 +93,17 @@ def GetConformerRMSFromAtomIds(mol: Chem.Mol, confId1: int, confId2: int, atomId conf1 = mol.GetConformer(id=confId1) conf2 = mol.GetConformer(id=confId2) ssr = 0 - for i in atomIds: - d = conf1.GetAtomPosition(i).Distance(conf2.GetAtomPosition(i)) - ssr += d * d - ssr /= mol.GetNumAtoms() + if atomIds: + for i in atomIds: + d = conf1.GetAtomPosition(i).Distance(conf2.GetAtomPosition(i)) + ssr += d * d + ssr /= len(atomIds) + else: + for i in range(mol.GetNumAtoms()): + d = conf1.GetAtomPosition(i).Distance(conf2.GetAtomPosition(i)) + ssr += d * d + ssr /= mol.GetNumAtoms() + return np.sqrt(ssr) def GetConformerRMSMatrixForSaturatedRingMolecule(mol: Chem.Mol, atomIds:list[list[int]]=None, prealigned: bool=False) -> list[float]: @@ -163,16 +172,12 @@ def GetConformerRMSMatrixForSaturatedRingMolecule(mol: Chem.Mol, atomIds:list[li def mol_embedding_3d(mol: Chem.Mol, seed: int=43) -> Chem.Mol: def find_saturated_ring(mol: Chem.Mol) -> list[list[int]]: - atom_list = mol.GetAtoms() ssr = Chem.GetSymmSSSR(mol) saturated_ring_list = [] for ring in ssr: - is_atom_saturated_array = np.array([atom_list[atom_id].GetHybridization() == Chem.HybridizationType.SP3 for atom_id in ring]) - is_ring_unsaturated = np.any(np.nonzero(is_atom_saturated_array==0)) - if is_ring_unsaturated: - continue - - saturated_ring_list.append(ring) + is_atom_saturated_array = [mol.GetAtomWithIdx(atom_id).GetHybridization() == Chem.HybridizationType.SP3 for atom_id in ring] + if any(is_atom_saturated_array): + saturated_ring_list.append(ring) return saturated_ring_list @@ -187,7 +192,7 @@ def gen_conf(mole: Chem.Mol, useRandomCoords: bool, randomSeed: int, has_saturat conf_stat = AllChem.EmbedMolecule(mole, params) return mole, conf_stat - def remove_confs_rms(mol: Chem.Mol, saturated_ring_list: list[list[int]], rms: float=0.25, keep_nconf: Optional[bool | int]=None) -> Chem.Mol: + def remove_confs_rms(mol: Chem.Mol, saturated_ring_list: list[list[int]], rms: float=0.25, keep_nconf: Optional[int]=None) -> Chem.Mol: """ The function uses AgglomerativeClustering to select conformers. @@ -197,7 +202,7 @@ def remove_confs_rms(mol: Chem.Mol, saturated_ring_list: list[list[int]], rms: f :return: """ - def gen_ids(ids: int) -> Iterator[int, int]: + def gen_ids(ids: int) -> Iterator[int]: for i in range(1, len(ids)): for j in range(0, i): yield j, i @@ -216,10 +221,8 @@ def gen_ids(ids: int) -> Iterator[int, int]: for (i, j), v in zip(gen_ids(cids), rms_): arr[i, j] = v arr[j, i] = v - if keep_nconf: - cl = AgglomerativeClustering(n_clusters=keep_nconf, linkage='complete', metric='precomputed').fit(arr) - else: - cl = AgglomerativeClustering(n_clusters=None, linkage='complete', metric='precomputed', distance_threshold=rms).fit(arr) + + cl = AgglomerativeClustering(n_clusters=None, linkage='complete', metric='precomputed', distance_threshold=rms).fit(arr) keep_ids = [] for i in set(cl.labels_): @@ -231,14 +234,39 @@ def gen_ids(ids: int) -> Iterator[int, int]: for cid in sorted(remove_ids, reverse=True): mol.RemoveConformer(cid) + if mol.GetNumConformers() == 1: + return mol + + if keep_nconf: + if mol.GetNumConformers() <= keep_nconf: + return mol + + cids = [c.GetId() for c in mol.GetConformers()] + arr = np.zeros((len(cids), len(cids))) + for (i, j), v in zip(gen_ids(cids), rms_): + arr[i, j] = v + arr[j, i] = v + + cl = AgglomerativeClustering(n_clusters=keep_nconf, linkage='complete', metric='precomputed').fit(arr) + + keep_ids = [] + for i in set(cl.labels_): + ids = np.where(cl.labels_ == i)[0] + j = arr[np.ix_(ids, ids)].mean(axis=0).argmin() + keep_ids.append(cids[j]) + remove_ids = set(cids) - set(keep_ids) + + for cid in sorted(remove_ids, reverse=True): + mol.RemoveConformer(cid) + return mol if not isinstance(mol, Chem.Mol): return None - saturated_ring = find_saturated_ring(mol) - has_saturated_ring = (len(saturated_ring)>0) + saturated_rings = find_saturated_ring(mol) + has_saturated_ring = (len(saturated_rings)>0) mol = Chem.AddHs(mol, addCoords=True) if not mol_is_3d(mol): # only for non 3D input structures @@ -249,9 +277,9 @@ def gen_ids(ids: int) -> Iterator[int, int]: if conf_stat == -1: return None AllChem.UFFOptimizeMolecule(mol, maxIters=100) - print(f"[For Testing Only] {mol.GetProp('_Name')} has {len(saturated_ring)} saturated ring") + print(f"[For Testing Only] {mol.GetProp('_Name')} has {len(saturated_rings)} saturated ring") print(f"[For Testing Only] Before removing conformation: {mol.GetProp('_Name')} has {mol.GetNumConformers()} conf") - mol = remove_confs_rms(mol, saturated_ring) + mol = remove_confs_rms(mol, saturated_rings) print(f"[For Testing Only] After removing conformation: {mol.GetProp('_Name')} has {mol.GetNumConformers()} conf") return mol diff --git a/easydock/vina_dock.py b/easydock/vina_dock.py index ffc8508..6c9236b 100755 --- a/easydock/vina_dock.py +++ b/easydock/vina_dock.py @@ -118,7 +118,6 @@ def mol_dock(mol, config): sys.stderr.write('STDERR output:\n') sys.stderr.write(e.stderr + '\n') sys.stderr.flush() - output = None finally: os.close(output_fd) @@ -130,6 +129,10 @@ def mol_dock(mol, config): print(f'[For Testing Only Sanity check]: There are {len(dock_output_conformer_list)} {mol_id} conformers that has been docked') print(f'\n') docking_score_list = [float(conformer_output['docking_score']) for conformer_output in dock_output_conformer_list] + + if not docking_score_list: + return mol_id, None + output = dock_output_conformer_list[docking_score_list.index(min(docking_score_list))] output['dock_time'] = dock_time From 999fe40a4cd20cf99e8c3040d260c53349a34455 Mon Sep 17 00:00:00 2001 From: Feriolet Date: Mon, 13 May 2024 19:21:46 +0800 Subject: [PATCH 08/22] Fix where arr is not truncated to surviving cids for nkeep_conf --- easydock/preparation_for_docking.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/easydock/preparation_for_docking.py b/easydock/preparation_for_docking.py index d1e056f..ce48790 100644 --- a/easydock/preparation_for_docking.py +++ b/easydock/preparation_for_docking.py @@ -242,10 +242,7 @@ def gen_ids(ids: int) -> Iterator[int]: return mol cids = [c.GetId() for c in mol.GetConformers()] - arr = np.zeros((len(cids), len(cids))) - for (i, j), v in zip(gen_ids(cids), rms_): - arr[i, j] = v - arr[j, i] = v + arr = arr[np.ix_(cids, cids)] cl = AgglomerativeClustering(n_clusters=keep_nconf, linkage='complete', metric='precomputed').fit(arr) @@ -279,7 +276,7 @@ def gen_ids(ids: int) -> Iterator[int]: AllChem.UFFOptimizeMolecule(mol, maxIters=100) print(f"[For Testing Only] {mol.GetProp('_Name')} has {len(saturated_rings)} saturated ring") print(f"[For Testing Only] Before removing conformation: {mol.GetProp('_Name')} has {mol.GetNumConformers()} conf") - mol = remove_confs_rms(mol, saturated_rings) + mol = remove_confs_rms(mol, saturated_rings,keep_nconf=1) print(f"[For Testing Only] After removing conformation: {mol.GetProp('_Name')} has {mol.GetNumConformers()} conf") return mol From 3efb624a7fd2f8ab7e576d4b904e0d7fcba3468b Mon Sep 17 00:00:00 2001 From: Feriolet Date: Mon, 13 May 2024 19:26:57 +0800 Subject: [PATCH 09/22] remove hard coded nkeep_conf=1 --- easydock/preparation_for_docking.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/easydock/preparation_for_docking.py b/easydock/preparation_for_docking.py index ce48790..d3ed0b1 100644 --- a/easydock/preparation_for_docking.py +++ b/easydock/preparation_for_docking.py @@ -243,7 +243,7 @@ def gen_ids(ids: int) -> Iterator[int]: cids = [c.GetId() for c in mol.GetConformers()] arr = arr[np.ix_(cids, cids)] - + cl = AgglomerativeClustering(n_clusters=keep_nconf, linkage='complete', metric='precomputed').fit(arr) keep_ids = [] @@ -276,7 +276,7 @@ def gen_ids(ids: int) -> Iterator[int]: AllChem.UFFOptimizeMolecule(mol, maxIters=100) print(f"[For Testing Only] {mol.GetProp('_Name')} has {len(saturated_rings)} saturated ring") print(f"[For Testing Only] Before removing conformation: {mol.GetProp('_Name')} has {mol.GetNumConformers()} conf") - mol = remove_confs_rms(mol, saturated_rings,keep_nconf=1) + mol = remove_confs_rms(mol, saturated_rings) print(f"[For Testing Only] After removing conformation: {mol.GetProp('_Name')} has {mol.GetNumConformers()} conf") return mol From 3b04c35dc829a379f1ffafe5b15f57dfce65a566 Mon Sep 17 00:00:00 2001 From: Feriolet Date: Mon, 13 May 2024 19:39:05 +0800 Subject: [PATCH 10/22] Changing any to all to reflect that all atom should be sp3 in aring --- easydock/preparation_for_docking.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/easydock/preparation_for_docking.py b/easydock/preparation_for_docking.py index d3ed0b1..1ff2a97 100644 --- a/easydock/preparation_for_docking.py +++ b/easydock/preparation_for_docking.py @@ -176,9 +176,8 @@ def find_saturated_ring(mol: Chem.Mol) -> list[list[int]]: saturated_ring_list = [] for ring in ssr: is_atom_saturated_array = [mol.GetAtomWithIdx(atom_id).GetHybridization() == Chem.HybridizationType.SP3 for atom_id in ring] - if any(is_atom_saturated_array): + if all(is_atom_saturated_array): saturated_ring_list.append(ring) - return saturated_ring_list def gen_conf(mole: Chem.Mol, useRandomCoords: bool, randomSeed: int, has_saturated_ring: bool) -> tuple[Chem.Mol, float]: From cfea4f9bbfe0c87ca9675684c93deb6cc9438221 Mon Sep 17 00:00:00 2001 From: Feriolet Date: Mon, 13 May 2024 22:14:30 +0800 Subject: [PATCH 11/22] Fix based on reviews --- easydock/preparation_for_docking.py | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/easydock/preparation_for_docking.py b/easydock/preparation_for_docking.py index 1ff2a97..acc3758 100644 --- a/easydock/preparation_for_docking.py +++ b/easydock/preparation_for_docking.py @@ -142,6 +142,7 @@ def GetConformerRMSMatrixForSaturatedRingMolecule(mol: Chem.Mol, atomIds:list[li """ cmat_list = [] + total_ring_atoms = sum([len(atom_in_ring) for atom_in_ring in atomIds]) for atom_id_in_ring in atomIds: # if necessary, align the conformers # Note: the reference conformer is always the first one @@ -162,11 +163,10 @@ def GetConformerRMSMatrixForSaturatedRingMolecule(mol: Chem.Mol, atomIds:list[li for j in range(1, i): cmat_per_ring.append(GetConformerRMSFromAtomIds(mol, confIds[i], confIds[j], atomIds=atom_id_in_ring, prealigned=True)) - cmat_list.append(np.array(cmat_per_ring)) + cmat_list.append(np.square(np.array(cmat_per_ring))*len(atom_id_in_ring)) cmat_list_array = np.array(cmat_list) - - return list(np.mean(cmat_list_array, axis=0)) + return list(np.sqrt(np.sum(cmat_list_array, axis=0)/total_ring_atoms)) def mol_embedding_3d(mol: Chem.Mol, seed: int=43) -> Chem.Mol: @@ -176,7 +176,7 @@ def find_saturated_ring(mol: Chem.Mol) -> list[list[int]]: saturated_ring_list = [] for ring in ssr: is_atom_saturated_array = [mol.GetAtomWithIdx(atom_id).GetHybridization() == Chem.HybridizationType.SP3 for atom_id in ring] - if all(is_atom_saturated_array): + if any(is_atom_saturated_array): saturated_ring_list.append(ring) return saturated_ring_list @@ -236,21 +236,16 @@ def gen_ids(ids: int) -> Iterator[int]: if mol.GetNumConformers() == 1: return mol - if keep_nconf: - if mol.GetNumConformers() <= keep_nconf: - return mol - - cids = [c.GetId() for c in mol.GetConformers()] - arr = arr[np.ix_(cids, cids)] - + if keep_nconf and mol.GetNumConformers() > keep_nconf: + arr = arr[np.ix_(keep_ids, keep_ids)] cl = AgglomerativeClustering(n_clusters=keep_nconf, linkage='complete', metric='precomputed').fit(arr) - keep_ids = [] + keep_ids_nconf_filter = [] for i in set(cl.labels_): ids = np.where(cl.labels_ == i)[0] j = arr[np.ix_(ids, ids)].mean(axis=0).argmin() - keep_ids.append(cids[j]) - remove_ids = set(cids) - set(keep_ids) + keep_ids_nconf_filter.append(cids[j]) + remove_ids = set(keep_ids) - set(keep_ids_nconf_filter) for cid in sorted(remove_ids, reverse=True): mol.RemoveConformer(cid) From 9af0c2aa8ac29d344ae48178f250c916d6e8f8d8 Mon Sep 17 00:00:00 2001 From: Feriolet Date: Tue, 14 May 2024 23:15:08 +0800 Subject: [PATCH 12/22] fix arr for keep_nconf --- easydock/preparation_for_docking.py | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/easydock/preparation_for_docking.py b/easydock/preparation_for_docking.py index acc3758..11b2a86 100644 --- a/easydock/preparation_for_docking.py +++ b/easydock/preparation_for_docking.py @@ -212,10 +212,9 @@ def gen_ids(ids: int) -> Iterator[int]: if mol.GetNumConformers() <= 1: return mol - mol_tmp = Chem.RemoveHs(mol) # calc rms for heavy atoms only - rms_ = GetConformerRMSMatrixForSaturatedRingMolecule(mol_tmp, atomIds=saturated_ring_list, prealigned=False) + rms_ = GetConformerRMSMatrixForSaturatedRingMolecule(Chem.RemoveHs(mol), atomIds=saturated_ring_list, prealigned=False) - cids = [c.GetId() for c in mol_tmp.GetConformers()] + cids = [c.GetId() for c in mol.GetConformers()] arr = np.zeros((len(cids), len(cids))) for (i, j), v in zip(gen_ids(cids), rms_): arr[i, j] = v @@ -233,25 +232,23 @@ def gen_ids(ids: int) -> Iterator[int]: for cid in sorted(remove_ids, reverse=True): mol.RemoveConformer(cid) - if mol.GetNumConformers() == 1: - return mol - - if keep_nconf and mol.GetNumConformers() > keep_nconf: - arr = arr[np.ix_(keep_ids, keep_ids)] + if keep_nconf and mol.GetNumConformers() > keep_nconf and mol.GetNumConformers() > 1: + ids = np.in1d(cids, keep_ids) + arr = arr[np.ix_(ids, ids)] # here other indexing operation should be used, because ids is a boolean array cl = AgglomerativeClustering(n_clusters=keep_nconf, linkage='complete', metric='precomputed').fit(arr) - keep_ids_nconf_filter = [] + keep_ids = [] + cids = [c.GetId() for c in mol.GetConformers()] for i in set(cl.labels_): ids = np.where(cl.labels_ == i)[0] j = arr[np.ix_(ids, ids)].mean(axis=0).argmin() - keep_ids_nconf_filter.append(cids[j]) - remove_ids = set(keep_ids) - set(keep_ids_nconf_filter) + keep_ids.append(cids[j]) + remove_ids = set(cids) - set(keep_ids) for cid in sorted(remove_ids, reverse=True): mol.RemoveConformer(cid) - + return mol - if not isinstance(mol, Chem.Mol): return None @@ -270,7 +267,7 @@ def gen_ids(ids: int) -> Iterator[int]: AllChem.UFFOptimizeMolecule(mol, maxIters=100) print(f"[For Testing Only] {mol.GetProp('_Name')} has {len(saturated_rings)} saturated ring") print(f"[For Testing Only] Before removing conformation: {mol.GetProp('_Name')} has {mol.GetNumConformers()} conf") - mol = remove_confs_rms(mol, saturated_rings) + mol = remove_confs_rms(mol, saturated_rings, keep_nconf=1) print(f"[For Testing Only] After removing conformation: {mol.GetProp('_Name')} has {mol.GetNumConformers()} conf") return mol From b21498343e2a5a62978208ceba3952a5a57b5d98 Mon Sep 17 00:00:00 2001 From: Feriolet Date: Tue, 14 May 2024 23:16:20 +0800 Subject: [PATCH 13/22] clean_up --- easydock/preparation_for_docking.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/easydock/preparation_for_docking.py b/easydock/preparation_for_docking.py index 11b2a86..46006f8 100644 --- a/easydock/preparation_for_docking.py +++ b/easydock/preparation_for_docking.py @@ -267,7 +267,7 @@ def gen_ids(ids: int) -> Iterator[int]: AllChem.UFFOptimizeMolecule(mol, maxIters=100) print(f"[For Testing Only] {mol.GetProp('_Name')} has {len(saturated_rings)} saturated ring") print(f"[For Testing Only] Before removing conformation: {mol.GetProp('_Name')} has {mol.GetNumConformers()} conf") - mol = remove_confs_rms(mol, saturated_rings, keep_nconf=1) + mol = remove_confs_rms(mol, saturated_rings) print(f"[For Testing Only] After removing conformation: {mol.GetProp('_Name')} has {mol.GetNumConformers()} conf") return mol From b1ae4b01179e0cbd4c8882c5266f5dc0ef15df3f Mon Sep 17 00:00:00 2001 From: Feriolet Date: Wed, 15 May 2024 21:27:39 +0800 Subject: [PATCH 14/22] Add substituent index to saturated ring index --- easydock/preparation_for_docking.py | 36 +++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/easydock/preparation_for_docking.py b/easydock/preparation_for_docking.py index 46006f8..4382fac 100644 --- a/easydock/preparation_for_docking.py +++ b/easydock/preparation_for_docking.py @@ -7,7 +7,7 @@ from meeko import (MoleculePreparation, PDBQTMolecule, PDBQTWriterLegacy, RDKitMolCreate) from rdkit import Chem -from rdkit.Chem import AllChem +from rdkit.Chem import AllChem, Draw import numpy as np from sklearn.cluster import AgglomerativeClustering from rdkit.Chem.rdMolAlign import AlignMolConformers @@ -171,13 +171,22 @@ def GetConformerRMSMatrixForSaturatedRingMolecule(mol: Chem.Mol, atomIds:list[li def mol_embedding_3d(mol: Chem.Mol, seed: int=43) -> Chem.Mol: - def find_saturated_ring(mol: Chem.Mol) -> list[list[int]]: + def find_saturated_ring_with_substituent(mol: Chem.Mol) -> list[list[int]]: ssr = Chem.GetSymmSSSR(mol) saturated_ring_list = [] - for ring in ssr: - is_atom_saturated_array = [mol.GetAtomWithIdx(atom_id).GetHybridization() == Chem.HybridizationType.SP3 for atom_id in ring] + for ring_idx in ssr: + is_atom_saturated_array = [mol.GetAtomWithIdx(atom_id).GetHybridization() == Chem.HybridizationType.SP3 for atom_id in ring_idx] if any(is_atom_saturated_array): - saturated_ring_list.append(ring) + ring_and_substituent_idx = [] + for ring_atom_idx in ring_idx: + ring_and_substituent_idx += [x.GetIdx() for x in mol.GetAtomWithIdx(ring_atom_idx).GetNeighbors()] + ring_and_substituent_idx = list(set(ring_and_substituent_idx)) + saturated_ring_list.append(ring_and_substituent_idx) + + #to show that the function pick the correct neighbors and inspect visually. + #to be removed before merging the PR + img = Draw.MolsToGridImage([mol],highlightAtomLists=[list(set([ x for ring in saturated_ring_list for x in ring]))]) + img.save(f'images/{mol}_{mol.GetProp("_Name")}.png') return saturated_ring_list def gen_conf(mole: Chem.Mol, useRandomCoords: bool, randomSeed: int, has_saturated_ring: bool) -> tuple[Chem.Mol, float]: @@ -186,7 +195,7 @@ def gen_conf(mole: Chem.Mol, useRandomCoords: bool, randomSeed: int, has_saturat params.randomSeed = randomSeed if has_saturated_ring: #10 is used as default according to the C language documentation iirc, but I have to specify the numbers. - conf_stat = AllChem.EmbedMultipleConfs(mole, 10, params) + conf_stat = AllChem.EmbedMultipleConfs(mole, 100, params) else: conf_stat = AllChem.EmbedMolecule(mole, params) return mole, conf_stat @@ -253,8 +262,8 @@ def gen_ids(ids: int) -> Iterator[int]: if not isinstance(mol, Chem.Mol): return None - saturated_rings = find_saturated_ring(mol) - has_saturated_ring = (len(saturated_rings)>0) + saturated_rings_with_substituents = find_saturated_ring_with_substituent(mol) + has_saturated_ring = (len(saturated_rings_with_substituents)>0) mol = Chem.AddHs(mol, addCoords=True) if not mol_is_3d(mol): # only for non 3D input structures @@ -265,11 +274,18 @@ def gen_ids(ids: int) -> Iterator[int]: if conf_stat == -1: return None AllChem.UFFOptimizeMolecule(mol, maxIters=100) - print(f"[For Testing Only] {mol.GetProp('_Name')} has {len(saturated_rings)} saturated ring") + + writer = Chem.SDWriter(f'conformer/{mol.GetProp("_Name")}_before_remove.sdf') + for cid in [c.GetId() for c in mol.GetConformers()]: + writer.write(mol, confId=cid) + print(f"[For Testing Only] {mol.GetProp('_Name')} has {len(saturated_rings_with_substituents)} saturated ring") print(f"[For Testing Only] Before removing conformation: {mol.GetProp('_Name')} has {mol.GetNumConformers()} conf") - mol = remove_confs_rms(mol, saturated_rings) + mol = remove_confs_rms(mol, saturated_rings_with_substituents) print(f"[For Testing Only] After removing conformation: {mol.GetProp('_Name')} has {mol.GetNumConformers()} conf") + writer = Chem.SDWriter(f'conformer/{mol.GetProp("_Name")}_after_remove.sdf') + for cid in [c.GetId() for c in mol.GetConformers()]: + writer.write(mol, confId=cid) return mol From 105223bb0a7ac2fba669c0a787a6657379c76cdb Mon Sep 17 00:00:00 2001 From: Feriolet Date: Sat, 25 May 2024 17:24:12 +0800 Subject: [PATCH 15/22] fix indexing --- easydock/preparation_for_docking.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/easydock/preparation_for_docking.py b/easydock/preparation_for_docking.py index 4382fac..bd95965 100644 --- a/easydock/preparation_for_docking.py +++ b/easydock/preparation_for_docking.py @@ -230,12 +230,12 @@ def gen_ids(ids: int) -> Iterator[int]: arr[j, i] = v cl = AgglomerativeClustering(n_clusters=None, linkage='complete', metric='precomputed', distance_threshold=rms).fit(arr) - + print(cl.labels_) keep_ids = [] for i in set(cl.labels_): ids = np.where(cl.labels_ == i)[0] j = arr[np.ix_(ids, ids)].mean(axis=0).argmin() - keep_ids.append(cids[j]) + keep_ids.append(cids[ids[j]]) remove_ids = set(cids) - set(keep_ids) for cid in sorted(remove_ids, reverse=True): @@ -251,7 +251,7 @@ def gen_ids(ids: int) -> Iterator[int]: for i in set(cl.labels_): ids = np.where(cl.labels_ == i)[0] j = arr[np.ix_(ids, ids)].mean(axis=0).argmin() - keep_ids.append(cids[j]) + keep_ids.append(cids[ids[j]]) remove_ids = set(cids) - set(keep_ids) for cid in sorted(remove_ids, reverse=True): @@ -280,10 +280,12 @@ def gen_ids(ids: int) -> Iterator[int]: writer.write(mol, confId=cid) print(f"[For Testing Only] {mol.GetProp('_Name')} has {len(saturated_rings_with_substituents)} saturated ring") print(f"[For Testing Only] Before removing conformation: {mol.GetProp('_Name')} has {mol.GetNumConformers()} conf") - mol = remove_confs_rms(mol, saturated_rings_with_substituents) + mol = remove_confs_rms(mol, saturated_rings_with_substituents, rms=0.5) print(f"[For Testing Only] After removing conformation: {mol.GetProp('_Name')} has {mol.GetNumConformers()} conf") - writer = Chem.SDWriter(f'conformer/{mol.GetProp("_Name")}_after_remove.sdf') + AlignMolConformers(mol) + + writer = Chem.SDWriter(f'conformer2/{mol.GetProp("_Name")}_after_remove_050.sdf') for cid in [c.GetId() for c in mol.GetConformers()]: writer.write(mol, confId=cid) return mol From 4bb4515619e4d842cae1fd6262abdcfc6c8887fd Mon Sep 17 00:00:00 2001 From: Feriolet Date: Mon, 27 May 2024 21:58:04 +0800 Subject: [PATCH 16/22] filter rms threshold until all matrix > rms --- easydock/preparation_for_docking.py | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/easydock/preparation_for_docking.py b/easydock/preparation_for_docking.py index bd95965..40821c6 100644 --- a/easydock/preparation_for_docking.py +++ b/easydock/preparation_for_docking.py @@ -230,7 +230,6 @@ def gen_ids(ids: int) -> Iterator[int]: arr[j, i] = v cl = AgglomerativeClustering(n_clusters=None, linkage='complete', metric='precomputed', distance_threshold=rms).fit(arr) - print(cl.labels_) keep_ids = [] for i in set(cl.labels_): ids = np.where(cl.labels_ == i)[0] @@ -241,6 +240,29 @@ def gen_ids(ids: int) -> Iterator[int]: for cid in sorted(remove_ids, reverse=True): mol.RemoveConformer(cid) + while True: + + ids = np.in1d(cids, keep_ids) + arr = arr[np.ix_(ids, ids)] + + if len(arr) == 2 or not ((arr < rms) & (arr !=0)).any(): + break + + cids = [c.GetId() for c in mol.GetConformers()] + + #keep ids with maximum rms + max_rms_ids = np.unravel_index(np.argmax(arr, axis=None), arr.shape) + count_below_rms = np.count_nonzero((arr < rms) & (arr !=0), axis=1) + + max_count_below_rms_ids = np.argwhere(count_below_rms == np.amax(count_below_rms)) + remove_ids = np.setdiff1d(max_count_below_rms_ids, np.array(max_rms_ids)) + remove_cids = [cids[id] for id in remove_ids] + + keep_ids = np.array(list(set(cids) - set(list(remove_cids)))) + + for cid in sorted(remove_cids, reverse=True): + mol.RemoveConformer(cid) + if keep_nconf and mol.GetNumConformers() > keep_nconf and mol.GetNumConformers() > 1: ids = np.in1d(cids, keep_ids) arr = arr[np.ix_(ids, ids)] # here other indexing operation should be used, because ids is a boolean array @@ -280,12 +302,12 @@ def gen_ids(ids: int) -> Iterator[int]: writer.write(mol, confId=cid) print(f"[For Testing Only] {mol.GetProp('_Name')} has {len(saturated_rings_with_substituents)} saturated ring") print(f"[For Testing Only] Before removing conformation: {mol.GetProp('_Name')} has {mol.GetNumConformers()} conf") - mol = remove_confs_rms(mol, saturated_rings_with_substituents, rms=0.5) + mol = remove_confs_rms(mol, saturated_rings_with_substituents, rms=0.1) print(f"[For Testing Only] After removing conformation: {mol.GetProp('_Name')} has {mol.GetNumConformers()} conf") AlignMolConformers(mol) - writer = Chem.SDWriter(f'conformer2/{mol.GetProp("_Name")}_after_remove_050.sdf') + writer = Chem.SDWriter(f'conformer3/{mol.GetProp("_Name")}_after_remove_030.sdf') for cid in [c.GetId() for c in mol.GetConformers()]: writer.write(mol, confId=cid) return mol From a2c28f927f3ad5dd752d472a98bbe774ad104372 Mon Sep 17 00:00:00 2001 From: Feriolet Date: Mon, 27 May 2024 22:25:39 +0800 Subject: [PATCH 17/22] accomodate when conformer with max number of low rms is also the conformer with highest rms, that conformer is also removed --- easydock/preparation_for_docking.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/easydock/preparation_for_docking.py b/easydock/preparation_for_docking.py index 40821c6..b82afb7 100644 --- a/easydock/preparation_for_docking.py +++ b/easydock/preparation_for_docking.py @@ -240,12 +240,14 @@ def gen_ids(ids: int) -> Iterator[int]: for cid in sorted(remove_ids, reverse=True): mol.RemoveConformer(cid) + from time import sleep while True: - + sleep(0.5) ids = np.in1d(cids, keep_ids) arr = arr[np.ix_(ids, ids)] - if len(arr) == 2 or not ((arr < rms) & (arr !=0)).any(): + #sometimes clustering result gives matrix < rms + if all(arr[arr != 0] < rms) or not any(arr[arr != 0] < rms): break cids = [c.GetId() for c in mol.GetConformers()] @@ -256,6 +258,10 @@ def gen_ids(ids: int) -> Iterator[int]: max_count_below_rms_ids = np.argwhere(count_below_rms == np.amax(count_below_rms)) remove_ids = np.setdiff1d(max_count_below_rms_ids, np.array(max_rms_ids)) + + # remove max rms if remove id is empty + if len(remove_ids) ==0: + remove_ids = np.setdiff1d(max_count_below_rms_ids, np.array([])) remove_cids = [cids[id] for id in remove_ids] keep_ids = np.array(list(set(cids) - set(list(remove_cids)))) @@ -302,12 +308,12 @@ def gen_ids(ids: int) -> Iterator[int]: writer.write(mol, confId=cid) print(f"[For Testing Only] {mol.GetProp('_Name')} has {len(saturated_rings_with_substituents)} saturated ring") print(f"[For Testing Only] Before removing conformation: {mol.GetProp('_Name')} has {mol.GetNumConformers()} conf") - mol = remove_confs_rms(mol, saturated_rings_with_substituents, rms=0.1) + mol = remove_confs_rms(mol, saturated_rings_with_substituents, rms=0.5) print(f"[For Testing Only] After removing conformation: {mol.GetProp('_Name')} has {mol.GetNumConformers()} conf") AlignMolConformers(mol) - writer = Chem.SDWriter(f'conformer3/{mol.GetProp("_Name")}_after_remove_030.sdf') + writer = Chem.SDWriter(f'conformer3/{mol.GetProp("_Name")}_after_remove_050.sdf') for cid in [c.GetId() for c in mol.GetConformers()]: writer.write(mol, confId=cid) return mol From b28dae455c345e9526ec485f6e14b1308c7f60ce Mon Sep 17 00:00:00 2001 From: Feriolet Date: Mon, 27 May 2024 22:27:05 +0800 Subject: [PATCH 18/22] remove debugging code --- easydock/preparation_for_docking.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/easydock/preparation_for_docking.py b/easydock/preparation_for_docking.py index b82afb7..517e608 100644 --- a/easydock/preparation_for_docking.py +++ b/easydock/preparation_for_docking.py @@ -240,13 +240,11 @@ def gen_ids(ids: int) -> Iterator[int]: for cid in sorted(remove_ids, reverse=True): mol.RemoveConformer(cid) - from time import sleep while True: - sleep(0.5) ids = np.in1d(cids, keep_ids) arr = arr[np.ix_(ids, ids)] - #sometimes clustering result gives matrix < rms + #sometimes clustering result gives matrix < rms when rms is high enough if all(arr[arr != 0] < rms) or not any(arr[arr != 0] < rms): break From 558541d32819366ab4bf8a79acfca2ccb4de7221 Mon Sep 17 00:00:00 2001 From: Feriolet Date: Wed, 29 May 2024 19:04:21 +0800 Subject: [PATCH 19/22] calculate keep_nconf based on ring num and size --- easydock/preparation_for_docking.py | 37 ++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/easydock/preparation_for_docking.py b/easydock/preparation_for_docking.py index 517e608..56c529d 100644 --- a/easydock/preparation_for_docking.py +++ b/easydock/preparation_for_docking.py @@ -171,9 +171,19 @@ def GetConformerRMSMatrixForSaturatedRingMolecule(mol: Chem.Mol, atomIds:list[li def mol_embedding_3d(mol: Chem.Mol, seed: int=43) -> Chem.Mol: - def find_saturated_ring_with_substituent(mol: Chem.Mol) -> list[list[int]]: + def find_saturated_ring(mol: Chem.Mol) -> list[list[int]]: ssr = Chem.GetSymmSSSR(mol) saturated_ring_list = [] + for ring_idx in ssr: + is_atom_saturated_array = [mol.GetAtomWithIdx(atom_id).GetHybridization() == Chem.HybridizationType.SP3 for atom_id in ring_idx] + if any(is_atom_saturated_array): + saturated_ring_list.append(ring_idx) + + return saturated_ring_list + + def find_saturated_ring_with_substituent(mol: Chem.Mol) -> list[list[int]]: + ssr = Chem.GetSymmSSSR(mol) + saturated_ring_with_substituent_list = [] for ring_idx in ssr: is_atom_saturated_array = [mol.GetAtomWithIdx(atom_id).GetHybridization() == Chem.HybridizationType.SP3 for atom_id in ring_idx] if any(is_atom_saturated_array): @@ -181,13 +191,13 @@ def find_saturated_ring_with_substituent(mol: Chem.Mol) -> list[list[int]]: for ring_atom_idx in ring_idx: ring_and_substituent_idx += [x.GetIdx() for x in mol.GetAtomWithIdx(ring_atom_idx).GetNeighbors()] ring_and_substituent_idx = list(set(ring_and_substituent_idx)) - saturated_ring_list.append(ring_and_substituent_idx) + saturated_ring_with_substituent_list.append(ring_and_substituent_idx) #to show that the function pick the correct neighbors and inspect visually. #to be removed before merging the PR - img = Draw.MolsToGridImage([mol],highlightAtomLists=[list(set([ x for ring in saturated_ring_list for x in ring]))]) + img = Draw.MolsToGridImage([mol],highlightAtomLists=[list(set([ x for ring in saturated_ring_with_substituent_list for x in ring]))]) img.save(f'images/{mol}_{mol.GetProp("_Name")}.png') - return saturated_ring_list + return saturated_ring_with_substituent_list def gen_conf(mole: Chem.Mol, useRandomCoords: bool, randomSeed: int, has_saturated_ring: bool) -> tuple[Chem.Mol, float]: params = AllChem.ETKDGv3() @@ -200,6 +210,14 @@ def gen_conf(mole: Chem.Mol, useRandomCoords: bool, randomSeed: int, has_saturat conf_stat = AllChem.EmbedMolecule(mole, params) return mole, conf_stat + def calculate_ring_nconf(saturated_ring_ids: list[int]) -> int: + if len(saturated_ring_ids) <= 6: + return 2 + elif len(saturated_ring_ids) == 7: + return 3 + else: + return 4 + def remove_confs_rms(mol: Chem.Mol, saturated_ring_list: list[list[int]], rms: float=0.25, keep_nconf: Optional[int]=None) -> Chem.Mol: """ The function uses AgglomerativeClustering to select conformers. @@ -250,14 +268,14 @@ def gen_ids(ids: int) -> Iterator[int]: cids = [c.GetId() for c in mol.GetConformers()] - #keep ids with maximum rms + #keep ids with maximum rms. This assume that even if maximum rms has among the highest num of low rms, removing other conformer is sufficient max_rms_ids = np.unravel_index(np.argmax(arr, axis=None), arr.shape) count_below_rms = np.count_nonzero((arr < rms) & (arr !=0), axis=1) max_count_below_rms_ids = np.argwhere(count_below_rms == np.amax(count_below_rms)) remove_ids = np.setdiff1d(max_count_below_rms_ids, np.array(max_rms_ids)) - # remove max rms if remove id is empty + # remove max rms if remove id is empty to prevent endless while loop if len(remove_ids) ==0: remove_ids = np.setdiff1d(max_count_below_rms_ids, np.array([])) remove_cids = [cids[id] for id in remove_ids] @@ -288,6 +306,7 @@ def gen_ids(ids: int) -> Iterator[int]: if not isinstance(mol, Chem.Mol): return None + saturated_ring_list = find_saturated_ring(mol) saturated_rings_with_substituents = find_saturated_ring_with_substituent(mol) has_saturated_ring = (len(saturated_rings_with_substituents)>0) @@ -306,12 +325,12 @@ def gen_ids(ids: int) -> Iterator[int]: writer.write(mol, confId=cid) print(f"[For Testing Only] {mol.GetProp('_Name')} has {len(saturated_rings_with_substituents)} saturated ring") print(f"[For Testing Only] Before removing conformation: {mol.GetProp('_Name')} has {mol.GetNumConformers()} conf") - mol = remove_confs_rms(mol, saturated_rings_with_substituents, rms=0.5) + mol = remove_confs_rms(mol, saturated_rings_with_substituents, rms=1, keep_nconf= sum([calculate_ring_nconf(saturated_ring) for saturated_ring in saturated_ring_list])) print(f"[For Testing Only] After removing conformation: {mol.GetProp('_Name')} has {mol.GetNumConformers()} conf") - + AlignMolConformers(mol) - writer = Chem.SDWriter(f'conformer3/{mol.GetProp("_Name")}_after_remove_050.sdf') + writer = Chem.SDWriter(f'conformer_keepnconf/{mol.GetProp("_Name")}_after_remove_100.sdf') for cid in [c.GetId() for c in mol.GetConformers()]: writer.write(mol, confId=cid) return mol From 5ce28fcb8c20a27d058880ba8a08cecdae186c45 Mon Sep 17 00:00:00 2001 From: Feriolet Date: Wed, 29 May 2024 22:34:16 +0800 Subject: [PATCH 20/22] improve find_saturated_ring_with_substituent and remove hard coded code before merge --- easydock/preparation_for_docking.py | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/easydock/preparation_for_docking.py b/easydock/preparation_for_docking.py index 56c529d..bfb0bd1 100644 --- a/easydock/preparation_for_docking.py +++ b/easydock/preparation_for_docking.py @@ -181,22 +181,15 @@ def find_saturated_ring(mol: Chem.Mol) -> list[list[int]]: return saturated_ring_list - def find_saturated_ring_with_substituent(mol: Chem.Mol) -> list[list[int]]: - ssr = Chem.GetSymmSSSR(mol) + def find_saturated_ring_with_substituent(saturated_ring_list: list[list[int]], mol: Chem.Mol) -> list[list[int]]: saturated_ring_with_substituent_list = [] - for ring_idx in ssr: - is_atom_saturated_array = [mol.GetAtomWithIdx(atom_id).GetHybridization() == Chem.HybridizationType.SP3 for atom_id in ring_idx] - if any(is_atom_saturated_array): + for ring_idx in saturated_ring_list: ring_and_substituent_idx = [] for ring_atom_idx in ring_idx: ring_and_substituent_idx += [x.GetIdx() for x in mol.GetAtomWithIdx(ring_atom_idx).GetNeighbors()] ring_and_substituent_idx = list(set(ring_and_substituent_idx)) saturated_ring_with_substituent_list.append(ring_and_substituent_idx) - #to show that the function pick the correct neighbors and inspect visually. - #to be removed before merging the PR - img = Draw.MolsToGridImage([mol],highlightAtomLists=[list(set([ x for ring in saturated_ring_with_substituent_list for x in ring]))]) - img.save(f'images/{mol}_{mol.GetProp("_Name")}.png') return saturated_ring_with_substituent_list def gen_conf(mole: Chem.Mol, useRandomCoords: bool, randomSeed: int, has_saturated_ring: bool) -> tuple[Chem.Mol, float]: @@ -307,7 +300,7 @@ def gen_ids(ids: int) -> Iterator[int]: return None saturated_ring_list = find_saturated_ring(mol) - saturated_rings_with_substituents = find_saturated_ring_with_substituent(mol) + saturated_rings_with_substituents = find_saturated_ring_with_substituent(saturated_ring_list, mol) has_saturated_ring = (len(saturated_rings_with_substituents)>0) mol = Chem.AddHs(mol, addCoords=True) @@ -320,9 +313,6 @@ def gen_ids(ids: int) -> Iterator[int]: return None AllChem.UFFOptimizeMolecule(mol, maxIters=100) - writer = Chem.SDWriter(f'conformer/{mol.GetProp("_Name")}_before_remove.sdf') - for cid in [c.GetId() for c in mol.GetConformers()]: - writer.write(mol, confId=cid) print(f"[For Testing Only] {mol.GetProp('_Name')} has {len(saturated_rings_with_substituents)} saturated ring") print(f"[For Testing Only] Before removing conformation: {mol.GetProp('_Name')} has {mol.GetNumConformers()} conf") mol = remove_confs_rms(mol, saturated_rings_with_substituents, rms=1, keep_nconf= sum([calculate_ring_nconf(saturated_ring) for saturated_ring in saturated_ring_list])) @@ -330,9 +320,6 @@ def gen_ids(ids: int) -> Iterator[int]: AlignMolConformers(mol) - writer = Chem.SDWriter(f'conformer_keepnconf/{mol.GetProp("_Name")}_after_remove_100.sdf') - for cid in [c.GetId() for c in mol.GetConformers()]: - writer.write(mol, confId=cid) return mol From 6f1e01555e12ae66f439b45396c0403a3667b6df Mon Sep 17 00:00:00 2001 From: Feriolet Date: Thu, 30 May 2024 19:11:26 +0800 Subject: [PATCH 21/22] remove iterative procedure --- easydock/preparation_for_docking.py | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/easydock/preparation_for_docking.py b/easydock/preparation_for_docking.py index bfb0bd1..08bacb6 100644 --- a/easydock/preparation_for_docking.py +++ b/easydock/preparation_for_docking.py @@ -251,33 +251,6 @@ def gen_ids(ids: int) -> Iterator[int]: for cid in sorted(remove_ids, reverse=True): mol.RemoveConformer(cid) - while True: - ids = np.in1d(cids, keep_ids) - arr = arr[np.ix_(ids, ids)] - - #sometimes clustering result gives matrix < rms when rms is high enough - if all(arr[arr != 0] < rms) or not any(arr[arr != 0] < rms): - break - - cids = [c.GetId() for c in mol.GetConformers()] - - #keep ids with maximum rms. This assume that even if maximum rms has among the highest num of low rms, removing other conformer is sufficient - max_rms_ids = np.unravel_index(np.argmax(arr, axis=None), arr.shape) - count_below_rms = np.count_nonzero((arr < rms) & (arr !=0), axis=1) - - max_count_below_rms_ids = np.argwhere(count_below_rms == np.amax(count_below_rms)) - remove_ids = np.setdiff1d(max_count_below_rms_ids, np.array(max_rms_ids)) - - # remove max rms if remove id is empty to prevent endless while loop - if len(remove_ids) ==0: - remove_ids = np.setdiff1d(max_count_below_rms_ids, np.array([])) - remove_cids = [cids[id] for id in remove_ids] - - keep_ids = np.array(list(set(cids) - set(list(remove_cids)))) - - for cid in sorted(remove_cids, reverse=True): - mol.RemoveConformer(cid) - if keep_nconf and mol.GetNumConformers() > keep_nconf and mol.GetNumConformers() > 1: ids = np.in1d(cids, keep_ids) arr = arr[np.ix_(ids, ids)] # here other indexing operation should be used, because ids is a boolean array From 66ddfe32e15adecd35bcbb651c4918eaa9372f88 Mon Sep 17 00:00:00 2001 From: Feriolet Date: Fri, 7 Jun 2024 16:19:36 +0700 Subject: [PATCH 22/22] fix mk_prepare_ligand to prepare ligand if some confomers are valid --- easydock/preparation_for_docking.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/easydock/preparation_for_docking.py b/easydock/preparation_for_docking.py index 08bacb6..75297ed 100644 --- a/easydock/preparation_for_docking.py +++ b/easydock/preparation_for_docking.py @@ -49,9 +49,9 @@ def mk_prepare_ligand(mol, verbose=False): pdbqt_string, is_ok, error_msg = PDBQTWriterLegacy.write_string(setup) if not is_ok: print(f"{mol.GetProp('_Name')} has error in converting to pdbqt: {error_msg}") - return None - - pdbqt_string_list.append(pdbqt_string) + else: + pdbqt_string_list.append(pdbqt_string) + if verbose: print(f"{setup}") except Exception: