Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement Saturated Ring Sampling Conformation Feature #34

Open
wants to merge 23 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
1ede47a
Implement Saturated Ring Sampling Conformation Feature
Feriolet May 11, 2024
99c5fc8
Change [id for id in atomIds] to atomIds in GetConformerRMSFromAtomIds
Feriolet May 11, 2024
e474dc4
Type hints for created functions
Feriolet May 11, 2024
d763851
Type hints for created functions
Feriolet May 11, 2024
9fb8c35
Type hints for created functions
Feriolet May 11, 2024
4ff1122
Explicitly print error message if PDBQTWriterLegacy can't generate pdbqt
Feriolet May 11, 2024
8671e46
Fix code according to reviews
Feriolet May 13, 2024
999fe40
Fix where arr is not truncated to surviving cids for nkeep_conf
Feriolet May 13, 2024
3efb624
remove hard coded nkeep_conf=1
Feriolet May 13, 2024
3b04c35
Changing any to all to reflect that all atom should be sp3 in aring
Feriolet May 13, 2024
cfea4f9
Fix based on reviews
Feriolet May 13, 2024
9af0c2a
fix arr for keep_nconf
Feriolet May 14, 2024
b214983
clean_up
Feriolet May 14, 2024
b1ae4b0
Add substituent index to saturated ring index
Feriolet May 15, 2024
105223b
fix indexing
Feriolet May 25, 2024
4bb4515
filter rms threshold until all matrix > rms
Feriolet May 27, 2024
a2c28f9
accomodate when conformer with max number of low rms is also the conf…
Feriolet May 27, 2024
b28dae4
remove debugging code
Feriolet May 27, 2024
558541d
calculate keep_nconf based on ring num and size
Feriolet May 29, 2024
5ce28fc
improve find_saturated_ring_with_substituent and remove hard coded co…
Feriolet May 29, 2024
6f1e015
remove iterative procedure
Feriolet May 30, 2024
66ddfe3
fix mk_prepare_ligand to prepare ligand if some confomers are valid
Feriolet Jun 7, 2024
1389e70
Merge branch 'ci-lab-cz:master' into ring_conformer
Feriolet Jul 26, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
251 changes: 236 additions & 15 deletions easydock/preparation_for_docking.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,11 @@
from meeko import (MoleculePreparation, PDBQTMolecule, PDBQTWriterLegacy,
RDKitMolCreate)
from rdkit import Chem
from rdkit.Chem import AllChem

from rdkit.Chem import AllChem, Draw
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from rdkit.Chem.rdMolAlign import AlignMolConformers
from typing import Optional, Iterator

def cpu_type(x):
return max(1, min(int(x), cpu_count()))
Expand All @@ -35,43 +38,261 @@ def mol_from_smi_or_molblock(ligand_string):


def mk_prepare_ligand(mol, verbose=False):
pdbqt_string_list = []
preparator = MoleculePreparation(hydrate=False, flexible_amides=False, rigid_macrocycles=True, min_ring_size=7, max_ring_size=33)
try:
mol_setups = preparator.prepare(mol)
for setup in mol_setups:
pdbqt_string, is_ok, error_msg = PDBQTWriterLegacy.write_string(setup)
if verbose:
print(f"{setup}")
cids = [x.GetId() for x in mol.GetConformers()]
for cid in cids:
mol_setups = preparator.prepare(mol, conformer_id=cid)

for setup in mol_setups:
pdbqt_string, is_ok, error_msg = PDBQTWriterLegacy.write_string(setup)
if not is_ok:
print(f"{mol.GetProp('_Name')} has error in converting to pdbqt: {error_msg}")
else:
pdbqt_string_list.append(pdbqt_string)

if verbose:
print(f"{setup}")
except Exception:
sys.stderr.write(
"Warning. Incorrect mol object to convert to pdbqt. Continue. \n"
)
traceback.print_exc()
pdbqt_string = None
pdbqt_string_list = None

return pdbqt_string_list


def GetConformerRMSFromAtomIds(mol: Chem.Mol, confId1: int, confId2: int, atomIds: Optional[list[int]]=None, prealigned: bool=False) -> float:
""" Returns the RMS between two conformations based on the atomIds passed as the input.
By default, the conformers will be aligned to the first conformer
before the RMS calculation and, as a side-effect, the second will be left
in the aligned state.

return pdbqt_string
Arguments:
- mol: the molecule
- confId1: the id of the first conformer
- confId2: the id of the second conformer
- atomIds: (optional) list of atom ids to use a points for
alingment **AND RMSD** calculation- defaults to all atoms
- prealigned: (optional) by default the conformers are assumed
be unaligned and the second conformer be aligned
to the first

"""
# align the conformers if necessary
# Note: the reference conformer is always the first one
if not prealigned:
if atomIds:
AlignMolConformers(mol, confIds=[confId1, confId2], atomIds=atomIds)
else:
AlignMolConformers(mol, confIds=[confId1, confId2])

# calculate the RMS between the two conformations
conf1 = mol.GetConformer(id=confId1)
conf2 = mol.GetConformer(id=confId2)
ssr = 0
if atomIds:
for i in atomIds:
d = conf1.GetAtomPosition(i).Distance(conf2.GetAtomPosition(i))
ssr += d * d
ssr /= len(atomIds)
else:
for i in range(mol.GetNumAtoms()):
d = conf1.GetAtomPosition(i).Distance(conf2.GetAtomPosition(i))
ssr += d * d
ssr /= mol.GetNumAtoms()

return np.sqrt(ssr)

def GetConformerRMSMatrixForSaturatedRingMolecule(mol: Chem.Mol, atomIds:list[list[int]]=None, prealigned: bool=False) -> list[float]:
""" Returns the RMS matrix of the conformers of a molecule based on the alignment and rmsd of saturated ring.
The function calculates the mean of the RMSD of each saturated ring with the GetConformerRMSFromAtomIds.
The alignment is done per ring (for example, three alignments are done for three saturated ring)

By default, the conformers will be aligned to the first conformer
before the RMS calculation and, as a side-effect, the second will be left
in the aligned state.

As a side-effect, the conformers will be aligned to the first
conformer (i.e. the reference) and will left in the aligned state.

Arguments:
- mol: the molecule
- atomIds: (optional) list of atom ids to use a points for
alingment - defaults to all atoms
- prealigned: (optional) by default the conformers are assumed
be unaligned and will therefore be aligned to the
first conformer

Note that the returned RMS matrix is symmetrical, i.e. it is the
lower half of the matrix, e.g. for 5 conformers::

rmsmatrix = [ a,
b, c,
d, e, f,
g, h, i, j]

where a is the RMS between conformers 0 and 1, b is the RMS between
conformers 0 and 2, etc.
This way it can be directly used as distance matrix in e.g. Butina
clustering.

def mol_embedding_3d(mol, seed=43):
"""

def gen_conf(mole, useRandomCoords, randomSeed):
cmat_list = []
total_ring_atoms = sum([len(atom_in_ring) for atom_in_ring in atomIds])
for atom_id_in_ring in atomIds:
# if necessary, align the conformers
# Note: the reference conformer is always the first one
rmsvals = []
confIds = [conf.GetId() for conf in mol.GetConformers()]
if not prealigned:
if atom_id_in_ring:
AlignMolConformers(mol, atomIds=atom_id_in_ring, RMSlist=rmsvals)
else:
AlignMolConformers(mol, RMSlist=rmsvals)
else: # already prealigned
for i in range(1, len(confIds)):
rmsvals.append(GetConformerRMSFromAtomIds(mol, confIds[0], confIds[i], atomIds=atom_id_in_ring, prealigned=prealigned))
# loop over the conformations (except the reference one)
cmat_per_ring = []
for i in range(1, len(confIds)):
cmat_per_ring.append(rmsvals[i - 1])
for j in range(1, i):
cmat_per_ring.append(GetConformerRMSFromAtomIds(mol, confIds[i], confIds[j], atomIds=atom_id_in_ring, prealigned=True))

cmat_list.append(np.square(np.array(cmat_per_ring))*len(atom_id_in_ring))

cmat_list_array = np.array(cmat_list)
return list(np.sqrt(np.sum(cmat_list_array, axis=0)/total_ring_atoms))


def mol_embedding_3d(mol: Chem.Mol, seed: int=43) -> Chem.Mol:

def find_saturated_ring(mol: Chem.Mol) -> list[list[int]]:
ssr = Chem.GetSymmSSSR(mol)
saturated_ring_list = []
for ring_idx in ssr:
is_atom_saturated_array = [mol.GetAtomWithIdx(atom_id).GetHybridization() == Chem.HybridizationType.SP3 for atom_id in ring_idx]
if any(is_atom_saturated_array):
saturated_ring_list.append(ring_idx)

return saturated_ring_list

def find_saturated_ring_with_substituent(saturated_ring_list: list[list[int]], mol: Chem.Mol) -> list[list[int]]:
saturated_ring_with_substituent_list = []
for ring_idx in saturated_ring_list:
ring_and_substituent_idx = []
for ring_atom_idx in ring_idx:
ring_and_substituent_idx += [x.GetIdx() for x in mol.GetAtomWithIdx(ring_atom_idx).GetNeighbors()]
ring_and_substituent_idx = list(set(ring_and_substituent_idx))
saturated_ring_with_substituent_list.append(ring_and_substituent_idx)

return saturated_ring_with_substituent_list

def gen_conf(mole: Chem.Mol, useRandomCoords: bool, randomSeed: int, has_saturated_ring: bool) -> tuple[Chem.Mol, float]:
params = AllChem.ETKDGv3()
params.useRandomCoords = useRandomCoords
params.randomSeed = randomSeed
conf_stat = AllChem.EmbedMolecule(mole, params)
if has_saturated_ring:
#10 is used as default according to the C language documentation iirc, but I have to specify the numbers.
conf_stat = AllChem.EmbedMultipleConfs(mole, 100, params)
else:
conf_stat = AllChem.EmbedMolecule(mole, params)
return mole, conf_stat


def calculate_ring_nconf(saturated_ring_ids: list[int]) -> int:
if len(saturated_ring_ids) <= 6:
return 2
elif len(saturated_ring_ids) == 7:
return 3
else:
return 4

def remove_confs_rms(mol: Chem.Mol, saturated_ring_list: list[list[int]], rms: float=0.25, keep_nconf: Optional[int]=None) -> Chem.Mol:
"""
The function uses AgglomerativeClustering to select conformers.

:param mol: input molecule with multiple conformers
:param rms: discard conformers which are closer than given value to a kept conformer
:param keep_nconf: keep at most the given number of conformers. This parameter has precedence over rms
:return:
"""

def gen_ids(ids: int) -> Iterator[int]:
for i in range(1, len(ids)):
for j in range(0, i):
yield j, i

if keep_nconf and mol.GetNumConformers() <= keep_nconf:
return mol

if mol.GetNumConformers() <= 1:
return mol

rms_ = GetConformerRMSMatrixForSaturatedRingMolecule(Chem.RemoveHs(mol), atomIds=saturated_ring_list, prealigned=False)

cids = [c.GetId() for c in mol.GetConformers()]
arr = np.zeros((len(cids), len(cids)))
for (i, j), v in zip(gen_ids(cids), rms_):
arr[i, j] = v
arr[j, i] = v

cl = AgglomerativeClustering(n_clusters=None, linkage='complete', metric='precomputed', distance_threshold=rms).fit(arr)
keep_ids = []
for i in set(cl.labels_):
ids = np.where(cl.labels_ == i)[0]
j = arr[np.ix_(ids, ids)].mean(axis=0).argmin()
keep_ids.append(cids[ids[j]])
remove_ids = set(cids) - set(keep_ids)

for cid in sorted(remove_ids, reverse=True):
mol.RemoveConformer(cid)

if keep_nconf and mol.GetNumConformers() > keep_nconf and mol.GetNumConformers() > 1:
ids = np.in1d(cids, keep_ids)
arr = arr[np.ix_(ids, ids)] # here other indexing operation should be used, because ids is a boolean array
cl = AgglomerativeClustering(n_clusters=keep_nconf, linkage='complete', metric='precomputed').fit(arr)

keep_ids = []
cids = [c.GetId() for c in mol.GetConformers()]
for i in set(cl.labels_):
ids = np.where(cl.labels_ == i)[0]
j = arr[np.ix_(ids, ids)].mean(axis=0).argmin()
keep_ids.append(cids[ids[j]])
remove_ids = set(cids) - set(keep_ids)

for cid in sorted(remove_ids, reverse=True):
mol.RemoveConformer(cid)

return mol

if not isinstance(mol, Chem.Mol):
return None

saturated_ring_list = find_saturated_ring(mol)
saturated_rings_with_substituents = find_saturated_ring_with_substituent(saturated_ring_list, mol)
has_saturated_ring = (len(saturated_rings_with_substituents)>0)

mol = Chem.AddHs(mol, addCoords=True)
if not mol_is_3d(mol): # only for non 3D input structures
mol, conf_stat = gen_conf(mol, useRandomCoords=False, randomSeed=seed)
mol, conf_stat = gen_conf(mol, useRandomCoords=False, randomSeed=seed, has_saturated_ring=has_saturated_ring)
if conf_stat == -1:
# if molecule is big enough and rdkit cannot generate a conformation - use params.useRandomCoords = True
mol, conf_stat = gen_conf(mol, useRandomCoords=True, randomSeed=seed)
mol, conf_stat = gen_conf(mol, useRandomCoords=True, randomSeed=seed, has_saturated_ring=has_saturated_ring)
if conf_stat == -1:
return None
AllChem.UFFOptimizeMolecule(mol, maxIters=100)

print(f"[For Testing Only] {mol.GetProp('_Name')} has {len(saturated_rings_with_substituents)} saturated ring")
print(f"[For Testing Only] Before removing conformation: {mol.GetProp('_Name')} has {mol.GetNumConformers()} conf")
mol = remove_confs_rms(mol, saturated_rings_with_substituents, rms=1, keep_nconf= sum([calculate_ring_nconf(saturated_ring) for saturated_ring in saturated_ring_list]))
print(f"[For Testing Only] After removing conformation: {mol.GetProp('_Name')} has {mol.GetNumConformers()} conf")

AlignMolConformers(mol)

return mol


Expand Down
99 changes: 57 additions & 42 deletions easydock/vina_dock.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,51 +75,66 @@ def mol_dock(mol, config):

config = __parse_config(config)



mol_id = mol.GetProp('_Name')
ligand_pdbqt = ligand_preparation(mol, boron_replacement=True)
if ligand_pdbqt is None:
ligand_pdbqt_list = ligand_preparation(mol, boron_replacement=True)

if ligand_pdbqt_list is None:
return mol_id, None

output_fd, output_fname = tempfile.mkstemp(suffix='_output.json', text=True)
ligand_fd, ligand_fname = tempfile.mkstemp(suffix='_ligand.pdbqt', text=True)

try:
with open(ligand_fname, 'wt') as f:
f.write(ligand_pdbqt)

p = os.path.realpath(__file__)
python_exec = sys.executable
cmd = f'{python_exec} {os.path.dirname(p)}/vina_dock_cli.py -l {ligand_fname} -p {config["protein"]} ' \
f'-o {output_fname} --center {" ".join(map(str, config["center"]))} ' \
f'--box_size {" ".join(map(str, config["box_size"]))} ' \
f'-e {config["exhaustiveness"]} --seed {config["seed"]} --nposes {config["n_poses"]} -c {config["ncpu"]}'
start_time = timeit.default_timer()
subprocess.run(cmd, shell=True, check=True, capture_output=True, text=True) # this will trigger CalledProcessError and skip next lines)
dock_time = round(timeit.default_timer() - start_time, 1)

with open(output_fname) as f:
res = f.read()
if res:
res = json.loads(res)
mol_block = pdbqt2molblock(res['poses'].split('MODEL')[1], mol, mol_id)
output = {'docking_score': res['docking_score'],
'pdb_block': res['poses'],
'mol_block': mol_block,
'dock_time': dock_time}

except subprocess.CalledProcessError as e:
sys.stderr.write(f'Error caused by docking of {mol_id}\n')
sys.stderr.write(str(e) + '\n')
sys.stderr.write('STDERR output:\n')
sys.stderr.write(e.stderr + '\n')
sys.stderr.flush()
output = None

finally:
os.close(output_fd)
os.close(ligand_fd)
os.unlink(ligand_fname)
os.unlink(output_fname)
dock_output_conformer_list = []
start_time = timeit.default_timer()
for ligand_pdbqt in ligand_pdbqt_list:
output_fd, output_fname = tempfile.mkstemp(suffix='_output.json', text=True)
ligand_fd, ligand_fname = tempfile.mkstemp(suffix='_ligand.pdbqt', text=True)

try:
with open(ligand_fname, 'wt') as f:
f.write(ligand_pdbqt)

p = os.path.realpath(__file__)
python_exec = sys.executable
cmd = f'{python_exec} {os.path.dirname(p)}/vina_dock_cli.py -l {ligand_fname} -p {config["protein"]} ' \
f'-o {output_fname} --center {" ".join(map(str, config["center"]))} ' \
f'--box_size {" ".join(map(str, config["box_size"]))} ' \
f'-e {config["exhaustiveness"]} --seed {config["seed"]} --nposes {config["n_poses"]} -c {config["ncpu"]}'
subprocess.run(cmd, shell=True, check=True, capture_output=True, text=True) # this will trigger CalledProcessError and skip next lines)

with open(output_fname) as f:
res = f.read()
if res:
res = json.loads(res)
mol_block = pdbqt2molblock(res['poses'].split('MODEL')[1], mol, mol_id)
output = {'docking_score': res['docking_score'],
'pdb_block': res['poses'],
'mol_block': mol_block}

dock_output_conformer_list.append(output)

except subprocess.CalledProcessError as e:
sys.stderr.write(f'Error caused by docking of {mol_id}\n')
sys.stderr.write(str(e) + '\n')
sys.stderr.write('STDERR output:\n')
sys.stderr.write(e.stderr + '\n')
sys.stderr.flush()

finally:
os.close(output_fd)
os.close(ligand_fd)
os.unlink(ligand_fname)
os.unlink(output_fname)

dock_time = round(timeit.default_timer() - start_time, 1)
print(f'[For Testing Only Sanity check]: There are {len(dock_output_conformer_list)} {mol_id} conformers that has been docked')
print(f'\n')
docking_score_list = [float(conformer_output['docking_score']) for conformer_output in dock_output_conformer_list]
DrrDom marked this conversation as resolved.
Show resolved Hide resolved

if not docking_score_list:
return mol_id, None

output = dock_output_conformer_list[docking_score_list.index(min(docking_score_list))]
output['dock_time'] = dock_time

return mol_id, output

Expand Down