Skip to content

Commit

Permalink
Initial commit v0.2
Browse files Browse the repository at this point in the history
  • Loading branch information
Kexin-Zhang-UCAS committed Aug 25, 2021
0 parents commit edc19f5
Show file tree
Hide file tree
Showing 13 changed files with 2,201 additions and 0 deletions.
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
recursive-include alphafold2mmopt *.txt
41 changes: 41 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# alphafold2mmopt

Based on `alphafold2`, `alphafoldmmopt` is developed to optimize molecules from any protein-generation model.

## dependence

```shell
conda install openmm
pip install dm-tree
pip install biopython
pip install dm-haiku
conda install -c omnia pdbfixer
pip install pytorch
```

## installation and run

* install

```shell
pip install alphafold2mmopt
```

* running: for a given any `*.pdb` file

```shell
# shell
alphafold2mmopt *.pdb
```
```python
# python
from proteinopt.common.protein import Protein
from proteinopt.relax.localminimizer import relax
prot = Protein(pdbname)
opt_pdb=relax(prot)
```
# Updation
## v0.2
> 1. Use pytorch instead of Jax to accelerate lDDT matrix calculation.
> > The lDDT calculation time can be reduced from 3s to 0.1s.
> 2. Reconstruct api for Protein object and reduce repeated clean operation.
3 changes: 3 additions & 0 deletions alphafold2mmopt/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from . import relax
from . import common
from . import utils
3 changes: 3 additions & 0 deletions alphafold2mmopt/common/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from . import residue_constants
from . import protein
from . import cleanup
144 changes: 144 additions & 0 deletions alphafold2mmopt/common/cleanup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
import io
import pdbfixer
from simtk.openmm import app
from simtk.openmm.app import element
import time
import simtk.openmm as openmm
import numpy as np
from simtk import unit

ENERGY = unit.kilocalories_per_mole
LENGTH = unit.angstroms

def fix_pdb(pdbfile, alterations_info):
"""Apply pdbfixer to the contents of a PDB file; return a PDB string result.
1) Replaces nonstandard residues.
2) Removes heterogens (non protein residues) including water.
3) Adds missing residues and missing atoms within existing residues.
4) Adds hydrogens assuming pH=7.0.
5) KeepIds is currently true, so the fixer must keep the existing chain and
residue identifiers. This will fail for some files in wider PDB that have
invalid IDs.
Args:
pdbfile: Input PDB file handle.
alterations_info: A dict that will store details of changes made.
Returns:
A PDB string representing the fixed structure.
"""

fixer = pdbfixer.PDBFixer(pdbfile=pdbfile)

fixer.findNonstandardResidues()

alterations_info['nonstandard_residues'] = fixer.nonstandardResidues
fixer.replaceNonstandardResidues()
_remove_heterogens(fixer, alterations_info, keep_water=False)

fixer.findMissingResidues()
alterations_info['missing_residues'] = fixer.missingResidues
fixer.findMissingAtoms()
alterations_info['missing_heavy_atoms'] = fixer.missingAtoms
alterations_info['missing_terminals'] = fixer.missingTerminals

################################# time comsuption
fixer.addMissingAtoms(seed=0)
modeller = app.Modeller(fixer.topology, fixer.positions)
modeller.addHydrogens(platform=openmm.Platform.getPlatformByName("CUDA"))
fixer.topology = modeller.topology
fixer.positions = modeller.positions

#################################
# print(time.time() - begin)

out_handle = io.StringIO()
app.PDBFile.writeFile(fixer.topology, fixer.positions, out_handle,
keepIds=True)
return out_handle.getvalue()


def clean_structure(pdb_structure, alterations_info):
"""Applies additional fixes to an OpenMM structure, to handle edge cases.
Args:
pdb_structure: An OpenMM structure to modify and fix.
alterations_info: A dict that will store details of changes made.
"""
_replace_met_se(pdb_structure, alterations_info)
_remove_chains_of_length_one(pdb_structure, alterations_info)


def _remove_heterogens(fixer, alterations_info, keep_water):
"""Removes the residues that Pdbfixer considers to be heterogens.
Args:
fixer: A Pdbfixer instance.
alterations_info: A dict that will store details of changes made.
keep_water: If True, water (HOH) is not considered to be a heterogen.
"""
initial_resnames = set()
for chain in fixer.topology.chains():
for residue in chain.residues():
initial_resnames.add(residue.name)
fixer.removeHeterogens(keepWater=keep_water)
final_resnames = set()
for chain in fixer.topology.chains():
for residue in chain.residues():
final_resnames.add(residue.name)
alterations_info['removed_heterogens'] = (
initial_resnames.difference(final_resnames))


def _replace_met_se(pdb_structure, alterations_info):
"""Replace the Se in any MET residues that were not marked as modified."""
modified_met_residues = []
for res in pdb_structure.iter_residues():
name = res.get_name_with_spaces().strip()
if name == 'MET':
s_atom = res.get_atom('SD')
if s_atom.element_symbol == 'Se':
s_atom.element_symbol = 'S'
s_atom.element = element.get_by_symbol('S')
modified_met_residues.append(s_atom.residue_number)
alterations_info['Se_in_MET'] = modified_met_residues


def _remove_chains_of_length_one(pdb_structure, alterations_info):
"""Removes chains that correspond to a single amino acid.
A single amino acid in a chain is both N and C terminus. There is no force
template for this case.
Args:
pdb_structure: An OpenMM pdb_structure to modify and fix.
alterations_info: A dict that will store details of changes made.
"""
removed_chains = {}
for model in pdb_structure.iter_models():
valid_chains = [c for c in model.iter_chains() if len(c) > 1]
invalid_chain_ids = [c.chain_id for c in model.iter_chains() if len(c) <= 1]
model.chains = valid_chains
for chain_id in invalid_chain_ids:
model.chains_by_id.pop(chain_id)
removed_chains[model.number] = invalid_chain_ids
alterations_info['removed_chains'] = removed_chains

def _check_cleaned_atoms(pdb_cleaned_string: str, pdb_ref_string: str):
"""Checks that no atom positions have been altered by cleaning."""
cleaned = app.PDBFile(io.StringIO(pdb_cleaned_string))
reference = app.PDBFile(io.StringIO(pdb_ref_string))

cl_xyz = np.array(cleaned.getPositions().value_in_unit(LENGTH))
ref_xyz = np.array(reference.getPositions().value_in_unit(LENGTH))

for ref_res, cl_res in zip(reference.topology.residues(),
cleaned.topology.residues()):
assert ref_res.name == cl_res.name
for rat in ref_res.atoms():
for cat in cl_res.atoms():
if cat.name == rat.name:
if not np.array_equal(cl_xyz[cat.index], ref_xyz[rat.index]):
raise ValueError(f"Coordinates of cleaned atom {cat} do not match "
f"coordinates of reference atom {rat}.")
151 changes: 151 additions & 0 deletions alphafold2mmopt/common/protein.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
import io
from typing import Optional
from Bio.PDB import PDBParser
import numpy as np
from . import residue_constants
from . import cleanup
from simtk.openmm.app.internal.pdbstructure import PdbStructure
import simtk.openmm.app as app
from ..utils import cost

class Protein():
def __init__(self,fname):
self.atom_positions=""
self.aatype=""
self.atom_mask=""
self.residue_index=""
self.b_factors=""
self.from_pdb_file(fname)
@cost
def from_pdb_file(self,fname,chain_id: Optional[str] = None):
parser = PDBParser(QUIET=True)
structure = parser.get_structure('none', fname)
models = list(structure.get_models())
assert len(models)==1, f'Only single model_ PDBs are supported. Found {len(models)} models.'
model = models[0]
if chain_id is not None:
chain = model[chain_id]
else:
chains = list(model.get_chains())
assert len(chains)==1, f'Only single chain PDBs are supported when chain_id not specified. Found {len(chains)} chains.'
chain = chains[0]
atom_positions = []
aatype = []
atom_mask = []
residue_index = []
b_factors = []
for res in chain:
if res.id[2] != ' ':
raise ValueError(
f'PDB contains an insertion code at chain {chain.id} and residue '
f'index {res.id[1]}. These are not supported.')
res_shortname = residue_constants.restype_3to1.get(res.resname, 'X')
restype_idx = residue_constants.restype_order.get(
res_shortname, residue_constants.restype_num)
pos = np.zeros((residue_constants.atom_type_num, 3))
mask = np.zeros((residue_constants.atom_type_num,))
res_b_factors = np.zeros((residue_constants.atom_type_num,))
for atom in res:
if atom.name not in residue_constants.atom_types:
continue
pos[residue_constants.atom_order[atom.name]] = atom.coord
mask[residue_constants.atom_order[atom.name]] = 1.
res_b_factors[residue_constants.atom_order[atom.name]] = atom.bfactor
if np.sum(mask) < 0.5:
# If no known atom positions are reported for the residue then skip it.
continue
aatype.append(restype_idx)
atom_positions.append(pos)
atom_mask.append(mask)
residue_index.append(res.id[1])
b_factors.append(res_b_factors)
self.aatype=np.array(aatype)
self.atom_mask=np.array(atom_mask)
self.atom_positions=np.array(atom_positions)
self.residue_index=np.array(residue_index)
self.b_factors=np.array(b_factors)
def to_pdb(self):
restypes = residue_constants.restypes + ['X']
res_1to3 = lambda r: residue_constants.restype_1to3.get(restypes[r], 'UNK')
atom_types = residue_constants.atom_types
pdb_lines = []
# residue_index = self.residue_index.astype(np.int32)
if np.any(self.aatype > residue_constants.restype_num):
raise ValueError('Invalid aatypes.')

pdb_lines.append('MODEL 1')
atom_index = 1
chain_id = 'A'
# Add all atom sites.
for i in range(self.aatype.shape[0]):
res_name_3 = res_1to3(self.aatype[i])
for atom_name, pos, mask, b_factor in zip(
atom_types, self.atom_positions[i], self.atom_mask[i], self.b_factors[i]):
if mask < 0.5:
continue

record_type = 'ATOM'
name = atom_name if len(atom_name) == 4 else f' {atom_name}'
alt_loc = ''
insertion_code = ''
occupancy = 1.00
element = atom_name[0] # Protein supports only C, N, O, S, this works.
charge = ''
# PDB is a columnar format, every space matters here!
atom_line = (f'{record_type:<6}{atom_index:>5} {name:<4}{alt_loc:>1}'
f'{res_name_3:>3} {chain_id:>1}'
f'{self.residue_index[i]:>4}{insertion_code:>1} '
f'{pos[0]:>8.3f}{pos[1]:>8.3f}{pos[2]:>8.3f}'
f'{occupancy:>6.2f}{b_factor:>6.2f} '
f'{element:>2}{charge:>2}')
pdb_lines.append(atom_line)
atom_index += 1

# Close the chain.
chain_end = 'TER'
chain_termination_line = (
f'{chain_end:<6}{atom_index:>5} {res_1to3(self.aatype[-1]):>3} '
f'{chain_id:>1}{self.residue_index[-1]:>4}')
pdb_lines.append(chain_termination_line)
pdb_lines.append('ENDMDL')

pdb_lines.append('END')
pdb_lines.append('')
return '\n'.join(pdb_lines)
def ideal_atom_mask(self):
return residue_constants.STANDARD_ATOM_MASK[self.aatype]
def from_prediction(self):
pass

def _check_atom_mask_is_ideal(self):
"""Sanity-check the atom mask is ideal, up to a possible OXT."""
# atom_mask = prot.atom_mask
ideal_atom_mask = self.ideal_atom_mask()
# TODO
# utils.assert_equal_nonterminal_atom_types(atom_mask, ideal_atom_mask)
@cost
def clean_protein(self,checks=True):
self._check_atom_mask_is_ideal()
# Clean pdb.
prot_pdb_string = self.to_pdb()
pdb_file = io.StringIO(prot_pdb_string)
alterations_info = {}
fixed_pdb = cleanup.fix_pdb(pdb_file, alterations_info)
fixed_pdb_file = io.StringIO(fixed_pdb)
pdb_structure = PdbStructure(fixed_pdb_file)
cleanup.clean_structure(pdb_structure, alterations_info)
# print("alterations info: %s", alterations_info)
# Write pdb file of cleaned structure.
as_file = app.PDBFile(pdb_structure)
with io.StringIO() as f:
as_file.writeFile(as_file.getTopology(), as_file.getPositions(), f)
pdb_string = f.getvalue()
if checks:
cleanup._check_cleaned_atoms(pdb_string, prot_pdb_string)
return pdb_string






Loading

0 comments on commit edc19f5

Please sign in to comment.