Initial commit v0.2

Kexin-Zhang-UCAS · Aug 25, 2021 · edc19f5 · edc19f5
commit edc19f5
Show file tree

Hide file tree

Showing 13 changed files with 2,201 additions and 0 deletions.
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1 @@
+recursive-include alphafold2mmopt *.txt
diff --git a/README.md b/README.md
@@ -0,0 +1,41 @@
+# alphafold2mmopt
+
+Based on `alphafold2`, `alphafoldmmopt` is developed to optimize molecules from any protein-generation model.
+
+## dependence
+
+```shell
+conda install openmm
+pip install dm-tree
+pip install biopython
+pip install dm-haiku
+conda install -c omnia pdbfixer
+pip install pytorch
+```
+
+## installation and run
+
+* install
+
+```shell
+pip install alphafold2mmopt
+```
+
+* running: for a given any `*.pdb` file
+
+```shell
+# shell
+alphafold2mmopt *.pdb
+```
+```python
+# python
+from proteinopt.common.protein import Protein
+from proteinopt.relax.localminimizer import relax
+prot = Protein(pdbname)
+opt_pdb=relax(prot)
+```
+# Updation
+## v0.2
+> 1. Use pytorch instead of Jax to accelerate lDDT matrix calculation.
+> > The lDDT calculation time can be reduced from 3s to 0.1s.
+> 2. Reconstruct api for Protein object and reduce repeated clean operation.
diff --git a/alphafold2mmopt/__init__.py b/alphafold2mmopt/__init__.py
@@ -0,0 +1,3 @@
+from . import relax
+from . import common
+from . import utils
diff --git a/alphafold2mmopt/common/__init__.py b/alphafold2mmopt/common/__init__.py
@@ -0,0 +1,3 @@
+from . import residue_constants
+from . import protein
+from . import cleanup
diff --git a/alphafold2mmopt/common/cleanup.py b/alphafold2mmopt/common/cleanup.py
@@ -0,0 +1,144 @@
+import io
+import pdbfixer
+from simtk.openmm import app
+from simtk.openmm.app import element
+import time
+import simtk.openmm as openmm
+import numpy as np
+from simtk import unit
+
+ENERGY = unit.kilocalories_per_mole
+LENGTH = unit.angstroms
+
+def fix_pdb(pdbfile, alterations_info):
+    """Apply pdbfixer to the contents of a PDB file; return a PDB string result.
+
+    1) Replaces nonstandard residues.
+    2) Removes heterogens (non protein residues) including water.
+    3) Adds missing residues and missing atoms within existing residues.
+    4) Adds hydrogens assuming pH=7.0.
+    5) KeepIds is currently true, so the fixer must keep the existing chain and
+       residue identifiers. This will fail for some files in wider PDB that have
+       invalid IDs.
+
+    Args:
+      pdbfile: Input PDB file handle.
+      alterations_info: A dict that will store details of changes made.
+
+    Returns:
+      A PDB string representing the fixed structure.
+    """
+
+    fixer = pdbfixer.PDBFixer(pdbfile=pdbfile)
+
+    fixer.findNonstandardResidues()
+
+    alterations_info['nonstandard_residues'] = fixer.nonstandardResidues
+    fixer.replaceNonstandardResidues()
+    _remove_heterogens(fixer, alterations_info, keep_water=False)
+
+    fixer.findMissingResidues()
+    alterations_info['missing_residues'] = fixer.missingResidues
+    fixer.findMissingAtoms()
+    alterations_info['missing_heavy_atoms'] = fixer.missingAtoms
+    alterations_info['missing_terminals'] = fixer.missingTerminals
+
+    ################################# time comsuption
+    fixer.addMissingAtoms(seed=0)
+    modeller = app.Modeller(fixer.topology, fixer.positions)
+    modeller.addHydrogens(platform=openmm.Platform.getPlatformByName("CUDA"))
+    fixer.topology = modeller.topology
+    fixer.positions = modeller.positions
+
+    #################################
+    # print(time.time() - begin)
+
+    out_handle = io.StringIO()
+    app.PDBFile.writeFile(fixer.topology, fixer.positions, out_handle,
+                          keepIds=True)
+    return out_handle.getvalue()
+
+
+def clean_structure(pdb_structure, alterations_info):
+    """Applies additional fixes to an OpenMM structure, to handle edge cases.
+
+    Args:
+      pdb_structure: An OpenMM structure to modify and fix.
+      alterations_info: A dict that will store details of changes made.
+    """
+    _replace_met_se(pdb_structure, alterations_info)
+    _remove_chains_of_length_one(pdb_structure, alterations_info)
+
+
+def _remove_heterogens(fixer, alterations_info, keep_water):
+    """Removes the residues that Pdbfixer considers to be heterogens.
+
+    Args:
+      fixer: A Pdbfixer instance.
+      alterations_info: A dict that will store details of changes made.
+      keep_water: If True, water (HOH) is not considered to be a heterogen.
+    """
+    initial_resnames = set()
+    for chain in fixer.topology.chains():
+        for residue in chain.residues():
+            initial_resnames.add(residue.name)
+    fixer.removeHeterogens(keepWater=keep_water)
+    final_resnames = set()
+    for chain in fixer.topology.chains():
+        for residue in chain.residues():
+            final_resnames.add(residue.name)
+    alterations_info['removed_heterogens'] = (
+        initial_resnames.difference(final_resnames))
+
+
+def _replace_met_se(pdb_structure, alterations_info):
+    """Replace the Se in any MET residues that were not marked as modified."""
+    modified_met_residues = []
+    for res in pdb_structure.iter_residues():
+        name = res.get_name_with_spaces().strip()
+        if name == 'MET':
+            s_atom = res.get_atom('SD')
+            if s_atom.element_symbol == 'Se':
+                s_atom.element_symbol = 'S'
+                s_atom.element = element.get_by_symbol('S')
+                modified_met_residues.append(s_atom.residue_number)
+    alterations_info['Se_in_MET'] = modified_met_residues
+
+
+def _remove_chains_of_length_one(pdb_structure, alterations_info):
+    """Removes chains that correspond to a single amino acid.
+
+    A single amino acid in a chain is both N and C terminus. There is no force
+    template for this case.
+
+    Args:
+      pdb_structure: An OpenMM pdb_structure to modify and fix.
+      alterations_info: A dict that will store details of changes made.
+    """
+    removed_chains = {}
+    for model in pdb_structure.iter_models():
+        valid_chains = [c for c in model.iter_chains() if len(c) > 1]
+        invalid_chain_ids = [c.chain_id for c in model.iter_chains() if len(c) <= 1]
+        model.chains = valid_chains
+        for chain_id in invalid_chain_ids:
+            model.chains_by_id.pop(chain_id)
+        removed_chains[model.number] = invalid_chain_ids
+    alterations_info['removed_chains'] = removed_chains
+
+def _check_cleaned_atoms(pdb_cleaned_string: str, pdb_ref_string: str):
+    """Checks that no atom positions have been altered by cleaning."""
+    cleaned = app.PDBFile(io.StringIO(pdb_cleaned_string))
+    reference = app.PDBFile(io.StringIO(pdb_ref_string))
+
+    cl_xyz = np.array(cleaned.getPositions().value_in_unit(LENGTH))
+    ref_xyz = np.array(reference.getPositions().value_in_unit(LENGTH))
+
+    for ref_res, cl_res in zip(reference.topology.residues(),
+                               cleaned.topology.residues()):
+        assert ref_res.name == cl_res.name
+        for rat in ref_res.atoms():
+            for cat in cl_res.atoms():
+                if cat.name == rat.name:
+                    if not np.array_equal(cl_xyz[cat.index], ref_xyz[rat.index]):
+                        raise ValueError(f"Coordinates of cleaned atom {cat} do not match "
+                                         f"coordinates of reference atom {rat}.")
diff --git a/alphafold2mmopt/common/protein.py b/alphafold2mmopt/common/protein.py
@@ -0,0 +1,151 @@
+import io
+from typing import Optional
+from Bio.PDB import PDBParser
+import numpy as np
+from . import residue_constants
+from . import cleanup
+from simtk.openmm.app.internal.pdbstructure import PdbStructure
+import simtk.openmm.app as app
+from ..utils import cost
+
+class Protein():
+    def __init__(self,fname):
+        self.atom_positions=""
+        self.aatype=""
+        self.atom_mask=""
+        self.residue_index=""
+        self.b_factors=""
+        self.from_pdb_file(fname)
+    @cost
+    def from_pdb_file(self,fname,chain_id: Optional[str] = None):
+        parser = PDBParser(QUIET=True)
+        structure = parser.get_structure('none', fname)
+        models = list(structure.get_models())
+        assert len(models)==1, f'Only single model_ PDBs are supported. Found {len(models)} models.'
+        model = models[0]
+        if chain_id is not None:
+            chain = model[chain_id]
+        else:
+            chains = list(model.get_chains())
+            assert len(chains)==1, f'Only single chain PDBs are supported when chain_id not specified. Found {len(chains)} chains.'
+            chain = chains[0]
+        atom_positions = []
+        aatype = []
+        atom_mask = []
+        residue_index = []
+        b_factors = []
+        for res in chain:
+            if res.id[2] != ' ':
+                raise ValueError(
+                    f'PDB contains an insertion code at chain {chain.id} and residue '
+                    f'index {res.id[1]}. These are not supported.')
+            res_shortname = residue_constants.restype_3to1.get(res.resname, 'X')
+            restype_idx = residue_constants.restype_order.get(
+                res_shortname, residue_constants.restype_num)
+            pos = np.zeros((residue_constants.atom_type_num, 3))
+            mask = np.zeros((residue_constants.atom_type_num,))
+            res_b_factors = np.zeros((residue_constants.atom_type_num,))
+            for atom in res:
+                if atom.name not in residue_constants.atom_types:
+                    continue
+                pos[residue_constants.atom_order[atom.name]] = atom.coord
+                mask[residue_constants.atom_order[atom.name]] = 1.
+                res_b_factors[residue_constants.atom_order[atom.name]] = atom.bfactor
+            if np.sum(mask) < 0.5:
+                # If no known atom positions are reported for the residue then skip it.
+                continue
+            aatype.append(restype_idx)
+            atom_positions.append(pos)
+            atom_mask.append(mask)
+            residue_index.append(res.id[1])
+            b_factors.append(res_b_factors)
+        self.aatype=np.array(aatype)
+        self.atom_mask=np.array(atom_mask)
+        self.atom_positions=np.array(atom_positions)
+        self.residue_index=np.array(residue_index)
+        self.b_factors=np.array(b_factors)
+    def to_pdb(self):
+        restypes = residue_constants.restypes + ['X']
+        res_1to3 = lambda r: residue_constants.restype_1to3.get(restypes[r], 'UNK')
+        atom_types = residue_constants.atom_types
+        pdb_lines = []
+        # residue_index = self.residue_index.astype(np.int32)
+        if np.any(self.aatype > residue_constants.restype_num):
+            raise ValueError('Invalid aatypes.')
+
+        pdb_lines.append('MODEL     1')
+        atom_index = 1
+        chain_id = 'A'
+        # Add all atom sites.
+        for i in range(self.aatype.shape[0]):
+            res_name_3 = res_1to3(self.aatype[i])
+            for atom_name, pos, mask, b_factor in zip(
+                    atom_types, self.atom_positions[i], self.atom_mask[i], self.b_factors[i]):
+                if mask < 0.5:
+                    continue
+
+                record_type = 'ATOM'
+                name = atom_name if len(atom_name) == 4 else f' {atom_name}'
+                alt_loc = ''
+                insertion_code = ''
+                occupancy = 1.00
+                element = atom_name[0]  # Protein supports only C, N, O, S, this works.
+                charge = ''
+                # PDB is a columnar format, every space matters here!
+                atom_line = (f'{record_type:<6}{atom_index:>5} {name:<4}{alt_loc:>1}'
+                             f'{res_name_3:>3} {chain_id:>1}'
+                             f'{self.residue_index[i]:>4}{insertion_code:>1}   '
+                             f'{pos[0]:>8.3f}{pos[1]:>8.3f}{pos[2]:>8.3f}'
+                             f'{occupancy:>6.2f}{b_factor:>6.2f}          '
+                             f'{element:>2}{charge:>2}')
+                pdb_lines.append(atom_line)
+                atom_index += 1
+
+        # Close the chain.
+        chain_end = 'TER'
+        chain_termination_line = (
+            f'{chain_end:<6}{atom_index:>5}      {res_1to3(self.aatype[-1]):>3} '
+            f'{chain_id:>1}{self.residue_index[-1]:>4}')
+        pdb_lines.append(chain_termination_line)
+        pdb_lines.append('ENDMDL')
+
+        pdb_lines.append('END')
+        pdb_lines.append('')
+        return '\n'.join(pdb_lines)
+    def ideal_atom_mask(self):
+        return residue_constants.STANDARD_ATOM_MASK[self.aatype]
+    def from_prediction(self):
+        pass
+
+    def _check_atom_mask_is_ideal(self):
+        """Sanity-check the atom mask is ideal, up to a possible OXT."""
+        # atom_mask = prot.atom_mask
+        ideal_atom_mask = self.ideal_atom_mask()
+        # TODO
+        # utils.assert_equal_nonterminal_atom_types(atom_mask, ideal_atom_mask)
+    @cost
+    def clean_protein(self,checks=True):
+        self._check_atom_mask_is_ideal()
+        # Clean pdb.
+        prot_pdb_string = self.to_pdb()
+        pdb_file = io.StringIO(prot_pdb_string)
+        alterations_info = {}
+        fixed_pdb = cleanup.fix_pdb(pdb_file, alterations_info)
+        fixed_pdb_file = io.StringIO(fixed_pdb)
+        pdb_structure = PdbStructure(fixed_pdb_file)
+        cleanup.clean_structure(pdb_structure, alterations_info)
+        # print("alterations info: %s", alterations_info)
+        # Write pdb file of cleaned structure.
+        as_file = app.PDBFile(pdb_structure)
+        with io.StringIO() as f:
+            as_file.writeFile(as_file.getTopology(), as_file.getPositions(), f)
+            pdb_string = f.getvalue()
+        if checks:
+            cleanup._check_cleaned_atoms(pdb_string, prot_pdb_string)
+        return pdb_string
+
+
+
+
+
+