-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit edc19f5
Showing
13 changed files
with
2,201 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
recursive-include alphafold2mmopt *.txt |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
# alphafold2mmopt | ||
|
||
Based on `alphafold2`, `alphafoldmmopt` is developed to optimize molecules from any protein-generation model. | ||
|
||
## dependence | ||
|
||
```shell | ||
conda install openmm | ||
pip install dm-tree | ||
pip install biopython | ||
pip install dm-haiku | ||
conda install -c omnia pdbfixer | ||
pip install pytorch | ||
``` | ||
|
||
## installation and run | ||
|
||
* install | ||
|
||
```shell | ||
pip install alphafold2mmopt | ||
``` | ||
|
||
* running: for a given any `*.pdb` file | ||
|
||
```shell | ||
# shell | ||
alphafold2mmopt *.pdb | ||
``` | ||
```python | ||
# python | ||
from proteinopt.common.protein import Protein | ||
from proteinopt.relax.localminimizer import relax | ||
prot = Protein(pdbname) | ||
opt_pdb=relax(prot) | ||
``` | ||
# Updation | ||
## v0.2 | ||
> 1. Use pytorch instead of Jax to accelerate lDDT matrix calculation. | ||
> > The lDDT calculation time can be reduced from 3s to 0.1s. | ||
> 2. Reconstruct api for Protein object and reduce repeated clean operation. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
from . import relax | ||
from . import common | ||
from . import utils |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
from . import residue_constants | ||
from . import protein | ||
from . import cleanup |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,144 @@ | ||
import io | ||
import pdbfixer | ||
from simtk.openmm import app | ||
from simtk.openmm.app import element | ||
import time | ||
import simtk.openmm as openmm | ||
import numpy as np | ||
from simtk import unit | ||
|
||
ENERGY = unit.kilocalories_per_mole | ||
LENGTH = unit.angstroms | ||
|
||
def fix_pdb(pdbfile, alterations_info): | ||
"""Apply pdbfixer to the contents of a PDB file; return a PDB string result. | ||
1) Replaces nonstandard residues. | ||
2) Removes heterogens (non protein residues) including water. | ||
3) Adds missing residues and missing atoms within existing residues. | ||
4) Adds hydrogens assuming pH=7.0. | ||
5) KeepIds is currently true, so the fixer must keep the existing chain and | ||
residue identifiers. This will fail for some files in wider PDB that have | ||
invalid IDs. | ||
Args: | ||
pdbfile: Input PDB file handle. | ||
alterations_info: A dict that will store details of changes made. | ||
Returns: | ||
A PDB string representing the fixed structure. | ||
""" | ||
|
||
fixer = pdbfixer.PDBFixer(pdbfile=pdbfile) | ||
|
||
fixer.findNonstandardResidues() | ||
|
||
alterations_info['nonstandard_residues'] = fixer.nonstandardResidues | ||
fixer.replaceNonstandardResidues() | ||
_remove_heterogens(fixer, alterations_info, keep_water=False) | ||
|
||
fixer.findMissingResidues() | ||
alterations_info['missing_residues'] = fixer.missingResidues | ||
fixer.findMissingAtoms() | ||
alterations_info['missing_heavy_atoms'] = fixer.missingAtoms | ||
alterations_info['missing_terminals'] = fixer.missingTerminals | ||
|
||
################################# time comsuption | ||
fixer.addMissingAtoms(seed=0) | ||
modeller = app.Modeller(fixer.topology, fixer.positions) | ||
modeller.addHydrogens(platform=openmm.Platform.getPlatformByName("CUDA")) | ||
fixer.topology = modeller.topology | ||
fixer.positions = modeller.positions | ||
|
||
################################# | ||
# print(time.time() - begin) | ||
|
||
out_handle = io.StringIO() | ||
app.PDBFile.writeFile(fixer.topology, fixer.positions, out_handle, | ||
keepIds=True) | ||
return out_handle.getvalue() | ||
|
||
|
||
def clean_structure(pdb_structure, alterations_info): | ||
"""Applies additional fixes to an OpenMM structure, to handle edge cases. | ||
Args: | ||
pdb_structure: An OpenMM structure to modify and fix. | ||
alterations_info: A dict that will store details of changes made. | ||
""" | ||
_replace_met_se(pdb_structure, alterations_info) | ||
_remove_chains_of_length_one(pdb_structure, alterations_info) | ||
|
||
|
||
def _remove_heterogens(fixer, alterations_info, keep_water): | ||
"""Removes the residues that Pdbfixer considers to be heterogens. | ||
Args: | ||
fixer: A Pdbfixer instance. | ||
alterations_info: A dict that will store details of changes made. | ||
keep_water: If True, water (HOH) is not considered to be a heterogen. | ||
""" | ||
initial_resnames = set() | ||
for chain in fixer.topology.chains(): | ||
for residue in chain.residues(): | ||
initial_resnames.add(residue.name) | ||
fixer.removeHeterogens(keepWater=keep_water) | ||
final_resnames = set() | ||
for chain in fixer.topology.chains(): | ||
for residue in chain.residues(): | ||
final_resnames.add(residue.name) | ||
alterations_info['removed_heterogens'] = ( | ||
initial_resnames.difference(final_resnames)) | ||
|
||
|
||
def _replace_met_se(pdb_structure, alterations_info): | ||
"""Replace the Se in any MET residues that were not marked as modified.""" | ||
modified_met_residues = [] | ||
for res in pdb_structure.iter_residues(): | ||
name = res.get_name_with_spaces().strip() | ||
if name == 'MET': | ||
s_atom = res.get_atom('SD') | ||
if s_atom.element_symbol == 'Se': | ||
s_atom.element_symbol = 'S' | ||
s_atom.element = element.get_by_symbol('S') | ||
modified_met_residues.append(s_atom.residue_number) | ||
alterations_info['Se_in_MET'] = modified_met_residues | ||
|
||
|
||
def _remove_chains_of_length_one(pdb_structure, alterations_info): | ||
"""Removes chains that correspond to a single amino acid. | ||
A single amino acid in a chain is both N and C terminus. There is no force | ||
template for this case. | ||
Args: | ||
pdb_structure: An OpenMM pdb_structure to modify and fix. | ||
alterations_info: A dict that will store details of changes made. | ||
""" | ||
removed_chains = {} | ||
for model in pdb_structure.iter_models(): | ||
valid_chains = [c for c in model.iter_chains() if len(c) > 1] | ||
invalid_chain_ids = [c.chain_id for c in model.iter_chains() if len(c) <= 1] | ||
model.chains = valid_chains | ||
for chain_id in invalid_chain_ids: | ||
model.chains_by_id.pop(chain_id) | ||
removed_chains[model.number] = invalid_chain_ids | ||
alterations_info['removed_chains'] = removed_chains | ||
|
||
def _check_cleaned_atoms(pdb_cleaned_string: str, pdb_ref_string: str): | ||
"""Checks that no atom positions have been altered by cleaning.""" | ||
cleaned = app.PDBFile(io.StringIO(pdb_cleaned_string)) | ||
reference = app.PDBFile(io.StringIO(pdb_ref_string)) | ||
|
||
cl_xyz = np.array(cleaned.getPositions().value_in_unit(LENGTH)) | ||
ref_xyz = np.array(reference.getPositions().value_in_unit(LENGTH)) | ||
|
||
for ref_res, cl_res in zip(reference.topology.residues(), | ||
cleaned.topology.residues()): | ||
assert ref_res.name == cl_res.name | ||
for rat in ref_res.atoms(): | ||
for cat in cl_res.atoms(): | ||
if cat.name == rat.name: | ||
if not np.array_equal(cl_xyz[cat.index], ref_xyz[rat.index]): | ||
raise ValueError(f"Coordinates of cleaned atom {cat} do not match " | ||
f"coordinates of reference atom {rat}.") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,151 @@ | ||
import io | ||
from typing import Optional | ||
from Bio.PDB import PDBParser | ||
import numpy as np | ||
from . import residue_constants | ||
from . import cleanup | ||
from simtk.openmm.app.internal.pdbstructure import PdbStructure | ||
import simtk.openmm.app as app | ||
from ..utils import cost | ||
|
||
class Protein(): | ||
def __init__(self,fname): | ||
self.atom_positions="" | ||
self.aatype="" | ||
self.atom_mask="" | ||
self.residue_index="" | ||
self.b_factors="" | ||
self.from_pdb_file(fname) | ||
@cost | ||
def from_pdb_file(self,fname,chain_id: Optional[str] = None): | ||
parser = PDBParser(QUIET=True) | ||
structure = parser.get_structure('none', fname) | ||
models = list(structure.get_models()) | ||
assert len(models)==1, f'Only single model_ PDBs are supported. Found {len(models)} models.' | ||
model = models[0] | ||
if chain_id is not None: | ||
chain = model[chain_id] | ||
else: | ||
chains = list(model.get_chains()) | ||
assert len(chains)==1, f'Only single chain PDBs are supported when chain_id not specified. Found {len(chains)} chains.' | ||
chain = chains[0] | ||
atom_positions = [] | ||
aatype = [] | ||
atom_mask = [] | ||
residue_index = [] | ||
b_factors = [] | ||
for res in chain: | ||
if res.id[2] != ' ': | ||
raise ValueError( | ||
f'PDB contains an insertion code at chain {chain.id} and residue ' | ||
f'index {res.id[1]}. These are not supported.') | ||
res_shortname = residue_constants.restype_3to1.get(res.resname, 'X') | ||
restype_idx = residue_constants.restype_order.get( | ||
res_shortname, residue_constants.restype_num) | ||
pos = np.zeros((residue_constants.atom_type_num, 3)) | ||
mask = np.zeros((residue_constants.atom_type_num,)) | ||
res_b_factors = np.zeros((residue_constants.atom_type_num,)) | ||
for atom in res: | ||
if atom.name not in residue_constants.atom_types: | ||
continue | ||
pos[residue_constants.atom_order[atom.name]] = atom.coord | ||
mask[residue_constants.atom_order[atom.name]] = 1. | ||
res_b_factors[residue_constants.atom_order[atom.name]] = atom.bfactor | ||
if np.sum(mask) < 0.5: | ||
# If no known atom positions are reported for the residue then skip it. | ||
continue | ||
aatype.append(restype_idx) | ||
atom_positions.append(pos) | ||
atom_mask.append(mask) | ||
residue_index.append(res.id[1]) | ||
b_factors.append(res_b_factors) | ||
self.aatype=np.array(aatype) | ||
self.atom_mask=np.array(atom_mask) | ||
self.atom_positions=np.array(atom_positions) | ||
self.residue_index=np.array(residue_index) | ||
self.b_factors=np.array(b_factors) | ||
def to_pdb(self): | ||
restypes = residue_constants.restypes + ['X'] | ||
res_1to3 = lambda r: residue_constants.restype_1to3.get(restypes[r], 'UNK') | ||
atom_types = residue_constants.atom_types | ||
pdb_lines = [] | ||
# residue_index = self.residue_index.astype(np.int32) | ||
if np.any(self.aatype > residue_constants.restype_num): | ||
raise ValueError('Invalid aatypes.') | ||
|
||
pdb_lines.append('MODEL 1') | ||
atom_index = 1 | ||
chain_id = 'A' | ||
# Add all atom sites. | ||
for i in range(self.aatype.shape[0]): | ||
res_name_3 = res_1to3(self.aatype[i]) | ||
for atom_name, pos, mask, b_factor in zip( | ||
atom_types, self.atom_positions[i], self.atom_mask[i], self.b_factors[i]): | ||
if mask < 0.5: | ||
continue | ||
|
||
record_type = 'ATOM' | ||
name = atom_name if len(atom_name) == 4 else f' {atom_name}' | ||
alt_loc = '' | ||
insertion_code = '' | ||
occupancy = 1.00 | ||
element = atom_name[0] # Protein supports only C, N, O, S, this works. | ||
charge = '' | ||
# PDB is a columnar format, every space matters here! | ||
atom_line = (f'{record_type:<6}{atom_index:>5} {name:<4}{alt_loc:>1}' | ||
f'{res_name_3:>3} {chain_id:>1}' | ||
f'{self.residue_index[i]:>4}{insertion_code:>1} ' | ||
f'{pos[0]:>8.3f}{pos[1]:>8.3f}{pos[2]:>8.3f}' | ||
f'{occupancy:>6.2f}{b_factor:>6.2f} ' | ||
f'{element:>2}{charge:>2}') | ||
pdb_lines.append(atom_line) | ||
atom_index += 1 | ||
|
||
# Close the chain. | ||
chain_end = 'TER' | ||
chain_termination_line = ( | ||
f'{chain_end:<6}{atom_index:>5} {res_1to3(self.aatype[-1]):>3} ' | ||
f'{chain_id:>1}{self.residue_index[-1]:>4}') | ||
pdb_lines.append(chain_termination_line) | ||
pdb_lines.append('ENDMDL') | ||
|
||
pdb_lines.append('END') | ||
pdb_lines.append('') | ||
return '\n'.join(pdb_lines) | ||
def ideal_atom_mask(self): | ||
return residue_constants.STANDARD_ATOM_MASK[self.aatype] | ||
def from_prediction(self): | ||
pass | ||
|
||
def _check_atom_mask_is_ideal(self): | ||
"""Sanity-check the atom mask is ideal, up to a possible OXT.""" | ||
# atom_mask = prot.atom_mask | ||
ideal_atom_mask = self.ideal_atom_mask() | ||
# TODO | ||
# utils.assert_equal_nonterminal_atom_types(atom_mask, ideal_atom_mask) | ||
@cost | ||
def clean_protein(self,checks=True): | ||
self._check_atom_mask_is_ideal() | ||
# Clean pdb. | ||
prot_pdb_string = self.to_pdb() | ||
pdb_file = io.StringIO(prot_pdb_string) | ||
alterations_info = {} | ||
fixed_pdb = cleanup.fix_pdb(pdb_file, alterations_info) | ||
fixed_pdb_file = io.StringIO(fixed_pdb) | ||
pdb_structure = PdbStructure(fixed_pdb_file) | ||
cleanup.clean_structure(pdb_structure, alterations_info) | ||
# print("alterations info: %s", alterations_info) | ||
# Write pdb file of cleaned structure. | ||
as_file = app.PDBFile(pdb_structure) | ||
with io.StringIO() as f: | ||
as_file.writeFile(as_file.getTopology(), as_file.getPositions(), f) | ||
pdb_string = f.getvalue() | ||
if checks: | ||
cleanup._check_cleaned_atoms(pdb_string, prot_pdb_string) | ||
return pdb_string | ||
|
||
|
||
|
||
|
||
|
||
|
Oops, something went wrong.