Merge pull request #576 from padix-key/repair

Add repair functions
biotite-dev · May 30, 2024 · 427cea1 · 427cea1
2 parents 7ae1da6 + 4dc6c91
commit 427cea1
Show file tree

Hide file tree

Showing 11 changed files with 324 additions and 179 deletions.
diff --git a/doc/apidoc.json b/doc/apidoc.json
@@ -284,6 +284,12 @@
             "check_bond_continuity",
             "check_linear_continuity"
         ],
+        "Repair" : [
+            "renumber_atom_ids",
+            "renumber_res_ids",
+            "create_continuous_res_ids",
+            "infer_elements"
+        ],
         "Residue level utility" : [
             "get_residue_starts",
             "get_residues",

diff --git a/src/biotite/structure/__init__.py b/src/biotite/structure/__init__.py
@@ -120,6 +120,7 @@
 from .molecules import *
 from .pseudoknots import *
 from .rdf import *
+from .repair import *
 from .residues import *
 from .chains import *
 from .sasa import *

diff --git a/src/biotite/structure/integrity.py b/src/biotite/structure/integrity.py
@@ -12,7 +12,7 @@
 __all__ = ["check_id_continuity", "check_atom_id_continuity",
            "check_res_id_continuity", "check_backbone_continuity",
            "check_duplicate_atoms", "check_bond_continuity",
-           "check_linear_continuity", "renumber_atom_ids", "renumber_res_ids"]
+           "check_linear_continuity"]
 
 import numpy as np
 import warnings
@@ -32,17 +32,17 @@ def check_id_continuity(array):
     """
     Check if the residue IDs are incremented by more than 1 or
     decremented, from one atom to the next one.
-    
+
     An increment by more than 1 is as strong clue for missing residues,
     a decrement means probably a start of a new chain.
 
     DEPRECATED: Use :func:`check_res_id_continuity()` instead.
-    
+
     Parameters
     ----------
     array : AtomArray or AtomArrayStack
         The array to be checked.
-    
+
     Returns
     -------
     discontinuity : ndarray, dtype=int
@@ -60,14 +60,14 @@ def check_atom_id_continuity(array):
     """
     Check if the atom IDs are incremented by more than 1 or
     decremented, from one atom to the next one.
-    
+
     An increment by more than 1 is as strong clue for missing atoms.
-    
+
     Parameters
     ----------
     array : AtomArray or AtomArrayStack
         The array to be checked.
-    
+
     Returns
     -------
     discontinuity : ndarray, dtype=int
@@ -81,15 +81,15 @@ def check_res_id_continuity(array):
     """
     Check if the residue IDs are incremented by more than 1 or
     decremented, from one atom to the next one.
-    
+
     An increment by more than 1 is as strong clue for missing residues,
     a decrement means probably a start of a new chain.
-    
+
     Parameters
     ----------
     array : AtomArray or AtomArrayStack
         The array to be checked.
-    
+
     Returns
     -------
     discontinuity : ndarray, dtype=int
@@ -168,7 +168,7 @@ def check_backbone_continuity(array, min_len=1.2, max_len=1.8):
     """
     Check if the (peptide or phosphate) backbone atoms have
     non-reasonable distance to the next atom.
-    
+
     A large or very small distance is a very strong clue, that there is
     no bond between those atoms, therefore the chain is discontinued.
 
@@ -206,16 +206,16 @@ def check_duplicate_atoms(array):
     """
     Check if a structure contains duplicate atoms, i.e. two atoms in a
     structure have the same annotations (coordinates may be different).
-    
+
     Duplicate atoms may appear, when a structure has occupancy for an
     atom at two or more positions or when the *altloc* positions are
     improperly read.
-    
+
     Parameters
     ----------
     array : AtomArray or AtomArrayStack
         The array to be checked.
-    
+
     Returns
     -------
     duplicate : ndarray, dtype=int
@@ -232,7 +232,7 @@ def check_duplicate_atoms(array):
         for annot in annots:
             # For each annotation array filter out the atoms until
             # index i that have an unequal annotation
-            # to the atom at index i 
+            # to the atom at index i
             is_dublicate &= (annot[:i] == annot[i])
         # After checking all annotation arrays,
         # if there still is any duplicate to the atom at index i,
@@ -255,7 +255,7 @@ def check_in_box(array):
     ----------
     array : AtomArray or AtomArrayStack
         The array to be checked.
-    
+
     Returns
     -------
     outside : ndarray, dtype=int
@@ -266,54 +266,3 @@ def check_in_box(array):
     box = array.box
     fractions = coord_to_fraction(array, box)
     return np.where(((fractions >= 0) & (fractions < 1)).all(axis=-1))[0]
-
-
-def renumber_atom_ids(array, start=None):
-    """
-    Renumber the atom IDs of the given array.
-
-    Parameters
-    ----------
-    array : AtomArray or AtomArrayStack
-        The array to be checked.
-    start : int, optional
-        The starting index for renumbering.
-        The first ID in the array is taken by default.
-
-    Returns
-    -------
-    array : AtomArray or AtomArrayStack
-        The renumbered array.
-    """
-    if "atom_id" not in array.get_annotation_categories():
-        raise ValueError("The atom array must have the 'atom_id' annotation")
-    if start is None:
-        start = array.atom_id[0]
-    array.atom_id = np.arange(start, array.shape[-1]+1)
-    return array
-
-
-def renumber_res_ids(array, start=None):
-    """
-    Renumber the residue IDs of the given array.
-
-    Parameters
-    ----------
-    array : AtomArray or AtomArrayStack
-        The array to be checked.
-    start : int, optional
-        The starting index for renumbering.
-        The first ID in the array is taken by default.
-    
-    Returns
-    -------
-    array : AtomArray or AtomArrayStack
-        The renumbered array.
-    """
-    if start is None:
-        start = array.res_id[0]
-    diff = np.diff(array.res_id)
-    diff[diff != 0] = 1
-    new_res_ids =  np.concatenate(([start], diff)).cumsum()
-    array.res_id = new_res_ids
-    return array
diff --git a/src/biotite/structure/io/general.py b/src/biotite/structure/io/general.py
@@ -13,7 +13,7 @@
 
 import os.path
 import io
-from ..atoms import AtomArray, AtomArrayStack
+from ..atoms import AtomArrayStack
 
 
 def load_structure(file_path, template=None, **kwargs):
@@ -224,42 +224,3 @@ def _as_single_model_if_possible(atoms):
         return atoms[0]
     else:
         return atoms
-
-
-# Helper function to estimate elements from atom names
-_elements = [elem.upper() for elem in
-["H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne", "Na", "Mg",
-"Al", "Si", "P", "S", "Cl", "Ar", "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe",
-"Co", "Ni", "Cu", "Zn", "Ga", "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y",
-"Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn", "Sb", "Te",
-"I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb",
-"Dy", "Ho", "Er", "Tm", "Yb", "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt",
-"Au", "Hg", "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th", "Pa",
-"U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr", "Rf",
-"Db", "Sg", "Bh", "Hs", "Mt", "Ds", "Rg", "Cn", "Nh", "Fl", "Mc", "Lv", "Ts",
-"Og"]
-]
-def _guess_element(atom_name):
-    # remove digits (1H -> H)
-    elem = "".join([i for i in atom_name if not i.isdigit()])
-    elem = elem.upper()
-    if len(elem) == 0:
-        return ""
-
-    # Some often used elements for biomolecules
-    if elem.startswith("C") or elem.startswith("N") or \
-        elem.startswith("O") or elem.startswith("S") or \
-        elem.startswith("H"):
-        return elem[0]
-
-    # Exactly match element abbreviations
-    try:
-        return _elements[_elements.index(elem[:2])]
-    except ValueError:
-        try:
-            return _elements[_elements.index(elem[0])]
-        except ValueError:
-            pass
-
-    return ""
-
diff --git a/src/biotite/structure/io/gro/file.py b/src/biotite/structure/io/gro/file.py
@@ -10,7 +10,7 @@
 from ...atoms import AtomArray, AtomArrayStack
 from ...box import is_orthogonal
 from ....file import TextFile, InvalidFileError
-from ..general import _guess_element as guess_element
+from ...repair import infer_elements
 from ...error import BadStructureError
 import copy
 from datetime import datetime
@@ -38,15 +38,15 @@ class GROFile(TextFile):
     --------
     Load a `\\*.gro` file, modify the structure and save the new
     structure into a new file:
-    
+
     >>> import os.path
     >>> file = GROFile.read(os.path.join(path_to_structures, "1l2y.gro"))
     >>> array_stack = file.get_structure()
     >>> array_stack_mod = rotate(array_stack, [1,2,3])
     >>> file = GROFile()
     >>> file.set_structure(array_stack_mod)
     >>> file.write(os.path.join(path_to_directory, "1l2y_mod.gro"))
-    
+
     """
     def get_model_count(self):
         """
@@ -68,7 +68,7 @@ def get_structure(self, model=None):
         """
         Get an :class:`AtomArray` or :class:`AtomArrayStack` from the
         GRO file.
-        
+
         Parameters
         ----------
         model : int, optional
@@ -80,21 +80,21 @@ def get_structure(self, model=None):
             If this parameter is omitted, an :class:`AtomArrayStack`
             containing all models will be returned, even if the
             structure contains only one model.
-        
+
         Returns
         -------
         array : AtomArray or AtomArrayStack
             The return type depends on the `model` parameter.
         """
-        
+
         def get_atom_line_i(model_start_i, model_atom_counts):
             """
             Helper function to get the indices of all atoms for a model
             """
             return np.arange(
                 model_start_i+1, model_start_i+1+model_atom_counts
             )
-        
+
         def set_box_dimen(box_param):
             """
             Helper function to create the box vectors from the values
@@ -104,7 +104,7 @@ def set_box_dimen(box_param):
             ----------
             box_param : list of float
                 The box dimensions in the GRO file.
-            
+
             Returns
             -------
             box_vectors : ndarray, dtype=float, shape=(3,3)
@@ -171,7 +171,7 @@ def set_box_dimen(box_param):
             array.res_id[i] = int(line[0:5])
             array.res_name[i] = line[5:10].strip()
             array.atom_name[i] = line[10:15].strip()
-            array.element[i] = guess_element(line[10:15].strip())
+        array.element = infer_elements(array.atom_name)
 
         # Fill in coordinates and boxes
         if isinstance(array, AtomArray):
@@ -186,7 +186,7 @@ def set_box_dimen(box_param):
             box_i = atom_i[-1] + 1
             box_param = [float(e)*10 for e in self.lines[box_i].split()]
             array.box = set_box_dimen(box_param)
-        
+
         elif isinstance(array, AtomArrayStack):
             for m in range(len(model_start_i)):
                 atom_i = get_atom_line_i(
@@ -204,18 +204,18 @@ def set_box_dimen(box_param):
                 # Create a box in the stack if not already existing
                 # and the box is not a dummy
                 if box is not None:
-                    if array.box is None: 
+                    if array.box is None:
                         array.box = np.zeros((array.stack_depth(), 3, 3))
                     array.box[m] = box
-                    
+
         return array
 
-            
+
     def set_structure(self, array):
         """
         Set the :class:`AtomArray` or :class:`AtomArrayStack` for the
         file.
-        
+
         Parameters
         ----------
         array : AtomArray or AtomArrayStack
@@ -235,7 +235,7 @@ def get_box_dimen(array):
             ----------
             array : AtomArray
                 The atom array to get the box dimensions from.
-            
+
             Returns
             -------
             box : str
@@ -259,7 +259,7 @@ def get_box_dimen(array):
                         box[2,0], box[2,1],
                     )
                     return " ".join([f"{e:>9.5f}" for e in box_elements])
-        
+
         if "atom_id" in array.get_annotation_categories():
             atom_id = array.atom_id
         else: