Add Protein Blocks alphabet

biotite-dev · Oct 17, 2024 · 9a9c94b · 9a9c94b
1 parent 1f487cc
commit 9a9c94b
Show file tree

Hide file tree

Showing 12 changed files with 337 additions and 4 deletions.
diff --git a/doc/apidoc.json b/doc/apidoc.json
@@ -384,10 +384,12 @@
     },
     "biotite.structure.alphabet" : {
         "Structural alphabets": [
-            "I3DSequence"
+            "I3DSequence",
+            "ProteinBlocksAlphabet"
         ],
         "Conversion Function": [
-            "to_3di"
+            "to_3di",
+            "to_protein_blocks"
         ]
     }
 }
diff --git a/src/biotite/sequence/align/matrix.py b/src/biotite/sequence/align/matrix.py
@@ -418,3 +418,46 @@ def std_3di_matrix():
         from biotite.structure.alphabet.i3d import I3DSequence
 
         return SubstitutionMatrix(I3DSequence.alphabet, I3DSequence.alphabet, "3Di")
+
+    @staticmethod
+    @functools.cache
+    def std_protein_blocks_matrix(unknown_match=200, unkown_mismatch=-200):
+        """
+        Get the substitution matrix for Protein Blocks sequences.
+
+        The matrix is adapted from *PBxplore* :footcite:`Barnoud2017`.
+
+        Parameters
+        ----------
+        unknown_match, unkown_mismatch : int, optional
+            The match and mismatch score for unknown/missing residues.
+            The default values were chose arbitrarily.
+
+        Returns
+        -------
+        matrix : SubstitutionMatrix
+            Default matrix.
+
+        References
+        ----------
+
+        .. footbibliography::
+
+        """
+        from biotite.structure.alphabet.pb import ProteinBlocksSequence
+
+        alphabet = ProteinBlocksSequence.alphabet
+        unknown_symbol = ProteinBlocksSequence.unknown_symbol
+        matrix_dict = SubstitutionMatrix.dict_from_db("PB")
+        # Add match/mismatch scores for unknown/missing residues
+        for symbol in alphabet:
+            if symbol == unknown_symbol:
+                continue
+            matrix_dict[symbol, unknown_symbol] = unkown_mismatch
+            matrix_dict[unknown_symbol, symbol] = unkown_mismatch
+        matrix_dict[unknown_symbol, unknown_symbol] = unknown_match
+        return SubstitutionMatrix(
+            alphabet,
+            alphabet,
+            matrix_dict,
+        )
diff --git a/src/biotite/sequence/align/matrix_data/PB.license b/src/biotite/sequence/align/matrix_data/PB.license
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2013 Poulain, A. G. de Brevern
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/src/biotite/sequence/align/matrix_data/PB.mat b/src/biotite/sequence/align/matrix_data/PB.mat
@@ -0,0 +1,18 @@
+# PB substitution matrix, adapted from PBxplore
+   a     b     c     d     e     f     g     h     i     j     k     l     m     n     o     p
+a  516   -59   113  -105  -411  -177   -27  -361    47  -103  -644  -259  -599  -372  -124   -83
+b  -59   541  -146  -210  -155  -310   -97    90   182  -128   -30    29  -745  -242  -165    22
+c  113  -146   360   -14  -333  -240    49  -438  -269  -282  -688  -682  -608  -455  -147     6
+d -105  -210   -14   221     5  -131  -349  -278  -253  -173  -585  -670 -1573 -1048  -691  -497
+e -411  -155  -333     5   520   185   186   138  -378   -70  -112  -514 -1136  -469  -617  -632
+f -177  -310  -240  -131   185   459   -99   -45  -445    83  -214   -88  -547  -629  -406  -552
+g  -27   -97    49  -349   186   -99   665   -99   -89  -118  -409  -138  -124   172   128   254
+h -361    90  -438  -278   138   -45   -99   632  -205   316   192  -108  -712  -359    95  -399
+i   47   182  -269  -253  -378  -445   -89  -205   696   186     8    15  -709  -269  -169   226
+j -103  -128  -282  -173   -70    83  -118   316   186   768   196     5  -398  -340  -117  -104
+k -644   -30  -688  -585  -112  -214  -409   192     8   196   568   -65  -270  -231  -471  -382
+l -259    29  -682  -670  -514   -88  -138  -108    15     5   -65   533  -131     8   -11  -316
+m -599  -745  -608 -1573 -1136  -547  -124  -712  -709  -398  -270  -131   241    -4  -190  -155
+n -372  -242  -455 -1048  -469  -629   172  -359  -269  -340  -231     8    -4   703    88   146
+o -124  -165  -147  -691  -617  -406   128    95  -169  -117  -471   -11  -190    88   716    58
+p  -83    22     6  -497  -632  -552   254  -399   226  -104  -382  -316  -155   146    58   609
diff --git a/src/biotite/structure/alphabet/__init__.py b/src/biotite/structure/alphabet/__init__.py
@@ -10,3 +10,4 @@
 __author__ = "Martin Larralde, Patrick Kunzmann"
 
 from .i3d import *
+from .pb import *
diff --git a/src/biotite/structure/alphabet/pb.license b/src/biotite/structure/alphabet/pb.license
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2013 Poulain, A. G. de Brevern
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/src/biotite/structure/alphabet/pb.py b/src/biotite/structure/alphabet/pb.py
@@ -0,0 +1,143 @@
+# This source code is part of the Biotite package and is distributed
+# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
+# information.
+
+"""
+Conversion of structures into the *Protein Blocks* structural alphabet.
+"""
+
+__name__ = "biotite.structure.alphabet"
+__author__ = "Patrick Kunzmann"
+__all__ = ["ProteinBlocksSequence", "to_protein_blocks"]
+
+import numpy as np
+from biotite.sequence.alphabet import LetterAlphabet
+from biotite.sequence.sequence import Sequence
+from biotite.structure.chains import get_chain_starts
+from biotite.structure.geometry import dihedral_backbone
+
+# PB reference angles, adapted from PBxplore
+PB_ANGLES = np.array(
+    [
+        [41.14, 75.53, 13.92, -99.80, 131.88, -96.27, 122.08, -99.68],
+        [108.24, -90.12, 119.54, -92.21, -18.06, -128.93, 147.04, -99.90],
+        [-11.61, -105.66, 94.81, -106.09, 133.56, -106.93, 135.97, -100.63],
+        [141.98, -112.79, 132.20, -114.79, 140.11, -111.05, 139.54, -103.16],
+        [133.25, -112.37, 137.64, -108.13, 133.00, -87.30, 120.54, 77.40],
+        [116.40, -105.53, 129.32, -96.68, 140.72, -74.19, -26.65, -94.51],
+        [0.40, -81.83, 4.91, -100.59, 85.50, -71.65, 130.78, 84.98],
+        [119.14, -102.58, 130.83, -67.91, 121.55, 76.25, -2.95, -90.88],
+        [130.68, -56.92, 119.26, 77.85, 10.42, -99.43, 141.40, -98.01],
+        [114.32, -121.47, 118.14, 82.88, -150.05, -83.81, 23.35, -85.82],
+        [117.16, -95.41, 140.40, -59.35, -29.23, -72.39, -25.08, -76.16],
+        [139.20, -55.96, -32.70, -68.51, -26.09, -74.44, -22.60, -71.74],
+        [-39.62, -64.73, -39.52, -65.54, -38.88, -66.89, -37.76, -70.19],
+        [-35.34, -65.03, -38.12, -66.34, -29.51, -89.10, -2.91, 77.90],
+        [-45.29, -67.44, -27.72, -87.27, 5.13, 77.49, 30.71, -93.23],
+        [-27.09, -86.14, 0.30, 59.85, 21.51, -96.30, 132.67, -92.91],
+    ]
+)
+
+
+class ProteinBlocksSequence(Sequence):
+    """
+    Representation of a structure in the *Protein Blocks* structural alphabet.
+    :footcite:`Brevern2000`.
+
+    Parameters
+    ----------
+    sequence : iterable object, optional
+        The 3Di sequence.
+        This may either be a list or a string.
+        May take upper or lower case letters.
+        By default the sequence is empty.
+
+    See also
+    --------
+    to_protein_blocks : Create *Protein Blocks* sequences from a structure.
+
+    References
+    ----------
+
+    .. footbibliography::
+
+    """
+
+    alphabet = LetterAlphabet("abcdefghijklmnopZ")
+    unknown_symbol = "Z"
+
+    def get_alphabet(self):
+        return ProteinBlocksSequence.alphabet
+
+
+def to_protein_blocks(atoms):
+    """
+    Encode each chain in the given structure to the *Protein Blocks* structural
+    alphabet.
+    :footcite:`Brevern2000`
+
+    Parameters
+    ----------
+    atoms : AtomArray
+        The atom array to encode.
+        May contain multiple chains.
+
+    Returns
+    -------
+    sequences : list of Sequence, length=n
+        The encoded *Protein Blocks* sequence for each peptide chain in the structure.
+    chain_start_indices : ndarray, shape=(n,), dtype=int
+        The atom index where each chain starts.
+
+    References
+    ----------
+
+    .. footbibliography::
+
+    Examples
+    --------
+
+    >>> sequences, chain_starts = to_protein_blocks(atom_array)
+    >>> print(sequences[0])
+    ZZmmmmmnopjmnopacdZZ
+    """
+    sequences = []
+    chain_start_indices = get_chain_starts(atoms, add_exclusive_stop=True)
+    for i in range(len(chain_start_indices) - 1):
+        start = chain_start_indices[i]
+        stop = chain_start_indices[i + 1]
+        chain = atoms[start:stop]
+        sequences.append(_to_protein_blocks(chain))
+    return sequences, chain_start_indices[:-1]
+
+
+def _to_protein_blocks(chain):
+    phi, psi, _ = dihedral_backbone(chain)
+
+    pb_angles = np.full((len(phi), 8), np.nan)
+    pb_angles[2:-2, 0] = psi[:-4]
+    pb_angles[2:-2, 1] = phi[1:-3]
+    pb_angles[2:-2, 2] = psi[1:-3]
+    pb_angles[2:-2, 3] = phi[2:-2]
+    pb_angles[2:-2, 4] = psi[2:-2]
+    pb_angles[2:-2, 5] = phi[3:-1]
+    pb_angles[2:-2, 6] = psi[3:-1]
+    pb_angles[2:-2, 7] = phi[4:]
+    pb_angles = np.rad2deg(pb_angles)
+
+    # Angle RMSD of all reference angles with all actual angles
+    rmsda = np.sum(
+        ((PB_ANGLES[:, np.newaxis] - pb_angles[np.newaxis, :] + 180) % 360 - 180) ** 2,
+        axis=-1,
+    )
+    # Where RMSDA is NaN, (missing atoms/residues or chain ends) set symbol to unknown
+    pb_seq_code = np.full(len(pb_angles), ProteinBlocksSequence.alphabet.encode("Z"))
+    pb_available_mask = ~np.isnan(rmsda).any(axis=0)
+    # Chose PB, where the RMSDA to the reference angle is lowest
+    # Due to the definition of Biotite symbol codes
+    # the index of the chosen PB is directly the symbol code
+    pb_seq_code[pb_available_mask] = np.argmin(rmsda[:, pb_available_mask], axis=0)
+    # Put the array of symbol codes into actual sequence objects
+    pb_sequence = ProteinBlocksSequence()
+    pb_sequence.code = pb_seq_code
+    return pb_sequence
diff --git a/tests/sequence/align/test_matrix.py b/tests/sequence/align/test_matrix.py
@@ -14,7 +14,7 @@
     [
         entry
         for entry in align.SubstitutionMatrix.list_db()
-        if entry not in ["NUC", "GONNET", "3Di"]
+        if entry not in ["NUC", "GONNET", "3Di", "PB"]
     ],
 )
 def test_matrices(db_entry):
@@ -45,6 +45,7 @@ def test_structural_alphabet_matrices(matrix_name, alphabet):
         "std_protein_matrix",
         "std_nucleotide_matrix",
         "std_3di_matrix",
+        "std_protein_blocks_matrix",
     ],
 )
 def test_default_matrices(method_name):

diff --git a/tests/structure/data/alphabet/1ay7.bcif b/tests/structure/data/alphabet/1ay7.bcif
diff --git a/tests/structure/data/alphabet/README.rst b/tests/structure/data/alphabet/README.rst
@@ -15,4 +15,11 @@ The 3Di sequences in `i3d.fasta` were generated with `foldseek` according to
 
     $ foldseek createdb --chain-name-mode 1 tests/structure/data/*.cif /tmp/biotite_3di
     $ foldseek lndb /tmp/biotite_3di_h /tmp/biotite_3di_ss_h
-    $ foldseek convert2fasta /tmp/biotite_3di_ss tests/structure/data/alphabet/i3d.fasta
+    $ foldseek convert2fasta /tmp/biotite_3di_ss tests/structure/data/alphabet/i3d.fasta
+
+Protein Blocks sequences
+------------------------
+
+Only one sequence is available in `pb.fasta`, that is taken from
+`https://pbxplore.readthedocs.io/en/latest/PBassign.html`.
+`1ay7.bcif` contains the corresponding structure.
diff --git a/tests/structure/data/alphabet/pb.fasta b/tests/structure/data/alphabet/pb.fasta
@@ -0,0 +1,2 @@
+>1ay7
+ZZdddfklpcbfklmmmmmmmmnopafklgoiaklmmmmmmmmpacddddddehkllmmmmnnommmmmmmmmmmmmmnopacddddZZ
diff --git a/tests/structure/test_pb.py b/tests/structure/test_pb.py
@@ -0,0 +1,74 @@
+from pathlib import Path
+import numpy as np
+import pytest
+import biotite.sequence.io.fasta as fasta
+import biotite.structure as struc
+import biotite.structure.alphabet as strucalph
+import biotite.structure.io.pdbx as pdbx
+from tests.util import data_dir
+
+
+@pytest.fixture
+def reference_sequence():
+    """
+    Get the reference Protein Blocks sequence for the alphabet example structure.
+    """
+    _, seq_string = next(
+        fasta.FastaFile.read_iter(Path(data_dir("structure")) / "alphabet" / "pb.fasta")
+    )
+    return strucalph.ProteinBlocksSequence(seq_string)
+
+
+@pytest.fixture
+def reference_chain():
+    pdbx_file = pdbx.BinaryCIFFile.read(
+        Path(data_dir("structure")) / "alphabet" / "1ay7.bcif"
+    )
+    atoms = pdbx.get_structure(pdbx_file, model=1)
+    atoms = atoms[struc.filter_amino_acids(atoms)]
+    chain = atoms[atoms.chain_id == "B"]
+    return chain
+
+
+def test_to_protein_blocks(reference_chain, reference_sequence):
+    """
+    Test the structure conversion to protein blocks based on a reference example from
+    the PBexplore documentation
+    (https://pbxplore.readthedocs.io/en/latest/intro_PB.html).
+    """
+    test_pb_sequences, _ = strucalph.to_protein_blocks(reference_chain)
+
+    assert len(test_pb_sequences) == 1
+    assert str(test_pb_sequences[0]) == str(reference_sequence)
+
+
+def test_missing_residues(reference_chain, reference_sequence):
+    """
+    Like, `test_to_protein_blocks()`, but in some residues backbone atoms are missing.
+    Expect that these and adjacent residues get the unknown symbol 'Z' in the
+    PB sequence.
+    """
+    N_DELETIONS = 5
+    # The 'Z' symbol
+    UKNOWN_SYMBOL = strucalph.ProteinBlocksSequence.unknown_symbol
+
+    np.random.seed(0)
+    del_backbone_residue_ids = np.random.choice(
+        np.unique(reference_chain.res_id), N_DELETIONS, replace=False
+    )
+    reference_chain = reference_chain[
+        ~np.isin(reference_chain.res_id, del_backbone_residue_ids)
+        | ~np.isin(reference_chain.atom_name, ("N", "CA", "C"))
+    ]
+
+    for res_id in del_backbone_residue_ids:
+        seq_index = res_id - reference_chain.res_id[0]
+        # Convert the PDB symbol for residue and adjacent ones to 'Z'
+        start_index = max(0, seq_index - 2)
+        end_index = min(len(reference_sequence), seq_index + 2)
+        reference_sequence[start_index : end_index + 1] = UKNOWN_SYMBOL
+
+    test_pb_sequences, _ = strucalph.to_protein_blocks(reference_chain)
+
+    assert len(test_pb_sequences) == 1
+    assert str(test_pb_sequences[0]) == str(reference_sequence)
Original file line number	Diff line number	Diff line change
Expand Up		@@ -10,3 +10,4 @@
		__author__ = "Martin Larralde, Patrick Kunzmann"

		from .i3d import *
		from .pb import *
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		>1ay7
		ZZdddfklpcbfklmmmmmmmmnopafklgoiaklmmmmmmmmpacddddddehkllmmmmnnommmmmmmmmmmmmmnopacddddZZ