Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Protein Blocks structural alphabet #676

Merged
merged 2 commits into from
Nov 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions doc/apidoc.json
Original file line number Diff line number Diff line change
Expand Up @@ -404,10 +404,12 @@
},
"biotite.structure.alphabet" : {
"Structural alphabets": [
"I3DSequence"
"I3DSequence",
"ProteinBlocksSequence"
],
"Conversion Function": [
"to_3di"
"to_3di",
"to_protein_blocks"
]
}
}
10 changes: 6 additions & 4 deletions doc/examples/scripts/structure/protein/ramachandran.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,14 @@
# Download and parse file
file = rcsb.fetch("3vkh", "cif", gettempdir())
atom_array = strucio.load_structure(file)
# Extract first peptide chain
peptide = atom_array[struc.filter_amino_acids(atom_array)]
chain = peptide[peptide.chain_id == "A"]
# Calculate backbone dihedral angles
# from one of the two identical chains in the asymmetric unit
phi, psi, omega = struc.dihedral_backbone(atom_array[atom_array.chain_id == "A"])
# Conversion from radians into degree
phi *= 180 / np.pi
psi *= 180 / np.pi
phi, psi, omega = struc.dihedral_backbone(chain)
phi = np.rad2deg(phi)
psi = np.rad2deg(psi)
# Remove invalid values (NaN) at first and last position
phi = phi[1:-1]
psi = psi[1:-1]
Expand Down
4 changes: 2 additions & 2 deletions doc/tutorial/structure/measurement.rst
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,8 @@ simple *Ramachandran* plot for the first frame of *TC5b*.
)
ax.set_xlim(-180,180)
ax.set_ylim(-180,180)
ax.set_xlabel("$\phi$")
_ = ax.set_ylabel("$\psi$")
ax.set_xlabel(r"$\phi$")
_ = ax.set_ylabel(r"$\psi$")

Surface area
------------
Expand Down
46 changes: 46 additions & 0 deletions src/biotite/sequence/align/matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ class SubstitutionMatrix(object):
- Structural alphabet substitution matrices

- **3Di** - For 3Di alphabet from ``foldseek`` :footcite:`VanKempen2024`
- **PB** - For Protein Blocks alphabet from *PBexplore* :footcite:`Barnoud2017`

A list of all available matrix names is returned by
:meth:`list_db()`.
Expand Down Expand Up @@ -408,6 +409,7 @@ def std_3di_matrix():
"""
Get the default :class:`SubstitutionMatrix` for 3Di sequence
alignments.
:footcite:`VanKempen2024`

Returns
-------
Expand All @@ -418,3 +420,47 @@ def std_3di_matrix():
from biotite.structure.alphabet.i3d import I3DSequence

return SubstitutionMatrix(I3DSequence.alphabet, I3DSequence.alphabet, "3Di")

@staticmethod
@functools.cache
def std_protein_blocks_matrix(unknown_match=200, unkown_mismatch=-200):
"""
Get the default :class:`SubstitutionMatrix` for Protein Blocks sequences.

The matrix is adapted from *PBxplore* :footcite:`Barnoud2017`.

Parameters
----------
unknown_match, unknown_mismatch : int, optional
The match and mismatch score for undefined symbols.
The default values were chosen arbitrarily, but are in the order of
magnitude of the other score values.

Returns
-------
matrix : SubstitutionMatrix
Default matrix.

References
----------

.. footbibliography::

"""
from biotite.structure.alphabet.pb import ProteinBlocksSequence

alphabet = ProteinBlocksSequence.alphabet
unknown_symbol = ProteinBlocksSequence.unknown_symbol
matrix_dict = SubstitutionMatrix.dict_from_db("PB")
# Add match/mismatch scores for undefined symbols residues
for symbol in alphabet:
if symbol == unknown_symbol:
continue
matrix_dict[symbol, unknown_symbol] = unkown_mismatch
matrix_dict[unknown_symbol, symbol] = unkown_mismatch
matrix_dict[unknown_symbol, unknown_symbol] = unknown_match
return SubstitutionMatrix(
alphabet,
alphabet,
matrix_dict,
)
21 changes: 21 additions & 0 deletions src/biotite/sequence/align/matrix_data/PB.license
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
The MIT License (MIT)

Copyright (c) 2013 Poulain, A. G. de Brevern

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
18 changes: 18 additions & 0 deletions src/biotite/sequence/align/matrix_data/PB.mat
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# PB substitution matrix, adapted from PBxplore
a b c d e f g h i j k l m n o p
a 516 -59 113 -105 -411 -177 -27 -361 47 -103 -644 -259 -599 -372 -124 -83
b -59 541 -146 -210 -155 -310 -97 90 182 -128 -30 29 -745 -242 -165 22
c 113 -146 360 -14 -333 -240 49 -438 -269 -282 -688 -682 -608 -455 -147 6
d -105 -210 -14 221 5 -131 -349 -278 -253 -173 -585 -670 -1573 -1048 -691 -497
e -411 -155 -333 5 520 185 186 138 -378 -70 -112 -514 -1136 -469 -617 -632
f -177 -310 -240 -131 185 459 -99 -45 -445 83 -214 -88 -547 -629 -406 -552
g -27 -97 49 -349 186 -99 665 -99 -89 -118 -409 -138 -124 172 128 254
h -361 90 -438 -278 138 -45 -99 632 -205 316 192 -108 -712 -359 95 -399
i 47 182 -269 -253 -378 -445 -89 -205 696 186 8 15 -709 -269 -169 226
j -103 -128 -282 -173 -70 83 -118 316 186 768 196 5 -398 -340 -117 -104
k -644 -30 -688 -585 -112 -214 -409 192 8 196 568 -65 -270 -231 -471 -382
l -259 29 -682 -670 -514 -88 -138 -108 15 5 -65 533 -131 8 -11 -316
m -599 -745 -608 -1573 -1136 -547 -124 -712 -709 -398 -270 -131 241 -4 -190 -155
n -372 -242 -455 -1048 -469 -629 172 -359 -269 -340 -231 8 -4 703 88 146
o -124 -165 -147 -691 -617 -406 128 95 -169 -117 -471 -11 -190 88 716 58
p -83 22 6 -497 -632 -552 254 -399 226 -104 -382 -316 -155 146 58 609
1 change: 1 addition & 0 deletions src/biotite/structure/alphabet/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@
__author__ = "Martin Larralde, Patrick Kunzmann"

from .i3d import *
from .pb import *
21 changes: 21 additions & 0 deletions src/biotite/structure/alphabet/pb.license
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
The MIT License (MIT)

Copyright (c) 2013 Poulain, A. G. de Brevern

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
143 changes: 143 additions & 0 deletions src/biotite/structure/alphabet/pb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
# This source code is part of the Biotite package and is distributed
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.

"""
Conversion of structures into the *Protein Blocks* structural alphabet.
"""

__name__ = "biotite.structure.alphabet"
__author__ = "Patrick Kunzmann"
__all__ = ["ProteinBlocksSequence", "to_protein_blocks"]

import numpy as np
from biotite.sequence.alphabet import LetterAlphabet
from biotite.sequence.sequence import Sequence
from biotite.structure.chains import get_chain_starts
from biotite.structure.geometry import dihedral_backbone

# PB reference angles, adapted from PBxplore
PB_ANGLES = np.array(
[
[41.14, 75.53, 13.92, -99.80, 131.88, -96.27, 122.08, -99.68],
[108.24, -90.12, 119.54, -92.21, -18.06, -128.93, 147.04, -99.90],
[-11.61, -105.66, 94.81, -106.09, 133.56, -106.93, 135.97, -100.63],
[141.98, -112.79, 132.20, -114.79, 140.11, -111.05, 139.54, -103.16],
[133.25, -112.37, 137.64, -108.13, 133.00, -87.30, 120.54, 77.40],
[116.40, -105.53, 129.32, -96.68, 140.72, -74.19, -26.65, -94.51],
[0.40, -81.83, 4.91, -100.59, 85.50, -71.65, 130.78, 84.98],
[119.14, -102.58, 130.83, -67.91, 121.55, 76.25, -2.95, -90.88],
[130.68, -56.92, 119.26, 77.85, 10.42, -99.43, 141.40, -98.01],
[114.32, -121.47, 118.14, 82.88, -150.05, -83.81, 23.35, -85.82],
[117.16, -95.41, 140.40, -59.35, -29.23, -72.39, -25.08, -76.16],
[139.20, -55.96, -32.70, -68.51, -26.09, -74.44, -22.60, -71.74],
[-39.62, -64.73, -39.52, -65.54, -38.88, -66.89, -37.76, -70.19],
[-35.34, -65.03, -38.12, -66.34, -29.51, -89.10, -2.91, 77.90],
[-45.29, -67.44, -27.72, -87.27, 5.13, 77.49, 30.71, -93.23],
[-27.09, -86.14, 0.30, 59.85, 21.51, -96.30, 132.67, -92.91],
]
) # fmt: skip


class ProteinBlocksSequence(Sequence):
"""
Representation of a structure in the *Protein Blocks* structural alphabet.
:footcite:`Brevern2000`

Parameters
----------
sequence : iterable object, optional
The *Protein Blocks* sequence.
This may either be a list or a string.
May take upper or lower case letters.
By default the sequence is empty.

See also
--------
to_protein_blocks : Create *Protein Blocks* sequences from a structure.

References
----------

.. footbibliography::

"""

alphabet = LetterAlphabet("abcdefghijklmnopZ")
unknown_symbol = "Z"

def get_alphabet(self):
return ProteinBlocksSequence.alphabet


def to_protein_blocks(atoms):
"""
Encode each chain in the given structure to the *Protein Blocks* structural
alphabet.
:footcite:`Brevern2000`

Parameters
----------
atoms : AtomArray
The atom array to encode.
May contain multiple chains.

Returns
-------
sequences : list of Sequence, length=n
The encoded *Protein Blocks* sequence for each peptide chain in the structure.
chain_start_indices : ndarray, shape=(n,), dtype=int
The atom index where each chain starts.

References
----------

.. footbibliography::

Examples
--------

>>> sequences, chain_starts = to_protein_blocks(atom_array)
>>> print(sequences[0])
ZZmmmmmnopjmnopacdZZ
"""
sequences = []
chain_start_indices = get_chain_starts(atoms, add_exclusive_stop=True)
for i in range(len(chain_start_indices) - 1):
start = chain_start_indices[i]
stop = chain_start_indices[i + 1]
chain = atoms[start:stop]
sequences.append(_to_protein_blocks(chain))
return sequences, chain_start_indices[:-1]


def _to_protein_blocks(chain):
phi, psi, _ = dihedral_backbone(chain)

pb_angles = np.full((len(phi), 8), np.nan)
pb_angles[2:-2, 0] = psi[:-4]
pb_angles[2:-2, 1] = phi[1:-3]
pb_angles[2:-2, 2] = psi[1:-3]
pb_angles[2:-2, 3] = phi[2:-2]
pb_angles[2:-2, 4] = psi[2:-2]
pb_angles[2:-2, 5] = phi[3:-1]
pb_angles[2:-2, 6] = psi[3:-1]
pb_angles[2:-2, 7] = phi[4:]
pb_angles = np.rad2deg(pb_angles)

# Angle RMSD of all reference angles with all actual angles
rmsda = np.sum(
((PB_ANGLES[:, np.newaxis] - pb_angles[np.newaxis, :] + 180) % 360 - 180) ** 2,
axis=-1,
)
# Where RMSDA is NaN, (missing atoms/residues or chain ends) set symbol to unknown
pb_seq_code = np.full(len(pb_angles), ProteinBlocksSequence.alphabet.encode("Z"))
pb_available_mask = ~np.isnan(rmsda).any(axis=0)
# Chose PB, where the RMSDA to the reference angle is lowest
# Due to the definition of Biotite symbol codes
# the index of the chosen PB is directly the symbol code
pb_seq_code[pb_available_mask] = np.argmin(rmsda[:, pb_available_mask], axis=0)
# Put the array of symbol codes into actual sequence objects
pb_sequence = ProteinBlocksSequence()
pb_sequence.code = pb_seq_code
return pb_sequence
Loading
Loading