Skip to content

Commit

Permalink
Add Protein Blocks alphabet
Browse files Browse the repository at this point in the history
  • Loading branch information
padix-key committed Nov 4, 2024
1 parent d89e5ab commit e0be076
Show file tree
Hide file tree
Showing 13 changed files with 344 additions and 4 deletions.
6 changes: 4 additions & 2 deletions doc/apidoc.json
Original file line number Diff line number Diff line change
Expand Up @@ -404,10 +404,12 @@
},
"biotite.structure.alphabet" : {
"Structural alphabets": [
"I3DSequence"
"I3DSequence",
"ProteinBlocksSequence"
],
"Conversion Function": [
"to_3di"
"to_3di",
"to_protein_blocks"
]
}
}
46 changes: 46 additions & 0 deletions src/biotite/sequence/align/matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ class SubstitutionMatrix(object):
- Structural alphabet substitution matrices
- **3Di** - For 3Di alphabet from ``foldseek`` :footcite:`VanKempen2024`
- **PB** - For Protein Blocks alphabet from *PBexplore* :footcite:`Barnoud2017`
A list of all available matrix names is returned by
:meth:`list_db()`.
Expand Down Expand Up @@ -408,6 +409,7 @@ def std_3di_matrix():
"""
Get the default :class:`SubstitutionMatrix` for 3Di sequence
alignments.
:footcite:`VanKempen2024`
Returns
-------
Expand All @@ -418,3 +420,47 @@ def std_3di_matrix():
from biotite.structure.alphabet.i3d import I3DSequence

return SubstitutionMatrix(I3DSequence.alphabet, I3DSequence.alphabet, "3Di")

@staticmethod
@functools.cache
def std_protein_blocks_matrix(unknown_match=200, unkown_mismatch=-200):
"""
Get the default :class:`SubstitutionMatrix` for Protein Blocks sequences.
The matrix is adapted from *PBxplore* :footcite:`Barnoud2017`.
Parameters
----------
unknown_match, unknown_mismatch : int, optional
The match and mismatch score for undefined symbols.
The default values were chosen arbitrarily, but are in the order of
magnitude of the other score values.
Returns
-------
matrix : SubstitutionMatrix
Default matrix.
References
----------
.. footbibliography::
"""
from biotite.structure.alphabet.pb import ProteinBlocksSequence

alphabet = ProteinBlocksSequence.alphabet
unknown_symbol = ProteinBlocksSequence.unknown_symbol
matrix_dict = SubstitutionMatrix.dict_from_db("PB")
# Add match/mismatch scores for undefined symbols residues
for symbol in alphabet:
if symbol == unknown_symbol:
continue
matrix_dict[symbol, unknown_symbol] = unkown_mismatch
matrix_dict[unknown_symbol, symbol] = unkown_mismatch
matrix_dict[unknown_symbol, unknown_symbol] = unknown_match
return SubstitutionMatrix(
alphabet,
alphabet,
matrix_dict,
)
21 changes: 21 additions & 0 deletions src/biotite/sequence/align/matrix_data/PB.license
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
The MIT License (MIT)

Copyright (c) 2013 Poulain, A. G. de Brevern

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
18 changes: 18 additions & 0 deletions src/biotite/sequence/align/matrix_data/PB.mat
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# PB substitution matrix, adapted from PBxplore
a b c d e f g h i j k l m n o p
a 516 -59 113 -105 -411 -177 -27 -361 47 -103 -644 -259 -599 -372 -124 -83
b -59 541 -146 -210 -155 -310 -97 90 182 -128 -30 29 -745 -242 -165 22
c 113 -146 360 -14 -333 -240 49 -438 -269 -282 -688 -682 -608 -455 -147 6
d -105 -210 -14 221 5 -131 -349 -278 -253 -173 -585 -670 -1573 -1048 -691 -497
e -411 -155 -333 5 520 185 186 138 -378 -70 -112 -514 -1136 -469 -617 -632
f -177 -310 -240 -131 185 459 -99 -45 -445 83 -214 -88 -547 -629 -406 -552
g -27 -97 49 -349 186 -99 665 -99 -89 -118 -409 -138 -124 172 128 254
h -361 90 -438 -278 138 -45 -99 632 -205 316 192 -108 -712 -359 95 -399
i 47 182 -269 -253 -378 -445 -89 -205 696 186 8 15 -709 -269 -169 226
j -103 -128 -282 -173 -70 83 -118 316 186 768 196 5 -398 -340 -117 -104
k -644 -30 -688 -585 -112 -214 -409 192 8 196 568 -65 -270 -231 -471 -382
l -259 29 -682 -670 -514 -88 -138 -108 15 5 -65 533 -131 8 -11 -316
m -599 -745 -608 -1573 -1136 -547 -124 -712 -709 -398 -270 -131 241 -4 -190 -155
n -372 -242 -455 -1048 -469 -629 172 -359 -269 -340 -231 8 -4 703 88 146
o -124 -165 -147 -691 -617 -406 128 95 -169 -117 -471 -11 -190 88 716 58
p -83 22 6 -497 -632 -552 254 -399 226 -104 -382 -316 -155 146 58 609
1 change: 1 addition & 0 deletions src/biotite/structure/alphabet/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@
__author__ = "Martin Larralde, Patrick Kunzmann"

from .i3d import *
from .pb import *
21 changes: 21 additions & 0 deletions src/biotite/structure/alphabet/pb.license
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
The MIT License (MIT)

Copyright (c) 2013 Poulain, A. G. de Brevern

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
143 changes: 143 additions & 0 deletions src/biotite/structure/alphabet/pb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
# This source code is part of the Biotite package and is distributed
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.

"""
Conversion of structures into the *Protein Blocks* structural alphabet.
"""

__name__ = "biotite.structure.alphabet"
__author__ = "Patrick Kunzmann"
__all__ = ["ProteinBlocksSequence", "to_protein_blocks"]

import numpy as np
from biotite.sequence.alphabet import LetterAlphabet
from biotite.sequence.sequence import Sequence
from biotite.structure.chains import get_chain_starts
from biotite.structure.geometry import dihedral_backbone

# PB reference angles, adapted from PBxplore
PB_ANGLES = np.array(
[
[41.14, 75.53, 13.92, -99.80, 131.88, -96.27, 122.08, -99.68],
[108.24, -90.12, 119.54, -92.21, -18.06, -128.93, 147.04, -99.90],
[-11.61, -105.66, 94.81, -106.09, 133.56, -106.93, 135.97, -100.63],
[141.98, -112.79, 132.20, -114.79, 140.11, -111.05, 139.54, -103.16],
[133.25, -112.37, 137.64, -108.13, 133.00, -87.30, 120.54, 77.40],
[116.40, -105.53, 129.32, -96.68, 140.72, -74.19, -26.65, -94.51],
[0.40, -81.83, 4.91, -100.59, 85.50, -71.65, 130.78, 84.98],
[119.14, -102.58, 130.83, -67.91, 121.55, 76.25, -2.95, -90.88],
[130.68, -56.92, 119.26, 77.85, 10.42, -99.43, 141.40, -98.01],
[114.32, -121.47, 118.14, 82.88, -150.05, -83.81, 23.35, -85.82],
[117.16, -95.41, 140.40, -59.35, -29.23, -72.39, -25.08, -76.16],
[139.20, -55.96, -32.70, -68.51, -26.09, -74.44, -22.60, -71.74],
[-39.62, -64.73, -39.52, -65.54, -38.88, -66.89, -37.76, -70.19],
[-35.34, -65.03, -38.12, -66.34, -29.51, -89.10, -2.91, 77.90],
[-45.29, -67.44, -27.72, -87.27, 5.13, 77.49, 30.71, -93.23],
[-27.09, -86.14, 0.30, 59.85, 21.51, -96.30, 132.67, -92.91],
]
) # fmt: skip


class ProteinBlocksSequence(Sequence):
"""
Representation of a structure in the *Protein Blocks* structural alphabet.
:footcite:`Brevern2000`
Parameters
----------
sequence : iterable object, optional
The *Protein Blocks* sequence.
This may either be a list or a string.
May take upper or lower case letters.
By default the sequence is empty.
See also
--------
to_protein_blocks : Create *Protein Blocks* sequences from a structure.
References
----------
.. footbibliography::
"""

alphabet = LetterAlphabet("abcdefghijklmnopZ")
unknown_symbol = "Z"

def get_alphabet(self):
return ProteinBlocksSequence.alphabet


def to_protein_blocks(atoms):
"""
Encode each chain in the given structure to the *Protein Blocks* structural
alphabet.
:footcite:`Brevern2000`
Parameters
----------
atoms : AtomArray
The atom array to encode.
May contain multiple chains.
Returns
-------
sequences : list of Sequence, length=n
The encoded *Protein Blocks* sequence for each peptide chain in the structure.
chain_start_indices : ndarray, shape=(n,), dtype=int
The atom index where each chain starts.
References
----------
.. footbibliography::
Examples
--------
>>> sequences, chain_starts = to_protein_blocks(atom_array)
>>> print(sequences[0])
ZZmmmmmnopjmnopacdZZ
"""
sequences = []
chain_start_indices = get_chain_starts(atoms, add_exclusive_stop=True)
for i in range(len(chain_start_indices) - 1):
start = chain_start_indices[i]
stop = chain_start_indices[i + 1]
chain = atoms[start:stop]
sequences.append(_to_protein_blocks(chain))
return sequences, chain_start_indices[:-1]


def _to_protein_blocks(chain):
phi, psi, _ = dihedral_backbone(chain)

pb_angles = np.full((len(phi), 8), np.nan)
pb_angles[2:-2, 0] = psi[:-4]
pb_angles[2:-2, 1] = phi[1:-3]
pb_angles[2:-2, 2] = psi[1:-3]
pb_angles[2:-2, 3] = phi[2:-2]
pb_angles[2:-2, 4] = psi[2:-2]
pb_angles[2:-2, 5] = phi[3:-1]
pb_angles[2:-2, 6] = psi[3:-1]
pb_angles[2:-2, 7] = phi[4:]
pb_angles = np.rad2deg(pb_angles)

# Angle RMSD of all reference angles with all actual angles
rmsda = np.sum(
((PB_ANGLES[:, np.newaxis] - pb_angles[np.newaxis, :] + 180) % 360 - 180) ** 2,
axis=-1,
)
# Where RMSDA is NaN, (missing atoms/residues or chain ends) set symbol to unknown
pb_seq_code = np.full(len(pb_angles), ProteinBlocksSequence.alphabet.encode("Z"))
pb_available_mask = ~np.isnan(rmsda).any(axis=0)
# Chose PB, where the RMSDA to the reference angle is lowest
# Due to the definition of Biotite symbol codes
# the index of the chosen PB is directly the symbol code
pb_seq_code[pb_available_mask] = np.argmin(rmsda[:, pb_available_mask], axis=0)
# Put the array of symbol codes into actual sequence objects
pb_sequence = ProteinBlocksSequence()
pb_sequence.code = pb_seq_code
return pb_sequence
3 changes: 2 additions & 1 deletion tests/sequence/align/test_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
[
entry
for entry in align.SubstitutionMatrix.list_db()
if entry not in ["NUC", "GONNET", "3Di"]
if entry not in ["NUC", "GONNET", "3Di", "PB"]
],
)
def test_matrices(db_entry):
Expand Down Expand Up @@ -45,6 +45,7 @@ def test_structural_alphabet_matrices(matrix_name, alphabet):
"std_protein_matrix",
"std_nucleotide_matrix",
"std_3di_matrix",
"std_protein_blocks_matrix",
],
)
def test_default_matrices(method_name):
Expand Down
Binary file added tests/structure/data/alphabet/1ay7.bcif
Binary file not shown.
9 changes: 8 additions & 1 deletion tests/structure/data/alphabet/README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,11 @@ The 3Di sequences in `i3d.fasta` were generated with `foldseek` according to
$ foldseek createdb --chain-name-mode 1 tests/structure/data/*.cif /tmp/biotite_3di
$ foldseek lndb /tmp/biotite_3di_h /tmp/biotite_3di_ss_h
$ foldseek convert2fasta /tmp/biotite_3di_ss tests/structure/data/alphabet/i3d.fasta
$ foldseek convert2fasta /tmp/biotite_3di_ss tests/structure/data/alphabet/i3d.fasta
Protein Blocks sequences
------------------------

Only one sequence is available in `pb.fasta`, that is taken from
`https://pbxplore.readthedocs.io/en/latest/PBassign.html`.
`1ay7.bcif` contains the corresponding structure.
2 changes: 2 additions & 0 deletions tests/structure/data/alphabet/pb.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>1ay7
ZZdddfklpcbfklmmmmmmmmnopafklgoiaklmmmmmmmmpacddddddehkllmmmmnnommmmmmmmmmmmmmnopacddddZZ
2 changes: 2 additions & 0 deletions tests/structure/test_i3d.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ def test_missing_residues():
atoms = pdbx.get_structure(pdbx_file, model=1)
atoms = atoms[struc.filter_amino_acids(atoms)]

# Randomly delete some backbone atoms
rng = np.random.default_rng(1)
del_backbone_residue_ids = rng.choice(
np.unique(atoms.res_id), N_DELETIONS, replace=False
Expand All @@ -94,6 +95,7 @@ def test_missing_residues():
]
test_sequences, _ = strucalph.to_3di(atoms)

# Apply the same deletions to the reference sequence
ref_sequence = _get_ref_3di_sequence(PDB_ID, atoms.chain_id[0])
for res_id in del_backbone_residue_ids:
seq_index = res_id - atoms.res_id[0]
Expand Down
Loading

0 comments on commit e0be076

Please sign in to comment.