Skip to content

Commit

Permalink
Add Protein Blocks alphabet
Browse files Browse the repository at this point in the history
  • Loading branch information
padix-key committed Oct 17, 2024
1 parent 1f487cc commit 9a9c94b
Show file tree
Hide file tree
Showing 12 changed files with 337 additions and 4 deletions.
6 changes: 4 additions & 2 deletions doc/apidoc.json
Original file line number Diff line number Diff line change
Expand Up @@ -384,10 +384,12 @@
},
"biotite.structure.alphabet" : {
"Structural alphabets": [
"I3DSequence"
"I3DSequence",
"ProteinBlocksAlphabet"
],
"Conversion Function": [
"to_3di"
"to_3di",
"to_protein_blocks"
]
}
}
43 changes: 43 additions & 0 deletions src/biotite/sequence/align/matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -418,3 +418,46 @@ def std_3di_matrix():
from biotite.structure.alphabet.i3d import I3DSequence

return SubstitutionMatrix(I3DSequence.alphabet, I3DSequence.alphabet, "3Di")

@staticmethod
@functools.cache
def std_protein_blocks_matrix(unknown_match=200, unkown_mismatch=-200):
"""
Get the substitution matrix for Protein Blocks sequences.
The matrix is adapted from *PBxplore* :footcite:`Barnoud2017`.
Parameters
----------
unknown_match, unkown_mismatch : int, optional
The match and mismatch score for unknown/missing residues.
The default values were chose arbitrarily.
Returns
-------
matrix : SubstitutionMatrix
Default matrix.
References
----------
.. footbibliography::
"""
from biotite.structure.alphabet.pb import ProteinBlocksSequence

alphabet = ProteinBlocksSequence.alphabet
unknown_symbol = ProteinBlocksSequence.unknown_symbol
matrix_dict = SubstitutionMatrix.dict_from_db("PB")
# Add match/mismatch scores for unknown/missing residues
for symbol in alphabet:
if symbol == unknown_symbol:
continue
matrix_dict[symbol, unknown_symbol] = unkown_mismatch
matrix_dict[unknown_symbol, symbol] = unkown_mismatch
matrix_dict[unknown_symbol, unknown_symbol] = unknown_match
return SubstitutionMatrix(
alphabet,
alphabet,
matrix_dict,
)
21 changes: 21 additions & 0 deletions src/biotite/sequence/align/matrix_data/PB.license
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
The MIT License (MIT)

Copyright (c) 2013 Poulain, A. G. de Brevern

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
18 changes: 18 additions & 0 deletions src/biotite/sequence/align/matrix_data/PB.mat
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# PB substitution matrix, adapted from PBxplore
a b c d e f g h i j k l m n o p
a 516 -59 113 -105 -411 -177 -27 -361 47 -103 -644 -259 -599 -372 -124 -83
b -59 541 -146 -210 -155 -310 -97 90 182 -128 -30 29 -745 -242 -165 22
c 113 -146 360 -14 -333 -240 49 -438 -269 -282 -688 -682 -608 -455 -147 6
d -105 -210 -14 221 5 -131 -349 -278 -253 -173 -585 -670 -1573 -1048 -691 -497
e -411 -155 -333 5 520 185 186 138 -378 -70 -112 -514 -1136 -469 -617 -632
f -177 -310 -240 -131 185 459 -99 -45 -445 83 -214 -88 -547 -629 -406 -552
g -27 -97 49 -349 186 -99 665 -99 -89 -118 -409 -138 -124 172 128 254
h -361 90 -438 -278 138 -45 -99 632 -205 316 192 -108 -712 -359 95 -399
i 47 182 -269 -253 -378 -445 -89 -205 696 186 8 15 -709 -269 -169 226
j -103 -128 -282 -173 -70 83 -118 316 186 768 196 5 -398 -340 -117 -104
k -644 -30 -688 -585 -112 -214 -409 192 8 196 568 -65 -270 -231 -471 -382
l -259 29 -682 -670 -514 -88 -138 -108 15 5 -65 533 -131 8 -11 -316
m -599 -745 -608 -1573 -1136 -547 -124 -712 -709 -398 -270 -131 241 -4 -190 -155
n -372 -242 -455 -1048 -469 -629 172 -359 -269 -340 -231 8 -4 703 88 146
o -124 -165 -147 -691 -617 -406 128 95 -169 -117 -471 -11 -190 88 716 58
p -83 22 6 -497 -632 -552 254 -399 226 -104 -382 -316 -155 146 58 609
1 change: 1 addition & 0 deletions src/biotite/structure/alphabet/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@
__author__ = "Martin Larralde, Patrick Kunzmann"

from .i3d import *
from .pb import *
21 changes: 21 additions & 0 deletions src/biotite/structure/alphabet/pb.license
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
The MIT License (MIT)

Copyright (c) 2013 Poulain, A. G. de Brevern

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
143 changes: 143 additions & 0 deletions src/biotite/structure/alphabet/pb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
# This source code is part of the Biotite package and is distributed
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.

"""
Conversion of structures into the *Protein Blocks* structural alphabet.
"""

__name__ = "biotite.structure.alphabet"
__author__ = "Patrick Kunzmann"
__all__ = ["ProteinBlocksSequence", "to_protein_blocks"]

import numpy as np
from biotite.sequence.alphabet import LetterAlphabet
from biotite.sequence.sequence import Sequence
from biotite.structure.chains import get_chain_starts
from biotite.structure.geometry import dihedral_backbone

# PB reference angles, adapted from PBxplore
PB_ANGLES = np.array(
[
[41.14, 75.53, 13.92, -99.80, 131.88, -96.27, 122.08, -99.68],
[108.24, -90.12, 119.54, -92.21, -18.06, -128.93, 147.04, -99.90],
[-11.61, -105.66, 94.81, -106.09, 133.56, -106.93, 135.97, -100.63],
[141.98, -112.79, 132.20, -114.79, 140.11, -111.05, 139.54, -103.16],
[133.25, -112.37, 137.64, -108.13, 133.00, -87.30, 120.54, 77.40],
[116.40, -105.53, 129.32, -96.68, 140.72, -74.19, -26.65, -94.51],
[0.40, -81.83, 4.91, -100.59, 85.50, -71.65, 130.78, 84.98],
[119.14, -102.58, 130.83, -67.91, 121.55, 76.25, -2.95, -90.88],
[130.68, -56.92, 119.26, 77.85, 10.42, -99.43, 141.40, -98.01],
[114.32, -121.47, 118.14, 82.88, -150.05, -83.81, 23.35, -85.82],
[117.16, -95.41, 140.40, -59.35, -29.23, -72.39, -25.08, -76.16],
[139.20, -55.96, -32.70, -68.51, -26.09, -74.44, -22.60, -71.74],
[-39.62, -64.73, -39.52, -65.54, -38.88, -66.89, -37.76, -70.19],
[-35.34, -65.03, -38.12, -66.34, -29.51, -89.10, -2.91, 77.90],
[-45.29, -67.44, -27.72, -87.27, 5.13, 77.49, 30.71, -93.23],
[-27.09, -86.14, 0.30, 59.85, 21.51, -96.30, 132.67, -92.91],
]
)


class ProteinBlocksSequence(Sequence):
"""
Representation of a structure in the *Protein Blocks* structural alphabet.
:footcite:`Brevern2000`.
Parameters
----------
sequence : iterable object, optional
The 3Di sequence.
This may either be a list or a string.
May take upper or lower case letters.
By default the sequence is empty.
See also
--------
to_protein_blocks : Create *Protein Blocks* sequences from a structure.
References
----------
.. footbibliography::
"""

alphabet = LetterAlphabet("abcdefghijklmnopZ")
unknown_symbol = "Z"

def get_alphabet(self):
return ProteinBlocksSequence.alphabet


def to_protein_blocks(atoms):
"""
Encode each chain in the given structure to the *Protein Blocks* structural
alphabet.
:footcite:`Brevern2000`
Parameters
----------
atoms : AtomArray
The atom array to encode.
May contain multiple chains.
Returns
-------
sequences : list of Sequence, length=n
The encoded *Protein Blocks* sequence for each peptide chain in the structure.
chain_start_indices : ndarray, shape=(n,), dtype=int
The atom index where each chain starts.
References
----------
.. footbibliography::
Examples
--------
>>> sequences, chain_starts = to_protein_blocks(atom_array)
>>> print(sequences[0])
ZZmmmmmnopjmnopacdZZ
"""
sequences = []
chain_start_indices = get_chain_starts(atoms, add_exclusive_stop=True)
for i in range(len(chain_start_indices) - 1):
start = chain_start_indices[i]
stop = chain_start_indices[i + 1]
chain = atoms[start:stop]
sequences.append(_to_protein_blocks(chain))
return sequences, chain_start_indices[:-1]


def _to_protein_blocks(chain):
phi, psi, _ = dihedral_backbone(chain)

pb_angles = np.full((len(phi), 8), np.nan)
pb_angles[2:-2, 0] = psi[:-4]
pb_angles[2:-2, 1] = phi[1:-3]
pb_angles[2:-2, 2] = psi[1:-3]
pb_angles[2:-2, 3] = phi[2:-2]
pb_angles[2:-2, 4] = psi[2:-2]
pb_angles[2:-2, 5] = phi[3:-1]
pb_angles[2:-2, 6] = psi[3:-1]
pb_angles[2:-2, 7] = phi[4:]
pb_angles = np.rad2deg(pb_angles)

# Angle RMSD of all reference angles with all actual angles
rmsda = np.sum(
((PB_ANGLES[:, np.newaxis] - pb_angles[np.newaxis, :] + 180) % 360 - 180) ** 2,
axis=-1,
)
# Where RMSDA is NaN, (missing atoms/residues or chain ends) set symbol to unknown
pb_seq_code = np.full(len(pb_angles), ProteinBlocksSequence.alphabet.encode("Z"))
pb_available_mask = ~np.isnan(rmsda).any(axis=0)
# Chose PB, where the RMSDA to the reference angle is lowest
# Due to the definition of Biotite symbol codes
# the index of the chosen PB is directly the symbol code
pb_seq_code[pb_available_mask] = np.argmin(rmsda[:, pb_available_mask], axis=0)
# Put the array of symbol codes into actual sequence objects
pb_sequence = ProteinBlocksSequence()
pb_sequence.code = pb_seq_code
return pb_sequence
3 changes: 2 additions & 1 deletion tests/sequence/align/test_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
[
entry
for entry in align.SubstitutionMatrix.list_db()
if entry not in ["NUC", "GONNET", "3Di"]
if entry not in ["NUC", "GONNET", "3Di", "PB"]
],
)
def test_matrices(db_entry):
Expand Down Expand Up @@ -45,6 +45,7 @@ def test_structural_alphabet_matrices(matrix_name, alphabet):
"std_protein_matrix",
"std_nucleotide_matrix",
"std_3di_matrix",
"std_protein_blocks_matrix",
],
)
def test_default_matrices(method_name):
Expand Down
Binary file added tests/structure/data/alphabet/1ay7.bcif
Binary file not shown.
9 changes: 8 additions & 1 deletion tests/structure/data/alphabet/README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,11 @@ The 3Di sequences in `i3d.fasta` were generated with `foldseek` according to
$ foldseek createdb --chain-name-mode 1 tests/structure/data/*.cif /tmp/biotite_3di
$ foldseek lndb /tmp/biotite_3di_h /tmp/biotite_3di_ss_h
$ foldseek convert2fasta /tmp/biotite_3di_ss tests/structure/data/alphabet/i3d.fasta
$ foldseek convert2fasta /tmp/biotite_3di_ss tests/structure/data/alphabet/i3d.fasta
Protein Blocks sequences
------------------------

Only one sequence is available in `pb.fasta`, that is taken from
`https://pbxplore.readthedocs.io/en/latest/PBassign.html`.
`1ay7.bcif` contains the corresponding structure.
2 changes: 2 additions & 0 deletions tests/structure/data/alphabet/pb.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>1ay7
ZZdddfklpcbfklmmmmmmmmnopafklgoiaklmmmmmmmmpacddddddehkllmmmmnnommmmmmmmmmmmmmnopacddddZZ
74 changes: 74 additions & 0 deletions tests/structure/test_pb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
from pathlib import Path
import numpy as np
import pytest
import biotite.sequence.io.fasta as fasta
import biotite.structure as struc
import biotite.structure.alphabet as strucalph
import biotite.structure.io.pdbx as pdbx
from tests.util import data_dir


@pytest.fixture
def reference_sequence():
"""
Get the reference Protein Blocks sequence for the alphabet example structure.
"""
_, seq_string = next(
fasta.FastaFile.read_iter(Path(data_dir("structure")) / "alphabet" / "pb.fasta")
)
return strucalph.ProteinBlocksSequence(seq_string)


@pytest.fixture
def reference_chain():
pdbx_file = pdbx.BinaryCIFFile.read(
Path(data_dir("structure")) / "alphabet" / "1ay7.bcif"
)
atoms = pdbx.get_structure(pdbx_file, model=1)
atoms = atoms[struc.filter_amino_acids(atoms)]
chain = atoms[atoms.chain_id == "B"]
return chain


def test_to_protein_blocks(reference_chain, reference_sequence):
"""
Test the structure conversion to protein blocks based on a reference example from
the PBexplore documentation
(https://pbxplore.readthedocs.io/en/latest/intro_PB.html).
"""
test_pb_sequences, _ = strucalph.to_protein_blocks(reference_chain)

assert len(test_pb_sequences) == 1
assert str(test_pb_sequences[0]) == str(reference_sequence)


def test_missing_residues(reference_chain, reference_sequence):
"""
Like, `test_to_protein_blocks()`, but in some residues backbone atoms are missing.
Expect that these and adjacent residues get the unknown symbol 'Z' in the
PB sequence.
"""
N_DELETIONS = 5
# The 'Z' symbol
UKNOWN_SYMBOL = strucalph.ProteinBlocksSequence.unknown_symbol

np.random.seed(0)
del_backbone_residue_ids = np.random.choice(
np.unique(reference_chain.res_id), N_DELETIONS, replace=False
)
reference_chain = reference_chain[
~np.isin(reference_chain.res_id, del_backbone_residue_ids)
| ~np.isin(reference_chain.atom_name, ("N", "CA", "C"))
]

for res_id in del_backbone_residue_ids:
seq_index = res_id - reference_chain.res_id[0]
# Convert the PDB symbol for residue and adjacent ones to 'Z'
start_index = max(0, seq_index - 2)
end_index = min(len(reference_sequence), seq_index + 2)
reference_sequence[start_index : end_index + 1] = UKNOWN_SYMBOL

test_pb_sequences, _ = strucalph.to_protein_blocks(reference_chain)

assert len(test_pb_sequences) == 1
assert str(test_pb_sequences[0]) == str(reference_sequence)

0 comments on commit 9a9c94b

Please sign in to comment.