-
Notifications
You must be signed in to change notification settings - Fork 103
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
12 changed files
with
337 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
The MIT License (MIT) | ||
|
||
Copyright (c) 2013 Poulain, A. G. de Brevern | ||
|
||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
|
||
The above copyright notice and this permission notice shall be included in all | ||
copies or substantial portions of the Software. | ||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
SOFTWARE. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
# PB substitution matrix, adapted from PBxplore | ||
a b c d e f g h i j k l m n o p | ||
a 516 -59 113 -105 -411 -177 -27 -361 47 -103 -644 -259 -599 -372 -124 -83 | ||
b -59 541 -146 -210 -155 -310 -97 90 182 -128 -30 29 -745 -242 -165 22 | ||
c 113 -146 360 -14 -333 -240 49 -438 -269 -282 -688 -682 -608 -455 -147 6 | ||
d -105 -210 -14 221 5 -131 -349 -278 -253 -173 -585 -670 -1573 -1048 -691 -497 | ||
e -411 -155 -333 5 520 185 186 138 -378 -70 -112 -514 -1136 -469 -617 -632 | ||
f -177 -310 -240 -131 185 459 -99 -45 -445 83 -214 -88 -547 -629 -406 -552 | ||
g -27 -97 49 -349 186 -99 665 -99 -89 -118 -409 -138 -124 172 128 254 | ||
h -361 90 -438 -278 138 -45 -99 632 -205 316 192 -108 -712 -359 95 -399 | ||
i 47 182 -269 -253 -378 -445 -89 -205 696 186 8 15 -709 -269 -169 226 | ||
j -103 -128 -282 -173 -70 83 -118 316 186 768 196 5 -398 -340 -117 -104 | ||
k -644 -30 -688 -585 -112 -214 -409 192 8 196 568 -65 -270 -231 -471 -382 | ||
l -259 29 -682 -670 -514 -88 -138 -108 15 5 -65 533 -131 8 -11 -316 | ||
m -599 -745 -608 -1573 -1136 -547 -124 -712 -709 -398 -270 -131 241 -4 -190 -155 | ||
n -372 -242 -455 -1048 -469 -629 172 -359 -269 -340 -231 8 -4 703 88 146 | ||
o -124 -165 -147 -691 -617 -406 128 95 -169 -117 -471 -11 -190 88 716 58 | ||
p -83 22 6 -497 -632 -552 254 -399 226 -104 -382 -316 -155 146 58 609 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,3 +10,4 @@ | |
__author__ = "Martin Larralde, Patrick Kunzmann" | ||
|
||
from .i3d import * | ||
from .pb import * |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
The MIT License (MIT) | ||
|
||
Copyright (c) 2013 Poulain, A. G. de Brevern | ||
|
||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
|
||
The above copyright notice and this permission notice shall be included in all | ||
copies or substantial portions of the Software. | ||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
SOFTWARE. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,143 @@ | ||
# This source code is part of the Biotite package and is distributed | ||
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further | ||
# information. | ||
|
||
""" | ||
Conversion of structures into the *Protein Blocks* structural alphabet. | ||
""" | ||
|
||
__name__ = "biotite.structure.alphabet" | ||
__author__ = "Patrick Kunzmann" | ||
__all__ = ["ProteinBlocksSequence", "to_protein_blocks"] | ||
|
||
import numpy as np | ||
from biotite.sequence.alphabet import LetterAlphabet | ||
from biotite.sequence.sequence import Sequence | ||
from biotite.structure.chains import get_chain_starts | ||
from biotite.structure.geometry import dihedral_backbone | ||
|
||
# PB reference angles, adapted from PBxplore | ||
PB_ANGLES = np.array( | ||
[ | ||
[41.14, 75.53, 13.92, -99.80, 131.88, -96.27, 122.08, -99.68], | ||
[108.24, -90.12, 119.54, -92.21, -18.06, -128.93, 147.04, -99.90], | ||
[-11.61, -105.66, 94.81, -106.09, 133.56, -106.93, 135.97, -100.63], | ||
[141.98, -112.79, 132.20, -114.79, 140.11, -111.05, 139.54, -103.16], | ||
[133.25, -112.37, 137.64, -108.13, 133.00, -87.30, 120.54, 77.40], | ||
[116.40, -105.53, 129.32, -96.68, 140.72, -74.19, -26.65, -94.51], | ||
[0.40, -81.83, 4.91, -100.59, 85.50, -71.65, 130.78, 84.98], | ||
[119.14, -102.58, 130.83, -67.91, 121.55, 76.25, -2.95, -90.88], | ||
[130.68, -56.92, 119.26, 77.85, 10.42, -99.43, 141.40, -98.01], | ||
[114.32, -121.47, 118.14, 82.88, -150.05, -83.81, 23.35, -85.82], | ||
[117.16, -95.41, 140.40, -59.35, -29.23, -72.39, -25.08, -76.16], | ||
[139.20, -55.96, -32.70, -68.51, -26.09, -74.44, -22.60, -71.74], | ||
[-39.62, -64.73, -39.52, -65.54, -38.88, -66.89, -37.76, -70.19], | ||
[-35.34, -65.03, -38.12, -66.34, -29.51, -89.10, -2.91, 77.90], | ||
[-45.29, -67.44, -27.72, -87.27, 5.13, 77.49, 30.71, -93.23], | ||
[-27.09, -86.14, 0.30, 59.85, 21.51, -96.30, 132.67, -92.91], | ||
] | ||
) | ||
|
||
|
||
class ProteinBlocksSequence(Sequence): | ||
""" | ||
Representation of a structure in the *Protein Blocks* structural alphabet. | ||
:footcite:`Brevern2000`. | ||
Parameters | ||
---------- | ||
sequence : iterable object, optional | ||
The 3Di sequence. | ||
This may either be a list or a string. | ||
May take upper or lower case letters. | ||
By default the sequence is empty. | ||
See also | ||
-------- | ||
to_protein_blocks : Create *Protein Blocks* sequences from a structure. | ||
References | ||
---------- | ||
.. footbibliography:: | ||
""" | ||
|
||
alphabet = LetterAlphabet("abcdefghijklmnopZ") | ||
unknown_symbol = "Z" | ||
|
||
def get_alphabet(self): | ||
return ProteinBlocksSequence.alphabet | ||
|
||
|
||
def to_protein_blocks(atoms): | ||
""" | ||
Encode each chain in the given structure to the *Protein Blocks* structural | ||
alphabet. | ||
:footcite:`Brevern2000` | ||
Parameters | ||
---------- | ||
atoms : AtomArray | ||
The atom array to encode. | ||
May contain multiple chains. | ||
Returns | ||
------- | ||
sequences : list of Sequence, length=n | ||
The encoded *Protein Blocks* sequence for each peptide chain in the structure. | ||
chain_start_indices : ndarray, shape=(n,), dtype=int | ||
The atom index where each chain starts. | ||
References | ||
---------- | ||
.. footbibliography:: | ||
Examples | ||
-------- | ||
>>> sequences, chain_starts = to_protein_blocks(atom_array) | ||
>>> print(sequences[0]) | ||
ZZmmmmmnopjmnopacdZZ | ||
""" | ||
sequences = [] | ||
chain_start_indices = get_chain_starts(atoms, add_exclusive_stop=True) | ||
for i in range(len(chain_start_indices) - 1): | ||
start = chain_start_indices[i] | ||
stop = chain_start_indices[i + 1] | ||
chain = atoms[start:stop] | ||
sequences.append(_to_protein_blocks(chain)) | ||
return sequences, chain_start_indices[:-1] | ||
|
||
|
||
def _to_protein_blocks(chain): | ||
phi, psi, _ = dihedral_backbone(chain) | ||
|
||
pb_angles = np.full((len(phi), 8), np.nan) | ||
pb_angles[2:-2, 0] = psi[:-4] | ||
pb_angles[2:-2, 1] = phi[1:-3] | ||
pb_angles[2:-2, 2] = psi[1:-3] | ||
pb_angles[2:-2, 3] = phi[2:-2] | ||
pb_angles[2:-2, 4] = psi[2:-2] | ||
pb_angles[2:-2, 5] = phi[3:-1] | ||
pb_angles[2:-2, 6] = psi[3:-1] | ||
pb_angles[2:-2, 7] = phi[4:] | ||
pb_angles = np.rad2deg(pb_angles) | ||
|
||
# Angle RMSD of all reference angles with all actual angles | ||
rmsda = np.sum( | ||
((PB_ANGLES[:, np.newaxis] - pb_angles[np.newaxis, :] + 180) % 360 - 180) ** 2, | ||
axis=-1, | ||
) | ||
# Where RMSDA is NaN, (missing atoms/residues or chain ends) set symbol to unknown | ||
pb_seq_code = np.full(len(pb_angles), ProteinBlocksSequence.alphabet.encode("Z")) | ||
pb_available_mask = ~np.isnan(rmsda).any(axis=0) | ||
# Chose PB, where the RMSDA to the reference angle is lowest | ||
# Due to the definition of Biotite symbol codes | ||
# the index of the chosen PB is directly the symbol code | ||
pb_seq_code[pb_available_mask] = np.argmin(rmsda[:, pb_available_mask], axis=0) | ||
# Put the array of symbol codes into actual sequence objects | ||
pb_sequence = ProteinBlocksSequence() | ||
pb_sequence.code = pb_seq_code | ||
return pb_sequence |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
>1ay7 | ||
ZZdddfklpcbfklmmmmmmmmnopafklgoiaklmmmmmmmmpacddddddehkllmmmmnnommmmmmmmmmmmmmnopacddddZZ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
from pathlib import Path | ||
import numpy as np | ||
import pytest | ||
import biotite.sequence.io.fasta as fasta | ||
import biotite.structure as struc | ||
import biotite.structure.alphabet as strucalph | ||
import biotite.structure.io.pdbx as pdbx | ||
from tests.util import data_dir | ||
|
||
|
||
@pytest.fixture | ||
def reference_sequence(): | ||
""" | ||
Get the reference Protein Blocks sequence for the alphabet example structure. | ||
""" | ||
_, seq_string = next( | ||
fasta.FastaFile.read_iter(Path(data_dir("structure")) / "alphabet" / "pb.fasta") | ||
) | ||
return strucalph.ProteinBlocksSequence(seq_string) | ||
|
||
|
||
@pytest.fixture | ||
def reference_chain(): | ||
pdbx_file = pdbx.BinaryCIFFile.read( | ||
Path(data_dir("structure")) / "alphabet" / "1ay7.bcif" | ||
) | ||
atoms = pdbx.get_structure(pdbx_file, model=1) | ||
atoms = atoms[struc.filter_amino_acids(atoms)] | ||
chain = atoms[atoms.chain_id == "B"] | ||
return chain | ||
|
||
|
||
def test_to_protein_blocks(reference_chain, reference_sequence): | ||
""" | ||
Test the structure conversion to protein blocks based on a reference example from | ||
the PBexplore documentation | ||
(https://pbxplore.readthedocs.io/en/latest/intro_PB.html). | ||
""" | ||
test_pb_sequences, _ = strucalph.to_protein_blocks(reference_chain) | ||
|
||
assert len(test_pb_sequences) == 1 | ||
assert str(test_pb_sequences[0]) == str(reference_sequence) | ||
|
||
|
||
def test_missing_residues(reference_chain, reference_sequence): | ||
""" | ||
Like, `test_to_protein_blocks()`, but in some residues backbone atoms are missing. | ||
Expect that these and adjacent residues get the unknown symbol 'Z' in the | ||
PB sequence. | ||
""" | ||
N_DELETIONS = 5 | ||
# The 'Z' symbol | ||
UKNOWN_SYMBOL = strucalph.ProteinBlocksSequence.unknown_symbol | ||
|
||
np.random.seed(0) | ||
del_backbone_residue_ids = np.random.choice( | ||
np.unique(reference_chain.res_id), N_DELETIONS, replace=False | ||
) | ||
reference_chain = reference_chain[ | ||
~np.isin(reference_chain.res_id, del_backbone_residue_ids) | ||
| ~np.isin(reference_chain.atom_name, ("N", "CA", "C")) | ||
] | ||
|
||
for res_id in del_backbone_residue_ids: | ||
seq_index = res_id - reference_chain.res_id[0] | ||
# Convert the PDB symbol for residue and adjacent ones to 'Z' | ||
start_index = max(0, seq_index - 2) | ||
end_index = min(len(reference_sequence), seq_index + 2) | ||
reference_sequence[start_index : end_index + 1] = UKNOWN_SYMBOL | ||
|
||
test_pb_sequences, _ = strucalph.to_protein_blocks(reference_chain) | ||
|
||
assert len(test_pb_sequences) == 1 | ||
assert str(test_pb_sequences[0]) == str(reference_sequence) |