diff --git a/doc/apidoc.json b/doc/apidoc.json index 55371866d..7432602f4 100644 --- a/doc/apidoc.json +++ b/doc/apidoc.json @@ -404,10 +404,12 @@ }, "biotite.structure.alphabet" : { "Structural alphabets": [ - "I3DSequence" + "I3DSequence", + "ProteinBlocksSequence" ], "Conversion Function": [ - "to_3di" + "to_3di", + "to_protein_blocks" ] } } diff --git a/doc/examples/scripts/structure/protein/ramachandran.py b/doc/examples/scripts/structure/protein/ramachandran.py index 806ac283f..0f17bdbf7 100644 --- a/doc/examples/scripts/structure/protein/ramachandran.py +++ b/doc/examples/scripts/structure/protein/ramachandran.py @@ -21,12 +21,14 @@ # Download and parse file file = rcsb.fetch("3vkh", "cif", gettempdir()) atom_array = strucio.load_structure(file) +# Extract first peptide chain +peptide = atom_array[struc.filter_amino_acids(atom_array)] +chain = peptide[peptide.chain_id == "A"] # Calculate backbone dihedral angles # from one of the two identical chains in the asymmetric unit -phi, psi, omega = struc.dihedral_backbone(atom_array[atom_array.chain_id == "A"]) -# Conversion from radians into degree -phi *= 180 / np.pi -psi *= 180 / np.pi +phi, psi, omega = struc.dihedral_backbone(chain) +phi = np.rad2deg(phi) +psi = np.rad2deg(psi) # Remove invalid values (NaN) at first and last position phi = phi[1:-1] psi = psi[1:-1] diff --git a/doc/tutorial/structure/measurement.rst b/doc/tutorial/structure/measurement.rst index a5bd94d6f..63eeb73aa 100644 --- a/doc/tutorial/structure/measurement.rst +++ b/doc/tutorial/structure/measurement.rst @@ -98,8 +98,8 @@ simple *Ramachandran* plot for the first frame of *TC5b*. ) ax.set_xlim(-180,180) ax.set_ylim(-180,180) - ax.set_xlabel("$\phi$") - _ = ax.set_ylabel("$\psi$") + ax.set_xlabel(r"$\phi$") + _ = ax.set_ylabel(r"$\psi$") Surface area ------------ diff --git a/src/biotite/sequence/align/matrix.py b/src/biotite/sequence/align/matrix.py index 7c80fa1fc..9e4324ac9 100644 --- a/src/biotite/sequence/align/matrix.py +++ b/src/biotite/sequence/align/matrix.py @@ -65,6 +65,7 @@ class SubstitutionMatrix(object): - Structural alphabet substitution matrices - **3Di** - For 3Di alphabet from ``foldseek`` :footcite:`VanKempen2024` + - **PB** - For Protein Blocks alphabet from *PBexplore* :footcite:`Barnoud2017` A list of all available matrix names is returned by :meth:`list_db()`. @@ -408,6 +409,7 @@ def std_3di_matrix(): """ Get the default :class:`SubstitutionMatrix` for 3Di sequence alignments. + :footcite:`VanKempen2024` Returns ------- @@ -418,3 +420,47 @@ def std_3di_matrix(): from biotite.structure.alphabet.i3d import I3DSequence return SubstitutionMatrix(I3DSequence.alphabet, I3DSequence.alphabet, "3Di") + + @staticmethod + @functools.cache + def std_protein_blocks_matrix(unknown_match=200, unkown_mismatch=-200): + """ + Get the default :class:`SubstitutionMatrix` for Protein Blocks sequences. + + The matrix is adapted from *PBxplore* :footcite:`Barnoud2017`. + + Parameters + ---------- + unknown_match, unknown_mismatch : int, optional + The match and mismatch score for undefined symbols. + The default values were chosen arbitrarily, but are in the order of + magnitude of the other score values. + + Returns + ------- + matrix : SubstitutionMatrix + Default matrix. + + References + ---------- + + .. footbibliography:: + + """ + from biotite.structure.alphabet.pb import ProteinBlocksSequence + + alphabet = ProteinBlocksSequence.alphabet + unknown_symbol = ProteinBlocksSequence.unknown_symbol + matrix_dict = SubstitutionMatrix.dict_from_db("PB") + # Add match/mismatch scores for undefined symbols residues + for symbol in alphabet: + if symbol == unknown_symbol: + continue + matrix_dict[symbol, unknown_symbol] = unkown_mismatch + matrix_dict[unknown_symbol, symbol] = unkown_mismatch + matrix_dict[unknown_symbol, unknown_symbol] = unknown_match + return SubstitutionMatrix( + alphabet, + alphabet, + matrix_dict, + ) diff --git a/src/biotite/sequence/align/matrix_data/PB.license b/src/biotite/sequence/align/matrix_data/PB.license new file mode 100644 index 000000000..688633bfa --- /dev/null +++ b/src/biotite/sequence/align/matrix_data/PB.license @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2013 Poulain, A. G. de Brevern + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/src/biotite/sequence/align/matrix_data/PB.mat b/src/biotite/sequence/align/matrix_data/PB.mat new file mode 100644 index 000000000..abb8dc293 --- /dev/null +++ b/src/biotite/sequence/align/matrix_data/PB.mat @@ -0,0 +1,18 @@ +# PB substitution matrix, adapted from PBxplore + a b c d e f g h i j k l m n o p +a 516 -59 113 -105 -411 -177 -27 -361 47 -103 -644 -259 -599 -372 -124 -83 +b -59 541 -146 -210 -155 -310 -97 90 182 -128 -30 29 -745 -242 -165 22 +c 113 -146 360 -14 -333 -240 49 -438 -269 -282 -688 -682 -608 -455 -147 6 +d -105 -210 -14 221 5 -131 -349 -278 -253 -173 -585 -670 -1573 -1048 -691 -497 +e -411 -155 -333 5 520 185 186 138 -378 -70 -112 -514 -1136 -469 -617 -632 +f -177 -310 -240 -131 185 459 -99 -45 -445 83 -214 -88 -547 -629 -406 -552 +g -27 -97 49 -349 186 -99 665 -99 -89 -118 -409 -138 -124 172 128 254 +h -361 90 -438 -278 138 -45 -99 632 -205 316 192 -108 -712 -359 95 -399 +i 47 182 -269 -253 -378 -445 -89 -205 696 186 8 15 -709 -269 -169 226 +j -103 -128 -282 -173 -70 83 -118 316 186 768 196 5 -398 -340 -117 -104 +k -644 -30 -688 -585 -112 -214 -409 192 8 196 568 -65 -270 -231 -471 -382 +l -259 29 -682 -670 -514 -88 -138 -108 15 5 -65 533 -131 8 -11 -316 +m -599 -745 -608 -1573 -1136 -547 -124 -712 -709 -398 -270 -131 241 -4 -190 -155 +n -372 -242 -455 -1048 -469 -629 172 -359 -269 -340 -231 8 -4 703 88 146 +o -124 -165 -147 -691 -617 -406 128 95 -169 -117 -471 -11 -190 88 716 58 +p -83 22 6 -497 -632 -552 254 -399 226 -104 -382 -316 -155 146 58 609 \ No newline at end of file diff --git a/src/biotite/structure/alphabet/__init__.py b/src/biotite/structure/alphabet/__init__.py index 1e1678822..baafab6e1 100644 --- a/src/biotite/structure/alphabet/__init__.py +++ b/src/biotite/structure/alphabet/__init__.py @@ -10,3 +10,4 @@ __author__ = "Martin Larralde, Patrick Kunzmann" from .i3d import * +from .pb import * diff --git a/src/biotite/structure/alphabet/pb.license b/src/biotite/structure/alphabet/pb.license new file mode 100644 index 000000000..688633bfa --- /dev/null +++ b/src/biotite/structure/alphabet/pb.license @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2013 Poulain, A. G. de Brevern + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/src/biotite/structure/alphabet/pb.py b/src/biotite/structure/alphabet/pb.py new file mode 100644 index 000000000..e2c527cca --- /dev/null +++ b/src/biotite/structure/alphabet/pb.py @@ -0,0 +1,143 @@ +# This source code is part of the Biotite package and is distributed +# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further +# information. + +""" +Conversion of structures into the *Protein Blocks* structural alphabet. +""" + +__name__ = "biotite.structure.alphabet" +__author__ = "Patrick Kunzmann" +__all__ = ["ProteinBlocksSequence", "to_protein_blocks"] + +import numpy as np +from biotite.sequence.alphabet import LetterAlphabet +from biotite.sequence.sequence import Sequence +from biotite.structure.chains import get_chain_starts +from biotite.structure.geometry import dihedral_backbone + +# PB reference angles, adapted from PBxplore +PB_ANGLES = np.array( + [ + [41.14, 75.53, 13.92, -99.80, 131.88, -96.27, 122.08, -99.68], + [108.24, -90.12, 119.54, -92.21, -18.06, -128.93, 147.04, -99.90], + [-11.61, -105.66, 94.81, -106.09, 133.56, -106.93, 135.97, -100.63], + [141.98, -112.79, 132.20, -114.79, 140.11, -111.05, 139.54, -103.16], + [133.25, -112.37, 137.64, -108.13, 133.00, -87.30, 120.54, 77.40], + [116.40, -105.53, 129.32, -96.68, 140.72, -74.19, -26.65, -94.51], + [0.40, -81.83, 4.91, -100.59, 85.50, -71.65, 130.78, 84.98], + [119.14, -102.58, 130.83, -67.91, 121.55, 76.25, -2.95, -90.88], + [130.68, -56.92, 119.26, 77.85, 10.42, -99.43, 141.40, -98.01], + [114.32, -121.47, 118.14, 82.88, -150.05, -83.81, 23.35, -85.82], + [117.16, -95.41, 140.40, -59.35, -29.23, -72.39, -25.08, -76.16], + [139.20, -55.96, -32.70, -68.51, -26.09, -74.44, -22.60, -71.74], + [-39.62, -64.73, -39.52, -65.54, -38.88, -66.89, -37.76, -70.19], + [-35.34, -65.03, -38.12, -66.34, -29.51, -89.10, -2.91, 77.90], + [-45.29, -67.44, -27.72, -87.27, 5.13, 77.49, 30.71, -93.23], + [-27.09, -86.14, 0.30, 59.85, 21.51, -96.30, 132.67, -92.91], + ] +) # fmt: skip + + +class ProteinBlocksSequence(Sequence): + """ + Representation of a structure in the *Protein Blocks* structural alphabet. + :footcite:`Brevern2000` + + Parameters + ---------- + sequence : iterable object, optional + The *Protein Blocks* sequence. + This may either be a list or a string. + May take upper or lower case letters. + By default the sequence is empty. + + See also + -------- + to_protein_blocks : Create *Protein Blocks* sequences from a structure. + + References + ---------- + + .. footbibliography:: + + """ + + alphabet = LetterAlphabet("abcdefghijklmnopZ") + unknown_symbol = "Z" + + def get_alphabet(self): + return ProteinBlocksSequence.alphabet + + +def to_protein_blocks(atoms): + """ + Encode each chain in the given structure to the *Protein Blocks* structural + alphabet. + :footcite:`Brevern2000` + + Parameters + ---------- + atoms : AtomArray + The atom array to encode. + May contain multiple chains. + + Returns + ------- + sequences : list of Sequence, length=n + The encoded *Protein Blocks* sequence for each peptide chain in the structure. + chain_start_indices : ndarray, shape=(n,), dtype=int + The atom index where each chain starts. + + References + ---------- + + .. footbibliography:: + + Examples + -------- + + >>> sequences, chain_starts = to_protein_blocks(atom_array) + >>> print(sequences[0]) + ZZmmmmmnopjmnopacdZZ + """ + sequences = [] + chain_start_indices = get_chain_starts(atoms, add_exclusive_stop=True) + for i in range(len(chain_start_indices) - 1): + start = chain_start_indices[i] + stop = chain_start_indices[i + 1] + chain = atoms[start:stop] + sequences.append(_to_protein_blocks(chain)) + return sequences, chain_start_indices[:-1] + + +def _to_protein_blocks(chain): + phi, psi, _ = dihedral_backbone(chain) + + pb_angles = np.full((len(phi), 8), np.nan) + pb_angles[2:-2, 0] = psi[:-4] + pb_angles[2:-2, 1] = phi[1:-3] + pb_angles[2:-2, 2] = psi[1:-3] + pb_angles[2:-2, 3] = phi[2:-2] + pb_angles[2:-2, 4] = psi[2:-2] + pb_angles[2:-2, 5] = phi[3:-1] + pb_angles[2:-2, 6] = psi[3:-1] + pb_angles[2:-2, 7] = phi[4:] + pb_angles = np.rad2deg(pb_angles) + + # Angle RMSD of all reference angles with all actual angles + rmsda = np.sum( + ((PB_ANGLES[:, np.newaxis] - pb_angles[np.newaxis, :] + 180) % 360 - 180) ** 2, + axis=-1, + ) + # Where RMSDA is NaN, (missing atoms/residues or chain ends) set symbol to unknown + pb_seq_code = np.full(len(pb_angles), ProteinBlocksSequence.alphabet.encode("Z")) + pb_available_mask = ~np.isnan(rmsda).any(axis=0) + # Chose PB, where the RMSDA to the reference angle is lowest + # Due to the definition of Biotite symbol codes + # the index of the chosen PB is directly the symbol code + pb_seq_code[pb_available_mask] = np.argmin(rmsda[:, pb_available_mask], axis=0) + # Put the array of symbol codes into actual sequence objects + pb_sequence = ProteinBlocksSequence() + pb_sequence.code = pb_seq_code + return pb_sequence diff --git a/src/biotite/structure/geometry.py b/src/biotite/structure/geometry.py index cc5c59f4e..8f64fbfb8 100644 --- a/src/biotite/structure/geometry.py +++ b/src/biotite/structure/geometry.py @@ -25,10 +25,12 @@ import numpy as np from biotite.structure.atoms import AtomArray, AtomArrayStack, coord from biotite.structure.box import coord_to_fraction, fraction_to_coord, is_orthogonal -from biotite.structure.chains import chain_iter -from biotite.structure.error import BadStructureError -from biotite.structure.filter import filter_peptide_backbone -from biotite.structure.util import norm_vector, vector_dot +from biotite.structure.filter import filter_amino_acids +from biotite.structure.util import ( + coord_for_atom_name_per_residue, + norm_vector, + vector_dot, +) def displacement(atoms1, atoms2, box=None): @@ -480,139 +482,84 @@ def index_dihedral(*args, **kwargs): def dihedral_backbone(atom_array): """ - Measure the characteristic backbone dihedral angles of a protein - structure. + Measure the characteristic backbone dihedral angles of a chain. Parameters ---------- - atom_array: AtomArray or AtomArrayStack - The protein structure. A complete backbone, without gaps, - is required here. - Chain transitions are allowed, the angles at the transition are - `NaN`. - The order of the backbone atoms for each residue must be - (N, CA, C). + atoms: AtomArray or AtomArrayStack + The protein structure to measure the dihedral angles for. + For missing backbone atoms the corresponding angles are `NaN`. Returns ------- phi, psi, omega : ndarray - An array containing the 3 backbone dihedral angles for every - CA. 'phi' is not defined at the N-terminus, 'psi' and 'omega' - are not defined at the C-terminus. In these places the arrays - have *NaN* values. If an :class:`AtomArrayStack` is given, the - output angles are 2-dimensional, the first dimension corresponds - to the model number. - - Raises - ------ - BadStructureError - If the amount of backbone atoms is not equal to amount of - residues times 3 (for N, CA and C). - - See Also - -------- - dihedral - - Examples - -------- - - >>> phi, psi, omega = dihedral_backbone(atom_array) - >>> print(np.stack([np.rad2deg(phi), np.rad2deg(psi)]).T) - [[ nan -56.145] - [ -43.980 -51.309] - [ -66.466 -30.898] - [ -65.219 -45.945] - [ -64.747 -30.346] - [ -73.136 -43.425] - [ -64.882 -43.255] - [ -59.509 -25.698] - [ -77.989 -8.823] - [ 110.784 8.079] - [ 55.244 -124.371] - [ -57.983 -28.766] - [ -81.834 19.125] - [-124.057 13.401] - [ 67.931 25.218] - [-143.952 131.297] - [ -70.100 160.068] - [ -69.484 145.669] - [ -77.264 124.223] - [ -78.100 nan]] + An array containing the 3 backbone dihedral angles for every CA atom. + `phi` is not defined at the N-terminus, `psi` and `omega` are not defined at the + C-terminus. + In these places the arrays have *NaN* values. + If an :class:`AtomArrayStack` is given, the output angles are 2-dimensional, + the first dimension corresponds to the model number. """ - bb_filter = filter_peptide_backbone(atom_array) - backbone = atom_array[..., bb_filter] - - if ( - backbone.array_length() % 3 != 0 - or (backbone.atom_name[0::3] != "N").any() - or (backbone.atom_name[1::3] != "CA").any() - or (backbone.atom_name[2::3] != "C").any() - ): - raise BadStructureError( - "The backbone is invalid, must be repeats of (N, CA, C), " - "maybe a backbone atom is missing" - ) - phis = [] - psis = [] - omegas = [] - for chain_bb in chain_iter(backbone): - phi, psi, omega = _dihedral_backbone(chain_bb) - phis.append(phi) - psis.append(psi) - omegas.append(omega) - return ( - np.concatenate(phis, axis=-1), - np.concatenate(psis, axis=-1), - np.concatenate(omegas, axis=-1), - ) + amino_acid_mask = filter_amino_acids(atom_array) + # Coordinates for dihedral angle calculation + coord_n, coord_ca, coord_c = coord_for_atom_name_per_residue( + atom_array, + ("N", "CA", "C"), + amino_acid_mask, + ) + n_residues = coord_n.shape[-2] -def _dihedral_backbone(chain_bb): - bb_coord = chain_bb.coord # Coordinates for dihedral angle calculation # Dim 0: Model index (only for atom array stacks) # Dim 1: Angle index # Dim 2: X, Y, Z coordinates # Dim 3: Atoms involved in dihedral angle - if isinstance(chain_bb, AtomArray): - angle_coord_shape = (len(bb_coord) // 3, 3, 4) - elif isinstance(chain_bb, AtomArrayStack): - angle_coord_shape = (bb_coord.shape[0], bb_coord.shape[1] // 3, 3, 4) - phi_coord = np.full(angle_coord_shape, np.nan) - psi_coord = np.full(angle_coord_shape, np.nan) - omega_coord = np.full(angle_coord_shape, np.nan) - - # Indices for coordinates of CA atoms - ca_i = np.arange(bb_coord.shape[-2] // 3) * 3 + 1 + if isinstance(atom_array, AtomArray): + angle_coord_shape: tuple[int, ...] = (n_residues, 3, 4) + elif isinstance(atom_array, AtomArrayStack): + angle_coord_shape = (atom_array.stack_depth(), n_residues, 3, 4) + coord_for_phi = np.full(angle_coord_shape, np.nan, dtype=np.float32) + coord_for_psi = np.full(angle_coord_shape, np.nan, dtype=np.float32) + coord_for_omg = np.full(angle_coord_shape, np.nan, dtype=np.float32) + # fmt: off - phi_coord [..., 1:, :, 0] = bb_coord[..., ca_i[1: ]-2, :] - phi_coord [..., 1:, :, 1] = bb_coord[..., ca_i[1: ]-1, :] - phi_coord [..., 1:, :, 2] = bb_coord[..., ca_i[1: ], :] - phi_coord [..., 1:, :, 3] = bb_coord[..., ca_i[1: ]+1, :] - psi_coord [..., :-1, :, 0] = bb_coord[..., ca_i[:-1]-1, :] - psi_coord [..., :-1, :, 1] = bb_coord[..., ca_i[:-1], :] - psi_coord [..., :-1, :, 2] = bb_coord[..., ca_i[:-1]+1, :] - psi_coord [..., :-1, :, 3] = bb_coord[..., ca_i[:-1]+2, :] - omega_coord[..., :-1, :, 0] = bb_coord[..., ca_i[:-1], :] - omega_coord[..., :-1, :, 1] = bb_coord[..., ca_i[:-1]+1, :] - omega_coord[..., :-1, :, 2] = bb_coord[..., ca_i[:-1]+2, :] - omega_coord[..., :-1, :, 3] = bb_coord[..., ca_i[:-1]+3, :] + coord_for_phi[..., 1:, :, 0] = coord_c[..., 0:-1, :] + coord_for_phi[..., 1:, :, 1] = coord_n[..., 1:, :] + coord_for_phi[..., 1:, :, 2] = coord_ca[..., 1:, :] + coord_for_phi[..., 1:, :, 3] = coord_c[..., 1:, :] + + coord_for_psi[..., 0:-1, :, 0] = coord_n[..., 0:-1, :] + coord_for_psi[..., 0:-1, :, 1] = coord_ca[..., 0:-1, :] + coord_for_psi[..., 0:-1, :, 2] = coord_c[..., 0:-1, :] + coord_for_psi[..., 0:-1, :, 3] = coord_n[..., 1:, :] + + coord_for_omg[..., 0:-1, :, 0] = coord_ca[..., 0:-1, :] + coord_for_omg[..., 0:-1, :, 1] = coord_c[..., 0:-1, :] + coord_for_omg[..., 0:-1, :, 2] = coord_n[..., 1:, :] + coord_for_omg[..., 0:-1, :, 3] = coord_ca[..., 1:, :] # fmt: on phi = dihedral( - phi_coord[..., 0], phi_coord[..., 1], phi_coord[..., 2], phi_coord[..., 3] + coord_for_phi[..., 0], + coord_for_phi[..., 1], + coord_for_phi[..., 2], + coord_for_phi[..., 3], ) psi = dihedral( - psi_coord[..., 0], psi_coord[..., 1], psi_coord[..., 2], psi_coord[..., 3] + coord_for_psi[..., 0], + coord_for_psi[..., 1], + coord_for_psi[..., 2], + coord_for_psi[..., 3], ) - omega = dihedral( - omega_coord[..., 0], - omega_coord[..., 1], - omega_coord[..., 2], - omega_coord[..., 3], + omg = dihedral( + coord_for_omg[..., 0], + coord_for_omg[..., 1], + coord_for_omg[..., 2], + coord_for_omg[..., 3], ) - return phi, psi, omega + return phi, psi, omg def centroid(atoms): diff --git a/src/biotite/structure/util.py b/src/biotite/structure/util.py index 43816adfe..a2579d929 100644 --- a/src/biotite/structure/util.py +++ b/src/biotite/structure/util.py @@ -17,7 +17,7 @@ ] import numpy as np -from biotite.structure.atoms import AtomArray +from biotite.structure.atoms import AtomArrayStack from biotite.structure.error import BadStructureError from biotite.structure.residues import get_residue_masks, get_residue_starts @@ -105,7 +105,7 @@ def matrix_rotate(v, matrix): return v -def coord_for_atom_name_per_residue(atoms, atom_names): +def coord_for_atom_name_per_residue(atoms, atom_names, mask=None): """ Get the coordinates of a specific atom for every residue. @@ -118,39 +118,51 @@ def coord_for_atom_name_per_residue(atoms, atom_names): atoms : AtomArray, shape=(n,) or AtomArrayStack, shape=(m,n) The atom array or stack to get the residue-wise coordinates from. atom_names : list of str, length=k + The atom names to get the coordinates for. + mask : ndarray, shape=(n,), dtype=bool, optional + A boolean mask to further select valid atoms from `atoms`. Returns ------- coord: ndarray, shape=(k, m, r, 3) or shape=(k, r, 3) The coordinates of the specified atom for each residue. """ + is_multi_model = isinstance(atoms, AtomArrayStack) residue_starts = get_residue_starts(atoms) all_residue_masks = get_residue_masks(atoms, residue_starts) - if isinstance(atoms, AtomArray): + if is_multi_model: coord = np.full( - (len(atom_names), len(residue_starts), 3), + (len(atom_names), atoms.stack_depth(), len(residue_starts), 3), np.nan, dtype=np.float32, ) else: coord = np.full( - (len(atom_names), atoms.stack_depth(), len(residue_starts), 3), + (len(atom_names), len(residue_starts), 3), np.nan, dtype=np.float32, ) for i, atom_name in enumerate(atom_names): - atom_mask_for_name = atoms.atom_name == atom_name - all_residue_masks_for_specified_atom = all_residue_masks & atom_mask_for_name + specified_atom_mask = atoms.atom_name == atom_name + if mask is not None: + specified_atom_mask &= mask + all_residue_masks_for_specified_atom = all_residue_masks & specified_atom_mask number_of_specified_atoms_per_residue = np.count_nonzero( all_residue_masks_for_specified_atom, axis=-1 ) if np.any(number_of_specified_atoms_per_residue > 1): raise BadStructureError(f"Multiple '{atom_name}' atoms per residue") residues_with_specified_atom = number_of_specified_atoms_per_residue == 1 - coord[i, ..., residues_with_specified_atom, :] = atoms.coord[ - ..., atom_mask_for_name, : - ] + coord_of_specified_atoms = atoms.coord[..., specified_atom_mask, :] + if is_multi_model: + # Swap dimensions due to NumPy's behavior when using advanced indexing + # (https://numpy.org/devdocs/user/basics.indexing.html#combining-advanced-and-basic-indexing) + coord[i, ..., residues_with_specified_atom, :] = ( + coord_of_specified_atoms.transpose(1, 0, 2) + ) + else: + coord[i, residues_with_specified_atom, :] = coord_of_specified_atoms return coord diff --git a/tests/sequence/align/test_matrix.py b/tests/sequence/align/test_matrix.py index ca10133bb..2745af8cb 100644 --- a/tests/sequence/align/test_matrix.py +++ b/tests/sequence/align/test_matrix.py @@ -14,7 +14,7 @@ [ entry for entry in align.SubstitutionMatrix.list_db() - if entry not in ["NUC", "GONNET", "3Di"] + if entry not in ["NUC", "GONNET", "3Di", "PB"] ], ) def test_matrices(db_entry): @@ -45,6 +45,7 @@ def test_structural_alphabet_matrices(matrix_name, alphabet): "std_protein_matrix", "std_nucleotide_matrix", "std_3di_matrix", + "std_protein_blocks_matrix", ], ) def test_default_matrices(method_name): diff --git a/tests/structure/data/alphabet/1ay7.bcif b/tests/structure/data/alphabet/1ay7.bcif new file mode 100644 index 000000000..3ce454e2e Binary files /dev/null and b/tests/structure/data/alphabet/1ay7.bcif differ diff --git a/tests/structure/data/alphabet/README.rst b/tests/structure/data/alphabet/README.rst index 957108467..50fe5bc94 100644 --- a/tests/structure/data/alphabet/README.rst +++ b/tests/structure/data/alphabet/README.rst @@ -15,4 +15,11 @@ The 3Di sequences in `i3d.fasta` were generated with `foldseek` according to $ foldseek createdb --chain-name-mode 1 tests/structure/data/*.cif /tmp/biotite_3di $ foldseek lndb /tmp/biotite_3di_h /tmp/biotite_3di_ss_h - $ foldseek convert2fasta /tmp/biotite_3di_ss tests/structure/data/alphabet/i3d.fasta \ No newline at end of file + $ foldseek convert2fasta /tmp/biotite_3di_ss tests/structure/data/alphabet/i3d.fasta + +Protein Blocks sequences +------------------------ + +Only one sequence is available in `pb.fasta`, that is taken from +`https://pbxplore.readthedocs.io/en/latest/PBassign.html`. +`1ay7.bcif` contains the corresponding structure. \ No newline at end of file diff --git a/tests/structure/data/alphabet/pb.fasta b/tests/structure/data/alphabet/pb.fasta new file mode 100644 index 000000000..3d7cb3f8c --- /dev/null +++ b/tests/structure/data/alphabet/pb.fasta @@ -0,0 +1,2 @@ +>1ay7 +ZZdddfklpcbfklmmmmmmmmnopafklgoiaklmmmmmmmmpacddddddehkllmmmmnnommmmmmmmmmmmmmnopacddddZZ \ No newline at end of file diff --git a/tests/structure/test_geometry.py b/tests/structure/test_geometry.py index dc5ecf1af..715cdeb3e 100644 --- a/tests/structure/test_geometry.py +++ b/tests/structure/test_geometry.py @@ -39,28 +39,23 @@ def test_dihedral(): assert struc.dihedral(coord1, coord2, coord3, coord4) == pytest.approx(0.5 * np.pi) -@pytest.mark.parametrize("multiple_chains", [False, True]) -def test_dihedral_backbone_general(multiple_chains): +@pytest.mark.parametrize("multi_model", [False, True]) +def test_dihedral_backbone_general(multi_model): stack = strucio.load_structure(join(data_dir("structure"), "1l2y.bcif")) n_models = stack.stack_depth() n_res = stack.res_id[-1] - if multiple_chains: - stack2 = stack.copy() - stack2.chain_id[:] = "B" - stack = stack + stack2 - n_res = n_res * 2 - array = stack[0] + + if multi_model: + atoms = stack + expected_shape = (n_models, n_res) + else: + atoms = stack[0] + expected_shape = (n_res,) # Test array - phi, psi, omega = struc.dihedral_backbone(array) - assert phi.shape == (n_res,) - assert psi.shape == (n_res,) - assert omega.shape == (n_res,) - _assert_plausible_omega(omega) - # Test stack - phi, psi, omega = struc.dihedral_backbone(stack) - assert phi.shape == (n_models, n_res) - assert psi.shape == (n_models, n_res) - assert omega.shape == (n_models, n_res) + phi, psi, omega = struc.dihedral_backbone(atoms) + assert phi.shape == expected_shape + assert psi.shape == expected_shape + assert omega.shape == expected_shape _assert_plausible_omega(omega) @@ -72,13 +67,17 @@ def _assert_plausible_omega(omega): assert omega.tolist() == pytest.approx([np.pi] * len(omega), rel=0.6) -def test_dihedral_backbone_consistency(): +@pytest.mark.parametrize("multi_model", [False, True]) +def test_dihedral_backbone_consistency(multi_model): """ Check if the computed dihedral angles are equal to the reference computed with MDTraj. """ pdbx_file = pdbx.BinaryCIFFile.read(join(data_dir("structure"), "1l2y.bcif")) atoms = pdbx.get_structure(pdbx_file, model=1) + if multi_model: + # Use models with equal coordinates + atoms = struc.stack([atoms] * 2) test_phi, test_psi, test_ome = struc.dihedral_backbone(atoms) @@ -105,6 +104,14 @@ def test_dihedral_backbone_consistency(): [-1.363, np.nan, np.nan], ]) # fmt: skip + if multi_model: + assert np.array_equal(test_phi[1], test_phi[0], equal_nan=True) + assert np.array_equal(test_psi[1], test_psi[0], equal_nan=True) + assert np.array_equal(test_ome[1], test_ome[0], equal_nan=True) + test_phi = test_phi[0] + test_psi = test_psi[0] + test_ome = test_ome[0] + assert test_phi == pytest.approx(ref_dihedrals[:, 0], abs=1e-3, nan_ok=True) assert test_psi == pytest.approx(ref_dihedrals[:, 1], abs=1e-3, nan_ok=True) assert test_ome == pytest.approx(ref_dihedrals[:, 2], abs=1e-3, nan_ok=True) diff --git a/tests/structure/test_i3d.py b/tests/structure/test_i3d.py index e08d94544..402554608 100644 --- a/tests/structure/test_i3d.py +++ b/tests/structure/test_i3d.py @@ -84,6 +84,7 @@ def test_missing_residues(): atoms = pdbx.get_structure(pdbx_file, model=1) atoms = atoms[struc.filter_amino_acids(atoms)] + # Randomly delete some backbone atoms rng = np.random.default_rng(1) del_backbone_residue_ids = rng.choice( np.unique(atoms.res_id), N_DELETIONS, replace=False @@ -94,6 +95,7 @@ def test_missing_residues(): ] test_sequences, _ = strucalph.to_3di(atoms) + # Apply the same deletions to the reference sequence ref_sequence = _get_ref_3di_sequence(PDB_ID, atoms.chain_id[0]) for res_id in del_backbone_residue_ids: seq_index = res_id - atoms.res_id[0] diff --git a/tests/structure/test_pb.py b/tests/structure/test_pb.py new file mode 100644 index 000000000..5ecb57348 --- /dev/null +++ b/tests/structure/test_pb.py @@ -0,0 +1,76 @@ +from pathlib import Path +import numpy as np +import pytest +import biotite.sequence.io.fasta as fasta +import biotite.structure as struc +import biotite.structure.alphabet as strucalph +import biotite.structure.io.pdbx as pdbx +from tests.util import data_dir + + +@pytest.fixture +def reference_sequence(): + """ + Get the reference Protein Blocks sequence for the alphabet example structure. + """ + _, seq_string = next( + fasta.FastaFile.read_iter(Path(data_dir("structure")) / "alphabet" / "pb.fasta") + ) + return strucalph.ProteinBlocksSequence(seq_string) + + +@pytest.fixture +def reference_chain(): + pdbx_file = pdbx.BinaryCIFFile.read( + Path(data_dir("structure")) / "alphabet" / "1ay7.bcif" + ) + atoms = pdbx.get_structure(pdbx_file, model=1) + atoms = atoms[struc.filter_amino_acids(atoms)] + chain = atoms[atoms.chain_id == "B"] + return chain + + +def test_to_protein_blocks(reference_chain, reference_sequence): + """ + Test the structure conversion to protein blocks based on a reference example from + the PBexplore documentation + (https://pbxplore.readthedocs.io/en/latest/intro_PB.html). + """ + test_pb_sequences, _ = strucalph.to_protein_blocks(reference_chain) + + assert len(test_pb_sequences) == 1 + assert str(test_pb_sequences[0]) == str(reference_sequence) + + +def test_missing_residues(reference_chain, reference_sequence): + """ + Like, `test_to_protein_blocks()`, but in some residues backbone atoms are missing. + Expect that these and adjacent residues get the unknown symbol 'Z' in the + PB sequence. + """ + N_DELETIONS = 5 + # The 'Z' symbol + UKNOWN_SYMBOL = strucalph.ProteinBlocksSequence.unknown_symbol + + # Randomly delete some backbone atoms + rng = np.random.default_rng(1) + del_backbone_residue_ids = rng.choice( + np.unique(reference_chain.res_id), N_DELETIONS, replace=False + ) + reference_chain = reference_chain[ + ~np.isin(reference_chain.res_id, del_backbone_residue_ids) + | ~np.isin(reference_chain.atom_name, ("N", "CA", "C")) + ] + + # Apply the same deletions to the reference sequence + for res_id in del_backbone_residue_ids: + seq_index = res_id - reference_chain.res_id[0] + # Convert the PB symbol for residue and adjacent ones to 'Z' + start_index = max(0, seq_index - 2) + end_index = min(len(reference_sequence), seq_index + 2) + reference_sequence[start_index : end_index + 1] = UKNOWN_SYMBOL + + test_pb_sequences, _ = strucalph.to_protein_blocks(reference_chain) + + assert len(test_pb_sequences) == 1 + assert str(test_pb_sequences[0]) == str(reference_sequence)