Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Integrate structural alphabets better into Biotite #682

Merged
merged 9 commits into from
Nov 4, 2024
22 changes: 22 additions & 0 deletions benchmarks/structure/benchmark_alphabet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from pathlib import Path
import pytest
import biotite.structure.alphabet as strucalph
import biotite.structure.io.pdbx as pdbx
from tests.util import data_dir

PDB_ID = "1aki"


@pytest.fixture
def atoms():
pdbx_file = pdbx.BinaryCIFFile.read(Path(data_dir("structure")) / f"{PDB_ID}.bcif")
return pdbx.get_structure(pdbx_file, model=1, include_bonds=True)


@pytest.mark.benchmark
@pytest.mark.parametrize("method", [strucalph.to_3di, strucalph.to_protein_blocks])
def benchmark_structural_alphabet_methods(method, atoms):
"""
Convert a structure to the given structural alphabet.
"""
method(atoms)
9 changes: 7 additions & 2 deletions doc/apidoc.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,8 +257,13 @@ def _is_relevant_type(obj):
in [types.FunctionType, types.BuiltinFunctionType, types.MethodType]
)
| (
# Functions from C-extensions
type(obj).__name__ in ["cython_function_or_method", "fused_cython_function"]
# Functions from C-extensions and wrapped functions
type(obj).__name__
in [
"cython_function_or_method",
"fused_cython_function",
"_lru_cache_wrapper",
]
)
| (
# Enum instance
Expand Down
1 change: 1 addition & 0 deletions doc/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,7 @@
"examples/scripts/structure/molecule",
"examples/scripts/structure/contacts",
"examples/scripts/structure/modeling",
"examples/scripts/structure/alphabet",
"examples/scripts/structure/misc",
]
),
Expand Down
26 changes: 18 additions & 8 deletions doc/examples/scripts/sequence/misc/color_schemes.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
Biotite color schemes
=====================

This script displays the available color schemes.
This script displays the available color schemes for the different built-in alphabets.
"""

# Code source: Patrick Kunzmann
Expand All @@ -14,6 +14,7 @@
from matplotlib.patches import Rectangle
import biotite.sequence as seq
import biotite.sequence.graphics as graphics
import biotite.structure.alphabet as strucalph


def plot_colors(ax, alphabet):
Expand Down Expand Up @@ -48,25 +49,34 @@ def plot_colors(ax, alphabet):

nuc_alphabet = seq.NucleotideSequence.alphabet_amb
prot_alphabet = seq.ProteinSequence.alphabet
pb_alphabet = seq.LetterAlphabet("abcdefghijklmnop")
i3d_alphabet = strucalph.I3DSequence.alphabet
pb_alphabet = strucalph.ProteinBlocksSequence.alphabet

figure = plt.figure(figsize=(8.0, 5.0))
figure = plt.figure(figsize=(8.0, 7.0))
gs = GridSpec(
3,
4,
1,
height_ratios=[
len(graphics.list_color_scheme_names(alphabet))
for alphabet in (nuc_alphabet, prot_alphabet, pb_alphabet)
for alphabet in (nuc_alphabet, prot_alphabet, i3d_alphabet, pb_alphabet)
],
)

ax = figure.add_subplot(gs[0, 0])
ax.set_title("Nucleotide color schemes")
ax.set_title("Nucleotide")
plot_colors(ax, nuc_alphabet)

ax = figure.add_subplot(gs[1, 0])
ax.set_title("Protein color schemes")
ax.set_title("Protein")
plot_colors(ax, prot_alphabet)

ax = figure.add_subplot(gs[2, 0])
ax.set_title("Protein block color schemes")
ax.set_title("3Di")
plot_colors(ax, i3d_alphabet)

ax = figure.add_subplot(gs[3, 0])
ax.set_title("Protein Blocks")
plot_colors(ax, pb_alphabet)

plt.tight_layout()
plt.show()
2 changes: 2 additions & 0 deletions doc/examples/scripts/structure/alphabet/README.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Structural alphabets
--------------------
87 changes: 87 additions & 0 deletions doc/examples/scripts/structure/alphabet/msa.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
"""
Multiple Structural alignment of orthologous proteins
=====================================================

In this example we perform a structural alignment of multiple lysozyme
variants from different organisms.
A feasible approach to perfrom such a multiple structure alignment is the
usage of a structural alphabet:
At first the structure is translated into a sequence that represents
the structure.
Then the sequences can be aligned with the standard sequence alignment
techniques, using the substitution matrix of the structural alphabet.

In this example, the structural alphabet we will use is called
*Protein Blocks* (PBs) :footcite:`Brevern2000, Barnoud2017`:
There are 16 different PBs, represented by the symbols ``a`` to ``p``.
Each one depicts a different set of the backbone dihedral angles of a
peptide 5-mer.

.. footbibliography::
"""

# Code source: Patrick Kunzmann
# License: BSD 3 clause

import matplotlib.pyplot as plt
import biotite.database.rcsb as rcsb
import biotite.sequence.align as align
import biotite.sequence.graphics as graphics
import biotite.structure as struc
import biotite.structure.alphabet as strucalph
import biotite.structure.io.pdbx as pdbx

PDB_IDS = ["1REX", "1AKI", "1DKJ", "1GD6"]

# Create a PB sequence from each structure
structural_sequences = []
organisms = []
for pdb_id in PDB_IDS:
file = pdbx.BinaryCIFFile.read(rcsb.fetch(pdb_id, "bcif"))
# Take only the first model into account
array = pdbx.get_structure(file, model=1)
# We are only interested in the first protein chain
array = array[struc.filter_amino_acids(array)]
array = array[array.chain_id == array.chain_id[0]]
pb_sequences, _ = strucalph.to_protein_blocks(array)
structural_sequences.append(pb_sequences[0].remove_undefined())

try:
organism_name = file.block["entity_src_nat"][
"pdbx_organism_scientific"
].as_item()
except KeyError:
organism_name = file.block["entity_src_gen"][
"pdbx_gene_src_scientific_name"
].as_item()
generic, specific = organism_name.split(" ")
abbreviation = f"{generic[0]}. {specific}"
organisms.append(abbreviation)


# Perform a multiple sequence alignment of the PB sequences
matrix = align.SubstitutionMatrix.std_protein_blocks_matrix()
alignment, order, _, _ = align.align_multiple(
structural_sequences, matrix, gap_penalty=(-500, -100), terminal_penalty=False
)

# Visualize the alignment
# Order alignment according to guide tree
alignment = alignment[:, order.tolist()]
labels = [organisms[i] for i in order]
fig = plt.figure(figsize=(8.0, 4.0))
ax = fig.add_subplot(111)
# The color scheme was generated with the 'Gecos' software
graphics.plot_alignment_type_based(
ax,
alignment,
labels=labels,
symbols_per_line=45,
spacing=2,
show_numbers=True,
color_scheme="flower",
)
# Organism names in italic
ax.set_yticklabels(ax.get_yticklabels(), fontdict={"fontstyle": "italic"})
fig.tight_layout()
plt.show()
26 changes: 0 additions & 26 deletions doc/examples/scripts/structure/protein/pb_alignment.license

This file was deleted.

158 changes: 0 additions & 158 deletions doc/examples/scripts/structure/protein/pb_alignment.py

This file was deleted.

2 changes: 1 addition & 1 deletion doc/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ Biotite documentation
<h2>Analyze sequence data</h2>

Work with sequences of any kind: from the usual nucleotide and protein
sequences to your own sequence types created from a custom alphabet.
sequences to structural alphabets like *3Di*.
Use the rapid and modular alignment tools to identify homologous
regions or to map reads.
Eventually, visualize your results in different *Matplotlib* based
Expand Down
Loading
Loading