Merge pull request #702 from padix-key/profile

Expand tutorial about position specific scoring matrices
biotite-dev · Nov 25, 2024 · b761a28 · b761a28
2 parents 77cf0f6 + 2caccfb
commit b761a28
Show file tree

Hide file tree

Showing 3 changed files with 121 additions and 9 deletions.
diff --git a/doc/tutorial/sequence/profiles.rst b/doc/tutorial/sequence/profiles.rst
@@ -1,7 +1,7 @@
 .. include:: /tutorial/preamble.rst
 
-Profiles and position-specific matrices
-=======================================
+Profiles and position-specific scoring matrices
+===============================================
 Often sequences are not viewed in isolation:
 For example, if you investigate a protein family, you do not handle a single sequence,
 but an arbitrarily large collection of highly similar sequences.
@@ -116,12 +116,15 @@ regardless of their position in a sequence.
 However, as discussed above, the position is crucial information to determine how
 conserved a certain symbol is in a family of sequences.
 
-Hence, we extend the concept of substitution matrices to *position-specific* matrices,
+Hence, we extend the concept of substitution matrices to *position-specific scoring
+matrices*,
 which assign a score to a symbol and a position (instead of another symbols).
+A typical way to create a position-specific scoring matrix is to use the log-odds matrix
+from a profile.
 
 .. jupyter-execute::
 
-    # For a real analysis each amino acid background frequencies should be provided
+    # For a real analysis amino acid background frequencies should be provided
     log_odds = profile.log_odds_matrix(pseudocount=1)
 
 Now, we encounter a problem:
@@ -142,7 +145,7 @@ sought length.
     positional_seq = seq.PositionalSequence(profile.to_consensus())
     matrix = align.SubstitutionMatrix(
         positional_seq.alphabet,
-        profile.alphabet,
+        seq.ProteinSequence.alphabet,
         # Substitution matrices require integer scores
         # Multiply by 10 to increase value range and convert to integer
         (log_odds * 10).astype(int)
@@ -157,3 +160,45 @@ the alignment to work.
 The consensus sequence of the profile was merely chosen for cosmetic reasons, i.e.
 to have a meaningful string representation of the positional sequence and thus the
 alignment.
+
+Further applications
+^^^^^^^^^^^^^^^^^^^^
+
+Using a sequence profile is only one way to create a position-specific scoring matrix.
+The combination of :class:`.PositionalSequence` objects and a corresponding
+:class:`.SubstitutionMatrix` allows to assign a score between two sequence positions
+based on any property.
+
+As a toy example, let's say we want to harness the information that cyclotides have
+strongly conserved disulfide bridges.
+Hence, we want the score matrix not only to reward when cysteines are paired with
+cysteines in general, but specifically when cysteines are paired with cysteines at the
+corresponding position, i.e. the first cysteine with the first cysteine etc.
+As template we will use the standard *BLOSUM62* substitution matrix, expanded into
+a position-specific scoring matrix using :meth:`SubstitutionMatrix.as_positional()`:
+
+.. jupyter-execute::
+
+    cyclotide_n = query
+    cyclotide_c = sequences["C"]
+    aa_matrix = align.SubstitutionMatrix.std_protein_matrix()
+    # `as_positional()` expands the matrix into a position-specific scoring matrix
+    # for the given sequences while also giving us the positional sequences for them
+    pos_matrix, pos_cyclotide_n, pos_cyclotide_c = aa_matrix.as_positional(
+        cyclotide_n, cyclotide_c
+    )
+    # Introduce bias for cysteine positions
+    score_matrix = pos_matrix.score_matrix().copy()
+    cys_code = seq.ProteinSequence.alphabet.encode("C")
+    score_matrix[cyclotide_n.code == cys_code, cyclotide_c.code == cys_code] = 100
+    disulfide_matrix = align.SubstitutionMatrix(
+        pos_matrix.get_alphabet1(),
+        pos_matrix.get_alphabet2(),
+        score_matrix,
+    )
+    print(disulfide_matrix)
+
+You might notice the new shape of the substitution matrix:
+Instead of spanning the 24 x 24 amino acids including ambiguous symbols, it now matches
+the lengths of the two input sequences.
+In this matrix you can spot the biased scores at the matching cysteine positions.
diff --git a/src/biotite/sequence/profile.py b/src/biotite/sequence/profile.py
@@ -3,6 +3,7 @@
 # information.
 
 import warnings
+from numbers import Integral
 import numpy as np
 from biotite.sequence.align.alignment import get_codes
 from biotite.sequence.alphabet import LetterAlphabet
@@ -66,16 +67,16 @@ class SequenceProfile(object):
     It also saves the number of gaps at each position in the array
     'gaps'.
 
+    With :meth:`from_alignment()` a :class:`SequenceProfile` object can
+    be created from an indefinite number of aligned sequences.
+
     With :meth:`probability_matrix()` the position probability matrix
     can be created based on 'symbols' and a pseudocount.
 
     With :meth:`log_odds_matrix()` the position weight matrix can
     be created based on the before calculated position probability
     matrix and the background frequencies.
 
-    With :meth:`from_alignment()` a :class:`SequenceProfile` object can
-    be created from an indefinite number of aligned sequences.
-
     With :meth:`sequence_probability_from_matrix()` the probability of a
     sequence can be calculated based on the before calculated position
     probability matrix of this instance of object SequenceProfile.
@@ -105,6 +106,63 @@ class SequenceProfile(object):
         Array which indicates the number of gaps at each position.
     alphabet : Alphabet, length=k
         Alphabet of sequences of sequence profile
+
+    Examples
+    --------
+
+    Create a profile from a multiple sequence alignment:
+
+    >>> sequences = [
+    ...     NucleotideSequence("CGCTCATTC"),
+    ...     NucleotideSequence("CGCTATTC"),
+    ...     NucleotideSequence("CCCTCAATC"),
+    ... ]
+    >>> msa, _, _, _ = align_multiple(
+    ...     sequences, SubstitutionMatrix.std_nucleotide_matrix(), gap_penalty=-5
+    ... )
+    >>> print(msa)
+    CGCTCATTC
+    CGCT-ATTC
+    CCCTCAATC
+    >>> profile = SequenceProfile.from_alignment(msa)
+    >>> print(profile)
+      A C G T
+    0 0 3 0 0
+    1 0 1 2 0
+    2 0 3 0 0
+    3 0 0 0 3
+    4 0 2 0 0
+    5 3 0 0 0
+    6 1 0 0 2
+    7 0 0 0 3
+    8 0 3 0 0
+    >>> print(profile.gaps)
+    [0 0 0 0 1 0 0 0 0]
+
+    Slice the profile (masks and index arrays are also supported):
+
+    >>> print(profile[2:])
+      A C G T
+    0 0 3 0 0
+    1 0 0 0 3
+    2 0 2 0 0
+    3 3 0 0 0
+    4 1 0 0 2
+    5 0 0 0 3
+    6 0 3 0 0
+
+    Use the profile to compute the position probability matrix:
+
+    >>> print(profile.probability_matrix())
+    [[0.000 1.000 0.000 0.000]
+     [0.000 0.333 0.667 0.000]
+     [0.000 1.000 0.000 0.000]
+     [0.000 0.000 0.000 1.000]
+     [0.000 1.000 0.000 0.000]
+     [1.000 0.000 0.000 0.000]
+     [0.333 0.000 0.000 0.667]
+     [0.000 0.000 0.000 1.000]
+     [0.000 1.000 0.000 0.000]]
     """
 
     def __init__(self, symbols, gaps, alphabet):
@@ -498,3 +556,12 @@ def sequence_score(self, sequence, background_frequencies=None, pseudocount=0):
                 f"as 'symbols' {self.symbols.shape}"
             )
         return np.sum(pwm[np.arange(len(sequence)), sequence.code])
+
+    def __getitem__(self, index):
+        if isinstance(index, Integral):
+            # Do not allow to collapse dimensions
+            index = slice(index, index + 1)
+        return SequenceProfile(self.symbols[index], self.gaps[index], self.alphabet)
+
+    def __len__(self):
+        return len(self.symbols)
diff --git a/tests/test_doctest.py b/tests/test_doctest.py
@@ -23,7 +23,7 @@
 # Keep test parameters in separate variable to generate IDs from them
 TEST_PARAMETERS = [
     pytest.param("biotite", []),
-    pytest.param("biotite.sequence", []),
+    pytest.param("biotite.sequence", ["biotite.sequence.align"]),
     pytest.param("biotite.sequence.align", ["biotite.sequence"]),
     pytest.param("biotite.sequence.phylo", ["biotite.sequence"]),
     pytest.param(