From e305d09e920c3935f44c9fcec1e12de6f5bf88c0 Mon Sep 17 00:00:00 2001
From: Shing Zhan <shing.zhan@gmail.com>
Date: Mon, 7 Aug 2023 18:19:00 +0100
Subject: [PATCH] Add tests to check LS HMM of _tskit.lshmm compared to BEAGLE

---
 python/tests/beagle.py          | 628 ++++++++++++++++++++++++++++++++
 python/tests/test_imputation.py | 558 ++++++++++++++++++++++++++++
 2 files changed, 1186 insertions(+)
 create mode 100644 python/tests/beagle.py
 create mode 100644 python/tests/test_imputation.py

diff --git a/python/tests/beagle.py b/python/tests/beagle.py
new file mode 100644
index 0000000000..02a23854b1
--- /dev/null
+++ b/python/tests/beagle.py
@@ -0,0 +1,628 @@
+"""
+Implementation of the BEAGLE 4.1 algorithm to impute genotypes.
+
+This was implemented while closely consulting the BEAGLE 4.1 paper:
+Browning & Browning (2016). Genotype imputation with millions of reference samples.
+Am J Hum Genet 98:116-126. doi:10.1016/j.ajhg.2015.11.020
+
+The source code of BEAGLE 4.1 (particularly `LSHapBaum.java`) was also consulted:
+https://faculty.washington.edu/browning/beagle/b4_1.html
+
+Here are some notations used throughout this implementation:
+h = Number of reference haplotypes.
+m = Number of genotyped markers.
+x = Number of imputed markers.
+
+This implementation takes the following inputs:
+1. Panel of reference haplotypes in a matrix of size (m + x, h).
+2. One query haplotype in an array of size (m + x).
+3. Genomic site positions of all the markers in an array of size (m + x).
+
+For simplicity, we assume that:
+1. All the sites are biallelic (0/1 encoding).
+2. The genetic map is defined by equating 1 Mbp to 1 cM.
+
+In the query haplotype:
+1. The genotyped markers take values of 0 or 1.
+2. The imputed markers take -1.
+
+The forward, backward, and HMM state probability matrices are of size (m, h).
+The interpolated haplotype probability matrix is of size (x, 2),
+The imputed alleles in the query haplotype are the maximum a posteriori (MAP) alleles.
+
+For efficiency, BEAGLE uses aggregated markers, which are clusters of markers
+within a 0.005 cM interval (default). Because the genotypes are phased,
+the alleles in the aggregated markers form distinct "allele sequences".
+Below, we do not use aggregated markers or allele sequences, which would complicate
+the implementation.
+
+Rather than trying to exactly replicating the original BEAGLE 4.1 algorithm,
+this implementation computes Equation 1 of BB2016. The functions used in an attempt
+to faithfully implement the BEAGLE algorithm are kept for documentation.
+"""
+import numpy as np
+
+
+def convert_to_genetic_map_position(pos):
+    """
+    Convert genomic site positions (bp) to genetic map positions (cM).
+
+    In BEAGLE 4.1, when a genetic map is not specified, it is assumed that 1 Mbp = 1cM.
+
+    This trivial function is meant for documentation.
+
+    :param numpy.ndarray pos: Site positions.
+    :return: Genetic map positions.
+    :rtype: numpy.ndarray
+    """
+    assert 0 <= np.min(pos)
+    return pos / 1e6
+
+
+def get_weights(genotyped_pos, imputed_pos):
+    """
+    Get weights at imputed markers in the query haplotype.
+
+    These weights are used in linear interpolation of HMM state probabilities
+    at imputed markers.
+
+    In BB2016 (see below Equation 1), a weight between genotyped markers m and m + 1
+    at imputed marker x is denoted lambda_m,x.
+
+    lambda_m,x = [g(m + 1) - g(x)] / [g(m + 1) - g(m)],
+    where g(i) = genetic map position of marker i.
+
+    :param numpy.ndarray genotyped_pos: Site positions of genotyped markers.
+    :param numpy.ndarray imputed_pos: Site positions of imputed markers.
+    :return: Weights used in linear interpolation.
+    :rtype: numpy.ndarray
+    """
+    # Check that the two sets are mutually exclusive.
+    assert not np.any(np.isin(genotyped_pos, imputed_pos))
+    # Set min genetic distance to avoid division by zero.
+    MIN_CM_DIST = 0.005
+    m = len(genotyped_pos)
+    x = len(imputed_pos)
+    genotyped_cm = convert_to_genetic_map_position(genotyped_pos)
+    imputed_cm = convert_to_genetic_map_position(imputed_pos)
+    assert not np.any(np.isin(genotyped_cm, imputed_cm))
+    # Calculate weights for imputed markers.
+    weights = np.zeros(x, dtype=np.float64)
+    # Identify genotype markers k and k + 1 between imputed marker i
+    for i in np.arange(x):
+        if imputed_cm[i] < genotyped_cm[0]:
+            # Special case: imputed marker is before the first genotyped marker.
+            k = 0
+            weights[i] = 1.0
+        elif imputed_cm[i] > genotyped_cm[-1]:
+            # Special case: imputed marker is after the last genotyped marker.
+            k = m - 1
+            weights[i] = 0.0
+        else:
+            k = np.max(np.where(imputed_cm[i] > genotyped_cm)[0])
+            cm_mP1_x = genotyped_cm[k + 1] - imputed_cm[i]
+            cm_mP1_m = np.max([genotyped_cm[k + 1] - genotyped_cm[k], MIN_CM_DIST])
+            weights[i] = cm_mP1_x / cm_mP1_m
+    assert 0 <= np.min(weights) and np.max(weights) <= 1, "Weights are not in [0, 1]."
+    return weights
+
+
+def get_mismatch_prob(pos, miscall_rate):
+    """
+    Set mismatch probabilities at genotyped markers.
+
+    In BEAGLE, the mismatch probability is dominated by the allelic miscall rate.
+    By default, it is set to 0.0001 and capped at 0.50.
+
+    :param numpy.ndarray pos: Site positions.
+    :param float miscall_rate: Allelic miscall rate.
+    :return: Mismatch probabilities.
+    :rtype: numpy.ndarray
+    """
+    assert isinstance(miscall_rate, float)
+    if miscall_rate >= 0.5:
+        miscall_rate = 0.5
+    mu = np.zeros(len(pos), dtype=np.float64) + miscall_rate
+    return mu
+
+
+def get_switch_prob(pos, h, ne):
+    """
+    Set switch probabilities at genotyped markers.
+
+    Note that in BEAGLE, the default value of `ne` is set to 1e6.
+    If `h` is small and `ne` is large, the switch probability is nearly 1.
+    In such cases, it may be desirable to set `ne` to a small value
+    to avoid switching between haplotypes all the time.
+
+    :param numpy.ndarray pos: Site positions.
+    :param int h: Number of reference haplotypes.
+    :param float ne: Effective population size.
+    :return: Switch probabilities.
+    :rtype: numpy.ndarray
+    """
+    assert pos[0] > 0, "Site positions are not 1-based."
+    assert isinstance(h, int), "Number of reference haplotypes is not an integer."
+    assert isinstance(ne, float), "Effective population size is not a float."
+    # Get genetic distance between adjacent markers.
+    cm = np.concatenate([[0], convert_to_genetic_map_position(pos)])
+    gen_dist = cm[1:] - cm[:-1]
+    assert len(gen_dist) == len(pos)
+    rho = -np.expm1(-(4 * ne * gen_dist / h))
+    return rho
+
+
+def compute_forward_probability_matrix(ref_h, query_h, rho, mu):
+    """
+    Implement the forward algorithm.
+
+    The forward probablity matrix is of size (m, h),
+    where m is the number of genotyped markers
+    and h is the number of reference haplotypes.
+
+    `ref_h` is a panel of reference haplotypes of size m x h.
+    Here, it is subsetted to genotyped markers.
+
+    `query_h` is also subsetted to genotyped markers (length m).
+
+    :param numpy.ndarray ref_h: Reference haplotypes.
+    :param numpy.ndarray query_h: One query haplotype.
+    :param numpy.ndarray rho: Switch probabilities.
+    :param numpy.ndarray mu: Mismatch probabilities.
+    :return: Forward probability matrix.
+    :rtype: numpy.ndarray
+    """
+    assert np.all(
+        np.isin(ref_h, [0, 1])
+    ), "Reference haplotypes have non-biallelic values."
+    assert np.all(np.isin(query_h, [0, 1])), "Query haplotype has non-biallelic values."
+    m = ref_h.shape[0]
+    h = ref_h.shape[1]
+    assert m == len(query_h)
+    assert m == len(rho)
+    assert m == len(mu)
+    assert 0 <= np.min(rho) and np.max(rho) <= 1
+    # BEAGLE caps mismatch probabilities at 0.5.
+    assert 0 <= np.min(mu) and np.max(mu) <= 0.5
+    fm = np.zeros((m, h), dtype=np.float64)
+    last_sum = 1.0  # Part of normalization factor
+    for i in np.arange(m):
+        # Get site-specific parameters
+        shift = rho[i] / h
+        scale = (1 - rho[i]) / last_sum
+        # Get allele at genotyped marker i on query haplotype j
+        query_a = query_h[i]
+        for j in np.arange(h):
+            # Get allele at genotyped marker i on reference haplotype j
+            ref_a = ref_h[i, j]
+            # Get emission probability
+            em_prob = mu[i] if query_a != ref_a else 1.0 - mu[i]
+            if m == 0:
+                fm[i, j] = em_prob
+            else:
+                fm[i, j] = em_prob * (scale * fm[i - 1, j] + shift)
+        site_sum = np.sum(fm[i, :])
+        last_sum = site_sum / h if m == 0 else site_sum
+    return fm
+
+
+def compute_backward_probability_matrix(ref_h, query_h, rho, mu):
+    """
+    Implement the backward algorithm.
+
+    The backward probablity matrix is of size (m, h),
+    where m is the number of genotyped markers
+    and h is the number of reference haplotypes.
+
+    `ref_h` is a panel of reference haplotypes of size m x h.
+    Here, it is subsetted to genotyped markers.
+
+    `query_h` is also subsetted to genotyped markers (length m).
+
+    In BEAGLE 4.1, the values are kept one site at a time.
+    Here, we keep the values at all the sites.
+
+    :param numpy.ndarray ref_h: Reference haplotypes.
+    :param numpy.ndarray query_h: One query haplotype.
+    :param numpy.ndarray rho: Switch probabilities.
+    :param numpy.ndarray mu: Mismatch probabilities.
+    :return: Backward probability matrix.
+    :rtype: numpy.ndarray
+    """
+    assert np.all(
+        np.isin(ref_h, [0, 1])
+    ), "Reference haplotypes have non-biallelic values."
+    assert np.all(np.isin(query_h, [0, 1])), "Query haplotype has non-biallelic values."
+    m = ref_h.shape[0]
+    h = ref_h.shape[1]
+    assert m == len(query_h)
+    assert m == len(rho)
+    assert m == len(mu)
+    assert 0 <= np.min(rho) and np.max(rho) <= 1
+    assert 0 <= np.min(mu) and np.max(mu) <= 0.5
+    bm = np.zeros((m, h), dtype=np.float64)
+    bm[-1, :] = 1.0 / h  # Initialise the last column
+    for i in np.arange(m - 2, -1, -1):
+        query_a = query_h[i + 1]
+        for j in np.arange(h):
+            ref_a = ref_h[i + 1, j]
+            em_prob = mu[i + 1] if ref_a != query_a else 1.0 - mu[i + 1]
+            # Note that BEAGLE keeps the values at one site at a time.
+            bm[i, j] = bm[i + 1, j] * em_prob
+        sum_site = np.sum(bm[i, :])
+        scale = (1 - rho[i + 1]) / sum_site
+        shift = rho[i + 1] / h
+        bm[i, :] = scale * bm[i, :] + shift
+    return bm
+
+
+def _get_ref_hap_seg(a, b):
+    """
+    WARN: This function is part of an attempt to faithfully reimplement
+    the BEAGLE 4.1 algorithm. It is not complete and not used
+    in the current implementation. It is kept for documentation.
+
+    Assuming all biallelic sites, get the index of a reference haplotype segment (i)
+    defined by the alleles a and b at two adjacent genotyped markers.
+
+    #i, a, b
+    0, 0, 0
+    1, 1, 0
+    2, 0, 1
+    3, 1, 1
+
+    See https://github.com/tskit-dev/tskit/issues/2802#issuecomment-1698767169
+
+    This is a helper function for `_compute_state_probability_matrix`.
+    """
+    ref_hap_idx = a * 1 + b * 2
+    return ref_hap_idx
+
+
+def _compute_state_probability_matrix(fm, bm, ref_h, query_h, rho, mu):
+    """
+    WARN: This function is part of an attempt to faithfully reimplement
+    the BEAGLE 4.1 algorithm. It is not complete and not used
+    in the current implementation. It is kept for documentation.
+
+    Implement the HMM forward-backward algorithm to compute:
+    1. A HMM state probability matrix across genotyped markers;
+    2. "Forward haplotype probabilities"; and
+    3. "Backward haplotype probabilities".
+
+    A HMM state is an index of a reference haplotype.
+
+    The state probability matrix is of size (m, h),
+    where m is the number of genotyped markers in the query haplotype
+    and h is the number of reference haplotypes.
+
+    Both the forward and backward haplotype probability matrices
+    are of size (m, 2), assuming only biallelic sites.
+
+    `ref_h` is a panel of reference haplotypes of size m x h.
+    Here, it is subsetted to genotyped markers.
+
+    `query_h` is also subsetted to genotyped markers (length m).
+
+    This implementation is based on the function `setStateProbs`
+    of `LSHapBaum.java` in BEAGLE. It performs the computation
+    described in Equation 2 of BB2016.
+
+    :param numpy.ndarray fm: Forward probability matrix.
+    :param numpy.ndarray bm: Backward probability matrix.
+    :param numpy.ndarray ref_h: Reference haplotypes.
+    :param numpy.ndarray query_h: One query haplotype.
+    :param numpy.ndarray rho: Switch probabilities.
+    :param numpy.ndarray mu: Mismatch probabilities.
+    :return: HMM state probability matrix,
+        forward haplotype probability matrix, and
+        backward haplotype probability matrix.
+    :rtype: tuple
+    """
+    m = fm.shape[0]
+    h = fm.shape[1]
+    assert (m, h) == bm.shape
+    assert not np.any(fm < 0), "Forward probability matrix has negative values."
+    assert not np.any(np.isnan(fm)), "Forward probability matrix has NaN values."
+    assert not np.any(bm < 0), "Backward probability matrix has negative values."
+    assert not np.any(np.isnan(bm)), "Backward probability matrix has NaN values."
+    assert np.all(
+        np.isin(ref_h, [0, 1])
+    ), "Reference haplotypes have non-biallelic values."
+    assert np.all(np.isin(query_h, [0, 1])), "Query haplotype has non-biallelic values."
+    assert m == len(query_h)
+    assert m == len(rho)
+    assert m == len(mu)
+    assert 0 <= np.min(rho) and np.max(rho) <= 1
+    assert 0 <= np.min(mu) and np.max(mu) <= 0.5
+    sm = np.zeros((m, h), dtype=np.float64)
+    fwd_hap_probs = np.zeros((m, 4), dtype=np.float64)
+    bwd_hap_probs = np.zeros((m, 4), dtype=np.float64)
+    for i in np.arange(m - 1, -1, -1):
+        for j in np.arange(h):
+            sm[i, j] = fm[i, j] * bm[i, j]
+            ref_hap_seg_mP1 = _get_ref_hap_seg(ref_h[i, j], ref_h[i + 1, j])
+            ref_hap_seg_m = _get_ref_hap_seg(ref_h[i - 1, j], ref_h[i, j])
+            fwd_hap_probs[i, ref_hap_seg_mP1] += sm[i, j]
+            bwd_hap_probs[i, ref_hap_seg_m] += sm[i, j]
+        sum_fwd_hap_probs = np.sum(fwd_hap_probs[i, :])
+        fwd_hap_probs[i, :] /= sum_fwd_hap_probs
+        bwd_hap_probs[i, :] /= sum_fwd_hap_probs  # Sum of forward hap probs
+    return (sm, fwd_hap_probs, bwd_hap_probs)
+
+
+def _interpolate_allele_probabilities(
+    fwd_hap_probs, bwd_hap_probs, genotyped_pos, imputed_pos
+):
+    """
+    WARN: This function is part of an attempt to faithfully reimplement
+    the BEAGLE 4.1 algorithm. It is not complete and not used
+    in the current implementation. It is kept for documentation.
+
+    Compute the interpolated allele probabilities at the imputed markers
+    of a query haplotype.
+
+    The forward and backward haplotype probability matrices are of size (m, 2),
+    where m is the number of genotyped markers.
+    Here, we assume all biallelic sites, hence the 2.
+
+    The interpolated allele probability matrix is of size (x, 2),
+    where x is the number of imputed markers.
+    Again, we assume all biallelic sites, hence the 2.
+
+    This implementation is based on Equation 2 (the rearranged one) in BB2016.
+
+    :param numpy.ndarray fwd_hap_probs: Forward haplotype probability matrix.
+    :param numpy.ndarray bwd_hap_probs: Backward haplotype probability matrix.
+    :param numpy.ndarray genotyped_pos: Site positions of genotyped markers.
+    :param numpy.ndarray imputed_pos: Site positions of imputed markers.
+    :return: Interpolated allele probabilities.
+    :rtype: numpy.ndarray
+    """
+    m = len(genotyped_pos)
+    x = len(imputed_pos)
+    assert (m, 2) == fwd_hap_probs.shape
+    assert (m, 2) == bwd_hap_probs.shape
+    genotyped_cm = convert_to_genetic_map_position(genotyped_pos)
+    assert m == len(genotyped_cm)
+    imputed_cm = convert_to_genetic_map_position(imputed_pos)
+    assert x == len(imputed_cm)
+    weights = get_weights(genotyped_cm, imputed_cm)
+    assert x == len(weights)
+    i_hap_probs = np.zeros((x, 2), dtype=np.float64)
+    for i in np.arange(x):
+        # Identify the genotyped markers k and k + 1 between imputed marker i
+        if imputed_cm[i] < genotyped_cm[0]:
+            k = 0
+        elif imputed_cm[i] > genotyped_cm[-1]:
+            k = m - 2
+        else:
+            k = np.max(np.where(imputed_cm[i] > genotyped_cm)[0])
+        # Vectorised over the allelic states
+        i_hap_probs[i, :] = weights[i] * bwd_hap_probs[k, :]
+        i_hap_probs[i, :] += (1 - weights[i]) * fwd_hap_probs[k, :]
+    return i_hap_probs
+
+
+def compute_state_probability_matrix(fm, bm, ref_h, query_h):
+    """
+    Implement the HMM forward-backward algorithm following Equation 1 of BB2016.
+    This is simpler than faithfully reimplementing the original BEAGLE 4.1 algorithm.
+
+    A HMM state is (index of a genotyped marker, index of a reference haplotype).
+
+    The output state probability matrix is of size (m, h),
+    where m is the number of genotyped markers in the query haplotype
+    and h is the number of reference haplotypes.
+
+    The forward and backward probability matrices are both of size (m, h).
+
+    The reference haplotypes of size (m, h), and the query haplotype is of size m.
+    Here, they are both subsetted to genotyped markers.
+
+    :param numpy.ndarray fm: Forward probability matrix.
+    :param numpy.ndarray bm: Backward probability matrix.
+    :param numpy.ndarray ref_h: Reference haplotypes.
+    :param numpy.ndarray query_h: One query haplotype.
+    :return: HMM state probability matrix.
+    :rtype: numpy.ndarray
+    """
+    assert np.all(
+        np.isin(ref_h, [0, 1])
+    ), "Reference haplotypes have non-biallelic values."
+    assert np.all(np.isin(query_h, [0, 1])), "Query haplotype has non-biallelic values."
+    m = ref_h.shape[0]
+    h = ref_h.shape[1]
+    assert m == len(query_h)
+    assert (m, h) == fm.shape
+    assert (m, h) == bm.shape
+    assert not np.any(fm < 0), "Forward probability matrix has negative values."
+    assert not np.any(np.isnan(fm)), "Forward probability matrix has NaN values."
+    assert not np.any(bm < 0), "Backward probability matrix has negative values."
+    assert not np.any(np.isnan(bm)), "Backward probability matrix has NaN values."
+    sm = np.zeros((m, h), dtype=np.float64)
+    for i in np.arange(m - 1, -1, -1):
+        for j in np.arange(h):
+            sm[i, j] = fm[i, j] * bm[i, j]
+    return sm
+
+
+def interpolate_allele_probabilities(sm, ref_h, genotyped_pos, imputed_pos):
+    """
+    Compute the interpolated allele probabilities at imputed markers of
+    a query haplotype following Equation 1 of BB2016.
+
+    Assuming all biallelic sites, the interpolated allele probability matrix
+    is of size (x, 2), where x is the number of imputed markers.
+
+    This function takes the output of `compute_state_probability_matrix`.
+
+    Note that this function takes:
+    1. HMM state probability matrix across genotyped markers of size (m, h).
+    2. reference haplotypes subsetted to imputed markers of size (x, h).
+
+    :param numpy.ndarray sm: HMM state probability matrix at genotyped markers.
+    :param numpy.ndarray ref_h: Reference haplotypes subsetted to imputed markers.
+    :param numpy.ndarray genotyped_pos: Site positions at genotyped markers.
+    :param numpy.ndarray imputed_pos: Site positions at imputed markers.
+    :return: Interpolated allele probabilities.
+    :rtype: numpy.ndarray
+    """
+    assert not np.any(sm < 0), "HMM state probability matrix has negative values."
+    assert not np.any(np.isnan(sm)), "HMM state probability matrix has NaN values."
+    m = sm.shape[0]
+    h = sm.shape[1]
+    assert np.all(
+        np.isin(ref_h, [0, 1])
+    ), "Reference haplotypes have non-biallelic values."
+    assert m == len(genotyped_pos)
+    x = len(imputed_pos)
+    assert (x, h) == ref_h.shape
+    genotyped_cm = convert_to_genetic_map_position(genotyped_pos)
+    imputed_cm = convert_to_genetic_map_position(imputed_pos)
+    weights = get_weights(genotyped_pos, imputed_pos)
+    assert x == len(weights)
+    p = np.zeros((x, 2), dtype=np.float64)
+
+    def _compute_allele_probabilities(a):
+        """Equation 1 in BB2016."""
+        for i in np.arange(x):
+            # Identify the genotyped markers k and k + 1 between the imputed marker i
+            # The indices k should mirror those in `get_weights`.
+            if imputed_cm[i] < genotyped_cm[0]:
+                # Special case: imputed marker is before the first genotyped marker.
+                k = 0
+            elif imputed_cm[i] > genotyped_cm[-1]:
+                # Special case: imputed marker is after the last genotyped marker.
+                k = m - 1
+            else:
+                k = np.max(np.where(imputed_cm[i] > genotyped_cm)[0])
+            for j in np.arange(h):
+                if ref_h[i, j] == a:
+                    p[i, a] += weights[i] * sm[k, j]
+                    p[i, a] += (1 - weights[i]) * sm[k + 1, j]
+
+    _compute_allele_probabilities(a=0)
+    _compute_allele_probabilities(a=1)
+    return p
+
+
+def get_map_alleles(allele_probs):
+    """
+    Compute the maximum a posteriori alleles at the imputed markers of a query haplotype.
+
+    Assuming all biallelic sites, it is a matrix of size (x, 2),
+    where x is the number of imputed markers.
+
+    :param numpy.ndarray allele_probs: Interpolated allele probabilities.
+    :return: Imputed alleles in the query haplotype.
+    :rtype: numpy.ndarray
+    """
+    assert not np.any(allele_probs < 0), "Allele probabilities have negative values."
+    assert not np.any(np.isnan(allele_probs)), "Allele probabilities have NaN values."
+    x = len(allele_probs)
+    imputed_alleles = np.zeros((x, 2), dtype=int)
+    # TODO: Vectorise over the imputed markers
+    for i in np.arange(x):
+        imputed_alleles[i, 0] = np.argmax(allele_probs[i, :])
+        imputed_alleles[i, 1] = np.argmax(allele_probs[i, :])
+    return imputed_alleles
+
+
+def run_beagle(ref_h, query_h, pos, miscall_rate=0.0001, ne=1e6, debug=False):
+    """
+    Run a simplified version of the BEAGLE imputation algorithm
+    based on Equation 1 of BB2016.
+
+    `ref_h` and `query_h` span all genotyped and imputed marker sites.
+    So, their sizes are (m + x, h) and (m + x), respectively,
+    where m is the number of genotyped markers
+    and x is the number of imputed markers.
+
+    The genomic site positions of all the markers are in `pos`.
+
+    This produces imputed alleles at the imputed markers of the query haplotype.
+
+    :param numpy.ndarray ref_h: Reference haplotypes.
+    :param numpy.ndarray query_h: One query haplotype.
+    :param numpy.ndarray pos: Site positions of all the markers.
+    :param float miscall_rate: Allelic miscall rate.
+    :param int ne: Effective population size.
+    :param bool debug: Whether to print intermediate results.
+    :return: Imputed alleles in the query haplotype.
+    :rtype: numpy.ndarray
+    """
+    assert ref_h.shape[0] == len(pos)
+    assert len(query_h) == len(pos)
+    # Indices of genotyped markers in the query haplotype
+    genotyped_pos_idx = np.where(query_h != -1)[0]
+    assert len(genotyped_pos_idx) > 0
+    # Indices of imputed markers in the query haplotype
+    imputed_pos_idx = np.where(query_h == -1)[0]
+    assert len(imputed_pos_idx) > 0
+    # Site positions of genotyped markers
+    genotyped_pos = pos[genotyped_pos_idx]
+    # Site positions of imputed markers
+    imputed_pos = pos[imputed_pos_idx]
+    h = ref_h.shape[1]
+    m = len(genotyped_pos)
+    x = len(imputed_pos)
+    assert m + x == len(pos)
+    # Subset the reference haplotypes to genotyped markers
+    ref_h_genotyped = ref_h[genotyped_pos_idx, :]
+    if debug:
+        print("Reference haplotypes subsetted to genotyped markers")
+        print(ref_h_genotyped)
+    assert ref_h_genotyped.shape == (m, h)
+    # Subset the query haplotype to genotyped markers
+    query_h_genotyped = query_h[genotyped_pos_idx]
+    if debug:
+        print("Query haplotype subsetted to genotyped markers")
+        print(query_h_genotyped)
+    assert len(query_h_genotyped) == m
+    # Set mismatch probabilities at genotyped markers
+    mu = get_mismatch_prob(genotyped_pos, miscall_rate=miscall_rate)
+    if debug:
+        print("Mismatch probabilities")
+        print(mu)
+    assert len(mu) == m
+    # Set switch probabilities at genotyped markers
+    rho = get_switch_prob(genotyped_pos, h, ne=ne)
+    if debug:
+        print("Switch probabilities")
+        print(rho)
+    assert len(rho) == m
+    # Compute forward probability matrix at genotyped markers
+    fm = compute_forward_probability_matrix(ref_h_genotyped, query_h_genotyped, rho, mu)
+    if debug:
+        print("Forward probability matrix")
+        print(fm)
+    assert fm.shape == (m, h)
+    # Compute backward probability matrix at genotyped markers
+    bm = compute_backward_probability_matrix(
+        ref_h_genotyped, query_h_genotyped, rho, mu
+    )
+    if debug:
+        print("Backward probability matrix")
+        print(bm)
+    assert bm.shape == (m, h)
+    # Compute HMM state probability matrix at genotyped markers
+    sm = compute_state_probability_matrix(fm, bm, ref_h_genotyped, query_h_genotyped)
+    if debug:
+        print("HMM state probability matrix")
+        print(sm)
+    assert sm.shape == (m, h)
+    # Subset the reference haplotypes to imputed markers
+    ref_h_imputed = ref_h[imputed_pos_idx, :]
+    # Interpolate allele probabilities at imputed markers
+    i_allele_probs = interpolate_allele_probabilities(
+        sm, ref_h_imputed, genotyped_pos, imputed_pos
+    )
+    if debug:
+        print("Interpolated allele probabilities")
+        print(i_allele_probs)
+    assert i_allele_probs.shape == (x, 2)
+    # Get MAP alleles at imputed markers
+    imputed_alleles = get_map_alleles(i_allele_probs)
+    assert len(imputed_alleles) == x
+    return imputed_alleles
diff --git a/python/tests/test_imputation.py b/python/tests/test_imputation.py
new file mode 100644
index 0000000000..3fe69afe8a
--- /dev/null
+++ b/python/tests/test_imputation.py
@@ -0,0 +1,558 @@
+"""
+Tests for genotype imputation (forward and Baum-Welsh algorithms).
+"""
+import io
+
+import numpy as np
+
+import _tskit
+import tskit
+from tests.beagle import run_beagle
+
+
+# A toy tree sequence containing 3 diploid individuals with 5 sites and 5 mutations.
+# Two toy query haplotypes are targets for imputation.
+
+toy_ts_nodes_text = """\
+id      is_sample       time    population      individual      metadata
+0       1       0.000000        0       0
+1       1       0.000000        0       0
+2       1       0.000000        0       1
+3       1       0.000000        0       1
+4       1       0.000000        0       2
+5       1       0.000000        0       2
+6       0       0.029768        0       -1
+7       0       0.133017        0       -1
+8       0       0.223233        0       -1
+9       0       0.651586        0       -1
+10      0       0.698831        0       -1
+11      0       2.114867        0       -1
+12      0       4.322031        0       -1
+13      0       7.432311        0       -1
+"""
+
+toy_ts_edges_text = """\
+left    right   parent  child   metadata
+0.000000        1000000.000000  6       0
+0.000000        1000000.000000  6       3
+0.000000        1000000.000000  7       2
+0.000000        1000000.000000  7       5
+0.000000        1000000.000000  8       1
+0.000000        1000000.000000  8       4
+0.000000        781157.000000   9       6
+0.000000        781157.000000   9       7
+0.000000        505438.000000   10      8
+0.000000        505438.000000   10      9
+505438.000000   549484.000000   11      8
+505438.000000   549484.000000   11      9
+781157.000000   1000000.000000  12      6
+781157.000000   1000000.000000  12      7
+549484.000000   1000000.000000  13      8
+549484.000000   781157.000000   13      9
+781157.000000   1000000.000000  13      12
+"""
+
+toy_ts_sites_text = """\
+position        ancestral_state metadata
+200000.000000   A
+300000.000000   C
+520000.000000   G
+600000.000000   T
+900000.000000   A
+"""
+
+toy_ts_mutations_text = """\
+site    node    time    derived_state   parent  metadata
+0       9       unknown G       -1
+1       8       unknown A       -1
+2       9       unknown T       -1
+3       9       unknown C       -1
+4       12      unknown C       -1
+"""
+
+toy_ts_individuals_text = """\
+flags
+0
+0
+0
+"""
+
+toy_query_haplotypes_01 = np.array(
+    [
+        [
+            0,
+            1,
+            -1,
+            1,
+            1,
+        ],
+        [
+            0,
+            1,
+            -1,
+            0,
+            1,
+        ],
+    ],
+    dtype=np.int32,
+)
+
+toy_query_haplotypes_ACGT = np.array(
+    [
+        [0, 0, -1, 2, 2],  # AACC
+        [0, 0, -1, 3, 2],  # AATC
+    ],
+    dtype=np.int32,
+)
+
+
+def get_toy_data():
+    ref_ts = tskit.load_text(
+        nodes=io.StringIO(toy_ts_nodes_text),
+        edges=io.StringIO(toy_ts_edges_text),
+        sites=io.StringIO(toy_ts_sites_text),
+        mutations=io.StringIO(toy_ts_mutations_text),
+        individuals=io.StringIO(toy_ts_individuals_text),
+        strict=False,
+    )
+    query_h = toy_query_haplotypes_ACGT
+    return [ref_ts, query_h]
+
+
+def get_tskit_forward_backward_matrices(ts, h):
+    m = ts.num_sites
+    fm = _tskit.CompressedMatrix(ts._ll_tree_sequence)
+    bm = _tskit.CompressedMatrix(ts._ll_tree_sequence)
+    ls_hmm = _tskit.LsHmm(
+        ts._ll_tree_sequence, np.zeros(m) + 0.1, np.zeros(m) + 0.1, acgt_alleles=True
+    )
+    ls_hmm.forward_matrix(h, fm)
+    ls_hmm.backward_matrix(h, fm.normalisation_factor, bm)
+    return [fm.decode(), bm.decode()]
+
+
+# BEAGLE 4.1 was run on the toy data set above using default parameters.
+#
+# In the query VCF, the site at position 520,000 was redacted and then imputed.
+# Note that the ancestral allele in the simulated tree sequence is
+# treated as the REF in the VCFs.
+#
+# Three matrices were printed and kept below:
+# 1. Forward probability matrix
+# 2. Backward probability matrix
+# 3. HMM state probability matrix
+#
+# There are two sets of matrices, one for each query haplotype.
+#
+# Notes about calculations:
+# h = index of reference haplotype
+# H = number of reference haplotypes
+# m = index of genotyped marker
+# M = number of genotyped markers
+#
+# In forward probability matrix,
+#    fwd[m][h] = emission prob., if m = 0 (first marker)
+#    fwd[m][h] = emission prob. * (scale * fwd[m - 1][h] + shift), otherwise
+#
+#    where scale = (1 - switch prob.)/sum of fwd[m - 1],
+#        and shift = switch prob. / H.
+#
+# In backward probability matrix,
+#    bwd[m][h] = 1/n, if m = M - 1 (last marker)
+#    unadj. bwd[m][h] = emission prob. / H
+#    bwd[m][h] = (unadj. bwd[m][h] + shift) * scale, otherwise
+#
+#    where scale = (1 - switch prob.)/sum of unadj. bwd[m],
+#        and shift = switch prob. / H.
+#
+# For each site, the sum of backward values over all haplotypes was calculated
+# before scaling and shifting.
+#
+# The matrices below were obtained by setting Ne to 10.
+def parse_matrix(csv_text):
+    # This returns a record array, which is essentially the same as a
+    # pandas dataframe, which we can access via df["m"] etc.
+    return np.lib.npyio.recfromcsv(io.StringIO(csv_text))
+
+
+def test_toy_example():
+    # Get toy dataset ready for tskit.
+    ref_ts, query = get_toy_data()
+    print(list(ref_ts.haplotypes()))
+    print(ref_ts)
+    print(query)
+    # tskit_fwd, tskit_bwd = get_tskit_forward_backward_matrices(ref_ts, query[0])
+    # Get BEAGLE results for the toy dataset.
+    # beagle_fwd = parse_matrix(beagle_fwd_matrix_text_1)
+    # beagle_bwd = parse_matrix(beagle_bwd_matrix_text_1)
+    # print(beagle_fwd)
+    # print(beagle_bwd)
+    # Run a trivial test case using Python implementation of BEAGLE.
+    ref_h = np.array(
+        [
+            [
+                0,
+                1,
+                0,
+                1,
+                0,
+                1,
+            ],
+            [
+                1,
+                0,
+                1,
+                0,
+                1,
+                0,
+            ],
+        ]
+    ).T
+    query_h = np.array(
+        [
+            -1,
+            1,
+            -1,
+            1,
+            -1,
+            0,
+        ]
+    )
+    assert ref_h.shape[0] == len(query_h)
+    pos = (np.arange(len(query_h)) + 1) * 1e4
+    ia = run_beagle(ref_h, query_h, pos, miscall_rate=0.1, ne=10.0, debug=True)
+    print(ia)
+
+
+# The following toy query haplotypes were imputed from the toy reference haplotypes
+# using BEAGLE 4.1 with Ne set to 10.
+toy_ref_0 = np.array(
+    [
+        [
+            0,
+            0,
+            0,
+            0,
+            0,
+            0,
+            0,
+            0,
+            0,
+        ],  # ref_0
+        [
+            0,
+            0,
+            0,
+            0,
+            0,
+            0,
+            0,
+            0,
+            0,
+        ],  # ref_0
+    ]
+).T
+toy_query_0 = np.array(
+    [
+        [
+            -1,
+            0,
+            -1,
+            0,
+            -1,
+            0,
+            -1,
+            0,
+            -1,
+        ],  # query_0
+        [
+            -1,
+            0,
+            -1,
+            0,
+            -1,
+            0,
+            -1,
+            0,
+            -1,
+        ],  # query_0
+    ]
+)
+assert toy_ref_0.shape == (9, 2)
+assert toy_query_0.shape == (2, 9)
+assert np.sum(toy_query_0[0] == 0) == 4
+assert np.sum(toy_query_0[0] == -1) == 5
+assert np.array_equal(toy_query_0[0], toy_query_0[1])
+
+toy_ref_1 = np.array(
+    [
+        [
+            1,
+            1,
+            1,
+            1,
+            1,
+            1,
+            1,
+            1,
+            1,
+        ],  # ref_0
+        [
+            1,
+            1,
+            1,
+            1,
+            1,
+            1,
+            1,
+            1,
+            1,
+        ],  # ref_0
+    ]
+).T
+toy_query_1 = np.array(
+    [
+        [
+            -1,
+            1,
+            -1,
+            1,
+            -1,
+            1,
+            -1,
+            1,
+            -1,
+        ],  # query_0
+        [
+            -1,
+            1,
+            -1,
+            1,
+            -1,
+            1,
+            -1,
+            1,
+            -1,
+        ],  # query_0
+    ]
+)
+assert toy_ref_1.shape == (9, 2)
+assert toy_query_1.shape == (2, 9)
+assert np.sum(toy_query_1[0] == 1) == 4
+assert np.sum(toy_query_1[0] == -1) == 5
+assert np.array_equal(toy_query_1[0], toy_query_1[1])
+
+toy_ref_2 = np.copy(toy_ref_0)
+toy_query_2 = np.array(
+    [
+        [
+            0,
+            0,
+            -1,
+            0,
+            -1,
+            0,
+            -1,
+            0,
+            0,
+        ],  # query_0
+        [
+            0,
+            0,
+            -1,
+            0,
+            -1,
+            0,
+            -1,
+            0,
+            0,
+        ],  # query_0
+    ]
+)
+assert toy_ref_2.shape == (9, 2)
+assert toy_query_2.shape == (2, 9)
+assert np.sum(toy_query_2[0] == 0) == 6
+assert np.sum(toy_query_2[0] == -1) == 3
+assert np.array_equal(toy_query_2[0], toy_query_2[1])
+
+toy_ref_3 = np.array(
+    [
+        [
+            0,
+            0,
+            0,
+            0,
+            0,
+            0,
+            0,
+            0,
+            0,
+        ],  # ref_0
+        [
+            1,
+            1,
+            1,
+            1,
+            1,
+            1,
+            1,
+            1,
+            1,
+        ],  # ref_0
+    ]
+).T
+toy_query_3 = np.array(
+    [
+        [
+            -1,
+            0,
+            -1,
+            0,
+            -1,
+            0,
+            -1,
+            0,
+            -1,
+        ],  # query_0
+        [
+            -1,
+            1,
+            -1,
+            1,
+            -1,
+            1,
+            -1,
+            1,
+            -1,
+        ],  # query_0
+    ]
+)
+assert toy_ref_3.shape == (9, 2)
+assert toy_query_3.shape == (2, 9)
+assert np.sum(toy_query_3[0] == 0) == 4
+assert np.sum(toy_query_3[0] == -1) == 5
+assert np.sum(toy_query_3[1] == 1) == 4
+assert np.sum(toy_query_3[1] == -1) == 5
+
+toy_ref_4 = np.array(
+    [
+        [0, 1, 0, 1, 0, 1, 0, 1, 0],  # ref_0
+        [1, 0, 1, 0, 1, 0, 1, 0, 1],  # ref_0
+    ]
+)
+toy_query_4 = np.array(
+    [
+        [
+            -1,
+            0,
+            -1,
+            0,
+            -1,
+            0,
+            -1,
+            0,
+            -1,
+        ],  # query_0
+        [
+            -1,
+            1,
+            -1,
+            1,
+            -1,
+            1,
+            -1,
+            1,
+            -1,
+        ],  # query_0
+    ]
+)
+assert toy_ref_4.shape == (9, 2)
+assert toy_query_4.shape == (2, 9)
+assert np.all(np.invert(np.equal(toy_query_4[0], toy_query_4[1])))
+assert np.sum(toy_query_4[0] == 0) == 4
+assert np.sum(toy_query_4[0] == -1) == 5
+assert np.sum(toy_query_4[1] == 1) == 4
+assert np.sum(toy_query_4[1] == -1) == 5
+
+toy_ref_5 = np.array(
+    [
+        [0, 0, 0, 0, 0, 0, 0, 0, 0],  # ref_0
+        [0, 0, 0, 0, 0, 0, 0, 0, 0],  # ref_0
+        [1, 1, 1, 1, 1, 1, 1, 1, 1],  # ref_1
+        [1, 1, 1, 1, 1, 1, 1, 1, 1],  # ref_1
+    ]
+).T
+toy_query_5 = np.array(
+    [
+        [
+            -1,
+            0,
+            -1,
+            1,
+            -1,
+            1,
+            -1,
+            0,
+            -1,
+        ],  # query_0
+        [
+            -1,
+            1,
+            -1,
+            0,
+            -1,
+            0,
+            -1,
+            1,
+            -1,
+        ],  # query_0
+    ]
+)
+assert toy_ref_5.shape == (9, 4)
+assert toy_query_5.shape == (2, 9)
+assert np.sum(toy_ref_5[0] == 0) == 9
+assert np.sum(toy_ref_5[1] == 0) == 9
+assert np.sum(toy_ref_5[2] == 1) == 9
+assert np.sum(toy_ref_5[3] == 1) == 9
+assert np.sum(toy_query_5[0] == 0) == 2
+assert np.sum(toy_query_5[0] == 1) == 2
+assert np.sum(toy_query_5[0] == -1) == 5
+assert np.sum(toy_query_5[1] == 0) == 2
+assert np.sum(toy_query_5[1] == 1) == 2
+assert np.sum(toy_query_5[1] == -1) == 5
+
+toy_ref_6 = np.copy(toy_ref_5)
+toy_query_6 = np.array(
+    [
+        [
+            -1,
+            0,
+            -1,
+            0,
+            -1,
+            1,
+            -1,
+            1,
+            -1,
+        ],  # query_0
+        [
+            -1,
+            1,
+            -1,
+            1,
+            -1,
+            0,
+            -1,
+            0,
+            -1,
+        ],  # query_0
+    ]
+)
+assert toy_ref_6.shape == (9, 4)
+assert toy_query_6.shape == (2, 9)
+assert np.sum(toy_query_6[0] == 0) == 2
+assert np.sum(toy_query_6[0] == 1) == 2
+assert np.sum(toy_query_6[0] == -1) == 5
+assert np.sum(toy_query_6[1] == 0) == 2
+assert np.sum(toy_query_6[1] == 1) == 2
+assert np.sum(toy_query_6[1] == -1) == 5