From e305d09e920c3935f44c9fcec1e12de6f5bf88c0 Mon Sep 17 00:00:00 2001 From: Shing Zhan Date: Mon, 7 Aug 2023 18:19:00 +0100 Subject: [PATCH] Add tests to check LS HMM of _tskit.lshmm compared to BEAGLE --- python/tests/beagle.py | 628 ++++++++++++++++++++++++++++++++ python/tests/test_imputation.py | 558 ++++++++++++++++++++++++++++ 2 files changed, 1186 insertions(+) create mode 100644 python/tests/beagle.py create mode 100644 python/tests/test_imputation.py diff --git a/python/tests/beagle.py b/python/tests/beagle.py new file mode 100644 index 0000000000..02a23854b1 --- /dev/null +++ b/python/tests/beagle.py @@ -0,0 +1,628 @@ +""" +Implementation of the BEAGLE 4.1 algorithm to impute genotypes. + +This was implemented while closely consulting the BEAGLE 4.1 paper: +Browning & Browning (2016). Genotype imputation with millions of reference samples. +Am J Hum Genet 98:116-126. doi:10.1016/j.ajhg.2015.11.020 + +The source code of BEAGLE 4.1 (particularly `LSHapBaum.java`) was also consulted: +https://faculty.washington.edu/browning/beagle/b4_1.html + +Here are some notations used throughout this implementation: +h = Number of reference haplotypes. +m = Number of genotyped markers. +x = Number of imputed markers. + +This implementation takes the following inputs: +1. Panel of reference haplotypes in a matrix of size (m + x, h). +2. One query haplotype in an array of size (m + x). +3. Genomic site positions of all the markers in an array of size (m + x). + +For simplicity, we assume that: +1. All the sites are biallelic (0/1 encoding). +2. The genetic map is defined by equating 1 Mbp to 1 cM. + +In the query haplotype: +1. The genotyped markers take values of 0 or 1. +2. The imputed markers take -1. + +The forward, backward, and HMM state probability matrices are of size (m, h). +The interpolated haplotype probability matrix is of size (x, 2), +The imputed alleles in the query haplotype are the maximum a posteriori (MAP) alleles. + +For efficiency, BEAGLE uses aggregated markers, which are clusters of markers +within a 0.005 cM interval (default). Because the genotypes are phased, +the alleles in the aggregated markers form distinct "allele sequences". +Below, we do not use aggregated markers or allele sequences, which would complicate +the implementation. + +Rather than trying to exactly replicating the original BEAGLE 4.1 algorithm, +this implementation computes Equation 1 of BB2016. The functions used in an attempt +to faithfully implement the BEAGLE algorithm are kept for documentation. +""" +import numpy as np + + +def convert_to_genetic_map_position(pos): + """ + Convert genomic site positions (bp) to genetic map positions (cM). + + In BEAGLE 4.1, when a genetic map is not specified, it is assumed that 1 Mbp = 1cM. + + This trivial function is meant for documentation. + + :param numpy.ndarray pos: Site positions. + :return: Genetic map positions. + :rtype: numpy.ndarray + """ + assert 0 <= np.min(pos) + return pos / 1e6 + + +def get_weights(genotyped_pos, imputed_pos): + """ + Get weights at imputed markers in the query haplotype. + + These weights are used in linear interpolation of HMM state probabilities + at imputed markers. + + In BB2016 (see below Equation 1), a weight between genotyped markers m and m + 1 + at imputed marker x is denoted lambda_m,x. + + lambda_m,x = [g(m + 1) - g(x)] / [g(m + 1) - g(m)], + where g(i) = genetic map position of marker i. + + :param numpy.ndarray genotyped_pos: Site positions of genotyped markers. + :param numpy.ndarray imputed_pos: Site positions of imputed markers. + :return: Weights used in linear interpolation. + :rtype: numpy.ndarray + """ + # Check that the two sets are mutually exclusive. + assert not np.any(np.isin(genotyped_pos, imputed_pos)) + # Set min genetic distance to avoid division by zero. + MIN_CM_DIST = 0.005 + m = len(genotyped_pos) + x = len(imputed_pos) + genotyped_cm = convert_to_genetic_map_position(genotyped_pos) + imputed_cm = convert_to_genetic_map_position(imputed_pos) + assert not np.any(np.isin(genotyped_cm, imputed_cm)) + # Calculate weights for imputed markers. + weights = np.zeros(x, dtype=np.float64) + # Identify genotype markers k and k + 1 between imputed marker i + for i in np.arange(x): + if imputed_cm[i] < genotyped_cm[0]: + # Special case: imputed marker is before the first genotyped marker. + k = 0 + weights[i] = 1.0 + elif imputed_cm[i] > genotyped_cm[-1]: + # Special case: imputed marker is after the last genotyped marker. + k = m - 1 + weights[i] = 0.0 + else: + k = np.max(np.where(imputed_cm[i] > genotyped_cm)[0]) + cm_mP1_x = genotyped_cm[k + 1] - imputed_cm[i] + cm_mP1_m = np.max([genotyped_cm[k + 1] - genotyped_cm[k], MIN_CM_DIST]) + weights[i] = cm_mP1_x / cm_mP1_m + assert 0 <= np.min(weights) and np.max(weights) <= 1, "Weights are not in [0, 1]." + return weights + + +def get_mismatch_prob(pos, miscall_rate): + """ + Set mismatch probabilities at genotyped markers. + + In BEAGLE, the mismatch probability is dominated by the allelic miscall rate. + By default, it is set to 0.0001 and capped at 0.50. + + :param numpy.ndarray pos: Site positions. + :param float miscall_rate: Allelic miscall rate. + :return: Mismatch probabilities. + :rtype: numpy.ndarray + """ + assert isinstance(miscall_rate, float) + if miscall_rate >= 0.5: + miscall_rate = 0.5 + mu = np.zeros(len(pos), dtype=np.float64) + miscall_rate + return mu + + +def get_switch_prob(pos, h, ne): + """ + Set switch probabilities at genotyped markers. + + Note that in BEAGLE, the default value of `ne` is set to 1e6. + If `h` is small and `ne` is large, the switch probability is nearly 1. + In such cases, it may be desirable to set `ne` to a small value + to avoid switching between haplotypes all the time. + + :param numpy.ndarray pos: Site positions. + :param int h: Number of reference haplotypes. + :param float ne: Effective population size. + :return: Switch probabilities. + :rtype: numpy.ndarray + """ + assert pos[0] > 0, "Site positions are not 1-based." + assert isinstance(h, int), "Number of reference haplotypes is not an integer." + assert isinstance(ne, float), "Effective population size is not a float." + # Get genetic distance between adjacent markers. + cm = np.concatenate([[0], convert_to_genetic_map_position(pos)]) + gen_dist = cm[1:] - cm[:-1] + assert len(gen_dist) == len(pos) + rho = -np.expm1(-(4 * ne * gen_dist / h)) + return rho + + +def compute_forward_probability_matrix(ref_h, query_h, rho, mu): + """ + Implement the forward algorithm. + + The forward probablity matrix is of size (m, h), + where m is the number of genotyped markers + and h is the number of reference haplotypes. + + `ref_h` is a panel of reference haplotypes of size m x h. + Here, it is subsetted to genotyped markers. + + `query_h` is also subsetted to genotyped markers (length m). + + :param numpy.ndarray ref_h: Reference haplotypes. + :param numpy.ndarray query_h: One query haplotype. + :param numpy.ndarray rho: Switch probabilities. + :param numpy.ndarray mu: Mismatch probabilities. + :return: Forward probability matrix. + :rtype: numpy.ndarray + """ + assert np.all( + np.isin(ref_h, [0, 1]) + ), "Reference haplotypes have non-biallelic values." + assert np.all(np.isin(query_h, [0, 1])), "Query haplotype has non-biallelic values." + m = ref_h.shape[0] + h = ref_h.shape[1] + assert m == len(query_h) + assert m == len(rho) + assert m == len(mu) + assert 0 <= np.min(rho) and np.max(rho) <= 1 + # BEAGLE caps mismatch probabilities at 0.5. + assert 0 <= np.min(mu) and np.max(mu) <= 0.5 + fm = np.zeros((m, h), dtype=np.float64) + last_sum = 1.0 # Part of normalization factor + for i in np.arange(m): + # Get site-specific parameters + shift = rho[i] / h + scale = (1 - rho[i]) / last_sum + # Get allele at genotyped marker i on query haplotype j + query_a = query_h[i] + for j in np.arange(h): + # Get allele at genotyped marker i on reference haplotype j + ref_a = ref_h[i, j] + # Get emission probability + em_prob = mu[i] if query_a != ref_a else 1.0 - mu[i] + if m == 0: + fm[i, j] = em_prob + else: + fm[i, j] = em_prob * (scale * fm[i - 1, j] + shift) + site_sum = np.sum(fm[i, :]) + last_sum = site_sum / h if m == 0 else site_sum + return fm + + +def compute_backward_probability_matrix(ref_h, query_h, rho, mu): + """ + Implement the backward algorithm. + + The backward probablity matrix is of size (m, h), + where m is the number of genotyped markers + and h is the number of reference haplotypes. + + `ref_h` is a panel of reference haplotypes of size m x h. + Here, it is subsetted to genotyped markers. + + `query_h` is also subsetted to genotyped markers (length m). + + In BEAGLE 4.1, the values are kept one site at a time. + Here, we keep the values at all the sites. + + :param numpy.ndarray ref_h: Reference haplotypes. + :param numpy.ndarray query_h: One query haplotype. + :param numpy.ndarray rho: Switch probabilities. + :param numpy.ndarray mu: Mismatch probabilities. + :return: Backward probability matrix. + :rtype: numpy.ndarray + """ + assert np.all( + np.isin(ref_h, [0, 1]) + ), "Reference haplotypes have non-biallelic values." + assert np.all(np.isin(query_h, [0, 1])), "Query haplotype has non-biallelic values." + m = ref_h.shape[0] + h = ref_h.shape[1] + assert m == len(query_h) + assert m == len(rho) + assert m == len(mu) + assert 0 <= np.min(rho) and np.max(rho) <= 1 + assert 0 <= np.min(mu) and np.max(mu) <= 0.5 + bm = np.zeros((m, h), dtype=np.float64) + bm[-1, :] = 1.0 / h # Initialise the last column + for i in np.arange(m - 2, -1, -1): + query_a = query_h[i + 1] + for j in np.arange(h): + ref_a = ref_h[i + 1, j] + em_prob = mu[i + 1] if ref_a != query_a else 1.0 - mu[i + 1] + # Note that BEAGLE keeps the values at one site at a time. + bm[i, j] = bm[i + 1, j] * em_prob + sum_site = np.sum(bm[i, :]) + scale = (1 - rho[i + 1]) / sum_site + shift = rho[i + 1] / h + bm[i, :] = scale * bm[i, :] + shift + return bm + + +def _get_ref_hap_seg(a, b): + """ + WARN: This function is part of an attempt to faithfully reimplement + the BEAGLE 4.1 algorithm. It is not complete and not used + in the current implementation. It is kept for documentation. + + Assuming all biallelic sites, get the index of a reference haplotype segment (i) + defined by the alleles a and b at two adjacent genotyped markers. + + #i, a, b + 0, 0, 0 + 1, 1, 0 + 2, 0, 1 + 3, 1, 1 + + See https://github.com/tskit-dev/tskit/issues/2802#issuecomment-1698767169 + + This is a helper function for `_compute_state_probability_matrix`. + """ + ref_hap_idx = a * 1 + b * 2 + return ref_hap_idx + + +def _compute_state_probability_matrix(fm, bm, ref_h, query_h, rho, mu): + """ + WARN: This function is part of an attempt to faithfully reimplement + the BEAGLE 4.1 algorithm. It is not complete and not used + in the current implementation. It is kept for documentation. + + Implement the HMM forward-backward algorithm to compute: + 1. A HMM state probability matrix across genotyped markers; + 2. "Forward haplotype probabilities"; and + 3. "Backward haplotype probabilities". + + A HMM state is an index of a reference haplotype. + + The state probability matrix is of size (m, h), + where m is the number of genotyped markers in the query haplotype + and h is the number of reference haplotypes. + + Both the forward and backward haplotype probability matrices + are of size (m, 2), assuming only biallelic sites. + + `ref_h` is a panel of reference haplotypes of size m x h. + Here, it is subsetted to genotyped markers. + + `query_h` is also subsetted to genotyped markers (length m). + + This implementation is based on the function `setStateProbs` + of `LSHapBaum.java` in BEAGLE. It performs the computation + described in Equation 2 of BB2016. + + :param numpy.ndarray fm: Forward probability matrix. + :param numpy.ndarray bm: Backward probability matrix. + :param numpy.ndarray ref_h: Reference haplotypes. + :param numpy.ndarray query_h: One query haplotype. + :param numpy.ndarray rho: Switch probabilities. + :param numpy.ndarray mu: Mismatch probabilities. + :return: HMM state probability matrix, + forward haplotype probability matrix, and + backward haplotype probability matrix. + :rtype: tuple + """ + m = fm.shape[0] + h = fm.shape[1] + assert (m, h) == bm.shape + assert not np.any(fm < 0), "Forward probability matrix has negative values." + assert not np.any(np.isnan(fm)), "Forward probability matrix has NaN values." + assert not np.any(bm < 0), "Backward probability matrix has negative values." + assert not np.any(np.isnan(bm)), "Backward probability matrix has NaN values." + assert np.all( + np.isin(ref_h, [0, 1]) + ), "Reference haplotypes have non-biallelic values." + assert np.all(np.isin(query_h, [0, 1])), "Query haplotype has non-biallelic values." + assert m == len(query_h) + assert m == len(rho) + assert m == len(mu) + assert 0 <= np.min(rho) and np.max(rho) <= 1 + assert 0 <= np.min(mu) and np.max(mu) <= 0.5 + sm = np.zeros((m, h), dtype=np.float64) + fwd_hap_probs = np.zeros((m, 4), dtype=np.float64) + bwd_hap_probs = np.zeros((m, 4), dtype=np.float64) + for i in np.arange(m - 1, -1, -1): + for j in np.arange(h): + sm[i, j] = fm[i, j] * bm[i, j] + ref_hap_seg_mP1 = _get_ref_hap_seg(ref_h[i, j], ref_h[i + 1, j]) + ref_hap_seg_m = _get_ref_hap_seg(ref_h[i - 1, j], ref_h[i, j]) + fwd_hap_probs[i, ref_hap_seg_mP1] += sm[i, j] + bwd_hap_probs[i, ref_hap_seg_m] += sm[i, j] + sum_fwd_hap_probs = np.sum(fwd_hap_probs[i, :]) + fwd_hap_probs[i, :] /= sum_fwd_hap_probs + bwd_hap_probs[i, :] /= sum_fwd_hap_probs # Sum of forward hap probs + return (sm, fwd_hap_probs, bwd_hap_probs) + + +def _interpolate_allele_probabilities( + fwd_hap_probs, bwd_hap_probs, genotyped_pos, imputed_pos +): + """ + WARN: This function is part of an attempt to faithfully reimplement + the BEAGLE 4.1 algorithm. It is not complete and not used + in the current implementation. It is kept for documentation. + + Compute the interpolated allele probabilities at the imputed markers + of a query haplotype. + + The forward and backward haplotype probability matrices are of size (m, 2), + where m is the number of genotyped markers. + Here, we assume all biallelic sites, hence the 2. + + The interpolated allele probability matrix is of size (x, 2), + where x is the number of imputed markers. + Again, we assume all biallelic sites, hence the 2. + + This implementation is based on Equation 2 (the rearranged one) in BB2016. + + :param numpy.ndarray fwd_hap_probs: Forward haplotype probability matrix. + :param numpy.ndarray bwd_hap_probs: Backward haplotype probability matrix. + :param numpy.ndarray genotyped_pos: Site positions of genotyped markers. + :param numpy.ndarray imputed_pos: Site positions of imputed markers. + :return: Interpolated allele probabilities. + :rtype: numpy.ndarray + """ + m = len(genotyped_pos) + x = len(imputed_pos) + assert (m, 2) == fwd_hap_probs.shape + assert (m, 2) == bwd_hap_probs.shape + genotyped_cm = convert_to_genetic_map_position(genotyped_pos) + assert m == len(genotyped_cm) + imputed_cm = convert_to_genetic_map_position(imputed_pos) + assert x == len(imputed_cm) + weights = get_weights(genotyped_cm, imputed_cm) + assert x == len(weights) + i_hap_probs = np.zeros((x, 2), dtype=np.float64) + for i in np.arange(x): + # Identify the genotyped markers k and k + 1 between imputed marker i + if imputed_cm[i] < genotyped_cm[0]: + k = 0 + elif imputed_cm[i] > genotyped_cm[-1]: + k = m - 2 + else: + k = np.max(np.where(imputed_cm[i] > genotyped_cm)[0]) + # Vectorised over the allelic states + i_hap_probs[i, :] = weights[i] * bwd_hap_probs[k, :] + i_hap_probs[i, :] += (1 - weights[i]) * fwd_hap_probs[k, :] + return i_hap_probs + + +def compute_state_probability_matrix(fm, bm, ref_h, query_h): + """ + Implement the HMM forward-backward algorithm following Equation 1 of BB2016. + This is simpler than faithfully reimplementing the original BEAGLE 4.1 algorithm. + + A HMM state is (index of a genotyped marker, index of a reference haplotype). + + The output state probability matrix is of size (m, h), + where m is the number of genotyped markers in the query haplotype + and h is the number of reference haplotypes. + + The forward and backward probability matrices are both of size (m, h). + + The reference haplotypes of size (m, h), and the query haplotype is of size m. + Here, they are both subsetted to genotyped markers. + + :param numpy.ndarray fm: Forward probability matrix. + :param numpy.ndarray bm: Backward probability matrix. + :param numpy.ndarray ref_h: Reference haplotypes. + :param numpy.ndarray query_h: One query haplotype. + :return: HMM state probability matrix. + :rtype: numpy.ndarray + """ + assert np.all( + np.isin(ref_h, [0, 1]) + ), "Reference haplotypes have non-biallelic values." + assert np.all(np.isin(query_h, [0, 1])), "Query haplotype has non-biallelic values." + m = ref_h.shape[0] + h = ref_h.shape[1] + assert m == len(query_h) + assert (m, h) == fm.shape + assert (m, h) == bm.shape + assert not np.any(fm < 0), "Forward probability matrix has negative values." + assert not np.any(np.isnan(fm)), "Forward probability matrix has NaN values." + assert not np.any(bm < 0), "Backward probability matrix has negative values." + assert not np.any(np.isnan(bm)), "Backward probability matrix has NaN values." + sm = np.zeros((m, h), dtype=np.float64) + for i in np.arange(m - 1, -1, -1): + for j in np.arange(h): + sm[i, j] = fm[i, j] * bm[i, j] + return sm + + +def interpolate_allele_probabilities(sm, ref_h, genotyped_pos, imputed_pos): + """ + Compute the interpolated allele probabilities at imputed markers of + a query haplotype following Equation 1 of BB2016. + + Assuming all biallelic sites, the interpolated allele probability matrix + is of size (x, 2), where x is the number of imputed markers. + + This function takes the output of `compute_state_probability_matrix`. + + Note that this function takes: + 1. HMM state probability matrix across genotyped markers of size (m, h). + 2. reference haplotypes subsetted to imputed markers of size (x, h). + + :param numpy.ndarray sm: HMM state probability matrix at genotyped markers. + :param numpy.ndarray ref_h: Reference haplotypes subsetted to imputed markers. + :param numpy.ndarray genotyped_pos: Site positions at genotyped markers. + :param numpy.ndarray imputed_pos: Site positions at imputed markers. + :return: Interpolated allele probabilities. + :rtype: numpy.ndarray + """ + assert not np.any(sm < 0), "HMM state probability matrix has negative values." + assert not np.any(np.isnan(sm)), "HMM state probability matrix has NaN values." + m = sm.shape[0] + h = sm.shape[1] + assert np.all( + np.isin(ref_h, [0, 1]) + ), "Reference haplotypes have non-biallelic values." + assert m == len(genotyped_pos) + x = len(imputed_pos) + assert (x, h) == ref_h.shape + genotyped_cm = convert_to_genetic_map_position(genotyped_pos) + imputed_cm = convert_to_genetic_map_position(imputed_pos) + weights = get_weights(genotyped_pos, imputed_pos) + assert x == len(weights) + p = np.zeros((x, 2), dtype=np.float64) + + def _compute_allele_probabilities(a): + """Equation 1 in BB2016.""" + for i in np.arange(x): + # Identify the genotyped markers k and k + 1 between the imputed marker i + # The indices k should mirror those in `get_weights`. + if imputed_cm[i] < genotyped_cm[0]: + # Special case: imputed marker is before the first genotyped marker. + k = 0 + elif imputed_cm[i] > genotyped_cm[-1]: + # Special case: imputed marker is after the last genotyped marker. + k = m - 1 + else: + k = np.max(np.where(imputed_cm[i] > genotyped_cm)[0]) + for j in np.arange(h): + if ref_h[i, j] == a: + p[i, a] += weights[i] * sm[k, j] + p[i, a] += (1 - weights[i]) * sm[k + 1, j] + + _compute_allele_probabilities(a=0) + _compute_allele_probabilities(a=1) + return p + + +def get_map_alleles(allele_probs): + """ + Compute the maximum a posteriori alleles at the imputed markers of a query haplotype. + + Assuming all biallelic sites, it is a matrix of size (x, 2), + where x is the number of imputed markers. + + :param numpy.ndarray allele_probs: Interpolated allele probabilities. + :return: Imputed alleles in the query haplotype. + :rtype: numpy.ndarray + """ + assert not np.any(allele_probs < 0), "Allele probabilities have negative values." + assert not np.any(np.isnan(allele_probs)), "Allele probabilities have NaN values." + x = len(allele_probs) + imputed_alleles = np.zeros((x, 2), dtype=int) + # TODO: Vectorise over the imputed markers + for i in np.arange(x): + imputed_alleles[i, 0] = np.argmax(allele_probs[i, :]) + imputed_alleles[i, 1] = np.argmax(allele_probs[i, :]) + return imputed_alleles + + +def run_beagle(ref_h, query_h, pos, miscall_rate=0.0001, ne=1e6, debug=False): + """ + Run a simplified version of the BEAGLE imputation algorithm + based on Equation 1 of BB2016. + + `ref_h` and `query_h` span all genotyped and imputed marker sites. + So, their sizes are (m + x, h) and (m + x), respectively, + where m is the number of genotyped markers + and x is the number of imputed markers. + + The genomic site positions of all the markers are in `pos`. + + This produces imputed alleles at the imputed markers of the query haplotype. + + :param numpy.ndarray ref_h: Reference haplotypes. + :param numpy.ndarray query_h: One query haplotype. + :param numpy.ndarray pos: Site positions of all the markers. + :param float miscall_rate: Allelic miscall rate. + :param int ne: Effective population size. + :param bool debug: Whether to print intermediate results. + :return: Imputed alleles in the query haplotype. + :rtype: numpy.ndarray + """ + assert ref_h.shape[0] == len(pos) + assert len(query_h) == len(pos) + # Indices of genotyped markers in the query haplotype + genotyped_pos_idx = np.where(query_h != -1)[0] + assert len(genotyped_pos_idx) > 0 + # Indices of imputed markers in the query haplotype + imputed_pos_idx = np.where(query_h == -1)[0] + assert len(imputed_pos_idx) > 0 + # Site positions of genotyped markers + genotyped_pos = pos[genotyped_pos_idx] + # Site positions of imputed markers + imputed_pos = pos[imputed_pos_idx] + h = ref_h.shape[1] + m = len(genotyped_pos) + x = len(imputed_pos) + assert m + x == len(pos) + # Subset the reference haplotypes to genotyped markers + ref_h_genotyped = ref_h[genotyped_pos_idx, :] + if debug: + print("Reference haplotypes subsetted to genotyped markers") + print(ref_h_genotyped) + assert ref_h_genotyped.shape == (m, h) + # Subset the query haplotype to genotyped markers + query_h_genotyped = query_h[genotyped_pos_idx] + if debug: + print("Query haplotype subsetted to genotyped markers") + print(query_h_genotyped) + assert len(query_h_genotyped) == m + # Set mismatch probabilities at genotyped markers + mu = get_mismatch_prob(genotyped_pos, miscall_rate=miscall_rate) + if debug: + print("Mismatch probabilities") + print(mu) + assert len(mu) == m + # Set switch probabilities at genotyped markers + rho = get_switch_prob(genotyped_pos, h, ne=ne) + if debug: + print("Switch probabilities") + print(rho) + assert len(rho) == m + # Compute forward probability matrix at genotyped markers + fm = compute_forward_probability_matrix(ref_h_genotyped, query_h_genotyped, rho, mu) + if debug: + print("Forward probability matrix") + print(fm) + assert fm.shape == (m, h) + # Compute backward probability matrix at genotyped markers + bm = compute_backward_probability_matrix( + ref_h_genotyped, query_h_genotyped, rho, mu + ) + if debug: + print("Backward probability matrix") + print(bm) + assert bm.shape == (m, h) + # Compute HMM state probability matrix at genotyped markers + sm = compute_state_probability_matrix(fm, bm, ref_h_genotyped, query_h_genotyped) + if debug: + print("HMM state probability matrix") + print(sm) + assert sm.shape == (m, h) + # Subset the reference haplotypes to imputed markers + ref_h_imputed = ref_h[imputed_pos_idx, :] + # Interpolate allele probabilities at imputed markers + i_allele_probs = interpolate_allele_probabilities( + sm, ref_h_imputed, genotyped_pos, imputed_pos + ) + if debug: + print("Interpolated allele probabilities") + print(i_allele_probs) + assert i_allele_probs.shape == (x, 2) + # Get MAP alleles at imputed markers + imputed_alleles = get_map_alleles(i_allele_probs) + assert len(imputed_alleles) == x + return imputed_alleles diff --git a/python/tests/test_imputation.py b/python/tests/test_imputation.py new file mode 100644 index 0000000000..3fe69afe8a --- /dev/null +++ b/python/tests/test_imputation.py @@ -0,0 +1,558 @@ +""" +Tests for genotype imputation (forward and Baum-Welsh algorithms). +""" +import io + +import numpy as np + +import _tskit +import tskit +from tests.beagle import run_beagle + + +# A toy tree sequence containing 3 diploid individuals with 5 sites and 5 mutations. +# Two toy query haplotypes are targets for imputation. + +toy_ts_nodes_text = """\ +id is_sample time population individual metadata +0 1 0.000000 0 0 +1 1 0.000000 0 0 +2 1 0.000000 0 1 +3 1 0.000000 0 1 +4 1 0.000000 0 2 +5 1 0.000000 0 2 +6 0 0.029768 0 -1 +7 0 0.133017 0 -1 +8 0 0.223233 0 -1 +9 0 0.651586 0 -1 +10 0 0.698831 0 -1 +11 0 2.114867 0 -1 +12 0 4.322031 0 -1 +13 0 7.432311 0 -1 +""" + +toy_ts_edges_text = """\ +left right parent child metadata +0.000000 1000000.000000 6 0 +0.000000 1000000.000000 6 3 +0.000000 1000000.000000 7 2 +0.000000 1000000.000000 7 5 +0.000000 1000000.000000 8 1 +0.000000 1000000.000000 8 4 +0.000000 781157.000000 9 6 +0.000000 781157.000000 9 7 +0.000000 505438.000000 10 8 +0.000000 505438.000000 10 9 +505438.000000 549484.000000 11 8 +505438.000000 549484.000000 11 9 +781157.000000 1000000.000000 12 6 +781157.000000 1000000.000000 12 7 +549484.000000 1000000.000000 13 8 +549484.000000 781157.000000 13 9 +781157.000000 1000000.000000 13 12 +""" + +toy_ts_sites_text = """\ +position ancestral_state metadata +200000.000000 A +300000.000000 C +520000.000000 G +600000.000000 T +900000.000000 A +""" + +toy_ts_mutations_text = """\ +site node time derived_state parent metadata +0 9 unknown G -1 +1 8 unknown A -1 +2 9 unknown T -1 +3 9 unknown C -1 +4 12 unknown C -1 +""" + +toy_ts_individuals_text = """\ +flags +0 +0 +0 +""" + +toy_query_haplotypes_01 = np.array( + [ + [ + 0, + 1, + -1, + 1, + 1, + ], + [ + 0, + 1, + -1, + 0, + 1, + ], + ], + dtype=np.int32, +) + +toy_query_haplotypes_ACGT = np.array( + [ + [0, 0, -1, 2, 2], # AACC + [0, 0, -1, 3, 2], # AATC + ], + dtype=np.int32, +) + + +def get_toy_data(): + ref_ts = tskit.load_text( + nodes=io.StringIO(toy_ts_nodes_text), + edges=io.StringIO(toy_ts_edges_text), + sites=io.StringIO(toy_ts_sites_text), + mutations=io.StringIO(toy_ts_mutations_text), + individuals=io.StringIO(toy_ts_individuals_text), + strict=False, + ) + query_h = toy_query_haplotypes_ACGT + return [ref_ts, query_h] + + +def get_tskit_forward_backward_matrices(ts, h): + m = ts.num_sites + fm = _tskit.CompressedMatrix(ts._ll_tree_sequence) + bm = _tskit.CompressedMatrix(ts._ll_tree_sequence) + ls_hmm = _tskit.LsHmm( + ts._ll_tree_sequence, np.zeros(m) + 0.1, np.zeros(m) + 0.1, acgt_alleles=True + ) + ls_hmm.forward_matrix(h, fm) + ls_hmm.backward_matrix(h, fm.normalisation_factor, bm) + return [fm.decode(), bm.decode()] + + +# BEAGLE 4.1 was run on the toy data set above using default parameters. +# +# In the query VCF, the site at position 520,000 was redacted and then imputed. +# Note that the ancestral allele in the simulated tree sequence is +# treated as the REF in the VCFs. +# +# Three matrices were printed and kept below: +# 1. Forward probability matrix +# 2. Backward probability matrix +# 3. HMM state probability matrix +# +# There are two sets of matrices, one for each query haplotype. +# +# Notes about calculations: +# h = index of reference haplotype +# H = number of reference haplotypes +# m = index of genotyped marker +# M = number of genotyped markers +# +# In forward probability matrix, +# fwd[m][h] = emission prob., if m = 0 (first marker) +# fwd[m][h] = emission prob. * (scale * fwd[m - 1][h] + shift), otherwise +# +# where scale = (1 - switch prob.)/sum of fwd[m - 1], +# and shift = switch prob. / H. +# +# In backward probability matrix, +# bwd[m][h] = 1/n, if m = M - 1 (last marker) +# unadj. bwd[m][h] = emission prob. / H +# bwd[m][h] = (unadj. bwd[m][h] + shift) * scale, otherwise +# +# where scale = (1 - switch prob.)/sum of unadj. bwd[m], +# and shift = switch prob. / H. +# +# For each site, the sum of backward values over all haplotypes was calculated +# before scaling and shifting. +# +# The matrices below were obtained by setting Ne to 10. +def parse_matrix(csv_text): + # This returns a record array, which is essentially the same as a + # pandas dataframe, which we can access via df["m"] etc. + return np.lib.npyio.recfromcsv(io.StringIO(csv_text)) + + +def test_toy_example(): + # Get toy dataset ready for tskit. + ref_ts, query = get_toy_data() + print(list(ref_ts.haplotypes())) + print(ref_ts) + print(query) + # tskit_fwd, tskit_bwd = get_tskit_forward_backward_matrices(ref_ts, query[0]) + # Get BEAGLE results for the toy dataset. + # beagle_fwd = parse_matrix(beagle_fwd_matrix_text_1) + # beagle_bwd = parse_matrix(beagle_bwd_matrix_text_1) + # print(beagle_fwd) + # print(beagle_bwd) + # Run a trivial test case using Python implementation of BEAGLE. + ref_h = np.array( + [ + [ + 0, + 1, + 0, + 1, + 0, + 1, + ], + [ + 1, + 0, + 1, + 0, + 1, + 0, + ], + ] + ).T + query_h = np.array( + [ + -1, + 1, + -1, + 1, + -1, + 0, + ] + ) + assert ref_h.shape[0] == len(query_h) + pos = (np.arange(len(query_h)) + 1) * 1e4 + ia = run_beagle(ref_h, query_h, pos, miscall_rate=0.1, ne=10.0, debug=True) + print(ia) + + +# The following toy query haplotypes were imputed from the toy reference haplotypes +# using BEAGLE 4.1 with Ne set to 10. +toy_ref_0 = np.array( + [ + [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + ], # ref_0 + [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + ], # ref_0 + ] +).T +toy_query_0 = np.array( + [ + [ + -1, + 0, + -1, + 0, + -1, + 0, + -1, + 0, + -1, + ], # query_0 + [ + -1, + 0, + -1, + 0, + -1, + 0, + -1, + 0, + -1, + ], # query_0 + ] +) +assert toy_ref_0.shape == (9, 2) +assert toy_query_0.shape == (2, 9) +assert np.sum(toy_query_0[0] == 0) == 4 +assert np.sum(toy_query_0[0] == -1) == 5 +assert np.array_equal(toy_query_0[0], toy_query_0[1]) + +toy_ref_1 = np.array( + [ + [ + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + ], # ref_0 + [ + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + ], # ref_0 + ] +).T +toy_query_1 = np.array( + [ + [ + -1, + 1, + -1, + 1, + -1, + 1, + -1, + 1, + -1, + ], # query_0 + [ + -1, + 1, + -1, + 1, + -1, + 1, + -1, + 1, + -1, + ], # query_0 + ] +) +assert toy_ref_1.shape == (9, 2) +assert toy_query_1.shape == (2, 9) +assert np.sum(toy_query_1[0] == 1) == 4 +assert np.sum(toy_query_1[0] == -1) == 5 +assert np.array_equal(toy_query_1[0], toy_query_1[1]) + +toy_ref_2 = np.copy(toy_ref_0) +toy_query_2 = np.array( + [ + [ + 0, + 0, + -1, + 0, + -1, + 0, + -1, + 0, + 0, + ], # query_0 + [ + 0, + 0, + -1, + 0, + -1, + 0, + -1, + 0, + 0, + ], # query_0 + ] +) +assert toy_ref_2.shape == (9, 2) +assert toy_query_2.shape == (2, 9) +assert np.sum(toy_query_2[0] == 0) == 6 +assert np.sum(toy_query_2[0] == -1) == 3 +assert np.array_equal(toy_query_2[0], toy_query_2[1]) + +toy_ref_3 = np.array( + [ + [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + ], # ref_0 + [ + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + ], # ref_0 + ] +).T +toy_query_3 = np.array( + [ + [ + -1, + 0, + -1, + 0, + -1, + 0, + -1, + 0, + -1, + ], # query_0 + [ + -1, + 1, + -1, + 1, + -1, + 1, + -1, + 1, + -1, + ], # query_0 + ] +) +assert toy_ref_3.shape == (9, 2) +assert toy_query_3.shape == (2, 9) +assert np.sum(toy_query_3[0] == 0) == 4 +assert np.sum(toy_query_3[0] == -1) == 5 +assert np.sum(toy_query_3[1] == 1) == 4 +assert np.sum(toy_query_3[1] == -1) == 5 + +toy_ref_4 = np.array( + [ + [0, 1, 0, 1, 0, 1, 0, 1, 0], # ref_0 + [1, 0, 1, 0, 1, 0, 1, 0, 1], # ref_0 + ] +) +toy_query_4 = np.array( + [ + [ + -1, + 0, + -1, + 0, + -1, + 0, + -1, + 0, + -1, + ], # query_0 + [ + -1, + 1, + -1, + 1, + -1, + 1, + -1, + 1, + -1, + ], # query_0 + ] +) +assert toy_ref_4.shape == (9, 2) +assert toy_query_4.shape == (2, 9) +assert np.all(np.invert(np.equal(toy_query_4[0], toy_query_4[1]))) +assert np.sum(toy_query_4[0] == 0) == 4 +assert np.sum(toy_query_4[0] == -1) == 5 +assert np.sum(toy_query_4[1] == 1) == 4 +assert np.sum(toy_query_4[1] == -1) == 5 + +toy_ref_5 = np.array( + [ + [0, 0, 0, 0, 0, 0, 0, 0, 0], # ref_0 + [0, 0, 0, 0, 0, 0, 0, 0, 0], # ref_0 + [1, 1, 1, 1, 1, 1, 1, 1, 1], # ref_1 + [1, 1, 1, 1, 1, 1, 1, 1, 1], # ref_1 + ] +).T +toy_query_5 = np.array( + [ + [ + -1, + 0, + -1, + 1, + -1, + 1, + -1, + 0, + -1, + ], # query_0 + [ + -1, + 1, + -1, + 0, + -1, + 0, + -1, + 1, + -1, + ], # query_0 + ] +) +assert toy_ref_5.shape == (9, 4) +assert toy_query_5.shape == (2, 9) +assert np.sum(toy_ref_5[0] == 0) == 9 +assert np.sum(toy_ref_5[1] == 0) == 9 +assert np.sum(toy_ref_5[2] == 1) == 9 +assert np.sum(toy_ref_5[3] == 1) == 9 +assert np.sum(toy_query_5[0] == 0) == 2 +assert np.sum(toy_query_5[0] == 1) == 2 +assert np.sum(toy_query_5[0] == -1) == 5 +assert np.sum(toy_query_5[1] == 0) == 2 +assert np.sum(toy_query_5[1] == 1) == 2 +assert np.sum(toy_query_5[1] == -1) == 5 + +toy_ref_6 = np.copy(toy_ref_5) +toy_query_6 = np.array( + [ + [ + -1, + 0, + -1, + 0, + -1, + 1, + -1, + 1, + -1, + ], # query_0 + [ + -1, + 1, + -1, + 1, + -1, + 0, + -1, + 0, + -1, + ], # query_0 + ] +) +assert toy_ref_6.shape == (9, 4) +assert toy_query_6.shape == (2, 9) +assert np.sum(toy_query_6[0] == 0) == 2 +assert np.sum(toy_query_6[0] == 1) == 2 +assert np.sum(toy_query_6[0] == -1) == 5 +assert np.sum(toy_query_6[1] == 0) == 2 +assert np.sum(toy_query_6[1] == 1) == 2 +assert np.sum(toy_query_6[1] == -1) == 5