diff --git a/lshmm/api.py b/lshmm/api.py
index b147c78..5b88181 100644
--- a/lshmm/api.py
+++ b/lshmm/api.py
@@ -3,21 +3,15 @@
 import numpy as np
 
-from .forward_backward.fb_diploid_variants_samples import (
-    backward_ls_dip_loop,
-    forward_ls_dip_loop,
-)
-from .forward_backward.fb_haploid_variants_samples import (
-    backwards_ls_hap,
-    forwards_ls_hap,
-)
-from .vit_diploid_variants_samples import (
+from .forward_backward.fb_diploid import backward_ls_dip_loop, forward_ls_dip_loop
+from .forward_backward.fb_haploid import backwards_ls_hap, forwards_ls_hap
+from .vit_diploid import (
     backwards_viterbi_dip,
     forwards_viterbi_dip_low_mem,
     get_phased_path,
     path_ll_dip,
 )
-from .vit_haploid_variants_samples import (
+from .vit_haploid import (
     backwards_viterbi_hap,
     forwards_viterbi_hap_lower_mem_rescaling,
     path_ll_hap,
diff --git a/lshmm/forward_backward/fb_diploid_samples_variants.py b/lshmm/forward_backward/fb_diploid_samples_variants.py
deleted file mode 100644
index 4864cf3..0000000
--- a/lshmm/forward_backward/fb_diploid_samples_variants.py
+++ /dev/null
@@ -1,523 +0,0 @@
-"""
-Collection of functions to run forwards and backwards algorithms
-on diploid genotype data, where the data is structured as samples x
-variants.
-"""
-import numpy as np
-
-from lshmm import jit
-
-EQUAL_BOTH_HOM = 4
-UNEQUAL_BOTH_HOM = 0
-BOTH_HET = 7
-REF_HOM_OBS_HET = 1
-REF_HET_OBS_HOM = 2
-
-MISSING = -1
-MISSING_INDEX = 3
-
-# https://github.com/numba/numba/issues/1269
-@jit.numba_njit
-def np_apply_along_axis(func1d, axis, arr):
-    """Create numpy-like functions for max, sum etc."""
-    assert arr.ndim == 2
-    assert axis in [0, 1]
-    if axis == 0:
-        result = np.empty(arr.shape[1])
-        for i in range(len(result)):
-            result[i] = func1d(arr[:, i])
-    else:
-        result = np.empty(arr.shape[0])
-        for i in range(len(result)):
-            result[i] = func1d(arr[i, :])
-    return result
-
-
-@jit.numba_njit
-def np_amax(array, axis):
-    """Numba implementation of numpy vectorised maximum."""
-    return np_apply_along_axis(np.amax, axis, array)
-
-
-@jit.numba_njit
-def np_sum(array, axis):
-    """Numba implementation of numpy vectorised sum."""
-    return np_apply_along_axis(np.sum, axis, array)
-
-
-@jit.numba_njit
-def np_argmax(array, axis):
-    """Numba implementation of numpy vectorised argmax."""
-    return np_apply_along_axis(np.argmax, axis, array)
-
-
-@jit.numba_njit
-def forwards_ls_dip(n, m, G, s, e, r, norm=True):
-    """Matrix based diploid LS forward algorithm using numpy vectorisation."""
-    # Initialise the forward tensor
-    F = np.zeros((n, n, m))
-    F[:, :, 0] = 1 / (n ** 2)
-    c = np.ones(m)
-    r_n = r / n
-
-    if s[0, 0] == MISSING:
-        index = MISSING_INDEX * np.ones(
-            (n, n), dtype=np.int64
-        )  # We could have chosen anything here, this just implies a multiplication by a constant.
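-        # NB: `index` selects a row of the emission matrix e. At observed sites
-        # the encoding below is 4 * (G == s) + 2 * (G is het) + (s is het);
-        # MISSING_INDEX = 3 is the one otherwise-unreachable value, so a
-        # missing site just rescales the whole tensor by a constant.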
-    else:
-        index = 4 * np.equal(G[:, :, 0], s[0, 0]).astype(np.int64) + 2 * (
-            G[:, :, 0] == 1
-        ).astype(np.int64)
-        if s[0, 0] == 1:
-            index += 1
-
-    F[:, :, 0] *= e[index.ravel(), 0].reshape(n, n)
-
-    if norm:
-        c[0] = np.sum(F[:, :, 0])
-        F[:, :, 0] *= 1 / c[0]
-
-        # Forwards
-        for l in range(1, m):
-            if s[0, l] == MISSING:
-                index = MISSING_INDEX * np.ones((n, n), dtype=np.int64)
-            else:
-                index = 4 * np.equal(G[:, :, l], s[0, l]).astype(np.int64) + 2 * (
-                    G[:, :, l] == 1
-                ).astype(np.int64)
-
-                if s[0, l] == 1:
-                    index += 1
-
-            # No change in both
-            F[:, :, l] = (1 - r[l]) ** 2 * F[:, :, l - 1]
-
-            # Both change
-            F[:, :, l] += (r_n[l]) ** 2
-
-            # One changes
-            sum_j = np_sum(F[:, :, l - 1], 0).repeat(n).reshape((-1, n))
-            F[:, :, l] += ((1 - r[l]) * r_n[l]) * (sum_j + sum_j.T)
-
-            # Emission
-            F[:, :, l] *= e[index.ravel(), l].reshape(n, n)
-            c[l] = np.sum(F[:, :, l])
-            F[:, :, l] *= 1 / c[l]
-
-        ll = np.sum(np.log10(c))
-    else:
-        # Forwards
-        for l in range(1, m):
-            if s[0, l] == MISSING:
-                index = MISSING_INDEX * np.ones((n, n)).astype(np.int64)
-            else:
-                index = 4 * np.equal(G[:, :, l], s[0, l]).astype(np.int64) + 2 * (
-                    G[:, :, l] == 1
-                ).astype(np.int64)
-
-                if s[0, l] == 1:
-                    index += 1
-
-            # No change in both
-            F[:, :, l] = (1 - r[l]) ** 2 * F[:, :, l - 1]
-
-            # Both change
-            F[:, :, l] += (r_n[l]) ** 2 * np.sum(F[:, :, l - 1])
-
-            # One changes
-            sum_j = np_sum(F[:, :, l - 1], 0).repeat(n).reshape((-1, n)).T
-            F[:, :, l] += ((1 - r[l]) * r_n[l]) * (sum_j + sum_j.T)
-
-            # Emission
-            F[:, :, l] *= e[index.ravel(), l].reshape(n, n)
-
-        ll = np.log10(np.sum(F[:, :, l]))
-
-    return F, c, ll
-
-
-@jit.numba_njit
-def backwards_ls_dip(n, m, G, s, e, c, r):
-    """Matrix based diploid LS backward algorithm using numpy vectorisation."""
-    # Initialise the backward tensor
-    B = np.zeros((n, n, m))
-
-    # Initialise
-    B[:, :, m - 1] = 1
-    r_n = r / n
-
-    # Backwards
-    for l in range(m - 2, -1, -1):
-
-        if s[0, l + 1] == MISSING:
-            index = MISSING_INDEX * np.ones((n, n), dtype=np.int64).ravel()
-        else:
-            index = (
-                4 * np.equal(G[:, :, l + 1], s[0, l + 1]).astype(np.int64)
-                + 2 * (G[:, :, l + 1] == 1).astype(np.int64)
-                + np.int64(s[0, l + 1] == 1)
-            ).ravel()
-
-        # Both change
-        B[:, :, l] = r_n[l + 1] ** 2 * np.sum(
-            e[index, l + 1].reshape(n, n) * B[:, :, l + 1]
-        )
-
-        # No change in both
-        B[:, :, l] += (
-            (1 - r[l + 1]) ** 2 * B[:, :, l + 1] * e[index, l + 1].reshape(n, n)
-        )
-
-        sum_j = (
-            np_sum(B[:, :, l + 1] * e[index, l + 1].reshape(n, n), 0)
-            .repeat(n)
-            .reshape((-1, n))
-        )
-        B[:, :, l] += ((1 - r[l + 1]) * r_n[l + 1]) * (sum_j + sum_j.T)
-        B[:, :, l] *= 1 / c[l + 1]
-
-    return B
-
-
-@jit.numba_njit
-def forward_ls_dip_starting_point(n, m, G, s, e, r):
-    """Naive implementation of LS diploid forwards algorithm."""
-    # Initialise the forward tensor
-    F = np.zeros((n, n, m))
-    F[:, :, 0] = 1 / (n ** 2)
-    if s[0, 0] == MISSING:
-        index = MISSING_INDEX * np.ones((n, n), dtype=np.int64).ravel()
-    else:
-        index = (
-            4 * np.equal(G[:, :, 0], s[0, 0]).astype(np.int64)
-            + 2 * (G[:, :, 0] == 1).astype(np.int64)
-            + np.int64(s[0, 0] == 1)
-        ).ravel()
-
-    F[:, :, 0] *= e[index, 0].reshape((n, n))
-    r_n = r / n
-
-    for l in range(1, m):
-
-        # Determine the various components
-        F_no_change = np.zeros((n, n))
-        F_j1_change = np.zeros(n)
-        F_j2_change = np.zeros(n)
-        F_both_change = 0
-
-        for j1 in range(n):
-            for j2 in range(n):
-                F_no_change[j1, j2] = (1 - r[l]) ** 2 * F[j1, j2, l - 1]
-
-        for j1 in range(n):
-            for j2 in range(n):
-                F_both_change += r_n[l] ** 2 * F[j1, j2, l - 1]
-
-        for j1 in range(n):
-            for j2 in range(n):  # This is the variable to sum over - it changes
-                F_j2_change[j1] += (1 - r[l]) * r_n[l] * F[j1, j2, l - 1]
-
-        for j2 in range(n):
-            for j1 in range(n):  # This is the variable to sum over - it changes
-                F_j1_change[j2] += (1 - r[l]) * r_n[l] * F[j1, j2, l - 1]
-
-        F[:, :, l] = F_both_change
-
-        for j1 in range(n):
-            F[j1, :, l] += F_j2_change
-
-        for j2 in range(n):
-            F[:, j2, l] += F_j1_change
-
-        for j1 in range(n):
-            for j2 in range(n):
-                F[j1, j2, l] += F_no_change[j1, j2]
-
-        if s[0, l] == MISSING:
-            F[:, :, l] *= e[MISSING_INDEX, l]
-        else:
-            for j1 in range(n):
-                for j2 in range(n):
-                    # What is the emission?
-                    if s[0, l] == 1:
-                        # OBS is het
-                        if G[j1, j2, l] == 1:  # REF is het
-                            F[j1, j2, l] *= e[BOTH_HET, l]
-                        else:  # REF is hom
-                            F[j1, j2, l] *= e[REF_HOM_OBS_HET, l]
-                    else:
-                        # OBS is hom
-                        if G[j1, j2, l] == 1:  # REF is het
-                            F[j1, j2, l] *= e[REF_HET_OBS_HOM, l]
-                        else:  # REF is hom
-                            if G[j1, j2, l] == s[0, l]:  # Equal
-                                F[j1, j2, l] *= e[EQUAL_BOTH_HOM, l]
-                            else:  # Unequal
-                                F[j1, j2, l] *= e[UNEQUAL_BOTH_HOM, l]
-
-    ll = np.log10(np.sum(F[:, :, l]))
-    return F, ll
-
-
-@jit.numba_njit
-def backward_ls_dip_starting_point(n, m, G, s, e, r):
-    """Naive implementation of LS diploid backwards algorithm."""
-    # Backwards
-    B = np.zeros((n, n, m))
-
-    # Initialise
-    B[:, :, m - 1] = 1
-    r_n = r / n
-
-    for l in range(m - 2, -1, -1):
-
-        # Determine the various components
-        B_no_change = np.zeros((n, n))
-        B_j1_change = np.zeros(n)
-        B_j2_change = np.zeros(n)
-        B_both_change = 0
-
-        # Evaluate the emission matrix at this site, for all pairs
-        e_tmp = np.zeros((n, n))
-        if s[0, l + 1] == MISSING:
-            e_tmp[:, :] = e[MISSING_INDEX, l + 1]
-        else:
-            for j1 in range(n):
-                for j2 in range(n):
-                    # What is the emission?
-                    if s[0, l + 1] == 1:
-                        # OBS is het
-                        if G[j1, j2, l + 1] == 1:  # REF is het
-                            e_tmp[j1, j2] = e[BOTH_HET, l + 1]
-                        else:  # REF is hom
-                            e_tmp[j1, j2] = e[REF_HOM_OBS_HET, l + 1]
-                    else:
-                        # OBS is hom
-                        if G[j1, j2, l + 1] == 1:  # REF is het
-                            e_tmp[j1, j2] = e[REF_HET_OBS_HOM, l + 1]
-                        else:  # REF is hom
-                            if G[j1, j2, l + 1] == s[0, l + 1]:  # Equal
-                                e_tmp[j1, j2] = e[EQUAL_BOTH_HOM, l + 1]
-                            else:  # Unequal
-                                e_tmp[j1, j2] = e[UNEQUAL_BOTH_HOM, l + 1]
-
-        for j1 in range(n):
-            for j2 in range(n):
-                B_no_change[j1, j2] = (
-                    (1 - r[l + 1]) ** 2 * B[j1, j2, l + 1] * e_tmp[j1, j2]
-                )
-
-        for j1 in range(n):
-            for j2 in range(n):
-                B_both_change += r_n[l + 1] ** 2 * e_tmp[j1, j2] * B[j1, j2, l + 1]
-
-        for j1 in range(n):
-            for j2 in range(n):  # This is the variable to sum over - it changes
-                B_j2_change[j1] += (
-                    (1 - r[l + 1]) * r_n[l + 1] * B[j1, j2, l + 1] * e_tmp[j1, j2]
-                )
-
-        for j2 in range(n):
-            for j1 in range(n):  # This is the variable to sum over - it changes
-                B_j1_change[j2] += (
-                    (1 - r[l + 1]) * r_n[l + 1] * B[j1, j2, l + 1] * e_tmp[j1, j2]
-                )
-
-        B[:, :, l] = B_both_change
-
-        for j1 in range(n):
-            B[j1, :, l] += B_j2_change
-
-        for j2 in range(n):
-            B[:, j2, l] += B_j1_change
-
-        for j1 in range(n):
-            for j2 in range(n):
-                B[j1, j2, l] += B_no_change[j1, j2]
-
-    return B
-
-
-@jit.numba_njit
-def forward_ls_dip_loop(n, m, G, s, e, r, norm=True):
-    """LS diploid forwards algorithm without vectorisation."""
-    # Initialise the forward tensor
-    F = np.zeros((n, n, m))
-    F[:, :, 0] = 1 / (n ** 2)
-    r_n = r / n
-    c = np.ones(m)
-
-    if s[0, 0] == MISSING:
-        index = MISSING_INDEX * np.ones((n, n), dtype=np.int64)
-    else:
-        index = (
-            4 * np.equal(G[:, :, 0], s[0, 0]).astype(np.int64)
-            + 2 * (G[:, :, 0] == 1).astype(np.int64)
-            + np.int64(s[0, 0] == 1)
-        )
-    F[:, :, 0] *= e[index.ravel(), 0].reshape(n, n)
-
-    if norm:
-
-        c[0] = np.sum(F[:, :, 0])
-        F[:, :, 0] *= 1 / c[0]
-
-        for l in range(1, m):
-
-            # Determine the various components
-            F_no_change = np.zeros((n, n))
-            F_j_change = np.zeros(n)
-
-            for j1 in range(n):
-                for j2 in range(n):
-                    F_no_change[j1, j2] = (1 - r[l]) ** 2 * F[j1, j2, l - 1]
-                    F_j_change[j1] += (1 - r[l]) * r_n[l] * F[j2, j1, l - 1]
-
-            F[:, :, l] = r_n[l] ** 2
-
-            for j1 in range(n):
-                F[j1, :, l] += F_j_change
-                F[:, j1, l] += F_j_change
-                for j2 in range(n):
-                    F[j1, j2, l] += F_no_change[j1, j2]
-
-            if s[0, l] == MISSING:
-                F[:, :, l] *= e[MISSING_INDEX, l]
-            else:
-                for j1 in range(n):
-                    for j2 in range(n):
-                        # What is the emission?
-                        if s[0, l] == 1:
-                            # OBS is het
-                            if G[j1, j2, l] == 1:  # REF is het
-                                F[j1, j2, l] *= e[BOTH_HET, l]
-                            else:  # REF is hom
-                                F[j1, j2, l] *= e[REF_HOM_OBS_HET, l]
-                        else:
-                            # OBS is hom
-                            if G[j1, j2, l] == 1:  # REF is het
-                                F[j1, j2, l] *= e[REF_HET_OBS_HOM, l]
-                            else:  # REF is hom
-                                if G[j1, j2, l] == s[0, l]:  # Equal
-                                    F[j1, j2, l] *= e[EQUAL_BOTH_HOM, l]
-                                else:  # Unequal
-                                    F[j1, j2, l] *= e[UNEQUAL_BOTH_HOM, l]
-
-            c[l] = np.sum(F[:, :, l])
-            F[:, :, l] *= 1 / c[l]
-
-        ll = np.sum(np.log10(c))
-
-    else:
-
-        for l in range(1, m):
-
-            # Determine the various components
-            F_no_change = np.zeros((n, n))
-            F_j1_change = np.zeros(n)
-            F_j2_change = np.zeros(n)
-            F_both_change = 0
-
-            for j1 in range(n):
-                for j2 in range(n):
-                    F_no_change[j1, j2] = (1 - r[l]) ** 2 * F[j1, j2, l - 1]
-                    F_j1_change[j1] += (1 - r[l]) * r_n[l] * F[j2, j1, l - 1]
-                    F_j2_change[j1] += (1 - r[l]) * r_n[l] * F[j1, j2, l - 1]
-                    F_both_change += r_n[l] ** 2 * F[j1, j2, l - 1]
-
-            F[:, :, l] = F_both_change
-
-            for j1 in range(n):
-                F[j1, :, l] += F_j2_change
-                F[:, j1, l] += F_j1_change
-                for j2 in range(n):
-                    F[j1, j2, l] += F_no_change[j1, j2]
-
-            if s[0, l] == MISSING:
-                F[:, :, l] *= e[MISSING_INDEX, l]
-            else:
-                for j1 in range(n):
-                    for j2 in range(n):
-                        # What is the emission?
-                        if s[0, l] == 1:
-                            # OBS is het
-                            if G[j1, j2, l] == 1:  # REF is het
-                                F[j1, j2, l] *= e[BOTH_HET, l]
-                            else:  # REF is hom
-                                F[j1, j2, l] *= e[REF_HOM_OBS_HET, l]
-                        else:
-                            # OBS is hom
-                            if G[j1, j2, l] == 1:  # REF is het
-                                F[j1, j2, l] *= e[REF_HET_OBS_HOM, l]
-                            else:  # REF is hom
-                                if G[j1, j2, l] == s[0, l]:  # Equal
-                                    F[j1, j2, l] *= e[EQUAL_BOTH_HOM, l]
-                                else:  # Unequal
-                                    F[j1, j2, l] *= e[UNEQUAL_BOTH_HOM, l]
-
-        ll = np.log10(np.sum(F[:, :, l]))
-
-    return F, c, ll
-
-
-@jit.numba_njit
-def backward_ls_dip_loop(n, m, G, s, e, c, r):
-    """LS diploid backwards algorithm without vectorisation."""
-    # Initialise the backward tensor
-    B = np.zeros((n, n, m))
-    B[:, :, m - 1] = 1
-    r_n = r / n
-
-    for l in range(m - 2, -1, -1):
-
-        # Determine the various components
-        B_no_change = np.zeros((n, n))
-        B_j_change = np.zeros(n)
-        B_both_change = 0
-
-        # Evaluate the emission matrix at this site, for all pairs
-        e_tmp = np.zeros((n, n))
-        if s[0, l + 1] == MISSING:
-            e_tmp[:, :] = e[MISSING_INDEX, l + 1]
-        else:
-            for j1 in range(n):
-                for j2 in range(n):
-                    # What is the emission?
-                    if s[0, l + 1] == 1:
-                        # OBS is het
-                        if G[j1, j2, l + 1] == 1:  # REF is het
-                            e_tmp[j1, j2] = e[BOTH_HET, l + 1]
-                        else:  # REF is hom
-                            e_tmp[j1, j2] = e[REF_HOM_OBS_HET, l + 1]
-                    else:
-                        # OBS is hom
-                        if G[j1, j2, l + 1] == 1:  # REF is het
-                            e_tmp[j1, j2] = e[REF_HET_OBS_HOM, l + 1]
-                        else:  # REF is hom
-                            if G[j1, j2, l + 1] == s[0, l + 1]:  # Equal
-                                e_tmp[j1, j2] = e[EQUAL_BOTH_HOM, l + 1]
-                            else:  # Unequal
-                                e_tmp[j1, j2] = e[UNEQUAL_BOTH_HOM, l + 1]
-
-        for j1 in range(n):
-            for j2 in range(n):
-                B_no_change[j1, j2] = (
-                    (1 - r[l + 1]) ** 2 * B[j1, j2, l + 1] * e_tmp[j1, j2]
-                )
-                B_j_change[j1] += (
-                    (1 - r[l + 1]) * r_n[l + 1] * B[j1, j2, l + 1] * e_tmp[j1, j2]
-                )
-                B_both_change += r_n[l + 1] ** 2 * e_tmp[j1, j2] * B[j1, j2, l + 1]
-
-        B[:, :, l] = B_both_change
-
-        for j1 in range(n):
-            B[:, j1, l] += B_j_change
-            B[j1, :, l] += B_j_change
-
-            for j2 in range(n):
-                B[j1, j2, l] += B_no_change[j1, j2]
-
-        B[:, :, l] *= 1 / c[l + 1]
-
-    return B
diff --git a/lshmm/forward_backward/fb_diploid_variants_samples.py b/lshmm/forward_backward/fb_diploid_variants_samples.py
deleted file mode 100644
index 53d1b15..0000000
--- a/lshmm/forward_backward/fb_diploid_variants_samples.py
+++ /dev/null
@@ -1,524 +0,0 @@
-"""Collection of functions to run forwards and backwards algorithms on diploid genotype data, where the data is structured as variants x samples."""
-import numpy as np
-
-from lshmm import jit
-
-EQUAL_BOTH_HOM = 4
-UNEQUAL_BOTH_HOM = 0
-BOTH_HET = 7
-REF_HOM_OBS_HET = 1
-REF_HET_OBS_HOM = 2
-
-MISSING = -1
-MISSING_INDEX = 3
-
-# https://github.com/numba/numba/issues/1269
-@jit.numba_njit
-def np_apply_along_axis(func1d, axis, arr):
-    """Create numpy-like functions for max, sum etc."""
-    assert arr.ndim == 2
-    assert axis in [0, 1]
-    if axis == 0:
-        result = np.empty(arr.shape[1])
-        for i in range(len(result)):
-            result[i] = func1d(arr[:, i])
-    else:
-        result = np.empty(arr.shape[0])
-        for i in range(len(result)):
-            result[i] = func1d(arr[i, :])
-    return result
-
-
-@jit.numba_njit
-def np_amax(array, axis):
-    """Numba implementation of numpy vectorised maximum."""
-    return np_apply_along_axis(np.amax, axis, array)
-
-
-@jit.numba_njit
-def np_sum(array, axis):
-    """Numba implementation of numpy vectorised sum."""
-    return np_apply_along_axis(np.sum, axis, array)
-
-
-@jit.numba_njit
-def np_argmax(array, axis):
-    """Numba implementation of numpy vectorised argmax."""
-    return np_apply_along_axis(np.argmax, axis, array)
-
-
-def forwards_ls_dip(n, m, G, s, e, r, norm=True):
-    """Matrix based diploid LS forward algorithm using numpy vectorisation."""
-    # Initialise the forward tensor
-    F = np.zeros((m, n, n))
-    F[0, :, :] = 1 / (n ** 2)
-    c = np.ones(m)
-    r_n = r / n
-
-    if s[0, 0] == MISSING:
-        index = MISSING_INDEX * np.ones(
-            (n, n), dtype=np.int64
-        )  # We could have chosen anything here, this just implies a multiplication by a constant.
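-        # NB: this file is the variants-major twin of
-        # fb_diploid_samples_variants.py above: tensors are laid out as
-        # (m, n, n) rather than (n, n, m).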
- else: - index = 4 * np.equal(G[0, :, :], s[0, 0]).astype(np.int64) + 2 * ( - G[0, :, :] == 1 - ).astype(np.int64) - if s[0, 0] == 1: - index += 1 - - F[0, :, :] *= e[0, index] - - if norm: - c[0] = np.sum(F[0, :, :]) - F[0, :, :] *= 1 / c[0] - - # Forwards - for l in range(1, m): - if s[0, l] == MISSING: - index = MISSING_INDEX * np.ones((n, n), dtype=np.int64) - else: - index = 4 * np.equal(G[l, :, :], s[0, l]).astype(np.int64) + 2 * ( - G[l, :, :] == 1 - ).astype(np.int64) - - if s[0, l] == 1: - index += 1 - - # No change in both - F[l, :, :] = (1 - r[l]) ** 2 * F[l - 1, :, :] - - # Both change - F[l, :, :] += (r_n[l]) ** 2 - - # One changes - sum_j = np_sum(F[l - 1, :, :], 0).repeat(n).reshape((-1, n)).T - F[l, :, :] += ((1 - r[l]) * r_n[l]) * (sum_j + sum_j.T) - - # Emission - F[l, :, :] *= e[l, index] - c[l] = np.sum(F[l, :, :]) - F[l, :, :] *= 1 / c[l] - - ll = np.sum(np.log10(c)) - else: - # Forwards - for l in range(1, m): - if s[0, l] == MISSING: - index = MISSING_INDEX * np.ones((n, n), dtype=np.int64) - else: - index = 4 * np.equal(G[l, :, :], s[0, l]).astype(np.int64) + 2 * ( - G[l, :, :] == 1 - ).astype(np.int64) - - if s[0, l] == 1: - index += 1 - - # No change in both - F[l, :, :] = (1 - r[l]) ** 2 * F[l - 1, :, :] - - # Both change - F[l, :, :] += (r_n[l]) ** 2 * np.sum(F[l - 1, :, :]) - - # One changes - sum_j = np_sum(F[l - 1, :, :], 0).repeat(n).reshape((-1, n)).T - # sum_j2 = np_sum(F[l - 1, :, :], 1).repeat(n).reshape((-1, n)) - F[l, :, :] += ((1 - r[l]) * r_n[l]) * (sum_j + sum_j.T) - - # Emission - F[l, :, :] *= e[l, index] - - ll = np.log10(np.sum(F[l, :, :])) - - return F, c, ll - - -def backwards_ls_dip(n, m, G, s, e, c, r): - """Matrix based diploid LS backward algorithm using numpy vectorisation.""" - # Initialise the backward tensor - B = np.zeros((m, n, n)) - - # Initialise - B[m - 1, :, :] = 1 - r_n = r / n - - # Backwards - for l in range(m - 2, -1, -1): - - if s[0, l + 1] == MISSING: - index = MISSING_INDEX * np.ones( - (n, n), dtype=np.int64 - ) # We could have chosen anything here, this just implies a multiplication by a constant. 
- else: - index = ( - 4 * np.equal(G[l + 1, :, :], s[0, l + 1]).astype(np.int64) - + 2 * (G[l + 1, :, :] == 1).astype(np.int64) - + np.int64(s[0, l + 1] == 1) - ) - - # No change in both - B[l, :, :] = r_n[l + 1] ** 2 * np.sum( - e[l + 1, index.reshape((n, n))] * B[l + 1, :, :] - ) - - # Both change - B[l, :, :] += ( - (1 - r[l + 1]) ** 2 * B[l + 1, :, :] * e[l + 1, index.reshape((n, n))] - ) - - # One changes - sum_j = np_sum(B[l + 1, :, :] * e[l + 1, index], 0).repeat(n).reshape((-1, n)) - B[l, :, :] += ((1 - r[l + 1]) * r_n[l + 1]) * (sum_j + sum_j.T) - B[l, :, :] *= 1 / c[l + 1] - - return B - - -@jit.numba_njit -def forward_ls_dip_starting_point(n, m, G, s, e, r): - """Naive implementation of LS diploid forwards algorithm.""" - # Initialise the forward tensor - F = np.zeros((m, n, n)) - r_n = r / n - for j1 in range(n): - for j2 in range(n): - F[0, j1, j2] = 1 / (n ** 2) - if s[0, 0] == MISSING: - index_tmp = MISSING_INDEX - else: - index_tmp = ( - 4 * np.int64(np.equal(G[0, j1, j2], s[0, 0])) - + 2 * np.int64((G[0, j1, j2] == 1)) - + np.int64(s[0, 0] == 1) - ) - F[0, j1, j2] *= e[0, index_tmp] - - for l in range(1, m): - - # Determine the various components - F_no_change = np.zeros((n, n)) - F_j1_change = np.zeros(n) - F_j2_change = np.zeros(n) - F_both_change = 0 - - for j1 in range(n): - for j2 in range(n): - F_no_change[j1, j2] = (1 - r[l]) ** 2 * F[l - 1, j1, j2] - - for j1 in range(n): - for j2 in range(n): - F_both_change += r_n[l] ** 2 * F[l - 1, j1, j2] - - for j1 in range(n): - for j2 in range(n): # This is the variable to sum over - it changes - F_j2_change[j1] += (1 - r[l]) * r_n[l] * F[l - 1, j1, j2] - - for j2 in range(n): - for j1 in range(n): # This is the variable to sum over - it changes - F_j1_change[j2] += (1 - r[l]) * r_n[l] * F[l - 1, j1, j2] - - F[l, :, :] = F_both_change - - for j1 in range(n): - F[l, j1, :] += F_j2_change - - for j2 in range(n): - F[l, :, j2] += F_j1_change - - for j1 in range(n): - for j2 in range(n): - F[l, j1, j2] += F_no_change[j1, j2] - - for j1 in range(n): - for j2 in range(n): - # What is the emission? - if s[0, l] == MISSING: - F[l, j1, j2] *= e[l, MISSING_INDEX] - else: - if s[0, l] == 1: - # OBS is het - if G[l, j1, j2] == 1: # REF is het - F[l, j1, j2] *= e[l, BOTH_HET] - else: # REF is hom - F[l, j1, j2] *= e[l, REF_HOM_OBS_HET] - else: - # OBS is hom - if G[l, j1, j2] == 1: # REF is het - F[l, j1, j2] *= e[l, REF_HET_OBS_HOM] - else: # REF is hom - if G[l, j1, j2] == s[0, l]: # Equal - F[l, j1, j2] *= e[l, EQUAL_BOTH_HOM] - else: # Unequal - F[l, j1, j2] *= e[l, UNEQUAL_BOTH_HOM] - - ll = np.log10(np.sum(F[l, :, :])) - - return F, ll - - -@jit.numba_njit -def backward_ls_dip_starting_point(n, m, G, s, e, r): - """Naive implementation of LS diploid backwards algorithm.""" - # Backwards - B = np.zeros((m, n, n)) - - # Initialise - B[m - 1, :, :] = 1 - r_n = r / n - - for l in range(m - 2, -1, -1): - - # Determine the various components - B_no_change = np.zeros((n, n)) - B_j1_change = np.zeros(n) - B_j2_change = np.zeros(n) - B_both_change = 0 - - # Evaluate the emission matrix at this site, for all pairs - e_tmp = np.zeros((n, n)) - if s[0, l + 1] == MISSING: - e_tmp[:, :] = e[l + 1, MISSING_INDEX] - else: - for j1 in range(n): - for j2 in range(n): - # What is the emission? 
- if s[0, l + 1] == 1: - # OBS is het - if G[l + 1, j1, j2] == 1: # REF is het - e_tmp[j1, j2] = e[l + 1, BOTH_HET] - else: # REF is hom - e_tmp[j1, j2] = e[l + 1, REF_HOM_OBS_HET] - else: - # OBS is hom - if G[l + 1, j1, j2] == 1: # REF is het - e_tmp[j1, j2] = e[l + 1, REF_HET_OBS_HOM] - else: # REF is hom - if G[l + 1, j1, j2] == s[0, l + 1]: # Equal - e_tmp[j1, j2] = e[l + 1, EQUAL_BOTH_HOM] - else: # Unequal - e_tmp[j1, j2] = e[l + 1, UNEQUAL_BOTH_HOM] - - for j1 in range(n): - for j2 in range(n): - B_no_change[j1, j2] = ( - (1 - r[l + 1]) ** 2 * B[l + 1, j1, j2] * e_tmp[j1, j2] - ) - - for j1 in range(n): - for j2 in range(n): - B_both_change += r_n[l + 1] ** 2 * e_tmp[j1, j2] * B[l + 1, j1, j2] - - for j1 in range(n): - for j2 in range(n): # This is the variable to sum over - it changes - B_j2_change[j1] += ( - (1 - r[l + 1]) * r_n[l + 1] * B[l + 1, j1, j2] * e_tmp[j1, j2] - ) - - for j2 in range(n): - for j1 in range(n): # This is the variable to sum over - it changes - B_j1_change[j2] += ( - (1 - r[l + 1]) * r_n[l + 1] * B[l + 1, j1, j2] * e_tmp[j1, j2] - ) - - B[l, :, :] = B_both_change - - for j1 in range(n): - B[l, j1, :] += B_j2_change - - for j2 in range(n): - B[l, :, j2] += B_j1_change - - for j1 in range(n): - for j2 in range(n): - B[l, j1, j2] += B_no_change[j1, j2] - - return B - - -@jit.numba_njit -def forward_ls_dip_loop(n, m, G, s, e, r, norm=True): - """LS diploid forwards algoritm without vectorisation.""" - # Initialise the forward tensor - F = np.zeros((m, n, n)) - for j1 in range(n): - for j2 in range(n): - F[0, j1, j2] = 1 / (n ** 2) - if s[0, 0] == MISSING: - index_tmp = MISSING_INDEX - else: - index_tmp = ( - 4 * np.int64(np.equal(G[0, j1, j2], s[0, 0])) - + 2 * np.int64((G[0, j1, j2] == 1)) - + np.int64(s[0, 0] == 1) - ) - F[0, j1, j2] *= e[0, index_tmp] - r_n = r / n - c = np.ones(m) - - if norm: - - c[0] = np.sum(F[0, :, :]) - F[0, :, :] *= 1 / c[0] - - for l in range(1, m): - - # Determine the various components - F_no_change = np.zeros((n, n)) - F_j_change = np.zeros(n) - - for j1 in range(n): - for j2 in range(n): - F_no_change[j1, j2] = (1 - r[l]) ** 2 * F[l - 1, j1, j2] - F_j_change[j1] += (1 - r[l]) * r_n[l] * F[l - 1, j2, j1] - - F[l, :, :] = r_n[l] ** 2 - - for j1 in range(n): - F[l, j1, :] += F_j_change - F[l, :, j1] += F_j_change - for j2 in range(n): - F[l, j1, j2] += F_no_change[j1, j2] - - if s[0, l] == MISSING: - F[l, :, :] *= e[l, MISSING_INDEX] - else: - for j1 in range(n): - for j2 in range(n): - # What is the emission? 
- if s[0, l] == 1: - # OBS is het - if G[l, j1, j2] == 1: # REF is het - F[l, j1, j2] *= e[l, BOTH_HET] - else: # REF is hom - F[l, j1, j2] *= e[l, REF_HOM_OBS_HET] - else: - # OBS is hom - if G[l, j1, j2] == 1: # REF is het - F[l, j1, j2] *= e[l, REF_HET_OBS_HOM] - else: # REF is hom - if G[l, j1, j2] == s[0, l]: # Equal - F[l, j1, j2] *= e[l, EQUAL_BOTH_HOM] - else: # Unequal - F[l, j1, j2] *= e[l, UNEQUAL_BOTH_HOM] - - c[l] = np.sum(F[l, :, :]) - F[l, :, :] *= 1 / c[l] - - ll = np.sum(np.log10(c)) - - else: - - for l in range(1, m): - - # Determine the various components - F_no_change = np.zeros((n, n)) - F_j1_change = np.zeros(n) - F_j2_change = np.zeros(n) - F_both_change = 0 - - for j1 in range(n): - for j2 in range(n): - F_no_change[j1, j2] = (1 - r[l]) ** 2 * F[l - 1, j1, j2] - F_j1_change[j1] += (1 - r[l]) * r_n[l] * F[l - 1, j2, j1] - F_j2_change[j1] += (1 - r[l]) * r_n[l] * F[l - 1, j1, j2] - F_both_change += r_n[l] ** 2 * F[l - 1, j1, j2] - - F[l, :, :] = F_both_change - - for j1 in range(n): - F[l, j1, :] += F_j2_change - F[l, :, j1] += F_j1_change - for j2 in range(n): - F[l, j1, j2] += F_no_change[j1, j2] - - if s[0, l] == MISSING: - F[l, :, :] *= e[l, MISSING_INDEX] - else: - for j1 in range(n): - for j2 in range(n): - # What is the emission? - if s[0, l] == 1: - # OBS is het - if G[l, j1, j2] == 1: # REF is het - F[l, j1, j2] *= e[l, BOTH_HET] - else: # REF is hom - F[l, j1, j2] *= e[l, REF_HOM_OBS_HET] - else: - # OBS is hom - if G[l, j1, j2] == 1: # REF is het - F[l, j1, j2] *= e[l, REF_HET_OBS_HOM] - else: # REF is hom - if G[l, j1, j2] == s[0, l]: # Equal - F[l, j1, j2] *= e[l, EQUAL_BOTH_HOM] - else: # Unequal - F[l, j1, j2] *= e[l, UNEQUAL_BOTH_HOM] - - ll = np.log10(np.sum(F[l, :, :])) - - return F, c, ll - - -@jit.numba_njit -def backward_ls_dip_loop(n, m, G, s, e, c, r): - """LS diploid backwards algoritm without vectorisation.""" - # Initialise the backward tensor - B = np.zeros((m, n, n)) - B[m - 1, :, :] = 1 - r_n = r / n - - for l in range(m - 2, -1, -1): - - # Determine the various components - B_no_change = np.zeros((n, n)) - B_j1_change = np.zeros(n) - B_j2_change = np.zeros(n) - B_both_change = 0 - - # Evaluate the emission matrix at this site, for all pairs - e_tmp = np.zeros((n, n)) - if s[0, l + 1] == MISSING: - e_tmp[:, :] = e[l + 1, MISSING_INDEX] - else: - for j1 in range(n): - for j2 in range(n): - # What is the emission? 
- # else: - if s[0, l + 1] == 1: - # OBS is het - if G[l + 1, j1, j2] == 1: # REF is het - e_tmp[j1, j2] = e[l + 1, BOTH_HET] - else: # REF is hom - e_tmp[j1, j2] = e[l + 1, REF_HOM_OBS_HET] - else: - # OBS is hom - if G[l + 1, j1, j2] == 1: # REF is het - e_tmp[j1, j2] = e[l + 1, REF_HET_OBS_HOM] - else: # REF is hom - if G[l + 1, j1, j2] == s[0, l + 1]: # Equal - e_tmp[j1, j2] = e[l + 1, EQUAL_BOTH_HOM] - else: # Unequal - e_tmp[j1, j2] = e[l + 1, UNEQUAL_BOTH_HOM] - - for j1 in range(n): - for j2 in range(n): - B_no_change[j1, j2] = ( - (1 - r[l + 1]) ** 2 * B[l + 1, j1, j2] * e_tmp[j1, j2] - ) - B_j2_change[j1] += ( - (1 - r[l + 1]) * r_n[l + 1] * B[l + 1, j1, j2] * e_tmp[j1, j2] - ) - B_j1_change[j1] += ( - (1 - r[l + 1]) * r_n[l + 1] * B[l + 1, j2, j1] * e_tmp[j2, j1] - ) - B_both_change += r_n[l + 1] ** 2 * e_tmp[j1, j2] * B[l + 1, j1, j2] - - B[l, :, :] = B_both_change - - for j1 in range(n): - B[l, j1, :] += B_j2_change - B[l, :, j1] += B_j1_change - for j2 in range(n): - B[l, j1, j2] += B_no_change[j1, j2] - - B[l, :, :] *= 1 / c[l + 1] - - return B diff --git a/lshmm/forward_backward/fb_haploid_samples_variants.py b/lshmm/forward_backward/fb_haploid_samples_variants.py deleted file mode 100644 index 9652d3e..0000000 --- a/lshmm/forward_backward/fb_haploid_samples_variants.py +++ /dev/null @@ -1,91 +0,0 @@ -"""Collection of functions to run forwards and backwards algorithms on haploid genotype data, where the data is structured as samples x variants.""" -import numpy as np - -from lshmm import jit - -MISSING = -1 - - -@jit.numba_njit -def forwards_ls_hap(n, m, H, s, e, r, norm=True): - """Matrix based haploid LS forward algorithm using numpy vectorisation.""" - # Initialise - F = np.zeros((n, m)) - r_n = r / n - - if norm: - - c = np.zeros(m) - for i in range(n): - F[i, 0] = ( - 1 / n * e[np.int64(np.equal(H[i, 0], s[0, 0]) or s[0, 0] == MISSING), 0] - ) - c[0] += F[i, 0] - - for i in range(n): - F[i, 0] *= 1 / c[0] - - # Forwards pass - for l in range(1, m): - for i in range(n): - F[i, l] = F[i, l - 1] * (1 - r[l]) + r_n[l] - F[i, l] *= e[ - np.int64(np.equal(H[i, l], s[0, l]) or s[0, l] == MISSING), l - ] - c[l] += F[i, l] - - for i in range(n): - F[i, l] *= 1 / c[l] - - ll = np.sum(np.log10(c)) - - else: - c = np.ones(m) - - for i in range(n): - F[i, 0] = ( - 1 / n * e[np.int64(np.equal(H[i, 0], s[0, 0]) or s[0, 0] == MISSING), 0] - ) - # Forwards pass - for l in range(1, m): - for i in range(n): - F[i, l] = F[i, l - 1] * (1 - r[l]) + np.sum(F[:, l - 1]) * r_n[l] - F[i, l] *= e[ - np.int64(np.equal(H[i, l], s[0, l]) or s[0, l] == MISSING), l - ] - - ll = np.log10(np.sum(F[:, m - 1])) - - return F, c, ll - - -@jit.numba_njit -def backwards_ls_hap(n, m, H, s, e, c, r): - """Matrix based haploid LS backward algorithm using numpy vectorisation.""" - # Initialise - B = np.zeros((n, m)) - for i in range(n): - B[i, m - 1] = 1 - r_n = r / n - - # Backwards pass - for l in range(m - 2, -1, -1): - tmp_B = np.zeros(n) - tmp_B_sum = 0 - for i in range(n): - tmp_B[i] = ( - e[ - np.int64( - np.equal(H[i, l + 1], s[0, l + 1]) or s[0, l + 1] == MISSING - ), - l + 1, - ] - * B[i, l + 1] - ) - tmp_B_sum += tmp_B[i] - for i in range(n): - B[i, l] = r_n[l + 1] * tmp_B_sum - B[i, l] += (1 - r[l + 1]) * tmp_B[i] - B[i, l] *= 1 / c[l + 1] - - return B diff --git a/lshmm/forward_backward/fb_haploid_variants_samples.py b/lshmm/forward_backward/fb_haploid_variants_samples.py deleted file mode 100644 index d0f03df..0000000 --- a/lshmm/forward_backward/fb_haploid_variants_samples.py +++ /dev/null @@ 
-1,93 +0,0 @@ -"""Collection of functions to run forwards and backwards algorithms on haploid genotype data, where the data is structured as variants x samples.""" -import numpy as np - -from lshmm import jit - -MISSING = -1 - - -@jit.numba_njit -def forwards_ls_hap(n, m, H, s, e, r, norm=True): - """Matrix based haploid LS forward algorithm using numpy vectorisation.""" - # Initialise - F = np.zeros((m, n)) - r_n = r / n - - if norm: - - c = np.zeros(m) - for i in range(n): - F[0, i] = ( - 1 / n * e[0, np.int64(np.equal(H[0, i], s[0, 0]) or s[0, 0] == MISSING)] - ) - c[0] += F[0, i] - - for i in range(n): - F[0, i] *= 1 / c[0] - - # Forwards pass - for l in range(1, m): - for i in range(n): - F[l, i] = F[l - 1, i] * (1 - r[l]) + r_n[l] - F[l, i] *= e[ - l, np.int64(np.equal(H[l, i], s[0, l]) or s[0, l] == MISSING) - ] - c[l] += F[l, i] - - for i in range(n): - F[l, i] *= 1 / c[l] - - ll = np.sum(np.log10(c)) - - else: - - c = np.ones(m) - - for i in range(n): - F[0, i] = ( - 1 / n * e[0, np.int64(np.equal(H[0, i], s[0, 0]) or s[0, 0] == MISSING)] - ) - - # Forwards pass - for l in range(1, m): - for i in range(n): - F[l, i] = F[l - 1, i] * (1 - r[l]) + np.sum(F[l - 1, :]) * r_n[l] - F[l, i] *= e[ - l, np.int64(np.equal(H[l, i], s[0, l]) or s[0, l] == MISSING) - ] - - ll = np.log10(np.sum(F[m - 1, :])) - - return F, c, ll - - -@jit.numba_njit -def backwards_ls_hap(n, m, H, s, e, c, r): - """Matrix based haploid LS backward algorithm using numpy vectorisation.""" - # Initialise - B = np.zeros((m, n)) - for i in range(n): - B[m - 1, i] = 1 - r_n = r / n - - # Backwards pass - for l in range(m - 2, -1, -1): - tmp_B = np.zeros(n) - tmp_B_sum = 0 - for i in range(n): - tmp_B[i] = ( - e[ - l + 1, - np.int64( - np.equal(H[l + 1, i], s[0, l + 1]) or s[0, l + 1] == MISSING - ), - ] - * B[l + 1, i] - ) - tmp_B_sum += tmp_B[i] - for i in range(n): - B[l, i] = r_n[l + 1] * tmp_B_sum - B[l, i] += (1 - r[l + 1]) * tmp_B[i] - B[l, i] *= 1 / c[l + 1] - - return B diff --git a/lshmm/vit_diploid_samples_variants.py b/lshmm/vit_diploid_samples_variants.py deleted file mode 100644 index 624a144..0000000 --- a/lshmm/vit_diploid_samples_variants.py +++ /dev/null @@ -1,411 +0,0 @@ -"""Collection of functions to run Viterbi algorithms on dipoid genotype data, where the data is structured as samples x variants.""" -import numpy as np - -from . 
import jit - -MISSING = -1 -MISSING_INDEX = 3 - -# https://github.com/numba/numba/issues/1269 -@jit.numba_njit -def np_apply_along_axis(func1d, axis, arr): - """Create numpy-like functions for max, sum etc.""" - assert arr.ndim == 2 - assert axis in [0, 1] - if axis == 0: - result = np.empty(arr.shape[1]) - for i in range(len(result)): - result[i] = func1d(arr[:, i]) - else: - result = np.empty(arr.shape[0]) - for i in range(len(result)): - result[i] = func1d(arr[i, :]) - return result - - -@jit.numba_njit -def np_amax(array, axis): - """Numba implementation of numpy vectorised maximum.""" - return np_apply_along_axis(np.amax, axis, array) - - -@jit.numba_njit -def np_sum(array, axis): - """Numba implementation of numpy vectorised sum.""" - return np_apply_along_axis(np.sum, axis, array) - - -@jit.numba_njit -def np_argmax(array, axis): - """Numba implementation of numpy vectorised argmax.""" - return np_apply_along_axis(np.argmax, axis, array) - - -@jit.numba_njit -def forwards_viterbi_dip_naive(n, m, G, s, e, r): - """Naive implementation of LS diploid Viterbi algorithm.""" - # Initialise - V = np.zeros((n, n, m)) - P = np.zeros((n, n, m)).astype(np.int64) - c = np.ones(m) - - if s[0, 0] == MISSING: - index = MISSING_INDEX * np.ones((n, n), dtype=np.int64) - # index = EQUAL_BOTH_HOM * np.ones((n, n), dtype=np.int64) + ( - # BOTH_HET - EQUAL_BOTH_HOM - # ) * (G[:, :, 0] == 1).astype(np.int64) - else: - index = ( - 4 * np.equal(G[:, :, 0], s[0, 0]).astype(np.int64) - + 2 * (G[:, :, 0] == 1).astype(np.int64) - + np.int64(s[0, 0] == 1) - ) - - V[:, :, 0] = 1 / (n ** 2) * e[index.ravel(), 0].reshape(n, n) - r_n = r / n - - for l in range(1, m): - - if s[0, l] == MISSING: - index = MISSING_INDEX * np.ones((n, n), dtype=np.int64) - else: - index = ( - 4 * np.equal(G[:, :, l], s[0, l]).astype(np.int64) - + 2 * (G[:, :, l] == 1).astype(np.int64) - + np.int64(s[0, l] == 1) - ) - - for j1 in range(n): - for j2 in range(n): - # Get the vector to maximise over - v = np.zeros((n, n)) - for k1 in range(n): - for k2 in range(n): - v[k1, k2] = V[k1, k2, l - 1] - if (k1 == j1) and (k2 == j2): - v[k1, k2] *= ( - (1 - r[l]) ** 2 + 2 * (1 - r[l]) * r_n[l] + r_n[l] ** 2 - ) - elif (k1 == j1) or (k2 == j2): - v[k1, k2] *= r_n[l] * (1 - r[l]) + r_n[l] ** 2 - else: - v[k1, k2] *= r_n[l] ** 2 - V[j1, j2, l] = np.amax(v) * e[index[j1, j2], l] - P[j1, j2, l] = np.argmax(v) - c[l] = np.amax(V[:, :, l]) - V[:, :, l] *= 1 / c[l] - - ll = np.sum(np.log10(c)) - - return V, P, ll - - -@jit.numba_njit -def forwards_viterbi_dip_naive_low_mem(n, m, G, s, e, r): - """Naive implementation of LS diploid Viterbi algorithm, with reduced memory.""" - # Initialise - V = np.zeros((n, n)) - P = np.zeros((n, n, m)).astype(np.int64) - c = np.ones(m) - - if s[0, 0] == MISSING: - index = MISSING_INDEX * np.ones((n, n), dtype=np.int64) - else: - index = ( - 4 * np.equal(G[:, :, 0], s[0, 0]).astype(np.int64) - + 2 * (G[:, :, 0] == 1).astype(np.int64) - + np.int64(s[0, 0] == 1) - ) - - V_previous = 1 / (n ** 2) * e[index.ravel(), 0].reshape(n, n) - r_n = r / n - - # Take a look at Haploid Viterbi implementation in Jeromes code and see if we can pinch some ideas. - # Diploid Viterbi, with smaller memory footprint. 
- for l in range(1, m): - - if s[0, l] == MISSING: - index = MISSING_INDEX * np.ones((n, n), dtype=np.int64) - else: - index = ( - 4 * np.equal(G[:, :, l], s[0, l]).astype(np.int64) - + 2 * (G[:, :, l] == 1).astype(np.int64) - + np.int64(s[0, l] == 1) - ) - - for j1 in range(n): - for j2 in range(n): - # Get the vector to maximise over - v = np.zeros((n, n)) - for k1 in range(n): - for k2 in range(n): - v[k1, k2] = V_previous[k1, k2] - if (k1 == j1) and (k2 == j2): - v[k1, k2] *= ( - (1 - r[l]) ** 2 + 2 * (1 - r[l]) * r_n[l] + r_n[l] ** 2 - ) - elif (k1 == j1) or (k2 == j2): - v[k1, k2] *= r_n[l] * (1 - r[l]) + r_n[l] ** 2 - else: - v[k1, k2] *= r_n[l] ** 2 - V[j1, j2] = np.amax(v) * e[index[j1, j2], l] - P[j1, j2, l] = np.argmax(v) - c[l] = np.amax(V) - V_previous = np.copy(V) / c[l] - - ll = np.sum(np.log10(c)) - - return V, P, ll - - -@jit.numba_njit -def forwards_viterbi_dip_low_mem(n, m, G, s, e, r): - """LS diploid Viterbi algorithm, with reduced memory.""" - # Initialise - V = np.zeros((n, n)) - P = np.zeros((n, n, m)).astype(np.int64) - - if s[0, 0] == MISSING: - index = MISSING_INDEX * np.ones((n, n), dtype=np.int64) - else: - index = ( - 4 * np.equal(G[:, :, 0], s[0, 0]).astype(np.int64) - + 2 * (G[:, :, 0] == 1).astype(np.int64) - + np.int64(s[0, 0] == 1) - ) - - V_previous = 1 / (n ** 2) * e[index.ravel(), 0].reshape(n, n) - c = np.ones(m) - r_n = r / n - - # Diploid Viterbi, with smaller memory footprint, rescaling, and using the structure of the HMM. - for l in range(1, m): - - if s[0, l] == MISSING: - index = MISSING_INDEX * np.ones((n, n), dtype=np.int64) - else: - index = ( - 4 * np.equal(G[:, :, l], s[0, l]).astype(np.int64) - + 2 * (G[:, :, l] == 1).astype(np.int64) - + np.int64(s[0, l] == 1) - ) - - c[l] = np.amax(V_previous) - argmax = np.argmax(V_previous) - - V_previous *= 1 / c[l] - V_rowcol_max = np_amax(V_previous, axis=0) - arg_rowcol_max = np_argmax(V_previous, axis=0) - - no_switch = (1 - r[l]) ** 2 + 2 * (r_n[l] * (1 - r[l])) + r_n[l] ** 2 - single_switch = r_n[l] * (1 - r[l]) + r_n[l] ** 2 - double_switch = r_n[l] ** 2 - - j1_j2 = 0 - - for j1 in range(n): - for j2 in range(n): - - V_single_switch = max(V_rowcol_max[j1], V_rowcol_max[j2]) - P_single_switch = np.argmax( - np.array([V_rowcol_max[j1], V_rowcol_max[j2]]) - ) - - if P_single_switch == 0: - template_single_switch = j1 * n + arg_rowcol_max[j1] - else: - template_single_switch = arg_rowcol_max[j2] * n + j2 - - V[j1, j2] = V_previous[j1, j2] * no_switch # No switch in either - P[j1, j2, l] = j1_j2 - - # Single or double switch? 
- single_switch_tmp = single_switch * V_single_switch - if single_switch_tmp > double_switch: - # Then single switch is the alternative - if V[j1, j2] < single_switch * V_single_switch: - V[j1, j2] = single_switch * V_single_switch - P[j1, j2, l] = template_single_switch - else: - # Double switch is the alternative - if V[j1, j2] < double_switch: - V[j1, j2] = double_switch - P[j1, j2, l] = argmax - - V[j1, j2] *= e[index[j1, j2], l] - j1_j2 += 1 - V_previous = np.copy(V) - - ll = np.sum(np.log10(c)) + np.log10(np.amax(V)) - - return V, P, ll - - -@jit.numba_njit -def forwards_viterbi_dip_naive_vec(n, m, G, s, e, r): - """Vectorised LS diploid Viterbi algorithm using numpy.""" - # Initialise - V = np.zeros((n, n, m)) - P = np.zeros((n, n, m)).astype(np.int64) - c = np.ones(m) - - if s[0, 0] == MISSING: - index = MISSING_INDEX * np.ones((n, n), dtype=np.int64) - else: - index = ( - 4 * np.equal(G[:, :, 0], s[0, 0]).astype(np.int64) - + 2 * (G[:, :, 0] == 1).astype(np.int64) - + np.int64(s[0, 0] == 1) - ) - - V[:, :, 0] = 1 / (n ** 2) * e[index.ravel(), 0].reshape(n, n) - r_n = r / n - - # Jumped the gun - vectorising. - for l in range(1, m): - - if s[0, l] == MISSING: - index = MISSING_INDEX * np.ones((n, n), dtype=np.int64) - else: - index = ( - 4 * np.equal(G[:, :, l], s[0, l]).astype(np.int64) - + 2 * (G[:, :, l] == 1).astype(np.int64) - + np.int64(s[0, l] == 1) - ) - - for j1 in range(n): - for j2 in range(n): - v = (r_n[l] ** 2) * np.ones((n, n)) - v[j1, j2] += (1 - r[l]) ** 2 - v[j1, :] += r_n[l] * (1 - r[l]) - v[:, j2] += r_n[l] * (1 - r[l]) - v *= V[:, :, l - 1] - V[j1, j2, l] = np.amax(v) * e[index[j1, j2], l] - P[j1, j2, l] = np.argmax(v) - - c[l] = np.amax(V[:, :, l]) - V[:, :, l] *= 1 / c[l] - - ll = np.sum(np.log10(c)) - - return V, P, ll - - -def forwards_viterbi_dip_naive_full_vec(n, m, G, s, e, r): - """Fully vectorised naive LS diploid Viterbi algorithm using numpy.""" - char_both = np.eye(n * n).ravel().reshape((n, n, n, n)) - char_col = np.tile(np.sum(np.eye(n * n).reshape((n, n, n, n)), 3), (n, 1, 1, 1)) - char_row = np.copy(char_col.T) - rows, cols = np.ogrid[:n, :n] - - # Initialise - V = np.zeros((n, n, m)) - P = np.zeros((n, n, m)).astype(np.int64) - c = np.ones(m) - - if s[0, 0] == MISSING: - index = MISSING_INDEX * np.ones((n, n), dtype=np.int64) - else: - index = ( - 4 * np.equal(G[:, :, 0], s[0, 0]).astype(np.int64) - + 2 * (G[:, :, 0] == 1).astype(np.int64) - + np.int64(s[0, 0] == 1) - ) - - V[:, :, 0] = 1 / (n ** 2) * e[index, 0] - r_n = r / n - - for l in range(1, m): - - if s[0, l] == MISSING: - index = MISSING_INDEX * np.ones((n, n), dtype=np.int64) - else: - index = ( - 4 * np.equal(G[:, :, l], s[0, l]).astype(np.int64) - + 2 * (G[:, :, l] == 1).astype(np.int64) - + np.int64(s[0, l] == 1) - ) - - v = ( - (r_n[l] ** 2) - + (1 - r[l]) ** 2 * char_both - + (r_n[l] * (1 - r[l])) * (char_col + char_row) - ) - v *= V[:, :, l - 1] - P[:, :, l] = np.argmax(v.reshape(n, n, -1), 2) # Have to flatten to use argmax - V[:, :, l] = v.reshape(n, n, -1)[rows, cols, P[:, :, l]] * e[index, l] - c[l] = np.amax(V[:, :, l]) - V[:, :, l] *= 1 / c[l] - - ll = np.sum(np.log10(c)) - - return V, P, ll - - -@jit.numba_njit -def backwards_viterbi_dip(m, V_last, P): - """Run a backwards pass to determine the most likely path.""" - assert V_last.ndim == 2 - assert V_last.shape[0] == V_last.shape[1] - # Initialisation - path = np.zeros(m).astype(np.int64) - path[m - 1] = np.argmax(V_last) - - # Backtrace - for j in range(m - 2, -1, -1): - path[j] = P[:, :, j + 1].ravel()[path[j + 1]] - - 
return path - - -def get_phased_path(n, path): - """Obtain the phased path.""" - return np.unravel_index(path, (n, n)) - - -@jit.numba_njit -def path_ll_dip(n, m, G, phased_path, s, e, r): - """Evaluate log-likelihood path through a reference panel which results in sequence s.""" - if s[0, 0] == MISSING: - index = MISSING_INDEX - else: - index = ( - 4 * np.int64(np.equal(G[phased_path[0][0], phased_path[1][0], 0], s[0, 0])) - + 2 * np.int64(G[phased_path[0][0], phased_path[1][0], 0] == 1) - + np.int64(s[0, 0] == 1) - ) - - log_prob_path = np.log10(1 / (n ** 2) * e[index, 0]) - old_phase = np.array([phased_path[0][0], phased_path[1][0]]) - r_n = r / n - - for l in range(1, m): - - if s[0, l] == MISSING: - index = MISSING_INDEX - else: - index = ( - 4 - * np.int64( - np.equal(G[phased_path[0][l], phased_path[1][l], l], s[0, l]) - ) - + 2 * np.int64(G[phased_path[0][l], phased_path[1][l], l] == 1) - + np.int64(s[0, l] == 1) - ) - - current_phase = np.array([phased_path[0][l], phased_path[1][l]]) - phase_diff = np.sum(~np.equal(current_phase, old_phase)) - - if phase_diff == 0: - log_prob_path += np.log10( - (1 - r[l]) ** 2 + 2 * (r_n[l] * (1 - r[l])) + r_n[l] ** 2 - ) - elif phase_diff == 1: - log_prob_path += np.log10(r_n[l] * (1 - r[l]) + r_n[l] ** 2) - else: - log_prob_path += np.log10(r_n[l] ** 2) - - log_prob_path += np.log10(e[index, l]) - old_phase = current_phase - - return log_prob_path diff --git a/lshmm/vit_diploid_variants_samples.py b/lshmm/vit_diploid_variants_samples.py deleted file mode 100644 index f140861..0000000 --- a/lshmm/vit_diploid_variants_samples.py +++ /dev/null @@ -1,547 +0,0 @@ -"""Collection of functions to run Viterbi algorithms on dipoid genotype data, where the data is structured as variants x samples.""" -import numpy as np - -from . 
import jit - -MISSING = -1 -MISSING_INDEX = 3 - -# https://github.com/numba/numba/issues/1269 -@jit.numba_njit -def np_apply_along_axis(func1d, axis, arr): - """Create numpy-like functions for max, sum etc.""" - assert arr.ndim == 2 - assert axis in [0, 1] - if axis == 0: - result = np.empty(arr.shape[1]) - for i in range(len(result)): - result[i] = func1d(arr[:, i]) - else: - result = np.empty(arr.shape[0]) - for i in range(len(result)): - result[i] = func1d(arr[i, :]) - return result - - -@jit.numba_njit -def np_amax(array, axis): - """Numba implementation of numpy vectorised maximum.""" - return np_apply_along_axis(np.amax, axis, array) - - -@jit.numba_njit -def np_sum(array, axis): - """Numba implementation of numpy vectorised sum.""" - return np_apply_along_axis(np.sum, axis, array) - - -@jit.numba_njit -def np_argmax(array, axis): - """Numba implementation of numpy vectorised argmax.""" - return np_apply_along_axis(np.argmax, axis, array) - - -@jit.numba_njit -def forwards_viterbi_dip_naive(n, m, G, s, e, r): - """Naive implementation of LS diploid Viterbi algorithm.""" - # Initialise - V = np.zeros((m, n, n)) - P = np.zeros((m, n, n)).astype(np.int64) - c = np.ones(m) - r_n = r / n - - for j1 in range(n): - for j2 in range(n): - if s[0, 0] == MISSING: - index_tmp = MISSING_INDEX - else: - index_tmp = ( - 4 * np.int64(np.equal(G[0, j1, j2], s[0, 0])) - + 2 * np.int64((G[0, j1, j2] == 1)) - + np.int64(s[0, 0] == 1) - ) - V[0, j1, j2] = 1 / (n ** 2) * e[0, index_tmp] - - for l in range(1, m): - if s[0, l] == MISSING: - index = MISSING_INDEX * np.ones((n, n), dtype=np.int64) - else: - index = ( - 4 * np.equal(G[l, :, :], s[0, l]).astype(np.int64) - + 2 * (G[l, :, :] == 1).astype(np.int64) - + np.int64(s[0, l] == 1) - ) - - for j1 in range(n): - for j2 in range(n): - # Get the vector to maximise over - v = np.zeros((n, n)) - for k1 in range(n): - for k2 in range(n): - v[k1, k2] = V[l - 1, k1, k2] - if (k1 == j1) and (k2 == j2): - v[k1, k2] *= ( - (1 - r[l]) ** 2 + 2 * (1 - r[l]) * r_n[l] + r_n[l] ** 2 - ) - elif (k1 == j1) or (k2 == j2): - v[k1, k2] *= r_n[l] * (1 - r[l]) + r_n[l] ** 2 - else: - v[k1, k2] *= r_n[l] ** 2 - V[l, j1, j2] = np.amax(v) * e[l, index[j1, j2]] - P[l, j1, j2] = np.argmax(v) - c[l] = np.amax(V[l, :, :]) - V[l, :, :] *= 1 / c[l] - - ll = np.sum(np.log10(c)) - - return V, P, ll - - -@jit.numba_njit -def forwards_viterbi_dip_naive_low_mem(n, m, G, s, e, r): - """Naive implementation of LS diploid Viterbi algorithm, with reduced memory.""" - # Initialise - V = np.zeros((n, n)) - V_previous = np.zeros((n, n)) - P = np.zeros((m, n, n)).astype(np.int64) - c = np.ones(m) - r_n = r / n - - for j1 in range(n): - for j2 in range(n): - if s[0, 0] == MISSING: - index_tmp = MISSING_INDEX - else: - index_tmp = ( - 4 * np.int64(np.equal(G[0, j1, j2], s[0, 0])) - + 2 * np.int64((G[0, j1, j2] == 1)) - + np.int64(s[0, 0] == 1) - ) - V_previous[j1, j2] = 1 / (n ** 2) * e[0, index_tmp] - - # Take a look at Haploid Viterbi implementation in Jeromes code and see if we can pinch some ideas. - # Diploid Viterbi, with smaller memory footprint. 
- for l in range(1, m): - if s[0, l] == MISSING: - index = MISSING_INDEX * np.ones((n, n), dtype=np.int64) - else: - index = ( - 4 * np.equal(G[l, :, :], s[0, l]).astype(np.int64) - + 2 * (G[l, :, :] == 1).astype(np.int64) - + np.int64(s[0, l] == 1) - ) - for j1 in range(n): - for j2 in range(n): - # Get the vector to maximise over - v = np.zeros((n, n)) - for k1 in range(n): - for k2 in range(n): - v[k1, k2] = V_previous[k1, k2] - if (k1 == j1) and (k2 == j2): - v[k1, k2] *= ( - (1 - r[l]) ** 2 + 2 * (1 - r[l]) * r_n[l] + r_n[l] ** 2 - ) - elif (k1 == j1) or (k2 == j2): - v[k1, k2] *= r_n[l] * (1 - r[l]) + r_n[l] ** 2 - else: - v[k1, k2] *= r_n[l] ** 2 - V[j1, j2] = np.amax(v) * e[l, index[j1, j2]] - P[l, j1, j2] = np.argmax(v) - c[l] = np.amax(V) - V_previous = np.copy(V) / c[l] - - ll = np.sum(np.log10(c)) - - return V, P, ll - - -@jit.numba_njit -def forwards_viterbi_dip_low_mem(n, m, G, s, e, r): - """LS diploid Viterbi algorithm, with reduced memory.""" - # Initialise - V = np.zeros((n, n)) - V_previous = np.zeros((n, n)) - P = np.zeros((m, n, n)).astype(np.int64) - c = np.ones(m) - r_n = r / n - - for j1 in range(n): - for j2 in range(n): - if s[0, 0] == MISSING: - index_tmp = MISSING_INDEX - else: - index_tmp = ( - 4 * np.int64(np.equal(G[0, j1, j2], s[0, 0])) - + 2 * np.int64((G[0, j1, j2] == 1)) - + np.int64(s[0, 0] == 1) - ) - V_previous[j1, j2] = 1 / (n ** 2) * e[0, index_tmp] - - # Diploid Viterbi, with smaller memory footprint, rescaling, and using the structure of the HMM. - for l in range(1, m): - if s[0, l] == MISSING: - index = MISSING_INDEX * np.ones((n, n), dtype=np.int64) - else: - index = ( - 4 * np.equal(G[l, :, :], s[0, l]).astype(np.int64) - + 2 * (G[l, :, :] == 1).astype(np.int64) - + np.int64(s[0, l] == 1) - ) - - c[l] = np.amax(V_previous) - argmax = np.argmax(V_previous) - - V_previous *= 1 / c[l] - V_rowcol_max = np_amax(V_previous, 0) - arg_rowcol_max = np_argmax(V_previous, 0) - - no_switch = (1 - r[l]) ** 2 + 2 * (r_n[l] * (1 - r[l])) + r_n[l] ** 2 - single_switch = r_n[l] * (1 - r[l]) + r_n[l] ** 2 - double_switch = r_n[l] ** 2 - - j1_j2 = 0 - - for j1 in range(n): - for j2 in range(n): - - V_single_switch = max(V_rowcol_max[j1], V_rowcol_max[j2]) - P_single_switch = np.argmax( - np.array([V_rowcol_max[j1], V_rowcol_max[j2]]) - ) - - if P_single_switch == 0: - template_single_switch = j1 * n + arg_rowcol_max[j1] - else: - template_single_switch = arg_rowcol_max[j2] * n + j2 - - V[j1, j2] = V_previous[j1, j2] * no_switch # No switch in either - P[l, j1, j2] = j1_j2 - - # Single or double switch? 
- single_switch_tmp = single_switch * V_single_switch - if single_switch_tmp > double_switch: - # Then single switch is the alternative - if V[j1, j2] < single_switch * V_single_switch: - V[j1, j2] = single_switch * V_single_switch - P[l, j1, j2] = template_single_switch - else: - # Double switch is the alternative - if V[j1, j2] < double_switch: - V[j1, j2] = double_switch - P[l, j1, j2] = argmax - - V[j1, j2] *= e[l, index[j1, j2]] - j1_j2 += 1 - V_previous = np.copy(V) - - ll = np.sum(np.log10(c)) + np.log10(np.amax(V)) - - return V, P, ll - - -@jit.numba_njit -def forwards_viterbi_dip_low_mem_no_pointer(n, m, G, s, e, r): - """LS diploid Viterbi algorithm, with reduced memory.""" - # Initialise - V = np.zeros((n, n)) - V_previous = np.zeros((n, n)) - c = np.ones(m) - r_n = r / n - - recombs_single = [ - np.zeros(shape=0, dtype=np.int64) for _ in range(m) - ] # Store all single switch recombs - recombs_double = [ - np.zeros(shape=0, dtype=np.int64) for _ in range(m) - ] # Store all double switch recombs - - V_argmaxes = np.zeros(m) - V_rowcol_maxes = np.zeros((m, n)) - V_rowcol_argmaxes = np.zeros((m, n)) - - for j1 in range(n): - for j2 in range(n): - if s[0, 0] == MISSING: - index_tmp = MISSING_INDEX - else: - index_tmp = ( - 4 * np.int64(np.equal(G[0, j1, j2], s[0, 0])) - + 2 * np.int64((G[0, j1, j2] == 1)) - + np.int64(s[0, 0] == 1) - ) - V_previous[j1, j2] = 1 / (n ** 2) * e[0, index_tmp] - - # Diploid Viterbi, with smaller memory footprint, rescaling, and using the structure of the HMM. - for l in range(1, m): - if s[0, l] == MISSING: - index = MISSING_INDEX * np.ones((n, n), dtype=np.int64) - else: - index = ( - 4 * np.equal(G[l, :, :], s[0, l]).astype(np.int64) - + 2 * (G[l, :, :] == 1).astype(np.int64) - + np.int64(s[0, l] == 1) - ) - - c[l] = np.amax(V_previous) - argmax = np.argmax(V_previous) - V_argmaxes[l - 1] = argmax # added - - V_previous *= 1 / c[l] - V_rowcol_max = np_amax(V_previous, 0) - V_rowcol_maxes[l - 1, :] = V_rowcol_max - arg_rowcol_max = np_argmax(V_previous, 0) - V_rowcol_argmaxes[l - 1, :] = arg_rowcol_max - - no_switch = (1 - r[l]) ** 2 + 2 * (r_n[l] * (1 - r[l])) + r_n[l] ** 2 - single_switch = r_n[l] * (1 - r[l]) + r_n[l] ** 2 - double_switch = r_n[l] ** 2 - - j1_j2 = 0 - - for j1 in range(n): - for j2 in range(n): - - V_single_switch = max(V_rowcol_max[j1], V_rowcol_max[j2]) - V[j1, j2] = V_previous[j1, j2] * no_switch # No switch in either - - # Single or double switch? 
- single_switch_tmp = single_switch * V_single_switch - if single_switch_tmp > double_switch: - # Then single switch is the alternative - if V[j1, j2] < single_switch * V_single_switch: - V[j1, j2] = single_switch * V_single_switch - recombs_single[l] = np.append(recombs_single[l], j1_j2) - else: - # Double switch is the alternative - if V[j1, j2] < double_switch: - V[j1, j2] = double_switch - recombs_double[l] = np.append(recombs_double[l], values=j1_j2) - - V[j1, j2] *= e[l, index[j1, j2]] - j1_j2 += 1 - V_previous = np.copy(V) - - V_argmaxes[m - 1] = np.argmax(V_previous) - V_rowcol_maxes[m - 1, :] = np_amax(V_previous, 0) - V_rowcol_argmaxes[m - 1, :] = np_argmax(V_previous, 0) - ll = np.sum(np.log10(c)) + np.log10(np.amax(V)) - - return ( - V, - V_argmaxes, - V_rowcol_maxes, - V_rowcol_argmaxes, - recombs_single, - recombs_double, - ll, - ) - - -@jit.numba_njit -def forwards_viterbi_dip_naive_vec(n, m, G, s, e, r): - """Vectorised LS diploid Viterbi algorithm using numpy.""" - # Initialise - V = np.zeros((m, n, n)) - P = np.zeros((m, n, n)).astype(np.int64) - c = np.ones(m) - r_n = r / n - - for j1 in range(n): - for j2 in range(n): - if s[0, 0] == MISSING: - index_tmp = MISSING_INDEX - else: - index_tmp = ( - 4 * np.int64(np.equal(G[0, j1, j2], s[0, 0])) - + 2 * np.int64((G[0, j1, j2] == 1)) - + np.int64(s[0, 0] == 1) - ) - V[0, j1, j2] = 1 / (n ** 2) * e[0, index_tmp] - - # Jumped the gun - vectorising. - for l in range(1, m): - if s[0, l] == MISSING: - index = MISSING_INDEX * np.ones((n, n), dtype=np.int64) - else: - index = ( - 4 * np.equal(G[l, :, :], s[0, l]).astype(np.int64) - + 2 * (G[l, :, :] == 1).astype(np.int64) - + np.int64(s[0, l] == 1) - ) - - for j1 in range(n): - for j2 in range(n): - v = (r_n[l] ** 2) * np.ones((n, n)) - v[j1, j2] += (1 - r[l]) ** 2 - v[j1, :] += r_n[l] * (1 - r[l]) - v[:, j2] += r_n[l] * (1 - r[l]) - v *= V[l - 1, :, :] - V[l, j1, j2] = np.amax(v) * e[l, index[j1, j2]] - P[l, j1, j2] = np.argmax(v) - - c[l] = np.amax(V[l, :, :]) - V[l, :, :] *= 1 / c[l] - - ll = np.sum(np.log10(c)) - - return V, P, ll - - -def forwards_viterbi_dip_naive_full_vec(n, m, G, s, e, r): - """Fully vectorised naive LS diploid Viterbi algorithm using numpy.""" - char_both = np.eye(n * n).ravel().reshape((n, n, n, n)) - char_col = np.tile(np.sum(np.eye(n * n).reshape((n, n, n, n)), 3), (n, 1, 1, 1)) - char_row = np.copy(char_col).T - rows, cols = np.ogrid[:n, :n] - - # Initialise - V = np.zeros((m, n, n)) - P = np.zeros((m, n, n)).astype(np.int64) - c = np.ones(m) - if s[0, 0] == MISSING: - index = MISSING_INDEX * np.ones((n, n), dtype=np.int64) - else: - index = ( - 4 * np.equal(G[0, :, :], s[0, 0]).astype(np.int64) - + 2 * (G[0, :, :] == 1).astype(np.int64) - + np.int64(s[0, 0] == 1) - ) - V[0, :, :] = 1 / (n ** 2) * e[0, index] - r_n = r / n - - for l in range(1, m): - if s[0, l] == MISSING: - index = MISSING_INDEX * np.ones((n, n), dtype=np.int64) - else: - index = ( - 4 * np.equal(G[l, :, :], s[0, l]).astype(np.int64) - + 2 * (G[l, :, :] == 1).astype(np.int64) - + np.int64(s[0, l] == 1) - ) - v = ( - (r_n[l] ** 2) - + (1 - r[l]) ** 2 * char_both - + (r_n[l] * (1 - r[l])) * (char_col + char_row) - ) - v *= V[l - 1, :, :] - P[l, :, :] = np.argmax(v.reshape(n, n, -1), 2) # Have to flatten to use argmax - V[l, :, :] = v.reshape(n, n, -1)[rows, cols, P[l, :, :]] * e[l, index] - c[l] = np.amax(V[l, :, :]) - V[l, :, :] *= 1 / c[l] - - ll = np.sum(np.log10(c)) - - return V, P, ll - - -@jit.numba_njit -def backwards_viterbi_dip(m, V_last, P): - """Run a backwards pass to 
determine the most likely path.""" - assert V_last.ndim == 2 - assert V_last.shape[0] == V_last.shape[1] - # Initialisation - path = np.zeros(m).astype(np.int64) - path[m - 1] = np.argmax(V_last) - - # Backtrace - for j in range(m - 2, -1, -1): - path[j] = P[j + 1, :, :].ravel()[path[j + 1]] - - return path - - -@jit.numba_njit -def in_list(array, value): - where = np.searchsorted(array, value) - if where < array.shape[0]: - return array[where] == value - else: - return False - - -@jit.numba_njit -def backwards_viterbi_dip_no_pointer( - m, - V_argmaxes, - V_rowcol_maxes, - V_rowcol_argmaxes, - recombs_single, - recombs_double, - V_last, -): - """Run a backwards pass to determine the most likely path.""" - assert V_last.ndim == 2 - assert V_last.shape[0] == V_last.shape[1] - # Initialisation - path = np.zeros(m).astype(np.int64) - path[m - 1] = np.argmax(V_last) - n = V_last.shape[0] - - # Backtrace - for l in range(m - 2, -1, -1): - current_best_template = path[l + 1] - # if current_best_template in recombs_double[l + 1]: - if in_list(recombs_double[l + 1], current_best_template): - current_best_template = V_argmaxes[l] - # elif current_best_template in recombs_single[l + 1]: - elif in_list(recombs_single[l + 1], current_best_template): - (j1, j2) = divmod(current_best_template, n) - if V_rowcol_maxes[l, j1] > V_rowcol_maxes[l, j2]: - current_best_template = j1 * n + V_rowcol_argmaxes[l, j1] - else: - current_best_template = V_rowcol_argmaxes[l, j2] * n + j2 - path[l] = current_best_template - - return path - - -def get_phased_path(n, path): - """Obtain the phased path.""" - return np.unravel_index(path, (n, n)) - - -@jit.numba_njit -def path_ll_dip(n, m, G, phased_path, s, e, r): - """Evaluate log-likelihood path through a reference panel which results in sequence s.""" - if s[0, 0] == MISSING: - index = MISSING_INDEX - else: - index = ( - 4 * np.int64(np.equal(G[0, phased_path[0][0], phased_path[1][0]], s[0, 0])) - + 2 * np.int64(G[0, phased_path[0][0], phased_path[1][0]] == 1) - + np.int64(s[0, 0] == 1) - ) - log_prob_path = np.log10(1 / (n ** 2) * e[0, index]) - old_phase = np.array([phased_path[0][0], phased_path[1][0]]) - r_n = r / n - - for l in range(1, m): - - if s[0, l] == MISSING: - index = MISSING_INDEX - else: - index = ( - 4 - * np.int64( - np.equal(G[l, phased_path[0][l], phased_path[1][l]], s[0, l]) - ) - + 2 * np.int64(G[l, phased_path[0][l], phased_path[1][l]] == 1) - + np.int64(s[0, l] == 1) - ) - - current_phase = np.array([phased_path[0][l], phased_path[1][l]]) - phase_diff = np.sum(~np.equal(current_phase, old_phase)) - - if phase_diff == 0: - log_prob_path += np.log10( - (1 - r[l]) ** 2 + 2 * (r_n[l] * (1 - r[l])) + r_n[l] ** 2 - ) - elif phase_diff == 1: - log_prob_path += np.log10(r_n[l] * (1 - r[l]) + r_n[l] ** 2) - else: - log_prob_path += np.log10(r_n[l] ** 2) - - log_prob_path += np.log10(e[l, index]) - old_phase = current_phase - - return log_prob_path diff --git a/lshmm/vit_haploid_samples_variants.py b/lshmm/vit_haploid_samples_variants.py deleted file mode 100644 index 0c29dec..0000000 --- a/lshmm/vit_haploid_samples_variants.py +++ /dev/null @@ -1,240 +0,0 @@ -"""Collection of functions to run Viterbi algorithms on haploid genotype data, where the data is structured as samples x variants.""" -import numpy as np - -from . 
import jit - -MISSING = -1 - - -@jit.numba_njit -def viterbi_naive_init(n, m, H, s, e, r): - """Initialise naive implementation of LS viterbi.""" - V = np.zeros((n, m)) - P = np.zeros((n, m)).astype(np.int64) - r_n = r / n - - P[:, 0] = 0 # Reminder - - for i in range(n): - V[i, 0] = ( - 1 / n * e[np.int64(np.equal(H[i, 0], s[0, 0]) or s[0, 0] == MISSING), 0] - ) - - return V, P, r_n - - -@jit.numba_njit -def viterbi_init(n, m, H, s, e, r): - """Initialise naive, but more space memory efficient implementation of LS viterbi.""" - V_previous = np.zeros(n) - for i in range(n): - V_previous[i] = ( - 1 / n * e[np.int64(np.equal(H[i, 0], s[0, 0]) or s[0, 0] == MISSING), 0] - ) - - V = np.zeros(n) - P = np.zeros((n, m)).astype(np.int64) - P[:, 0] = 0 # Reminder - r_n = r / n - - return V, V_previous, P, r_n - - -@jit.numba_njit -def forwards_viterbi_hap_naive(n, m, H, s, e, r): - """Naive implementation of LS haploid Viterbi algorithm.""" - # Initialise - V, P, r_n = viterbi_naive_init(n, m, H, s, e, r) - - for j in range(1, m): - for i in range(n): - # Get the vector to maximise over - v = np.zeros(n) - for k in range(n): - v[k] = ( - e[np.int64(np.equal(H[i, j], s[0, j]) or s[0, j] == MISSING), j] - * V[k, j - 1] - ) - if k == i: - v[k] *= 1 - r[j] + r_n[j] - else: - v[k] *= r_n[j] - P[i, j] = np.argmax(v) - V[i, j] = v[P[i, j]] - - ll = np.log10(np.amax(V[:, m - 1])) - - return V, P, ll - - -@jit.numba_njit -def forwards_viterbi_hap_naive_vec(n, m, H, s, e, r): - """Naive matrix based implementation of LS haploid forward Viterbi algorithm using numpy.""" - # Initialise - V, P, r_n = viterbi_naive_init(n, m, H, s, e, r) - - for j in range(1, m): - v_tmp = V[:, j - 1] * r_n[j] - for i in range(n): - v = np.copy(v_tmp) - v[i] += V[i, j - 1] * (1 - r[j]) - v *= e[np.int64(np.equal(H[i, j], s[0, j]) or s[0, j] == MISSING), j] - P[i, j] = np.argmax(v) - V[i, j] = v[P[i, j]] - - ll = np.log10(np.amax(V[:, m - 1])) - - return V, P, ll - - -def forwards_viterbi_hap_naive_low_mem(n, m, H, s, e, r): - """Naive implementation of LS haploid Viterbi algorithm, with reduced memory.""" - # Initialise - V, V_previous, P, r_n = viterbi_init(n, m, H, s, e, r) - - for j in range(1, m): - for i in range(n): - # Get the vector to maximise over - v = np.zeros(n) - for k in range(n): - v[k] = ( - e[np.int64(np.equal(H[i, j], s[0, j]) or s[0, j] == MISSING), j] - * V_previous[k] - ) - if k == i: - v[k] *= (1 - r[j]) + r_n[j] - else: - v[k] *= r_n[j] - P[i, j] = np.argmax(v) - V[i] = v[P[i, j]] - V_previous = np.copy(V) - - ll = np.log10(np.amax(V)) - - return V, P, ll - - -@jit.numba_njit -def forwards_viterbi_hap_naive_low_mem_rescaling(n, m, H, s, e, r): - """Naive implementation of LS haploid Viterbi algorithm, with reduced memory and rescaling.""" - # Initialise - V, V_previous, P, r_n = viterbi_init(n, m, H, s, e, r) - c = np.ones(m) - - for j in range(1, m): - c[j] = np.amax(V_previous) - V_previous *= 1 / c[j] - for i in range(n): - # Get the vector to maximise over - v = np.zeros(n) - for k in range(n): - v[k] = ( - e[np.int64(np.equal(H[i, j], s[0, j]) or s[0, j] == MISSING), j] - * V_previous[k] - ) - if k == i: - v[k] *= (1 - r[j]) + r_n[j] - else: - v[k] *= r_n[j] - P[i, j] = np.argmax(v) - V[i] = v[P[i, j]] - - V_previous = np.copy(V) - - ll = np.sum(np.log10(c)) + np.log10(np.amax(V)) - - return V, P, ll - - -@jit.numba_njit -def forwards_viterbi_hap_low_mem_rescaling(n, m, H, s, e, r): - """LS haploid Viterbi algorithm, with reduced memory and exploits the Markov process structure.""" - # Initialise - 
V, V_previous, P, r_n = viterbi_init(n, m, H, s, e, r) - c = np.ones(m) - - for j in range(1, m): - argmax = np.argmax(V_previous) - c[j] = V_previous[argmax] - V_previous *= 1 / c[j] - V = np.zeros(n) - for i in range(n): - V[i] = V_previous[i] * (1 - r[j] + r_n[j]) - P[i, j] = i - if V[i] < r_n[j]: - V[i] = r_n[j] - P[i, j] = argmax - - V[i] *= e[np.int64(np.equal(H[i, j], s[0, j]) or s[0, j] == MISSING), j] - V_previous = np.copy(V) - - ll = np.sum(np.log10(c)) + np.log10(np.max(V)) - - return V, P, ll - - -@jit.numba_njit -def forwards_viterbi_hap_lower_mem_rescaling(n, m, H, s, e, r): - """LS haploid Viterbi algorithm with even smaller memory footprint and exploits the Markov process structure.""" - # Initialise - V = np.zeros(n) - for i in range(n): - V[i] = 1 / n * e[np.int64(np.equal(H[i, 0], s[0, 0]) or s[0, 0] == MISSING), 0] - P = np.zeros((n, m)).astype(np.int64) - P[:, 0] = 0 - r_n = r / n - c = np.ones(m) - - for j in range(1, m): - argmax = np.argmax(V) - c[j] = V[argmax] - V *= 1 / c[j] - for i in range(n): - V[i] = V[i] * (1 - r[j] + r_n[j]) - P[i, j] = i - if V[i] < r_n[j]: - V[i] = r_n[j] - P[i, j] = argmax - V[i] *= e[np.int64(np.equal(H[i, j], s[0, j]) or s[0, j] == MISSING), j] - - ll = np.sum(np.log10(c)) + np.log10(np.max(V)) - - return V, P, ll - - -@jit.numba_njit -def backwards_viterbi_hap(m, V_last, P): - """Run a backwards pass to determine the most likely path.""" - # Initialise - assert len(V_last.shape) == 1 - path = np.zeros(m).astype(np.int64) - path[m - 1] = np.argmax(V_last) - - for j in range(m - 2, -1, -1): - path[j] = P[path[j + 1], j + 1] - - return path - - -@jit.numba_njit -def path_ll_hap(n, m, H, path, s, e, r): - """Evaluate log-likelihood path through a reference panel which results in sequence s.""" - index = np.int64(np.equal(H[path[0], 0], s[0, 0]) or s[0, 0] == MISSING) - log_prob_path = np.log10((1 / n) * e[index, 0]) - old = path[0] - r_n = r / n - - for l in range(1, m): - index = np.int64(np.equal(H[path[l], l], s[0, l]) or s[0, l] == MISSING) - current = path[l] - same = old == current - - if same: - log_prob_path += np.log10((1 - r[l]) + r_n[l]) - else: - log_prob_path += np.log10(r_n[l]) - - log_prob_path += np.log10(e[index, l]) - old = current - - return log_prob_path diff --git a/lshmm/vit_haploid_variants_samples.py b/lshmm/vit_haploid_variants_samples.py deleted file mode 100644 index 22aa7d3..0000000 --- a/lshmm/vit_haploid_variants_samples.py +++ /dev/null @@ -1,287 +0,0 @@ -"""Collection of functions to run Viterbi algorithms on haploid genotype data, where the data is structured as variants x samples.""" -import numpy as np - -from . 
import jit - -MISSING = -1 - - -@jit.numba_njit -def viterbi_naive_init(n, m, H, s, e, r): - """Initialise naive implementation of LS viterbi.""" - V = np.zeros((m, n)) - P = np.zeros((m, n)).astype(np.int64) - r_n = r / n - for i in range(n): - V[0, i] = ( - 1 / n * e[0, np.int64(np.equal(H[0, i], s[0, 0]) or s[0, 0] == MISSING)] - ) - - return V, P, r_n - - -@jit.numba_njit -def viterbi_init(n, m, H, s, e, r): - """Initialise naive, but more space memory efficient implementation of LS viterbi.""" - V_previous = np.zeros(n) - V = np.zeros(n) - P = np.zeros((m, n)).astype(np.int64) - r_n = r / n - - for i in range(n): - V_previous[i] = ( - 1 / n * e[0, np.int64(np.equal(H[0, i], s[0, 0]) or s[0, 0] == MISSING)] - ) - - return V, V_previous, P, r_n - - -@jit.numba_njit -def forwards_viterbi_hap_naive(n, m, H, s, e, r): - """Naive implementation of LS haploid Viterbi algorithm.""" - # Initialise - V, P, r_n = viterbi_naive_init(n, m, H, s, e, r) - - for j in range(1, m): - for i in range(n): - # Get the vector to maximise over - v = np.zeros(n) - for k in range(n): - v[k] = ( - e[j, np.int64(np.equal(H[j, i], s[0, j]) or s[0, j] == MISSING)] - * V[j - 1, k] - ) - if k == i: - v[k] *= 1 - r[j] + r_n[j] - else: - v[k] *= r_n[j] - P[j, i] = np.argmax(v) - V[j, i] = v[P[j, i]] - - ll = np.log10(np.amax(V[m - 1, :])) - - return V, P, ll - - -@jit.numba_njit -def forwards_viterbi_hap_naive_vec(n, m, H, s, e, r): - """Naive matrix based implementation of LS haploid forward Viterbi algorithm using numpy.""" - # Initialise - V, P, r_n = viterbi_naive_init(n, m, H, s, e, r) - - for j in range(1, m): - v_tmp = V[j - 1, :] * r_n[j] - for i in range(n): - v = np.copy(v_tmp) - v[i] += V[j - 1, i] * (1 - r[j]) - v *= e[j, np.int64(np.equal(H[j, i], s[0, j]) or s[0, j] == MISSING)] - P[j, i] = np.argmax(v) - V[j, i] = v[P[j, i]] - - ll = np.log10(np.amax(V[m - 1, :])) - - return V, P, ll - - -@jit.numba_njit -def forwards_viterbi_hap_naive_low_mem(n, m, H, s, e, r): - """Naive implementation of LS haploid Viterbi algorithm, with reduced memory.""" - # Initialise - V, V_previous, P, r_n = viterbi_init(n, m, H, s, e, r) - - for j in range(1, m): - for i in range(n): - # Get the vector to maximise over - v = np.zeros(n) - for k in range(n): - v[k] = ( - e[j, np.int64(np.equal(H[j, i], s[0, j]) or s[0, j] == MISSING)] - * V_previous[k] - ) - if k == i: - v[k] *= 1 - r[j] + r_n[j] - else: - v[k] *= r_n[j] - P[j, i] = np.argmax(v) - V[i] = v[P[j, i]] - V_previous = np.copy(V) - - ll = np.log10(np.amax(V)) - - return V, P, ll - - -@jit.numba_njit -def forwards_viterbi_hap_naive_low_mem_rescaling(n, m, H, s, e, r): - """Naive implementation of LS haploid Viterbi algorithm, with reduced memory and rescaling.""" - # Initialise - V, V_previous, P, r_n = viterbi_init(n, m, H, s, e, r) - c = np.ones(m) - - for j in range(1, m): - c[j] = np.amax(V_previous) - V_previous *= 1 / c[j] - for i in range(n): - # Get the vector to maximise over - v = np.zeros(n) - for k in range(n): - v[k] = ( - e[j, np.int64(np.equal(H[j, i], s[0, j]) or s[0, j] == MISSING)] - * V_previous[k] - ) - if k == i: - v[k] *= 1 - r[j] + r_n[j] - else: - v[k] *= r_n[j] - P[j, i] = np.argmax(v) - V[i] = v[P[j, i]] - - V_previous = np.copy(V) - - ll = np.sum(np.log10(c)) + np.log10(np.amax(V)) - - return V, P, ll - - -@jit.numba_njit -def forwards_viterbi_hap_low_mem_rescaling(n, m, H, s, e, r): - """LS haploid Viterbi algorithm, with reduced memory and exploits the Markov process structure.""" - # Initialise - V, V_previous, P, r_n = viterbi_init(n, m, 
H, s, e, r) - c = np.ones(m) - - for j in range(1, m): - argmax = np.argmax(V_previous) - c[j] = V_previous[argmax] - V_previous *= 1 / c[j] - V = np.zeros(n) - for i in range(n): - V[i] = V_previous[i] * (1 - r[j] + r_n[j]) - P[j, i] = i - if V[i] < r_n[j]: - V[i] = r_n[j] - P[j, i] = argmax - V[i] *= e[j, np.int64(np.equal(H[j, i], s[0, j]) or s[0, j] == MISSING)] - V_previous = np.copy(V) - - ll = np.sum(np.log10(c)) + np.log10(np.max(V)) - - return V, P, ll - - -@jit.numba_njit -def forwards_viterbi_hap_lower_mem_rescaling(n, m, H, s, e, r): - """LS haploid Viterbi algorithm with even smaller memory footprint and exploits the Markov process structure.""" - # Initialise - V = np.zeros(n) - for i in range(n): - V[i] = 1 / n * e[0, np.int64(np.equal(H[0, i], s[0, 0]) or s[0, 0] == MISSING)] - P = np.zeros((m, n)).astype(np.int64) - r_n = r / n - c = np.ones(m) - - for j in range(1, m): - argmax = np.argmax(V) - c[j] = V[argmax] - V *= 1 / c[j] - for i in range(n): - V[i] = V[i] * (1 - r[j] + r_n[j]) - P[j, i] = i - if V[i] < r_n[j]: - V[i] = r_n[j] - P[j, i] = argmax - V[i] *= e[j, np.int64(np.equal(H[j, i], s[0, j]) or s[0, j] == MISSING)] - - ll = np.sum(np.log10(c)) + np.log10(np.max(V)) - - return V, P, ll - - -@jit.numba_njit -def forwards_viterbi_hap_lower_mem_rescaling_no_pointer(n, m, H, s, e, r): - """LS haploid Viterbi algorithm with even smaller memory footprint and exploits the Markov process structure.""" - # Initialise - V = np.zeros(n) - for i in range(n): - V[i] = 1 / n * e[0, np.int64(np.equal(H[0, i], s[0, 0]) or s[0, 0] == MISSING)] - r_n = r / n - c = np.ones(m) - recombs = [ - np.zeros(shape=0, dtype=np.int64) for _ in range(m) - ] # This is going to be filled with the templates we can recombine to that have higher prob than staying where we are. - - V_argmaxes = np.zeros(m) - - for j in range(1, m): - argmax = np.argmax(V) - V_argmaxes[j - 1] = argmax - c[j] = V[argmax] - V *= 1 / c[j] - for i in range(n): - V[i] = V[i] * (1 - r[j] + r_n[j]) - if V[i] < r_n[j]: - V[i] = r_n[j] - recombs[j] = np.append( - recombs[j], i - ) # We add template i as a potential template to recombine to at site j. 
- V[i] *= e[j, np.int64(np.equal(H[j, i], s[0, j]) or s[0, j] == MISSING)] - - V_argmaxes[m - 1] = np.argmax(V) - ll = np.sum(np.log10(c)) + np.log10(np.max(V)) - - return V, V_argmaxes, recombs, ll - - -# Speedier version, variants x samples -@jit.numba_njit -def backwards_viterbi_hap(m, V_last, P): - """Run a backwards pass to determine the most likely path.""" - # Initialise - assert len(V_last.shape) == 1 - path = np.zeros(m).astype(np.int64) - path[m - 1] = np.argmax(V_last) - - for j in range(m - 2, -1, -1): - path[j] = P[j + 1, path[j + 1]] - - return path - - -@jit.numba_njit -def backwards_viterbi_hap_no_pointer(m, V_argmaxes, recombs): - """Run a backwards pass to determine the most likely path.""" - # Initialise - path = np.zeros(m).astype(np.int64) - path[m - 1] = V_argmaxes[m - 1] - - for j in range(m - 2, -1, -1): - current_best_template = path[j + 1] - if current_best_template in recombs[j + 1]: - current_best_template = V_argmaxes[j] - path[j] = current_best_template - - return path - - -@jit.numba_njit -def path_ll_hap(n, m, H, path, s, e, r): - """Evaluate log-likelihood path through a reference panel which results in sequence s.""" - index = np.int64(np.equal(H[0, path[0]], s[0, 0]) or s[0, 0] == MISSING) - log_prob_path = np.log10((1 / n) * e[0, index]) - old = path[0] - r_n = r / n - - for l in range(1, m): - index = np.int64(np.equal(H[l, path[l]], s[0, l]) or s[0, l] == MISSING) - current = path[l] - same = old == current - - if same: - log_prob_path += np.log10((1 - r[l]) + r_n[l]) - else: - log_prob_path += np.log10(r_n[l]) - - log_prob_path += np.log10(e[l, index]) - old = current - - return log_prob_path diff --git a/tests/test_LS_haploid_diploid.py b/tests/test_LS_haploid_diploid.py index aaeaa77..5b4dc48 100644 --- a/tests/test_LS_haploid_diploid.py +++ b/tests/test_LS_haploid_diploid.py @@ -6,14 +6,10 @@ import numpy as np import pytest -import lshmm.forward_backward.fb_diploid_samples_variants as fbd_sv -import lshmm.forward_backward.fb_diploid_variants_samples as fbd_vs -import lshmm.forward_backward.fb_haploid_samples_variants as fbh_sv -import lshmm.forward_backward.fb_haploid_variants_samples as fbh_vs -import lshmm.vit_diploid_samples_variants as vd_sv -import lshmm.vit_diploid_variants_samples as vd_vs -import lshmm.vit_haploid_samples_variants as vh_sv -import lshmm.vit_haploid_variants_samples as vh_vs +import lshmm.forward_backward.fb_diploid as fbd +import lshmm.forward_backward.fb_haploid as fbh +import lshmm.vit_diploid as vd +import lshmm.vit_haploid as vh EQUAL_BOTH_HOM = 4 UNEQUAL_BOTH_HOM = 0 @@ -260,252 +256,117 @@ def verify(self, ts): H_sv = H_vs.T # variants x samples - F_vs, c_vs, ll_vs = fbh_vs.forwards_ls_hap( - n, m, H_vs, s, e_vs, r, norm=False - ) - B_vs = fbh_vs.backwards_ls_hap(n, m, H_vs, s, e_vs, c_vs, r) + F_vs, c_vs, ll_vs = fbh.forwards_ls_hap(n, m, H_vs, s, e_vs, r, norm=False) + B_vs = fbh.backwards_ls_hap(n, m, H_vs, s, e_vs, c_vs, r) self.assertAllClose(np.log10(np.sum(F_vs * B_vs, 1)), ll_vs * np.ones(m)) - F_tmp, c_tmp, ll_tmp = fbh_vs.forwards_ls_hap( + F_tmp, c_tmp, ll_tmp = fbh.forwards_ls_hap( n, m, H_vs, s, e_vs, r, norm=True ) - B_tmp = fbh_vs.backwards_ls_hap(n, m, H_vs, s, e_vs, c_tmp, r) + B_tmp = fbh.backwards_ls_hap(n, m, H_vs, s, e_vs, c_tmp, r) self.assertAllClose(ll_vs, ll_tmp) self.assertAllClose(np.sum(F_tmp * B_tmp, 1), np.ones(m)) - # samples x variants - F_sv, c_sv, ll_sv = fbh_sv.forwards_ls_hap( - n, m, H_sv, s, e_sv, r, norm=False - ) - B_sv = fbh_sv.backwards_ls_hap(n, m, H_sv, s, e_sv, c_sv, r) - 
self.assertAllClose(np.log10(np.sum(F_sv * B_sv, 0)), ll_sv * np.ones(m)) - F_tmp, c_tmp, ll_tmp = fbh_sv.forwards_ls_hap( - n, m, H_sv, s, e_sv, r, norm=True - ) - B_tmp = fbh_sv.backwards_ls_hap(n, m, H_sv, s, e_sv, c_tmp, r) - self.assertAllClose(ll_sv, ll_tmp) - self.assertAllClose(np.sum(F_tmp * B_tmp, 0), np.ones(m)) - - # samples x variants agrees with variants x samples - self.assertAllClose(ll_vs, ll_sv) - def verify_larger(self, ts): # variants x samples for n, m, H_vs, s, e_vs, r in self.example_parameters_haplotypes_larger(ts): e_sv = e_vs.T H_sv = H_vs.T - F_vs, c_vs, ll_vs = fbh_vs.forwards_ls_hap( - n, m, H_vs, s, e_vs, r, norm=False - ) - B_vs = fbh_vs.backwards_ls_hap(n, m, H_vs, s, e_vs, c_vs, r) + F_vs, c_vs, ll_vs = fbh.forwards_ls_hap(n, m, H_vs, s, e_vs, r, norm=False) + B_vs = fbh.backwards_ls_hap(n, m, H_vs, s, e_vs, c_vs, r) self.assertAllClose(np.log10(np.sum(F_vs * B_vs, 1)), ll_vs * np.ones(m)) - F_tmp, c_tmp, ll_tmp = fbh_vs.forwards_ls_hap( + F_tmp, c_tmp, ll_tmp = fbh.forwards_ls_hap( n, m, H_vs, s, e_vs, r, norm=True ) - B_tmp = fbh_vs.backwards_ls_hap(n, m, H_vs, s, e_vs, c_tmp, r) + B_tmp = fbh.backwards_ls_hap(n, m, H_vs, s, e_vs, c_tmp, r) self.assertAllClose(ll_vs, ll_tmp) self.assertAllClose(np.sum(F_tmp * B_tmp, 1), np.ones(m)) - # samples x variants - F_sv, c_sv, ll_sv = fbh_sv.forwards_ls_hap( - n, m, H_sv, s, e_sv, r, norm=False - ) - B_sv = fbh_sv.backwards_ls_hap(n, m, H_sv, s, e_sv, c_sv, r) - self.assertAllClose(np.log10(np.sum(F_sv * B_sv, 0)), ll_sv * np.ones(m)) - F_tmp, c_tmp, ll_tmp = fbh_sv.forwards_ls_hap( - n, m, H_sv, s, e_sv, r, norm=True - ) - B_tmp = fbh_sv.backwards_ls_hap(n, m, H_sv, s, e_sv, c_tmp, r) - self.assertAllClose(ll_sv, ll_tmp) - self.assertAllClose(np.sum(F_tmp * B_tmp, 0), np.ones(m)) - - # samples x variants agrees with variants x samples - self.assertAllClose(ll_vs, ll_sv) - class TestNonTreeMethodsDip(FBAlgorithmBase): """Test that we compute the sample likelihoods across all implementations.""" def verify(self, ts): for n, m, G_vs, s, e_vs, r in self.example_parameters_genotypes(ts): - e_sv = e_vs.T - G_sv = G_vs.T - # variants x samples - F_vs, c_vs, ll_vs = fbd_vs.forwards_ls_dip( - n, m, G_vs, s, e_vs, r, norm=True - ) - B_vs = fbd_vs.backwards_ls_dip(n, m, G_vs, s, e_vs, c_vs, r) + F_vs, c_vs, ll_vs = fbd.forwards_ls_dip(n, m, G_vs, s, e_vs, r, norm=True) + B_vs = fbd.backwards_ls_dip(n, m, G_vs, s, e_vs, c_vs, r) self.assertAllClose(np.sum(F_vs * B_vs, (1, 2)), np.ones(m)) - F_tmp, c_tmp, ll_tmp = fbd_vs.forwards_ls_dip( + F_tmp, c_tmp, ll_tmp = fbd.forwards_ls_dip( n, m, G_vs, s, e_vs, r, norm=False ) if ll_tmp != -np.inf: - B_tmp = fbd_vs.backwards_ls_dip(n, m, G_vs, s, e_vs, c_tmp, r) + B_tmp = fbd.backwards_ls_dip(n, m, G_vs, s, e_vs, c_tmp, r) self.assertAllClose(ll_vs, ll_tmp) self.assertAllClose( np.log10(np.sum(F_tmp * B_tmp, (1, 2))), ll_tmp * np.ones(m) ) - F_tmp, ll_tmp = fbd_vs.forward_ls_dip_starting_point(n, m, G_vs, s, e_vs, r) + F_tmp, ll_tmp = fbd.forward_ls_dip_starting_point(n, m, G_vs, s, e_vs, r) if ll_tmp != -np.inf: - B_tmp = fbd_vs.backward_ls_dip_starting_point(n, m, G_vs, s, e_vs, r) + B_tmp = fbd.backward_ls_dip_starting_point(n, m, G_vs, s, e_vs, r) self.assertAllClose(ll_vs, ll_tmp) self.assertAllClose( np.log10(np.sum(F_tmp * B_tmp, (1, 2))), ll_tmp * np.ones(m) ) - F_tmp, c_tmp, ll_tmp = fbd_vs.forward_ls_dip_loop( + F_tmp, c_tmp, ll_tmp = fbd.forward_ls_dip_loop( n, m, G_vs, s, e_vs, r, norm=False ) if ll_tmp != -np.inf: - B_tmp = fbd_vs.backward_ls_dip_loop(n, m, G_vs, s, 
e_vs, c_tmp, r) + B_tmp = fbd.backward_ls_dip_loop(n, m, G_vs, s, e_vs, c_tmp, r) self.assertAllClose(ll_vs, ll_tmp) self.assertAllClose( np.log10(np.sum(F_tmp * B_tmp, (1, 2))), ll_tmp * np.ones(m) ) - F_tmp, c_tmp, ll_tmp = fbd_vs.forward_ls_dip_loop( + F_tmp, c_tmp, ll_tmp = fbd.forward_ls_dip_loop( n, m, G_vs, s, e_vs, r, norm=True ) - B_tmp = fbd_vs.backward_ls_dip_loop(n, m, G_vs, s, e_vs, c_tmp, r) + B_tmp = fbd.backward_ls_dip_loop(n, m, G_vs, s, e_vs, c_tmp, r) self.assertAllClose(ll_vs, ll_tmp) self.assertAllClose(np.sum(F_tmp * B_tmp, (1, 2)), np.ones(m)) - # samples x variants - F_sv, c_sv, ll_sv = fbd_sv.forwards_ls_dip( - n, m, G_sv, s, e_sv, r, norm=True - ) - B_sv = fbd_sv.backwards_ls_dip(n, m, G_sv, s, e_sv, c_sv, r) - self.assertAllClose(np.sum(F_sv * B_sv, (0, 1)), np.ones(m)) - F_tmp, c_tmp, ll_tmp = fbd_sv.forwards_ls_dip( - n, m, G_sv, s, e_sv, r, norm=False - ) - if ll_tmp != -np.inf: - B_tmp = fbd_sv.backwards_ls_dip(n, m, G_sv, s, e_sv, c_tmp, r) - self.assertAllClose( - np.log10(np.sum(F_tmp * B_tmp, (0, 1))), ll_tmp * np.ones(m) - ) - self.assertAllClose(ll_sv, ll_tmp) - - F_tmp, ll_tmp = fbd_sv.forward_ls_dip_starting_point(n, m, G_sv, s, e_sv, r) - if ll_tmp != -np.inf: - B_tmp = fbd_sv.backward_ls_dip_starting_point(n, m, G_sv, s, e_sv, r) - self.assertAllClose(ll_sv, ll_tmp) - self.assertAllClose( - np.log10(np.sum(F_tmp * B_tmp, (0, 1))), ll_tmp * np.ones(m) - ) - - F_tmp, c_tmp, ll_tmp = fbd_sv.forward_ls_dip_loop( - n, m, G_sv, s, e_sv, r, norm=True - ) - B_tmp = fbd_sv.backward_ls_dip_loop(n, m, G_sv, s, e_sv, c_tmp, r) - self.assertAllClose(ll_sv, ll_tmp) - self.assertAllClose(np.sum(F_tmp * B_tmp, (0, 1)), np.ones(m)) - F_tmp, c_tmp, ll_tmp = fbd_sv.forward_ls_dip_loop( - n, m, G_sv, s, e_sv, r, norm=False - ) - if ll_tmp != -np.inf: - B_tmp = fbd_sv.backward_ls_dip_loop(n, m, G_sv, s, e_sv, c_tmp, r) - self.assertAllClose(ll_sv, ll_tmp) - self.assertAllClose( - np.log10(np.sum(F_tmp * B_tmp, (0, 1))), ll_tmp * np.ones(m) - ) - - # compare sample x variants to variants x samples - self.assertAllClose(ll_vs, ll_sv) - def verify_larger(self, ts): - # variants x samples for n, m, G_vs, s, e_vs, r in self.example_parameters_genotypes_larger(ts): - e_sv = e_vs.T - G_sv = G_vs.T - - # variants x samples - F_vs, c_vs, ll_vs = fbd_vs.forwards_ls_dip( - n, m, G_vs, s, e_vs, r, norm=True - ) - B_vs = fbd_vs.backwards_ls_dip(n, m, G_vs, s, e_vs, c_vs, r) + F_vs, c_vs, ll_vs = fbd.forwards_ls_dip(n, m, G_vs, s, e_vs, r, norm=True) + B_vs = fbd.backwards_ls_dip(n, m, G_vs, s, e_vs, c_vs, r) self.assertAllClose(np.sum(F_vs * B_vs, (1, 2)), np.ones(m)) - F_tmp, c_tmp, ll_tmp = fbd_vs.forwards_ls_dip( + F_tmp, c_tmp, ll_tmp = fbd.forwards_ls_dip( n, m, G_vs, s, e_vs, r, norm=False ) if ll_tmp != -np.inf: - B_tmp = fbd_vs.backwards_ls_dip(n, m, G_vs, s, e_vs, c_tmp, r) + B_tmp = fbd.backwards_ls_dip(n, m, G_vs, s, e_vs, c_tmp, r) self.assertAllClose(ll_vs, ll_tmp) self.assertAllClose( np.log10(np.sum(F_tmp * B_tmp, (1, 2))), ll_tmp * np.ones(m) ) - F_tmp, ll_tmp = fbd_vs.forward_ls_dip_starting_point(n, m, G_vs, s, e_vs, r) + F_tmp, ll_tmp = fbd.forward_ls_dip_starting_point(n, m, G_vs, s, e_vs, r) if ll_tmp != -np.inf: - B_tmp = fbd_vs.backward_ls_dip_starting_point(n, m, G_vs, s, e_vs, r) + B_tmp = fbd.backward_ls_dip_starting_point(n, m, G_vs, s, e_vs, r) self.assertAllClose(ll_vs, ll_tmp) self.assertAllClose( np.log10(np.sum(F_tmp * B_tmp, (1, 2))), ll_tmp * np.ones(m) ) - F_tmp, c_tmp, ll_tmp = fbd_vs.forward_ls_dip_loop( + F_tmp, c_tmp, ll_tmp = 
fbd.forward_ls_dip_loop( n, m, G_vs, s, e_vs, r, norm=False ) if ll_tmp != -np.inf: - B_tmp = fbd_vs.backward_ls_dip_loop(n, m, G_vs, s, e_vs, c_tmp, r) + B_tmp = fbd.backward_ls_dip_loop(n, m, G_vs, s, e_vs, c_tmp, r) self.assertAllClose(ll_vs, ll_tmp) self.assertAllClose( np.log10(np.sum(F_tmp * B_tmp, (1, 2))), ll_tmp * np.ones(m) ) - F_tmp, c_tmp, ll_tmp = fbd_vs.forward_ls_dip_loop( + F_tmp, c_tmp, ll_tmp = fbd.forward_ls_dip_loop( n, m, G_vs, s, e_vs, r, norm=True ) - B_tmp = fbd_vs.backward_ls_dip_loop(n, m, G_vs, s, e_vs, c_tmp, r) + B_tmp = fbd.backward_ls_dip_loop(n, m, G_vs, s, e_vs, c_tmp, r) self.assertAllClose(ll_vs, ll_tmp) self.assertAllClose(np.sum(F_tmp * B_tmp, (1, 2)), np.ones(m)) - # samples x variants - - F_sv, c_sv, ll_sv = fbd_sv.forwards_ls_dip( - n, m, G_sv, s, e_sv, r, norm=True - ) - B_sv = fbd_sv.backwards_ls_dip(n, m, G_sv, s, e_sv, c_sv, r) - self.assertAllClose(np.sum(F_sv * B_sv, (0, 1)), np.ones(m)) - F_tmp, c_tmp, ll_tmp = fbd_sv.forwards_ls_dip( - n, m, G_sv, s, e_sv, r, norm=False - ) - if ll_tmp != -np.inf: - B_tmp = fbd_sv.backwards_ls_dip(n, m, G_sv, s, e_sv, c_tmp, r) - self.assertAllClose( - np.log10(np.sum(F_tmp * B_tmp, (0, 1))), ll_tmp * np.ones(m) - ) - self.assertAllClose(ll_sv, ll_tmp) - - F_tmp, ll_tmp = fbd_sv.forward_ls_dip_starting_point(n, m, G_sv, s, e_sv, r) - if ll_tmp != -np.inf: - B_tmp = fbd_sv.backward_ls_dip_starting_point(n, m, G_sv, s, e_sv, r) - self.assertAllClose(ll_sv, ll_tmp) - self.assertAllClose( - np.log10(np.sum(F_tmp * B_tmp, (0, 1))), ll_tmp * np.ones(m) - ) - - F_tmp, c_tmp, ll_tmp = fbd_sv.forward_ls_dip_loop( - n, m, G_sv, s, e_sv, r, norm=True - ) - B_tmp = fbd_sv.backward_ls_dip_loop(n, m, G_sv, s, e_sv, c_tmp, r) - self.assertAllClose(ll_sv, ll_tmp) - self.assertAllClose(np.sum(F_tmp * B_tmp, (0, 1)), np.ones(m)) - F_tmp, c_tmp, ll_tmp = fbd_sv.forward_ls_dip_loop( - n, m, G_sv, s, e_sv, r, norm=False - ) - if ll_tmp != -np.inf: - B_tmp = fbd_sv.backward_ls_dip_loop(n, m, G_sv, s, e_sv, c_tmp, r) - self.assertAllClose(ll_sv, ll_tmp) - self.assertAllClose( - np.log10(np.sum(F_tmp * B_tmp, (0, 1))), ll_tmp * np.ones(m) - ) - - # compare sample x variants to variants x samples - self.assertAllClose(ll_vs, ll_sv) - class VitAlgorithmBase(LSBase): """Base for Viterbi algorithm tests.""" @@ -516,47 +377,44 @@ class TestNonTreeViterbiHap(VitAlgorithmBase): def verify(self, ts): for n, m, H_vs, s, e_vs, r in self.example_parameters_haplotypes(ts): - e_sv = e_vs.T - H_sv = H_vs.T - # variants x samples - V_vs, P_vs, ll_vs = vh_vs.forwards_viterbi_hap_naive(n, m, H_vs, s, e_vs, r) - path_vs = vh_vs.backwards_viterbi_hap(m, V_vs[m - 1, :], P_vs) - ll_check = vh_vs.path_ll_hap(n, m, H_vs, path_vs, s, e_vs, r) + V_vs, P_vs, ll_vs = vh.forwards_viterbi_hap_naive(n, m, H_vs, s, e_vs, r) + path_vs = vh.backwards_viterbi_hap(m, V_vs[m - 1, :], P_vs) + ll_check = vh.path_ll_hap(n, m, H_vs, path_vs, s, e_vs, r) self.assertAllClose(ll_vs, ll_check) - V_tmp, P_tmp, ll_tmp = vh_vs.forwards_viterbi_hap_naive_vec( + V_tmp, P_tmp, ll_tmp = vh.forwards_viterbi_hap_naive_vec( n, m, H_vs, s, e_vs, r ) - path_tmp = vh_vs.backwards_viterbi_hap(m, V_tmp[m - 1, :], P_tmp) - ll_check = vh_vs.path_ll_hap(n, m, H_vs, path_tmp, s, e_vs, r) + path_tmp = vh.backwards_viterbi_hap(m, V_tmp[m - 1, :], P_tmp) + ll_check = vh.path_ll_hap(n, m, H_vs, path_tmp, s, e_vs, r) self.assertAllClose(ll_tmp, ll_check) self.assertAllClose(ll_vs, ll_tmp) - V_tmp, P_tmp, ll_tmp = vh_vs.forwards_viterbi_hap_naive_low_mem( + V_tmp, P_tmp, ll_tmp = 
vh.forwards_viterbi_hap_naive_low_mem( n, m, H_vs, s, e_vs, r ) - path_tmp = vh_vs.backwards_viterbi_hap(m, V_tmp, P_tmp) - ll_check = vh_vs.path_ll_hap(n, m, H_vs, path_tmp, s, e_vs, r) + path_tmp = vh.backwards_viterbi_hap(m, V_tmp, P_tmp) + ll_check = vh.path_ll_hap(n, m, H_vs, path_tmp, s, e_vs, r) self.assertAllClose(ll_tmp, ll_check) self.assertAllClose(ll_vs, ll_tmp) - V_tmp, P_tmp, ll_tmp = vh_vs.forwards_viterbi_hap_naive_low_mem_rescaling( + V_tmp, P_tmp, ll_tmp = vh.forwards_viterbi_hap_naive_low_mem_rescaling( n, m, H_vs, s, e_vs, r ) - path_tmp = vh_vs.backwards_viterbi_hap(m, V_tmp, P_tmp) - ll_check = vh_vs.path_ll_hap(n, m, H_vs, path_tmp, s, e_vs, r) + path_tmp = vh.backwards_viterbi_hap(m, V_tmp, P_tmp) + ll_check = vh.path_ll_hap(n, m, H_vs, path_tmp, s, e_vs, r) self.assertAllClose(ll_tmp, ll_check) self.assertAllClose(ll_vs, ll_tmp) - V_tmp, P_tmp, ll_tmp = vh_vs.forwards_viterbi_hap_low_mem_rescaling( + V_tmp, P_tmp, ll_tmp = vh.forwards_viterbi_hap_low_mem_rescaling( n, m, H_vs, s, e_vs, r ) - path_tmp = vh_vs.backwards_viterbi_hap(m, V_tmp, P_tmp) - ll_check = vh_vs.path_ll_hap(n, m, H_vs, path_tmp, s, e_vs, r) + path_tmp = vh.backwards_viterbi_hap(m, V_tmp, P_tmp) + ll_check = vh.path_ll_hap(n, m, H_vs, path_tmp, s, e_vs, r) self.assertAllClose(ll_tmp, ll_check) self.assertAllClose(ll_vs, ll_tmp) - V_tmp, P_tmp, ll_tmp = vh_vs.forwards_viterbi_hap_lower_mem_rescaling( + V_tmp, P_tmp, ll_tmp = vh.forwards_viterbi_hap_lower_mem_rescaling( n, m, H_vs, s, e_vs, r ) - path_tmp = vh_vs.backwards_viterbi_hap(m, V_tmp, P_tmp) - ll_check = vh_vs.path_ll_hap(n, m, H_vs, path_tmp, s, e_vs, r) + path_tmp = vh.backwards_viterbi_hap(m, V_tmp, P_tmp) + ll_check = vh.path_ll_hap(n, m, H_vs, path_tmp, s, e_vs, r) self.assertAllClose(ll_tmp, ll_check) self.assertAllClose(ll_vs, ll_tmp) @@ -565,105 +423,58 @@ def verify(self, ts): V_argmaxes_tmp, recombs, ll_tmp, - ) = vh_vs.forwards_viterbi_hap_lower_mem_rescaling_no_pointer( + ) = vh.forwards_viterbi_hap_lower_mem_rescaling_no_pointer( n, m, H_vs, s, e_vs, r ) - path_tmp = vh_vs.backwards_viterbi_hap_no_pointer( + path_tmp = vh.backwards_viterbi_hap_no_pointer( m, V_argmaxes_tmp, nb.typed.List(recombs), ) - ll_check = vh_vs.path_ll_hap(n, m, H_vs, path_tmp, s, e_vs, r) - self.assertAllClose(ll_tmp, ll_check) - self.assertAllClose(ll_vs, ll_tmp) - - # samples x variants - V_sv, P_sv, ll_sv = vh_sv.forwards_viterbi_hap_naive(n, m, H_sv, s, e_sv, r) - path_tmp = vh_sv.backwards_viterbi_hap(m, V_sv[:, m - 1], P_sv) - ll_check = vh_sv.path_ll_hap(n, m, H_sv, path_tmp, s, e_sv, r) - self.assertAllClose(ll_sv, ll_check) - V_tmp, P_tmp, ll_tmp = vh_sv.forwards_viterbi_hap_naive_vec( - n, m, H_sv, s, e_sv, r - ) - path_tmp = vh_sv.backwards_viterbi_hap(m, V_tmp[:, m - 1], P_tmp) - ll_check = vh_sv.path_ll_hap(n, m, H_sv, path_tmp, s, e_sv, r) - self.assertAllClose(ll_tmp, ll_check) - self.assertAllClose(ll_sv, ll_tmp) - V_tmp, P_tmp, ll_tmp = vh_sv.forwards_viterbi_hap_naive_low_mem( - n, m, H_sv, s, e_sv, r - ) - path_tmp = vh_sv.backwards_viterbi_hap(m, V_tmp, P_tmp) - ll_check = vh_sv.path_ll_hap(n, m, H_sv, path_tmp, s, e_sv, r) - self.assertAllClose(ll_tmp, ll_check) - self.assertAllClose(ll_sv, ll_tmp) - V_tmp, P_tmp, ll_tmp = vh_sv.forwards_viterbi_hap_naive_low_mem_rescaling( - n, m, H_sv, s, e_sv, r - ) - path_tmp = vh_sv.backwards_viterbi_hap(m, V_tmp, P_tmp) - ll_check = vh_sv.path_ll_hap(n, m, H_sv, path_tmp, s, e_sv, r) - self.assertAllClose(ll_tmp, ll_check) - self.assertAllClose(ll_sv, ll_tmp) - V_tmp, P_tmp, ll_tmp = 
vh_sv.forwards_viterbi_hap_low_mem_rescaling( - n, m, H_sv, s, e_sv, r - ) - path_tmp = vh_sv.backwards_viterbi_hap(m, V_tmp, P_tmp) - ll_check = vh_sv.path_ll_hap(n, m, H_sv, path_tmp, s, e_sv, r) - self.assertAllClose(ll_tmp, ll_check) - self.assertAllClose(ll_sv, ll_tmp) - V_tmp, P_tmp, ll_tmp = vh_sv.forwards_viterbi_hap_lower_mem_rescaling( - n, m, H_sv, s, e_sv, r - ) - path_tmp = vh_sv.backwards_viterbi_hap(m, V_tmp, P_tmp) - ll_check = vh_sv.path_ll_hap(n, m, H_sv, path_tmp, s, e_sv, r) + ll_check = vh.path_ll_hap(n, m, H_vs, path_tmp, s, e_vs, r) self.assertAllClose(ll_tmp, ll_check) self.assertAllClose(ll_vs, ll_tmp) - # samples x variants agrees with variants x samples - self.assertAllClose(ll_vs, ll_sv) - def verify_larger(self, ts): for n, m, H_vs, s, e_vs, r in self.example_parameters_haplotypes_larger(ts): - e_sv = e_vs.T - H_sv = H_vs.T - # variants x samples - V_vs, P_vs, ll_vs = vh_vs.forwards_viterbi_hap_naive(n, m, H_vs, s, e_vs, r) - path_vs = vh_vs.backwards_viterbi_hap(m, V_vs[m - 1, :], P_vs) - ll_check = vh_vs.path_ll_hap(n, m, H_vs, path_vs, s, e_vs, r) + V_vs, P_vs, ll_vs = vh.forwards_viterbi_hap_naive(n, m, H_vs, s, e_vs, r) + path_vs = vh.backwards_viterbi_hap(m, V_vs[m - 1, :], P_vs) + ll_check = vh.path_ll_hap(n, m, H_vs, path_vs, s, e_vs, r) self.assertAllClose(ll_vs, ll_check) - V_tmp, P_tmp, ll_tmp = vh_vs.forwards_viterbi_hap_naive_vec( + V_tmp, P_tmp, ll_tmp = vh.forwards_viterbi_hap_naive_vec( n, m, H_vs, s, e_vs, r ) - path_tmp = vh_vs.backwards_viterbi_hap(m, V_tmp[m - 1, :], P_tmp) - ll_check = vh_vs.path_ll_hap(n, m, H_vs, path_tmp, s, e_vs, r) + path_tmp = vh.backwards_viterbi_hap(m, V_tmp[m - 1, :], P_tmp) + ll_check = vh.path_ll_hap(n, m, H_vs, path_tmp, s, e_vs, r) self.assertAllClose(ll_tmp, ll_check) self.assertAllClose(ll_vs, ll_tmp) - V_tmp, P_tmp, ll_tmp = vh_vs.forwards_viterbi_hap_naive_low_mem( + V_tmp, P_tmp, ll_tmp = vh.forwards_viterbi_hap_naive_low_mem( n, m, H_vs, s, e_vs, r ) - path_tmp = vh_vs.backwards_viterbi_hap(m, V_tmp, P_tmp) - ll_check = vh_vs.path_ll_hap(n, m, H_vs, path_tmp, s, e_vs, r) + path_tmp = vh.backwards_viterbi_hap(m, V_tmp, P_tmp) + ll_check = vh.path_ll_hap(n, m, H_vs, path_tmp, s, e_vs, r) self.assertAllClose(ll_tmp, ll_check) self.assertAllClose(ll_vs, ll_tmp) - V_tmp, P_tmp, ll_tmp = vh_vs.forwards_viterbi_hap_naive_low_mem_rescaling( + V_tmp, P_tmp, ll_tmp = vh.forwards_viterbi_hap_naive_low_mem_rescaling( n, m, H_vs, s, e_vs, r ) - path_tmp = vh_vs.backwards_viterbi_hap(m, V_tmp, P_tmp) - ll_check = vh_vs.path_ll_hap(n, m, H_vs, path_tmp, s, e_vs, r) + path_tmp = vh.backwards_viterbi_hap(m, V_tmp, P_tmp) + ll_check = vh.path_ll_hap(n, m, H_vs, path_tmp, s, e_vs, r) self.assertAllClose(ll_tmp, ll_check) self.assertAllClose(ll_vs, ll_tmp) - V_tmp, P_tmp, ll_tmp = vh_vs.forwards_viterbi_hap_low_mem_rescaling( + V_tmp, P_tmp, ll_tmp = vh.forwards_viterbi_hap_low_mem_rescaling( n, m, H_vs, s, e_vs, r ) - path_tmp = vh_vs.backwards_viterbi_hap(m, V_tmp, P_tmp) - ll_check = vh_vs.path_ll_hap(n, m, H_vs, path_tmp, s, e_vs, r) + path_tmp = vh.backwards_viterbi_hap(m, V_tmp, P_tmp) + ll_check = vh.path_ll_hap(n, m, H_vs, path_tmp, s, e_vs, r) self.assertAllClose(ll_tmp, ll_check) self.assertAllClose(ll_vs, ll_tmp) - V_tmp, P_tmp, ll_tmp = vh_vs.forwards_viterbi_hap_lower_mem_rescaling( + V_tmp, P_tmp, ll_tmp = vh.forwards_viterbi_hap_lower_mem_rescaling( n, m, H_vs, s, e_vs, r ) - path_tmp = vh_vs.backwards_viterbi_hap(m, V_tmp, P_tmp) - ll_check = vh_vs.path_ll_hap(n, m, H_vs, path_tmp, s, e_vs, r) + path_tmp = 
vh.backwards_viterbi_hap(m, V_tmp, P_tmp) + ll_check = vh.path_ll_hap(n, m, H_vs, path_tmp, s, e_vs, r) self.assertAllClose(ll_tmp, ll_check) self.assertAllClose(ll_vs, ll_tmp) @@ -672,91 +483,44 @@ def verify_larger(self, ts): V_argmaxes_tmp, recombs, ll_tmp, - ) = vh_vs.forwards_viterbi_hap_lower_mem_rescaling_no_pointer( + ) = vh.forwards_viterbi_hap_lower_mem_rescaling_no_pointer( n, m, H_vs, s, e_vs, r ) - path_tmp = vh_vs.backwards_viterbi_hap_no_pointer( + path_tmp = vh.backwards_viterbi_hap_no_pointer( m, V_argmaxes_tmp, nb.typed.List(recombs) ) - ll_check = vh_vs.path_ll_hap(n, m, H_vs, path_tmp, s, e_vs, r) - self.assertAllClose(ll_tmp, ll_check) - self.assertAllClose(ll_vs, ll_tmp) - - # samples x variants - V_sv, P_sv, ll_sv = vh_sv.forwards_viterbi_hap_naive(n, m, H_sv, s, e_sv, r) - path_tmp = vh_sv.backwards_viterbi_hap(m, V_sv[:, m - 1], P_sv) - ll_check = vh_sv.path_ll_hap(n, m, H_sv, path_tmp, s, e_sv, r) - self.assertAllClose(ll_sv, ll_check) - V_tmp, P_tmp, ll_tmp = vh_sv.forwards_viterbi_hap_naive_vec( - n, m, H_sv, s, e_sv, r - ) - path_tmp = vh_sv.backwards_viterbi_hap(m, V_tmp[:, m - 1], P_tmp) - ll_check = vh_sv.path_ll_hap(n, m, H_sv, path_tmp, s, e_sv, r) - self.assertAllClose(ll_tmp, ll_check) - self.assertAllClose(ll_sv, ll_tmp) - V_tmp, P_tmp, ll_tmp = vh_sv.forwards_viterbi_hap_naive_low_mem( - n, m, H_sv, s, e_sv, r - ) - path_tmp = vh_sv.backwards_viterbi_hap(m, V_tmp, P_tmp) - ll_check = vh_sv.path_ll_hap(n, m, H_sv, path_tmp, s, e_sv, r) - self.assertAllClose(ll_tmp, ll_check) - self.assertAllClose(ll_sv, ll_tmp) - V_tmp, P_tmp, ll_tmp = vh_sv.forwards_viterbi_hap_naive_low_mem_rescaling( - n, m, H_sv, s, e_sv, r - ) - path_tmp = vh_sv.backwards_viterbi_hap(m, V_tmp, P_tmp) - ll_check = vh_sv.path_ll_hap(n, m, H_sv, path_tmp, s, e_sv, r) - self.assertAllClose(ll_tmp, ll_check) - self.assertAllClose(ll_sv, ll_tmp) - V_tmp, P_tmp, ll_tmp = vh_sv.forwards_viterbi_hap_low_mem_rescaling( - n, m, H_sv, s, e_sv, r - ) - path_tmp = vh_sv.backwards_viterbi_hap(m, V_tmp, P_tmp) - ll_check = vh_sv.path_ll_hap(n, m, H_sv, path_tmp, s, e_sv, r) - self.assertAllClose(ll_tmp, ll_check) - self.assertAllClose(ll_sv, ll_tmp) - V_tmp, P_tmp, ll_tmp = vh_sv.forwards_viterbi_hap_lower_mem_rescaling( - n, m, H_sv, s, e_sv, r - ) - path_tmp = vh_sv.backwards_viterbi_hap(m, V_tmp, P_tmp) - ll_check = vh_sv.path_ll_hap(n, m, H_sv, path_tmp, s, e_sv, r) + ll_check = vh.path_ll_hap(n, m, H_vs, path_tmp, s, e_vs, r) self.assertAllClose(ll_tmp, ll_check) self.assertAllClose(ll_vs, ll_tmp) - # samples x variants agrees with variants x samples - self.assertAllClose(ll_vs, ll_sv) - class TestNonTreeViterbiDip(VitAlgorithmBase): """Test that we have the same log-likelihood across all implementations""" def verify(self, ts): for n, m, G_vs, s, e_vs, r in self.example_parameters_genotypes(ts): - e_sv = e_vs.T - G_sv = G_vs.T - # variants x samples - V_vs, P_vs, ll_vs = vd_vs.forwards_viterbi_dip_naive(n, m, G_vs, s, e_vs, r) - path_vs = vd_vs.backwards_viterbi_dip(m, V_vs[m - 1, :, :], P_vs) - phased_path_vs = vd_vs.get_phased_path(n, path_vs) - path_ll_vs = vd_vs.path_ll_dip(n, m, G_vs, phased_path_vs, s, e_vs, r) + V_vs, P_vs, ll_vs = vd.forwards_viterbi_dip_naive(n, m, G_vs, s, e_vs, r) + path_vs = vd.backwards_viterbi_dip(m, V_vs[m - 1, :, :], P_vs) + phased_path_vs = vd.get_phased_path(n, path_vs) + path_ll_vs = vd.path_ll_dip(n, m, G_vs, phased_path_vs, s, e_vs, r) self.assertAllClose(ll_vs, path_ll_vs) - V_tmp, P_tmp, ll_tmp = vd_vs.forwards_viterbi_dip_naive_low_mem( + V_tmp, 
P_tmp, ll_tmp = vd.forwards_viterbi_dip_naive_low_mem( n, m, G_vs, s, e_vs, r ) - path_tmp = vd_vs.backwards_viterbi_dip(m, V_tmp, P_tmp) - phased_path_tmp = vd_vs.get_phased_path(n, path_tmp) - path_ll_tmp = vd_vs.path_ll_dip(n, m, G_vs, phased_path_tmp, s, e_vs, r) + path_tmp = vd.backwards_viterbi_dip(m, V_tmp, P_tmp) + phased_path_tmp = vd.get_phased_path(n, path_tmp) + path_ll_tmp = vd.path_ll_dip(n, m, G_vs, phased_path_tmp, s, e_vs, r) self.assertAllClose(ll_tmp, path_ll_tmp) self.assertAllClose(ll_vs, ll_tmp) - V_tmp, P_tmp, ll_tmp = vd_vs.forwards_viterbi_dip_low_mem( + V_tmp, P_tmp, ll_tmp = vd.forwards_viterbi_dip_low_mem( n, m, G_vs, s, e_vs, r ) - path_tmp = vd_vs.backwards_viterbi_dip(m, V_tmp, P_tmp) - phased_path_tmp = vd_vs.get_phased_path(n, path_tmp) - path_ll_tmp = vd_vs.path_ll_dip(n, m, G_vs, phased_path_tmp, s, e_vs, r) + path_tmp = vd.backwards_viterbi_dip(m, V_tmp, P_tmp) + phased_path_tmp = vd.get_phased_path(n, path_tmp) + path_ll_tmp = vd.path_ll_dip(n, m, G_vs, phased_path_tmp, s, e_vs, r) self.assertAllClose(ll_tmp, path_ll_tmp) self.assertAllClose(ll_vs, ll_tmp) @@ -767,10 +531,9 @@ def verify(self, ts): V_rowcol_argmaxes_tmp, recombs_single, recombs_double, - # P_tmp, ll_tmp, - ) = vd_vs.forwards_viterbi_dip_low_mem_no_pointer(n, m, G_vs, s, e_vs, r) - path_tmp = vd_vs.backwards_viterbi_dip_no_pointer( + ) = vd.forwards_viterbi_dip_low_mem_no_pointer(n, m, G_vs, s, e_vs, r) + path_tmp = vd.backwards_viterbi_dip_no_pointer( m, V_argmaxes_tmp, V_rowcol_maxes_tmp, @@ -779,85 +542,44 @@ def verify(self, ts): nb.typed.List(recombs_double), V_tmp, ) - phased_path_tmp = vd_vs.get_phased_path(n, path_tmp) - path_ll_tmp = vd_vs.path_ll_dip(n, m, G_vs, phased_path_tmp, s, e_vs, r) + phased_path_tmp = vd.get_phased_path(n, path_tmp) + path_ll_tmp = vd.path_ll_dip(n, m, G_vs, phased_path_tmp, s, e_vs, r) self.assertAllClose(ll_tmp, path_ll_tmp) self.assertAllClose(ll_vs, ll_tmp) - V_tmp, P_tmp, ll_tmp = vd_vs.forwards_viterbi_dip_naive_vec( + V_tmp, P_tmp, ll_tmp = vd.forwards_viterbi_dip_naive_vec( n, m, G_vs, s, e_vs, r ) - path_tmp = vd_vs.backwards_viterbi_dip(m, V_tmp[m - 1, :, :], P_tmp) - phased_path_tmp = vd_vs.get_phased_path(n, path_tmp) - path_ll_tmp = vd_vs.path_ll_dip(n, m, G_vs, phased_path_tmp, s, e_vs, r) + path_tmp = vd.backwards_viterbi_dip(m, V_tmp[m - 1, :, :], P_tmp) + phased_path_tmp = vd.get_phased_path(n, path_tmp) + path_ll_tmp = vd.path_ll_dip(n, m, G_vs, phased_path_tmp, s, e_vs, r) self.assertAllClose(ll_tmp, path_ll_tmp) self.assertAllClose(ll_vs, ll_tmp) - # samples x variants - V_sv, P_sv, ll_sv = vd_sv.forwards_viterbi_dip_naive(n, m, G_sv, s, e_sv, r) - path_sv = vd_sv.backwards_viterbi_dip(m, V_sv[:, :, m - 1], P_sv) - phased_path_sv = vd_sv.get_phased_path(n, path_sv) - path_ll_sv = vd_sv.path_ll_dip(n, m, G_sv, phased_path_sv, s, e_sv, r) - self.assertAllClose(ll_sv, path_ll_sv) - - V_tmp, P_tmp, ll_tmp = vd_sv.forwards_viterbi_dip_naive_low_mem( - n, m, G_sv, s, e_sv, r - ) - path_tmp = vd_sv.backwards_viterbi_dip(m, V_tmp, P_tmp) - phased_path_tmp = vd_sv.get_phased_path(n, path_tmp) - path_ll_tmp = vd_sv.path_ll_dip(n, m, G_sv, phased_path_tmp, s, e_sv, r) - self.assertAllClose(ll_tmp, path_ll_tmp) - self.assertAllClose(ll_sv, ll_tmp) - - V_tmp, P_tmp, ll_tmp = vd_sv.forwards_viterbi_dip_low_mem( - n, m, G_sv, s, e_sv, r - ) - path_tmp = vd_sv.backwards_viterbi_dip(m, V_tmp, P_tmp) - phased_path_tmp = vd_sv.get_phased_path(n, path_tmp) - path_ll_tmp = vd_sv.path_ll_dip(n, m, G_sv, phased_path_tmp, s, e_sv, r) - 
self.assertAllClose(ll_tmp, path_ll_tmp) - self.assertAllClose(ll_sv, ll_tmp) - - V_tmp, P_tmp, ll_tmp = vd_sv.forwards_viterbi_dip_naive_vec( - n, m, G_sv, s, e_sv, r - ) - path_tmp = vd_sv.backwards_viterbi_dip(m, V_tmp[:, :, m - 1], P_tmp) - phased_path_tmp = vd_sv.get_phased_path(n, path_tmp) - path_ll_tmp = vd_sv.path_ll_dip(n, m, G_sv, phased_path_tmp, s, e_sv, r) - self.assertAllClose(ll_tmp, path_ll_tmp) - self.assertAllClose(ll_sv, ll_tmp) - self.assertAllClose(path_ll_sv, path_ll_vs) - - # samples x variants agrees with variants x samples - self.assertAllClose(ll_vs, ll_sv) - def verify_larger(self, ts): for n, m, G_vs, s, e_vs, r in self.example_parameters_genotypes_larger(ts): - e_sv = e_vs.T - G_sv = G_vs.T - # variants x samples - V_vs, P_vs, ll_vs = vd_vs.forwards_viterbi_dip_naive(n, m, G_vs, s, e_vs, r) - path_vs = vd_vs.backwards_viterbi_dip(m, V_vs[m - 1, :, :], P_vs) - phased_path_vs = vd_vs.get_phased_path(n, path_vs) - path_ll_vs = vd_vs.path_ll_dip(n, m, G_vs, phased_path_vs, s, e_vs, r) + V_vs, P_vs, ll_vs = vd.forwards_viterbi_dip_naive(n, m, G_vs, s, e_vs, r) + path_vs = vd.backwards_viterbi_dip(m, V_vs[m - 1, :, :], P_vs) + phased_path_vs = vd.get_phased_path(n, path_vs) + path_ll_vs = vd.path_ll_dip(n, m, G_vs, phased_path_vs, s, e_vs, r) self.assertAllClose(ll_vs, path_ll_vs) - V_tmp, P_tmp, ll_tmp = vd_vs.forwards_viterbi_dip_naive_low_mem( + V_tmp, P_tmp, ll_tmp = vd.forwards_viterbi_dip_naive_low_mem( n, m, G_vs, s, e_vs, r ) - path_tmp = vd_vs.backwards_viterbi_dip(m, V_tmp, P_tmp) - phased_path_tmp = vd_vs.get_phased_path(n, path_tmp) - path_ll_tmp = vd_vs.path_ll_dip(n, m, G_vs, phased_path_tmp, s, e_vs, r) + path_tmp = vd.backwards_viterbi_dip(m, V_tmp, P_tmp) + phased_path_tmp = vd.get_phased_path(n, path_tmp) + path_ll_tmp = vd.path_ll_dip(n, m, G_vs, phased_path_tmp, s, e_vs, r) self.assertAllClose(ll_tmp, path_ll_tmp) self.assertAllClose(ll_vs, ll_tmp) - V_tmp, P_tmp, ll_tmp = vd_vs.forwards_viterbi_dip_low_mem( + V_tmp, P_tmp, ll_tmp = vd.forwards_viterbi_dip_low_mem( n, m, G_vs, s, e_vs, r ) - path_tmp = vd_vs.backwards_viterbi_dip(m, V_tmp, P_tmp) - phased_path_tmp = vd_vs.get_phased_path(n, path_tmp) - path_ll_tmp = vd_vs.path_ll_dip(n, m, G_vs, phased_path_tmp, s, e_vs, r) + path_tmp = vd.backwards_viterbi_dip(m, V_tmp, P_tmp) + phased_path_tmp = vd.get_phased_path(n, path_tmp) + path_ll_tmp = vd.path_ll_dip(n, m, G_vs, phased_path_tmp, s, e_vs, r) self.assertAllClose(ll_tmp, path_ll_tmp) self.assertAllClose(ll_vs, ll_tmp) @@ -869,8 +591,8 @@ def verify_larger(self, ts): recombs_single, recombs_double, ll_tmp, - ) = vd_vs.forwards_viterbi_dip_low_mem_no_pointer(n, m, G_vs, s, e_vs, r) - path_tmp = vd_vs.backwards_viterbi_dip_no_pointer( + ) = vd.forwards_viterbi_dip_low_mem_no_pointer(n, m, G_vs, s, e_vs, r) + path_tmp = vd.backwards_viterbi_dip_no_pointer( m, V_argmaxes_tmp, V_rowcol_maxes_tmp, @@ -879,53 +601,16 @@ def verify_larger(self, ts): nb.typed.List(recombs_double), V_tmp, ) - phased_path_tmp = vd_vs.get_phased_path(n, path_tmp) - path_ll_tmp = vd_vs.path_ll_dip(n, m, G_vs, phased_path_tmp, s, e_vs, r) + phased_path_tmp = vd.get_phased_path(n, path_tmp) + path_ll_tmp = vd.path_ll_dip(n, m, G_vs, phased_path_tmp, s, e_vs, r) self.assertAllClose(ll_tmp, path_ll_tmp) self.assertAllClose(ll_vs, ll_tmp) - V_tmp, P_tmp, ll_tmp = vd_vs.forwards_viterbi_dip_naive_vec( + V_tmp, P_tmp, ll_tmp = vd.forwards_viterbi_dip_naive_vec( n, m, G_vs, s, e_vs, r ) - path_tmp = vd_vs.backwards_viterbi_dip(m, V_tmp[m - 1, :, :], P_tmp) - phased_path_tmp = 
vd_vs.get_phased_path(n, path_tmp) - path_ll_tmp = vd_vs.path_ll_dip(n, m, G_vs, phased_path_tmp, s, e_vs, r) + path_tmp = vd.backwards_viterbi_dip(m, V_tmp[m - 1, :, :], P_tmp) + phased_path_tmp = vd.get_phased_path(n, path_tmp) + path_ll_tmp = vd.path_ll_dip(n, m, G_vs, phased_path_tmp, s, e_vs, r) self.assertAllClose(ll_tmp, path_ll_tmp) self.assertAllClose(ll_vs, ll_tmp) - - # samples x variants - V_sv, P_sv, ll_sv = vd_sv.forwards_viterbi_dip_naive(n, m, G_sv, s, e_sv, r) - path_sv = vd_sv.backwards_viterbi_dip(m, V_sv[:, :, m - 1], P_sv) - phased_path_sv = vd_sv.get_phased_path(n, path_sv) - path_ll_sv = vd_sv.path_ll_dip(n, m, G_sv, phased_path_sv, s, e_sv, r) - self.assertAllClose(ll_sv, path_ll_sv) - - V_tmp, P_tmp, ll_tmp = vd_sv.forwards_viterbi_dip_naive_low_mem( - n, m, G_sv, s, e_sv, r - ) - path_tmp = vd_sv.backwards_viterbi_dip(m, V_tmp, P_tmp) - phased_path_tmp = vd_sv.get_phased_path(n, path_tmp) - path_ll_tmp = vd_sv.path_ll_dip(n, m, G_sv, phased_path_tmp, s, e_sv, r) - self.assertAllClose(ll_tmp, path_ll_tmp) - self.assertAllClose(ll_sv, ll_tmp) - - V_tmp, P_tmp, ll_tmp = vd_sv.forwards_viterbi_dip_low_mem( - n, m, G_sv, s, e_sv, r - ) - path_tmp = vd_sv.backwards_viterbi_dip(m, V_tmp, P_tmp) - phased_path_tmp = vd_sv.get_phased_path(n, path_tmp) - path_ll_tmp = vd_sv.path_ll_dip(n, m, G_sv, phased_path_tmp, s, e_sv, r) - self.assertAllClose(ll_tmp, path_ll_tmp) - self.assertAllClose(ll_sv, ll_tmp) - - V_tmp, P_tmp, ll_tmp = vd_sv.forwards_viterbi_dip_naive_vec( - n, m, G_sv, s, e_sv, r - ) - path_tmp = vd_sv.backwards_viterbi_dip(m, V_tmp[:, :, m - 1], P_tmp) - phased_path_tmp = vd_sv.get_phased_path(n, path_tmp) - path_ll_tmp = vd_sv.path_ll_dip(n, m, G_sv, phased_path_tmp, s, e_sv, r) - self.assertAllClose(ll_tmp, path_ll_tmp) - self.assertAllClose(ll_sv, ll_tmp) - - # samples x variants agrees with variants x samples - self.assertAllClose(ll_vs, ll_sv)
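
Note: after this rename, all call sites go through the flat modules (lshmm.vit_haploid, lshmm.vit_diploid, lshmm.forward_backward.fb_haploid, lshmm.forward_backward.fb_diploid), and only the variants x samples orientation survives. Below is a minimal round-trip sketch against the renamed haploid Viterbi module. The module path, function names, and signatures are taken from this diff; the toy panel, the mutation rate mu, and the recombination probabilities r are illustrative values invented for the sketch, not taken from the test suite.

    import numpy as np
    import lshmm.vit_haploid as vh

    rng = np.random.default_rng(42)
    n, m = 8, 20  # panel haplotypes, variant sites
    H = rng.integers(0, 2, size=(m, n))   # reference panel, variants x samples
    s = H[:, 0].copy().reshape(1, m)      # query haplotype (here: a copy of panel column 0)

    mu = 0.01                  # illustrative per-site mutation probability
    e = np.empty((m, 2))
    e[:, 0] = mu               # emission probability on mismatch
    e[:, 1] = 1 - mu           # emission probability on match
    r = np.full(m, 0.05)       # illustrative per-site recombination probability
    r[0] = 0                   # r[0] is never read; transitions start at site 1

    # Forward pass (most memory-frugal pointer-based variant), then backtrace
    # through the (m, n) pointer matrix P.
    V, P, ll = vh.forwards_viterbi_hap_lower_mem_rescaling(n, m, H, s, e, r)
    path = vh.backwards_viterbi_hap(m, V, P)

    # The log-likelihood of the decoded path should equal the Viterbi log-likelihood,
    # which is exactly what the tests above assert.
    assert np.isclose(ll, vh.path_ll_hap(n, m, H, path, s, e, r))

The _no_pointer variants trade the (m, n) pointer matrix for per-site lists of recombinant candidates; as in the tests above, those lists should be wrapped in nb.typed.List before being passed to backwards_viterbi_hap_no_pointer.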