From 4926a250136e8f9cf77f3c4192fe2f3dc0b1b1ad Mon Sep 17 00:00:00 2001 From: astheeggeggs Date: Wed, 31 Aug 2022 16:57:54 +0100 Subject: [PATCH] added forwards backwards testing Added forwards backwards testing and now include missingness appropriately added missingness to diploid LS added some fixes for flake errors added missingness to diploid viterbi changed test_genotype_matching_fb.py remove stray print removed caps for bool EQUAL_BOTH_HOM etc Removed caps for EQUAL_BOTH_HOM etc in Viterbi removed unused imported function --- python/tests/test_genotype_matching_fb.py | 128 ++++++++++++------ .../tests/test_genotype_matching_viterbi.py | 74 ++++++---- 2 files changed, 133 insertions(+), 69 deletions(-) diff --git a/python/tests/test_genotype_matching_fb.py b/python/tests/test_genotype_matching_fb.py index 984b3ce13a..248382e913 100644 --- a/python/tests/test_genotype_matching_fb.py +++ b/python/tests/test_genotype_matching_fb.py @@ -1,4 +1,3 @@ -# Simulation import copy import itertools @@ -14,6 +13,8 @@ REF_HOM_OBS_HET = 1 REF_HET_OBS_HOM = 2 +MISSING = -1 + def mirror_coordinates(ts): """ @@ -411,6 +412,7 @@ def update_probabilities(self, site, genotype_state): ] query_is_het = genotype_state == 1 + query_is_missing = genotype_state == MISSING for st1 in T: u1 = st1.tree_node @@ -444,6 +446,7 @@ def update_probabilities(self, site, genotype_state): match, template_is_het, query_is_het, + query_is_missing, ) # This will ensure that allelic_state[:n] is filled @@ -561,7 +564,14 @@ def compute_normalisation_factor_dict(self): raise NotImplementedError() def compute_next_probability_dict( - self, site_id, p_last, inner_summation, is_match, template_is_het, query_is_het + self, + site_id, + p_last, + inner_summation, + is_match, + template_is_het, + query_is_het, + query_is_missing, ): raise NotImplementedError() @@ -670,41 +680,45 @@ def compute_next_probability_dict( is_match, template_is_het, query_is_het, + query_is_missing, ): rho = self.rho[site_id] mu = self.mu[site_id] n = self.ts.num_samples - template_is_hom = np.logical_not(template_is_het) - query_is_hom = np.logical_not(query_is_het) - - EQUAL_BOTH_HOM = np.logical_and( - np.logical_and(is_match, template_is_hom), query_is_hom - ) - UNEQUAL_BOTH_HOM = np.logical_and( - np.logical_and(np.logical_not(is_match), template_is_hom), query_is_hom - ) - BOTH_HET = np.logical_and(template_is_het, query_is_het) - REF_HOM_OBS_HET = np.logical_and(template_is_hom, query_is_het) - REF_HET_OBS_HOM = np.logical_and(template_is_het, query_is_hom) - p_t = ( (rho / n) ** 2 + ((1 - rho) * (rho / n)) * inner_normalisation_factor + (1 - rho) ** 2 * p_last ) - p_e = ( - EQUAL_BOTH_HOM * (1 - mu) ** 2 - + UNEQUAL_BOTH_HOM * (mu**2) - + REF_HOM_OBS_HET * (2 * mu * (1 - mu)) - + REF_HET_OBS_HOM * (mu * (1 - mu)) - + BOTH_HET * ((1 - mu) ** 2 + mu**2) - ) + + if query_is_missing: + p_e = 1 + else: + query_is_hom = np.logical_not(query_is_het) + template_is_hom = np.logical_not(template_is_het) + + equal_both_hom = np.logical_and( + np.logical_and(is_match, template_is_hom), query_is_hom + ) + unequal_both_hom = np.logical_and( + np.logical_and(np.logical_not(is_match), template_is_hom), query_is_hom + ) + both_het = np.logical_and(template_is_het, query_is_het) + ref_hom_obs_het = np.logical_and(template_is_hom, query_is_het) + ref_het_obs_hom = np.logical_and(template_is_het, query_is_hom) + + p_e = ( + equal_both_hom * (1 - mu) ** 2 + + unequal_both_hom * (mu**2) + + ref_hom_obs_het * (2 * mu * (1 - mu)) + + ref_het_obs_hom * (mu * (1 - mu)) + + both_het * ((1 - mu) ** 2 + mu**2) + ) return p_t * p_e -# DEV: Sort this class BackwardAlgorithm(LsHmmAlgorithm): """Runs the Li and Stephens forward algorithm.""" @@ -737,29 +751,35 @@ def compute_next_probability_dict( is_match, template_is_het, query_is_het, + query_is_missing, ): mu = self.mu[site_id] template_is_hom = np.logical_not(template_is_het) - query_is_hom = np.logical_not(query_is_het) - EQUAL_BOTH_HOM = np.logical_and( - np.logical_and(is_match, template_is_hom), query_is_hom - ) - UNEQUAL_BOTH_HOM = np.logical_and( - np.logical_and(np.logical_not(is_match), template_is_hom), query_is_hom - ) - BOTH_HET = np.logical_and(template_is_het, query_is_het) - REF_HOM_OBS_HET = np.logical_and(template_is_hom, query_is_het) - REF_HET_OBS_HOM = np.logical_and(template_is_het, query_is_hom) - - p_e = ( - EQUAL_BOTH_HOM * (1 - mu) ** 2 - + UNEQUAL_BOTH_HOM * (mu**2) - + REF_HOM_OBS_HET * (2 * mu * (1 - mu)) - + REF_HET_OBS_HOM * (mu * (1 - mu)) - + BOTH_HET * ((1 - mu) ** 2 + mu**2) - ) + if query_is_missing: + p_e = 1 + else: + query_is_hom = np.logical_not(query_is_het) + + equal_both_hom = np.logical_and( + np.logical_and(is_match, template_is_hom), query_is_hom + ) + unequal_both_hom = np.logical_and( + np.logical_and(np.logical_not(is_match), template_is_hom), query_is_hom + ) + both_het = np.logical_and(template_is_het, query_is_het) + ref_hom_obs_het = np.logical_and(template_is_hom, query_is_het) + ref_het_obs_hom = np.logical_and(template_is_het, query_is_hom) + + p_e = ( + equal_both_hom * (1 - mu) ** 2 + + unequal_both_hom * (mu**2) + + ref_hom_obs_het * (2 * mu * (1 - mu)) + + ref_het_obs_hom * (mu * (1 - mu)) + + both_het * ((1 - mu) ** 2 + mu**2) + ) + return p_next * p_e @@ -797,6 +817,21 @@ def example_genotypes(self, ts): s = H[:, 0].reshape(1, H.shape[0]) + H[:, 1].reshape(1, H.shape[0]) H = H[:, 2:] + genotypes = [ + s, + H[:, -1].reshape(1, H.shape[0]) + H[:, -2].reshape(1, H.shape[0]), + ] + + s_tmp = s.copy() + s_tmp[0, -1] = MISSING + genotypes.append(s_tmp) + s_tmp = s.copy() + s_tmp[0, ts.num_sites // 2] = MISSING + genotypes.append(s_tmp) + s_tmp = s.copy() + s_tmp[0, :] = MISSING + genotypes.append(s_tmp) + m = ts.get_num_sites() n = H.shape[1] @@ -804,11 +839,11 @@ def example_genotypes(self, ts): for i in range(m): G[i, :, :] = np.add.outer(H[i, :], H[i, :]) - return H, G, s + return H, G, genotypes def example_parameters_genotypes(self, ts, seed=42): np.random.seed(seed) - H, G, s = self.example_genotypes(ts) + H, G, genotypes = self.example_genotypes(ts) n = H.shape[1] m = ts.get_num_sites() @@ -819,13 +854,16 @@ def example_parameters_genotypes(self, ts, seed=42): e = self.genotype_emission(mu, m) - yield n, m, G, s, e, r, mu + for s in genotypes: + yield n, m, G, s, e, r, mu # Mixture of random and extremes rs = [np.zeros(m) + 0.999, np.zeros(m) + 1e-6, np.random.rand(m)] mus = [np.zeros(m) + 0.33, np.zeros(m) + 1e-6, np.random.rand(m) * 0.33] - for r, mu in itertools.product(rs, mus): + e = self.genotype_emission(mu, m) + + for s, r, mu in itertools.product(genotypes, rs, mus): r[0] = 0 e = self.genotype_emission(mu, m) yield n, m, G, s, e, r, mu diff --git a/python/tests/test_genotype_matching_viterbi.py b/python/tests/test_genotype_matching_viterbi.py index 89377bdb33..acab5d1c28 100644 --- a/python/tests/test_genotype_matching_viterbi.py +++ b/python/tests/test_genotype_matching_viterbi.py @@ -13,6 +13,8 @@ REF_HOM_OBS_HET = 1 REF_HET_OBS_HOM = 2 +MISSING = -1 + class ValueTransition: """Simple struct holding value transition values.""" @@ -390,6 +392,7 @@ def update_probabilities(self, site, genotype_state): ] query_is_het = genotype_state == 1 + query_is_missing = genotype_state == MISSING for st1 in T: u1 = st1.tree_node @@ -423,6 +426,7 @@ def update_probabilities(self, site, genotype_state): match, template_is_het, query_is_het, + query_is_missing, u1, u2, ) @@ -486,6 +490,7 @@ def compute_next_probability_dict( is_match, template_is_het, query_is_het, + query_is_missing, node_1, node_2, ): @@ -830,6 +835,7 @@ def compute_next_probability_dict( is_match, template_is_het, query_is_het, + query_is_missing, node_1, node_2, ): @@ -841,26 +847,28 @@ def compute_next_probability_dict( double_recombination_required = False single_recombination_required = False - template_is_hom = np.logical_not(template_is_het) - query_is_hom = np.logical_not(query_is_het) - - EQUAL_BOTH_HOM = np.logical_and( - np.logical_and(is_match, template_is_hom), query_is_hom - ) - UNEQUAL_BOTH_HOM = np.logical_and( - np.logical_and(np.logical_not(is_match), template_is_hom), query_is_hom - ) - BOTH_HET = np.logical_and(template_is_het, query_is_het) - REF_HOM_OBS_HET = np.logical_and(template_is_hom, query_is_het) - REF_HET_OBS_HOM = np.logical_and(template_is_het, query_is_hom) - - p_e = ( - EQUAL_BOTH_HOM * (1 - mu) ** 2 - + UNEQUAL_BOTH_HOM * (mu**2) - + REF_HOM_OBS_HET * (2 * mu * (1 - mu)) - + REF_HET_OBS_HOM * (mu * (1 - mu)) - + BOTH_HET * ((1 - mu) ** 2 + mu**2) - ) + if query_is_missing: + p_e = 1 + else: + template_is_hom = np.logical_not(template_is_het) + query_is_hom = np.logical_not(query_is_het) + equal_both_hom = np.logical_and( + np.logical_and(is_match, template_is_hom), query_is_hom + ) + unequal_both_hom = np.logical_and( + np.logical_and(np.logical_not(is_match), template_is_hom), query_is_hom + ) + both_het = np.logical_and(template_is_het, query_is_het) + ref_hom_obs_het = np.logical_and(template_is_hom, query_is_het) + ref_het_obs_hom = np.logical_and(template_is_het, query_is_hom) + + p_e = ( + equal_both_hom * (1 - mu) ** 2 + + unequal_both_hom * (mu**2) + + ref_hom_obs_het * (2 * mu * (1 - mu)) + + ref_het_obs_hom * (mu * (1 - mu)) + + both_het * ((1 - mu) ** 2 + mu**2) + ) no_switch = (1 - r) ** 2 + 2 * (r_n * (1 - r)) + r_n**2 single_switch = r_n * (1 - r) + r_n**2 @@ -919,6 +927,21 @@ def example_genotypes(self, ts): s = H[:, 0].reshape(1, H.shape[0]) + H[:, 1].reshape(1, H.shape[0]) H = H[:, 2:] + genotypes = [ + s, + H[:, -1].reshape(1, H.shape[0]) + H[:, -2].reshape(1, H.shape[0]), + ] + + s_tmp = s.copy() + s_tmp[0, -1] = MISSING + genotypes.append(s_tmp) + s_tmp = s.copy() + s_tmp[0, ts.num_sites // 2] = MISSING + genotypes.append(s_tmp) + s_tmp = s.copy() + s_tmp[0, :] = MISSING + genotypes.append(s_tmp) + m = ts.get_num_sites() n = H.shape[1] @@ -926,11 +949,11 @@ def example_genotypes(self, ts): for i in range(m): G[i, :, :] = np.add.outer(H[i, :], H[i, :]) - return H, G, s + return H, G, genotypes def example_parameters_genotypes(self, ts, seed=42): np.random.seed(seed) - H, G, s = self.example_genotypes(ts) + H, G, genotypes = self.example_genotypes(ts) n = H.shape[1] m = ts.get_num_sites() @@ -941,13 +964,16 @@ def example_parameters_genotypes(self, ts, seed=42): e = self.genotype_emission(mu, m) - yield n, m, G, s, e, r, mu + for s in genotypes: + yield n, m, G, s, e, r, mu # Mixture of random and extremes rs = [np.zeros(m) + 0.999, np.zeros(m) + 1e-6, np.random.rand(m)] mus = [np.zeros(m) + 0.33, np.zeros(m) + 1e-6, np.random.rand(m) * 0.33] - for r, mu in itertools.product(rs, mus): + e = self.genotype_emission(mu, m) + + for s, r, mu in itertools.product(genotypes, rs, mus): r[0] = 0 e = self.genotype_emission(mu, m) yield n, m, G, s, e, r, mu