-
Notifications
You must be signed in to change notification settings - Fork 74
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add tests to check LS HMM of tskit compared to BEAGLE
- Loading branch information
Showing
1 changed file
with
251 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,251 @@ | ||
""" | ||
Tests for genotype imputation (forward and Baum-Welsh algorithms). | ||
""" | ||
import io | ||
|
||
import numpy as np | ||
|
||
import tskit | ||
|
||
|
||
# A tree sequence containing 3 diploid individuals with 5 sites and 5 mutations | ||
# (one per site). The first 2 individuals are used as reference panel, | ||
# the last one is the target individual. | ||
|
||
_toy_ts_nodes_text = """\ | ||
id is_sample time population individual metadata | ||
0 1 0.000000 0 0 | ||
1 1 0.000000 0 0 | ||
2 1 0.000000 0 1 | ||
3 1 0.000000 0 1 | ||
4 1 0.000000 0 2 | ||
5 1 0.000000 0 2 | ||
6 0 0.029768 0 -1 | ||
7 0 0.133017 0 -1 | ||
8 0 0.223233 0 -1 | ||
9 0 0.651586 0 -1 | ||
10 0 0.698831 0 -1 | ||
11 0 2.114867 0 -1 | ||
12 0 4.322031 0 -1 | ||
13 0 7.432311 0 -1 | ||
""" | ||
|
||
_toy_ts_edges_text = """\ | ||
left right parent child metadata | ||
0.000000 1000000.000000 6 0 | ||
0.000000 1000000.000000 6 3 | ||
0.000000 1000000.000000 7 2 | ||
0.000000 1000000.000000 7 5 | ||
0.000000 1000000.000000 8 1 | ||
0.000000 1000000.000000 8 4 | ||
0.000000 781157.000000 9 6 | ||
0.000000 781157.000000 9 7 | ||
0.000000 505438.000000 10 8 | ||
0.000000 505438.000000 10 9 | ||
505438.000000 549484.000000 11 8 | ||
505438.000000 549484.000000 11 9 | ||
781157.000000 1000000.000000 12 6 | ||
781157.000000 1000000.000000 12 7 | ||
549484.000000 1000000.000000 13 8 | ||
549484.000000 781157.000000 13 9 | ||
781157.000000 1000000.000000 13 12 | ||
""" | ||
|
||
_toy_ts_sites_text = """\ | ||
position ancestral_state metadata | ||
200000.000000 A | ||
300000.000000 C | ||
520000.000000 G | ||
600000.000000 T | ||
900000.000000 A | ||
""" | ||
|
||
_toy_ts_mutations_text = """\ | ||
site node time derived_state parent metadata | ||
0 9 unknown G -1 | ||
1 8 unknown A -1 | ||
2 9 unknown T -1 | ||
3 9 unknown C -1 | ||
4 12 unknown C -1 | ||
""" | ||
|
||
_toy_ts_individuals_text = """\ | ||
flags | ||
0 | ||
0 | ||
0 | ||
""" | ||
|
||
|
||
def get_toy_ts(): | ||
""" | ||
Returns the toy tree sequence in text format above. | ||
""" | ||
ts = tskit.load_text( | ||
nodes=io.StringIO(_toy_ts_nodes_text), | ||
edges=io.StringIO(_toy_ts_edges_text), | ||
sites=io.StringIO(_toy_ts_sites_text), | ||
mutations=io.StringIO(_toy_ts_mutations_text), | ||
individuals=io.StringIO(_toy_ts_individuals_text), | ||
strict=False, | ||
) | ||
return ts | ||
|
||
|
||
# BEAGLE 4.1 was run on the toy data set above using default parameters. | ||
# The following are the forward probability matrices and backward probability | ||
# matrices calculated when imputing into the third individual above. There are | ||
# two sets of matrices, one for each haplotype. | ||
# | ||
# Notes about calculations: | ||
# n = number of haplotypes in ref. panel | ||
# M = number of markers | ||
# m = index of marker (site) | ||
# h = index of haplotype in ref. panel | ||
# | ||
# In forward probability matrix, | ||
# fwd[m][h] = emission prob., if m = 0 (first marker) | ||
# fwd[m][h] = emission prob. * (scale * fwd[m - 1][h] + shift), otherwise | ||
# where scale = (1 - switch prob.)/sum of fwd[m - 1], | ||
# and shift = switch prob./n. | ||
# | ||
# In backward probability matrix, | ||
# bwd[m][h] = 1, if m = M - 1 (last marker) // DON'T SEE THIS IN BEAGLE | ||
# unadj. bwd[m][h] = emission prob. / n | ||
# bwd[m][h] = (unadj. bwd[m][h] + shift) * scale, otherwise | ||
# where scale = (1 - switch prob.)/sum of unadj. bwd[m], | ||
# and shift = switch prob./n. | ||
# | ||
# For each site, the sum of backward value over all haplotypes is calculated | ||
# before scaling and shifting. | ||
|
||
_fwd_matrix_text_1 = """ | ||
m,h,probRec,probNoRec,noErrProb,errProb,refAl,queryAl,shift,scale,sum,val | ||
0,0,0.000000,1.000000,0.999900,0.000100,1,0,0.000000,1.000000,0.000100,0.000100 | ||
0,1,0.000000,1.000000,0.999900,0.000100,0,0,0.000000,1.000000,1.000000,0.999900 | ||
0,2,0.000000,1.000000,0.999900,0.000100,1,0,0.000000,1.000000,1.000100,0.000100 | ||
0,3,0.000000,1.000000,0.999900,0.000100,1,0,0.000000,1.000000,1.000200,0.000100 | ||
1,0,1.000000,0.000000,0.999900,0.000100,0,1,0.250000,0.000000,0.000025,0.000025 | ||
1,1,1.000000,0.000000,0.999900,0.000100,1,1,0.250000,0.000000,0.250000,0.249975 | ||
1,2,1.000000,0.000000,0.999900,0.000100,0,1,0.250000,0.000000,0.250025,0.000025 | ||
1,3,1.000000,0.000000,0.999900,0.000100,0,1,0.250000,0.000000,0.250050,0.000025 | ||
2,0,1.000000,0.000000,0.999900,0.000100,1,0,0.250000,0.000000,0.000025,0.000025 | ||
2,1,1.000000,0.000000,0.999900,0.000100,0,0,0.250000,0.000000,0.250000,0.249975 | ||
2,2,1.000000,0.000000,0.999900,0.000100,1,0,0.250000,0.000000,0.250025,0.000025 | ||
2,3,1.000000,0.000000,0.999900,0.000100,1,0,0.250000,0.000000,0.250050,0.000025 | ||
3,0,1.000000,0.000000,0.999900,0.000100,1,0,0.250000,0.000000,0.000025,0.000025 | ||
3,1,1.000000,0.000000,0.999900,0.000100,0,0,0.250000,0.000000,0.250000,0.249975 | ||
3,2,1.000000,0.000000,0.999900,0.000100,1,0,0.250000,0.000000,0.250025,0.000025 | ||
3,3,1.000000,0.000000,0.999900,0.000100,1,0,0.250000,0.000000,0.250050,0.000025 | ||
""" | ||
|
||
_bwd_matrix_text_1 = """ | ||
m,h,probRec,probNoRec,noErrProb,errProb,refAl,queryAl,shift,scale,sum,val | ||
3,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,1.000000 | ||
3,1,-1,-1,-1,-1,-1,-1,-1,-1,-1,1.000000 | ||
3,2,-1,-1,-1,-1,-1,-1,-1,-1,-1,1.000000 | ||
3,3,-1,-1,-1,-1,-1,-1,-1,-1,-1,1.000000 | ||
2,0,1.000000,0.000000,0.999900,0.000100,1,0,0.000000,0.250000,0.250050,0.250000 | ||
2,1,1.000000,0.000000,0.999900,0.000100,0,0,0.000000,0.250000,0.250050,0.250000 | ||
2,2,1.000000,0.000000,0.999900,0.000100,1,0,0.000000,0.250000,0.250050,0.250000 | ||
2,3,1.000000,0.000000,0.999900,0.000100,1,0,0.000000,0.250000,0.250050,0.250000 | ||
1,0,1.000000,0.000000,0.999900,0.000100,0,0,0.000000,0.250000,0.250050,0.250000 | ||
1,1,1.000000,0.000000,0.999900,0.000100,1,0,0.000000,0.250000,0.250050,0.250000 | ||
1,2,1.000000,0.000000,0.999900,0.000100,0,0,0.000000,0.250000,0.250050,0.250000 | ||
1,3,1.000000,0.000000,0.999900,0.000100,0,0,0.000000,0.250000,0.250050,0.250000 | ||
0,0,1.000000,0.000000,0.999900,0.000100,1,1,0.000000,0.250000,0.250050,0.250000 | ||
0,1,1.000000,0.000000,0.999900,0.000100,0,1,0.000000,0.250000,0.250050,0.250000 | ||
0,2,1.000000,0.000000,0.999900,0.000100,1,1,0.000000,0.250000,0.250050,0.250000 | ||
0,3,1.000000,0.000000,0.999900,0.000100,1,1,0.000000,0.250000,0.250050,0.250000 | ||
""" | ||
|
||
_fwd_matrix_text_2 = """ | ||
m,h,probRec,probNoRec,noErrProb,errProb,refAl,queryAl,shift,scale,sum,val | ||
0,0,0.000000,1.000000,0.999900,0.000100,1,1,0.000000,1.000000,0.999900,0.999900 | ||
0,1,0.000000,1.000000,0.999900,0.000100,0,1,0.000000,1.000000,1.000000,0.000100 | ||
0,2,0.000000,1.000000,0.999900,0.000100,1,1,0.000000,1.000000,1.999900,0.999900 | ||
0,3,0.000000,1.000000,0.999900,0.000100,1,1,0.000000,1.000000,2.999800,0.999900 | ||
1,0,1.000000,0.000000,0.999900,0.000100,0,0,0.250000,0.000000,0.249975,0.249975 | ||
1,1,1.000000,0.000000,0.999900,0.000100,1,0,0.250000,0.000000,0.250000,0.000025 | ||
1,2,1.000000,0.000000,0.999900,0.000100,0,0,0.250000,0.000000,0.499975,0.249975 | ||
1,3,1.000000,0.000000,0.999900,0.000100,0,0,0.250000,0.000000,0.749950,0.249975 | ||
2,0,1.000000,0.000000,0.999900,0.000100,1,1,0.250000,0.000000,0.249975,0.249975 | ||
2,1,1.000000,0.000000,0.999900,0.000100,0,1,0.250000,0.000000,0.250000,0.000025 | ||
2,2,1.000000,0.000000,0.999900,0.000100,1,1,0.250000,0.000000,0.499975,0.249975 | ||
2,3,1.000000,0.000000,0.999900,0.000100,1,1,0.250000,0.000000,0.749950,0.249975 | ||
3,0,1.000000,0.000000,0.999900,0.000100,1,1,0.250000,0.000000,0.249975,0.249975 | ||
3,1,1.000000,0.000000,0.999900,0.000100,0,1,0.250000,0.000000,0.250000,0.000025 | ||
3,2,1.000000,0.000000,0.999900,0.000100,1,1,0.250000,0.000000,0.499975,0.249975 | ||
3,3,1.000000,0.000000,0.999900,0.000100,1,1,0.250000,0.000000,0.749950,0.249975 | ||
""" | ||
|
||
_bwd_matrix_text_2 = """ | ||
m,h,probRec,probNoRec,noErrProb,errProb,refAl,queryAl,shift,scale,sum,val | ||
3,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,1.000000 | ||
3,1,-1,-1,-1,-1,-1,-1,-1,-1,-1,1.000000 | ||
3,2,-1,-1,-1,-1,-1,-1,-1,-1,-1,1.000000 | ||
3,3,-1,-1,-1,-1,-1,-1,-1,-1,-1,1.000000 | ||
2,0,1.000000,0.000000,0.999900,0.000100,1,1,0.000000,0.250000,0.749950,0.250000 | ||
2,1,1.000000,0.000000,0.999900,0.000100,0,1,0.000000,0.250000,0.749950,0.250000 | ||
2,2,1.000000,0.000000,0.999900,0.000100,1,1,0.000000,0.250000,0.749950,0.250000 | ||
2,3,1.000000,0.000000,0.999900,0.000100,1,1,0.000000,0.250000,0.749950,0.250000 | ||
1,0,1.000000,0.000000,0.999900,0.000100,0,1,0.000000,0.250000,0.749950,0.250000 | ||
1,1,1.000000,0.000000,0.999900,0.000100,1,1,0.000000,0.250000,0.749950,0.250000 | ||
1,2,1.000000,0.000000,0.999900,0.000100,0,1,0.000000,0.250000,0.749950,0.250000 | ||
1,3,1.000000,0.000000,0.999900,0.000100,0,1,0.000000,0.250000,0.749950,0.250000 | ||
0,0,1.000000,0.000000,0.999900,0.000100,1,0,0.000000,0.250000,0.749950,0.250000 | ||
0,1,1.000000,0.000000,0.999900,0.000100,0,0,0.000000,0.250000,0.749950,0.250000 | ||
0,2,1.000000,0.000000,0.999900,0.000100,1,0,0.000000,0.250000,0.749950,0.250000 | ||
0,3,1.000000,0.000000,0.999900,0.000100,1,0,0.000000,0.250000,0.749950,0.250000 | ||
""" | ||
|
||
|
||
def convert_to_numpy(matrix_text): | ||
""" | ||
Converts a matrix in text to numpy and returns it. | ||
""" | ||
x = np.loadtxt(io.StringIO(matrix_text), skiprows=1, delimiter=",") | ||
for i in np.arange(x.shape[0]): | ||
# Check that switch and non-switch probabilities sum to 1 | ||
assert (x[i, 2] + x[i, 3]) == 1 or x[i, 2] == -1 | ||
# Check that non-mismatch and mismatch probabilities sum to 1 | ||
assert (x[i, 4] + x[i, 5]) == 1 or x[i, 4] == -1 | ||
return x[:, -1].reshape((4, 4)) # size (m, h) | ||
|
||
|
||
def get_forward_backward_matrices(): | ||
fwd_matrix_1 = convert_to_numpy(_fwd_matrix_text_1) | ||
bwd_matrix_1 = convert_to_numpy(_bwd_matrix_text_1) | ||
fwd_matrix_2 = convert_to_numpy(_fwd_matrix_text_2) | ||
bwd_matrix_2 = convert_to_numpy(_bwd_matrix_text_2) | ||
return [fwd_matrix_1, bwd_matrix_1, fwd_matrix_2, bwd_matrix_2] | ||
|
||
|
||
def get_test_data(matrix_text, par): | ||
"""Extracts data for checking forward or backward probability matrix calculations.""" | ||
x = convert_to_numpy(matrix_text) | ||
if par == "switch": | ||
# Switch probability, one per site | ||
return x[:, 2].reshape((4, 4))[:, 0] | ||
elif par == "mismatch": | ||
# Mismatch probability, one per site | ||
return x[:, 4].reshape((4, 4))[:, 0] | ||
elif par == "ref_hap_allele": | ||
# Allele in haplotype in reference panel | ||
# 0 = ref allele, 1 = alt allele | ||
return x[:, 6].reshape((4, 4)) | ||
elif par == "query_hap_allele": | ||
# Allele in haplotype in query | ||
# 0 = ref allele, 1 = alt allele | ||
return x[:, 7].reshape((4, 4))[:, 0] | ||
elif par == "shift": | ||
# Shift factor, one per site | ||
return x[:, 8].reshape((4, 4))[:, 0] | ||
elif par == "scale": | ||
# Scale factor, one per site | ||
return x[:, 9].reshape((4, 4))[:, 0] | ||
elif par == "sum": | ||
# Sum of values over haplotypes | ||
return x[:, 10].reshape((4, 4))[:, 0] | ||
else: | ||
raise ValueError(f"Unknown parameter: {par}") |