Replace toy dataset

tskit-dev · Aug 8, 2023 · 1f99589 · 1f99589
1 parent b47db86
commit 1f99589
Showing 1 changed file with 128 additions and 94 deletions.
diff --git a/python/tests/test_imputation.py b/python/tests/test_imputation.py
@@ -133,84 +133,116 @@ def get_toy_data():
 # For each site, the sum of backward value over all haplotypes is calculated
 # before scaling and shifting.
 
-beagle_forward_matrix_text_1 = """
-m,h,probRec,probNoRec,noErrProb,errProb,refAl,queryAl,shiftFac,scaleFac,sumSite,val
-0,0,0.000000,1.000000,0.999900,0.000100,1,0,0.000000,1.000000,0.000100,0.000100
-0,1,0.000000,1.000000,0.999900,0.000100,0,0,0.000000,1.000000,1.000000,0.999900
-0,2,0.000000,1.000000,0.999900,0.000100,1,0,0.000000,1.000000,1.000100,0.000100
-0,3,0.000000,1.000000,0.999900,0.000100,1,0,0.000000,1.000000,1.000200,0.000100
-1,0,1.000000,0.000000,0.999900,0.000100,0,1,0.250000,0.000000,0.000025,0.000025
-1,1,1.000000,0.000000,0.999900,0.000100,1,1,0.250000,0.000000,0.250000,0.249975
-1,2,1.000000,0.000000,0.999900,0.000100,0,1,0.250000,0.000000,0.250025,0.000025
-1,3,1.000000,0.000000,0.999900,0.000100,0,1,0.250000,0.000000,0.250050,0.000025
-2,0,1.000000,0.000000,0.999900,0.000100,1,0,0.250000,0.000000,0.000025,0.000025
-2,1,1.000000,0.000000,0.999900,0.000100,0,0,0.250000,0.000000,0.250000,0.249975
-2,2,1.000000,0.000000,0.999900,0.000100,1,0,0.250000,0.000000,0.250025,0.000025
-2,3,1.000000,0.000000,0.999900,0.000100,1,0,0.250000,0.000000,0.250050,0.000025
-3,0,1.000000,0.000000,0.999900,0.000100,1,0,0.250000,0.000000,0.000025,0.000025
-3,1,1.000000,0.000000,0.999900,0.000100,0,0,0.250000,0.000000,0.250000,0.249975
-3,2,1.000000,0.000000,0.999900,0.000100,1,0,0.250000,0.000000,0.250025,0.000025
-3,3,1.000000,0.000000,0.999900,0.000100,1,0,0.250000,0.000000,0.250050,0.000025
+beagle_fwd_matrix_text_1 = """
+m,h,probRec,probNoRec,noErrProb,errProb,refAl,queryAl,shiftFac,scaleFac,sumSite,val,
+0,0,0.000000,1.000000,0.999900,0.000100,1,1,0.000000,1.000000,0.999900,0.999900,
+0,1,0.000000,1.000000,0.999900,0.000100,0,1,0.000000,1.000000,1.000000,0.000100,
+0,2,0.000000,1.000000,0.999900,0.000100,1,1,0.000000,1.000000,1.999900,0.999900,
+0,3,0.000000,1.000000,0.999900,0.000100,1,1,0.000000,1.000000,2.999800,0.999900,
+0,4,0.000000,1.000000,0.999900,0.000100,0,1,0.000000,1.000000,2.999900,0.000100,
+0,5,0.000000,1.000000,0.999900,0.000100,1,1,0.000000,1.000000,3.999800,0.999900,
+1,0,1.000000,0.000000,0.999900,0.000100,0,0,0.166667,0.000000,0.166650,0.166650,
+1,1,1.000000,0.000000,0.999900,0.000100,1,0,0.166667,0.000000,0.166667,0.000017,
+1,2,1.000000,0.000000,0.999900,0.000100,0,0,0.166667,0.000000,0.333317,0.166650,
+1,3,1.000000,0.000000,0.999900,0.000100,0,0,0.166667,0.000000,0.499967,0.166650,
+1,4,1.000000,0.000000,0.999900,0.000100,1,0,0.166667,0.000000,0.499983,0.000017,
+1,5,1.000000,0.000000,0.999900,0.000100,0,0,0.166667,0.000000,0.666633,0.166650,
+2,0,1.000000,0.000000,0.999900,0.000100,1,0,0.166667,0.000000,0.000017,0.000017,
+2,1,1.000000,0.000000,0.999900,0.000100,0,0,0.166667,0.000000,0.166667,0.166650,
+2,2,1.000000,0.000000,0.999900,0.000100,1,0,0.166667,0.000000,0.166683,0.000017,
+2,3,1.000000,0.000000,0.999900,0.000100,1,0,0.166667,0.000000,0.166700,0.000017,
+2,4,1.000000,0.000000,0.999900,0.000100,0,0,0.166667,0.000000,0.333350,0.166650,
+2,5,1.000000,0.000000,0.999900,0.000100,1,0,0.166667,0.000000,0.333367,0.000017,
+3,0,1.000000,0.000000,0.999900,0.000100,1,0,0.166667,0.000000,0.000017,0.000017,
+3,1,1.000000,0.000000,0.999900,0.000100,0,0,0.166667,0.000000,0.166667,0.166650,
+3,2,1.000000,0.000000,0.999900,0.000100,1,0,0.166667,0.000000,0.166683,0.000017,
+3,3,1.000000,0.000000,0.999900,0.000100,1,0,0.166667,0.000000,0.166700,0.000017,
+3,4,1.000000,0.000000,0.999900,0.000100,0,0,0.166667,0.000000,0.333350,0.166650,
+3,5,1.000000,0.000000,0.999900,0.000100,1,0,0.166667,0.000000,0.333367,0.000017,
 """
 
-beagle_backward_matrix_text_1 = """
-m,h,probRec,probNoRec,noErrProb,errProb,refAl,queryAl,shiftFac,scaleFac,sumSite,val
-3,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,1.000000
-3,1,-1,-1,-1,-1,-1,-1,-1,-1,-1,1.000000
-3,2,-1,-1,-1,-1,-1,-1,-1,-1,-1,1.000000
-3,3,-1,-1,-1,-1,-1,-1,-1,-1,-1,1.000000
-2,0,1.000000,0.000000,0.999900,0.000100,1,0,0.000000,0.250000,0.250050,0.250000
-2,1,1.000000,0.000000,0.999900,0.000100,0,0,0.000000,0.250000,0.250050,0.250000
-2,2,1.000000,0.000000,0.999900,0.000100,1,0,0.000000,0.250000,0.250050,0.250000
-2,3,1.000000,0.000000,0.999900,0.000100,1,0,0.000000,0.250000,0.250050,0.250000
-1,0,1.000000,0.000000,0.999900,0.000100,0,0,0.000000,0.250000,0.250050,0.250000
-1,1,1.000000,0.000000,0.999900,0.000100,1,0,0.000000,0.250000,0.250050,0.250000
-1,2,1.000000,0.000000,0.999900,0.000100,0,0,0.000000,0.250000,0.250050,0.250000
-1,3,1.000000,0.000000,0.999900,0.000100,0,0,0.000000,0.250000,0.250050,0.250000
-0,0,1.000000,0.000000,0.999900,0.000100,1,1,0.000000,0.250000,0.250050,0.250000
-0,1,1.000000,0.000000,0.999900,0.000100,0,1,0.000000,0.250000,0.250050,0.250000
-0,2,1.000000,0.000000,0.999900,0.000100,1,1,0.000000,0.250000,0.250050,0.250000
-0,3,1.000000,0.000000,0.999900,0.000100,1,1,0.000000,0.250000,0.250050,0.250000
+beagle_bwd_matrix_text_1 = """
+m,h,probRec,probNoRec,noErrProb,errProb,refAl,queryAl,shiftFac,scaleFac,sumSite,val,
+3,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,1.000000,
+3,1,-1,-1,-1,-1,-1,-1,-1,-1,-1,1.000000,
+3,2,-1,-1,-1,-1,-1,-1,-1,-1,-1,1.000000,
+3,3,-1,-1,-1,-1,-1,-1,-1,-1,-1,1.000000,
+3,4,-1,-1,-1,-1,-1,-1,-1,-1,-1,1.000000,
+3,5,-1,-1,-1,-1,-1,-1,-1,-1,-1,1.000000,
+2,0,1.000000,0.000000,0.999900,0.000100,1,0,0.000000,0.166667,0.333367,0.166667,
+2,1,1.000000,0.000000,0.999900,0.000100,0,0,0.000000,0.166667,0.333367,0.166667,
+2,2,1.000000,0.000000,0.999900,0.000100,1,0,0.000000,0.166667,0.333367,0.166667,
+2,3,1.000000,0.000000,0.999900,0.000100,1,0,0.000000,0.166667,0.333367,0.166667,
+2,4,1.000000,0.000000,0.999900,0.000100,0,0,0.000000,0.166667,0.333367,0.166667,
+2,5,1.000000,0.000000,0.999900,0.000100,1,0,0.000000,0.166667,0.333367,0.166667,
+1,0,1.000000,0.000000,0.999900,0.000100,0,0,0.000000,0.166667,0.333367,0.166667,
+1,1,1.000000,0.000000,0.999900,0.000100,1,0,0.000000,0.166667,0.333367,0.166667,
+1,2,1.000000,0.000000,0.999900,0.000100,0,0,0.000000,0.166667,0.333367,0.166667,
+1,3,1.000000,0.000000,0.999900,0.000100,0,0,0.000000,0.166667,0.333367,0.166667,
+1,4,1.000000,0.000000,0.999900,0.000100,1,0,0.000000,0.166667,0.333367,0.166667,
+1,5,1.000000,0.000000,0.999900,0.000100,0,0,0.000000,0.166667,0.333367,0.166667,
+0,0,1.000000,0.000000,0.999900,0.000100,1,0,0.000000,0.166667,0.666633,0.166667,
+0,1,1.000000,0.000000,0.999900,0.000100,0,0,0.000000,0.166667,0.666633,0.166667,
+0,2,1.000000,0.000000,0.999900,0.000100,1,0,0.000000,0.166667,0.666633,0.166667,
+0,3,1.000000,0.000000,0.999900,0.000100,1,0,0.000000,0.166667,0.666633,0.166667,
+0,4,1.000000,0.000000,0.999900,0.000100,0,0,0.000000,0.166667,0.666633,0.166667,
+0,5,1.000000,0.000000,0.999900,0.000100,1,0,0.000000,0.166667,0.666633,0.166667,
 """
 
-beagle_forward_matrix_text_2 = """
-m,h,probRec,probNoRec,noErrProb,errProb,refAl,queryAl,shiftFac,scaleFac,sumSite,val
-0,0,0.000000,1.000000,0.999900,0.000100,1,1,0.000000,1.000000,0.999900,0.999900
-0,1,0.000000,1.000000,0.999900,0.000100,0,1,0.000000,1.000000,1.000000,0.000100
-0,2,0.000000,1.000000,0.999900,0.000100,1,1,0.000000,1.000000,1.999900,0.999900
-0,3,0.000000,1.000000,0.999900,0.000100,1,1,0.000000,1.000000,2.999800,0.999900
-1,0,1.000000,0.000000,0.999900,0.000100,0,0,0.250000,0.000000,0.249975,0.249975
-1,1,1.000000,0.000000,0.999900,0.000100,1,0,0.250000,0.000000,0.250000,0.000025
-1,2,1.000000,0.000000,0.999900,0.000100,0,0,0.250000,0.000000,0.499975,0.249975
-1,3,1.000000,0.000000,0.999900,0.000100,0,0,0.250000,0.000000,0.749950,0.249975
-2,0,1.000000,0.000000,0.999900,0.000100,1,1,0.250000,0.000000,0.249975,0.249975
-2,1,1.000000,0.000000,0.999900,0.000100,0,1,0.250000,0.000000,0.250000,0.000025
-2,2,1.000000,0.000000,0.999900,0.000100,1,1,0.250000,0.000000,0.499975,0.249975
-2,3,1.000000,0.000000,0.999900,0.000100,1,1,0.250000,0.000000,0.749950,0.249975
-3,0,1.000000,0.000000,0.999900,0.000100,1,1,0.250000,0.000000,0.249975,0.249975
-3,1,1.000000,0.000000,0.999900,0.000100,0,1,0.250000,0.000000,0.250000,0.000025
-3,2,1.000000,0.000000,0.999900,0.000100,1,1,0.250000,0.000000,0.499975,0.249975
-3,3,1.000000,0.000000,0.999900,0.000100,1,1,0.250000,0.000000,0.749950,0.249975
+beagle_fwd_matrix_text_2 = """
+m,h,probRec,probNoRec,noErrProb,errProb,refAl,queryAl,shiftFac,scaleFac,sumSite,val,
+0,0,0.000000,1.000000,0.999900,0.000100,1,0,0.000000,1.000000,0.000100,0.000100,
+0,1,0.000000,1.000000,0.999900,0.000100,0,0,0.000000,1.000000,1.000000,0.999900,
+0,2,0.000000,1.000000,0.999900,0.000100,1,0,0.000000,1.000000,1.000100,0.000100,
+0,3,0.000000,1.000000,0.999900,0.000100,1,0,0.000000,1.000000,1.000200,0.000100,
+0,4,0.000000,1.000000,0.999900,0.000100,0,0,0.000000,1.000000,2.000100,0.999900,
+0,5,0.000000,1.000000,0.999900,0.000100,1,0,0.000000,1.000000,2.000200,0.000100,
+1,0,1.000000,0.000000,0.999900,0.000100,0,1,0.166667,0.000000,0.000017,0.000017,
+1,1,1.000000,0.000000,0.999900,0.000100,1,1,0.166667,0.000000,0.166667,0.166650,
+1,2,1.000000,0.000000,0.999900,0.000100,0,1,0.166667,0.000000,0.166683,0.000017,
+1,3,1.000000,0.000000,0.999900,0.000100,0,1,0.166667,0.000000,0.166700,0.000017,
+1,4,1.000000,0.000000,0.999900,0.000100,1,1,0.166667,0.000000,0.333350,0.166650,
+1,5,1.000000,0.000000,0.999900,0.000100,0,1,0.166667,0.000000,0.333367,0.000017,
+2,0,1.000000,0.000000,0.999900,0.000100,1,1,0.166667,0.000000,0.166650,0.166650,
+2,1,1.000000,0.000000,0.999900,0.000100,0,1,0.166667,0.000000,0.166667,0.000017,
+2,2,1.000000,0.000000,0.999900,0.000100,1,1,0.166667,0.000000,0.333317,0.166650,
+2,3,1.000000,0.000000,0.999900,0.000100,1,1,0.166667,0.000000,0.499967,0.166650,
+2,4,1.000000,0.000000,0.999900,0.000100,0,1,0.166667,0.000000,0.499983,0.000017,
+2,5,1.000000,0.000000,0.999900,0.000100,1,1,0.166667,0.000000,0.666633,0.166650,
+3,0,1.000000,0.000000,0.999900,0.000100,1,0,0.166667,0.000000,0.000017,0.000017,
+3,1,1.000000,0.000000,0.999900,0.000100,0,0,0.166667,0.000000,0.166667,0.166650,
+3,2,1.000000,0.000000,0.999900,0.000100,1,0,0.166667,0.000000,0.166683,0.000017,
+3,3,1.000000,0.000000,0.999900,0.000100,1,0,0.166667,0.000000,0.166700,0.000017,
+3,4,1.000000,0.000000,0.999900,0.000100,0,0,0.166667,0.000000,0.333350,0.166650,
+3,5,1.000000,0.000000,0.999900,0.000100,1,0,0.166667,0.000000,0.333367,0.000017,
 """
 
-beagle_backward_matrix_text_2 = """
-m,h,probRec,probNoRec,noErrProb,errProb,refAl,queryAl,shiftFac,scaleFac,sumSite,val
-3,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,1.000000
-3,1,-1,-1,-1,-1,-1,-1,-1,-1,-1,1.000000
-3,2,-1,-1,-1,-1,-1,-1,-1,-1,-1,1.000000
-3,3,-1,-1,-1,-1,-1,-1,-1,-1,-1,1.000000
-2,0,1.000000,0.000000,0.999900,0.000100,1,1,0.000000,0.250000,0.749950,0.250000
-2,1,1.000000,0.000000,0.999900,0.000100,0,1,0.000000,0.250000,0.749950,0.250000
-2,2,1.000000,0.000000,0.999900,0.000100,1,1,0.000000,0.250000,0.749950,0.250000
-2,3,1.000000,0.000000,0.999900,0.000100,1,1,0.000000,0.250000,0.749950,0.250000
-1,0,1.000000,0.000000,0.999900,0.000100,0,1,0.000000,0.250000,0.749950,0.250000
-1,1,1.000000,0.000000,0.999900,0.000100,1,1,0.000000,0.250000,0.749950,0.250000
-1,2,1.000000,0.000000,0.999900,0.000100,0,1,0.000000,0.250000,0.749950,0.250000
-1,3,1.000000,0.000000,0.999900,0.000100,0,1,0.000000,0.250000,0.749950,0.250000
-0,0,1.000000,0.000000,0.999900,0.000100,1,0,0.000000,0.250000,0.749950,0.250000
-0,1,1.000000,0.000000,0.999900,0.000100,0,0,0.000000,0.250000,0.749950,0.250000
-0,2,1.000000,0.000000,0.999900,0.000100,1,0,0.000000,0.250000,0.749950,0.250000
-0,3,1.000000,0.000000,0.999900,0.000100,1,0,0.000000,0.250000,0.749950,0.250000
+beagle_bwd_matrix_text_2 = """
+m,h,probRec,probNoRec,noErrProb,errProb,refAl,queryAl,shiftFac,scaleFac,sumSite,val,
+3,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,1.000000,
+3,1,-1,-1,-1,-1,-1,-1,-1,-1,-1,1.000000,
+3,2,-1,-1,-1,-1,-1,-1,-1,-1,-1,1.000000,
+3,3,-1,-1,-1,-1,-1,-1,-1,-1,-1,1.000000,
+3,4,-1,-1,-1,-1,-1,-1,-1,-1,-1,1.000000,
+3,5,-1,-1,-1,-1,-1,-1,-1,-1,-1,1.000000,
+2,0,1.000000,0.000000,0.999900,0.000100,1,0,0.000000,0.166667,0.333367,0.166667,
+2,1,1.000000,0.000000,0.999900,0.000100,0,0,0.000000,0.166667,0.333367,0.166667,
+2,2,1.000000,0.000000,0.999900,0.000100,1,0,0.000000,0.166667,0.333367,0.166667,
+2,3,1.000000,0.000000,0.999900,0.000100,1,0,0.000000,0.166667,0.333367,0.166667,
+2,4,1.000000,0.000000,0.999900,0.000100,0,0,0.000000,0.166667,0.333367,0.166667,
+2,5,1.000000,0.000000,0.999900,0.000100,1,0,0.000000,0.166667,0.333367,0.166667,
+1,0,1.000000,0.000000,0.999900,0.000100,0,1,0.000000,0.166667,0.666633,0.166667,
+1,1,1.000000,0.000000,0.999900,0.000100,1,1,0.000000,0.166667,0.666633,0.166667,
+1,2,1.000000,0.000000,0.999900,0.000100,0,1,0.000000,0.166667,0.666633,0.166667,
+1,3,1.000000,0.000000,0.999900,0.000100,0,1,0.000000,0.166667,0.666633,0.166667,
+1,4,1.000000,0.000000,0.999900,0.000100,1,1,0.000000,0.166667,0.666633,0.166667,
+1,5,1.000000,0.000000,0.999900,0.000100,0,1,0.000000,0.166667,0.666633,0.166667,
+0,0,1.000000,0.000000,0.999900,0.000100,1,1,0.000000,0.166667,0.333367,0.166667,
+0,1,1.000000,0.000000,0.999900,0.000100,0,1,0.000000,0.166667,0.333367,0.166667,
+0,2,1.000000,0.000000,0.999900,0.000100,1,1,0.000000,0.166667,0.333367,0.166667,
+0,3,1.000000,0.000000,0.999900,0.000100,1,1,0.000000,0.166667,0.333367,0.166667,
+0,4,1.000000,0.000000,0.999900,0.000100,0,1,0.000000,0.166667,0.333367,0.166667,
+0,5,1.000000,0.000000,0.999900,0.000100,1,1,0.000000,0.166667,0.333367,0.166667,
 """
 
 
@@ -221,45 +253,47 @@ def convert_to_numpy(matrix_text):
     assert np.all(np.isin(df.probRec + df.probNoRec, [1, -2]))
     # Check that non-mismatch and mismatch probabilities sum to 1
     assert np.all(np.isin(df.noErrProb + df.errProb, [1, -2]))
-    return df.val.to_numpy().reshape((4, 4))
+    return df.val.to_numpy().reshape((4, 6))
 
 
-def get_beagle_forward_backward_matrices():
-    fwd_matrix_1 = convert_to_numpy(beagle_forward_matrix_text_1)
-    bwd_matrix_1 = convert_to_numpy(beagle_backward_matrix_text_1)
-    fwd_matrix_2 = convert_to_numpy(beagle_forward_matrix_text_2)
-    bwd_matrix_2 = convert_to_numpy(beagle_backward_matrix_text_2)
+def get_forward_backward_matrices():
+    fwd_matrix_1 = convert_to_numpy(beagle_fwd_matrix_text_1)
+    bwd_matrix_1 = convert_to_numpy(beagle_bwd_matrix_text_1)
+    fwd_matrix_2 = convert_to_numpy(beagle_fwd_matrix_text_2)
+    bwd_matrix_2 = convert_to_numpy(beagle_bwd_matrix_text_2)
     return [fwd_matrix_1, bwd_matrix_1, fwd_matrix_2, bwd_matrix_2]
 
 
-def get_beagle_data(matrix_text, data_type):
+def get_test_data(matrix_text, field):
     """Extracts data to check forward or backward probability matrix calculations."""
     df = pd.read_csv(io.StringIO(matrix_text))
-    if data_type == "switch":
+    m = 4  # Number of markers
+    n = 6  # Number of haplotypes
+    if field == "switch":
         # Switch probability, one per site
-        return df.probRec.to_numpy().reshape((4, 4))[:, 0]
-    elif data_type == "mismatch":
+        return df.probRec.to_numpy().reshape((m, n))[:, 0]
+    elif field == "mismatch":
         # Mismatch probability, one per site
-        return df.errProb.to_numpy().reshape((4, 4))[:, 0]
-    elif data_type == "ref_hap_allele":
+        return df.errProb.to_numpy().reshape((m, n))[:, 0]
+    elif field == "ref_hap_allele":
         # Allele in haplotype in reference panel
         # 0 = ref allele, 1 = alt allele
-        return df.refAl.to_numpy().reshape((4, 4))
-    elif data_type == "query_hap_allele":
+        return df.refAl.to_numpy().reshape((m, n))
+    elif field == "query_hap_allele":
         # Allele in haplotype in query
         # 0 = ref allele, 1 = alt allele
-        return df.queryAl.to_numpy().reshape((4, 4))[:, 0]
-    elif data_type == "shift":
+        return df.queryAl.to_numpy().reshape((m, n))[:, 0]
+    elif field == "shift":
         # Shift factor, one per site
-        return df.shiftFac.to_numpy().reshape((4, 4))[:, 0]
-    elif data_type == "scale":
+        return df.shiftFac.to_numpy().reshape((m, n))[:, 0]
+    elif field == "scale":
         # Scale factor, one per site
-        return df.scaleFac.to_numpy().reshape((4, 4))[:, 0]
-    elif data_type == "sum":
+        return df.scaleFac.to_numpy().reshape((m, n))[:, 0]
+    elif field == "sum":
         # Sum of values over haplotypes
-        return df.sumSite.to_numpy().reshape((4, 4))[:, 0]
+        return df.sumSite.to_numpy().reshape((m, n))[:, 0]
     else:
-        raise ValueError(f"Unknown data type: {data_type}")
+        raise ValueError(f"Unknown field: {field}")
 
 
 def get_tskit_forward_backward_matrices(ts, h):