From 5fdf850d767f007998eececfdd05088cda65afca Mon Sep 17 00:00:00 2001 From: lloydkirk Date: Sun, 16 Jul 2023 23:20:00 -0500 Subject: [PATCH] Implement a two-locus C API for count statistics. We also fully implement two-site statistics, mirroring the functionality of the LdCalculator. There are a number of tasks that need to be done before this is production ready, but it is a start. We've opened a number of issues in github for improving the work that was done for this PR and for implementing the python API. The general approach for the two-site statistics is to gather all of the samples below each mutation over the whole tree sequence (we will be implementing windows in the near future). To do this, we iterate over each tree in the tree sequence once and perform an in-order traversal starting at each mutation's node, storing the samples we encounter along the way. We then perform another step that examines the parentage of each mutation and removes samples that exist in child mutations. With the samples for each allele, we perform intersections to get haplotype weights, which are then passed to summary functions that fill out a stats matrix. We also implement a few happy-path tests that test the functionality of sample sets and multi-allelic sites. We plan to do more in-depth testing once the python API is implemented. This PR also introduces the concept of a "bit array" in the form of tsk_bit_array_t, which needs some further refinement. In essence, this data structure stores all of the possible samples in a bit set where one sample takes up one bit of information and storage space. This enables a few functions that are critical to this code: 1) it's a relatively memory efficient method of storing sample identities 2) we can efficiently intersect the arrays (I believe much of this is autovectorized by gcc) 3) we can use tricks like SWAR (SIMD within an array) to count the number of samples in a set. It's important to note that this is also where the algorithm spends ~50% of it's time. It can use more improvement in both the API design and the performance characteristics. --- c/tests/test_core.c | 79 +++++ c/tests/test_stats.c | 326 +++++++++++++++++ c/tskit/core.c | 98 ++++++ c/tskit/core.h | 24 ++ c/tskit/trees.c | 809 ++++++++++++++++++++++++++++++++++++++++++- c/tskit/trees.h | 45 +++ 6 files changed, 1380 insertions(+), 1 deletion(-) diff --git a/c/tests/test_core.c b/c/tests/test_core.c index 97f1d4c210..801e5aaae2 100644 --- a/c/tests/test_core.c +++ b/c/tests/test_core.c @@ -526,6 +526,84 @@ test_avl_random(void) validate_avl(sizeof(keys) / sizeof(*keys), keys); } +static void +test_bit_arrays(void) +{ + // NB: This test is only valid for the 32 bit implementation of bit arrays. If we + // were to change the chunk size of a bit array, we'd need to update these tests + tsk_bit_array_t arr; + tsk_bit_array_init(&arr, 90, 1); + for (tsk_bit_array_value_t i = 0; i < 20; i++) { + tsk_bit_array_add_bit(&arr, i); + } + tsk_bit_array_add_bit(&arr, 63); + tsk_bit_array_add_bit(&arr, 65); + + // these assertions are only valid for 32-bit values + CU_ASSERT_EQUAL_FATAL(arr.data[0], 1048575); + CU_ASSERT_EQUAL_FATAL(arr.data[1], 2147483648); + CU_ASSERT_EQUAL_FATAL(arr.data[2], 2); + + // verify our assumptions about bit array counting + CU_ASSERT_EQUAL_FATAL(tsk_bit_array_count(&arr), 22); + + tsk_bit_array_free(&arr); + + // create a length-2 array with 64 bit capacity + tsk_bit_array_init(&arr, 64, 2); + tsk_bit_array_t arr_row1, arr_row2; + + // select the first and second row + tsk_bit_array_get_row(&arr, 0, &arr_row1); + tsk_bit_array_get_row(&arr, 1, &arr_row2); + + // fill the first 50 bits of the first row + for (tsk_bit_array_value_t i = 0; i < 50; i++) { + tsk_bit_array_add_bit(&arr_row1, i); + } + // fill bits 20-40 of the second row + for (tsk_bit_array_value_t i = 20; i < 40; i++) { + tsk_bit_array_add_bit(&arr_row2, i); + } + + // verify our assumptions about row selection + CU_ASSERT_EQUAL_FATAL(arr.data[0], 4294967295); + CU_ASSERT_EQUAL_FATAL(arr.data[1], 262143); + CU_ASSERT_EQUAL_FATAL(arr_row1.data[0], 4294967295); + CU_ASSERT_EQUAL_FATAL(arr_row1.data[1], 262143); + + CU_ASSERT_EQUAL_FATAL(arr.data[2], 4293918720); + CU_ASSERT_EQUAL_FATAL(arr.data[3], 255); + CU_ASSERT_EQUAL_FATAL(arr_row2.data[0], 4293918720); + CU_ASSERT_EQUAL_FATAL(arr_row2.data[1], 255); + + // subtract the second from the first row, store in first + tsk_bit_array_subtract(&arr_row1, &arr_row2); + + // verify our assumptions about subtraction + CU_ASSERT_EQUAL_FATAL(arr_row1.data[0], 1048575); + CU_ASSERT_EQUAL_FATAL(arr_row1.data[1], 261888); + + tsk_bit_array_t int_result; + tsk_bit_array_init(&int_result, 64, 1); + + // their intersection should be zero + tsk_bit_array_intersect(&arr_row1, &arr_row2, &int_result); + CU_ASSERT_EQUAL_FATAL(int_result.data[0], 0); + CU_ASSERT_EQUAL_FATAL(int_result.data[1], 0); + + // now, add them back together, storing back in a + tsk_bit_array_add(&arr_row1, &arr_row2); + + // now, their intersection should be the subtracted chunk (20-40) + tsk_bit_array_intersect(&arr_row1, &arr_row2, &int_result); + CU_ASSERT_EQUAL_FATAL(int_result.data[0], 4293918720); + CU_ASSERT_EQUAL_FATAL(int_result.data[1], 255); + + tsk_bit_array_free(&int_result); + tsk_bit_array_free(&arr); +} + static void test_meson_version(void) { @@ -554,6 +632,7 @@ main(int argc, char **argv) { "test_avl_sequential", test_avl_sequential }, { "test_avl_interleaved", test_avl_interleaved }, { "test_avl_random", test_avl_random }, + { "test_bit_arrays", test_bit_arrays }, { "test_meson_version", test_meson_version }, { NULL, NULL }, }; diff --git a/c/tests/test_stats.c b/c/tests/test_stats.c index 2594f3c5a6..22c1b7dcf9 100644 --- a/c/tests/test_stats.c +++ b/c/tests/test_stats.c @@ -1985,6 +1985,326 @@ test_ld_silent_mutations(void) free(base_ts); } +static void +test_paper_ex_two_site(void) +{ + tsk_treeseq_t ts; + double *result; + tsk_size_t s, result_size; + int ret; + + double truth_one_set[6] = { 1, 0.1111111111111111, 0.1111111111111111, 1, 1, 1 }; + double truth_two_sets[12] = { 1, 1, 0.1111111111111111, 0.1111111111111111, + 0.1111111111111111, 0.1111111111111111, 1, 1, 1, 1, 1, 1 }; + double truth_three_sets[18] = { 1, 1, 0, 0.1111111111111111, 0.1111111111111111, 0, + 0.1111111111111111, 0.1111111111111111, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; + + tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, + paper_ex_mutations, paper_ex_individuals, NULL, 0); + + tsk_size_t sample_set_sizes[3]; + tsk_size_t num_sample_sets; + tsk_id_t sample_sets[ts.num_samples * 3]; + + // First sample set contains all of the samples + sample_set_sizes[0] = ts.num_samples; + num_sample_sets = 1; + for (s = 0; s < ts.num_samples; s++) { + sample_sets[s] = (tsk_id_t) s; + } + + ret = tsk_treeseq_r2(&ts, num_sample_sets, sample_set_sizes, sample_sets, 0, NULL, 0, + NULL, 0, &result_size, &result); + + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_EQUAL(result_size, 6); + assert_arrays_almost_equal(result_size * num_sample_sets, result, truth_one_set); + tsk_safe_free(result); + + // Second sample set contains all of the samples + sample_set_sizes[1] = ts.num_samples; + num_sample_sets = 2; + for (s = ts.num_samples; s < ts.num_samples * 2; s++) { + sample_sets[s] = (tsk_id_t) s - (tsk_id_t) ts.num_samples; + } + + ret = tsk_treeseq_r2(&ts, num_sample_sets, sample_set_sizes, sample_sets, 0, NULL, 0, + NULL, 0, &result_size, &result); + + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_EQUAL(result_size, 6); + assert_arrays_almost_equal(result_size * num_sample_sets, result, truth_two_sets); + tsk_safe_free(result); + + // Third sample set contains the first two samples + sample_set_sizes[2] = 2; + num_sample_sets = 3; + for (s = ts.num_samples * 2; s < (ts.num_samples * 3) - 2; s++) { + sample_sets[s] = (tsk_id_t) s - (tsk_id_t) ts.num_samples * 2; + } + + ret = tsk_treeseq_r2(&ts, num_sample_sets, sample_set_sizes, sample_sets, 0, NULL, 0, + NULL, 0, &result_size, &result); + + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_EQUAL(result_size, 6); + assert_arrays_almost_equal(result_size * num_sample_sets, result, truth_three_sets); + tsk_safe_free(result); + + tsk_treeseq_free(&ts); +} + +static void +test_two_site_correlated_multiallelic(void) +{ + const char *nodes = "1 0 -1\n" + "1 0 -1\n" + "1 0 -1\n" + "1 0 -1\n" + "1 0 -1\n" + "1 0 -1\n" + "1 0 -1\n" + "1 0 -1\n" + "1 0 -1\n" + "0 2 -1\n" + "0 4 -1\n" + "0 6 -1\n" + "0 8 -1\n" + "0 10 -1\n" + "0 12 -1\n" + "0 14 -1\n" + "0 16 -1\n"; + const char *edges = "0 20 9 0,1\n" + "0 20 10 2,9\n" + "0 20 11 4,5\n" + "0 20 12 6,11\n" + "0 20 13 7,8\n" + "0 20 14 3,10\n" + "0 10 15 12\n" + "10 20 15 13\n" + "0 10 15 14\n" + "10 20 15 14\n" + "10 20 16 12\n" + "0 10 16 13\n" + "0 10 16 15\n" + "10 20 16 15\n"; + const char *sites = "7 A\n" + "13 G\n"; + const char *mutations = "0 15 T -1\n" + "0 14 G 0\n" + "1 15 T -1\n" + "1 13 C 2\n"; + + int ret; + + tsk_treeseq_t ts; + double *result; + tsk_size_t s, result_size; + + double truth_D[3] + = { 0.043209876543209874, -0.018518518518518517, 0.05555555555555555 }; + double truth_D2[3] + = { 0.023844603634269844, 0.02384460363426984, 0.02384460363426984 }; + double truth_r2[3] = { 1, 1, 1 }; + double truth_D_prime[3] + = { 0.7777777777777777, 0.4444444444444444, 0.6666666666666666 }; + double truth_r[3] + = { 0.18377223398316206, -0.12212786219416509, 0.2609542781331212 }; + double truth_Dz[3] + = { 0.0033870175616860566, 0.003387017561686057, 0.003387017561686057 }; + double truth_pi2[3] + = { 0.04579247743399549, 0.04579247743399549, 0.0457924774339955 }; + + tsk_treeseq_from_text(&ts, 20, nodes, edges, NULL, sites, mutations, NULL, NULL, 0); + + tsk_size_t sample_set_sizes[1] = { ts.num_samples }; + tsk_size_t num_sample_sets = 1; + tsk_id_t sample_sets[ts.num_samples]; + + for (s = 0; s < ts.num_samples; s++) { + sample_sets[s] = (tsk_id_t) s; + } + + ret = tsk_treeseq_D(&ts, num_sample_sets, sample_set_sizes, sample_sets, 0, NULL, 0, + NULL, 0, &result_size, &result); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_EQUAL(result_size, 3); + assert_arrays_almost_equal(result_size, result, truth_D); + tsk_safe_free(result); + + ret = tsk_treeseq_D2(&ts, num_sample_sets, sample_set_sizes, sample_sets, 0, NULL, 0, + NULL, 0, &result_size, &result); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_EQUAL(result_size, 3); + assert_arrays_almost_equal(result_size, result, truth_D2); + tsk_safe_free(result); + + ret = tsk_treeseq_r2(&ts, num_sample_sets, sample_set_sizes, sample_sets, 0, NULL, 0, + NULL, 0, &result_size, &result); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_EQUAL(result_size, 3); + assert_arrays_almost_equal(result_size, result, truth_r2); + tsk_safe_free(result); + + ret = tsk_treeseq_D_prime(&ts, num_sample_sets, sample_set_sizes, sample_sets, 0, + NULL, 0, NULL, 0, &result_size, &result); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_EQUAL(result_size, 3); + assert_arrays_almost_equal(result_size, result, truth_D_prime); + tsk_safe_free(result); + + ret = tsk_treeseq_r(&ts, num_sample_sets, sample_set_sizes, sample_sets, 0, NULL, 0, + NULL, 0, &result_size, &result); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_EQUAL(result_size, 3); + assert_arrays_almost_equal(result_size, result, truth_r); + tsk_safe_free(result); + + ret = tsk_treeseq_Dz(&ts, num_sample_sets, sample_set_sizes, sample_sets, 0, NULL, 0, + NULL, 0, &result_size, &result); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_EQUAL(result_size, 3); + assert_arrays_almost_equal(result_size, result, truth_Dz); + tsk_safe_free(result); + + ret = tsk_treeseq_pi2(&ts, num_sample_sets, sample_set_sizes, sample_sets, 0, NULL, + 0, NULL, 0, &result_size, &result); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_EQUAL(result_size, 3); + assert_arrays_almost_equal(result_size, result, truth_pi2); + tsk_safe_free(result); + + tsk_treeseq_free(&ts); +} + +static void +test_two_site_uncorrelated_multiallelic(void) +{ + const char *nodes = "1 0 -1\n" + "1 0 -1\n" + "1 0 -1\n" + "1 0 -1\n" + "1 0 -1\n" + "1 0 -1\n" + "1 0 -1\n" + "1 0 -1\n" + "1 0 -1\n" + "0 2 -1\n" + "0 4 -1\n" + "0 6 -1\n" + "0 8 -1\n" + "0 10 -1\n" + "0 12 -1\n" + "0 14 -1\n" + "0 16 -1\n" + "0 2 -1\n" + "0 4 -1\n" + "0 6 -1\n" + "0 8 -1\n" + "0 10 -1\n" + "0 12 -1\n" + "0 14 -1\n" + "0 16 -1\n"; + const char *edges = "0 10 9 0,1\n" + "10 20 17 0,3\n" + "0 10 10 2,9\n" + "10 20 18 6,17\n" + "0 10 11 3,4\n" + "10 20 19 1,4\n" + "0 10 12 5,11\n" + "10 20 20 7,19\n" + "0 10 13 6,7\n" + "10 20 21 2,5\n" + "0 10 14 8,13\n" + "10 20 22 8,21\n" + "0 10 15 10,12\n" + "10 20 23 18,20\n" + "0 10 16 14,15\n" + "10 20 24 22,23\n"; + const char *sites = "7 A\n" + "13 G\n"; + const char *mutations = "0 15 T -1\n" + "0 12 G 0\n" + "1 23 T -1\n" + "1 20 A 2\n"; + + tsk_treeseq_t ts; + + int ret; + double *result; + tsk_size_t result_size; + + double truth_D[3] = { 0.05555555555555555, 0.0, 0.05555555555555555 }; + double truth_D2[3] = { 0.024691358024691357, 0.0, 0.024691358024691357 }; + double truth_r2[3] = { 1, 0, 1 }; + double truth_D_prime[3] = { 0.6666666666666665, 0.0, 0.6666666666666665 }; + double truth_r[3] = { 0.24999999999999997, 0.0, 0.24999999999999997 }; + double truth_Dz[3] = { 0.0, 0.0, 0.0 }; + double truth_pi2[3] + = { 0.04938271604938272, 0.04938271604938272, 0.04938271604938272 }; + + tsk_treeseq_from_text(&ts, 20, nodes, edges, NULL, sites, mutations, NULL, NULL, 0); + + tsk_size_t sample_set_sizes[1] = { ts.num_samples }; + tsk_size_t num_sample_sets = 1; + tsk_id_t sample_sets[ts.num_samples]; + + for (tsk_size_t s = 0; s < ts.num_samples; s++) { + sample_sets[s] = (tsk_id_t) s; + } + + ret = tsk_treeseq_D(&ts, num_sample_sets, sample_set_sizes, sample_sets, 0, NULL, 0, + NULL, 0, &result_size, &result); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_EQUAL(result_size, 3); + assert_arrays_almost_equal(result_size, result, truth_D); + tsk_safe_free(result); + + ret = tsk_treeseq_D2(&ts, num_sample_sets, sample_set_sizes, sample_sets, 0, NULL, 0, + NULL, 0, &result_size, &result); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_EQUAL(result_size, 3); + assert_arrays_almost_equal(result_size, result, truth_D2); + tsk_safe_free(result); + + ret = tsk_treeseq_r2(&ts, num_sample_sets, sample_set_sizes, sample_sets, 0, NULL, 0, + NULL, 0, &result_size, &result); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_EQUAL(result_size, 3); + assert_arrays_almost_equal(result_size, result, truth_r2); + tsk_safe_free(result); + + ret = tsk_treeseq_D_prime(&ts, num_sample_sets, sample_set_sizes, sample_sets, 0, + NULL, 0, NULL, 0, &result_size, &result); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_EQUAL(result_size, 3); + assert_arrays_almost_equal(result_size, result, truth_D_prime); + tsk_safe_free(result); + + ret = tsk_treeseq_r(&ts, num_sample_sets, sample_set_sizes, sample_sets, 0, NULL, 0, + NULL, 0, &result_size, &result); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_EQUAL(result_size, 3); + assert_arrays_almost_equal(result_size, result, truth_r); + tsk_safe_free(result); + + ret = tsk_treeseq_Dz(&ts, num_sample_sets, sample_set_sizes, sample_sets, 0, NULL, 0, + NULL, 0, &result_size, &result); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_EQUAL(result_size, 3); + assert_arrays_almost_equal(result_size, result, truth_Dz); + tsk_safe_free(result); + + ret = tsk_treeseq_pi2(&ts, num_sample_sets, sample_set_sizes, sample_sets, 0, NULL, + 0, NULL, 0, &result_size, &result); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_EQUAL(result_size, 3); + assert_arrays_almost_equal(result_size, result, truth_pi2); + tsk_safe_free(result); + + tsk_treeseq_free(&ts); +} + static void test_simplest_divergence_matrix(void) { @@ -2248,6 +2568,12 @@ main(int argc, char **argv) { "test_ld_multi_mutations", test_ld_multi_mutations }, { "test_ld_silent_mutations", test_ld_silent_mutations }, + { "test_paper_ex_two_site", test_paper_ex_two_site }, + { "test_two_site_correlated_multiallelic", + test_two_site_correlated_multiallelic }, + { "test_two_site_uncorrelated_multiallelic", + test_two_site_uncorrelated_multiallelic }, + { "test_simplest_divergence_matrix", test_simplest_divergence_matrix }, { "test_simplest_divergence_matrix_windows", test_simplest_divergence_matrix_windows }, diff --git a/c/tskit/core.c b/c/tskit/core.c index 5a8ed6d9ac..44be9740fc 100644 --- a/c/tskit/core.c +++ b/c/tskit/core.c @@ -1177,3 +1177,101 @@ tsk_avl_tree_int_ordered_nodes(const tsk_avl_tree_int_t *self, tsk_avl_node_int_ ordered_nodes_traverse(self->head.rlink, 0, out); return 0; } + +// Bit Array implementation. Allows us to store unsigned integers in a compact manner. +// Currently implemented as an array of 32-bit unsigned integers for ease of counting. + +int +tsk_bit_array_init(tsk_bit_array_t *self, tsk_size_t num_bits, tsk_size_t length) +{ + int ret = 0; + + self->size = (num_bits >> TSK_BIT_ARRAY_CHUNK) + + (num_bits % TSK_BIT_ARRAY_NUM_BITS ? 1 : 0); + self->data = tsk_calloc(self->size * length, sizeof(*self->data)); + if (self->data == NULL) { + ret = TSK_ERR_NO_MEMORY; + goto out; + } +out: + return ret; +} + +void +tsk_bit_array_get_row(const tsk_bit_array_t *self, tsk_size_t row, tsk_bit_array_t *out) +{ + out->size = self->size; + out->data = self->data + (row * self->size); +} + +void +tsk_bit_array_intersect( + const tsk_bit_array_t *self, const tsk_bit_array_t *other, tsk_bit_array_t *out) +{ + for (tsk_size_t i = 0; i < self->size; i++) { + out->data[i] = self->data[i] & other->data[i]; + } +} + +void +tsk_bit_array_subtract(tsk_bit_array_t *self, const tsk_bit_array_t *other) +{ + for (tsk_size_t i = 0; i < self->size; i++) { + self->data[i] &= ~(other->data[i]); + } +} + +void +tsk_bit_array_add(tsk_bit_array_t *self, const tsk_bit_array_t *other) +{ + for (tsk_size_t i = 0; i < self->size; i++) { + self->data[i] |= other->data[i]; + } +} + +void +tsk_bit_array_add_bit(tsk_bit_array_t *self, const tsk_bit_array_value_t bit) +{ + tsk_bit_array_value_t i = bit >> TSK_BIT_ARRAY_CHUNK; + self->data[i] |= (tsk_bit_array_value_t) 1 << (bit - (TSK_BIT_ARRAY_NUM_BITS * i)); +} + +bool +tsk_bit_array_contains(const tsk_bit_array_t *self, const tsk_bit_array_value_t bit) +{ + tsk_bit_array_value_t i = bit >> TSK_BIT_ARRAY_CHUNK; + return self->data[i] + & ((tsk_bit_array_value_t) 1 << (bit - (TSK_BIT_ARRAY_NUM_BITS * i))); +} + +tsk_size_t +tsk_bit_array_count(const tsk_bit_array_t *self) +{ + // Utilizes 12 operations per bit array. NB this only works on 32 bit integers. + // Taken from: + // https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel + // There's a nice breakdown of this algorithm here: + // https://stackoverflow.com/a/109025 + // Could probably do better with explicit SIMD (instead of SWAR), but not as + // portable: https://arxiv.org/pdf/1611.07612.pdf + // + // There is one solution to explore further, which uses __builtin_popcountll. + // This option is relatively simple, but requires a 64 bit bit array and also + // involves some compiler flag plumbing (-mpopcnt) + + tsk_bit_array_value_t tmp; + tsk_size_t i, count = 0; + + for (i = 0; i < self->size; i++) { + tmp = self->data[i] - ((self->data[i] >> 1) & 0x55555555); + tmp = (tmp & 0x33333333) + ((tmp >> 2) & 0x33333333); + count += (((tmp + (tmp >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24; + } + return count; +} + +void +tsk_bit_array_free(tsk_bit_array_t *self) +{ + tsk_safe_free(self->data); +} diff --git a/c/tskit/core.h b/c/tskit/core.h index 45a33dd8b7..a10f4376a5 100644 --- a/c/tskit/core.h +++ b/c/tskit/core.h @@ -1009,6 +1009,30 @@ int tsk_memcmp(const void *s1, const void *s2, tsk_size_t size); void tsk_set_debug_stream(FILE *f); FILE *tsk_get_debug_stream(void); +/* Bit Array functionality */ + +typedef uint32_t tsk_bit_array_value_t; +typedef struct { + tsk_size_t size; // Number of chunks per row + tsk_bit_array_value_t *data; // Array data +} tsk_bit_array_t; + +#define TSK_BIT_ARRAY_CHUNK 5U +#define TSK_BIT_ARRAY_NUM_BITS (1U << TSK_BIT_ARRAY_CHUNK) + +int tsk_bit_array_init(tsk_bit_array_t *self, tsk_size_t num_bits, tsk_size_t length); +void tsk_bit_array_free(tsk_bit_array_t *self); +void tsk_bit_array_get_row( + const tsk_bit_array_t *self, tsk_size_t row, tsk_bit_array_t *out); +void tsk_bit_array_intersect( + const tsk_bit_array_t *self, const tsk_bit_array_t *other, tsk_bit_array_t *out); +void tsk_bit_array_subtract(tsk_bit_array_t *self, const tsk_bit_array_t *other); +void tsk_bit_array_add(tsk_bit_array_t *self, const tsk_bit_array_t *other); +void tsk_bit_array_add_bit(tsk_bit_array_t *self, const tsk_bit_array_value_t bit); +bool tsk_bit_array_contains( + const tsk_bit_array_t *self, const tsk_bit_array_value_t bit); +tsk_size_t tsk_bit_array_count(const tsk_bit_array_t *self); + #ifdef __cplusplus } #endif diff --git a/c/tskit/trees.c b/c/tskit/trees.c index 0147cbd9ab..0c9889db52 100644 --- a/c/tskit/trees.c +++ b/c/tskit/trees.c @@ -1483,7 +1483,7 @@ get_allele_weights(const tsk_site_t *site, const double *state, tsk_size_t state allele_row[k] += state_row[k]; } - /* Get the index for the alternate allele that we must substract from */ + /* Get the index for the alternate allele that we must subtract from */ alt_allele = site->ancestral_state; alt_allele_length = site->ancestral_state_length; if (mutation.parent != TSK_NULL) { @@ -2124,6 +2124,530 @@ tsk_treeseq_sample_count_stat(const tsk_treeseq_t *self, tsk_size_t num_sample_s return ret; } +/*********************************** + * Two Locus Statistics + ***********************************/ + +static int +get_allele_samples(const tsk_site_t *site, const tsk_bit_array_t *state, + tsk_bit_array_t *out_allele_samples, tsk_size_t *out_num_alleles) +{ + int ret = 0; + tsk_mutation_t mutation, parent_mut; + tsk_size_t mutation_index, allele, alt_allele_length; + /* The allele table */ + tsk_size_t max_alleles = site->mutations_length + 1; + const char **alleles = tsk_malloc(max_alleles * sizeof(*alleles)); + tsk_size_t *allele_lengths = tsk_calloc(max_alleles, sizeof(*allele_lengths)); + const char *alt_allele; + tsk_bit_array_t state_row; + tsk_bit_array_t allele_samples_row; + tsk_bit_array_t alt_allele_samples_row; + tsk_size_t num_alleles = 1; + + if (alleles == NULL || allele_lengths == NULL) { + ret = TSK_ERR_NO_MEMORY; + goto out; + } + + tsk_bug_assert(state != NULL); + alleles[0] = site->ancestral_state; + allele_lengths[0] = site->ancestral_state_length; + + for (mutation_index = 0; mutation_index < site->mutations_length; mutation_index++) { + mutation = site->mutations[mutation_index]; + /* Compute the allele index for this derived state value. */ + for (allele = 0; allele < num_alleles; allele++) { + if (mutation.derived_state_length == allele_lengths[allele] + && tsk_memcmp( + mutation.derived_state, alleles[allele], allele_lengths[allele]) + == 0) { + break; + } + } + if (allele == num_alleles) { + tsk_bug_assert(allele < max_alleles); + alleles[allele] = mutation.derived_state; + allele_lengths[allele] = mutation.derived_state_length; + num_alleles++; + } + + /* Add the mutation's samples to this allele */ + tsk_bit_array_get_row(out_allele_samples, allele, &allele_samples_row); + tsk_bit_array_get_row(state, mutation_index, &state_row); + tsk_bit_array_add(&allele_samples_row, &state_row); + + /* Get the index for the alternate allele that we must subtract from */ + alt_allele = site->ancestral_state; + alt_allele_length = site->ancestral_state_length; + if (mutation.parent != TSK_NULL) { + parent_mut = site->mutations[mutation.parent - site->mutations[0].id]; + alt_allele = parent_mut.derived_state; + alt_allele_length = parent_mut.derived_state_length; + } + for (allele = 0; allele < num_alleles; allele++) { + if (alt_allele_length == allele_lengths[allele] + && tsk_memcmp(alt_allele, alleles[allele], allele_lengths[allele]) + == 0) { + break; + } + } + tsk_bug_assert(allele < num_alleles); + + tsk_bit_array_get_row(out_allele_samples, allele, &alt_allele_samples_row); + tsk_bit_array_subtract(&alt_allele_samples_row, &allele_samples_row); + } + *out_num_alleles = num_alleles; +out: + tsk_safe_free(alleles); + tsk_safe_free(allele_lengths); + return ret; +} + +static int +norm_hap_weighted(tsk_size_t state_dim, const double *hap_weights, + tsk_size_t TSK_UNUSED(n_a), tsk_size_t TSK_UNUSED(n_b), double *result, void *params) +{ + sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; + const double *weight_row; + double n; + tsk_size_t k; + + for (k = 0; k < state_dim; k++) { + weight_row = GET_2D_ROW(hap_weights, 3, k); + n = (double) args.sample_set_sizes[k]; + // TODO: what to do when n = 0 + result[k] = weight_row[0] / n; + } + return 0; +} + +static int +norm_total_weighted(tsk_size_t state_dim, const double *TSK_UNUSED(hap_weights), + tsk_size_t n_a, tsk_size_t n_b, double *result, void *TSK_UNUSED(params)) +{ + tsk_size_t k; + + for (k = 0; k < state_dim; k++) { + result[k] = 1 / (double) (n_a * n_b); + } + return 0; +} + +static void +get_all_samples_bits(tsk_bit_array_t *all_samples, tsk_size_t n) +{ + tsk_size_t i; + const tsk_bit_array_value_t all = ~((tsk_bit_array_value_t) 0); + const tsk_bit_array_value_t remainder_samples = n % TSK_BIT_ARRAY_NUM_BITS; + + all_samples->data[all_samples->size - 1] + = remainder_samples ? ~(all << remainder_samples) : all; + for (i = 0; i < all_samples->size - 1; i++) { + all_samples->data[i] = all; + } +} + +typedef int norm_func_t(tsk_size_t state_dim, const double *hap_weights, tsk_size_t n_a, + tsk_size_t n_b, double *result, void *params); + +static int +compute_general_two_site_stat_result(const tsk_bit_array_t *site_a_state, + const tsk_bit_array_t *site_b_state, tsk_size_t num_a_alleles, + tsk_size_t num_b_alleles, tsk_size_t num_samples, tsk_size_t state_dim, + const tsk_bit_array_t *sample_sets, tsk_size_t result_dim, general_stat_func_t *f, + sample_count_stat_params_t *f_params, norm_func_t *norm_f, bool polarised, + double *result) +{ + int ret = 0; + tsk_bit_array_t A_samples, B_samples; + // ss_ prefix refers to a sample set + tsk_bit_array_t ss_row; + tsk_bit_array_t ss_A_samples, ss_B_samples, ss_AB_samples, AB_samples; + // Sample sets and b sites are rows, a sites are columns + // b1 b2 b3 + // a1 [s1, s2, s3] [s1, s2, s3] [s1, s2, s3] + // a2 [s1, s2, s3] [s1, s2, s3] [s1, s2, s3] + // a3 [s1, s2, s3] [s1, s2, s3] [s1, s2, s3] + tsk_size_t k, mut_a, mut_b; + tsk_size_t row_len = num_b_alleles * state_dim; + tsk_size_t w_A = 0, w_B = 0, w_AB = 0; + uint8_t polarised_val = polarised ? 1 : 0; + double *hap_weight_row; + double *result_tmp_row; + double *weights = tsk_malloc(3 * state_dim * sizeof(*weights)); + double *norm = tsk_malloc(state_dim * sizeof(*norm)); + double *result_tmp = tsk_malloc(row_len * num_a_alleles * sizeof(*result_tmp)); + + tsk_memset(&ss_A_samples, 0, sizeof(ss_A_samples)); + tsk_memset(&ss_B_samples, 0, sizeof(ss_B_samples)); + tsk_memset(&ss_AB_samples, 0, sizeof(ss_AB_samples)); + tsk_memset(&AB_samples, 0, sizeof(AB_samples)); + + if (weights == NULL || norm == NULL || result_tmp == NULL) { + ret = TSK_ERR_NO_MEMORY; + goto out; + } + + ret = tsk_bit_array_init(&ss_A_samples, num_samples, 1); + if (ret != 0) { + goto out; + } + ret = tsk_bit_array_init(&ss_B_samples, num_samples, 1); + if (ret != 0) { + goto out; + } + ret = tsk_bit_array_init(&ss_AB_samples, num_samples, 1); + if (ret != 0) { + goto out; + } + ret = tsk_bit_array_init(&AB_samples, num_samples, 1); + if (ret != 0) { + goto out; + } + + for (mut_a = polarised_val; mut_a < num_a_alleles; mut_a++) { + result_tmp_row = GET_2D_ROW(result_tmp, row_len, mut_a); + for (mut_b = polarised_val; mut_b < num_b_alleles; mut_b++) { + tsk_bit_array_get_row(site_a_state, mut_a, &A_samples); + tsk_bit_array_get_row(site_b_state, mut_b, &B_samples); + tsk_bit_array_intersect(&A_samples, &B_samples, &AB_samples); + for (k = 0; k < state_dim; k++) { + tsk_bit_array_get_row(sample_sets, k, &ss_row); + hap_weight_row = GET_2D_ROW(weights, 3, k); + + tsk_bit_array_intersect(&A_samples, &ss_row, &ss_A_samples); + tsk_bit_array_intersect(&B_samples, &ss_row, &ss_B_samples); + tsk_bit_array_intersect(&AB_samples, &ss_row, &ss_AB_samples); + + w_AB = tsk_bit_array_count(&ss_AB_samples); + w_A = tsk_bit_array_count(&ss_A_samples); + w_B = tsk_bit_array_count(&ss_B_samples); + + hap_weight_row[0] = (double) w_AB; + hap_weight_row[1] = (double) (w_A - w_AB); // w_Ab + hap_weight_row[2] = (double) (w_B - w_AB); // w_aB + } + ret = f(state_dim, weights, result_dim, result_tmp_row, f_params); + if (ret != 0) { + goto out; + } + ret = norm_f(state_dim, weights, num_a_alleles - polarised_val, + num_b_alleles - polarised_val, norm, f_params); + if (ret != 0) { + goto out; + } + for (k = 0; k < state_dim; k++) { + result[k] += result_tmp_row[k] * norm[k]; + } + result_tmp_row += state_dim; // Advance to the next column + } + } + +out: + tsk_safe_free(weights); + tsk_safe_free(norm); + tsk_safe_free(result_tmp); + tsk_bit_array_free(&ss_A_samples); + tsk_bit_array_free(&ss_B_samples); + tsk_bit_array_free(&ss_AB_samples); + tsk_bit_array_free(&AB_samples); + return ret; +} + +static int +get_mutation_samples( + const tsk_treeseq_t *ts, tsk_size_t *num_alleles, tsk_bit_array_t *allele_samples) +{ + int ret = 0; + const tsk_flags_t *restrict flags = ts->tables->nodes.flags; + const tsk_size_t num_samples = tsk_treeseq_get_num_samples(ts); + const tsk_size_t *restrict site_muts_len = ts->site_mutations_length; + const tsk_site_t *restrict site; + tsk_tree_t tree; + tsk_bit_array_t all_samples_bits, mut_samples, mut_samples_row, out_row; + tsk_size_t max_muts_len, mut_offset, num_nodes, s, m, n; + tsk_id_t node, *nodes = NULL; + void *tmp_nodes; + + tsk_memset(&mut_samples, 0, sizeof(mut_samples)); + tsk_memset(&all_samples_bits, 0, sizeof(all_samples_bits)); + + max_muts_len = 0; + for (s = 0; s < ts->tables->sites.num_rows; s++) { + if (site_muts_len[s] > max_muts_len) { + max_muts_len = site_muts_len[s]; + } + } + ret = tsk_bit_array_init(&mut_samples, num_samples, max_muts_len); + if (ret != 0) { + goto out; + } + ret = tsk_bit_array_init(&all_samples_bits, num_samples, 1); + if (ret != 0) { + goto out; + } + + ret = tsk_tree_init(&tree, ts, TSK_NO_SAMPLE_COUNTS); + if (ret != 0) { + goto out; + } + + // A future improvement could get a union of all sample sets + // instead of all samples + get_all_samples_bits(&all_samples_bits, num_samples); + + // Traverse down each tree, recording all samples below each mutation. We perform one + // preorder traversal per mutation. + mut_offset = 0; + for (ret = tsk_tree_first(&tree); ret == TSK_TREE_OK; ret = tsk_tree_next(&tree)) { + tmp_nodes = tsk_realloc(nodes, tsk_tree_get_size_bound(&tree) * sizeof(*nodes)); + if (tmp_nodes == NULL) { + ret = TSK_ERR_NO_MEMORY; + goto out; + } + nodes = tmp_nodes; + for (s = 0; s < tree.sites_length; s++) { + site = &tree.sites[s]; + tsk_bit_array_get_row(allele_samples, mut_offset, &out_row); + tsk_bit_array_add(&out_row, &all_samples_bits); + // Zero out results before the start of each iteration + tsk_memset(mut_samples.data, 0, + mut_samples.size * max_muts_len * sizeof(tsk_bit_array_value_t)); + for (m = 0; m < site->mutations_length; m++) { + tsk_bit_array_get_row(&mut_samples, m, &mut_samples_row); + node = site->mutations[m].node; + ret = tsk_tree_preorder_from(&tree, node, nodes, &num_nodes); + if (ret != 0) { + goto out; + } + for (n = 0; n < num_nodes; n++) { + node = nodes[n]; + if (flags[node] & TSK_NODE_IS_SAMPLE) { + tsk_bit_array_add_bit( + &mut_samples_row, (tsk_bit_array_value_t) node); + } + } + mut_offset++; + } + mut_offset++; // One more for the ancestral allele + get_allele_samples(site, &mut_samples, &out_row, &(num_alleles[site->id])); + } + } + // if adding code below, check ret before continuing +out: + tsk_safe_free(nodes); + tsk_tree_free(&tree); + tsk_bit_array_free(&mut_samples); + tsk_bit_array_free(&all_samples_bits); + return ret; +} + +static int +tsk_treeseq_two_site_count_stat(const tsk_treeseq_t *self, tsk_size_t state_dim, + const tsk_bit_array_t *sample_sets, tsk_size_t result_dim, general_stat_func_t *f, + sample_count_stat_params_t *f_params, norm_func_t *norm_f, + const double *TSK_UNUSED(left_window), const double *TSK_UNUSED(right_window), + tsk_flags_t options, tsk_size_t *result_size, double **result) +{ + int ret = 0; + tsk_bit_array_t allele_samples; + tsk_bit_array_t site_a_state, site_b_state; + tsk_size_t inner, result_offset, inner_offset, a_offset, b_offset; + tsk_size_t site_a, site_b; + bool polarised = false; + const tsk_size_t num_sites = self->tables->sites.num_rows; + const tsk_size_t num_samples = self->num_samples; + const tsk_size_t max_alleles = self->tables->mutations.num_rows + num_sites; + tsk_size_t *num_alleles = tsk_malloc(num_sites * sizeof(*num_alleles)); + const tsk_size_t *restrict site_muts_len = self->site_mutations_length; + + tsk_memset(&allele_samples, 0, sizeof(allele_samples)); + + if (num_alleles == NULL) { + ret = TSK_ERR_NO_MEMORY; + goto out; + } + + ret = tsk_bit_array_init(&allele_samples, num_samples, max_alleles); + if (ret != 0) { + goto out; + } + ret = get_mutation_samples(self, num_alleles, &allele_samples); + if (ret != 0) { + goto out; + } + + // Number of pairs w/ replacement (sites) + *result_size = (num_sites * (1 + num_sites)) / 2U; + *result = tsk_calloc(*result_size * result_dim, sizeof(**result)); + + if (result == NULL) { + ret = TSK_ERR_NO_MEMORY; + goto out; + } + + if (options & TSK_STAT_POLARISED) { + polarised = true; + } + + inner = 0; + a_offset = 0; + b_offset = 0; + inner_offset = 0; + result_offset = 0; + // TODO: implement windows! + for (site_a = 0; site_a < num_sites; site_a++) { + b_offset = inner_offset; + for (site_b = inner; site_b < num_sites; site_b++) { + tsk_bit_array_get_row(&allele_samples, a_offset, &site_a_state); + tsk_bit_array_get_row(&allele_samples, b_offset, &site_b_state); + ret = compute_general_two_site_stat_result(&site_a_state, &site_b_state, + num_alleles[site_a], num_alleles[site_b], num_samples, state_dim, + sample_sets, result_dim, f, f_params, norm_f, polarised, + &((*result)[result_offset])); + if (ret != 0) { + goto out; + } + result_offset += result_dim; + b_offset += site_muts_len[site_b] + 1; + } + a_offset += site_muts_len[site_a] + 1; + inner_offset += site_muts_len[site_a] + 1; + inner++; + } + +out: + tsk_safe_free(num_alleles); + tsk_bit_array_free(&allele_samples); + return ret; +} + +static int +sample_sets_to_bit_array(const tsk_treeseq_t *self, const tsk_size_t *sample_set_sizes, + const tsk_id_t *sample_sets, tsk_size_t num_sample_sets, + tsk_bit_array_t *sample_sets_bits) +{ + int ret; + tsk_bit_array_t bits_row; + tsk_size_t j, k, l; + tsk_id_t u, sample_index; + + ret = tsk_bit_array_init(sample_sets_bits, self->num_samples, num_sample_sets); + if (ret != 0) { + return ret; + } + + j = 0; + for (k = 0; k < num_sample_sets; k++) { + tsk_bit_array_get_row(sample_sets_bits, k, &bits_row); + for (l = 0; l < sample_set_sizes[k]; l++) { + u = sample_sets[j]; + sample_index = self->sample_index_map[u]; + if (tsk_bit_array_contains( + &bits_row, (tsk_bit_array_value_t) sample_index)) { + ret = TSK_ERR_DUPLICATE_SAMPLE; + goto out; + } + tsk_bit_array_add_bit(&bits_row, (tsk_bit_array_value_t) sample_index); + j++; + } + } + +out: + return ret; +} + +static int +tsk_treeseq_two_locus_count_stat(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, + const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, + tsk_size_t result_dim, const tsk_id_t *set_indexes, general_stat_func_t *f, + norm_func_t *norm_f, tsk_size_t num_left_windows, const double *left_windows, + tsk_size_t num_right_windows, const double *right_windows, tsk_flags_t options, + tsk_size_t *result_size, double **result) +{ + // TODO: generalize this function if we ever decide to do weighted two_locus stats. + // We only implement count stats and therefore we don't handle weights. + int ret = 0; + tsk_bit_array_t sample_sets_bits; + bool stat_site = !!(options & TSK_STAT_SITE); + bool stat_branch = !!(options & TSK_STAT_BRANCH); + double default_windows[] = { 0, self->tables->sequence_length }; + tsk_size_t state_dim = num_sample_sets; + sample_count_stat_params_t f_params = { .sample_sets = sample_sets, + .num_sample_sets = num_sample_sets, + .sample_set_sizes = sample_set_sizes, + .set_indexes = set_indexes }; + + tsk_memset(&sample_sets_bits, 0, sizeof(sample_sets_bits)); + + ret = tsk_treeseq_check_sample_sets( + self, num_sample_sets, sample_set_sizes, sample_sets); + if (ret != 0) { + goto out; + } + ret = sample_sets_to_bit_array( + self, sample_set_sizes, sample_sets, num_sample_sets, &sample_sets_bits); + if (ret != 0) { + goto out; + } + + /* If no mode is specified, we default to site mode */ + if (!(stat_site || stat_branch)) { + stat_site = true; + } + /* It's an error to specify more than one mode */ + if (stat_site + stat_branch > 1) { + ret = TSK_ERR_MULTIPLE_STAT_MODES; + goto out; + } + + if (state_dim < 1) { + ret = TSK_ERR_BAD_STATE_DIMS; + goto out; + } + if (result_dim < 1) { + ret = TSK_ERR_BAD_RESULT_DIMS; + goto out; + } + if (left_windows == NULL) { + num_left_windows = 1; + left_windows = default_windows; + num_right_windows = 1; + right_windows = default_windows; + } else if (right_windows == NULL) { + ret = tsk_treeseq_check_windows(self, num_left_windows, left_windows, 0); + if (ret != 0) { + goto out; + } + num_right_windows = num_left_windows; + right_windows = left_windows; + } else { + ret = tsk_treeseq_check_windows(self, num_left_windows, left_windows, 0); + if (ret != 0) { + goto out; + } + ret = tsk_treeseq_check_windows(self, num_right_windows, right_windows, 0); + if (ret != 0) { + goto out; + } + } + + if (stat_site) { + // TODO: assert 1 window left/right + ret = tsk_treeseq_two_site_count_stat(self, state_dim, &sample_sets_bits, + result_dim, f, &f_params, norm_f, left_windows, right_windows, options, + result_size, result); + } else { + // TODO: branch statistics + ret = TSK_ERR_GENERIC; + } + +out: + tsk_bit_array_free(&sample_sets_bits); + return ret; +} + /*********************************** * Allele frequency spectrum ***********************************/ @@ -2922,6 +3446,289 @@ tsk_treeseq_Y1(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, options, result); } +static int +D_summary_func(tsk_size_t state_dim, const double *state, + tsk_size_t TSK_UNUSED(result_dim), double *result, void *params) +{ + sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; + double n; + const double *state_row; + tsk_size_t j; + + for (j = 0; j < state_dim; j++) { + n = (double) args.sample_set_sizes[j]; + state_row = GET_2D_ROW(state, 3, j); + double p_AB = state_row[0] / n; + double p_Ab = state_row[1] / n; + double p_aB = state_row[2] / n; + + double p_A = p_AB + p_Ab; + double p_B = p_AB + p_aB; + result[j] = p_AB - (p_A * p_B); + } + + return 0; +} + +int +tsk_treeseq_D(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, + const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, + tsk_size_t num_left_windows, const double *left_windows, + tsk_size_t num_right_windows, const double *right_windows, tsk_flags_t options, + tsk_size_t *result_size, double **result) +{ + options |= TSK_STAT_POLARISED; // TODO: allow user to pick? + return tsk_treeseq_two_locus_count_stat(self, num_sample_sets, sample_set_sizes, + sample_sets, num_sample_sets, NULL, D_summary_func, norm_total_weighted, + num_left_windows, left_windows, num_right_windows, right_windows, options, + result_size, result); +} + +static int +D2_summary_func(tsk_size_t state_dim, const double *state, + tsk_size_t TSK_UNUSED(result_dim), double *result, void *params) +{ + sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; + double n; + const double *state_row; + tsk_size_t j; + + for (j = 0; j < state_dim; j++) { + n = (double) args.sample_set_sizes[j]; + state_row = GET_2D_ROW(state, 3, j); + double p_AB = state_row[0] / n; + double p_Ab = state_row[1] / n; + double p_aB = state_row[2] / n; + + double p_A = p_AB + p_Ab; + double p_B = p_AB + p_aB; + result[j] = p_AB - (p_A * p_B); + result[j] *= result[j]; + } + + return 0; +} + +int +tsk_treeseq_D2(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, + const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, + tsk_size_t num_left_windows, const double *left_windows, + tsk_size_t num_right_windows, const double *right_windows, tsk_flags_t options, + tsk_size_t *result_size, double **result) +{ + return tsk_treeseq_two_locus_count_stat(self, num_sample_sets, sample_set_sizes, + sample_sets, num_sample_sets, NULL, D2_summary_func, norm_total_weighted, + num_left_windows, left_windows, num_right_windows, right_windows, options, + result_size, result); +} + +static int +r2_summary_func(tsk_size_t state_dim, const double *state, + tsk_size_t TSK_UNUSED(result_dim), double *result, void *params) +{ + sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; + double n; + const double *state_row; + tsk_size_t j; + + for (j = 0; j < state_dim; j++) { + n = (double) args.sample_set_sizes[j]; + state_row = GET_2D_ROW(state, 3, j); + double p_AB = state_row[0] / n; + double p_Ab = state_row[1] / n; + double p_aB = state_row[2] / n; + + double p_A = p_AB + p_Ab; + double p_B = p_AB + p_aB; + + double D_ = p_AB - (p_A * p_B); + double denom = p_A * p_B * (1 - p_A) * (1 - p_B); + + if (denom == 0 && D_ == 0) { + result[j] = 0; + } else { + result[j] = (D_ * D_) / denom; + } + } + return 0; +} + +int +tsk_treeseq_r2(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, + const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, + tsk_size_t num_left_windows, const double *left_windows, + tsk_size_t num_right_windows, const double *right_windows, tsk_flags_t options, + tsk_size_t *result_size, double **result) +{ + return tsk_treeseq_two_locus_count_stat(self, num_sample_sets, sample_set_sizes, + sample_sets, num_sample_sets, NULL, r2_summary_func, norm_hap_weighted, + num_left_windows, left_windows, num_right_windows, right_windows, options, + result_size, result); +} + +static int +D_prime_summary_func(tsk_size_t state_dim, const double *state, + tsk_size_t TSK_UNUSED(result_dim), double *result, void *params) +{ + sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; + double n; + const double *state_row; + tsk_size_t j; + + for (j = 0; j < state_dim; j++) { + n = (double) args.sample_set_sizes[j]; + state_row = GET_2D_ROW(state, 3, j); + double p_AB = state_row[0] / n; + double p_Ab = state_row[1] / n; + double p_aB = state_row[2] / n; + + double p_A = p_AB + p_Ab; + double p_B = p_AB + p_aB; + + double D_ = p_AB - (p_A * p_B); + if (D_ >= 0) { + result[j] = D_ / TSK_MIN(p_A * (1 - p_B), (1 - p_A) * p_B); + } else { + result[j] = D_ / TSK_MIN(p_A * p_B, (1 - p_A) * (1 - p_B)); + } + } + return 0; +} + +int +tsk_treeseq_D_prime(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, + const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, + tsk_size_t num_left_windows, const double *left_windows, + tsk_size_t num_right_windows, const double *right_windows, tsk_flags_t options, + tsk_size_t *result_size, double **result) +{ + options |= TSK_STAT_POLARISED; // TODO: allow user to pick? + return tsk_treeseq_two_locus_count_stat(self, num_sample_sets, sample_set_sizes, + sample_sets, num_sample_sets, NULL, D_prime_summary_func, norm_hap_weighted, + num_left_windows, left_windows, num_right_windows, right_windows, options, + result_size, result); +} + +static int +r_summary_func(tsk_size_t state_dim, const double *state, + tsk_size_t TSK_UNUSED(result_dim), double *result, void *params) +{ + sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; + double n; + const double *state_row; + tsk_size_t j; + + for (j = 0; j < state_dim; j++) { + n = (double) args.sample_set_sizes[j]; + state_row = GET_2D_ROW(state, 3, j); + double p_AB = state_row[0] / n; + double p_Ab = state_row[1] / n; + double p_aB = state_row[2] / n; + + double p_A = p_AB + p_Ab; + double p_B = p_AB + p_aB; + + double D_ = p_AB - (p_A * p_B); + double denom = p_A * p_B * (1 - p_A) * (1 - p_B); + + if (denom == 0 && D_ == 0) { + result[j] = 0; + } else { + result[j] = D_ / sqrt(denom); + } + } + return 0; +} + +int +tsk_treeseq_r(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, + const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, + tsk_size_t num_left_windows, const double *left_windows, + tsk_size_t num_right_windows, const double *right_windows, tsk_flags_t options, + tsk_size_t *result_size, double **result) +{ + options |= TSK_STAT_POLARISED; // TODO: allow user to pick? + return tsk_treeseq_two_locus_count_stat(self, num_sample_sets, sample_set_sizes, + sample_sets, num_sample_sets, NULL, r_summary_func, norm_total_weighted, + num_left_windows, left_windows, num_right_windows, right_windows, options, + result_size, result); +} + +static int +Dz_summary_func(tsk_size_t state_dim, const double *state, + tsk_size_t TSK_UNUSED(result_dim), double *result, void *params) +{ + sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; + double n; + const double *state_row; + tsk_size_t j; + + for (j = 0; j < state_dim; j++) { + n = (double) args.sample_set_sizes[j]; + state_row = GET_2D_ROW(state, 3, j); + double p_AB = state_row[0] / n; + double p_Ab = state_row[1] / n; + double p_aB = state_row[2] / n; + + double p_A = p_AB + p_Ab; + double p_B = p_AB + p_aB; + + double D_ = p_AB - (p_A * p_B); + + result[j] = D_ * (1 - 2 * p_A) * (1 - 2 * p_B); + } + return 0; +} + +int +tsk_treeseq_Dz(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, + const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, + tsk_size_t num_left_windows, const double *left_windows, + tsk_size_t num_right_windows, const double *right_windows, tsk_flags_t options, + tsk_size_t *result_size, double **result) +{ + return tsk_treeseq_two_locus_count_stat(self, num_sample_sets, sample_set_sizes, + sample_sets, num_sample_sets, NULL, Dz_summary_func, norm_total_weighted, + num_left_windows, left_windows, num_right_windows, right_windows, options, + result_size, result); +} + +static int +pi2_summary_func(tsk_size_t state_dim, const double *state, + tsk_size_t TSK_UNUSED(result_dim), double *result, void *params) +{ + sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; + double n; + const double *state_row; + tsk_size_t j; + + for (j = 0; j < state_dim; j++) { + n = (double) args.sample_set_sizes[j]; + state_row = GET_2D_ROW(state, 3, j); + double p_AB = state_row[0] / n; + double p_Ab = state_row[1] / n; + double p_aB = state_row[2] / n; + + double p_A = p_AB + p_Ab; + double p_B = p_AB + p_aB; + result[j] = p_A * (1 - p_A) * p_B * (1 - p_B); + } + return 0; +} + +int +tsk_treeseq_pi2(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, + const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, + tsk_size_t num_left_windows, const double *left_windows, + tsk_size_t num_right_windows, const double *right_windows, tsk_flags_t options, + tsk_size_t *result_size, double **result) +{ + return tsk_treeseq_two_locus_count_stat(self, num_sample_sets, sample_set_sizes, + sample_sets, num_sample_sets, NULL, pi2_summary_func, norm_total_weighted, + num_left_windows, left_windows, num_right_windows, right_windows, options, + result_size, result); +} + /*********************************** * Two way stats ***********************************/ diff --git a/c/tskit/trees.h b/c/tskit/trees.h index 95c66a6ac7..a503b3e39b 100644 --- a/c/tskit/trees.h +++ b/c/tskit/trees.h @@ -919,6 +919,15 @@ typedef int general_stat_func_t(tsk_size_t state_dim, const double *state, int tsk_treeseq_general_stat(const tsk_treeseq_t *self, tsk_size_t K, const double *W, tsk_size_t M, general_stat_func_t *f, void *f_params, tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result); +// TODO: expose this externally? +/* int tsk_treeseq_two_locus_general_stat(const tsk_treeseq_t *self, */ +/* tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, */ +/* const tsk_id_t *sample_sets, tsk_size_t result_dim, const tsk_id_t *set_indexes, + */ +/* general_stat_func_t *f, norm_func_t *norm_f, tsk_size_t num_left_windows, */ +/* const double *left_windows, tsk_size_t num_right_windows, */ +/* const double *right_windows, tsk_flags_t options, tsk_size_t num_result, */ +/* double *result); */ /* One way weighted stats */ @@ -998,6 +1007,42 @@ int tsk_treeseq_genetic_relatedness(const tsk_treeseq_t *self, const tsk_id_t *index_tuples, tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result); +int tsk_treeseq_D(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, + const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, + tsk_size_t num_left_windows, const double *left_windows, + tsk_size_t num_right_windows, const double *right_windows, tsk_flags_t options, + tsk_size_t *result_size, double **result); +int tsk_treeseq_D2(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, + const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, + tsk_size_t num_left_windows, const double *left_windows, + tsk_size_t num_right_windows, const double *right_windows, tsk_flags_t options, + tsk_size_t *result_size, double **result); +int tsk_treeseq_r2(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, + const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, + tsk_size_t num_left_windows, const double *left_windows, + tsk_size_t num_right_windows, const double *right_windows, tsk_flags_t options, + tsk_size_t *result_size, double **result); +int tsk_treeseq_D_prime(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, + const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, + tsk_size_t num_left_windows, const double *left_windows, + tsk_size_t num_right_windows, const double *right_windows, tsk_flags_t options, + tsk_size_t *result_size, double **result); +int tsk_treeseq_r(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, + const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, + tsk_size_t num_left_windows, const double *left_windows, + tsk_size_t num_right_windows, const double *right_windows, tsk_flags_t options, + tsk_size_t *result_size, double **result); +int tsk_treeseq_Dz(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, + const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, + tsk_size_t num_left_windows, const double *left_windows, + tsk_size_t num_right_windows, const double *right_windows, tsk_flags_t options, + tsk_size_t *result_size, double **result); +int tsk_treeseq_pi2(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, + const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, + tsk_size_t num_left_windows, const double *left_windows, + tsk_size_t num_right_windows, const double *right_windows, tsk_flags_t options, + tsk_size_t *result_size, double **result); + /* Three way sample set stats */ int tsk_treeseq_Y3(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets,