diff --git a/c/tests/test_core.c b/c/tests/test_core.c index 97f1d4c210..801e5aaae2 100644 --- a/c/tests/test_core.c +++ b/c/tests/test_core.c @@ -526,6 +526,84 @@ test_avl_random(void) validate_avl(sizeof(keys) / sizeof(*keys), keys); } +static void +test_bit_arrays(void) +{ + // NB: This test is only valid for the 32 bit implementation of bit arrays. If we + // were to change the chunk size of a bit array, we'd need to update these tests + tsk_bit_array_t arr; + tsk_bit_array_init(&arr, 90, 1); + for (tsk_bit_array_value_t i = 0; i < 20; i++) { + tsk_bit_array_add_bit(&arr, i); + } + tsk_bit_array_add_bit(&arr, 63); + tsk_bit_array_add_bit(&arr, 65); + + // these assertions are only valid for 32-bit values + CU_ASSERT_EQUAL_FATAL(arr.data[0], 1048575); + CU_ASSERT_EQUAL_FATAL(arr.data[1], 2147483648); + CU_ASSERT_EQUAL_FATAL(arr.data[2], 2); + + // verify our assumptions about bit array counting + CU_ASSERT_EQUAL_FATAL(tsk_bit_array_count(&arr), 22); + + tsk_bit_array_free(&arr); + + // create a length-2 array with 64 bit capacity + tsk_bit_array_init(&arr, 64, 2); + tsk_bit_array_t arr_row1, arr_row2; + + // select the first and second row + tsk_bit_array_get_row(&arr, 0, &arr_row1); + tsk_bit_array_get_row(&arr, 1, &arr_row2); + + // fill the first 50 bits of the first row + for (tsk_bit_array_value_t i = 0; i < 50; i++) { + tsk_bit_array_add_bit(&arr_row1, i); + } + // fill bits 20-40 of the second row + for (tsk_bit_array_value_t i = 20; i < 40; i++) { + tsk_bit_array_add_bit(&arr_row2, i); + } + + // verify our assumptions about row selection + CU_ASSERT_EQUAL_FATAL(arr.data[0], 4294967295); + CU_ASSERT_EQUAL_FATAL(arr.data[1], 262143); + CU_ASSERT_EQUAL_FATAL(arr_row1.data[0], 4294967295); + CU_ASSERT_EQUAL_FATAL(arr_row1.data[1], 262143); + + CU_ASSERT_EQUAL_FATAL(arr.data[2], 4293918720); + CU_ASSERT_EQUAL_FATAL(arr.data[3], 255); + CU_ASSERT_EQUAL_FATAL(arr_row2.data[0], 4293918720); + CU_ASSERT_EQUAL_FATAL(arr_row2.data[1], 255); + + // subtract the second from the first row, store in first + tsk_bit_array_subtract(&arr_row1, &arr_row2); + + // verify our assumptions about subtraction + CU_ASSERT_EQUAL_FATAL(arr_row1.data[0], 1048575); + CU_ASSERT_EQUAL_FATAL(arr_row1.data[1], 261888); + + tsk_bit_array_t int_result; + tsk_bit_array_init(&int_result, 64, 1); + + // their intersection should be zero + tsk_bit_array_intersect(&arr_row1, &arr_row2, &int_result); + CU_ASSERT_EQUAL_FATAL(int_result.data[0], 0); + CU_ASSERT_EQUAL_FATAL(int_result.data[1], 0); + + // now, add them back together, storing back in a + tsk_bit_array_add(&arr_row1, &arr_row2); + + // now, their intersection should be the subtracted chunk (20-40) + tsk_bit_array_intersect(&arr_row1, &arr_row2, &int_result); + CU_ASSERT_EQUAL_FATAL(int_result.data[0], 4293918720); + CU_ASSERT_EQUAL_FATAL(int_result.data[1], 255); + + tsk_bit_array_free(&int_result); + tsk_bit_array_free(&arr); +} + static void test_meson_version(void) { @@ -554,6 +632,7 @@ main(int argc, char **argv) { "test_avl_sequential", test_avl_sequential }, { "test_avl_interleaved", test_avl_interleaved }, { "test_avl_random", test_avl_random }, + { "test_bit_arrays", test_bit_arrays }, { "test_meson_version", test_meson_version }, { NULL, NULL }, }; diff --git a/c/tests/test_stats.c b/c/tests/test_stats.c index 2594f3c5a6..22c1b7dcf9 100644 --- a/c/tests/test_stats.c +++ b/c/tests/test_stats.c @@ -1985,6 +1985,326 @@ test_ld_silent_mutations(void) free(base_ts); } +static void +test_paper_ex_two_site(void) +{ + tsk_treeseq_t ts; + double *result; + tsk_size_t s, result_size; + int ret; + + double truth_one_set[6] = { 1, 0.1111111111111111, 0.1111111111111111, 1, 1, 1 }; + double truth_two_sets[12] = { 1, 1, 0.1111111111111111, 0.1111111111111111, + 0.1111111111111111, 0.1111111111111111, 1, 1, 1, 1, 1, 1 }; + double truth_three_sets[18] = { 1, 1, 0, 0.1111111111111111, 0.1111111111111111, 0, + 0.1111111111111111, 0.1111111111111111, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; + + tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, + paper_ex_mutations, paper_ex_individuals, NULL, 0); + + tsk_size_t sample_set_sizes[3]; + tsk_size_t num_sample_sets; + tsk_id_t sample_sets[ts.num_samples * 3]; + + // First sample set contains all of the samples + sample_set_sizes[0] = ts.num_samples; + num_sample_sets = 1; + for (s = 0; s < ts.num_samples; s++) { + sample_sets[s] = (tsk_id_t) s; + } + + ret = tsk_treeseq_r2(&ts, num_sample_sets, sample_set_sizes, sample_sets, 0, NULL, 0, + NULL, 0, &result_size, &result); + + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_EQUAL(result_size, 6); + assert_arrays_almost_equal(result_size * num_sample_sets, result, truth_one_set); + tsk_safe_free(result); + + // Second sample set contains all of the samples + sample_set_sizes[1] = ts.num_samples; + num_sample_sets = 2; + for (s = ts.num_samples; s < ts.num_samples * 2; s++) { + sample_sets[s] = (tsk_id_t) s - (tsk_id_t) ts.num_samples; + } + + ret = tsk_treeseq_r2(&ts, num_sample_sets, sample_set_sizes, sample_sets, 0, NULL, 0, + NULL, 0, &result_size, &result); + + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_EQUAL(result_size, 6); + assert_arrays_almost_equal(result_size * num_sample_sets, result, truth_two_sets); + tsk_safe_free(result); + + // Third sample set contains the first two samples + sample_set_sizes[2] = 2; + num_sample_sets = 3; + for (s = ts.num_samples * 2; s < (ts.num_samples * 3) - 2; s++) { + sample_sets[s] = (tsk_id_t) s - (tsk_id_t) ts.num_samples * 2; + } + + ret = tsk_treeseq_r2(&ts, num_sample_sets, sample_set_sizes, sample_sets, 0, NULL, 0, + NULL, 0, &result_size, &result); + + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_EQUAL(result_size, 6); + assert_arrays_almost_equal(result_size * num_sample_sets, result, truth_three_sets); + tsk_safe_free(result); + + tsk_treeseq_free(&ts); +} + +static void +test_two_site_correlated_multiallelic(void) +{ + const char *nodes = "1 0 -1\n" + "1 0 -1\n" + "1 0 -1\n" + "1 0 -1\n" + "1 0 -1\n" + "1 0 -1\n" + "1 0 -1\n" + "1 0 -1\n" + "1 0 -1\n" + "0 2 -1\n" + "0 4 -1\n" + "0 6 -1\n" + "0 8 -1\n" + "0 10 -1\n" + "0 12 -1\n" + "0 14 -1\n" + "0 16 -1\n"; + const char *edges = "0 20 9 0,1\n" + "0 20 10 2,9\n" + "0 20 11 4,5\n" + "0 20 12 6,11\n" + "0 20 13 7,8\n" + "0 20 14 3,10\n" + "0 10 15 12\n" + "10 20 15 13\n" + "0 10 15 14\n" + "10 20 15 14\n" + "10 20 16 12\n" + "0 10 16 13\n" + "0 10 16 15\n" + "10 20 16 15\n"; + const char *sites = "7 A\n" + "13 G\n"; + const char *mutations = "0 15 T -1\n" + "0 14 G 0\n" + "1 15 T -1\n" + "1 13 C 2\n"; + + int ret; + + tsk_treeseq_t ts; + double *result; + tsk_size_t s, result_size; + + double truth_D[3] + = { 0.043209876543209874, -0.018518518518518517, 0.05555555555555555 }; + double truth_D2[3] + = { 0.023844603634269844, 0.02384460363426984, 0.02384460363426984 }; + double truth_r2[3] = { 1, 1, 1 }; + double truth_D_prime[3] + = { 0.7777777777777777, 0.4444444444444444, 0.6666666666666666 }; + double truth_r[3] + = { 0.18377223398316206, -0.12212786219416509, 0.2609542781331212 }; + double truth_Dz[3] + = { 0.0033870175616860566, 0.003387017561686057, 0.003387017561686057 }; + double truth_pi2[3] + = { 0.04579247743399549, 0.04579247743399549, 0.0457924774339955 }; + + tsk_treeseq_from_text(&ts, 20, nodes, edges, NULL, sites, mutations, NULL, NULL, 0); + + tsk_size_t sample_set_sizes[1] = { ts.num_samples }; + tsk_size_t num_sample_sets = 1; + tsk_id_t sample_sets[ts.num_samples]; + + for (s = 0; s < ts.num_samples; s++) { + sample_sets[s] = (tsk_id_t) s; + } + + ret = tsk_treeseq_D(&ts, num_sample_sets, sample_set_sizes, sample_sets, 0, NULL, 0, + NULL, 0, &result_size, &result); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_EQUAL(result_size, 3); + assert_arrays_almost_equal(result_size, result, truth_D); + tsk_safe_free(result); + + ret = tsk_treeseq_D2(&ts, num_sample_sets, sample_set_sizes, sample_sets, 0, NULL, 0, + NULL, 0, &result_size, &result); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_EQUAL(result_size, 3); + assert_arrays_almost_equal(result_size, result, truth_D2); + tsk_safe_free(result); + + ret = tsk_treeseq_r2(&ts, num_sample_sets, sample_set_sizes, sample_sets, 0, NULL, 0, + NULL, 0, &result_size, &result); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_EQUAL(result_size, 3); + assert_arrays_almost_equal(result_size, result, truth_r2); + tsk_safe_free(result); + + ret = tsk_treeseq_D_prime(&ts, num_sample_sets, sample_set_sizes, sample_sets, 0, + NULL, 0, NULL, 0, &result_size, &result); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_EQUAL(result_size, 3); + assert_arrays_almost_equal(result_size, result, truth_D_prime); + tsk_safe_free(result); + + ret = tsk_treeseq_r(&ts, num_sample_sets, sample_set_sizes, sample_sets, 0, NULL, 0, + NULL, 0, &result_size, &result); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_EQUAL(result_size, 3); + assert_arrays_almost_equal(result_size, result, truth_r); + tsk_safe_free(result); + + ret = tsk_treeseq_Dz(&ts, num_sample_sets, sample_set_sizes, sample_sets, 0, NULL, 0, + NULL, 0, &result_size, &result); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_EQUAL(result_size, 3); + assert_arrays_almost_equal(result_size, result, truth_Dz); + tsk_safe_free(result); + + ret = tsk_treeseq_pi2(&ts, num_sample_sets, sample_set_sizes, sample_sets, 0, NULL, + 0, NULL, 0, &result_size, &result); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_EQUAL(result_size, 3); + assert_arrays_almost_equal(result_size, result, truth_pi2); + tsk_safe_free(result); + + tsk_treeseq_free(&ts); +} + +static void +test_two_site_uncorrelated_multiallelic(void) +{ + const char *nodes = "1 0 -1\n" + "1 0 -1\n" + "1 0 -1\n" + "1 0 -1\n" + "1 0 -1\n" + "1 0 -1\n" + "1 0 -1\n" + "1 0 -1\n" + "1 0 -1\n" + "0 2 -1\n" + "0 4 -1\n" + "0 6 -1\n" + "0 8 -1\n" + "0 10 -1\n" + "0 12 -1\n" + "0 14 -1\n" + "0 16 -1\n" + "0 2 -1\n" + "0 4 -1\n" + "0 6 -1\n" + "0 8 -1\n" + "0 10 -1\n" + "0 12 -1\n" + "0 14 -1\n" + "0 16 -1\n"; + const char *edges = "0 10 9 0,1\n" + "10 20 17 0,3\n" + "0 10 10 2,9\n" + "10 20 18 6,17\n" + "0 10 11 3,4\n" + "10 20 19 1,4\n" + "0 10 12 5,11\n" + "10 20 20 7,19\n" + "0 10 13 6,7\n" + "10 20 21 2,5\n" + "0 10 14 8,13\n" + "10 20 22 8,21\n" + "0 10 15 10,12\n" + "10 20 23 18,20\n" + "0 10 16 14,15\n" + "10 20 24 22,23\n"; + const char *sites = "7 A\n" + "13 G\n"; + const char *mutations = "0 15 T -1\n" + "0 12 G 0\n" + "1 23 T -1\n" + "1 20 A 2\n"; + + tsk_treeseq_t ts; + + int ret; + double *result; + tsk_size_t result_size; + + double truth_D[3] = { 0.05555555555555555, 0.0, 0.05555555555555555 }; + double truth_D2[3] = { 0.024691358024691357, 0.0, 0.024691358024691357 }; + double truth_r2[3] = { 1, 0, 1 }; + double truth_D_prime[3] = { 0.6666666666666665, 0.0, 0.6666666666666665 }; + double truth_r[3] = { 0.24999999999999997, 0.0, 0.24999999999999997 }; + double truth_Dz[3] = { 0.0, 0.0, 0.0 }; + double truth_pi2[3] + = { 0.04938271604938272, 0.04938271604938272, 0.04938271604938272 }; + + tsk_treeseq_from_text(&ts, 20, nodes, edges, NULL, sites, mutations, NULL, NULL, 0); + + tsk_size_t sample_set_sizes[1] = { ts.num_samples }; + tsk_size_t num_sample_sets = 1; + tsk_id_t sample_sets[ts.num_samples]; + + for (tsk_size_t s = 0; s < ts.num_samples; s++) { + sample_sets[s] = (tsk_id_t) s; + } + + ret = tsk_treeseq_D(&ts, num_sample_sets, sample_set_sizes, sample_sets, 0, NULL, 0, + NULL, 0, &result_size, &result); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_EQUAL(result_size, 3); + assert_arrays_almost_equal(result_size, result, truth_D); + tsk_safe_free(result); + + ret = tsk_treeseq_D2(&ts, num_sample_sets, sample_set_sizes, sample_sets, 0, NULL, 0, + NULL, 0, &result_size, &result); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_EQUAL(result_size, 3); + assert_arrays_almost_equal(result_size, result, truth_D2); + tsk_safe_free(result); + + ret = tsk_treeseq_r2(&ts, num_sample_sets, sample_set_sizes, sample_sets, 0, NULL, 0, + NULL, 0, &result_size, &result); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_EQUAL(result_size, 3); + assert_arrays_almost_equal(result_size, result, truth_r2); + tsk_safe_free(result); + + ret = tsk_treeseq_D_prime(&ts, num_sample_sets, sample_set_sizes, sample_sets, 0, + NULL, 0, NULL, 0, &result_size, &result); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_EQUAL(result_size, 3); + assert_arrays_almost_equal(result_size, result, truth_D_prime); + tsk_safe_free(result); + + ret = tsk_treeseq_r(&ts, num_sample_sets, sample_set_sizes, sample_sets, 0, NULL, 0, + NULL, 0, &result_size, &result); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_EQUAL(result_size, 3); + assert_arrays_almost_equal(result_size, result, truth_r); + tsk_safe_free(result); + + ret = tsk_treeseq_Dz(&ts, num_sample_sets, sample_set_sizes, sample_sets, 0, NULL, 0, + NULL, 0, &result_size, &result); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_EQUAL(result_size, 3); + assert_arrays_almost_equal(result_size, result, truth_Dz); + tsk_safe_free(result); + + ret = tsk_treeseq_pi2(&ts, num_sample_sets, sample_set_sizes, sample_sets, 0, NULL, + 0, NULL, 0, &result_size, &result); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_EQUAL(result_size, 3); + assert_arrays_almost_equal(result_size, result, truth_pi2); + tsk_safe_free(result); + + tsk_treeseq_free(&ts); +} + static void test_simplest_divergence_matrix(void) { @@ -2248,6 +2568,12 @@ main(int argc, char **argv) { "test_ld_multi_mutations", test_ld_multi_mutations }, { "test_ld_silent_mutations", test_ld_silent_mutations }, + { "test_paper_ex_two_site", test_paper_ex_two_site }, + { "test_two_site_correlated_multiallelic", + test_two_site_correlated_multiallelic }, + { "test_two_site_uncorrelated_multiallelic", + test_two_site_uncorrelated_multiallelic }, + { "test_simplest_divergence_matrix", test_simplest_divergence_matrix }, { "test_simplest_divergence_matrix_windows", test_simplest_divergence_matrix_windows }, diff --git a/c/tskit/core.c b/c/tskit/core.c index 5a8ed6d9ac..44be9740fc 100644 --- a/c/tskit/core.c +++ b/c/tskit/core.c @@ -1177,3 +1177,101 @@ tsk_avl_tree_int_ordered_nodes(const tsk_avl_tree_int_t *self, tsk_avl_node_int_ ordered_nodes_traverse(self->head.rlink, 0, out); return 0; } + +// Bit Array implementation. Allows us to store unsigned integers in a compact manner. +// Currently implemented as an array of 32-bit unsigned integers for ease of counting. + +int +tsk_bit_array_init(tsk_bit_array_t *self, tsk_size_t num_bits, tsk_size_t length) +{ + int ret = 0; + + self->size = (num_bits >> TSK_BIT_ARRAY_CHUNK) + + (num_bits % TSK_BIT_ARRAY_NUM_BITS ? 1 : 0); + self->data = tsk_calloc(self->size * length, sizeof(*self->data)); + if (self->data == NULL) { + ret = TSK_ERR_NO_MEMORY; + goto out; + } +out: + return ret; +} + +void +tsk_bit_array_get_row(const tsk_bit_array_t *self, tsk_size_t row, tsk_bit_array_t *out) +{ + out->size = self->size; + out->data = self->data + (row * self->size); +} + +void +tsk_bit_array_intersect( + const tsk_bit_array_t *self, const tsk_bit_array_t *other, tsk_bit_array_t *out) +{ + for (tsk_size_t i = 0; i < self->size; i++) { + out->data[i] = self->data[i] & other->data[i]; + } +} + +void +tsk_bit_array_subtract(tsk_bit_array_t *self, const tsk_bit_array_t *other) +{ + for (tsk_size_t i = 0; i < self->size; i++) { + self->data[i] &= ~(other->data[i]); + } +} + +void +tsk_bit_array_add(tsk_bit_array_t *self, const tsk_bit_array_t *other) +{ + for (tsk_size_t i = 0; i < self->size; i++) { + self->data[i] |= other->data[i]; + } +} + +void +tsk_bit_array_add_bit(tsk_bit_array_t *self, const tsk_bit_array_value_t bit) +{ + tsk_bit_array_value_t i = bit >> TSK_BIT_ARRAY_CHUNK; + self->data[i] |= (tsk_bit_array_value_t) 1 << (bit - (TSK_BIT_ARRAY_NUM_BITS * i)); +} + +bool +tsk_bit_array_contains(const tsk_bit_array_t *self, const tsk_bit_array_value_t bit) +{ + tsk_bit_array_value_t i = bit >> TSK_BIT_ARRAY_CHUNK; + return self->data[i] + & ((tsk_bit_array_value_t) 1 << (bit - (TSK_BIT_ARRAY_NUM_BITS * i))); +} + +tsk_size_t +tsk_bit_array_count(const tsk_bit_array_t *self) +{ + // Utilizes 12 operations per bit array. NB this only works on 32 bit integers. + // Taken from: + // https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel + // There's a nice breakdown of this algorithm here: + // https://stackoverflow.com/a/109025 + // Could probably do better with explicit SIMD (instead of SWAR), but not as + // portable: https://arxiv.org/pdf/1611.07612.pdf + // + // There is one solution to explore further, which uses __builtin_popcountll. + // This option is relatively simple, but requires a 64 bit bit array and also + // involves some compiler flag plumbing (-mpopcnt) + + tsk_bit_array_value_t tmp; + tsk_size_t i, count = 0; + + for (i = 0; i < self->size; i++) { + tmp = self->data[i] - ((self->data[i] >> 1) & 0x55555555); + tmp = (tmp & 0x33333333) + ((tmp >> 2) & 0x33333333); + count += (((tmp + (tmp >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24; + } + return count; +} + +void +tsk_bit_array_free(tsk_bit_array_t *self) +{ + tsk_safe_free(self->data); +} diff --git a/c/tskit/core.h b/c/tskit/core.h index 45a33dd8b7..a10f4376a5 100644 --- a/c/tskit/core.h +++ b/c/tskit/core.h @@ -1009,6 +1009,30 @@ int tsk_memcmp(const void *s1, const void *s2, tsk_size_t size); void tsk_set_debug_stream(FILE *f); FILE *tsk_get_debug_stream(void); +/* Bit Array functionality */ + +typedef uint32_t tsk_bit_array_value_t; +typedef struct { + tsk_size_t size; // Number of chunks per row + tsk_bit_array_value_t *data; // Array data +} tsk_bit_array_t; + +#define TSK_BIT_ARRAY_CHUNK 5U +#define TSK_BIT_ARRAY_NUM_BITS (1U << TSK_BIT_ARRAY_CHUNK) + +int tsk_bit_array_init(tsk_bit_array_t *self, tsk_size_t num_bits, tsk_size_t length); +void tsk_bit_array_free(tsk_bit_array_t *self); +void tsk_bit_array_get_row( + const tsk_bit_array_t *self, tsk_size_t row, tsk_bit_array_t *out); +void tsk_bit_array_intersect( + const tsk_bit_array_t *self, const tsk_bit_array_t *other, tsk_bit_array_t *out); +void tsk_bit_array_subtract(tsk_bit_array_t *self, const tsk_bit_array_t *other); +void tsk_bit_array_add(tsk_bit_array_t *self, const tsk_bit_array_t *other); +void tsk_bit_array_add_bit(tsk_bit_array_t *self, const tsk_bit_array_value_t bit); +bool tsk_bit_array_contains( + const tsk_bit_array_t *self, const tsk_bit_array_value_t bit); +tsk_size_t tsk_bit_array_count(const tsk_bit_array_t *self); + #ifdef __cplusplus } #endif diff --git a/c/tskit/trees.c b/c/tskit/trees.c index 0147cbd9ab..0c9889db52 100644 --- a/c/tskit/trees.c +++ b/c/tskit/trees.c @@ -1483,7 +1483,7 @@ get_allele_weights(const tsk_site_t *site, const double *state, tsk_size_t state allele_row[k] += state_row[k]; } - /* Get the index for the alternate allele that we must substract from */ + /* Get the index for the alternate allele that we must subtract from */ alt_allele = site->ancestral_state; alt_allele_length = site->ancestral_state_length; if (mutation.parent != TSK_NULL) { @@ -2124,6 +2124,530 @@ tsk_treeseq_sample_count_stat(const tsk_treeseq_t *self, tsk_size_t num_sample_s return ret; } +/*********************************** + * Two Locus Statistics + ***********************************/ + +static int +get_allele_samples(const tsk_site_t *site, const tsk_bit_array_t *state, + tsk_bit_array_t *out_allele_samples, tsk_size_t *out_num_alleles) +{ + int ret = 0; + tsk_mutation_t mutation, parent_mut; + tsk_size_t mutation_index, allele, alt_allele_length; + /* The allele table */ + tsk_size_t max_alleles = site->mutations_length + 1; + const char **alleles = tsk_malloc(max_alleles * sizeof(*alleles)); + tsk_size_t *allele_lengths = tsk_calloc(max_alleles, sizeof(*allele_lengths)); + const char *alt_allele; + tsk_bit_array_t state_row; + tsk_bit_array_t allele_samples_row; + tsk_bit_array_t alt_allele_samples_row; + tsk_size_t num_alleles = 1; + + if (alleles == NULL || allele_lengths == NULL) { + ret = TSK_ERR_NO_MEMORY; + goto out; + } + + tsk_bug_assert(state != NULL); + alleles[0] = site->ancestral_state; + allele_lengths[0] = site->ancestral_state_length; + + for (mutation_index = 0; mutation_index < site->mutations_length; mutation_index++) { + mutation = site->mutations[mutation_index]; + /* Compute the allele index for this derived state value. */ + for (allele = 0; allele < num_alleles; allele++) { + if (mutation.derived_state_length == allele_lengths[allele] + && tsk_memcmp( + mutation.derived_state, alleles[allele], allele_lengths[allele]) + == 0) { + break; + } + } + if (allele == num_alleles) { + tsk_bug_assert(allele < max_alleles); + alleles[allele] = mutation.derived_state; + allele_lengths[allele] = mutation.derived_state_length; + num_alleles++; + } + + /* Add the mutation's samples to this allele */ + tsk_bit_array_get_row(out_allele_samples, allele, &allele_samples_row); + tsk_bit_array_get_row(state, mutation_index, &state_row); + tsk_bit_array_add(&allele_samples_row, &state_row); + + /* Get the index for the alternate allele that we must subtract from */ + alt_allele = site->ancestral_state; + alt_allele_length = site->ancestral_state_length; + if (mutation.parent != TSK_NULL) { + parent_mut = site->mutations[mutation.parent - site->mutations[0].id]; + alt_allele = parent_mut.derived_state; + alt_allele_length = parent_mut.derived_state_length; + } + for (allele = 0; allele < num_alleles; allele++) { + if (alt_allele_length == allele_lengths[allele] + && tsk_memcmp(alt_allele, alleles[allele], allele_lengths[allele]) + == 0) { + break; + } + } + tsk_bug_assert(allele < num_alleles); + + tsk_bit_array_get_row(out_allele_samples, allele, &alt_allele_samples_row); + tsk_bit_array_subtract(&alt_allele_samples_row, &allele_samples_row); + } + *out_num_alleles = num_alleles; +out: + tsk_safe_free(alleles); + tsk_safe_free(allele_lengths); + return ret; +} + +static int +norm_hap_weighted(tsk_size_t state_dim, const double *hap_weights, + tsk_size_t TSK_UNUSED(n_a), tsk_size_t TSK_UNUSED(n_b), double *result, void *params) +{ + sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; + const double *weight_row; + double n; + tsk_size_t k; + + for (k = 0; k < state_dim; k++) { + weight_row = GET_2D_ROW(hap_weights, 3, k); + n = (double) args.sample_set_sizes[k]; + // TODO: what to do when n = 0 + result[k] = weight_row[0] / n; + } + return 0; +} + +static int +norm_total_weighted(tsk_size_t state_dim, const double *TSK_UNUSED(hap_weights), + tsk_size_t n_a, tsk_size_t n_b, double *result, void *TSK_UNUSED(params)) +{ + tsk_size_t k; + + for (k = 0; k < state_dim; k++) { + result[k] = 1 / (double) (n_a * n_b); + } + return 0; +} + +static void +get_all_samples_bits(tsk_bit_array_t *all_samples, tsk_size_t n) +{ + tsk_size_t i; + const tsk_bit_array_value_t all = ~((tsk_bit_array_value_t) 0); + const tsk_bit_array_value_t remainder_samples = n % TSK_BIT_ARRAY_NUM_BITS; + + all_samples->data[all_samples->size - 1] + = remainder_samples ? ~(all << remainder_samples) : all; + for (i = 0; i < all_samples->size - 1; i++) { + all_samples->data[i] = all; + } +} + +typedef int norm_func_t(tsk_size_t state_dim, const double *hap_weights, tsk_size_t n_a, + tsk_size_t n_b, double *result, void *params); + +static int +compute_general_two_site_stat_result(const tsk_bit_array_t *site_a_state, + const tsk_bit_array_t *site_b_state, tsk_size_t num_a_alleles, + tsk_size_t num_b_alleles, tsk_size_t num_samples, tsk_size_t state_dim, + const tsk_bit_array_t *sample_sets, tsk_size_t result_dim, general_stat_func_t *f, + sample_count_stat_params_t *f_params, norm_func_t *norm_f, bool polarised, + double *result) +{ + int ret = 0; + tsk_bit_array_t A_samples, B_samples; + // ss_ prefix refers to a sample set + tsk_bit_array_t ss_row; + tsk_bit_array_t ss_A_samples, ss_B_samples, ss_AB_samples, AB_samples; + // Sample sets and b sites are rows, a sites are columns + // b1 b2 b3 + // a1 [s1, s2, s3] [s1, s2, s3] [s1, s2, s3] + // a2 [s1, s2, s3] [s1, s2, s3] [s1, s2, s3] + // a3 [s1, s2, s3] [s1, s2, s3] [s1, s2, s3] + tsk_size_t k, mut_a, mut_b; + tsk_size_t row_len = num_b_alleles * state_dim; + tsk_size_t w_A = 0, w_B = 0, w_AB = 0; + uint8_t polarised_val = polarised ? 1 : 0; + double *hap_weight_row; + double *result_tmp_row; + double *weights = tsk_malloc(3 * state_dim * sizeof(*weights)); + double *norm = tsk_malloc(state_dim * sizeof(*norm)); + double *result_tmp = tsk_malloc(row_len * num_a_alleles * sizeof(*result_tmp)); + + tsk_memset(&ss_A_samples, 0, sizeof(ss_A_samples)); + tsk_memset(&ss_B_samples, 0, sizeof(ss_B_samples)); + tsk_memset(&ss_AB_samples, 0, sizeof(ss_AB_samples)); + tsk_memset(&AB_samples, 0, sizeof(AB_samples)); + + if (weights == NULL || norm == NULL || result_tmp == NULL) { + ret = TSK_ERR_NO_MEMORY; + goto out; + } + + ret = tsk_bit_array_init(&ss_A_samples, num_samples, 1); + if (ret != 0) { + goto out; + } + ret = tsk_bit_array_init(&ss_B_samples, num_samples, 1); + if (ret != 0) { + goto out; + } + ret = tsk_bit_array_init(&ss_AB_samples, num_samples, 1); + if (ret != 0) { + goto out; + } + ret = tsk_bit_array_init(&AB_samples, num_samples, 1); + if (ret != 0) { + goto out; + } + + for (mut_a = polarised_val; mut_a < num_a_alleles; mut_a++) { + result_tmp_row = GET_2D_ROW(result_tmp, row_len, mut_a); + for (mut_b = polarised_val; mut_b < num_b_alleles; mut_b++) { + tsk_bit_array_get_row(site_a_state, mut_a, &A_samples); + tsk_bit_array_get_row(site_b_state, mut_b, &B_samples); + tsk_bit_array_intersect(&A_samples, &B_samples, &AB_samples); + for (k = 0; k < state_dim; k++) { + tsk_bit_array_get_row(sample_sets, k, &ss_row); + hap_weight_row = GET_2D_ROW(weights, 3, k); + + tsk_bit_array_intersect(&A_samples, &ss_row, &ss_A_samples); + tsk_bit_array_intersect(&B_samples, &ss_row, &ss_B_samples); + tsk_bit_array_intersect(&AB_samples, &ss_row, &ss_AB_samples); + + w_AB = tsk_bit_array_count(&ss_AB_samples); + w_A = tsk_bit_array_count(&ss_A_samples); + w_B = tsk_bit_array_count(&ss_B_samples); + + hap_weight_row[0] = (double) w_AB; + hap_weight_row[1] = (double) (w_A - w_AB); // w_Ab + hap_weight_row[2] = (double) (w_B - w_AB); // w_aB + } + ret = f(state_dim, weights, result_dim, result_tmp_row, f_params); + if (ret != 0) { + goto out; + } + ret = norm_f(state_dim, weights, num_a_alleles - polarised_val, + num_b_alleles - polarised_val, norm, f_params); + if (ret != 0) { + goto out; + } + for (k = 0; k < state_dim; k++) { + result[k] += result_tmp_row[k] * norm[k]; + } + result_tmp_row += state_dim; // Advance to the next column + } + } + +out: + tsk_safe_free(weights); + tsk_safe_free(norm); + tsk_safe_free(result_tmp); + tsk_bit_array_free(&ss_A_samples); + tsk_bit_array_free(&ss_B_samples); + tsk_bit_array_free(&ss_AB_samples); + tsk_bit_array_free(&AB_samples); + return ret; +} + +static int +get_mutation_samples( + const tsk_treeseq_t *ts, tsk_size_t *num_alleles, tsk_bit_array_t *allele_samples) +{ + int ret = 0; + const tsk_flags_t *restrict flags = ts->tables->nodes.flags; + const tsk_size_t num_samples = tsk_treeseq_get_num_samples(ts); + const tsk_size_t *restrict site_muts_len = ts->site_mutations_length; + const tsk_site_t *restrict site; + tsk_tree_t tree; + tsk_bit_array_t all_samples_bits, mut_samples, mut_samples_row, out_row; + tsk_size_t max_muts_len, mut_offset, num_nodes, s, m, n; + tsk_id_t node, *nodes = NULL; + void *tmp_nodes; + + tsk_memset(&mut_samples, 0, sizeof(mut_samples)); + tsk_memset(&all_samples_bits, 0, sizeof(all_samples_bits)); + + max_muts_len = 0; + for (s = 0; s < ts->tables->sites.num_rows; s++) { + if (site_muts_len[s] > max_muts_len) { + max_muts_len = site_muts_len[s]; + } + } + ret = tsk_bit_array_init(&mut_samples, num_samples, max_muts_len); + if (ret != 0) { + goto out; + } + ret = tsk_bit_array_init(&all_samples_bits, num_samples, 1); + if (ret != 0) { + goto out; + } + + ret = tsk_tree_init(&tree, ts, TSK_NO_SAMPLE_COUNTS); + if (ret != 0) { + goto out; + } + + // A future improvement could get a union of all sample sets + // instead of all samples + get_all_samples_bits(&all_samples_bits, num_samples); + + // Traverse down each tree, recording all samples below each mutation. We perform one + // preorder traversal per mutation. + mut_offset = 0; + for (ret = tsk_tree_first(&tree); ret == TSK_TREE_OK; ret = tsk_tree_next(&tree)) { + tmp_nodes = tsk_realloc(nodes, tsk_tree_get_size_bound(&tree) * sizeof(*nodes)); + if (tmp_nodes == NULL) { + ret = TSK_ERR_NO_MEMORY; + goto out; + } + nodes = tmp_nodes; + for (s = 0; s < tree.sites_length; s++) { + site = &tree.sites[s]; + tsk_bit_array_get_row(allele_samples, mut_offset, &out_row); + tsk_bit_array_add(&out_row, &all_samples_bits); + // Zero out results before the start of each iteration + tsk_memset(mut_samples.data, 0, + mut_samples.size * max_muts_len * sizeof(tsk_bit_array_value_t)); + for (m = 0; m < site->mutations_length; m++) { + tsk_bit_array_get_row(&mut_samples, m, &mut_samples_row); + node = site->mutations[m].node; + ret = tsk_tree_preorder_from(&tree, node, nodes, &num_nodes); + if (ret != 0) { + goto out; + } + for (n = 0; n < num_nodes; n++) { + node = nodes[n]; + if (flags[node] & TSK_NODE_IS_SAMPLE) { + tsk_bit_array_add_bit( + &mut_samples_row, (tsk_bit_array_value_t) node); + } + } + mut_offset++; + } + mut_offset++; // One more for the ancestral allele + get_allele_samples(site, &mut_samples, &out_row, &(num_alleles[site->id])); + } + } + // if adding code below, check ret before continuing +out: + tsk_safe_free(nodes); + tsk_tree_free(&tree); + tsk_bit_array_free(&mut_samples); + tsk_bit_array_free(&all_samples_bits); + return ret; +} + +static int +tsk_treeseq_two_site_count_stat(const tsk_treeseq_t *self, tsk_size_t state_dim, + const tsk_bit_array_t *sample_sets, tsk_size_t result_dim, general_stat_func_t *f, + sample_count_stat_params_t *f_params, norm_func_t *norm_f, + const double *TSK_UNUSED(left_window), const double *TSK_UNUSED(right_window), + tsk_flags_t options, tsk_size_t *result_size, double **result) +{ + int ret = 0; + tsk_bit_array_t allele_samples; + tsk_bit_array_t site_a_state, site_b_state; + tsk_size_t inner, result_offset, inner_offset, a_offset, b_offset; + tsk_size_t site_a, site_b; + bool polarised = false; + const tsk_size_t num_sites = self->tables->sites.num_rows; + const tsk_size_t num_samples = self->num_samples; + const tsk_size_t max_alleles = self->tables->mutations.num_rows + num_sites; + tsk_size_t *num_alleles = tsk_malloc(num_sites * sizeof(*num_alleles)); + const tsk_size_t *restrict site_muts_len = self->site_mutations_length; + + tsk_memset(&allele_samples, 0, sizeof(allele_samples)); + + if (num_alleles == NULL) { + ret = TSK_ERR_NO_MEMORY; + goto out; + } + + ret = tsk_bit_array_init(&allele_samples, num_samples, max_alleles); + if (ret != 0) { + goto out; + } + ret = get_mutation_samples(self, num_alleles, &allele_samples); + if (ret != 0) { + goto out; + } + + // Number of pairs w/ replacement (sites) + *result_size = (num_sites * (1 + num_sites)) / 2U; + *result = tsk_calloc(*result_size * result_dim, sizeof(**result)); + + if (result == NULL) { + ret = TSK_ERR_NO_MEMORY; + goto out; + } + + if (options & TSK_STAT_POLARISED) { + polarised = true; + } + + inner = 0; + a_offset = 0; + b_offset = 0; + inner_offset = 0; + result_offset = 0; + // TODO: implement windows! + for (site_a = 0; site_a < num_sites; site_a++) { + b_offset = inner_offset; + for (site_b = inner; site_b < num_sites; site_b++) { + tsk_bit_array_get_row(&allele_samples, a_offset, &site_a_state); + tsk_bit_array_get_row(&allele_samples, b_offset, &site_b_state); + ret = compute_general_two_site_stat_result(&site_a_state, &site_b_state, + num_alleles[site_a], num_alleles[site_b], num_samples, state_dim, + sample_sets, result_dim, f, f_params, norm_f, polarised, + &((*result)[result_offset])); + if (ret != 0) { + goto out; + } + result_offset += result_dim; + b_offset += site_muts_len[site_b] + 1; + } + a_offset += site_muts_len[site_a] + 1; + inner_offset += site_muts_len[site_a] + 1; + inner++; + } + +out: + tsk_safe_free(num_alleles); + tsk_bit_array_free(&allele_samples); + return ret; +} + +static int +sample_sets_to_bit_array(const tsk_treeseq_t *self, const tsk_size_t *sample_set_sizes, + const tsk_id_t *sample_sets, tsk_size_t num_sample_sets, + tsk_bit_array_t *sample_sets_bits) +{ + int ret; + tsk_bit_array_t bits_row; + tsk_size_t j, k, l; + tsk_id_t u, sample_index; + + ret = tsk_bit_array_init(sample_sets_bits, self->num_samples, num_sample_sets); + if (ret != 0) { + return ret; + } + + j = 0; + for (k = 0; k < num_sample_sets; k++) { + tsk_bit_array_get_row(sample_sets_bits, k, &bits_row); + for (l = 0; l < sample_set_sizes[k]; l++) { + u = sample_sets[j]; + sample_index = self->sample_index_map[u]; + if (tsk_bit_array_contains( + &bits_row, (tsk_bit_array_value_t) sample_index)) { + ret = TSK_ERR_DUPLICATE_SAMPLE; + goto out; + } + tsk_bit_array_add_bit(&bits_row, (tsk_bit_array_value_t) sample_index); + j++; + } + } + +out: + return ret; +} + +static int +tsk_treeseq_two_locus_count_stat(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, + const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, + tsk_size_t result_dim, const tsk_id_t *set_indexes, general_stat_func_t *f, + norm_func_t *norm_f, tsk_size_t num_left_windows, const double *left_windows, + tsk_size_t num_right_windows, const double *right_windows, tsk_flags_t options, + tsk_size_t *result_size, double **result) +{ + // TODO: generalize this function if we ever decide to do weighted two_locus stats. + // We only implement count stats and therefore we don't handle weights. + int ret = 0; + tsk_bit_array_t sample_sets_bits; + bool stat_site = !!(options & TSK_STAT_SITE); + bool stat_branch = !!(options & TSK_STAT_BRANCH); + double default_windows[] = { 0, self->tables->sequence_length }; + tsk_size_t state_dim = num_sample_sets; + sample_count_stat_params_t f_params = { .sample_sets = sample_sets, + .num_sample_sets = num_sample_sets, + .sample_set_sizes = sample_set_sizes, + .set_indexes = set_indexes }; + + tsk_memset(&sample_sets_bits, 0, sizeof(sample_sets_bits)); + + ret = tsk_treeseq_check_sample_sets( + self, num_sample_sets, sample_set_sizes, sample_sets); + if (ret != 0) { + goto out; + } + ret = sample_sets_to_bit_array( + self, sample_set_sizes, sample_sets, num_sample_sets, &sample_sets_bits); + if (ret != 0) { + goto out; + } + + /* If no mode is specified, we default to site mode */ + if (!(stat_site || stat_branch)) { + stat_site = true; + } + /* It's an error to specify more than one mode */ + if (stat_site + stat_branch > 1) { + ret = TSK_ERR_MULTIPLE_STAT_MODES; + goto out; + } + + if (state_dim < 1) { + ret = TSK_ERR_BAD_STATE_DIMS; + goto out; + } + if (result_dim < 1) { + ret = TSK_ERR_BAD_RESULT_DIMS; + goto out; + } + if (left_windows == NULL) { + num_left_windows = 1; + left_windows = default_windows; + num_right_windows = 1; + right_windows = default_windows; + } else if (right_windows == NULL) { + ret = tsk_treeseq_check_windows(self, num_left_windows, left_windows, 0); + if (ret != 0) { + goto out; + } + num_right_windows = num_left_windows; + right_windows = left_windows; + } else { + ret = tsk_treeseq_check_windows(self, num_left_windows, left_windows, 0); + if (ret != 0) { + goto out; + } + ret = tsk_treeseq_check_windows(self, num_right_windows, right_windows, 0); + if (ret != 0) { + goto out; + } + } + + if (stat_site) { + // TODO: assert 1 window left/right + ret = tsk_treeseq_two_site_count_stat(self, state_dim, &sample_sets_bits, + result_dim, f, &f_params, norm_f, left_windows, right_windows, options, + result_size, result); + } else { + // TODO: branch statistics + ret = TSK_ERR_GENERIC; + } + +out: + tsk_bit_array_free(&sample_sets_bits); + return ret; +} + /*********************************** * Allele frequency spectrum ***********************************/ @@ -2922,6 +3446,289 @@ tsk_treeseq_Y1(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, options, result); } +static int +D_summary_func(tsk_size_t state_dim, const double *state, + tsk_size_t TSK_UNUSED(result_dim), double *result, void *params) +{ + sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; + double n; + const double *state_row; + tsk_size_t j; + + for (j = 0; j < state_dim; j++) { + n = (double) args.sample_set_sizes[j]; + state_row = GET_2D_ROW(state, 3, j); + double p_AB = state_row[0] / n; + double p_Ab = state_row[1] / n; + double p_aB = state_row[2] / n; + + double p_A = p_AB + p_Ab; + double p_B = p_AB + p_aB; + result[j] = p_AB - (p_A * p_B); + } + + return 0; +} + +int +tsk_treeseq_D(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, + const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, + tsk_size_t num_left_windows, const double *left_windows, + tsk_size_t num_right_windows, const double *right_windows, tsk_flags_t options, + tsk_size_t *result_size, double **result) +{ + options |= TSK_STAT_POLARISED; // TODO: allow user to pick? + return tsk_treeseq_two_locus_count_stat(self, num_sample_sets, sample_set_sizes, + sample_sets, num_sample_sets, NULL, D_summary_func, norm_total_weighted, + num_left_windows, left_windows, num_right_windows, right_windows, options, + result_size, result); +} + +static int +D2_summary_func(tsk_size_t state_dim, const double *state, + tsk_size_t TSK_UNUSED(result_dim), double *result, void *params) +{ + sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; + double n; + const double *state_row; + tsk_size_t j; + + for (j = 0; j < state_dim; j++) { + n = (double) args.sample_set_sizes[j]; + state_row = GET_2D_ROW(state, 3, j); + double p_AB = state_row[0] / n; + double p_Ab = state_row[1] / n; + double p_aB = state_row[2] / n; + + double p_A = p_AB + p_Ab; + double p_B = p_AB + p_aB; + result[j] = p_AB - (p_A * p_B); + result[j] *= result[j]; + } + + return 0; +} + +int +tsk_treeseq_D2(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, + const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, + tsk_size_t num_left_windows, const double *left_windows, + tsk_size_t num_right_windows, const double *right_windows, tsk_flags_t options, + tsk_size_t *result_size, double **result) +{ + return tsk_treeseq_two_locus_count_stat(self, num_sample_sets, sample_set_sizes, + sample_sets, num_sample_sets, NULL, D2_summary_func, norm_total_weighted, + num_left_windows, left_windows, num_right_windows, right_windows, options, + result_size, result); +} + +static int +r2_summary_func(tsk_size_t state_dim, const double *state, + tsk_size_t TSK_UNUSED(result_dim), double *result, void *params) +{ + sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; + double n; + const double *state_row; + tsk_size_t j; + + for (j = 0; j < state_dim; j++) { + n = (double) args.sample_set_sizes[j]; + state_row = GET_2D_ROW(state, 3, j); + double p_AB = state_row[0] / n; + double p_Ab = state_row[1] / n; + double p_aB = state_row[2] / n; + + double p_A = p_AB + p_Ab; + double p_B = p_AB + p_aB; + + double D_ = p_AB - (p_A * p_B); + double denom = p_A * p_B * (1 - p_A) * (1 - p_B); + + if (denom == 0 && D_ == 0) { + result[j] = 0; + } else { + result[j] = (D_ * D_) / denom; + } + } + return 0; +} + +int +tsk_treeseq_r2(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, + const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, + tsk_size_t num_left_windows, const double *left_windows, + tsk_size_t num_right_windows, const double *right_windows, tsk_flags_t options, + tsk_size_t *result_size, double **result) +{ + return tsk_treeseq_two_locus_count_stat(self, num_sample_sets, sample_set_sizes, + sample_sets, num_sample_sets, NULL, r2_summary_func, norm_hap_weighted, + num_left_windows, left_windows, num_right_windows, right_windows, options, + result_size, result); +} + +static int +D_prime_summary_func(tsk_size_t state_dim, const double *state, + tsk_size_t TSK_UNUSED(result_dim), double *result, void *params) +{ + sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; + double n; + const double *state_row; + tsk_size_t j; + + for (j = 0; j < state_dim; j++) { + n = (double) args.sample_set_sizes[j]; + state_row = GET_2D_ROW(state, 3, j); + double p_AB = state_row[0] / n; + double p_Ab = state_row[1] / n; + double p_aB = state_row[2] / n; + + double p_A = p_AB + p_Ab; + double p_B = p_AB + p_aB; + + double D_ = p_AB - (p_A * p_B); + if (D_ >= 0) { + result[j] = D_ / TSK_MIN(p_A * (1 - p_B), (1 - p_A) * p_B); + } else { + result[j] = D_ / TSK_MIN(p_A * p_B, (1 - p_A) * (1 - p_B)); + } + } + return 0; +} + +int +tsk_treeseq_D_prime(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, + const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, + tsk_size_t num_left_windows, const double *left_windows, + tsk_size_t num_right_windows, const double *right_windows, tsk_flags_t options, + tsk_size_t *result_size, double **result) +{ + options |= TSK_STAT_POLARISED; // TODO: allow user to pick? + return tsk_treeseq_two_locus_count_stat(self, num_sample_sets, sample_set_sizes, + sample_sets, num_sample_sets, NULL, D_prime_summary_func, norm_hap_weighted, + num_left_windows, left_windows, num_right_windows, right_windows, options, + result_size, result); +} + +static int +r_summary_func(tsk_size_t state_dim, const double *state, + tsk_size_t TSK_UNUSED(result_dim), double *result, void *params) +{ + sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; + double n; + const double *state_row; + tsk_size_t j; + + for (j = 0; j < state_dim; j++) { + n = (double) args.sample_set_sizes[j]; + state_row = GET_2D_ROW(state, 3, j); + double p_AB = state_row[0] / n; + double p_Ab = state_row[1] / n; + double p_aB = state_row[2] / n; + + double p_A = p_AB + p_Ab; + double p_B = p_AB + p_aB; + + double D_ = p_AB - (p_A * p_B); + double denom = p_A * p_B * (1 - p_A) * (1 - p_B); + + if (denom == 0 && D_ == 0) { + result[j] = 0; + } else { + result[j] = D_ / sqrt(denom); + } + } + return 0; +} + +int +tsk_treeseq_r(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, + const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, + tsk_size_t num_left_windows, const double *left_windows, + tsk_size_t num_right_windows, const double *right_windows, tsk_flags_t options, + tsk_size_t *result_size, double **result) +{ + options |= TSK_STAT_POLARISED; // TODO: allow user to pick? + return tsk_treeseq_two_locus_count_stat(self, num_sample_sets, sample_set_sizes, + sample_sets, num_sample_sets, NULL, r_summary_func, norm_total_weighted, + num_left_windows, left_windows, num_right_windows, right_windows, options, + result_size, result); +} + +static int +Dz_summary_func(tsk_size_t state_dim, const double *state, + tsk_size_t TSK_UNUSED(result_dim), double *result, void *params) +{ + sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; + double n; + const double *state_row; + tsk_size_t j; + + for (j = 0; j < state_dim; j++) { + n = (double) args.sample_set_sizes[j]; + state_row = GET_2D_ROW(state, 3, j); + double p_AB = state_row[0] / n; + double p_Ab = state_row[1] / n; + double p_aB = state_row[2] / n; + + double p_A = p_AB + p_Ab; + double p_B = p_AB + p_aB; + + double D_ = p_AB - (p_A * p_B); + + result[j] = D_ * (1 - 2 * p_A) * (1 - 2 * p_B); + } + return 0; +} + +int +tsk_treeseq_Dz(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, + const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, + tsk_size_t num_left_windows, const double *left_windows, + tsk_size_t num_right_windows, const double *right_windows, tsk_flags_t options, + tsk_size_t *result_size, double **result) +{ + return tsk_treeseq_two_locus_count_stat(self, num_sample_sets, sample_set_sizes, + sample_sets, num_sample_sets, NULL, Dz_summary_func, norm_total_weighted, + num_left_windows, left_windows, num_right_windows, right_windows, options, + result_size, result); +} + +static int +pi2_summary_func(tsk_size_t state_dim, const double *state, + tsk_size_t TSK_UNUSED(result_dim), double *result, void *params) +{ + sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; + double n; + const double *state_row; + tsk_size_t j; + + for (j = 0; j < state_dim; j++) { + n = (double) args.sample_set_sizes[j]; + state_row = GET_2D_ROW(state, 3, j); + double p_AB = state_row[0] / n; + double p_Ab = state_row[1] / n; + double p_aB = state_row[2] / n; + + double p_A = p_AB + p_Ab; + double p_B = p_AB + p_aB; + result[j] = p_A * (1 - p_A) * p_B * (1 - p_B); + } + return 0; +} + +int +tsk_treeseq_pi2(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, + const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, + tsk_size_t num_left_windows, const double *left_windows, + tsk_size_t num_right_windows, const double *right_windows, tsk_flags_t options, + tsk_size_t *result_size, double **result) +{ + return tsk_treeseq_two_locus_count_stat(self, num_sample_sets, sample_set_sizes, + sample_sets, num_sample_sets, NULL, pi2_summary_func, norm_total_weighted, + num_left_windows, left_windows, num_right_windows, right_windows, options, + result_size, result); +} + /*********************************** * Two way stats ***********************************/ diff --git a/c/tskit/trees.h b/c/tskit/trees.h index 95c66a6ac7..a503b3e39b 100644 --- a/c/tskit/trees.h +++ b/c/tskit/trees.h @@ -919,6 +919,15 @@ typedef int general_stat_func_t(tsk_size_t state_dim, const double *state, int tsk_treeseq_general_stat(const tsk_treeseq_t *self, tsk_size_t K, const double *W, tsk_size_t M, general_stat_func_t *f, void *f_params, tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result); +// TODO: expose this externally? +/* int tsk_treeseq_two_locus_general_stat(const tsk_treeseq_t *self, */ +/* tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, */ +/* const tsk_id_t *sample_sets, tsk_size_t result_dim, const tsk_id_t *set_indexes, + */ +/* general_stat_func_t *f, norm_func_t *norm_f, tsk_size_t num_left_windows, */ +/* const double *left_windows, tsk_size_t num_right_windows, */ +/* const double *right_windows, tsk_flags_t options, tsk_size_t num_result, */ +/* double *result); */ /* One way weighted stats */ @@ -998,6 +1007,42 @@ int tsk_treeseq_genetic_relatedness(const tsk_treeseq_t *self, const tsk_id_t *index_tuples, tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result); +int tsk_treeseq_D(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, + const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, + tsk_size_t num_left_windows, const double *left_windows, + tsk_size_t num_right_windows, const double *right_windows, tsk_flags_t options, + tsk_size_t *result_size, double **result); +int tsk_treeseq_D2(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, + const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, + tsk_size_t num_left_windows, const double *left_windows, + tsk_size_t num_right_windows, const double *right_windows, tsk_flags_t options, + tsk_size_t *result_size, double **result); +int tsk_treeseq_r2(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, + const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, + tsk_size_t num_left_windows, const double *left_windows, + tsk_size_t num_right_windows, const double *right_windows, tsk_flags_t options, + tsk_size_t *result_size, double **result); +int tsk_treeseq_D_prime(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, + const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, + tsk_size_t num_left_windows, const double *left_windows, + tsk_size_t num_right_windows, const double *right_windows, tsk_flags_t options, + tsk_size_t *result_size, double **result); +int tsk_treeseq_r(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, + const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, + tsk_size_t num_left_windows, const double *left_windows, + tsk_size_t num_right_windows, const double *right_windows, tsk_flags_t options, + tsk_size_t *result_size, double **result); +int tsk_treeseq_Dz(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, + const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, + tsk_size_t num_left_windows, const double *left_windows, + tsk_size_t num_right_windows, const double *right_windows, tsk_flags_t options, + tsk_size_t *result_size, double **result); +int tsk_treeseq_pi2(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, + const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, + tsk_size_t num_left_windows, const double *left_windows, + tsk_size_t num_right_windows, const double *right_windows, tsk_flags_t options, + tsk_size_t *result_size, double **result); + /* Three way sample set stats */ int tsk_treeseq_Y3(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets,