Skip to content

Commit

Permalink
Python and C implementations of pair coalescence rates
Browse files Browse the repository at this point in the history
Typo in docstring
  • Loading branch information
nspope authored and mergify[bot] committed Sep 13, 2024
1 parent 46578bd commit 024e042
Show file tree
Hide file tree
Showing 10 changed files with 725 additions and 42 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ jobs:
pip uninstall -y tskit
python run.py
- name: Upload Results
uses: actions/upload-artifact@v2
uses: actions/upload-artifact@v4
with:
name: benchmark-results
path: python/benchmark
Expand Down
130 changes: 130 additions & 0 deletions c/tests/test_stats.c
Original file line number Diff line number Diff line change
Expand Up @@ -512,6 +512,125 @@ verify_pair_coalescence_quantiles(tsk_treeseq_t *ts)
}
}

/* Check coalescence rates */
static void
verify_pair_coalescence_rates(tsk_treeseq_t *ts)
{
int ret;
const tsk_size_t n = tsk_treeseq_get_num_samples(ts);
const tsk_size_t N = tsk_treeseq_get_num_nodes(ts);
const tsk_size_t T = tsk_treeseq_get_num_trees(ts);
const tsk_id_t *samples = tsk_treeseq_get_samples(ts);
const double *breakpoints = tsk_treeseq_get_breakpoints(ts);
const double *nodes_time = ts->tables->nodes.time;
const double max_time = ts->max_time;
const tsk_size_t P = 2;
const tsk_size_t B = 5;
const tsk_size_t I = P * (P + 1) / 2;
double epochs[]
= { 0.0, max_time / 4, max_time / 2, max_time, max_time * 2, INFINITY };
tsk_id_t sample_sets[n];
tsk_size_t sample_set_sizes[P];
tsk_id_t index_tuples[2 * I];
tsk_id_t node_bin_map[N];
tsk_id_t empty_node_bin_map[N];
tsk_size_t dim = T * B * I;
double C[dim];
tsk_size_t i, j, k;

for (i = 0; i < N; i++) {
node_bin_map[i] = TSK_NULL;
for (j = 0; j < B; j++) {
if (nodes_time[i] >= epochs[j] && nodes_time[i] < epochs[j + 1]) {
node_bin_map[i] = (tsk_id_t) j;
}
}
empty_node_bin_map[i] = TSK_NULL;
}

for (i = 0; i < n; i++) {
sample_sets[i] = samples[i];
}

for (i = 0; i < P; i++) {
sample_set_sizes[i] = 0;
}
for (j = 0; j < n; j++) {
i = j / (n / P);
sample_set_sizes[i]++;
}

for (j = 0, i = 0; j < P; j++) {
for (k = j; k < P; k++) {
index_tuples[i++] = (tsk_id_t) j;
index_tuples[i++] = (tsk_id_t) k;
}
}

ret = tsk_treeseq_pair_coalescence_rates(ts, P, sample_set_sizes, sample_sets, I,
index_tuples, T, breakpoints, B, node_bin_map, epochs, 0, C);
CU_ASSERT_EQUAL_FATAL(ret, 0);
/* TODO: compare against naive coalescence rates per tree */

node_bin_map[0] = TSK_NULL;
ret = tsk_treeseq_pair_coalescence_rates(ts, P, sample_set_sizes, sample_sets, I,
index_tuples, T, breakpoints, B, node_bin_map, epochs, 0, C);
CU_ASSERT_EQUAL_FATAL(ret, 0);
node_bin_map[0] = 0;

ret = tsk_treeseq_pair_coalescence_rates(ts, P, sample_set_sizes, sample_sets, I,
index_tuples, T, breakpoints, B, empty_node_bin_map, epochs, 0, C);
CU_ASSERT_EQUAL_FATAL(ret, 0);

/* cover errors */
ret = tsk_treeseq_pair_coalescence_rates(ts, P, sample_set_sizes, sample_sets, I,
index_tuples, T, breakpoints, 0, node_bin_map, epochs, 0, C);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_TIME_WINDOWS_DIM);

epochs[0] = epochs[1] / 2;
ret = tsk_treeseq_pair_coalescence_rates(ts, P, sample_set_sizes, sample_sets, I,
index_tuples, T, breakpoints, B, node_bin_map, epochs, 0, C);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_SAMPLE_PAIR_TIMES);
epochs[0] = 0.0;

epochs[2] = epochs[1];
ret = tsk_treeseq_pair_coalescence_rates(ts, P, sample_set_sizes, sample_sets, I,
index_tuples, T, breakpoints, B, node_bin_map, epochs, 0, C);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_TIME_WINDOWS);
epochs[2] = max_time / 2;

epochs[B] = DBL_MAX;
ret = tsk_treeseq_pair_coalescence_rates(ts, P, sample_set_sizes, sample_sets, I,
index_tuples, T, breakpoints, B, node_bin_map, epochs, 0, C);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_TIME_WINDOWS);
epochs[B] = INFINITY;

node_bin_map[0] = (tsk_id_t) B;
ret = tsk_treeseq_pair_coalescence_rates(ts, P, sample_set_sizes, sample_sets, I,
index_tuples, T, breakpoints, B, node_bin_map, epochs, 0, C);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_NODE_BIN_MAP_DIM);
node_bin_map[0] = 0;

node_bin_map[0] = (tsk_id_t)(B - 1);
ret = tsk_treeseq_pair_coalescence_rates(ts, P, sample_set_sizes, sample_sets, I,
index_tuples, T, breakpoints, B, node_bin_map, epochs, 0, C);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_NODE_TIME_WINDOW);
node_bin_map[0] = 0;

node_bin_map[N - 1] = 0;
ret = tsk_treeseq_pair_coalescence_rates(ts, P, sample_set_sizes, sample_sets, I,
index_tuples, T, breakpoints, B, node_bin_map, epochs, 0, C);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_NODE_TIME_WINDOW);
node_bin_map[N - 1] = 3;

tsk_size_t tmp = sample_set_sizes[0];
sample_set_sizes[0] = 0;
ret = tsk_treeseq_pair_coalescence_rates(ts, P, sample_set_sizes, sample_sets, I,
index_tuples, T, breakpoints, B, node_bin_map, epochs, 0, C);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_EMPTY_SAMPLE_SET);
sample_set_sizes[0] = tmp;
}

typedef struct {
int call_count;
int error_on;
Expand Down Expand Up @@ -3323,6 +3442,16 @@ test_pair_coalescence_quantiles(void)
tsk_treeseq_free(&ts);
}

static void
test_pair_coalescence_rates(void)
{
tsk_treeseq_t ts;
tsk_treeseq_from_text(&ts, 100, nonbinary_ex_nodes, nonbinary_ex_edges, NULL,
nonbinary_ex_sites, nonbinary_ex_mutations, NULL, NULL, 0);
verify_pair_coalescence_rates(&ts);
tsk_treeseq_free(&ts);
}

int
main(int argc, char **argv)
{
Expand Down Expand Up @@ -3425,6 +3554,7 @@ main(int argc, char **argv)

{ "test_pair_coalescence_counts", test_pair_coalescence_counts },
{ "test_pair_coalescence_quantiles", test_pair_coalescence_quantiles },
{ "test_pair_coalescence_rates", test_pair_coalescence_rates },

{ NULL, NULL },
};
Expand Down
17 changes: 16 additions & 1 deletion c/tskit/core.c
Original file line number Diff line number Diff line change
Expand Up @@ -494,7 +494,7 @@ tsk_strerror_internal(int err)
"(TSK_ERR_BAD_NODE_BIN_MAP)";
break;
case TSK_ERR_BAD_NODE_BIN_MAP_DIM:
ret = "Maximum index in node-to-bin map does not match "
ret = "Maximum index in node-to-bin map is greater than the "
"output dimension. (TSK_ERR_BAD_NODE_BIN_MAP_DIM)";
break;
case TSK_ERR_BAD_QUANTILES:
Expand All @@ -504,6 +504,21 @@ tsk_strerror_internal(int err)
case TSK_ERR_UNSORTED_TIMES:
ret = "Times must be strictly increasing. (TSK_ERR_UNSORTED_TIMES)";
break;
case TSK_ERR_BAD_TIME_WINDOWS_DIM:
ret = "Must have at least one time window. (TSK_ERR_BAD_TIME_WINDOWS_DIM)";
break;
case TSK_ERR_BAD_SAMPLE_PAIR_TIMES:
ret = "All sample times must be equal to the start of first time window. "
"(TSK_ERR_BAD_SAMPLE_PAIR_TIMES)";
break;
case TSK_ERR_BAD_TIME_WINDOWS:
ret = "Time windows must be strictly increasing and end at infinity. "
"(TSK_ERR_BAD_TIME_WINDOWS)";
break;
case TSK_ERR_BAD_NODE_TIME_WINDOW:
ret = "Node time does not fall within assigned time window. "
"(TSK_ERR_BAD_NODE_TIME_WINDOW)";
break;

/* Two locus errors */
case TSK_ERR_STAT_UNSORTED_POSITIONS:
Expand Down
18 changes: 17 additions & 1 deletion c/tskit/core.h
Original file line number Diff line number Diff line change
Expand Up @@ -704,7 +704,7 @@ The node bin map contains a value less than TSK_NULL.
*/
#define TSK_ERR_BAD_NODE_BIN_MAP -914
/**
Maximum index in node bin map does not match output dimension.
Maximum index in node bin map is greater than output dimension.
*/
#define TSK_ERR_BAD_NODE_BIN_MAP_DIM -915
/**
Expand All @@ -731,6 +731,22 @@ The provided sites are not provided in strictly increasing position order
The provided sites are not unique
*/
#define TSK_ERR_STAT_DUPLICATE_SITES -921
/**
The number of time windows is zero
*/
#define TSK_ERR_BAD_TIME_WINDOWS_DIM -922
/**
Sample times do not all equal the start of first time window
*/
#define TSK_ERR_BAD_SAMPLE_PAIR_TIMES -923
/**
Time windows are not strictly increasing ending at infinity
*/
#define TSK_ERR_BAD_TIME_WINDOWS -924
/**
Node time does not fall within assigned time window
*/
#define TSK_ERR_BAD_NODE_TIME_WINDOW -925
/** @} */

/**
Expand Down
121 changes: 120 additions & 1 deletion c/tskit/trees.c
Original file line number Diff line number Diff line change
Expand Up @@ -8874,7 +8874,7 @@ check_node_bin_map(
max_index = index;
}
}
if (num_bins < 1 || (tsk_id_t) num_bins != max_index + 1) {
if (num_bins < 1 || (tsk_id_t) num_bins < max_index + 1) {
ret = TSK_ERR_BAD_NODE_BIN_MAP_DIM;
goto out;
}
Expand Down Expand Up @@ -9256,6 +9256,7 @@ pair_coalescence_quantiles(tsk_size_t input_dim, const double *weight,
j = 0;
coalesced = 0.0;
timepoint = -INFINITY;
/* TODO: may be more efficient to use a binary search */
for (i = 0; i < input_dim; i++) {
if (weight[i] > 0) {
coalesced += weight[i];
Expand Down Expand Up @@ -9319,3 +9320,121 @@ tsk_treeseq_pair_coalescence_quantiles(const tsk_treeseq_t *self,
out:
return ret;
}

static int
pair_coalescence_rates(tsk_size_t input_dim, const double *weight, const double *values,
tsk_size_t output_dim, double *output, void *params)
{
int ret = 0;
double coalesced, rate, waiting_time, a, b;
double *time_windows = (double *) params;
tsk_id_t i, j;
tsk_bug_assert(input_dim == output_dim);
for (j = (tsk_id_t) output_dim; j > 0; j--) { /* find last window with data */
if (weight[j - 1] == 0) {
output[j - 1] = NAN; /* TODO: should fill value be zero instead? */
} else {
break;
}
}
coalesced = 0.0;
for (i = 0; i < j; i++) {
tsk_bug_assert(weight[i] >= 0 && weight[i] <= 1);
a = time_windows[i];
b = time_windows[i + 1];
if (i + 1 == j) {
waiting_time = values[i] < a ? 0.0 : values[i] - a;
rate = 1 / waiting_time;
} else {
rate = log(1 - weight[i] / (1 - coalesced)) / (a - b);
}
output[i] = rate > 0 ? rate : 0;
coalesced += weight[i];
}
return ret;
}

static int
check_coalescence_rate_time_windows(const tsk_treeseq_t *self,
tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes,
const tsk_id_t *sample_sets, tsk_size_t num_time_windows,
const tsk_id_t *node_time_window, const double *time_windows)
{
int ret = 0;
double timepoint;
const double *nodes_time = self->tables->nodes.time;
tsk_size_t num_nodes = self->tables->nodes.num_rows;
tsk_id_t i, j, k;
tsk_id_t n;
if (num_time_windows == 0) {
ret = TSK_ERR_BAD_TIME_WINDOWS_DIM;
goto out;
}
/* time windows are sorted */
timepoint = time_windows[0];
for (i = 0; i < (tsk_id_t) num_time_windows; i++) {
if (time_windows[i + 1] <= timepoint) {
ret = TSK_ERR_BAD_TIME_WINDOWS;
goto out;
}
timepoint = time_windows[i + 1];
}
if (timepoint != INFINITY) {
ret = TSK_ERR_BAD_TIME_WINDOWS;
goto out;
}
/* all sample times align with start of first time window */
k = 0;
for (i = 0; i < (tsk_id_t) num_sample_sets; i++) {
for (j = 0; j < (tsk_id_t) sample_set_sizes[i]; j++) {
n = sample_sets[k++];
if (nodes_time[n] != time_windows[0]) {
ret = TSK_ERR_BAD_SAMPLE_PAIR_TIMES;
goto out;
}
}
}
/* nodes are correctly assigned to time windows */
for (i = 0; i < (tsk_id_t) num_nodes; i++) {
j = node_time_window[i];
if (j < 0) {
continue;
}
if (j >= (tsk_id_t) num_time_windows) {
ret = TSK_ERR_BAD_NODE_BIN_MAP_DIM;
goto out;
}
if (nodes_time[i] < time_windows[j] || nodes_time[i] >= time_windows[j + 1]) {
ret = TSK_ERR_BAD_NODE_TIME_WINDOW;
goto out;
}
}
out:
return ret;
}

int
tsk_treeseq_pair_coalescence_rates(const tsk_treeseq_t *self, tsk_size_t num_sample_sets,
const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets,
tsk_size_t num_set_indexes, const tsk_id_t *set_indexes, tsk_size_t num_windows,
const double *windows, tsk_size_t num_time_windows, const tsk_id_t *node_time_window,
double *time_windows, tsk_flags_t options, double *result)
{
int ret = 0;
void *params = (void *) time_windows;
ret = check_coalescence_rate_time_windows(self, num_sample_sets, sample_set_sizes,
sample_sets, num_time_windows, node_time_window, time_windows);
if (ret != 0) {
goto out;
}
options |= TSK_STAT_SPAN_NORMALISE | TSK_STAT_PAIR_NORMALISE;
ret = tsk_treeseq_pair_coalescence_stat(self, num_sample_sets, sample_set_sizes,
sample_sets, num_set_indexes, set_indexes, num_windows, windows,
num_time_windows, node_time_window, pair_coalescence_rates, num_time_windows,
params, options, result);
if (ret != 0) {
goto out;
}
out:
return ret;
}
6 changes: 6 additions & 0 deletions c/tskit/trees.h
Original file line number Diff line number Diff line change
Expand Up @@ -1168,6 +1168,12 @@ int tsk_treeseq_pair_coalescence_quantiles(const tsk_treeseq_t *self,
tsk_size_t num_windows, const double *windows, tsk_size_t num_bins,
const tsk_id_t *node_bin_map, tsk_size_t num_quantiles, double *quantiles,
tsk_flags_t options, double *result);
int tsk_treeseq_pair_coalescence_rates(const tsk_treeseq_t *self,
tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes,
const tsk_id_t *sample_sets, tsk_size_t num_set_indexes, const tsk_id_t *set_indexes,
tsk_size_t num_windows, const double *windows, tsk_size_t num_time_windows,
const tsk_id_t *node_time_window, double *time_windows, tsk_flags_t options,
double *result);

/****************************************************************************/
/* Tree */
Expand Down
Loading

0 comments on commit 024e042

Please sign in to comment.