-
Notifications
You must be signed in to change notification settings - Fork 74
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
time windows in statistics #2948
base: main
Are you sure you want to change the base?
Changes from 18 commits
aeda4dc
0a30696
1a988c0
18ffda0
c6f9562
7a3149f
71da7ad
2a44909
4460db9
bbec6a9
da5f205
6b3ab4f
f2d857b
b8f4ba5
c9f6c06
59ea266
96ac0ce
26a7f09
8a8c05b
0d48891
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -865,6 +865,73 @@ verify_one_way_stat_func_errors(tsk_treeseq_t *ts, one_way_sample_stat_method *m | |
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_WINDOWS); | ||
} | ||
|
||
// Temporary definition for time_windows in tsk_treeseq_allele_frequency_spectrum | ||
typedef int one_way_sample_stat_method_tw(const tsk_treeseq_t *self, | ||
tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, | ||
const tsk_id_t *sample_sets, tsk_size_t num_windows, const double *windows, | ||
tsk_size_t num_time_windows, const double *time_windows, tsk_flags_t options, | ||
double *result); | ||
|
||
// Temporary duplicate for time-windows-having methods | ||
static void | ||
verify_one_way_stat_func_errors_tw( | ||
tsk_treeseq_t *ts, one_way_sample_stat_method_tw *method) | ||
{ | ||
int ret; | ||
tsk_id_t num_nodes = (tsk_id_t) tsk_treeseq_get_num_nodes(ts); | ||
tsk_id_t samples[] = { 0, 1, 2, 3 }; | ||
tsk_size_t sample_set_sizes = 4; | ||
double windows[] = { 0, 0, 0 }; | ||
double time_windows[] = { 0, 0, 0 }; | ||
double result; | ||
|
||
ret = method(ts, 0, &sample_set_sizes, samples, 0, NULL, 0, NULL, 0, &result); | ||
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INSUFFICIENT_SAMPLE_SETS); | ||
|
||
samples[0] = TSK_NULL; | ||
ret = method(ts, 1, &sample_set_sizes, samples, 0, NULL, 0, NULL, 0, &result); | ||
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); | ||
samples[0] = -10; | ||
ret = method(ts, 1, &sample_set_sizes, samples, 0, NULL, 0, NULL, 0, &result); | ||
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); | ||
samples[0] = num_nodes; | ||
ret = method(ts, 1, &sample_set_sizes, samples, 0, NULL, 0, NULL, 0, &result); | ||
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); | ||
samples[0] = num_nodes + 1; | ||
ret = method(ts, 1, &sample_set_sizes, samples, 0, NULL, 0, NULL, 0, &result); | ||
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); | ||
|
||
samples[0] = num_nodes - 1; | ||
ret = method(ts, 1, &sample_set_sizes, samples, 0, NULL, 0, NULL, 0, &result); | ||
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_SAMPLES); | ||
|
||
samples[0] = 1; | ||
ret = method(ts, 1, &sample_set_sizes, samples, 0, NULL, 0, NULL, 0, &result); | ||
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_DUPLICATE_SAMPLE); | ||
|
||
samples[0] = 0; | ||
sample_set_sizes = 0; | ||
ret = method(ts, 1, &sample_set_sizes, samples, 0, NULL, 0, NULL, 0, &result); | ||
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_EMPTY_SAMPLE_SET); | ||
|
||
sample_set_sizes = 4; | ||
/* Window errors */ | ||
ret = method(ts, 1, &sample_set_sizes, samples, 0, windows, 0, NULL, 0, &result); | ||
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_NUM_WINDOWS); | ||
|
||
ret = method(ts, 1, &sample_set_sizes, samples, 2, windows, 0, NULL, 0, &result); | ||
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_WINDOWS); | ||
|
||
/* Time window errors */ | ||
ret = method( | ||
ts, 1, &sample_set_sizes, samples, 0, NULL, 0, time_windows, 0, &result); | ||
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_NUM_WINDOWS); | ||
|
||
ret = method( | ||
ts, 1, &sample_set_sizes, samples, 0, NULL, 2, time_windows, 0, &result); | ||
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_WINDOWS); | ||
} | ||
|
||
static void | ||
verify_two_way_stat_func_errors( | ||
tsk_treeseq_t *ts, general_sample_stat_method *method, tsk_flags_t options) | ||
|
@@ -1203,23 +1270,24 @@ verify_afs(tsk_treeseq_t *ts) | |
sample_set_sizes[0] = n - 2; | ||
sample_set_sizes[1] = 2; | ||
ret = tsk_treeseq_allele_frequency_spectrum( | ||
ts, 2, sample_set_sizes, samples, 0, NULL, 0, result); | ||
ts, 2, sample_set_sizes, samples, 0, NULL, 0, NULL, 0, result); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. uh-oh, are these tabs? |
||
CU_ASSERT_EQUAL_FATAL(ret, 0); | ||
|
||
ret = tsk_treeseq_allele_frequency_spectrum( | ||
ts, 2, sample_set_sizes, samples, 0, NULL, TSK_STAT_POLARISED, result); | ||
ts, 2, sample_set_sizes, samples, 0, NULL, 0, NULL, TSK_STAT_POLARISED, result); | ||
CU_ASSERT_EQUAL_FATAL(ret, 0); | ||
|
||
ret = tsk_treeseq_allele_frequency_spectrum(ts, 2, sample_set_sizes, samples, 0, | ||
NULL, TSK_STAT_POLARISED | TSK_STAT_SPAN_NORMALISE, result); | ||
NULL, 0, NULL, TSK_STAT_POLARISED | TSK_STAT_SPAN_NORMALISE, result); | ||
CU_ASSERT_EQUAL_FATAL(ret, 0); | ||
|
||
ret = tsk_treeseq_allele_frequency_spectrum(ts, 2, sample_set_sizes, samples, 0, | ||
NULL, TSK_STAT_BRANCH | TSK_STAT_POLARISED | TSK_STAT_SPAN_NORMALISE, result); | ||
NULL, 0, NULL, TSK_STAT_BRANCH | TSK_STAT_POLARISED | TSK_STAT_SPAN_NORMALISE, | ||
result); | ||
CU_ASSERT_EQUAL_FATAL(ret, 0); | ||
|
||
ret = tsk_treeseq_allele_frequency_spectrum(ts, 2, sample_set_sizes, samples, 0, | ||
NULL, TSK_STAT_BRANCH | TSK_STAT_SPAN_NORMALISE, result); | ||
NULL, 0, NULL, TSK_STAT_BRANCH | TSK_STAT_SPAN_NORMALISE, result); | ||
CU_ASSERT_EQUAL_FATAL(ret, 0); | ||
|
||
free(result); | ||
|
@@ -2418,14 +2486,14 @@ test_paper_ex_afs_errors(void) | |
tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, | ||
paper_ex_mutations, paper_ex_individuals, NULL, 0); | ||
|
||
verify_one_way_stat_func_errors(&ts, tsk_treeseq_allele_frequency_spectrum); | ||
verify_one_way_stat_func_errors_tw(&ts, tsk_treeseq_allele_frequency_spectrum); | ||
|
||
ret = tsk_treeseq_allele_frequency_spectrum( | ||
&ts, 2, sample_set_sizes, samples, 0, NULL, TSK_STAT_NODE, result); | ||
&ts, 2, sample_set_sizes, samples, 0, NULL, 0, NULL, TSK_STAT_NODE, result); | ||
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_UNSUPPORTED_STAT_MODE); | ||
|
||
ret = tsk_treeseq_allele_frequency_spectrum(&ts, 2, sample_set_sizes, samples, 0, | ||
NULL, TSK_STAT_BRANCH | TSK_STAT_SITE, result); | ||
NULL, 0, NULL, TSK_STAT_BRANCH | TSK_STAT_SITE, result); | ||
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MULTIPLE_STAT_MODES); | ||
|
||
tsk_treeseq_free(&ts); | ||
|
@@ -2445,14 +2513,14 @@ test_paper_ex_afs(void) | |
/* we have two singletons and one tripleton */ | ||
|
||
ret = tsk_treeseq_allele_frequency_spectrum( | ||
&ts, 1, sample_set_sizes, samples, 0, NULL, 0, result); | ||
&ts, 1, sample_set_sizes, samples, 0, NULL, 0, NULL, 0, result); | ||
CU_ASSERT_EQUAL_FATAL(ret, 0); | ||
CU_ASSERT_EQUAL_FATAL(result[0], 0); | ||
CU_ASSERT_EQUAL_FATAL(result[1], 3.0); | ||
CU_ASSERT_EQUAL_FATAL(result[2], 0); | ||
|
||
ret = tsk_treeseq_allele_frequency_spectrum( | ||
&ts, 1, sample_set_sizes, samples, 0, NULL, TSK_STAT_POLARISED, result); | ||
&ts, 1, sample_set_sizes, samples, 0, NULL, 0, NULL, TSK_STAT_POLARISED, result); | ||
CU_ASSERT_EQUAL_FATAL(ret, 0); | ||
CU_ASSERT_EQUAL_FATAL(result[0], 0); | ||
CU_ASSERT_EQUAL_FATAL(result[1], 2.0); | ||
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -1232,6 +1232,35 @@ | |||||
return ret; | ||||||
} | ||||||
|
||||||
static int | ||||||
tsk_treeseq_check_time_windows(tsk_size_t num_windows, const double *windows) | ||||||
{ | ||||||
int ret = TSK_ERR_BAD_WINDOWS; | ||||||
tsk_size_t j; | ||||||
|
||||||
if (num_windows < 1) { | ||||||
ret = TSK_ERR_BAD_NUM_WINDOWS; | ||||||
goto out; | ||||||
} | ||||||
|
||||||
if (windows[0] < 0) { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. if currently the code assumes this is 0, should check for == here |
||||||
goto out; | ||||||
} | ||||||
|
||||||
if (windows[0] != 0) { | ||||||
goto out; | ||||||
} | ||||||
|
||||||
for (j = 0; j < num_windows; j++) { | ||||||
if (windows[j] >= windows[j + 1]) { | ||||||
goto out; | ||||||
} | ||||||
} | ||||||
ret = 0; | ||||||
out: | ||||||
return ret; | ||||||
} | ||||||
|
||||||
/* TODO make these functions more consistent in how the arguments are ordered */ | ||||||
|
||||||
static inline void | ||||||
|
@@ -3486,35 +3515,52 @@ | |||||
|
||||||
static int TSK_WARN_UNUSED | ||||||
tsk_treeseq_update_branch_afs(const tsk_treeseq_t *self, tsk_id_t u, double right, | ||||||
const double *restrict branch_length, double *restrict last_update, | ||||||
const double *counts, tsk_size_t num_sample_sets, tsk_size_t window_index, | ||||||
const tsk_size_t *result_dims, tsk_flags_t options, double *result) | ||||||
double *restrict last_update, const double *restrict time, tsk_id_t *restrict parent, | ||||||
const double *time_windows, const double *counts, tsk_size_t num_sample_sets, | ||||||
tsk_size_t num_time_windows, tsk_size_t window_index, const tsk_size_t *result_dims, | ||||||
tsk_flags_t options, double *result) | ||||||
{ | ||||||
int ret = 0; | ||||||
tsk_size_t afs_size; | ||||||
tsk_size_t k; | ||||||
tsk_size_t time_window_index; | ||||||
double *afs; | ||||||
tsk_size_t *coordinate = tsk_malloc(num_sample_sets * sizeof(*coordinate)); | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Gee, wouldn't it be better to malloc this outside this function, and pass it in? (I honestly don't know...) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. HA! IDK, I'll let it there for now, but yeah maybe! |
||||||
bool polarised = !!(options & TSK_STAT_POLARISED); | ||||||
const double *count_row = GET_2D_ROW(counts, num_sample_sets + 1, u); | ||||||
double x = (right - last_update[u]) * branch_length[u]; | ||||||
double x = 0; | ||||||
double t_v = 0; | ||||||
double tw_branch_length = 0; | ||||||
const tsk_size_t all_samples = (tsk_size_t) count_row[num_sample_sets]; | ||||||
|
||||||
if (coordinate == NULL) { | ||||||
ret = TSK_ERR_NO_MEMORY; | ||||||
goto out; | ||||||
} | ||||||
|
||||||
if (0 < all_samples && all_samples < self->num_samples) { | ||||||
afs_size = result_dims[num_sample_sets]; | ||||||
afs = result + afs_size * window_index; | ||||||
for (k = 0; k < num_sample_sets; k++) { | ||||||
coordinate[k] = (tsk_size_t) count_row[k]; | ||||||
} | ||||||
if (!polarised) { | ||||||
fold(coordinate, result_dims, num_sample_sets); | ||||||
if (parent[u] != TSK_NULL) { | ||||||
t_v = time[parent[u]]; | ||||||
if (0 < all_samples && all_samples < self->num_samples) { | ||||||
time_window_index = 0; | ||||||
while (time_window_index < num_time_windows | ||||||
&& time_windows[time_window_index] < t_v) { | ||||||
/* for (time_window_index = 0; time_window_index < num_time_windows; | ||||||
* time_window_index++){ */ | ||||||
afs_size = result_dims[num_sample_sets]; | ||||||
afs = result | ||||||
+ afs_size * (window_index * num_time_windows + time_window_index); | ||||||
for (k = 0; k < num_sample_sets; k++) { | ||||||
coordinate[k] = (tsk_size_t) count_row[k]; | ||||||
} | ||||||
if (!polarised) { | ||||||
fold(coordinate, result_dims, num_sample_sets); | ||||||
} | ||||||
tw_branch_length = TSK_MIN(time_windows[time_window_index + 1], t_v) | ||||||
- TSK_MAX(time_windows[0], time[u]); | ||||||
x = (right - last_update[u]) * tw_branch_length; | ||||||
increment_nd_array_value( | ||||||
afs, num_sample_sets, result_dims, coordinate, x); | ||||||
time_window_index++; | ||||||
} | ||||||
} | ||||||
increment_nd_array_value(afs, num_sample_sets, result_dims, coordinate, x); | ||||||
} | ||||||
last_update[u] = right; | ||||||
out: | ||||||
|
@@ -3525,8 +3571,8 @@ | |||||
static int | ||||||
tsk_treeseq_branch_allele_frequency_spectrum(const tsk_treeseq_t *self, | ||||||
tsk_size_t num_sample_sets, double *counts, tsk_size_t num_windows, | ||||||
const double *windows, const tsk_size_t *result_dims, tsk_flags_t options, | ||||||
double *result) | ||||||
tsk_size_t num_time_windows, const double *windows, const double *time_windows, | ||||||
const tsk_size_t *result_dims, tsk_flags_t options, double *result) | ||||||
{ | ||||||
int ret = 0; | ||||||
tsk_id_t u, v; | ||||||
|
@@ -3571,16 +3617,16 @@ | |||||
tk++; | ||||||
u = edge_child[h]; | ||||||
v = edge_parent[h]; | ||||||
ret = tsk_treeseq_update_branch_afs(self, u, t_left, branch_length, | ||||||
last_update, counts, num_sample_sets, window_index, result_dims, options, | ||||||
result); | ||||||
ret = tsk_treeseq_update_branch_afs(self, u, t_left, last_update, node_time, | ||||||
parent, time_windows, counts, num_sample_sets, num_time_windows, | ||||||
window_index, result_dims, options, result); | ||||||
if (ret != 0) { | ||||||
goto out; | ||||||
} | ||||||
while (v != TSK_NULL) { | ||||||
ret = tsk_treeseq_update_branch_afs(self, v, t_left, branch_length, | ||||||
last_update, counts, num_sample_sets, window_index, result_dims, | ||||||
options, result); | ||||||
ret = tsk_treeseq_update_branch_afs(self, v, t_left, last_update, | ||||||
node_time, parent, time_windows, counts, num_sample_sets, | ||||||
num_time_windows, window_index, result_dims, options, result); | ||||||
if (ret != 0) { | ||||||
goto out; | ||||||
} | ||||||
|
@@ -3599,9 +3645,9 @@ | |||||
parent[u] = v; | ||||||
branch_length[u] = node_time[v] - node_time[u]; | ||||||
while (v != TSK_NULL) { | ||||||
ret = tsk_treeseq_update_branch_afs(self, v, t_left, branch_length, | ||||||
last_update, counts, num_sample_sets, window_index, result_dims, | ||||||
options, result); | ||||||
ret = tsk_treeseq_update_branch_afs(self, v, t_left, last_update, | ||||||
node_time, parent, time_windows, counts, num_sample_sets, | ||||||
num_time_windows, window_index, result_dims, options, result); | ||||||
if (ret != 0) { | ||||||
goto out; | ||||||
} | ||||||
|
@@ -3623,9 +3669,9 @@ | |||||
/* Flush the contributions of all nodes to the current window */ | ||||||
for (u = 0; u < (tsk_id_t) num_nodes; u++) { | ||||||
tsk_bug_assert(last_update[u] < w_right); | ||||||
ret = tsk_treeseq_update_branch_afs(self, u, w_right, branch_length, | ||||||
last_update, counts, num_sample_sets, window_index, result_dims, | ||||||
options, result); | ||||||
ret = tsk_treeseq_update_branch_afs(self, u, w_right, last_update, | ||||||
node_time, parent, time_windows, counts, num_sample_sets, | ||||||
num_time_windows, window_index, result_dims, options, result); | ||||||
if (ret != 0) { | ||||||
goto out; | ||||||
} | ||||||
|
@@ -3653,13 +3699,15 @@ | |||||
tsk_treeseq_allele_frequency_spectrum(const tsk_treeseq_t *self, | ||||||
tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, | ||||||
const tsk_id_t *sample_sets, tsk_size_t num_windows, const double *windows, | ||||||
tsk_flags_t options, double *result) | ||||||
tsk_size_t num_time_windows, const double *time_windows, tsk_flags_t options, | ||||||
double *result) | ||||||
{ | ||||||
int ret = 0; | ||||||
bool stat_site = !!(options & TSK_STAT_SITE); | ||||||
bool stat_branch = !!(options & TSK_STAT_BRANCH); | ||||||
bool stat_node = !!(options & TSK_STAT_NODE); | ||||||
const double default_windows[] = { 0, self->tables->sequence_length }; | ||||||
const double default_time_windows[] = { 0, INFINITY }; | ||||||
const tsk_size_t num_nodes = self->tables->nodes.num_rows; | ||||||
const tsk_size_t K = num_sample_sets + 1; | ||||||
tsk_size_t j, k, l, afs_size; | ||||||
|
@@ -3669,7 +3717,6 @@ | |||||
* reuse code from the general_stats code paths. */ | ||||||
double *counts = NULL; | ||||||
double *count_row; | ||||||
|
||||||
if (stat_node) { | ||||||
ret = TSK_ERR_UNSUPPORTED_STAT_MODE; | ||||||
goto out; | ||||||
|
@@ -3693,6 +3740,20 @@ | |||||
goto out; | ||||||
} | ||||||
} | ||||||
if (time_windows == NULL) { | ||||||
num_time_windows = 1; | ||||||
time_windows = default_time_windows; | ||||||
} else { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. After this line is probably the right place to check if it's |
||||||
if (stat_site | ||||||
&& tsk_memcmp(time_windows, default_time_windows, sizeof(double)) != 0) { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hm, this is a bit awkward - what if instead we used There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. But time_windows are always initialized by default as [0, inf], so num_time_windows=2, comparing to the default was the clearest I found for now. But maybe the problem lies in the initialization caused by the parsing of the windows in the first place. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh, wait - we're already in the
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So, if someone explicitly specifies |
||||||
ret = TSK_ERR_UNSUPPORTED_STAT_MODE; | ||||||
goto out; | ||||||
} | ||||||
ret = tsk_treeseq_check_time_windows(num_time_windows, time_windows); | ||||||
if (ret != 0) { | ||||||
goto out; | ||||||
} | ||||||
} | ||||||
ret = tsk_treeseq_check_sample_sets( | ||||||
self, num_sample_sets, sample_set_sizes, sample_sets); | ||||||
if (ret != 0) { | ||||||
|
@@ -3728,15 +3789,16 @@ | |||||
count_row[num_sample_sets] = 1; | ||||||
} | ||||||
result_dims[num_sample_sets] = (tsk_size_t) afs_size; | ||||||
tsk_memset(result, 0, num_windows * num_time_windows * afs_size * sizeof(*result)); | ||||||
|
||||||
tsk_memset(result, 0, num_windows * afs_size * sizeof(*result)); | ||||||
if (stat_site) { | ||||||
ret = tsk_treeseq_site_allele_frequency_spectrum(self, num_sample_sets, | ||||||
sample_set_sizes, counts, num_windows, windows, result_dims, options, | ||||||
result); | ||||||
} else { | ||||||
ret = tsk_treeseq_branch_allele_frequency_spectrum(self, num_sample_sets, counts, | ||||||
num_windows, windows, result_dims, options, result); | ||||||
num_windows, num_time_windows, windows, time_windows, result_dims, options, | ||||||
result); | ||||||
} | ||||||
|
||||||
if (options & TSK_STAT_SPAN_NORMALISE) { | ||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
should also have time window errors here?