-
Notifications
You must be signed in to change notification settings - Fork 74
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Multichrom example #2665
base: main
Are you sure you want to change the base?
Multichrom example #2665
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,265 @@ | ||
#include <stdio.h> | ||
#include <stdlib.h> | ||
#include <assert.h> | ||
#include <err.h> | ||
#include <string.h> | ||
|
||
#include <pthread.h> | ||
#include <tskit/tables.h> | ||
|
||
#define check_tsk_error(val) \ | ||
if (val < 0) { \ | ||
errx(EXIT_FAILURE, "line %d: %s\n", __LINE__, tsk_strerror(val)); \ | ||
} | ||
|
||
static void | ||
init_tables(tsk_table_collection_t *tcs, int num_chroms) | ||
{ | ||
int j, ret; | ||
|
||
for (j = 0; j < num_chroms; j++) { | ||
ret = tsk_table_collection_init(&tcs[j], 0); | ||
check_tsk_error(ret); | ||
if (j > 0) { | ||
tsk_node_table_free(&tcs[j].nodes); | ||
} | ||
} | ||
} | ||
|
||
static void | ||
free_tables(tsk_table_collection_t *tcs, int num_chroms) | ||
{ | ||
int j; | ||
|
||
for (j = 0; j < num_chroms; j++) { | ||
if (j > 0) { | ||
/* Must not double free node table columns. */ | ||
memset(&tcs[j].nodes, 0, sizeof(tcs[j].nodes)); | ||
} | ||
tsk_table_collection_free(&tcs[j]); | ||
} | ||
} | ||
|
||
static void | ||
join_tables(tsk_table_collection_t *tcs, int num_chroms) | ||
{ | ||
int j, ret; | ||
|
||
for (j = 1; j < num_chroms; j++) { | ||
ret = tsk_edge_table_extend( | ||
&tcs[0].edges, &tcs[j].edges, tcs[j].edges.num_rows, NULL, 0); | ||
check_tsk_error(ret); | ||
} | ||
/* Get all the squashable edges next to each other */ | ||
ret = tsk_table_collection_sort(&tcs[0], NULL, 0); | ||
check_tsk_error(ret); | ||
ret = tsk_edge_table_squash(&tcs[0].edges); | ||
check_tsk_error(ret); | ||
/* We need to sort again after squash */ | ||
ret = tsk_table_collection_sort(&tcs[0], NULL, 0); | ||
check_tsk_error(ret); | ||
ret = tsk_table_collection_build_index(&tcs[0], 0); | ||
check_tsk_error(ret); | ||
} | ||
|
||
struct chunk_work { | ||
int chunk; | ||
tsk_table_collection_t *tc; | ||
int *samples; | ||
int N; | ||
}; | ||
|
||
void * | ||
simplify_chunk(void *arg) | ||
{ | ||
int ret; | ||
struct chunk_work *work = (struct chunk_work *) arg; | ||
tsk_size_t edges_before = work->tc->edges.num_rows; | ||
|
||
ret = tsk_table_collection_sort(work->tc, NULL, 0); | ||
check_tsk_error(ret); | ||
ret = tsk_table_collection_simplify(work->tc, work->samples, work->N, | ||
TSK_SIMPLIFY_NO_FILTER_NODES | TSK_SIMPLIFY_NO_UPDATE_SAMPLE_FLAGS, NULL); | ||
check_tsk_error(ret); | ||
/* NOTE: this printf makes helgrind complain */ | ||
printf("\tchunk %d: %lld -> %lld\n", work->chunk, (long long) edges_before, | ||
(long long) work->tc->edges.num_rows); | ||
|
||
return NULL; | ||
} | ||
|
||
void | ||
sort_and_simplify_all(tsk_table_collection_t *tcs, int num_chroms, int *samples, int N) | ||
{ | ||
int j, ret; | ||
struct chunk_work work[num_chroms]; | ||
pthread_t threads[num_chroms]; | ||
|
||
for (j = 1; j < num_chroms; j++) { | ||
tcs[j].nodes = tcs[0].nodes; | ||
} | ||
|
||
for (j = 0; j < num_chroms; j++) { | ||
work[j].chunk = j; | ||
work[j].tc = &tcs[j]; | ||
work[j].samples = samples; | ||
work[j].N = N; | ||
|
||
ret = pthread_create(&threads[j], NULL, simplify_chunk, (void *) &work[j]); | ||
if (ret != 0) { | ||
errx(EXIT_FAILURE, "Pthread create failed"); | ||
} | ||
/* simplify_chunk((void *) &work[j]); */ | ||
} | ||
for (j = 0; j < num_chroms; j++) { | ||
ret = pthread_join(threads[j], NULL); | ||
if (ret != 0) { | ||
errx(EXIT_FAILURE, "Pthread join failed"); | ||
} | ||
} | ||
} | ||
|
||
void | ||
simplify_tables(tsk_table_collection_t *tcs, int num_chroms, int *samples, int N) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd suggest a name like |
||
{ | ||
int j, k, ret; | ||
const tsk_size_t num_nodes = tcs[0].nodes.num_rows; | ||
tsk_bool_t *keep_nodes = malloc(num_nodes * sizeof(*keep_nodes)); | ||
tsk_id_t *node_id_map = malloc(num_nodes * sizeof(*node_id_map)); | ||
tsk_id_t *edge_child, *edge_parent; | ||
|
||
if (keep_nodes == NULL || node_id_map == NULL) { | ||
errx(EXIT_FAILURE, "Out of memory"); | ||
} | ||
|
||
printf("Simplify %lld nodes\n", (long long) tcs[0].nodes.num_rows); | ||
sort_and_simplify_all(tcs, num_chroms, samples, N); | ||
|
||
for (j = 0; j < num_nodes; j++) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. rather than this loop, I used calloc() for keep_nodes; the kernel can provide zeroed pages faster than any loop can, even if the compiler turns it into memset()... |
||
keep_nodes[j] = false; | ||
} | ||
for (j = 0; j < N; j++) { | ||
keep_nodes[samples[j]] = true; | ||
} | ||
|
||
for (j = 0; j < num_chroms; j++) { | ||
edge_child = tcs[j].edges.child; | ||
edge_parent = tcs[j].edges.parent; | ||
for (k = 0; k < tcs[j].edges.num_rows; k++) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. here and at line 160, I put |
||
keep_nodes[edge_child[k]] = true; | ||
keep_nodes[edge_parent[k]] = true; | ||
} | ||
} | ||
tsk_node_table_keep_rows(&tcs[0].nodes, keep_nodes, 0, node_id_map); | ||
printf("\tdone: %lld nodes\n", (long long) tcs[0].nodes.num_rows); | ||
|
||
/* Remap node references */ | ||
for (j = 0; j < num_chroms; j++) { | ||
edge_child = tcs[j].edges.child; | ||
edge_parent = tcs[j].edges.parent; | ||
for (k = 0; k < tcs[j].edges.num_rows; k++) { | ||
edge_child[k] = node_id_map[edge_child[k]]; | ||
edge_parent[k] = node_id_map[edge_parent[k]]; | ||
} | ||
ret = tsk_table_collection_check_integrity(&tcs[j], 0); | ||
check_tsk_error(ret); | ||
} | ||
for (j = 0; j < N; j++) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Here I actually have a question, and maybe a comment would be useful. The question is: is the result of filtering the node table ourselves, with There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The reason for doing the node filtering ourselves here is that the different per-chromosome simplify operations will come up with different node mappings. That's the only real reason for using tsk_node_table_keep_rows. You can't assume that the samples to 0,...n - 1 in this case, no (and would need to do the remapping). |
||
samples[j] = node_id_map[samples[j]]; | ||
} | ||
free(keep_nodes); | ||
free(node_id_map); | ||
} | ||
|
||
void | ||
simulate( | ||
tsk_table_collection_t *tcs, int num_chroms, int N, int T, int simplify_interval) | ||
{ | ||
tsk_id_t *buffer, *parents, *children, child, left_parent, right_parent; | ||
bool left_is_first; | ||
double chunk_left, chunk_right; | ||
int ret, j, t, b, k; | ||
|
||
assert(simplify_interval != 0); // leads to division by zero | ||
buffer = malloc(2 * N * sizeof(tsk_id_t)); | ||
if (buffer == NULL) { | ||
errx(EXIT_FAILURE, "Out of memory"); | ||
} | ||
for (k = 0; k < num_chroms; k++) { | ||
tcs[k].sequence_length = num_chroms; | ||
} | ||
parents = buffer; | ||
for (j = 0; j < N; j++) { | ||
parents[j] | ||
= tsk_node_table_add_row(&tcs[0].nodes, 0, T, TSK_NULL, TSK_NULL, NULL, 0); | ||
check_tsk_error(parents[j]); | ||
} | ||
b = 0; | ||
for (t = T - 1; t >= 0; t--) { | ||
/* Alternate between using the first and last N values in the buffer */ | ||
parents = buffer + (b * N); | ||
b = (b + 1) % 2; | ||
children = buffer + (b * N); | ||
for (j = 0; j < N; j++) { | ||
child = tsk_node_table_add_row( | ||
&tcs[0].nodes, 0, t, TSK_NULL, TSK_NULL, NULL, 0); | ||
check_tsk_error(child); | ||
/* NOTE: the use of rand() is discouraged for | ||
* research code and proper random number generator | ||
* libraries should be preferred. | ||
*/ | ||
left_parent = parents[(size_t)((rand() / (1. + RAND_MAX)) * N)]; | ||
right_parent = parents[(size_t)((rand() / (1. + RAND_MAX)) * N)]; | ||
left_is_first = rand() < 0.5; | ||
chunk_left = 0.0; | ||
for (k = 0; k < num_chroms; k++) { | ||
chunk_right = chunk_left + rand() / (1. + RAND_MAX); | ||
/* a very tiny chance that right and left are equal */ | ||
if (chunk_right > chunk_left) { | ||
ret = tsk_edge_table_add_row(&tcs[k].edges, chunk_left, chunk_right, | ||
left_is_first ? left_parent : right_parent, child, NULL, 0); | ||
check_tsk_error(ret); | ||
} | ||
chunk_left += 1.0; | ||
if (chunk_right < chunk_left) { | ||
ret = tsk_edge_table_add_row(&tcs[k].edges, chunk_right, chunk_left, | ||
left_is_first ? right_parent : left_parent, child, NULL, 0); | ||
check_tsk_error(ret); | ||
} | ||
} | ||
children[j] = child; | ||
} | ||
if (t % simplify_interval == 0) { | ||
simplify_tables(tcs, num_chroms, children, N); | ||
} | ||
} | ||
/* Set the sample flags for final generation */ | ||
for (j = 0; j < N; j++) { | ||
tcs[0].nodes.flags[children[j]] = TSK_NODE_IS_SAMPLE; | ||
} | ||
free(buffer); | ||
} | ||
|
||
int | ||
main(int argc, char **argv) | ||
{ | ||
// int ret; | ||
int num_chroms; | ||
|
||
if (argc != 7) { | ||
errx(EXIT_FAILURE, "usage: N T simplify-interval output seed num-chroms"); | ||
} | ||
|
||
num_chroms = atoi(argv[6]); | ||
tsk_table_collection_t tcs[num_chroms]; | ||
|
||
srand((unsigned) atoi(argv[5])); | ||
init_tables(tcs, num_chroms); | ||
simulate(tcs, num_chroms, atoi(argv[1]), atoi(argv[2]), atoi(argv[3])); | ||
join_tables(tcs, num_chroms); | ||
// ret = tsk_table_collection_dump(&tcs[0], argv[4], 0); | ||
// check_tsk_error(ret); | ||
free_tables(tcs, num_chroms); | ||
|
||
return 0; | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,120 @@ | ||
#include <stdio.h> | ||
#include <stdlib.h> | ||
#include <assert.h> | ||
#include <err.h> | ||
#include <string.h> | ||
|
||
#include <tskit/tables.h> | ||
|
||
#define check_tsk_error(val) \ | ||
if (val < 0) { \ | ||
errx(EXIT_FAILURE, "line %d: %s\n", __LINE__, tsk_strerror(val)); \ | ||
} | ||
|
||
void | ||
simulate( | ||
tsk_table_collection_t *tables, int num_chroms, int N, int T, int simplify_interval) | ||
{ | ||
tsk_id_t *buffer, *parents, *children, child, left_parent, right_parent; | ||
bool left_is_first; | ||
double chunk_left, chunk_right; | ||
int ret, j, t, b, k; | ||
|
||
assert(simplify_interval != 0); // leads to division by zero | ||
buffer = malloc(2 * N * sizeof(tsk_id_t)); | ||
if (buffer == NULL) { | ||
errx(EXIT_FAILURE, "Out of memory"); | ||
} | ||
tables->sequence_length = num_chroms; | ||
parents = buffer; | ||
for (j = 0; j < N; j++) { | ||
parents[j] | ||
= tsk_node_table_add_row(&tables->nodes, 0, T, TSK_NULL, TSK_NULL, NULL, 0); | ||
check_tsk_error(parents[j]); | ||
} | ||
b = 0; | ||
for (t = T - 1; t >= 0; t--) { | ||
/* Alternate between using the first and last N values in the buffer */ | ||
parents = buffer + (b * N); | ||
b = (b + 1) % 2; | ||
children = buffer + (b * N); | ||
for (j = 0; j < N; j++) { | ||
child = tsk_node_table_add_row( | ||
&tables->nodes, 0, t, TSK_NULL, TSK_NULL, NULL, 0); | ||
check_tsk_error(child); | ||
/* NOTE: the use of rand() is discouraged for | ||
* research code and proper random number generator | ||
* libraries should be preferred. | ||
*/ | ||
left_parent = parents[(size_t)((rand() / (1. + RAND_MAX)) * N)]; | ||
right_parent = parents[(size_t)((rand() / (1. + RAND_MAX)) * N)]; | ||
left_is_first = rand() < 0.5; | ||
chunk_left = 0.0; | ||
for (k = 0; k < num_chroms; k++) { | ||
chunk_right = chunk_left + rand() / (1. + RAND_MAX); | ||
/* a very tiny chance that right and left are equal */ | ||
if (chunk_right > chunk_left) { | ||
ret = tsk_edge_table_add_row(&tables->edges, chunk_left, chunk_right, | ||
left_is_first ? left_parent : right_parent, child, NULL, 0); | ||
check_tsk_error(ret); | ||
} | ||
chunk_left += 1.0; | ||
if (chunk_right < chunk_left) { | ||
ret = tsk_edge_table_add_row(&tables->edges, chunk_right, chunk_left, | ||
left_is_first ? right_parent : left_parent, child, NULL, 0); | ||
check_tsk_error(ret); | ||
} | ||
} | ||
children[j] = child; | ||
} | ||
if (t % simplify_interval == 0) { | ||
printf("Simplify at generation %lld: (%lld nodes %lld edges)", | ||
(long long) t, | ||
(long long) tables->nodes.num_rows, | ||
(long long) tables->edges.num_rows); | ||
/* Note: Edges must be sorted for simplify to work, and we use a brute force | ||
* approach of sorting each time here for simplicity. This is inefficient. */ | ||
ret = tsk_table_collection_sort(tables, NULL, 0); | ||
check_tsk_error(ret); | ||
ret = tsk_table_collection_simplify(tables, children, N, 0, NULL); | ||
check_tsk_error(ret); | ||
printf(" -> (%lld nodes %lld edges)\n", | ||
(long long) tables->nodes.num_rows, | ||
(long long) tables->edges.num_rows); | ||
for (j = 0; j < N; j++) { | ||
children[j] = j; | ||
} | ||
} | ||
} | ||
/* Set the sample flags for final generation */ | ||
for (j = 0; j < N; j++) { | ||
tables->nodes.flags[children[j]] = TSK_NODE_IS_SAMPLE; | ||
} | ||
free(buffer); | ||
} | ||
|
||
int | ||
main(int argc, char **argv) | ||
{ | ||
int ret; | ||
tsk_table_collection_t tables; | ||
|
||
if (argc != 7) { | ||
errx(EXIT_FAILURE, "usage: N T simplify-interval output seed num-chroms"); | ||
} | ||
ret = tsk_table_collection_init(&tables, 0); | ||
check_tsk_error(ret); | ||
srand((unsigned)atoi(argv[5])); | ||
simulate(&tables, atoi(argv[6]), atoi(argv[1]), atoi(argv[2]), atoi(argv[3])); | ||
|
||
// /* Sort and index so that the result can be opened as a tree sequence */ | ||
// ret = tsk_table_collection_sort(&tables, NULL, 0); | ||
// check_tsk_error(ret); | ||
// ret = tsk_table_collection_build_index(&tables, 0); | ||
// check_tsk_error(ret); | ||
// ret = tsk_table_collection_dump(&tables, argv[4], 0); | ||
// check_tsk_error(ret); | ||
|
||
tsk_table_collection_free(&tables); | ||
return 0; | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As I mentioned on Slack, it seems safer to keep the shared tables zeroed out all the time except when they need to be there, to avoid accidental modification of one of the shared tables, use of stale data, stale pointers, etc.; there are lots of risks here. For SLiM I wrote a method that copies the main table collection's shared tables (we're sharing node, individual, and population) into the other table collections, and another method that zeros out those copies. Then I bracket operations like simplification and writing to disk with those methods. All the rest of the time the copies of the shared tables are zeroed for safety. Just a suggestion; might point people in a safer direction, even though it doesn't matter for this model as it is here.