Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Multichrom example #2665

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions c/examples/haploid_wright_fisher.c
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,12 @@ main(int argc, char **argv)
check_tsk_error(ret);
srand((unsigned)atoi(argv[5]));
simulate(&tables, atoi(argv[1]), atoi(argv[2]), atoi(argv[3]));

/* Sort and index so that the result can be opened as a tree sequence */
ret = tsk_table_collection_sort(&tables, NULL, 0);
check_tsk_error(ret);
ret = tsk_table_collection_build_index(&tables, 0);
check_tsk_error(ret);
ret = tsk_table_collection_dump(&tables, argv[4], 0);
check_tsk_error(ret);

Expand Down
265 changes: 265 additions & 0 deletions c/examples/multichrom_wright_fisher.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,265 @@
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <err.h>
#include <string.h>

#include <pthread.h>
#include <tskit/tables.h>

#define check_tsk_error(val) \
if (val < 0) { \
errx(EXIT_FAILURE, "line %d: %s\n", __LINE__, tsk_strerror(val)); \
}

static void
init_tables(tsk_table_collection_t *tcs, int num_chroms)
{
int j, ret;

for (j = 0; j < num_chroms; j++) {
ret = tsk_table_collection_init(&tcs[j], 0);
check_tsk_error(ret);
if (j > 0) {
tsk_node_table_free(&tcs[j].nodes);
}
}
}

static void
free_tables(tsk_table_collection_t *tcs, int num_chroms)
{
int j;

for (j = 0; j < num_chroms; j++) {
if (j > 0) {
/* Must not double free node table columns. */
memset(&tcs[j].nodes, 0, sizeof(tcs[j].nodes));
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As I mentioned on Slack, it seems safer to keep the shared tables zeroed out all the time except when they need to be there, to avoid accidental modification of one of the shared tables, use of stale data, stale pointers, etc.; there are lots of risks here. For SLiM I wrote a method that copies the main table collection's shared tables (we're sharing node, individual, and population) into the other table collections, and another method that zeros out those copies. Then I bracket operations like simplification and writing to disk with those methods. All the rest of the time the copies of the shared tables are zeroed for safety. Just a suggestion; might point people in a safer direction, even though it doesn't matter for this model as it is here.

}
tsk_table_collection_free(&tcs[j]);
}
}

static void
join_tables(tsk_table_collection_t *tcs, int num_chroms)
{
int j, ret;

for (j = 1; j < num_chroms; j++) {
ret = tsk_edge_table_extend(
&tcs[0].edges, &tcs[j].edges, tcs[j].edges.num_rows, NULL, 0);
check_tsk_error(ret);
}
/* Get all the squashable edges next to each other */
ret = tsk_table_collection_sort(&tcs[0], NULL, 0);
check_tsk_error(ret);
ret = tsk_edge_table_squash(&tcs[0].edges);
check_tsk_error(ret);
/* We need to sort again after squash */
ret = tsk_table_collection_sort(&tcs[0], NULL, 0);
check_tsk_error(ret);
ret = tsk_table_collection_build_index(&tcs[0], 0);
check_tsk_error(ret);
}

struct chunk_work {
int chunk;
tsk_table_collection_t *tc;
int *samples;
int N;
};

void *
simplify_chunk(void *arg)
{
int ret;
struct chunk_work *work = (struct chunk_work *) arg;
tsk_size_t edges_before = work->tc->edges.num_rows;

ret = tsk_table_collection_sort(work->tc, NULL, 0);
check_tsk_error(ret);
ret = tsk_table_collection_simplify(work->tc, work->samples, work->N,
TSK_SIMPLIFY_NO_FILTER_NODES | TSK_SIMPLIFY_NO_UPDATE_SAMPLE_FLAGS, NULL);
check_tsk_error(ret);
/* NOTE: this printf makes helgrind complain */
printf("\tchunk %d: %lld -> %lld\n", work->chunk, (long long) edges_before,
(long long) work->tc->edges.num_rows);

return NULL;
}

void
sort_and_simplify_all(tsk_table_collection_t *tcs, int num_chroms, int *samples, int N)
{
int j, ret;
struct chunk_work work[num_chroms];
pthread_t threads[num_chroms];

for (j = 1; j < num_chroms; j++) {
tcs[j].nodes = tcs[0].nodes;
}

for (j = 0; j < num_chroms; j++) {
work[j].chunk = j;
work[j].tc = &tcs[j];
work[j].samples = samples;
work[j].N = N;

ret = pthread_create(&threads[j], NULL, simplify_chunk, (void *) &work[j]);
if (ret != 0) {
errx(EXIT_FAILURE, "Pthread create failed");
}
/* simplify_chunk((void *) &work[j]); */
}
for (j = 0; j < num_chroms; j++) {
ret = pthread_join(threads[j], NULL);
if (ret != 0) {
errx(EXIT_FAILURE, "Pthread join failed");
}
}
}

void
simplify_tables(tsk_table_collection_t *tcs, int num_chroms, int *samples, int N)
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd suggest a name like samples_count instead of N, but maybe that's just me :->

{
int j, k, ret;
const tsk_size_t num_nodes = tcs[0].nodes.num_rows;
tsk_bool_t *keep_nodes = malloc(num_nodes * sizeof(*keep_nodes));
tsk_id_t *node_id_map = malloc(num_nodes * sizeof(*node_id_map));
tsk_id_t *edge_child, *edge_parent;

if (keep_nodes == NULL || node_id_map == NULL) {
errx(EXIT_FAILURE, "Out of memory");
}

printf("Simplify %lld nodes\n", (long long) tcs[0].nodes.num_rows);
sort_and_simplify_all(tcs, num_chroms, samples, N);

for (j = 0; j < num_nodes; j++) {
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

rather than this loop, I used calloc() for keep_nodes; the kernel can provide zeroed pages faster than any loop can, even if the compiler turns it into memset()...

keep_nodes[j] = false;
}
for (j = 0; j < N; j++) {
keep_nodes[samples[j]] = true;
}

for (j = 0; j < num_chroms; j++) {
edge_child = tcs[j].edges.child;
edge_parent = tcs[j].edges.parent;
for (k = 0; k < tcs[j].edges.num_rows; k++) {
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

here and at line 160, I put tcs[j].edges.num_rows into a local temporary; I think the compiler might have to refetch the value from memory every time through, otherwise, since it might not be smart enough to realize that there is not a restrict issue here (i.e., that the loop's work is guaranteed not to change the value of num_rows through a pointer write)...

keep_nodes[edge_child[k]] = true;
keep_nodes[edge_parent[k]] = true;
}
}
tsk_node_table_keep_rows(&tcs[0].nodes, keep_nodes, 0, node_id_map);
printf("\tdone: %lld nodes\n", (long long) tcs[0].nodes.num_rows);

/* Remap node references */
for (j = 0; j < num_chroms; j++) {
edge_child = tcs[j].edges.child;
edge_parent = tcs[j].edges.parent;
for (k = 0; k < tcs[j].edges.num_rows; k++) {
edge_child[k] = node_id_map[edge_child[k]];
edge_parent[k] = node_id_map[edge_parent[k]];
}
ret = tsk_table_collection_check_integrity(&tcs[j], 0);
check_tsk_error(ret);
}
for (j = 0; j < N; j++) {
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here I actually have a question, and maybe a comment would be useful. The question is: is the result of filtering the node table ourselves, with tsk_node_table_keep_rows() as done here, different from letting simplify do the filtering for us? Is the order of the nodes that results different? I ask because SLiM's current code assumes that the N samples are in positions 0:(N-1) in the node table after simplification. The code here kind of lumps the samples together with all of the nodes referenced by edges, and tells tskit "keep all of these", without distinguishing the two; so I'm guessing that SLiM can no longer make that 0:(N-1) assumption, and needs to remap the N samples as you do here, using node_id_map. But I'd like to understand why. Does simplification filter the node table in a different and more sophisticated way? Or is there some other reason why SLiM's assumption happened to be valid? @petrelharp wrote that code for SLiM, I think, so maybe he wants to hear this.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The reason for doing the node filtering ourselves here is that the different per-chromosome simplify operations will come up with different node mappings. That's the only real reason for using tsk_node_table_keep_rows. You can't assume that the samples to 0,...n - 1 in this case, no (and would need to do the remapping).

samples[j] = node_id_map[samples[j]];
}
free(keep_nodes);
free(node_id_map);
}

void
simulate(
tsk_table_collection_t *tcs, int num_chroms, int N, int T, int simplify_interval)
{
tsk_id_t *buffer, *parents, *children, child, left_parent, right_parent;
bool left_is_first;
double chunk_left, chunk_right;
int ret, j, t, b, k;

assert(simplify_interval != 0); // leads to division by zero
buffer = malloc(2 * N * sizeof(tsk_id_t));
if (buffer == NULL) {
errx(EXIT_FAILURE, "Out of memory");
}
for (k = 0; k < num_chroms; k++) {
tcs[k].sequence_length = num_chroms;
}
parents = buffer;
for (j = 0; j < N; j++) {
parents[j]
= tsk_node_table_add_row(&tcs[0].nodes, 0, T, TSK_NULL, TSK_NULL, NULL, 0);
check_tsk_error(parents[j]);
}
b = 0;
for (t = T - 1; t >= 0; t--) {
/* Alternate between using the first and last N values in the buffer */
parents = buffer + (b * N);
b = (b + 1) % 2;
children = buffer + (b * N);
for (j = 0; j < N; j++) {
child = tsk_node_table_add_row(
&tcs[0].nodes, 0, t, TSK_NULL, TSK_NULL, NULL, 0);
check_tsk_error(child);
/* NOTE: the use of rand() is discouraged for
* research code and proper random number generator
* libraries should be preferred.
*/
left_parent = parents[(size_t)((rand() / (1. + RAND_MAX)) * N)];
right_parent = parents[(size_t)((rand() / (1. + RAND_MAX)) * N)];
left_is_first = rand() < 0.5;
chunk_left = 0.0;
for (k = 0; k < num_chroms; k++) {
chunk_right = chunk_left + rand() / (1. + RAND_MAX);
/* a very tiny chance that right and left are equal */
if (chunk_right > chunk_left) {
ret = tsk_edge_table_add_row(&tcs[k].edges, chunk_left, chunk_right,
left_is_first ? left_parent : right_parent, child, NULL, 0);
check_tsk_error(ret);
}
chunk_left += 1.0;
if (chunk_right < chunk_left) {
ret = tsk_edge_table_add_row(&tcs[k].edges, chunk_right, chunk_left,
left_is_first ? right_parent : left_parent, child, NULL, 0);
check_tsk_error(ret);
}
}
children[j] = child;
}
if (t % simplify_interval == 0) {
simplify_tables(tcs, num_chroms, children, N);
}
}
/* Set the sample flags for final generation */
for (j = 0; j < N; j++) {
tcs[0].nodes.flags[children[j]] = TSK_NODE_IS_SAMPLE;
}
free(buffer);
}

int
main(int argc, char **argv)
{
// int ret;
int num_chroms;

if (argc != 7) {
errx(EXIT_FAILURE, "usage: N T simplify-interval output seed num-chroms");
}

num_chroms = atoi(argv[6]);
tsk_table_collection_t tcs[num_chroms];

srand((unsigned) atoi(argv[5]));
init_tables(tcs, num_chroms);
simulate(tcs, num_chroms, atoi(argv[1]), atoi(argv[2]), atoi(argv[3]));
join_tables(tcs, num_chroms);
// ret = tsk_table_collection_dump(&tcs[0], argv[4], 0);
// check_tsk_error(ret);
free_tables(tcs, num_chroms);

return 0;
}
120 changes: 120 additions & 0 deletions c/examples/multichrom_wright_fisher_singlethreaded.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <err.h>
#include <string.h>

#include <tskit/tables.h>

#define check_tsk_error(val) \
if (val < 0) { \
errx(EXIT_FAILURE, "line %d: %s\n", __LINE__, tsk_strerror(val)); \
}

void
simulate(
tsk_table_collection_t *tables, int num_chroms, int N, int T, int simplify_interval)
{
tsk_id_t *buffer, *parents, *children, child, left_parent, right_parent;
bool left_is_first;
double chunk_left, chunk_right;
int ret, j, t, b, k;

assert(simplify_interval != 0); // leads to division by zero
buffer = malloc(2 * N * sizeof(tsk_id_t));
if (buffer == NULL) {
errx(EXIT_FAILURE, "Out of memory");
}
tables->sequence_length = num_chroms;
parents = buffer;
for (j = 0; j < N; j++) {
parents[j]
= tsk_node_table_add_row(&tables->nodes, 0, T, TSK_NULL, TSK_NULL, NULL, 0);
check_tsk_error(parents[j]);
}
b = 0;
for (t = T - 1; t >= 0; t--) {
/* Alternate between using the first and last N values in the buffer */
parents = buffer + (b * N);
b = (b + 1) % 2;
children = buffer + (b * N);
for (j = 0; j < N; j++) {
child = tsk_node_table_add_row(
&tables->nodes, 0, t, TSK_NULL, TSK_NULL, NULL, 0);
check_tsk_error(child);
/* NOTE: the use of rand() is discouraged for
* research code and proper random number generator
* libraries should be preferred.
*/
left_parent = parents[(size_t)((rand() / (1. + RAND_MAX)) * N)];
right_parent = parents[(size_t)((rand() / (1. + RAND_MAX)) * N)];
left_is_first = rand() < 0.5;
chunk_left = 0.0;
for (k = 0; k < num_chroms; k++) {
chunk_right = chunk_left + rand() / (1. + RAND_MAX);
/* a very tiny chance that right and left are equal */
if (chunk_right > chunk_left) {
ret = tsk_edge_table_add_row(&tables->edges, chunk_left, chunk_right,
left_is_first ? left_parent : right_parent, child, NULL, 0);
check_tsk_error(ret);
}
chunk_left += 1.0;
if (chunk_right < chunk_left) {
ret = tsk_edge_table_add_row(&tables->edges, chunk_right, chunk_left,
left_is_first ? right_parent : left_parent, child, NULL, 0);
check_tsk_error(ret);
}
}
children[j] = child;
}
if (t % simplify_interval == 0) {
printf("Simplify at generation %lld: (%lld nodes %lld edges)",
(long long) t,
(long long) tables->nodes.num_rows,
(long long) tables->edges.num_rows);
/* Note: Edges must be sorted for simplify to work, and we use a brute force
* approach of sorting each time here for simplicity. This is inefficient. */
ret = tsk_table_collection_sort(tables, NULL, 0);
check_tsk_error(ret);
ret = tsk_table_collection_simplify(tables, children, N, 0, NULL);
check_tsk_error(ret);
printf(" -> (%lld nodes %lld edges)\n",
(long long) tables->nodes.num_rows,
(long long) tables->edges.num_rows);
for (j = 0; j < N; j++) {
children[j] = j;
}
}
}
/* Set the sample flags for final generation */
for (j = 0; j < N; j++) {
tables->nodes.flags[children[j]] = TSK_NODE_IS_SAMPLE;
}
free(buffer);
}

int
main(int argc, char **argv)
{
int ret;
tsk_table_collection_t tables;

if (argc != 7) {
errx(EXIT_FAILURE, "usage: N T simplify-interval output seed num-chroms");
}
ret = tsk_table_collection_init(&tables, 0);
check_tsk_error(ret);
srand((unsigned)atoi(argv[5]));
simulate(&tables, atoi(argv[6]), atoi(argv[1]), atoi(argv[2]), atoi(argv[3]));

// /* Sort and index so that the result can be opened as a tree sequence */
// ret = tsk_table_collection_sort(&tables, NULL, 0);
// check_tsk_error(ret);
// ret = tsk_table_collection_build_index(&tables, 0);
// check_tsk_error(ret);
// ret = tsk_table_collection_dump(&tables, argv[4], 0);
// check_tsk_error(ret);

tsk_table_collection_free(&tables);
return 0;
}
Loading
Loading