From 88fe00ded2df18741114307fd9621e3e89d6f03f Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Thu, 12 Jan 2023 13:14:07 -0800 Subject: [PATCH] Refactor edge difference iterator to work from table collections as well as from tree sequences. --- c/tests/test_tables.c | 34 +++++++- c/tests/test_trees.c | 4 +- c/tskit/haplotype_matching.c | 4 +- c/tskit/tables.c | 164 ++++++++++++++++++++++++++++++++++- c/tskit/tables.h | 47 ++++++++++ c/tskit/trees.c | 161 ++-------------------------------- c/tskit/trees.h | 43 +-------- 7 files changed, 258 insertions(+), 199 deletions(-) diff --git a/c/tests/test_tables.c b/c/tests/test_tables.c index 1b99ac6fe3..3dd3697965 100644 --- a/c/tests/test_tables.c +++ b/c/tests/test_tables.c @@ -1,7 +1,7 @@ /* * MIT License * - * Copyright (c) 2019-2022 Tskit Developers + * Copyright (c) 2019-2023 Tskit Developers * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -23,6 +23,7 @@ */ #include "testlib.h" +#include "tskit/core.h" #include #include @@ -10420,6 +10421,35 @@ test_table_collection_delete_older(void) tsk_treeseq_free(&ts); } +static void +test_table_collection_edge_diffs_errors(void) +{ + int ret; + tsk_id_t ret_id; + tsk_table_collection_t tables; + tsk_diff_iter_t iter; + + ret = tsk_table_collection_init(&tables, 0); + CU_ASSERT_EQUAL(ret, 0); + tables.sequence_length = 1; + ret_id + = tsk_node_table_add_row(&tables.nodes, TSK_NODE_IS_SAMPLE, 0, -1, -1, NULL, 0); + CU_ASSERT_EQUAL(ret_id, 0); + ret_id = tsk_node_table_add_row(&tables.nodes, 0, 1, -1, -1, NULL, 0); + CU_ASSERT_EQUAL(ret_id, 1); + ret = (int) tsk_edge_table_add_row(&tables.edges, 0., 1., 1, 0, NULL, 0); + CU_ASSERT_EQUAL(ret, 0); + + ret = tsk_diff_iter_init(&iter, &tables, -1, 0); + CU_ASSERT_EQUAL(ret, TSK_ERR_TABLES_NOT_INDEXED); + + tables.nodes.time[0] = 1; + ret = tsk_diff_iter_init(&iter, &tables, -1, 0); + CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_NODE_TIME_ORDERING); + + tsk_table_collection_free(&tables); +} + int main(int argc, char **argv) { @@ -10542,6 +10572,8 @@ main(int argc, char **argv) { "test_table_collection_takeset_indexes", test_table_collection_takeset_indexes }, { "test_table_collection_delete_older", test_table_collection_delete_older }, + { "test_table_collection_edge_diffs_errors", + test_table_collection_edge_diffs_errors }, { NULL, NULL }, }; diff --git a/c/tests/test_trees.c b/c/tests/test_trees.c index af6e7c644f..f0ced8585f 100644 --- a/c/tests/test_trees.c +++ b/c/tests/test_trees.c @@ -1,7 +1,7 @@ /* * MIT License * - * Copyright (c) 2019-2022 Tskit Developers + * Copyright (c) 2019-2023 Tskit Developers * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -407,7 +407,7 @@ verify_tree_diffs(tsk_treeseq_t *ts, tsk_flags_t options) child[j] = TSK_NULL; sib[j] = TSK_NULL; } - ret = tsk_diff_iter_init(&iter, ts, options); + ret = tsk_diff_iter_init_from_ts(&iter, ts, options); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_tree_init(&tree, ts, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); diff --git a/c/tskit/haplotype_matching.c b/c/tskit/haplotype_matching.c index 41c1bd23a0..b942da18d6 100644 --- a/c/tskit/haplotype_matching.c +++ b/c/tskit/haplotype_matching.c @@ -1,7 +1,7 @@ /* * MIT License * - * Copyright (c) 2019-2022 Tskit Developers + * Copyright (c) 2019-2023 Tskit Developers * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -250,7 +250,7 @@ tsk_ls_hmm_reset(tsk_ls_hmm_t *self) /* This is safe because we've already zero'd out the memory. */ tsk_diff_iter_free(&self->diffs); - ret = tsk_diff_iter_init(&self->diffs, self->tree_sequence, false); + ret = tsk_diff_iter_init_from_ts(&self->diffs, self->tree_sequence, false); if (ret != 0) { goto out; } diff --git a/c/tskit/tables.c b/c/tskit/tables.c index 3039dbbb38..1e910bca69 100644 --- a/c/tskit/tables.c +++ b/c/tskit/tables.c @@ -1,7 +1,7 @@ /* * MIT License * - * Copyright (c) 2019-2022 Tskit Developers + * Copyright (c) 2019-2023 Tskit Developers * Copyright (c) 2017-2018 University of Oxford * * Permission is hereby granted, free of charge, to any person obtaining a copy @@ -13010,3 +13010,165 @@ tsk_squash_edges(tsk_edge_t *edges, tsk_size_t num_edges, tsk_size_t *num_output out: return ret; } + +/* ======================================================== * + * Tree diff iterator. + * ======================================================== */ + +int TSK_WARN_UNUSED +tsk_diff_iter_init(tsk_diff_iter_t *self, const tsk_table_collection_t *tables, + tsk_id_t num_trees, tsk_flags_t options) +{ + int ret = 0; + + tsk_bug_assert(tables != NULL); + tsk_memset(self, 0, sizeof(tsk_diff_iter_t)); + self->num_nodes = tables->nodes.num_rows; + self->num_edges = tables->edges.num_rows; + self->tables = tables; + self->insertion_index = 0; + self->removal_index = 0; + self->tree_left = 0; + self->tree_index = -1; + if (num_trees < 0) { + num_trees = tsk_table_collection_check_integrity(self->tables, TSK_CHECK_TREES); + if (num_trees < 0) { + ret = (int) num_trees; + goto out; + } + } + self->last_index = num_trees; + + if (options & TSK_INCLUDE_TERMINAL) { + self->last_index = self->last_index + 1; + } + self->edge_list_nodes = tsk_malloc(self->num_edges * sizeof(*self->edge_list_nodes)); + if (self->edge_list_nodes == NULL) { + ret = TSK_ERR_NO_MEMORY; + goto out; + } +out: + return ret; +} + +int +tsk_diff_iter_free(tsk_diff_iter_t *self) +{ + tsk_safe_free(self->edge_list_nodes); + return 0; +} + +void +tsk_diff_iter_print_state(const tsk_diff_iter_t *self, FILE *out) +{ + fprintf(out, "tree_diff_iterator state\n"); + fprintf(out, "num_edges = %lld\n", (long long) self->num_edges); + fprintf(out, "insertion_index = %lld\n", (long long) self->insertion_index); + fprintf(out, "removal_index = %lld\n", (long long) self->removal_index); + fprintf(out, "tree_left = %f\n", self->tree_left); + fprintf(out, "tree_index = %lld\n", (long long) self->tree_index); +} + +int TSK_WARN_UNUSED +tsk_diff_iter_next(tsk_diff_iter_t *self, double *ret_left, double *ret_right, + tsk_edge_list_t *edges_out_ret, tsk_edge_list_t *edges_in_ret) +{ + int ret = 0; + tsk_id_t k; + const double sequence_length = self->tables->sequence_length; + double left = self->tree_left; + double right = sequence_length; + tsk_size_t next_edge_list_node = 0; + tsk_edge_list_node_t *out_head = NULL; + tsk_edge_list_node_t *out_tail = NULL; + tsk_edge_list_node_t *in_head = NULL; + tsk_edge_list_node_t *in_tail = NULL; + tsk_edge_list_node_t *w = NULL; + tsk_edge_list_t edges_out; + tsk_edge_list_t edges_in; + const tsk_edge_table_t *edges = &self->tables->edges; + const tsk_id_t *insertion_order = self->tables->indexes.edge_insertion_order; + const tsk_id_t *removal_order = self->tables->indexes.edge_removal_order; + + tsk_memset(&edges_out, 0, sizeof(edges_out)); + tsk_memset(&edges_in, 0, sizeof(edges_in)); + + if (self->tree_index + 1 < self->last_index) { + /* First we remove the stale records */ + while (self->removal_index < (tsk_id_t) self->num_edges + && left == edges->right[removal_order[self->removal_index]]) { + k = removal_order[self->removal_index]; + tsk_bug_assert(next_edge_list_node < self->num_edges); + w = &self->edge_list_nodes[next_edge_list_node]; + next_edge_list_node++; + w->edge.id = k; + w->edge.left = edges->left[k]; + w->edge.right = edges->right[k]; + w->edge.parent = edges->parent[k]; + w->edge.child = edges->child[k]; + w->edge.metadata = edges->metadata + edges->metadata_offset[k]; + w->edge.metadata_length + = edges->metadata_offset[k + 1] - edges->metadata_offset[k]; + w->next = NULL; + w->prev = NULL; + if (out_head == NULL) { + out_head = w; + out_tail = w; + } else { + out_tail->next = w; + w->prev = out_tail; + out_tail = w; + } + self->removal_index++; + } + edges_out.head = out_head; + edges_out.tail = out_tail; + + /* Now insert the new records */ + while (self->insertion_index < (tsk_id_t) self->num_edges + && left == edges->left[insertion_order[self->insertion_index]]) { + k = insertion_order[self->insertion_index]; + tsk_bug_assert(next_edge_list_node < self->num_edges); + w = &self->edge_list_nodes[next_edge_list_node]; + next_edge_list_node++; + w->edge.id = k; + w->edge.left = edges->left[k]; + w->edge.right = edges->right[k]; + w->edge.parent = edges->parent[k]; + w->edge.child = edges->child[k]; + w->edge.metadata = edges->metadata + edges->metadata_offset[k]; + w->edge.metadata_length + = edges->metadata_offset[k + 1] - edges->metadata_offset[k]; + w->next = NULL; + w->prev = NULL; + if (in_head == NULL) { + in_head = w; + in_tail = w; + } else { + in_tail->next = w; + w->prev = in_tail; + in_tail = w; + } + self->insertion_index++; + } + edges_in.head = in_head; + edges_in.tail = in_tail; + + right = sequence_length; + if (self->insertion_index < (tsk_id_t) self->num_edges) { + right = TSK_MIN(right, edges->left[insertion_order[self->insertion_index]]); + } + if (self->removal_index < (tsk_id_t) self->num_edges) { + right = TSK_MIN(right, edges->right[removal_order[self->removal_index]]); + } + self->tree_index++; + ret = TSK_TREE_OK; + } + *edges_out_ret = edges_out; + *edges_in_ret = edges_in; + *ret_left = left; + *ret_right = right; + /* Set the left coordinate for the next tree */ + self->tree_left = right; + return ret; +} diff --git a/c/tskit/tables.h b/c/tskit/tables.h index a3496bb9ef..321a675271 100644 --- a/c/tskit/tables.h +++ b/c/tskit/tables.h @@ -670,6 +670,30 @@ typedef struct { bool store_pairs; } tsk_identity_segments_t; +/* Diff iterator. */ +typedef struct _tsk_edge_list_node_t { + tsk_edge_t edge; + struct _tsk_edge_list_node_t *next; + struct _tsk_edge_list_node_t *prev; +} tsk_edge_list_node_t; + +typedef struct { + tsk_edge_list_node_t *head; + tsk_edge_list_node_t *tail; +} tsk_edge_list_t; + +typedef struct { + tsk_size_t num_nodes; + tsk_size_t num_edges; + double tree_left; + const tsk_table_collection_t *tables; + tsk_id_t insertion_index; + tsk_id_t removal_index; + tsk_id_t tree_index; + tsk_id_t last_index; + tsk_edge_list_node_t *edge_list_nodes; +} tsk_diff_iter_t; + /****************************************************************************/ /* Common function options */ /****************************************************************************/ @@ -892,6 +916,16 @@ top-level information of the table collections being compared. #define TSK_CLEAR_PROVENANCE (1 << 2) /** @} */ +/* For the edge diff iterator */ +#define TSK_INCLUDE_TERMINAL (1 << 0) + +/** @brief Value returned by seeking methods when they have successfully + seeked to a non-null tree. + + @ingroup TREE_API_SEEKING_GROUP +*/ +#define TSK_TREE_OK 1 + /****************************************************************************/ /* Function signatures */ /****************************************************************************/ @@ -4417,6 +4451,19 @@ int tsk_identity_segments_get(const tsk_identity_segments_t *self, tsk_id_t a, void tsk_identity_segments_print_state(tsk_identity_segments_t *self, FILE *out); int tsk_identity_segments_free(tsk_identity_segments_t *self); +/* Edge differences */ + +/* Internal API - currently used in a few places, but a better API is envisaged + * at some point. + * IMPORTANT: tskit-rust uses this API, so don't break without discussing! + */ +int tsk_diff_iter_init(tsk_diff_iter_t *self, const tsk_table_collection_t *tables, + tsk_id_t num_trees, tsk_flags_t options); +int tsk_diff_iter_free(tsk_diff_iter_t *self); +int tsk_diff_iter_next(tsk_diff_iter_t *self, double *left, double *right, + tsk_edge_list_t *edges_out, tsk_edge_list_t *edges_in); +void tsk_diff_iter_print_state(const tsk_diff_iter_t *self, FILE *out); + #ifdef __cplusplus } #endif diff --git a/c/tskit/trees.c b/c/tskit/trees.c index 4fcb2ee376..8d202d0163 100644 --- a/c/tskit/trees.c +++ b/c/tskit/trees.c @@ -1,7 +1,7 @@ /* * MIT License * - * Copyright (c) 2019-2022 Tskit Developers + * Copyright (c) 2019-2023 Tskit Developers * Copyright (c) 2015-2018 University of Oxford * * Permission is hereby granted, free of charge, to any person obtaining a copy @@ -5344,159 +5344,16 @@ tsk_tree_map_mutations(tsk_tree_t *self, int32_t *genotypes, return ret; } -/* ======================================================== * - * Tree diff iterator. - * ======================================================== */ - +/* Compatibility shim for initialising the diff iterator from a tree sequence. We are + * using this function in a small number of places internally, so simplest to keep it + * until a more satisfactory "diff" API comes along. + */ int TSK_WARN_UNUSED -tsk_diff_iter_init( +tsk_diff_iter_init_from_ts( tsk_diff_iter_t *self, const tsk_treeseq_t *tree_sequence, tsk_flags_t options) { - int ret = 0; - - tsk_bug_assert(tree_sequence != NULL); - tsk_memset(self, 0, sizeof(tsk_diff_iter_t)); - self->num_nodes = tsk_treeseq_get_num_nodes(tree_sequence); - self->num_edges = tsk_treeseq_get_num_edges(tree_sequence); - self->tree_sequence = tree_sequence; - self->insertion_index = 0; - self->removal_index = 0; - self->tree_left = 0; - self->tree_index = -1; - self->last_index = (tsk_id_t) tsk_treeseq_get_num_trees(tree_sequence); - if (options & TSK_INCLUDE_TERMINAL) { - self->last_index = self->last_index + 1; - } - self->edge_list_nodes = tsk_malloc(self->num_edges * sizeof(*self->edge_list_nodes)); - if (self->edge_list_nodes == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } -out: - return ret; -} - -int -tsk_diff_iter_free(tsk_diff_iter_t *self) -{ - tsk_safe_free(self->edge_list_nodes); - return 0; -} - -void -tsk_diff_iter_print_state(const tsk_diff_iter_t *self, FILE *out) -{ - fprintf(out, "tree_diff_iterator state\n"); - fprintf(out, "num_edges = %lld\n", (long long) self->num_edges); - fprintf(out, "insertion_index = %lld\n", (long long) self->insertion_index); - fprintf(out, "removal_index = %lld\n", (long long) self->removal_index); - fprintf(out, "tree_left = %f\n", self->tree_left); - fprintf(out, "tree_index = %lld\n", (long long) self->tree_index); -} - -int TSK_WARN_UNUSED -tsk_diff_iter_next(tsk_diff_iter_t *self, double *ret_left, double *ret_right, - tsk_edge_list_t *edges_out_ret, tsk_edge_list_t *edges_in_ret) -{ - int ret = 0; - tsk_id_t k; - const double sequence_length = self->tree_sequence->tables->sequence_length; - double left = self->tree_left; - double right = sequence_length; - tsk_size_t next_edge_list_node = 0; - const tsk_treeseq_t *s = self->tree_sequence; - tsk_edge_list_node_t *out_head = NULL; - tsk_edge_list_node_t *out_tail = NULL; - tsk_edge_list_node_t *in_head = NULL; - tsk_edge_list_node_t *in_tail = NULL; - tsk_edge_list_node_t *w = NULL; - tsk_edge_list_t edges_out; - tsk_edge_list_t edges_in; - const tsk_edge_table_t *edges = &s->tables->edges; - const tsk_id_t *insertion_order = s->tables->indexes.edge_insertion_order; - const tsk_id_t *removal_order = s->tables->indexes.edge_removal_order; - - tsk_memset(&edges_out, 0, sizeof(edges_out)); - tsk_memset(&edges_in, 0, sizeof(edges_in)); - - if (self->tree_index + 1 < self->last_index) { - /* First we remove the stale records */ - while (self->removal_index < (tsk_id_t) self->num_edges - && left == edges->right[removal_order[self->removal_index]]) { - k = removal_order[self->removal_index]; - tsk_bug_assert(next_edge_list_node < self->num_edges); - w = &self->edge_list_nodes[next_edge_list_node]; - next_edge_list_node++; - w->edge.id = k; - w->edge.left = edges->left[k]; - w->edge.right = edges->right[k]; - w->edge.parent = edges->parent[k]; - w->edge.child = edges->child[k]; - w->edge.metadata = edges->metadata + edges->metadata_offset[k]; - w->edge.metadata_length - = edges->metadata_offset[k + 1] - edges->metadata_offset[k]; - w->next = NULL; - w->prev = NULL; - if (out_head == NULL) { - out_head = w; - out_tail = w; - } else { - out_tail->next = w; - w->prev = out_tail; - out_tail = w; - } - self->removal_index++; - } - edges_out.head = out_head; - edges_out.tail = out_tail; - - /* Now insert the new records */ - while (self->insertion_index < (tsk_id_t) self->num_edges - && left == edges->left[insertion_order[self->insertion_index]]) { - k = insertion_order[self->insertion_index]; - tsk_bug_assert(next_edge_list_node < self->num_edges); - w = &self->edge_list_nodes[next_edge_list_node]; - next_edge_list_node++; - w->edge.id = k; - w->edge.left = edges->left[k]; - w->edge.right = edges->right[k]; - w->edge.parent = edges->parent[k]; - w->edge.child = edges->child[k]; - w->edge.metadata = edges->metadata + edges->metadata_offset[k]; - w->edge.metadata_length - = edges->metadata_offset[k + 1] - edges->metadata_offset[k]; - w->next = NULL; - w->prev = NULL; - if (in_head == NULL) { - in_head = w; - in_tail = w; - } else { - in_tail->next = w; - w->prev = in_tail; - in_tail = w; - } - self->insertion_index++; - } - edges_in.head = in_head; - edges_in.tail = in_tail; - - right = sequence_length; - if (self->insertion_index < (tsk_id_t) self->num_edges) { - right = TSK_MIN(right, edges->left[insertion_order[self->insertion_index]]); - } - if (self->removal_index < (tsk_id_t) self->num_edges) { - right = TSK_MIN(right, edges->right[removal_order[self->removal_index]]); - } - self->tree_index++; - ret = TSK_TREE_OK; - } - *edges_out_ret = edges_out; - *edges_in_ret = edges_in; - *ret_left = left; - *ret_right = right; - /* Set the left coordinate for the next tree */ - self->tree_left = right; - return ret; + return tsk_diff_iter_init( + self, tree_sequence->tables, (tsk_id_t) tree_sequence->num_trees, options); } /* ======================================================== * @@ -5927,7 +5784,7 @@ tsk_treeseq_kc_distance(const tsk_treeseq_t *self, const tsk_treeseq_t *other, if (ret != 0) { goto out; } - ret = tsk_diff_iter_init(&diff_iters[i], treeseqs[i], false); + ret = tsk_diff_iter_init_from_ts(&diff_iters[i], treeseqs[i], false); if (ret != 0) { goto out; } diff --git a/c/tskit/trees.h b/c/tskit/trees.h index cae952dee3..4a84bf3446 100644 --- a/c/tskit/trees.h +++ b/c/tskit/trees.h @@ -1,7 +1,7 @@ /* * MIT License * - * Copyright (c) 2019-2022 Tskit Developers + * Copyright (c) 2019-2023 Tskit Developers * Copyright (c) 2015-2018 University of Oxford * * Permission is hereby granted, free of charge, to any person obtaining a copy @@ -59,9 +59,6 @@ extern "C" { #define TSK_DIR_FORWARD 1 #define TSK_DIR_REVERSE -1 -/* For the edge diff iterator */ -#define TSK_INCLUDE_TERMINAL (1 << 0) - /** @defgroup API_FLAGS_TS_INIT_GROUP :c:func:`tsk_treeseq_init` specific flags. @{ @@ -261,30 +258,6 @@ typedef struct { tsk_id_t right_index; } tsk_tree_t; -/* Diff iterator. */ -typedef struct _tsk_edge_list_node_t { - tsk_edge_t edge; - struct _tsk_edge_list_node_t *next; - struct _tsk_edge_list_node_t *prev; -} tsk_edge_list_node_t; - -typedef struct { - tsk_edge_list_node_t *head; - tsk_edge_list_node_t *tail; -} tsk_edge_list_t; - -typedef struct { - tsk_size_t num_nodes; - tsk_size_t num_edges; - double tree_left; - const tsk_treeseq_t *tree_sequence; - tsk_id_t insertion_index; - tsk_id_t removal_index; - tsk_id_t tree_index; - tsk_id_t last_index; - tsk_edge_list_node_t *edge_list_nodes; -} tsk_diff_iter_t; - /****************************************************************************/ /* Tree sequence.*/ /****************************************************************************/ @@ -1114,10 +1087,6 @@ int tsk_tree_copy(const tsk_tree_t *self, tsk_tree_t *dest, tsk_flags_t options) @{ */ -/** @brief Value returned by seeking methods when they have successfully - seeked to a non-null tree. */ -#define TSK_TREE_OK 1 - /** @brief Seek to the first tree in the sequence. @@ -1742,16 +1711,8 @@ bool tsk_tree_is_sample(const tsk_tree_t *self, tsk_id_t u); */ bool tsk_tree_equals(const tsk_tree_t *self, const tsk_tree_t *other); -/****************************************************************************/ -/* Diff iterator */ -/****************************************************************************/ - -int tsk_diff_iter_init( +int tsk_diff_iter_init_from_ts( tsk_diff_iter_t *self, const tsk_treeseq_t *tree_sequence, tsk_flags_t options); -int tsk_diff_iter_free(tsk_diff_iter_t *self); -int tsk_diff_iter_next(tsk_diff_iter_t *self, double *left, double *right, - tsk_edge_list_t *edges_out, tsk_edge_list_t *edges_in); -void tsk_diff_iter_print_state(const tsk_diff_iter_t *self, FILE *out); #ifdef __cplusplus }