From 25199ac9cd88f77db7fb724bbcc92819d4963370 Mon Sep 17 00:00:00 2001 From: peter Date: Tue, 7 Nov 2023 20:10:46 -0800 Subject: [PATCH] tskit update to C_1.1.2 --- treerec/_README | 1 + treerec/tskit/core.c | 41 +- treerec/tskit/core.h | 62 +- treerec/tskit/genotypes.c | 26 +- treerec/tskit/haplotype_matching.c | 4 +- treerec/tskit/kastore/kastore.c | 55 +- treerec/tskit/kastore/kastore.h | 2 +- treerec/tskit/tables.c | 1336 +++++++++++++++++++++++----- treerec/tskit/tables.h | 427 ++++++++- treerec/tskit/trees.c | 760 ++++++++++++---- treerec/tskit/trees.h | 121 ++- 11 files changed, 2328 insertions(+), 507 deletions(-) diff --git a/treerec/_README b/treerec/_README index a7af2d820..eccf9f403 100644 --- a/treerec/_README +++ b/treerec/_README @@ -8,6 +8,7 @@ The code in `tskit/kastore` is from [kastore/c](https://github.com/tskit-dev/kas **Changelog:** +- *7 November 2023*: updated to tskit C_1.1.2 (=0.5.5) and kastore 0.3.2) - *2 June 2022*: updated to tskit C_1.0.0 and kastore C_2.1.1 - *5 September 2020*: updated to tskit C_0.99.5 - *27 August 2020*: updated to tskit C_0.99.5 diff --git a/treerec/tskit/core.c b/treerec/tskit/core.c index 067dd4931..b1ea25bad 100644 --- a/treerec/tskit/core.c +++ b/treerec/tskit/core.c @@ -1,7 +1,7 @@ /* * MIT License * - * Copyright (c) 2019-2022 Tskit Developers + * Copyright (c) 2019-2023 Tskit Developers * Copyright (c) 2015-2018 University of Oxford * * Permission is hereby granted, free of charge, to any person obtaining a copy @@ -222,6 +222,10 @@ tsk_strerror_internal(int err) case TSK_ERR_SEEK_OUT_OF_BOUNDS: ret = "Tree seek position out of bounds. (TSK_ERR_SEEK_OUT_OF_BOUNDS)"; break; + case TSK_ERR_KEEP_ROWS_MAP_TO_DELETED: + ret = "One of the kept rows in the table refers to a deleted row. " + "(TSK_ERR_KEEP_ROWS_MAP_TO_DELETED)"; + break; /* Edge errors */ case TSK_ERR_NULL_PARENT: @@ -265,7 +269,8 @@ tsk_strerror_internal(int err) break; case TSK_ERR_BAD_EDGES_CONTRADICTORY_CHILDREN: ret = "Bad edges: contradictory children for a given parent over " - "an interval. (TSK_ERR_BAD_EDGES_CONTRADICTORY_CHILDREN)"; + "an interval, or indexes need to be rebuilt. " + "(TSK_ERR_BAD_EDGES_CONTRADICTORY_CHILDREN)"; break; case TSK_ERR_CANT_PROCESS_EDGES_WITH_METADATA: ret = "Can't squash, flush, simplify or link ancestors with edges that have " @@ -336,7 +341,7 @@ tsk_strerror_internal(int err) ret = "Duplicate sample value. (TSK_ERR_DUPLICATE_SAMPLE)"; break; case TSK_ERR_BAD_SAMPLES: - ret = "Bad sample configuration provided. (TSK_ERR_BAD_SAMPLES)"; + ret = "The nodes provided are not samples. (TSK_ERR_BAD_SAMPLES)"; break; /* Table errors */ @@ -350,8 +355,13 @@ tsk_strerror_internal(int err) case TSK_ERR_TABLES_NOT_INDEXED: ret = "Table collection must be indexed. (TSK_ERR_TABLES_NOT_INDEXED)"; break; + case TSK_ERR_TABLES_BAD_INDEXES: + ret = "Table collection indexes inconsistent: do they need to be rebuilt? " + "(TSK_ERR_TABLES_BAD_INDEXES)"; + break; case TSK_ERR_TABLE_OVERFLOW: - ret = "Table too large; cannot allocate more than 2**31 rows. " + ret = "Table too large; cannot allocate more than 2**31 rows. This error " + "is often caused by a lack of simplification when simulating. " "(TSK_ERR_TABLE_OVERFLOW)"; break; case TSK_ERR_COLUMN_OVERFLOW: @@ -408,6 +418,14 @@ tsk_strerror_internal(int err) ret = "A tree sequence can't take ownership of tables with " "TSK_NO_EDGE_METADATA. (TSK_ERR_CANT_TAKE_OWNERSHIP_NO_EDGE_METADATA)"; break; + case TSK_ERR_UNDEFINED_NONBINARY: + ret = "Operation undefined for nonbinary trees. 
" + "(TSK_ERR_UNDEFINED_NONBINARY)"; + break; + case TSK_ERR_UNDEFINED_MULTIROOT: + ret = "Operation undefined for trees that are not singly-rooted. " + "(TSK_ERR_UNDEFINED_MULTIROOT)"; + break; /* Stats errors */ case TSK_ERR_BAD_NUM_WINDOWS: @@ -500,7 +518,8 @@ tsk_strerror_internal(int err) break; case TSK_ERR_NO_SAMPLE_LISTS: ret = "The sample_lists option must be enabled on the tree to perform this " - "operation. (TSK_ERR_NO_SAMPLE_LISTS)"; + "operation. Pass the option to the constructor or method that created " + "the tree. (TSK_ERR_NO_SAMPLE_LISTS)"; break; /* Haplotype matching errors */ @@ -571,6 +590,18 @@ tsk_strerror_internal(int err) ret = "Individuals cannot be their own ancestor. " "(TSK_ERR_INDIVIDUAL_PARENT_CYCLE)"; break; + + case TSK_ERR_INDIVIDUAL_POPULATION_MISMATCH: + ret = "Individual populations cannot be returned " + "if an individual has nodes from more than one population. " + "(TSK_ERR_INDIVIDUAL_POPULATION_MISMATCH)"; + break; + + case TSK_ERR_INDIVIDUAL_TIME_MISMATCH: + ret = "Individual times cannot be returned " + "if an individual has nodes from more than one time. " + "(TSK_ERR_INDIVIDUAL_TIME_MISMATCH)"; + break; } return ret; } diff --git a/treerec/tskit/core.h b/treerec/tskit/core.h index 5fb0b9f46..b8b9f354b 100644 --- a/treerec/tskit/core.h +++ b/treerec/tskit/core.h @@ -1,7 +1,7 @@ /* * MIT License * - * Copyright (c) 2019-2022 Tskit Developers + * Copyright (c) 2019-2023 Tskit Developers * Copyright (c) 2015-2018 University of Oxford * * Permission is hereby granted, free of charge, to any person obtaining a copy @@ -123,6 +123,15 @@ specify options to API functions. typedef uint32_t tsk_flags_t; #define TSK_FLAGS_STORAGE_TYPE KAS_UINT32 +/** +@brief Boolean type. + +@rst +Fixed-size (1 byte) boolean values. +@endrst +*/ +typedef uint8_t tsk_bool_t; + // clang-format off /** @defgroup API_VERSION_GROUP API version macros. @@ -138,12 +147,12 @@ sizes and types of externally visible structs. The library minor version. Incremented when non-breaking backward-compatible changes to the API or ABI are introduced, i.e., the addition of a new function. */ -#define TSK_VERSION_MINOR 0 +#define TSK_VERSION_MINOR 1 /** The library patch version. Incremented when any changes not relevant to the to the API or ABI are introduced, i.e., internal refactors of bugfixes. */ -#define TSK_VERSION_PATCH 0 +#define TSK_VERSION_PATCH 2 /** @} */ /* @@ -172,12 +181,12 @@ Used in node flags to indicate that a node is a sample node. */ #define TSK_NODE_IS_SAMPLE 1u -/** +/** Null value used for cases such as absent id references. */ #define TSK_NULL ((tsk_id_t) -1) -/** +/** Value used for missing data in genotype arrays. */ #define TSK_MISSING_DATA (-1) @@ -206,7 +215,7 @@ whose representation differs from the NAN generated by computations such as divi /* Place the common options at the top of the space; this way we can start options for individual functions at the bottom without worrying about -clashing with the common options +clashing with the common options */ /** Turn on debugging output. Not supported by all functions. */ @@ -224,7 +233,7 @@ behaviour. */ #define TSK_NO_CHECK_INTEGRITY (1u << 29) -/** +/** Instead of taking a copy of input objects, the function should take ownership of them and manage their lifecycle. The caller specifying this flag should no longer modify or free the object or objects passed. 
See individual functions @@ -356,6 +365,12 @@ A time value was non-finite (NaN counts as finite) A genomic position was non-finite */ #define TSK_ERR_GENOME_COORDS_NONFINITE -211 +/** +One of the rows in the retained table refers to a row that has been +deleted. +*/ +#define TSK_ERR_KEEP_ROWS_MAP_TO_DELETED -212 + /** @} */ /** @@ -543,6 +558,10 @@ The table collection contains more than 2**31 trees. Metadata was attempted to be set on a table where it is disabled. */ #define TSK_ERR_METADATA_DISABLED -706 +/** +There was an error with the table's indexes. +*/ +#define TSK_ERR_TABLES_BAD_INDEXES -707 /** @} */ /** @@ -595,13 +614,22 @@ A tree sequence cannot take ownership of a table collection where TSK_NO_EDGE_METADATA. */ #define TSK_ERR_CANT_TAKE_OWNERSHIP_NO_EDGE_METADATA -809 +/** +Operation is undefined for nonbinary trees +*/ +#define TSK_ERR_UNDEFINED_NONBINARY -810 +/** +Operation is undefined for trees with multiple roots. +*/ +#define TSK_ERR_UNDEFINED_MULTIROOT -811 + /** @} */ /** @defgroup STATS_ERROR_GROUP Stats errors. @{ */ -/** +/** Zero windows were specified, at least one window must be specified. */ #define TSK_ERR_BAD_NUM_WINDOWS -900 @@ -653,17 +681,17 @@ were ``uncalibrated``. @defgroup MAPPING_ERROR_GROUP Mutation mapping errors. @{ */ -/** +/** Only missing genotypes were specified, at least one non-missing is required. */ #define TSK_ERR_GENOTYPES_ALL_MISSING -1000 -/** +/** A genotype value was greater than the maximum allowed (64) or less than TSK_MISSING_DATA (-1). */ #define TSK_ERR_BAD_GENOTYPE -1001 -/** +/** A ancestral genotype value was greater than the maximum allowed (64) or less than 0. */ @@ -760,7 +788,7 @@ specified table collection. /** The shared portions of the specified tree sequences are not equal. Note that this may be the case if the table collections were not -fully sorted before union was called. +fully sorted before union was called. */ #define TSK_ERR_UNION_DIFF_HISTORIES -1401 /** @} */ @@ -812,6 +840,16 @@ An individual was its own parent. An individual was its own ancestor in a cycle of references. */ #define TSK_ERR_INDIVIDUAL_PARENT_CYCLE -1702 +/** +An individual had nodes from more than one population +(and only one was requested). +*/ +#define TSK_ERR_INDIVIDUAL_POPULATION_MISMATCH -1703 +/** +An individual had nodes from more than one time +(and only one was requested). 
+*/ +#define TSK_ERR_INDIVIDUAL_TIME_MISMATCH -1704 /** @} */ // clang-format on diff --git a/treerec/tskit/genotypes.c b/treerec/tskit/genotypes.c index df5afbb0c..d4d1ecb08 100644 --- a/treerec/tskit/genotypes.c +++ b/treerec/tskit/genotypes.c @@ -463,7 +463,6 @@ tsk_variant_decode( tsk_id_t allele_index; tsk_size_t j, num_missing; int no_longer_missing; - tsk_mutation_t mutation; bool impute_missing = !!(self->options & TSK_ISOLATED_NOT_MISSING); bool by_traversal = self->alt_samples != NULL; @@ -485,7 +484,7 @@ tsk_variant_decode( goto out; } - /* When we have a no specified samples we need sample lists to be active + /* When we have no specified samples we need sample lists to be active * on the tree, as indicated by the presence of left_sample */ if (!by_traversal && self->tree.left_sample == NULL) { ret = TSK_ERR_NO_SAMPLE_LISTS; @@ -582,11 +581,11 @@ tsk_variant_restricted_copy(const tsk_variant_t *self, tsk_variant_t *other) tsk_size_t total_len, offset, j; /* Copy everything */ - tsk_memcpy(other, self, sizeof(tsk_variant_t)); + tsk_memcpy(other, self, sizeof(*other)); /* Tree sequence left as NULL and zero'd tree is a way of indicating this variant is * fixed and cannot be further decoded. */ other->tree_sequence = NULL; - tsk_memset(&other->tree, sizeof(tsk_tree_t), 0); + tsk_memset(&other->tree, sizeof(other->tree), 0); other->traversal_stack = NULL; other->samples = NULL; other->sample_index_map = NULL; @@ -598,27 +597,32 @@ tsk_variant_restricted_copy(const tsk_variant_t *self, tsk_variant_t *other) for (j = 0; j < self->num_alleles; j++) { total_len += self->allele_lengths[j]; } - other->genotypes = tsk_malloc(other->num_samples * sizeof(int32_t)); - other->user_alleles_mem = tsk_malloc(total_len * sizeof(char *)); + other->samples = tsk_malloc(other->num_samples * sizeof(*other->samples)); + other->genotypes = tsk_malloc(other->num_samples * sizeof(*other->genotypes)); + other->user_alleles_mem = tsk_malloc(total_len * sizeof(*other->user_alleles_mem)); other->allele_lengths = tsk_malloc(other->num_alleles * sizeof(*other->allele_lengths)); other->alleles = tsk_malloc(other->num_alleles * sizeof(*other->alleles)); - if (other->genotypes == NULL || other->user_alleles_mem == NULL - || other->allele_lengths == NULL || other->alleles == NULL) { + if (other->samples == NULL || other->genotypes == NULL + || other->user_alleles_mem == NULL || other->allele_lengths == NULL + || other->alleles == NULL) { ret = TSK_ERR_NO_MEMORY; goto out; } - - tsk_memcpy(other->genotypes, self->genotypes, other->num_samples * sizeof(int32_t)); + tsk_memcpy( + other->samples, self->samples, other->num_samples * sizeof(*other->samples)); + tsk_memcpy(other->genotypes, self->genotypes, + other->num_samples * sizeof(*other->genotypes)); tsk_memcpy(other->allele_lengths, self->allele_lengths, other->num_alleles * sizeof(*other->allele_lengths)); offset = 0; for (j = 0; j < other->num_alleles; j++) { tsk_memcpy(other->user_alleles_mem + offset, self->alleles[j], - other->allele_lengths[j] * sizeof(char *)); + other->allele_lengths[j] * sizeof(*other->user_alleles_mem)); other->alleles[j] = other->user_alleles_mem + offset; offset += other->allele_lengths[j]; } + out: return ret; } diff --git a/treerec/tskit/haplotype_matching.c b/treerec/tskit/haplotype_matching.c index 41c1bd23a..b942da18d 100644 --- a/treerec/tskit/haplotype_matching.c +++ b/treerec/tskit/haplotype_matching.c @@ -1,7 +1,7 @@ /* * MIT License * - * Copyright (c) 2019-2022 Tskit Developers + * Copyright (c) 2019-2023 Tskit Developers * 
* Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -250,7 +250,7 @@ tsk_ls_hmm_reset(tsk_ls_hmm_t *self) /* This is safe because we've already zero'd out the memory. */ tsk_diff_iter_free(&self->diffs); - ret = tsk_diff_iter_init(&self->diffs, self->tree_sequence, false); + ret = tsk_diff_iter_init_from_ts(&self->diffs, self->tree_sequence, false); if (ret != 0) { goto out; } diff --git a/treerec/tskit/kastore/kastore.c b/treerec/tskit/kastore/kastore.c index 155cd567b..1f7143b01 100644 --- a/treerec/tskit/kastore/kastore.c +++ b/treerec/tskit/kastore/kastore.c @@ -683,14 +683,39 @@ kastore_close(kastore_t *self) return ret; } +static int +kastore_find_item(kastore_t *self, const char *key, size_t key_len, kaitem_t **item) +{ + int ret = KAS_ERR_KEY_NOT_FOUND; + kaitem_t search; + search.key = (char *) malloc(key_len); + search.key_len = key_len; + + if (self->mode != KAS_READ) { + ret = KAS_ERR_ILLEGAL_OPERATION; + goto out; + } + if (search.key == NULL) { + ret = KAS_ERR_NO_MEMORY; + goto out; + } + memcpy(search.key, key, key_len); + *item = bsearch( + &search, self->items, self->num_items, sizeof(kaitem_t), compare_items); + if (*item == NULL) { + goto out; + } + ret = 0; +out: + kas_safe_free(search.key); + return ret; +} + int KAS_WARN_UNUSED kastore_contains(kastore_t *self, const char *key, size_t key_len) { - void *array; - size_t array_len; - int type; - int ret = kastore_get(self, key, key_len, &array, &array_len, &type); - + kaitem_t *item; + int ret = kastore_find_item(self, key, key_len, &item); if (ret == 0) { ret = 1; } else if (ret == KAS_ERR_KEY_NOT_FOUND) { @@ -709,24 +734,9 @@ int KAS_WARN_UNUSED kastore_get(kastore_t *self, const char *key, size_t key_len, void **array, size_t *array_len, int *type) { - int ret = KAS_ERR_KEY_NOT_FOUND; - kaitem_t search; kaitem_t *item; - search.key = (char *) malloc(key_len); - search.key_len = key_len; - - if (self->mode != KAS_READ) { - ret = KAS_ERR_ILLEGAL_OPERATION; - goto out; - } - if (search.key == NULL) { - ret = KAS_ERR_NO_MEMORY; - goto out; - } - memcpy(search.key, key, key_len); - item = bsearch( - &search, self->items, self->num_items, sizeof(kaitem_t), compare_items); - if (item == NULL) { + int ret = kastore_find_item(self, key, key_len, &item); + if (ret != 0) { goto out; } if (item->array == NULL) { @@ -743,7 +753,6 @@ kastore_get(kastore_t *self, const char *key, size_t key_len, void **array, } ret = 0; out: - kas_safe_free(search.key); return ret; } diff --git a/treerec/tskit/kastore/kastore.h b/treerec/tskit/kastore/kastore.h index ee08280fe..fbd48cec2 100644 --- a/treerec/tskit/kastore/kastore.h +++ b/treerec/tskit/kastore/kastore.h @@ -157,7 +157,7 @@ to the API or ABI are introduced, i.e., the addition of a new function. The library patch version. Incremented when any changes not relevant to the to the API or ABI are introduced, i.e., internal refactors of bugfixes. 
*/ -#define KAS_VERSION_PATCH 0 +#define KAS_VERSION_PATCH 1 /** @} */ #define KAS_HEADER_SIZE 64 diff --git a/treerec/tskit/tables.c b/treerec/tskit/tables.c index b5526f492..8eea85f5a 100644 --- a/treerec/tskit/tables.c +++ b/treerec/tskit/tables.c @@ -1,7 +1,7 @@ /* * MIT License * - * Copyright (c) 2019-2022 Tskit Developers + * Copyright (c) 2019-2023 Tskit Developers * Copyright (c) 2017-2018 University of Oxford * * Permission is hereby granted, free of charge, to any person obtaining a copy @@ -79,13 +79,14 @@ typedef struct { } write_table_ragged_col_t; /* Returns true if adding the specified number of rows would result in overflow. - * Tables can support indexes from 0 to TSK_MAX_ID, and therefore have at most - * TSK_MAX_ID + 1 rows */ + * Tables can support indexes from 0 to TSK_MAX_ID, and therefore could have at most + * TSK_MAX_ID + 1 rows. However we limit to TSK_MAX_ID rows so that counts of rows + * can fit in a tsk_id_t. */ static bool check_table_overflow(tsk_size_t current_size, tsk_size_t additional_rows) { - tsk_size_t max_val = TSK_MAX_ID + (tsk_size_t) 1; - return current_size > (max_val - additional_rows); + tsk_size_t max_val = TSK_MAX_ID; + return additional_rows > max_val || current_size > (max_val - additional_rows); } /* Returns true if adding the specified number of elements would result in overflow @@ -94,7 +95,9 @@ check_table_overflow(tsk_size_t current_size, tsk_size_t additional_rows) static bool check_offset_overflow(tsk_size_t current_size, tsk_size_t additional_elements) { - return current_size > (TSK_MAX_SIZE - additional_elements); + tsk_size_t max_val = TSK_MAX_SIZE; + return additional_elements > max_val + || current_size > (max_val - additional_elements); } #define TSK_NUM_ROWS_UNSET ((tsk_size_t) -1) @@ -729,6 +732,188 @@ write_metadata_schema_header( return fprintf(out, fmt, (int) metadata_schema_length, metadata_schema); } +/* Utilities for in-place subsetting columns */ + +static tsk_size_t +count_true(tsk_size_t num_rows, const tsk_bool_t *restrict keep) +{ + tsk_size_t j; + tsk_size_t count = 0; + + for (j = 0; j < num_rows; j++) { + if (keep[j]) { + count++; + } + } + return count; +} + +static void +keep_mask_to_id_map( + tsk_size_t num_rows, const tsk_bool_t *restrict keep, tsk_id_t *restrict id_map) +{ + tsk_size_t j; + tsk_id_t next_id = 0; + + for (j = 0; j < num_rows; j++) { + id_map[j] = TSK_NULL; + if (keep[j]) { + id_map[j] = next_id; + next_id++; + } + } +} + +static tsk_size_t +subset_remap_id_column(tsk_id_t *restrict column, tsk_size_t num_rows, + const tsk_bool_t *restrict keep, const tsk_id_t *restrict id_map) +{ + tsk_size_t j, k; + tsk_id_t value; + + k = 0; + for (j = 0; j < num_rows; j++) { + if (keep[j]) { + value = column[j]; + if (value != TSK_NULL) { + value = id_map[value]; + } + column[k] = value; + k++; + } + } + return k; +} + +/* Trigger warning: C++ programmers should look away... This may be one of the + * few cases where some macro funkiness is warranted, as these are exact + * duplicates of the same function with just the type of the column + * parameter changed. 
*/ + +static tsk_size_t +subset_id_column( + tsk_id_t *restrict column, tsk_size_t num_rows, const tsk_bool_t *restrict keep) +{ + tsk_size_t j, k; + + k = 0; + for (j = 0; j < num_rows; j++) { + if (keep[j]) { + column[k] = column[j]; + k++; + } + } + return k; +} + +static tsk_size_t +subset_flags_column( + tsk_flags_t *restrict column, tsk_size_t num_rows, const tsk_bool_t *restrict keep) +{ + tsk_size_t j, k; + + k = 0; + for (j = 0; j < num_rows; j++) { + if (keep[j]) { + column[k] = column[j]; + k++; + } + } + return k; +} + +static tsk_size_t +subset_double_column( + double *restrict column, tsk_size_t num_rows, const tsk_bool_t *restrict keep) +{ + tsk_size_t j, k; + + k = 0; + for (j = 0; j < num_rows; j++) { + if (keep[j]) { + column[k] = column[j]; + k++; + } + } + return k; +} + +static tsk_size_t +subset_ragged_char_column(char *restrict data, tsk_size_t *restrict offset_col, + tsk_size_t num_rows, const tsk_bool_t *restrict keep) +{ + tsk_size_t j, k, i, offset; + + k = 0; + offset = 0; + for (j = 0; j < num_rows; j++) { + if (keep[j]) { + offset_col[k] = offset; + /* Note: Unclear whether it's worth calling memcpy instead here? + * Need to be careful since the regions are overlapping */ + for (i = offset_col[j]; i < offset_col[j + 1]; i++) { + data[offset] = data[i]; + offset++; + } + k++; + } + } + offset_col[k] = offset; + return offset; +} + +static tsk_size_t +subset_ragged_double_column(double *restrict data, tsk_size_t *restrict offset_col, + tsk_size_t num_rows, const tsk_bool_t *restrict keep) +{ + tsk_size_t j, k, i, offset; + + k = 0; + offset = 0; + for (j = 0; j < num_rows; j++) { + if (keep[j]) { + offset_col[k] = offset; + /* Note: Unclear whether it's worth calling memcpy instead here? + * Need to be careful since the regions are overlapping */ + for (i = offset_col[j]; i < offset_col[j + 1]; i++) { + data[offset] = data[i]; + offset++; + } + k++; + } + } + offset_col[k] = offset; + return offset; +} + +static tsk_size_t +subset_remap_ragged_id_column(tsk_id_t *restrict data, tsk_size_t *restrict offset_col, + tsk_size_t num_rows, const tsk_bool_t *restrict keep, + const tsk_id_t *restrict id_map) +{ + tsk_size_t j, k, i, offset; + tsk_id_t di; + + k = 0; + offset = 0; + for (j = 0; j < num_rows; j++) { + if (keep[j]) { + offset_col[k] = offset; + for (i = offset_col[j]; i < offset_col[j + 1]; i++) { + di = data[i]; + if (di != TSK_NULL) { + di = id_map[di]; + } + data[offset] = di; + offset++; + } + k++; + } + } + offset_col[k] = offset; + return offset; +} + /************************* * reference sequence *************************/ @@ -1555,10 +1740,10 @@ tsk_individual_table_dump_text(const tsk_individual_table_t *self, FILE *out) } } } - err = fprintf(out, "\t"); - if (err < 0) { - goto out; - } + err = fprintf(out, "\t"); + if (err < 0) { + goto out; + } for (k = self->parents_offset[j]; k < self->parents_offset[j + 1]; k++) { err = fprintf(out, "%lld", (long long) self->parents[k]); if (err < 0) { @@ -1619,6 +1804,71 @@ tsk_individual_table_equals(const tsk_individual_table_t *self, return ret; } +int +tsk_individual_table_keep_rows(tsk_individual_table_t *self, const tsk_bool_t *keep, + tsk_flags_t TSK_UNUSED(options), tsk_id_t *ret_id_map) +{ + int ret = 0; + const tsk_size_t current_num_rows = self->num_rows; + tsk_size_t j, k, remaining_rows; + tsk_id_t pk; + tsk_id_t *id_map = ret_id_map; + tsk_id_t *restrict parents = self->parents; + tsk_size_t *restrict parents_offset = self->parents_offset; + + if (ret_id_map == NULL) { + id_map = 
tsk_malloc(current_num_rows * sizeof(*id_map)); + if (id_map == NULL) { + ret = TSK_ERR_NO_MEMORY; + goto out; + } + } + + keep_mask_to_id_map(current_num_rows, keep, id_map); + + /* See notes in tsk_mutation_table_keep_rows for possibilities + * on making this more flexible */ + for (j = 0; j < current_num_rows; j++) { + if (keep[j]) { + for (k = parents_offset[j]; k < parents_offset[j + 1]; k++) { + pk = parents[k]; + if (pk != TSK_NULL) { + if (pk < 0 || pk >= (tsk_id_t) current_num_rows) { + ret = TSK_ERR_INDIVIDUAL_OUT_OF_BOUNDS; + ; + goto out; + } + if (id_map[pk] == TSK_NULL) { + ret = TSK_ERR_KEEP_ROWS_MAP_TO_DELETED; + goto out; + } + } + } + } + } + + remaining_rows = subset_flags_column(self->flags, current_num_rows, keep); + self->parents_length = subset_remap_ragged_id_column( + self->parents, self->parents_offset, current_num_rows, keep, id_map); + self->location_length = subset_ragged_double_column( + self->location, self->location_offset, current_num_rows, keep); + if (self->metadata_length > 0) { + /* Implementation note: we special case metadata here because + * it'll make the common-case of no metadata a bit faster, and + * to also potentially support more general use of the + * TSK_TABLE_NO_METADATA option. This is done for all the tables + * but only commented on here. */ + self->metadata_length = subset_ragged_char_column( + self->metadata, self->metadata_offset, current_num_rows, keep); + } + self->num_rows = remaining_rows; +out: + if (ret_id_map == NULL) { + tsk_safe_free(id_map); + } + return ret; +} + static int tsk_individual_table_dump( const tsk_individual_table_t *self, kastore_t *store, tsk_flags_t options) @@ -2268,6 +2518,29 @@ tsk_node_table_get_row(const tsk_node_table_t *self, tsk_id_t index, tsk_node_t return ret; } +int +tsk_node_table_keep_rows(tsk_node_table_t *self, const tsk_bool_t *keep, + tsk_flags_t TSK_UNUSED(options), tsk_id_t *id_map) +{ + int ret = 0; + tsk_size_t remaining_rows; + + if (id_map != NULL) { + keep_mask_to_id_map(self->num_rows, keep, id_map); + } + + remaining_rows = subset_flags_column(self->flags, self->num_rows, keep); + subset_double_column(self->time, self->num_rows, keep); + subset_id_column(self->population, self->num_rows, keep); + subset_id_column(self->individual, self->num_rows, keep); + if (self->metadata_length > 0) { + self->metadata_length = subset_ragged_char_column( + self->metadata, self->metadata_offset, self->num_rows, keep); + } + self->num_rows = remaining_rows; + return ret; +} + static int tsk_node_table_dump(const tsk_node_table_t *self, kastore_t *store, tsk_flags_t options) { @@ -2937,6 +3210,29 @@ tsk_edge_table_equals( return ret; } +int +tsk_edge_table_keep_rows(tsk_edge_table_t *self, const tsk_bool_t *keep, + tsk_flags_t TSK_UNUSED(options), tsk_id_t *id_map) +{ + int ret = 0; + tsk_size_t remaining_rows; + + if (id_map != NULL) { + keep_mask_to_id_map(self->num_rows, keep, id_map); + } + remaining_rows = subset_double_column(self->left, self->num_rows, keep); + subset_double_column(self->right, self->num_rows, keep); + subset_id_column(self->parent, self->num_rows, keep); + subset_id_column(self->child, self->num_rows, keep); + if (self->metadata_length > 0) { + tsk_bug_assert(!(self->options & TSK_TABLE_NO_METADATA)); + self->metadata_length = subset_ragged_char_column( + self->metadata, self->metadata_offset, self->num_rows, keep); + } + self->num_rows = remaining_rows; + return ret; +} + static int tsk_edge_table_dump(const tsk_edge_table_t *self, kastore_t *store, tsk_flags_t options) { 
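
The hunks above and below add a `keep_rows` method to each table, all with the same shape: a `tsk_bool_t` mask with one entry per row, an options argument (currently unused), and an optional `id_map` that receives the old-row-to-new-row mapping (`TSK_NULL` for dropped rows). A minimal usage sketch follows; it is illustrative only and not part of the patch — the helper name, the node ID, and the `<tskit/tables.h>` include path (which differs in this vendored copy) are assumptions.

#include <stdlib.h>
#include <tskit/tables.h>   /* in this tree: treerec/tskit/tables.h */

/* Illustrative sketch: drop every edge whose parent is `node_to_drop`.
 * Error handling is abbreviated; `tables` is assumed to be initialised. */
static int
drop_edges_for_parent(tsk_table_collection_t *tables, tsk_id_t node_to_drop)
{
    tsk_size_t j, n = tables->edges.num_rows;
    tsk_bool_t *keep = malloc(n * sizeof(*keep));
    int ret;

    if (keep == NULL) {
        return TSK_ERR_NO_MEMORY;
    }
    for (j = 0; j < n; j++) {
        /* Nonzero entries mark rows that survive the call. */
        keep[j] = (tsk_bool_t) (tables->edges.parent[j] != node_to_drop);
    }
    /* id_map is NULL here because no other table column stores edge IDs;
     * for tables that are referenced elsewhere (nodes, sites, mutations,
     * individuals) a caller would pass an array of num_rows tsk_id_t to
     * capture the remapping. */
    ret = tsk_edge_table_keep_rows(&tables->edges, keep, 0, NULL);
    free(keep);
    return ret;
}
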
@@ -3672,6 +3968,28 @@ tsk_site_table_dump_text(const tsk_site_table_t *self, FILE *out) return ret; } +int +tsk_site_table_keep_rows(tsk_site_table_t *self, const tsk_bool_t *keep, + tsk_flags_t TSK_UNUSED(options), tsk_id_t *id_map) +{ + int ret = 0; + tsk_size_t remaining_rows; + + if (id_map != NULL) { + keep_mask_to_id_map(self->num_rows, keep, id_map); + } + + remaining_rows = subset_double_column(self->position, self->num_rows, keep); + self->ancestral_state_length = subset_ragged_char_column( + self->ancestral_state, self->ancestral_state_offset, self->num_rows, keep); + if (self->metadata_length > 0) { + self->metadata_length = subset_ragged_char_column( + self->metadata, self->metadata_offset, self->num_rows, keep); + } + self->num_rows = remaining_rows; + return ret; +} + static int tsk_site_table_dump(const tsk_site_table_t *self, kastore_t *store, tsk_flags_t options) { @@ -4415,6 +4733,65 @@ tsk_mutation_table_dump_text(const tsk_mutation_table_t *self, FILE *out) return ret; } +int +tsk_mutation_table_keep_rows(tsk_mutation_table_t *self, const tsk_bool_t *keep, + tsk_flags_t TSK_UNUSED(options), tsk_id_t *ret_id_map) +{ + int ret = 0; + const tsk_size_t current_num_rows = self->num_rows; + tsk_size_t j, remaining_rows; + tsk_id_t pj; + tsk_id_t *id_map = ret_id_map; + tsk_id_t *restrict parent = self->parent; + + if (ret_id_map == NULL) { + id_map = tsk_malloc(current_num_rows * sizeof(*id_map)); + if (id_map == NULL) { + ret = TSK_ERR_NO_MEMORY; + goto out; + } + } + + keep_mask_to_id_map(current_num_rows, keep, id_map); + + /* Note: we could add some options to avoid these checks if we wanted. + * MAP_DELETED_TO_NULL is an obvious one, and I guess it might be + * helpful to also provide NO_REMAP to prevent reference remapping + * entirely. 
*/ + for (j = 0; j < current_num_rows; j++) { + if (keep[j]) { + pj = parent[j]; + if (pj != TSK_NULL) { + if (pj < 0 || pj >= (tsk_id_t) current_num_rows) { + ret = TSK_ERR_MUTATION_OUT_OF_BOUNDS; + goto out; + } + if (id_map[pj] == TSK_NULL) { + ret = TSK_ERR_KEEP_ROWS_MAP_TO_DELETED; + goto out; + } + } + } + } + + remaining_rows = subset_id_column(self->site, current_num_rows, keep); + subset_id_column(self->node, current_num_rows, keep); + subset_remap_id_column(parent, current_num_rows, keep, id_map); + subset_double_column(self->time, current_num_rows, keep); + self->derived_state_length = subset_ragged_char_column( + self->derived_state, self->derived_state_offset, current_num_rows, keep); + if (self->metadata_length > 0) { + self->metadata_length = subset_ragged_char_column( + self->metadata, self->metadata_offset, current_num_rows, keep); + } + self->num_rows = remaining_rows; +out: + if (ret_id_map == NULL) { + tsk_safe_free(id_map); + } + return ret; +} + static int tsk_mutation_table_dump( const tsk_mutation_table_t *self, kastore_t *store, tsk_flags_t options) @@ -5060,6 +5437,31 @@ tsk_migration_table_equals(const tsk_migration_table_t *self, return ret; } +int +tsk_migration_table_keep_rows(tsk_migration_table_t *self, const tsk_bool_t *keep, + tsk_flags_t TSK_UNUSED(options), tsk_id_t *id_map) +{ + int ret = 0; + tsk_size_t remaining_rows; + + if (id_map != NULL) { + keep_mask_to_id_map(self->num_rows, keep, id_map); + } + + remaining_rows = subset_double_column(self->left, self->num_rows, keep); + subset_double_column(self->right, self->num_rows, keep); + subset_id_column(self->node, self->num_rows, keep); + subset_id_column(self->source, self->num_rows, keep); + subset_id_column(self->dest, self->num_rows, keep); + subset_double_column(self->time, self->num_rows, keep); + if (self->metadata_length > 0) { + self->metadata_length = subset_ragged_char_column( + self->metadata, self->metadata_offset, self->num_rows, keep); + } + self->num_rows = remaining_rows; + return ret; +} + static int tsk_migration_table_dump( const tsk_migration_table_t *self, kastore_t *store, tsk_flags_t options) @@ -5629,6 +6031,24 @@ tsk_population_table_equals(const tsk_population_table_t *self, return ret; } +int +tsk_population_table_keep_rows(tsk_population_table_t *self, const tsk_bool_t *keep, + tsk_flags_t TSK_UNUSED(options), tsk_id_t *id_map) +{ + int ret = 0; + + if (id_map != NULL) { + keep_mask_to_id_map(self->num_rows, keep, id_map); + } + + if (self->metadata_length > 0) { + self->metadata_length = subset_ragged_char_column( + self->metadata, self->metadata_offset, self->num_rows, keep); + } + self->num_rows = count_true(self->num_rows, keep); + return ret; +} + static int tsk_population_table_dump( const tsk_population_table_t *self, kastore_t *store, tsk_flags_t options) @@ -6241,6 +6661,24 @@ tsk_provenance_table_equals(const tsk_provenance_table_t *self, return ret; } +int +tsk_provenance_table_keep_rows(tsk_provenance_table_t *self, const tsk_bool_t *keep, + tsk_flags_t TSK_UNUSED(options), tsk_id_t *id_map) +{ + int ret = 0; + + if (id_map != NULL) { + keep_mask_to_id_map(self->num_rows, keep, id_map); + } + self->timestamp_length = subset_ragged_char_column( + self->timestamp, self->timestamp_offset, self->num_rows, keep); + self->record_length = subset_ragged_char_column( + self->record, self->record_offset, self->num_rows, keep); + self->num_rows = count_true(self->num_rows, keep); + + return ret; +} + static int tsk_provenance_table_dump( const tsk_provenance_table_t *self, 
kastore_t *store, tsk_flags_t options) @@ -7156,7 +7594,6 @@ typedef struct { } segment_overlapper_t; typedef struct { - tsk_id_t *samples; tsk_size_t num_samples; tsk_flags_t options; tsk_table_collection_t *tables; @@ -7165,6 +7602,7 @@ typedef struct { /* State for topology */ tsk_segment_t **ancestor_map_head; tsk_segment_t **ancestor_map_tail; + /* Mapping of input node IDs to output node IDs. */ tsk_id_t *node_id_map; bool *is_sample; /* Segments for a particular parent that are processed together */ @@ -7182,8 +7620,6 @@ typedef struct { tsk_size_t num_buffered_children; /* For each mutation, map its output node. */ tsk_id_t *mutation_node_map; - /* Map of input mutation IDs to output mutation IDs. */ - tsk_id_t *mutation_id_map; /* Map of input nodes to the list of input mutation IDs */ mutation_id_list_t **node_mutation_list_map_head; mutation_id_list_t **node_mutation_list_map_tail; @@ -7369,6 +7805,7 @@ typedef struct { tsk_id_t *buffered_children; tsk_size_t num_buffered_children; double sequence_length; + double oldest_node_time; } ancestor_mapper_t; static tsk_segment_t *TSK_WARN_UNUSED @@ -7509,6 +7946,23 @@ ancestor_mapper_add_ancestry(ancestor_mapper_t *self, tsk_id_t input_id, double return ret; } +static void +ancestor_mapper_find_oldest_node(ancestor_mapper_t *self) +{ + const double *node_time = self->tables->nodes.time; + tsk_size_t j; + double max_time = -1; + + for (j = 0; j < self->num_ancestors; j++) { + max_time = TSK_MAX(max_time, node_time[self->ancestors[j]]); + } + for (j = 0; j < self->num_samples; j++) { + max_time = TSK_MAX(max_time, node_time[self->samples[j]]); + } + + self->oldest_node_time = max_time; +} + static int ancestor_mapper_init_samples(ancestor_mapper_t *self, tsk_id_t *samples) { @@ -7623,6 +8077,7 @@ ancestor_mapper_init(ancestor_mapper_t *self, tsk_id_t *samples, tsk_size_t num_ if (ret != 0) { goto out; } + ancestor_mapper_find_oldest_node(self); ret = tsk_edge_table_clear(self->result); if (ret != 0) { goto out; @@ -7802,6 +8257,8 @@ ancestor_mapper_run(ancestor_mapper_t *self) tsk_id_t parent, current_parent; const tsk_edge_table_t *input_edges = &self->tables->edges; tsk_size_t num_edges = input_edges->num_rows; + const double *node_time = self->tables->nodes.time; + bool early_exit = false; if (num_edges > 0) { start = 0; @@ -7814,14 +8271,21 @@ ancestor_mapper_run(ancestor_mapper_t *self) if (ret != 0) { goto out; } - current_parent = parent; start = j; + current_parent = parent; + if (node_time[current_parent] > self->oldest_node_time) { + early_exit = true; + break; + } } } - ret = ancestor_mapper_process_parent_edges( - self, current_parent, start, num_edges); - if (ret != 0) { - goto out; + if (!early_exit) { + /* If we didn't break out of the loop early, we need to still process + * the final parent */ + ret = ancestor_mapper_process_parent_edges(self, current_parent, start, j); + if (ret != 0) { + goto out; + } } } out: @@ -8666,6 +9130,8 @@ simplifier_print_state(simplifier_t *self, FILE *out) fprintf(out, "options:\n"); fprintf(out, "\tfilter_unreferenced_sites : %d\n", !!(self->options & TSK_SIMPLIFY_FILTER_SITES)); + fprintf(out, "\tno_filter_nodes : %d\n", + !!(self->options & TSK_SIMPLIFY_NO_FILTER_NODES)); fprintf(out, "\treduce_to_site_topology : %d\n", !!(self->options & TSK_SIMPLIFY_REDUCE_TO_SITE_TOPOLOGY)); fprintf(out, "\tkeep_unary : %d\n", @@ -8774,19 +9240,21 @@ simplifier_alloc_interval_list(simplifier_t *self, double left, double right) /* Add a new node to the output node table corresponding to the specified 
input id. * Returns the new ID. */ static tsk_id_t TSK_WARN_UNUSED -simplifier_record_node(simplifier_t *self, tsk_id_t input_id, bool is_sample) +simplifier_record_node(simplifier_t *self, tsk_id_t input_id) { tsk_node_t node; - tsk_flags_t flags; + bool update_flags = !(self->options & TSK_SIMPLIFY_NO_UPDATE_SAMPLE_FLAGS); tsk_node_table_get_row_unsafe(&self->input_tables.nodes, (tsk_id_t) input_id, &node); - /* Zero out the sample bit */ - flags = node.flags & (tsk_flags_t) ~TSK_NODE_IS_SAMPLE; - if (is_sample) { - flags |= TSK_NODE_IS_SAMPLE; + if (update_flags) { + /* Zero out the sample bit */ + node.flags &= (tsk_flags_t) ~TSK_NODE_IS_SAMPLE; + if (self->is_sample[input_id]) { + node.flags |= TSK_NODE_IS_SAMPLE; + } } self->node_id_map[input_id] = (tsk_id_t) self->tables->nodes.num_rows; - return tsk_node_table_add_row(&self->tables->nodes, flags, node.time, + return tsk_node_table_add_row(&self->tables->nodes, node.flags, node.time, node.population, node.individual, node.metadata, node.metadata_length); } @@ -8845,7 +9313,7 @@ simplifier_init_position_lookup(simplifier_t *self) goto out; } self->position_lookup[0] = 0; - self->position_lookup[num_sites + 1] = self->tables->sequence_length; + self->position_lookup[num_sites + 1] = self->input_tables.sequence_length; tsk_memcpy(self->position_lookup + 1, self->input_tables.sites.position, num_sites * sizeof(double)); out: @@ -8889,7 +9357,7 @@ simplifier_record_edge(simplifier_t *self, double left, double right, tsk_id_t c interval_list_t *tail, *x; bool skip; - if (!!(self->options & TSK_SIMPLIFY_REDUCE_TO_SITE_TOPOLOGY)) { + if (self->options & TSK_SIMPLIFY_REDUCE_TO_SITE_TOPOLOGY) { skip = simplifier_map_reduced_coordinates(self, &left, &right); /* NOTE: we exit early here when reduce_coordindates has told us to * skip this edge, as it is not visible in the reduced tree sequence */ @@ -8935,8 +9403,6 @@ simplifier_init_sites(simplifier_t *self) mutation_id_list_t *list_node; tsk_size_t j; - self->mutation_id_map - = tsk_calloc(self->input_tables.mutations.num_rows, sizeof(tsk_id_t)); self->mutation_node_map = tsk_calloc(self->input_tables.mutations.num_rows, sizeof(tsk_id_t)); self->node_mutation_list_mem @@ -8945,15 +9411,12 @@ simplifier_init_sites(simplifier_t *self) = tsk_calloc(self->input_tables.nodes.num_rows, sizeof(mutation_id_list_t *)); self->node_mutation_list_map_tail = tsk_calloc(self->input_tables.nodes.num_rows, sizeof(mutation_id_list_t *)); - if (self->mutation_id_map == NULL || self->mutation_node_map == NULL - || self->node_mutation_list_mem == NULL + if (self->mutation_node_map == NULL || self->node_mutation_list_mem == NULL || self->node_mutation_list_map_head == NULL || self->node_mutation_list_map_tail == NULL) { ret = TSK_ERR_NO_MEMORY; goto out; } - tsk_memset(self->mutation_id_map, 0xff, - self->input_tables.mutations.num_rows * sizeof(tsk_id_t)); tsk_memset(self->mutation_node_map, 0xff, self->input_tables.mutations.num_rows * sizeof(tsk_id_t)); @@ -9027,32 +9490,96 @@ simplifier_add_ancestry( return ret; } +/* Sets up the internal working copies of the various tables, as needed + * depending on the specified options. 
*/ +static int +simplifier_init_tables(simplifier_t *self) +{ + int ret; + bool filter_nodes = !(self->options & TSK_SIMPLIFY_NO_FILTER_NODES); + bool filter_populations = self->options & TSK_SIMPLIFY_FILTER_POPULATIONS; + bool filter_individuals = self->options & TSK_SIMPLIFY_FILTER_INDIVIDUALS; + bool filter_sites = self->options & TSK_SIMPLIFY_FILTER_SITES; + tsk_bookmark_t rows_to_retain; + + /* NOTE: this is a bit inefficient here as we're taking copies of + * the tables even in the no-filter case where the original tables + * won't be touched (beyond references to external tables that may + * need updating). Future versions may do something a bit more + * complicated like temporarily stealing the pointers to the + * underlying column memory in these tables, and then being careful + * not to free the table at the end. + */ + ret = tsk_table_collection_copy(self->tables, &self->input_tables, 0); + if (ret != 0) { + goto out; + } + memset(&rows_to_retain, 0, sizeof(rows_to_retain)); + rows_to_retain.provenances = self->tables->provenances.num_rows; + if (!filter_nodes) { + rows_to_retain.nodes = self->tables->nodes.num_rows; + } + if (!filter_populations) { + rows_to_retain.populations = self->tables->populations.num_rows; + } + if (!filter_individuals) { + rows_to_retain.individuals = self->tables->individuals.num_rows; + } + if (!filter_sites) { + rows_to_retain.sites = self->tables->sites.num_rows; + } + + ret = tsk_table_collection_truncate(self->tables, &rows_to_retain); + if (ret != 0) { + goto out; + } +out: + return ret; +} + static int -simplifier_init_samples(simplifier_t *self, const tsk_id_t *samples) +simplifier_init_nodes(simplifier_t *self, const tsk_id_t *samples) { int ret = 0; tsk_id_t node_id; tsk_size_t j; - - /* Go through the samples to check for errors. */ - for (j = 0; j < self->num_samples; j++) { - if (samples[j] < 0 - || samples[j] > (tsk_id_t) self->input_tables.nodes.num_rows) { - ret = TSK_ERR_NODE_OUT_OF_BOUNDS; - goto out; + const tsk_size_t num_nodes = self->input_tables.nodes.num_rows; + bool filter_nodes = !(self->options & TSK_SIMPLIFY_NO_FILTER_NODES); + bool update_flags = !(self->options & TSK_SIMPLIFY_NO_UPDATE_SAMPLE_FLAGS); + tsk_flags_t *node_flags = self->tables->nodes.flags; + tsk_id_t *node_id_map = self->node_id_map; + + if (filter_nodes) { + tsk_bug_assert(self->tables->nodes.num_rows == 0); + /* The node table has been cleared. Add nodes for the samples. 
*/ + for (j = 0; j < self->num_samples; j++) { + node_id = simplifier_record_node(self, samples[j]); + if (node_id < 0) { + ret = (int) node_id; + goto out; + } } - if (self->is_sample[samples[j]]) { - ret = TSK_ERR_DUPLICATE_SAMPLE; - goto out; + } else { + tsk_bug_assert(self->tables->nodes.num_rows == num_nodes); + if (update_flags) { + for (j = 0; j < num_nodes; j++) { + /* Reset the sample flags */ + node_flags[j] &= (tsk_flags_t) ~TSK_NODE_IS_SAMPLE; + if (self->is_sample[j]) { + node_flags[j] |= TSK_NODE_IS_SAMPLE; + } + } } - self->is_sample[samples[j]] = true; - node_id = simplifier_record_node(self, samples[j], true); - if (node_id < 0) { - ret = (int) node_id; - goto out; + + for (j = 0; j < num_nodes; j++) { + node_id_map[j] = (tsk_id_t) j; } - ret = simplifier_add_ancestry( - self, samples[j], 0, self->tables->sequence_length, node_id); + } + /* Add the initial ancestry */ + for (j = 0; j < self->num_samples; j++) { + node_id = samples[j]; + ret = simplifier_add_ancestry(self, node_id, 0, + self->input_tables.sequence_length, self->node_id_map[node_id]); if (ret != 0) { goto out; } @@ -9066,6 +9593,7 @@ simplifier_init(simplifier_t *self, const tsk_id_t *samples, tsk_size_t num_samp tsk_table_collection_t *tables, tsk_flags_t options) { int ret = 0; + tsk_size_t j; tsk_id_t ret_id; tsk_size_t num_nodes; @@ -9087,19 +9615,6 @@ simplifier_init(simplifier_t *self, const tsk_id_t *samples, tsk_size_t num_samp goto out; } - ret = tsk_table_collection_copy(self->tables, &self->input_tables, 0); - if (ret != 0) { - goto out; - } - - /* Take a copy of the input samples */ - self->samples = tsk_malloc(num_samples * sizeof(tsk_id_t)); - if (self->samples == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - tsk_memcpy(self->samples, samples, num_samples * sizeof(tsk_id_t)); - /* Allocate the heaps used for small objects-> Assuming 8K is a good chunk size */ ret = tsk_blkalloc_init(&self->segment_heap, 8192); @@ -9133,26 +9648,40 @@ simplifier_init(simplifier_t *self, const tsk_id_t *samples, tsk_size_t num_samp ret = TSK_ERR_NO_MEMORY; goto out; } - ret = tsk_table_collection_clear(self->tables, 0); + + /* Go through the samples to check for errors before we clear the tables. 
*/ + for (j = 0; j < self->num_samples; j++) { + if (samples[j] < 0 || samples[j] >= (tsk_id_t) num_nodes) { + ret = TSK_ERR_NODE_OUT_OF_BOUNDS; + goto out; + } + if (self->is_sample[samples[j]]) { + ret = TSK_ERR_DUPLICATE_SAMPLE; + goto out; + } + self->is_sample[samples[j]] = true; + } + tsk_memset(self->node_id_map, 0xff, num_nodes * sizeof(tsk_id_t)); + + ret = simplifier_init_tables(self); if (ret != 0) { goto out; } - tsk_memset( - self->node_id_map, 0xff, self->input_tables.nodes.num_rows * sizeof(tsk_id_t)); ret = simplifier_init_sites(self); if (ret != 0) { goto out; } - ret = simplifier_init_samples(self, samples); + ret = simplifier_init_nodes(self, samples); if (ret != 0) { goto out; } - if (!!(self->options & TSK_SIMPLIFY_REDUCE_TO_SITE_TOPOLOGY)) { + if (self->options & TSK_SIMPLIFY_REDUCE_TO_SITE_TOPOLOGY) { ret = simplifier_init_position_lookup(self); if (ret != 0) { goto out; } } + self->edge_sort_offset = TSK_NULL; out: return ret; @@ -9165,7 +9694,6 @@ simplifier_free(simplifier_t *self) tsk_blkalloc_free(&self->segment_heap); tsk_blkalloc_free(&self->interval_list_heap); segment_overlapper_free(&self->segment_overlapper); - tsk_safe_free(self->samples); tsk_safe_free(self->ancestor_map_head); tsk_safe_free(self->ancestor_map_tail); tsk_safe_free(self->child_edge_map_head); @@ -9173,7 +9701,6 @@ simplifier_free(simplifier_t *self) tsk_safe_free(self->node_id_map); tsk_safe_free(self->segment_queue); tsk_safe_free(self->is_sample); - tsk_safe_free(self->mutation_id_map); tsk_safe_free(self->mutation_node_map); tsk_safe_free(self->node_mutation_list_mem); tsk_safe_free(self->node_mutation_list_map_head); @@ -9221,12 +9748,10 @@ simplifier_merge_ancestors(simplifier_t *self, tsk_id_t input_id) double left, right, prev_right; tsk_id_t ancestry_node; tsk_id_t output_id = self->node_id_map[input_id]; + bool is_sample = self->is_sample[input_id]; + bool filter_nodes = !(self->options & TSK_SIMPLIFY_NO_FILTER_NODES); + bool keep_unary = self->options & TSK_SIMPLIFY_KEEP_UNARY; - bool is_sample = output_id != TSK_NULL; - bool keep_unary = false; - if (self->options & TSK_SIMPLIFY_KEEP_UNARY) { - keep_unary = true; - } if ((self->options & TSK_SIMPLIFY_KEEP_UNARY_IN_INDIVIDUALS) && (self->input_tables.nodes.individual[input_id] != TSK_NULL)) { keep_unary = true; @@ -9261,7 +9786,7 @@ simplifier_merge_ancestors(simplifier_t *self, tsk_id_t input_id) ancestry_node = output_id; } else if (keep_unary) { if (output_id == TSK_NULL) { - output_id = simplifier_record_node(self, input_id, false); + output_id = simplifier_record_node(self, input_id); } ret = simplifier_record_edge(self, left, right, ancestry_node); if (ret != 0) { @@ -9270,7 +9795,7 @@ simplifier_merge_ancestors(simplifier_t *self, tsk_id_t input_id) } } else { if (output_id == TSK_NULL) { - output_id = simplifier_record_node(self, input_id, false); + output_id = simplifier_record_node(self, input_id); if (output_id < 0) { ret = (int) output_id; goto out; @@ -9317,7 +9842,7 @@ simplifier_merge_ancestors(simplifier_t *self, tsk_id_t input_id) if (ret != 0) { goto out; } - if (num_flushed_edges == 0 && !is_sample) { + if (filter_nodes && (num_flushed_edges == 0) && !is_sample) { ret = simplifier_rewind_node(self, input_id, output_id); } } @@ -9423,133 +9948,60 @@ simplifier_process_parent_edges( } static int TSK_WARN_UNUSED -simplifier_output_sites(simplifier_t *self) +simplifier_finalise_site_references( + simplifier_t *self, const bool *site_referenced, tsk_id_t *site_id_map) { int ret = 0; tsk_id_t ret_id; - tsk_id_t 
input_site; - tsk_id_t input_mutation, mapped_parent, site_start, site_end; - tsk_id_t num_input_sites = (tsk_id_t) self->input_tables.sites.num_rows; - tsk_id_t num_input_mutations = (tsk_id_t) self->input_tables.mutations.num_rows; - tsk_id_t num_output_mutations, num_output_site_mutations; - tsk_id_t mapped_node; - bool keep_site; - bool filter_sites = !!(self->options & TSK_SIMPLIFY_FILTER_SITES); + tsk_size_t j; tsk_site_t site; - tsk_mutation_t mutation; - - input_mutation = 0; - num_output_mutations = 0; - for (input_site = 0; input_site < num_input_sites; input_site++) { - tsk_site_table_get_row_unsafe( - &self->input_tables.sites, (tsk_id_t) input_site, &site); - site_start = input_mutation; - num_output_site_mutations = 0; - while (input_mutation < num_input_mutations - && self->input_tables.mutations.site[input_mutation] == site.id) { - mapped_node = self->mutation_node_map[input_mutation]; - if (mapped_node != TSK_NULL) { - self->mutation_id_map[input_mutation] = num_output_mutations; - num_output_mutations++; - num_output_site_mutations++; - } - input_mutation++; - } - site_end = input_mutation; - - keep_site = true; - if (filter_sites && num_output_site_mutations == 0) { - keep_site = false; - } - if (keep_site) { - for (input_mutation = site_start; input_mutation < site_end; - input_mutation++) { - if (self->mutation_id_map[input_mutation] != TSK_NULL) { - tsk_bug_assert( - self->tables->mutations.num_rows - == (tsk_size_t) self->mutation_id_map[input_mutation]); - mapped_node = self->mutation_node_map[input_mutation]; - tsk_bug_assert(mapped_node != TSK_NULL); - mapped_parent = self->input_tables.mutations.parent[input_mutation]; - if (mapped_parent != TSK_NULL) { - mapped_parent = self->mutation_id_map[mapped_parent]; - } - tsk_mutation_table_get_row_unsafe(&self->input_tables.mutations, - (tsk_id_t) input_mutation, &mutation); - ret_id = tsk_mutation_table_add_row(&self->tables->mutations, - (tsk_id_t) self->tables->sites.num_rows, mapped_node, - mapped_parent, mutation.time, mutation.derived_state, - mutation.derived_state_length, mutation.metadata, - mutation.metadata_length); - if (ret_id < 0) { - ret = (int) ret_id; - goto out; - } + const tsk_size_t num_sites = self->input_tables.sites.num_rows; + + if (self->options & TSK_SIMPLIFY_FILTER_SITES) { + for (j = 0; j < num_sites; j++) { + tsk_site_table_get_row_unsafe( + &self->input_tables.sites, (tsk_id_t) j, &site); + site_id_map[j] = TSK_NULL; + if (site_referenced[j]) { + ret_id = tsk_site_table_add_row(&self->tables->sites, site.position, + site.ancestral_state, site.ancestral_state_length, site.metadata, + site.metadata_length); + if (ret_id < 0) { + ret = (int) ret_id; + goto out; } - } - ret_id = tsk_site_table_add_row(&self->tables->sites, site.position, - site.ancestral_state, site.ancestral_state_length, site.metadata, - site.metadata_length); - if (ret_id < 0) { - ret = (int) ret_id; - goto out; + site_id_map[j] = ret_id; } } - tsk_bug_assert( - num_output_mutations == (tsk_id_t) self->tables->mutations.num_rows); - input_mutation = site_end; + } else { + tsk_bug_assert(self->tables->sites.num_rows == num_sites); + for (j = 0; j < num_sites; j++) { + site_id_map[j] = (tsk_id_t) j; + } } - tsk_bug_assert(input_mutation == num_input_mutations); - ret = 0; out: return ret; } static int TSK_WARN_UNUSED -simplifier_finalise_references(simplifier_t *self) +simplifier_finalise_population_references(simplifier_t *self) { int ret = 0; - tsk_id_t ret_id; tsk_size_t j; - bool keep; - tsk_size_t num_nodes = 
self->tables->nodes.num_rows; - + tsk_id_t pop_id, ret_id; tsk_population_t pop; - tsk_id_t pop_id; - tsk_size_t num_populations = self->input_tables.populations.num_rows; tsk_id_t *node_population = self->tables->nodes.population; + const tsk_size_t num_nodes = self->tables->nodes.num_rows; + const tsk_size_t num_populations = self->input_tables.populations.num_rows; bool *population_referenced = tsk_calloc(num_populations, sizeof(*population_referenced)); tsk_id_t *population_id_map = tsk_malloc(num_populations * sizeof(*population_id_map)); - bool filter_populations = !!(self->options & TSK_SIMPLIFY_FILTER_POPULATIONS); - tsk_individual_t ind; - tsk_id_t ind_id; - tsk_size_t num_individuals = self->input_tables.individuals.num_rows; - tsk_id_t *node_individual = self->tables->nodes.individual; - bool *individual_referenced - = tsk_calloc(num_individuals, sizeof(*individual_referenced)); - tsk_id_t *individual_id_map - = tsk_malloc(num_individuals * sizeof(*individual_id_map)); - bool filter_individuals = !!(self->options & TSK_SIMPLIFY_FILTER_INDIVIDUALS); - - if (population_referenced == NULL || population_id_map == NULL - || individual_referenced == NULL || individual_id_map == NULL) { - goto out; - } + tsk_bug_assert(self->options & TSK_SIMPLIFY_FILTER_POPULATIONS); - /* TODO Migrations fit reasonably neatly into the pattern that we have here. We - * can consider references to populations from migration objects in the same way - * as from nodes, so that we only remove a population if its referenced by - * neither. Mapping the population IDs in migrations is then easy. In principle - * nodes are similar, but the semantics are slightly different because we've - * already allocated all the nodes by their references from edges. We then - * need to decide whether we remove migrations that reference unmapped nodes - * or whether to add these nodes back in (probably the former is the correct - * approach).*/ - if (self->input_tables.migrations.num_rows != 0) { - ret = TSK_ERR_SIMPLIFY_MIGRATIONS_NOT_SUPPORTED; + if (population_referenced == NULL || population_id_map == NULL) { + ret = TSK_ERR_NO_MEMORY; goto out; } @@ -9558,20 +10010,13 @@ simplifier_finalise_references(simplifier_t *self) if (pop_id != TSK_NULL) { population_referenced[pop_id] = true; } - ind_id = node_individual[j]; - if (ind_id != TSK_NULL) { - individual_referenced[ind_id] = true; - } } + for (j = 0; j < num_populations; j++) { tsk_population_table_get_row_unsafe( &self->input_tables.populations, (tsk_id_t) j, &pop); - keep = true; - if (filter_populations && !population_referenced[j]) { - keep = false; - } population_id_map[j] = TSK_NULL; - if (keep) { + if (population_referenced[j]) { ret_id = tsk_population_table_add_row( &self->tables->populations, pop.metadata, pop.metadata_length); if (ret_id < 0) { @@ -9582,15 +10027,56 @@ simplifier_finalise_references(simplifier_t *self) } } + /* Remap the IDs in the node table */ + for (j = 0; j < num_nodes; j++) { + pop_id = node_population[j]; + if (pop_id != TSK_NULL) { + node_population[j] = population_id_map[pop_id]; + } + } +out: + tsk_safe_free(population_id_map); + tsk_safe_free(population_referenced); + return ret; +} + +static int TSK_WARN_UNUSED +simplifier_finalise_individual_references(simplifier_t *self) +{ + int ret = 0; + tsk_size_t j; + tsk_id_t pop_id, ret_id; + tsk_individual_t ind; + tsk_id_t *node_individual = self->tables->nodes.individual; + tsk_id_t *parents; + const tsk_size_t num_nodes = self->tables->nodes.num_rows; + const tsk_size_t num_individuals 
= self->input_tables.individuals.num_rows; + bool *individual_referenced + = tsk_calloc(num_individuals, sizeof(*individual_referenced)); + tsk_id_t *individual_id_map + = tsk_malloc(num_individuals * sizeof(*individual_id_map)); + + tsk_bug_assert(self->options & TSK_SIMPLIFY_FILTER_INDIVIDUALS); + + if (individual_referenced == NULL || individual_id_map == NULL) { + ret = TSK_ERR_NO_MEMORY; + goto out; + } + + for (j = 0; j < num_nodes; j++) { + pop_id = node_individual[j]; + if (pop_id != TSK_NULL) { + individual_referenced[pop_id] = true; + } + } + for (j = 0; j < num_individuals; j++) { tsk_individual_table_get_row_unsafe( &self->input_tables.individuals, (tsk_id_t) j, &ind); - keep = true; - if (filter_individuals && !individual_referenced[j]) { - keep = false; - } individual_id_map[j] = TSK_NULL; - if (keep) { + if (individual_referenced[j]) { + /* Can't remap the parents inline here because we have no + * guarantees about sortedness */ ret_id = tsk_individual_table_add_row(&self->tables->individuals, ind.flags, ind.location, ind.location_length, ind.parents, ind.parents_length, ind.metadata, ind.metadata_length); @@ -9602,32 +10088,128 @@ simplifier_finalise_references(simplifier_t *self) } } - /* Remap parent IDs */ - for (j = 0; j < self->tables->individuals.parents_length; j++) { - self->tables->individuals.parents[j] - = self->tables->individuals.parents[j] == TSK_NULL - ? TSK_NULL - : individual_id_map[self->tables->individuals.parents[j]]; - } - - /* Remap node IDs referencing the above */ + /* Remap the IDs in the node table */ for (j = 0; j < num_nodes; j++) { - pop_id = node_population[j]; + pop_id = node_individual[j]; if (pop_id != TSK_NULL) { - node_population[j] = population_id_map[pop_id]; + node_individual[j] = individual_id_map[pop_id]; } - ind_id = node_individual[j]; - if (ind_id != TSK_NULL) { - node_individual[j] = individual_id_map[ind_id]; + } + + /* Remap parent IDs. * + * NOTE! 
must take the pointer reference here as it can change from + * the start of the function */ + parents = self->tables->individuals.parents; + for (j = 0; j < self->tables->individuals.parents_length; j++) { + if (parents[j] != TSK_NULL) { + parents[j] = individual_id_map[parents[j]]; } } - ret = 0; out: - tsk_safe_free(population_referenced); - tsk_safe_free(individual_referenced); - tsk_safe_free(population_id_map); tsk_safe_free(individual_id_map); + tsk_safe_free(individual_referenced); + return ret; +} + +static int TSK_WARN_UNUSED +simplifier_output_sites(simplifier_t *self) +{ + int ret = 0; + tsk_id_t ret_id; + tsk_size_t j; + tsk_mutation_t mutation; + const tsk_size_t num_sites = self->input_tables.sites.num_rows; + const tsk_size_t num_mutations = self->input_tables.mutations.num_rows; + bool *site_referenced = tsk_calloc(num_sites, sizeof(*site_referenced)); + tsk_id_t *site_id_map = tsk_malloc(num_sites * sizeof(*site_id_map)); + tsk_id_t *mutation_id_map = tsk_malloc(num_mutations * sizeof(*mutation_id_map)); + const tsk_id_t *mutation_node_map = self->mutation_node_map; + const tsk_id_t *mutation_site = self->input_tables.mutations.site; + + if (site_referenced == NULL || site_id_map == NULL || mutation_id_map == NULL) { + ret = TSK_ERR_NO_MEMORY; + goto out; + } + + for (j = 0; j < num_mutations; j++) { + if (mutation_node_map[j] != TSK_NULL) { + site_referenced[mutation_site[j]] = true; + } + } + ret = simplifier_finalise_site_references(self, site_referenced, site_id_map); + if (ret != 0) { + goto out; + } + + for (j = 0; j < num_mutations; j++) { + mutation_id_map[j] = TSK_NULL; + if (mutation_node_map[j] != TSK_NULL) { + tsk_mutation_table_get_row_unsafe( + &self->input_tables.mutations, (tsk_id_t) j, &mutation); + mutation.node = mutation_node_map[j]; + mutation.site = site_id_map[mutation.site]; + if (mutation.parent != TSK_NULL) { + mutation.parent = mutation_id_map[mutation.parent]; + } + ret_id = tsk_mutation_table_add_row(&self->tables->mutations, mutation.site, + mutation.node, mutation.parent, mutation.time, mutation.derived_state, + mutation.derived_state_length, mutation.metadata, + mutation.metadata_length); + if (ret_id < 0) { + ret = (int) ret_id; + goto out; + } + mutation_id_map[j] = ret_id; + } + } +out: + tsk_safe_free(site_referenced); + tsk_safe_free(site_id_map); + tsk_safe_free(mutation_id_map); + return ret; +} + +/* Flush the remaining non-edge and node data in the model to the + * output tables. */ +static int TSK_WARN_UNUSED +simplifier_flush_output(simplifier_t *self) +{ + int ret = 0; + + /* TODO Migrations fit reasonably neatly into the pattern that we have here. We + * can consider references to populations from migration objects in the same way + * as from nodes, so that we only remove a population if its referenced by + * neither. Mapping the population IDs in migrations is then easy. In principle + * nodes are similar, but the semantics are slightly different because we've + * already allocated all the nodes by their references from edges. 
We then + * need to decide whether we remove migrations that reference unmapped nodes + * or whether to add these nodes back in (probably the former is the correct + * approach).*/ + if (self->input_tables.migrations.num_rows != 0) { + ret = TSK_ERR_SIMPLIFY_MIGRATIONS_NOT_SUPPORTED; + goto out; + } + + ret = simplifier_output_sites(self); + if (ret != 0) { + goto out; + } + + if (self->options & TSK_SIMPLIFY_FILTER_POPULATIONS) { + ret = simplifier_finalise_population_references(self); + if (ret != 0) { + goto out; + } + } + if (self->options & TSK_SIMPLIFY_FILTER_INDIVIDUALS) { + ret = simplifier_finalise_individual_references(self); + if (ret != 0) { + goto out; + } + } + +out: return ret; } @@ -9676,7 +10258,7 @@ simplifier_insert_input_roots(simplifier_t *self) if (x != NULL) { output_id = self->node_id_map[input_id]; if (output_id == TSK_NULL) { - output_id = simplifier_record_node(self, input_id, false); + output_id = simplifier_record_node(self, input_id); if (output_id < 0) { ret = (int) output_id; goto out; @@ -9741,11 +10323,7 @@ simplifier_run(simplifier_t *self, tsk_id_t *node_map) goto out; } } - ret = simplifier_output_sites(self); - if (ret != 0) { - goto out; - } - ret = simplifier_finalise_references(self); + ret = simplifier_flush_output(self); if (ret != 0) { goto out; } @@ -10254,7 +10832,7 @@ tsk_table_collection_check_tree_integrity(const tsk_table_collection_t *self) { tsk_id_t ret = 0; tsk_size_t j, k; - tsk_id_t u, site, mutation; + tsk_id_t e, u, site, mutation; double tree_left, tree_right; const double sequence_length = self->sequence_length; const tsk_id_t num_sites = (tsk_id_t) self->sites.num_rows; @@ -10272,14 +10850,17 @@ tsk_table_collection_check_tree_integrity(const tsk_table_collection_t *self) const tsk_id_t *restrict edge_child = self->edges.child; const tsk_id_t *restrict edge_parent = self->edges.parent; tsk_id_t *restrict parent = NULL; + int8_t *restrict used_edges = NULL; tsk_id_t num_trees = 0; parent = tsk_malloc(self->nodes.num_rows * sizeof(*parent)); - if (parent == NULL) { + used_edges = tsk_malloc(num_edges * sizeof(*used_edges)); + if (parent == NULL || used_edges == NULL) { ret = TSK_ERR_NO_MEMORY; goto out; } tsk_memset(parent, 0xff, self->nodes.num_rows * sizeof(*parent)); + tsk_memset(used_edges, 0, num_edges * sizeof(*used_edges)); tree_left = 0; num_trees = 0; @@ -10288,19 +10869,32 @@ tsk_table_collection_check_tree_integrity(const tsk_table_collection_t *self) site = 0; mutation = 0; tsk_bug_assert(I != NULL && O != NULL); + tsk_bug_assert(self->indexes.num_edges == num_edges); while (j < num_edges || tree_left < sequence_length) { while (k < num_edges && edge_right[O[k]] == tree_left) { - parent[edge_child[O[k]]] = TSK_NULL; + e = O[k]; + if (used_edges[e] != 1) { + ret = TSK_ERR_TABLES_BAD_INDEXES; + goto out; + } + parent[edge_child[e]] = TSK_NULL; + used_edges[e]++; k++; } while (j < num_edges && edge_left[I[j]] == tree_left) { - u = edge_child[I[j]]; + e = I[j]; + if (used_edges[e] != 0) { + ret = TSK_ERR_TABLES_BAD_INDEXES; + goto out; + } + used_edges[e]++; + u = edge_child[e]; if (parent[u] != TSK_NULL) { ret = TSK_ERR_BAD_EDGES_CONTRADICTORY_CHILDREN; goto out; } - parent[u] = edge_parent[I[j]]; + parent[u] = edge_parent[e]; j++; } tree_right = sequence_length; @@ -10323,6 +10917,10 @@ tsk_table_collection_check_tree_integrity(const tsk_table_collection_t *self) } site++; } + if (tree_right <= tree_left) { + ret = TSK_ERR_TABLES_BAD_INDEXES; + goto out; + } tree_left = tree_right; /* This is technically possible; if we 
have 2**31 edges each defining * a single tree, and there's a gap between each of these edges we @@ -10333,12 +10931,28 @@ tsk_table_collection_check_tree_integrity(const tsk_table_collection_t *self) } num_trees++; } + tsk_bug_assert(j == num_edges); + while (k < num_edges) { + /* At this point it must be that used_edges[O[k]] == 1, + * since otherwise we would have added a different edge twice, + * and so hit the error above. */ + e = O[k]; + if (edge_right[e] != sequence_length) { + ret = TSK_ERR_TABLES_BAD_INDEXES; + goto out; + } + used_edges[e]++; + k++; + } ret = num_trees; out: /* Can't use tsk_safe_free because of restrict*/ if (parent != NULL) { free(parent); } + if (used_edges != NULL) { + free(used_edges); + } return ret; } @@ -11972,6 +12586,116 @@ tsk_table_collection_compute_mutation_times( return ret; } +int TSK_WARN_UNUSED +tsk_table_collection_delete_older( + tsk_table_collection_t *self, double time, tsk_flags_t TSK_UNUSED(options)) +{ + int ret = 0; + tsk_edge_t edge; + tsk_mutation_t mutation; + tsk_migration_t migration; + tsk_edge_table_t edges; + tsk_mutation_table_t mutations; + tsk_migration_table_t migrations; + const double *restrict node_time = self->nodes.time; + tsk_id_t j, ret_id, parent; + double mutation_time; + tsk_id_t *mutation_map = NULL; + + memset(&edges, 0, sizeof(edges)); + memset(&mutations, 0, sizeof(mutations)); + memset(&migrations, 0, sizeof(migrations)); + + ret = tsk_edge_table_copy(&self->edges, &edges, 0); + if (ret != 0) { + goto out; + } + ret = tsk_edge_table_clear(&self->edges); + if (ret != 0) { + goto out; + } + for (j = 0; j < (tsk_id_t) edges.num_rows; j++) { + tsk_edge_table_get_row_unsafe(&edges, j, &edge); + if (node_time[edge.parent] <= time) { + ret_id = tsk_edge_table_add_row(&self->edges, edge.left, edge.right, + edge.parent, edge.child, edge.metadata, edge.metadata_length); + if (ret_id < 0) { + ret = (int) ret_id; + goto out; + } + } + } + /* Calling x_table_free multiple times is safe, so get rid of the + * extra edge table memory as soon as we can. */ + tsk_edge_table_free(&edges); + + mutation_map = tsk_malloc(self->mutations.num_rows * sizeof(*mutation_map)); + if (mutation_map == NULL) { + ret = TSK_ERR_NO_MEMORY; + goto out; + } + ret = tsk_mutation_table_copy(&self->mutations, &mutations, 0); + if (ret != 0) { + goto out; + } + ret = tsk_mutation_table_clear(&self->mutations); + if (ret != 0) { + goto out; + } + for (j = 0; j < (tsk_id_t) mutations.num_rows; j++) { + tsk_mutation_table_get_row_unsafe(&mutations, j, &mutation); + mutation_time = tsk_is_unknown_time(mutation.time) ? 
node_time[mutation.node] + : mutation.time; + mutation_map[j] = TSK_NULL; + if (mutation_time < time) { + ret_id = tsk_mutation_table_add_row(&self->mutations, mutation.site, + mutation.node, mutation.parent, mutation.time, mutation.derived_state, + mutation.derived_state_length, mutation.metadata, + mutation.metadata_length); + if (ret_id < 0) { + ret = (int) ret_id; + goto out; + } + mutation_map[j] = ret_id; + } + } + tsk_mutation_table_free(&mutations); + for (j = 0; j < (tsk_id_t) self->mutations.num_rows; j++) { + parent = self->mutations.parent[j]; + if (parent != TSK_NULL) { + self->mutations.parent[j] = mutation_map[parent]; + } + } + + ret = tsk_migration_table_copy(&self->migrations, &migrations, 0); + if (ret != 0) { + goto out; + } + ret = tsk_migration_table_clear(&self->migrations); + if (ret != 0) { + goto out; + } + for (j = 0; j < (tsk_id_t) migrations.num_rows; j++) { + tsk_migration_table_get_row_unsafe(&migrations, j, &migration); + if (migration.time < time) { + ret_id = tsk_migration_table_add_row(&self->migrations, migration.left, + migration.right, migration.node, migration.source, migration.dest, + migration.time, migration.metadata, migration.metadata_length); + if (ret_id < 0) { + ret = (int) ret_id; + goto out; + } + } + } + tsk_migration_table_free(&migrations); +out: + tsk_edge_table_free(&edges); + tsk_mutation_table_free(&mutations); + tsk_migration_table_free(&migrations); + tsk_safe_free(mutation_map); + return ret; +} + int tsk_table_collection_record_num_rows( const tsk_table_collection_t *self, tsk_bookmark_t *position) @@ -12721,3 +13445,165 @@ tsk_squash_edges(tsk_edge_t *edges, tsk_size_t num_edges, tsk_size_t *num_output out: return ret; } + +/* ======================================================== * + * Tree diff iterator. 
+ * ======================================================== */ + +int TSK_WARN_UNUSED +tsk_diff_iter_init(tsk_diff_iter_t *self, const tsk_table_collection_t *tables, + tsk_id_t num_trees, tsk_flags_t options) +{ + int ret = 0; + + tsk_bug_assert(tables != NULL); + tsk_memset(self, 0, sizeof(tsk_diff_iter_t)); + self->num_nodes = tables->nodes.num_rows; + self->num_edges = tables->edges.num_rows; + self->tables = tables; + self->insertion_index = 0; + self->removal_index = 0; + self->tree_left = 0; + self->tree_index = -1; + if (num_trees < 0) { + num_trees = tsk_table_collection_check_integrity(self->tables, TSK_CHECK_TREES); + if (num_trees < 0) { + ret = (int) num_trees; + goto out; + } + } + self->last_index = num_trees; + + if (options & TSK_INCLUDE_TERMINAL) { + self->last_index = self->last_index + 1; + } + self->edge_list_nodes = tsk_malloc(self->num_edges * sizeof(*self->edge_list_nodes)); + if (self->edge_list_nodes == NULL) { + ret = TSK_ERR_NO_MEMORY; + goto out; + } +out: + return ret; +} + +int +tsk_diff_iter_free(tsk_diff_iter_t *self) +{ + tsk_safe_free(self->edge_list_nodes); + return 0; +} + +void +tsk_diff_iter_print_state(const tsk_diff_iter_t *self, FILE *out) +{ + fprintf(out, "tree_diff_iterator state\n"); + fprintf(out, "num_edges = %lld\n", (long long) self->num_edges); + fprintf(out, "insertion_index = %lld\n", (long long) self->insertion_index); + fprintf(out, "removal_index = %lld\n", (long long) self->removal_index); + fprintf(out, "tree_left = %f\n", self->tree_left); + fprintf(out, "tree_index = %lld\n", (long long) self->tree_index); +} + +int TSK_WARN_UNUSED +tsk_diff_iter_next(tsk_diff_iter_t *self, double *ret_left, double *ret_right, + tsk_edge_list_t *edges_out_ret, tsk_edge_list_t *edges_in_ret) +{ + int ret = 0; + tsk_id_t k; + const double sequence_length = self->tables->sequence_length; + double left = self->tree_left; + double right = sequence_length; + tsk_size_t next_edge_list_node = 0; + tsk_edge_list_node_t *out_head = NULL; + tsk_edge_list_node_t *out_tail = NULL; + tsk_edge_list_node_t *in_head = NULL; + tsk_edge_list_node_t *in_tail = NULL; + tsk_edge_list_node_t *w = NULL; + tsk_edge_list_t edges_out; + tsk_edge_list_t edges_in; + const tsk_edge_table_t *edges = &self->tables->edges; + const tsk_id_t *insertion_order = self->tables->indexes.edge_insertion_order; + const tsk_id_t *removal_order = self->tables->indexes.edge_removal_order; + + tsk_memset(&edges_out, 0, sizeof(edges_out)); + tsk_memset(&edges_in, 0, sizeof(edges_in)); + + if (self->tree_index + 1 < self->last_index) { + /* First we remove the stale records */ + while (self->removal_index < (tsk_id_t) self->num_edges + && left == edges->right[removal_order[self->removal_index]]) { + k = removal_order[self->removal_index]; + tsk_bug_assert(next_edge_list_node < self->num_edges); + w = &self->edge_list_nodes[next_edge_list_node]; + next_edge_list_node++; + w->edge.id = k; + w->edge.left = edges->left[k]; + w->edge.right = edges->right[k]; + w->edge.parent = edges->parent[k]; + w->edge.child = edges->child[k]; + w->edge.metadata = edges->metadata + edges->metadata_offset[k]; + w->edge.metadata_length + = edges->metadata_offset[k + 1] - edges->metadata_offset[k]; + w->next = NULL; + w->prev = NULL; + if (out_head == NULL) { + out_head = w; + out_tail = w; + } else { + out_tail->next = w; + w->prev = out_tail; + out_tail = w; + } + self->removal_index++; + } + edges_out.head = out_head; + edges_out.tail = out_tail; + + /* Now insert the new records */ + while (self->insertion_index 
< (tsk_id_t) self->num_edges + && left == edges->left[insertion_order[self->insertion_index]]) { + k = insertion_order[self->insertion_index]; + tsk_bug_assert(next_edge_list_node < self->num_edges); + w = &self->edge_list_nodes[next_edge_list_node]; + next_edge_list_node++; + w->edge.id = k; + w->edge.left = edges->left[k]; + w->edge.right = edges->right[k]; + w->edge.parent = edges->parent[k]; + w->edge.child = edges->child[k]; + w->edge.metadata = edges->metadata + edges->metadata_offset[k]; + w->edge.metadata_length + = edges->metadata_offset[k + 1] - edges->metadata_offset[k]; + w->next = NULL; + w->prev = NULL; + if (in_head == NULL) { + in_head = w; + in_tail = w; + } else { + in_tail->next = w; + w->prev = in_tail; + in_tail = w; + } + self->insertion_index++; + } + edges_in.head = in_head; + edges_in.tail = in_tail; + + right = sequence_length; + if (self->insertion_index < (tsk_id_t) self->num_edges) { + right = TSK_MIN(right, edges->left[insertion_order[self->insertion_index]]); + } + if (self->removal_index < (tsk_id_t) self->num_edges) { + right = TSK_MIN(right, edges->right[removal_order[self->removal_index]]); + } + self->tree_index++; + ret = TSK_TREE_OK; + } + *edges_out_ret = edges_out; + *edges_in_ret = edges_in; + *ret_left = left; + *ret_right = right; + /* Set the left coordinate for the next tree */ + self->tree_left = right; + return ret; +} diff --git a/treerec/tskit/tables.h b/treerec/tskit/tables.h index 54f6ca796..38f3096c9 100644 --- a/treerec/tskit/tables.h +++ b/treerec/tskit/tables.h @@ -1,7 +1,7 @@ /* * MIT License * - * Copyright (c) 2019-2022 Tskit Developers + * Copyright (c) 2019-2023 Tskit Developers * Copyright (c) 2017-2018 University of Oxford * * Permission is hereby granted, free of charge, to any person obtaining a copy @@ -670,6 +670,30 @@ typedef struct { bool store_pairs; } tsk_identity_segments_t; +/* Diff iterator. */ +typedef struct _tsk_edge_list_node_t { + tsk_edge_t edge; + struct _tsk_edge_list_node_t *next; + struct _tsk_edge_list_node_t *prev; +} tsk_edge_list_node_t; + +typedef struct { + tsk_edge_list_node_t *head; + tsk_edge_list_node_t *tail; +} tsk_edge_list_t; + +typedef struct { + tsk_size_t num_nodes; + tsk_size_t num_edges; + double tree_left; + const tsk_table_collection_t *tables; + tsk_id_t insertion_index; + tsk_id_t removal_index; + tsk_id_t tree_index; + tsk_id_t last_index; + tsk_edge_list_node_t *edge_list_nodes; +} tsk_diff_iter_t; + /****************************************************************************/ /* Common function options */ /****************************************************************************/ @@ -686,6 +710,17 @@ reference them. */ #define TSK_SIMPLIFY_FILTER_POPULATIONS (1 << 1) /** Remove individuals from the output if there are no nodes that reference them.*/ #define TSK_SIMPLIFY_FILTER_INDIVIDUALS (1 << 2) +/** Do not remove nodes from the output if there are no edges that reference +them and do not reorder nodes so that the samples are nodes 0 to num_samples - 1. +Note that this flag is negated compared to other filtering options because +the default behaviour is to filter unreferenced nodes and reorder to put samples +first. +*/ +#define TSK_SIMPLIFY_NO_FILTER_NODES (1 << 7) +/** +Do not update the sample status of nodes as a result of simplification. +*/ +#define TSK_SIMPLIFY_NO_UPDATE_SAMPLE_FLAGS (1 << 8) /** Reduce the topological information in the tables to the minimum necessary to represent the trees that contain sites. 
If there are zero sites this will @@ -881,6 +916,16 @@ top-level information of the table collections being compared. #define TSK_CLEAR_PROVENANCE (1 << 2) /** @} */ +/* For the edge diff iterator */ +#define TSK_INCLUDE_TERMINAL (1 << 0) + +/** @brief Value returned by seeking methods when they have successfully + seeked to a non-null tree. + + @ingroup TREE_API_SEEKING_GROUP +*/ +#define TSK_TREE_OK 1 + /****************************************************************************/ /* Function signatures */ /****************************************************************************/ @@ -1032,6 +1077,55 @@ int tsk_individual_table_extend(tsk_individual_table_t *self, const tsk_individual_table_t *other, tsk_size_t num_rows, const tsk_id_t *row_indexes, tsk_flags_t options); +/** +@brief Subset this table by keeping rows according to a boolean mask. + +@rst +Deletes rows from this table and optionally return the mapping from IDs in +the current table to the updated table. Rows are kept or deleted according to +the specified boolean array ``keep`` such that for each row ``j`` if +``keep[j]`` is false (zero) the row is deleted, and otherwise the row is +retained. Thus, ``keep`` must be an array of at least ``num_rows`` +:c:type:`bool` values. + +If the ``id_map`` argument is non-null, this array will be updated to represent +the mapping between IDs before and after row deletion. For row ``j``, +``id_map[j]`` will contain the new ID for row ``j`` if it is retained, or +:c:macro:`TSK_NULL` if the row has been removed. Thus, ``id_map`` must be an +array of at least ``num_rows`` :c:type:`tsk_id_t` values. + +The values in the ``parents`` column are updated according to this map, so that +reference integrity within the table is maintained. As a consequence of this, +the values in the ``parents`` column for kept rows are bounds-checked and an +error raised if they are not valid. Rows that are deleted are not checked for +parent ID integrity. + +If an attempt is made to delete rows that are referred to by the ``parents`` +column of rows that are retained, an error is raised. + +These error conditions are checked before any alterations to the table are +made. + +.. warning:: + C++ users need to be careful to specify the correct type when + passing in values for the ``keep`` array, + using ``std::vector`` and not ``std::vector``, + as the latter may not be correct size. + +@endrst + +@param self A pointer to a tsk_individual_table_t object. +@param keep Array of boolean flags describing whether a particular + row should be kept or not. Must be at least ``num_rows`` long. +@param options Bitwise option flags. Currently unused; should be + set to zero to ensure compatibility with later versions of tskit. +@param id_map An array in which to store the mapping between new + and old IDs. If NULL, this will be ignored. +@return Return 0 on success or a negative value on failure. +*/ +int tsk_individual_table_keep_rows(tsk_individual_table_t *self, const tsk_bool_t *keep, + tsk_flags_t options, tsk_id_t *id_map); + /** @brief Returns true if the data in the specified table is identical to the data in this table. @@ -1380,6 +1474,43 @@ and is not checked for compatibility with any existing schema on this table. int tsk_node_table_extend(tsk_node_table_t *self, const tsk_node_table_t *other, tsk_size_t num_rows, const tsk_id_t *row_indexes, tsk_flags_t options); +/** +@brief Subset this table by keeping rows according to a boolean mask. 
+ +@rst +Deletes rows from this table and optionally return the mapping from IDs in +the current table to the updated table. Rows are kept or deleted according to +the specified boolean array ``keep`` such that for each row ``j`` if +``keep[j]`` is false (zero) the row is deleted, and otherwise the row is +retained. Thus, ``keep`` must be an array of at least ``num_rows`` +:c:type:`bool` values. + +If the ``id_map`` argument is non-null, this array will be updated to represent +the mapping between IDs before and after row deletion. For row ``j``, +``id_map[j]`` will contain the new ID for row ``j`` if it is retained, or +:c:macro:`TSK_NULL` if the row has been removed. Thus, ``id_map`` must be an +array of at least ``num_rows`` :c:type:`tsk_id_t` values. + +.. warning:: + C++ users need to be careful to specify the correct type when + passing in values for the ``keep`` array, + using ``std::vector`` and not ``std::vector``, + as the latter may not be correct size. + +@endrst + +@param self A pointer to a tsk_node_table_t object. +@param keep Array of boolean flags describing whether a particular + row should be kept or not. Must be at least ``num_rows`` long. +@param options Bitwise option flags. Currently unused; should be + set to zero to ensure compatibility with later versions of tskit. +@param id_map An array in which to store the mapping between new + and old IDs. If NULL, this will be ignored. +@return Return 0 on success or a negative value on failure. +*/ +int tsk_node_table_keep_rows(tsk_node_table_t *self, const tsk_bool_t *keep, + tsk_flags_t options, tsk_id_t *id_map); + /** @brief Returns true if the data in the specified table is identical to the data in this table. @@ -1690,6 +1821,43 @@ as-is and is not checked for compatibility with any existing schema on this tabl int tsk_edge_table_extend(tsk_edge_table_t *self, const tsk_edge_table_t *other, tsk_size_t num_rows, const tsk_id_t *row_indexes, tsk_flags_t options); +/** +@brief Subset this table by keeping rows according to a boolean mask. + +@rst +Deletes rows from this table and optionally return the mapping from IDs in +the current table to the updated table. Rows are kept or deleted according to +the specified boolean array ``keep`` such that for each row ``j`` if +``keep[j]`` is false (zero) the row is deleted, and otherwise the row is +retained. Thus, ``keep`` must be an array of at least ``num_rows`` +:c:type:`bool` values. + +If the ``id_map`` argument is non-null, this array will be updated to represent +the mapping between IDs before and after row deletion. For row ``j``, +``id_map[j]`` will contain the new ID for row ``j`` if it is retained, or +:c:macro:`TSK_NULL` if the row has been removed. Thus, ``id_map`` must be an +array of at least ``num_rows`` :c:type:`tsk_id_t` values. + +.. warning:: + C++ users need to be careful to specify the correct type when + passing in values for the ``keep`` array, + using ``std::vector`` and not ``std::vector``, + as the latter may not be correct size. + +@endrst + +@param self A pointer to a tsk_edge_table_t object. +@param keep Array of boolean flags describing whether a particular + row should be kept or not. Must be at least ``num_rows`` long. +@param options Bitwise option flags. Currently unused; should be + set to zero to ensure compatibility with later versions of tskit. +@param id_map An array in which to store the mapping between new + and old IDs. If NULL, this will be ignored. +@return Return 0 on success or a negative value on failure. 
+*/ +int tsk_edge_table_keep_rows(tsk_edge_table_t *self, const tsk_bool_t *keep, + tsk_flags_t options, tsk_id_t *id_map); + /** @brief Returns true if the data in the specified table is identical to the data in this table. @@ -2024,6 +2192,43 @@ int tsk_migration_table_extend(tsk_migration_table_t *self, const tsk_migration_table_t *other, tsk_size_t num_rows, const tsk_id_t *row_indexes, tsk_flags_t options); +/** +@brief Subset this table by keeping rows according to a boolean mask. + +@rst +Deletes rows from this table and optionally return the mapping from IDs in +the current table to the updated table. Rows are kept or deleted according to +the specified boolean array ``keep`` such that for each row ``j`` if +``keep[j]`` is false (zero) the row is deleted, and otherwise the row is +retained. Thus, ``keep`` must be an array of at least ``num_rows`` +:c:type:`bool` values. + +If the ``id_map`` argument is non-null, this array will be updated to represent +the mapping between IDs before and after row deletion. For row ``j``, +``id_map[j]`` will contain the new ID for row ``j`` if it is retained, or +:c:macro:`TSK_NULL` if the row has been removed. Thus, ``id_map`` must be an +array of at least ``num_rows`` :c:type:`tsk_id_t` values. + +.. warning:: + C++ users need to be careful to specify the correct type when + passing in values for the ``keep`` array, + using ``std::vector`` and not ``std::vector``, + as the latter may not be correct size. + +@endrst + +@param self A pointer to a tsk_migration_table_t object. +@param keep Array of boolean flags describing whether a particular + row should be kept or not. Must be at least ``num_rows`` long. +@param options Bitwise option flags. Currently unused; should be + set to zero to ensure compatibility with later versions of tskit. +@param id_map An array in which to store the mapping between new + and old IDs. If NULL, this will be ignored. +@return Return 0 on success or a negative value on failure. +*/ +int tsk_migration_table_keep_rows(tsk_migration_table_t *self, const tsk_bool_t *keep, + tsk_flags_t options, tsk_id_t *id_map); + /** @brief Returns true if the data in the specified table is identical to the data in this table. @@ -2332,6 +2537,43 @@ and is not checked for compatibility with any existing schema on this table. int tsk_site_table_extend(tsk_site_table_t *self, const tsk_site_table_t *other, tsk_size_t num_rows, const tsk_id_t *row_indexes, tsk_flags_t options); +/** +@brief Subset this table by keeping rows according to a boolean mask. + +@rst +Deletes rows from this table and optionally return the mapping from IDs in +the current table to the updated table. Rows are kept or deleted according to +the specified boolean array ``keep`` such that for each row ``j`` if +``keep[j]`` is false (zero) the row is deleted, and otherwise the row is +retained. Thus, ``keep`` must be an array of at least ``num_rows`` +:c:type:`bool` values. + +If the ``id_map`` argument is non-null, this array will be updated to represent +the mapping between IDs before and after row deletion. For row ``j``, +``id_map[j]`` will contain the new ID for row ``j`` if it is retained, or +:c:macro:`TSK_NULL` if the row has been removed. Thus, ``id_map`` must be an +array of at least ``num_rows`` :c:type:`tsk_id_t` values. + +.. warning:: + C++ users need to be careful to specify the correct type when + passing in values for the ``keep`` array, + using ``std::vector`` and not ``std::vector``, + as the latter may not be correct size. 
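+
+For example, assuming ``sites`` points to an initialised site table with
+three rows, a sketch of typical usage (illustrative only, not part of the
+upstream documentation) is:
+
+.. code-block:: c
+
+    tsk_bool_t keep[] = { 1, 1, 0 };
+    tsk_id_t id_map[3];
+    int ret = tsk_site_table_keep_rows(sites, keep, 0, id_map);
+    /* On success ret == 0, the table keeps rows 0 and 1, and
+       id_map is {0, 1, TSK_NULL}. */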
+ +@endrst + +@param self A pointer to a tsk_site_table_t object. +@param keep Array of boolean flags describing whether a particular + row should be kept or not. Must be at least ``num_rows`` long. +@param options Bitwise option flags. Currently unused; should be + set to zero to ensure compatibility with later versions of tskit. +@param id_map An array in which to store the mapping between new + and old IDs. If NULL, this will be ignored. +@return Return 0 on success or a negative value on failure. +*/ +int tsk_site_table_keep_rows(tsk_site_table_t *self, const tsk_bool_t *keep, + tsk_flags_t options, tsk_id_t *id_map); + /** @brief Returns true if the data in the specified table is identical to the data in this table. @@ -2668,6 +2910,55 @@ int tsk_mutation_table_extend(tsk_mutation_table_t *self, const tsk_mutation_table_t *other, tsk_size_t num_rows, const tsk_id_t *row_indexes, tsk_flags_t options); +/** +@brief Subset this table by keeping rows according to a boolean mask. + +@rst +Deletes rows from this table and optionally return the mapping from IDs in +the current table to the updated table. Rows are kept or deleted according to +the specified boolean array ``keep`` such that for each row ``j`` if +``keep[j]`` is false (zero) the row is deleted, and otherwise the row is +retained. Thus, ``keep`` must be an array of at least ``num_rows`` +:c:type:`bool` values. + +If the ``id_map`` argument is non-null, this array will be updated to represent +the mapping between IDs before and after row deletion. For row ``j``, +``id_map[j]`` will contain the new ID for row ``j`` if it is retained, or +:c:macro:`TSK_NULL` if the row has been removed. Thus, ``id_map`` must be an +array of at least ``num_rows`` :c:type:`tsk_id_t` values. + +The values in the ``parent`` column are updated according to this map, so that +reference integrity within the table is maintained. As a consequence of this, +the values in the ``parent`` column for kept rows are bounds-checked and an +error raised if they are not valid. Rows that are deleted are not checked for +parent ID integrity. + +If an attempt is made to delete rows that are referred to by the ``parent`` +column of rows that are retained, an error is raised. + +These error conditions are checked before any alterations to the table are +made. + +.. warning:: + C++ users need to be careful to specify the correct type when + passing in values for the ``keep`` array, + using ``std::vector`` and not ``std::vector``, + as the latter may not be correct size. + +@endrst + +@param self A pointer to a tsk_mutation_table_t object. +@param keep Array of boolean flags describing whether a particular + row should be kept or not. Must be at least ``num_rows`` long. +@param options Bitwise option flags. Currently unused; should be + set to zero to ensure compatibility with later versions of tskit. +@param id_map An array in which to store the mapping between new + and old IDs. If NULL, this will be ignored. +@return Return 0 on success or a negative value on failure. +*/ +int tsk_mutation_table_keep_rows(tsk_mutation_table_t *self, const tsk_bool_t *keep, + tsk_flags_t options, tsk_id_t *id_map); + /** @brief Returns true if the data in the specified table is identical to the data in this table. @@ -2995,6 +3286,43 @@ int tsk_population_table_extend(tsk_population_table_t *self, const tsk_population_table_t *other, tsk_size_t num_rows, const tsk_id_t *row_indexes, tsk_flags_t options); +/** +@brief Subset this table by keeping rows according to a boolean mask. 
+ +@rst +Deletes rows from this table and optionally return the mapping from IDs in +the current table to the updated table. Rows are kept or deleted according to +the specified boolean array ``keep`` such that for each row ``j`` if +``keep[j]`` is false (zero) the row is deleted, and otherwise the row is +retained. Thus, ``keep`` must be an array of at least ``num_rows`` +:c:type:`bool` values. + +If the ``id_map`` argument is non-null, this array will be updated to represent +the mapping between IDs before and after row deletion. For row ``j``, +``id_map[j]`` will contain the new ID for row ``j`` if it is retained, or +:c:macro:`TSK_NULL` if the row has been removed. Thus, ``id_map`` must be an +array of at least ``num_rows`` :c:type:`tsk_id_t` values. + +.. warning:: + C++ users need to be careful to specify the correct type when + passing in values for the ``keep`` array, + using ``std::vector`` and not ``std::vector``, + as the latter may not be correct size. + +@endrst + +@param self A pointer to a tsk_population_table_t object. +@param keep Array of boolean flags describing whether a particular + row should be kept or not. Must be at least ``num_rows`` long. +@param options Bitwise option flags. Currently unused; should be + set to zero to ensure compatibility with later versions of tskit. +@param id_map An array in which to store the mapping between new + and old IDs. If NULL, this will be ignored. +@return Return 0 on success or a negative value on failure. +*/ +int tsk_population_table_keep_rows(tsk_population_table_t *self, const tsk_bool_t *keep, + tsk_flags_t options, tsk_id_t *id_map); + /** @brief Returns true if the data in the specified table is identical to the data in this table. @@ -3289,6 +3617,43 @@ int tsk_provenance_table_extend(tsk_provenance_table_t *self, const tsk_provenance_table_t *other, tsk_size_t num_rows, const tsk_id_t *row_indexes, tsk_flags_t options); +/** +@brief Subset this table by keeping rows according to a boolean mask. + +@rst +Deletes rows from this table and optionally return the mapping from IDs in +the current table to the updated table. Rows are kept or deleted according to +the specified boolean array ``keep`` such that for each row ``j`` if +``keep[j]`` is false (zero) the row is deleted, and otherwise the row is +retained. Thus, ``keep`` must be an array of at least ``num_rows`` +:c:type:`bool` values. + +If the ``id_map`` argument is non-null, this array will be updated to represent +the mapping between IDs before and after row deletion. For row ``j``, +``id_map[j]`` will contain the new ID for row ``j`` if it is retained, or +:c:macro:`TSK_NULL` if the row has been removed. Thus, ``id_map`` must be an +array of at least ``num_rows`` :c:type:`tsk_id_t` values. + +.. warning:: + C++ users need to be careful to specify the correct type when + passing in values for the ``keep`` array, + using ``std::vector`` and not ``std::vector``, + as the latter may not be correct size. + +@endrst + +@param self A pointer to a tsk_provenance_table_t object. +@param keep Array of boolean flags describing whether a particular + row should be kept or not. Must be at least ``num_rows`` long. +@param options Bitwise option flags. Currently unused; should be + set to zero to ensure compatibility with later versions of tskit. +@param id_map An array in which to store the mapping between new + and old IDs. If NULL, this will be ignored. +@return Return 0 on success or a negative value on failure. 
+*/ +int tsk_provenance_table_keep_rows(tsk_provenance_table_t *self, const tsk_bool_t *keep, + tsk_flags_t options, tsk_id_t *id_map); + /** @brief Returns true if the data in the specified table is identical to the data in this table. @@ -3909,8 +4274,44 @@ A mapping from the node IDs in the table before simplification to their equivale values after simplification can be obtained via the ``node_map`` argument. If this is non NULL, ``node_map[u]`` will contain the new ID for node ``u`` after simplification, or :c:macro:`TSK_NULL` if the node has been removed. Thus, ``node_map`` must be an array -of at least ``self->nodes.num_rows`` :c:type:`tsk_id_t` values. The table collection will -always be unindexed after simplify successfully completes. +of at least ``self->nodes.num_rows`` :c:type:`tsk_id_t` values. + +If the `TSK_SIMPLIFY_NO_FILTER_NODES` option is specified, the node table will be +unaltered except for changing the sample status of nodes (but see the +`TSK_SIMPLIFY_NO_UPDATE_SAMPLE_FLAGS` option below) and to update references +to other tables that may have changed as a result of filtering (see below). +The ``node_map`` (if specified) will always be the identity mapping, such that +``node_map[u] == u`` for all nodes. Note also that the order of the list of +samples is not important in this case. + +When a table is not filtered (i.e., if the `TSK_SIMPLIFY_NO_FILTER_NODES` +option is provided or the `TSK_SIMPLIFY_FILTER_SITES`, +`TSK_SIMPLIFY_FILTER_POPULATIONS` or `TSK_SIMPLIFY_FILTER_INDIVIDUALS` +options are *not* provided) the corresponding table is modified as +little as possible, and all pointers are guaranteed to remain valid +after simplification. The only changes made to an unfiltered table are +to update any references to tables that may have changed (for example, +remapping population IDs in the node table if +`TSK_SIMPLIFY_FILTER_POPULATIONS` was specified) or altering the +sample status flag of nodes. + +.. note:: It is possible for populations and individuals to be filtered + even if `TSK_SIMPLIFY_NO_FILTER_NODES` is specified because there + may be entirely unreferenced entities in the input tables, which + are not affected by whether we filter nodes or not. + +By default, the node sample flags are updated by unsetting the +:c:macro:`TSK_NODE_IS_SAMPLE` flag for all nodes and subsequently setting it +for the nodes provided as input to this function. The +`TSK_SIMPLIFY_NO_UPDATE_SAMPLE_FLAGS` option will prevent this from occuring, +making it the responsibility of calling code to keep track of the ultimate +sample status of nodes. Using this option in conjunction with +`TSK_SIMPLIFY_NO_FILTER_NODES` (and without the +`TSK_SIMPLIFY_FILTER_POPULATIONS` and `TSK_SIMPLIFY_FILTER_INDIVIDUALS` +options) guarantees that the node table will not be written to during the +lifetime of this function. + +The table collection will always be unindexed after simplify successfully completes. .. 
note:: Migrations are currently not supported by simplify, and an error will be raised if we attempt call simplify on a table collection with greater @@ -3924,6 +4325,8 @@ Options can be specified by providing one or more of the following bitwise - :c:macro:`TSK_SIMPLIFY_FILTER_SITES` - :c:macro:`TSK_SIMPLIFY_FILTER_POPULATIONS` - :c:macro:`TSK_SIMPLIFY_FILTER_INDIVIDUALS` +- :c:macro:`TSK_SIMPLIFY_NO_FILTER_NODES` +- :c:macro:`TSK_SIMPLIFY_NO_UPDATE_SAMPLE_FLAGS` - :c:macro:`TSK_SIMPLIFY_REDUCE_TO_SITE_TOPOLOGY` - :c:macro:`TSK_SIMPLIFY_KEEP_UNARY` - :c:macro:`TSK_SIMPLIFY_KEEP_INPUT_ROOTS` @@ -4217,7 +4620,9 @@ int tsk_table_collection_deduplicate_sites( int tsk_table_collection_compute_mutation_parents( tsk_table_collection_t *self, tsk_flags_t options); int tsk_table_collection_compute_mutation_times( - tsk_table_collection_t *self, double *random, tsk_flags_t TSK_UNUSED(options)); + tsk_table_collection_t *self, double *random, tsk_flags_t options); +int tsk_table_collection_delete_older( + tsk_table_collection_t *self, double time, tsk_flags_t options); int tsk_table_collection_set_indexes(tsk_table_collection_t *self, tsk_id_t *edge_insertion_order, tsk_id_t *edge_removal_order); @@ -4252,6 +4657,7 @@ int tsk_provenance_table_takeset_columns(tsk_provenance_table_t *self, tsk_size_t *record_offset); bool tsk_table_collection_has_reference_sequence(const tsk_table_collection_t *self); + int tsk_reference_sequence_init(tsk_reference_sequence_t *self, tsk_flags_t options); int tsk_reference_sequence_free(tsk_reference_sequence_t *self); bool tsk_reference_sequence_is_null(const tsk_reference_sequence_t *self); @@ -4365,6 +4771,19 @@ int tsk_identity_segments_get(const tsk_identity_segments_t *self, tsk_id_t a, void tsk_identity_segments_print_state(tsk_identity_segments_t *self, FILE *out); int tsk_identity_segments_free(tsk_identity_segments_t *self); +/* Edge differences */ + +/* Internal API - currently used in a few places, but a better API is envisaged + * at some point. + * IMPORTANT: tskit-rust uses this API, so don't break without discussing! 
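+ *
+ * A minimal usage sketch (illustrative only; `tables` here is assumed to be
+ * an initialised tsk_table_collection_t with indexes built):
+ *
+ *     tsk_diff_iter_t iter;
+ *     tsk_edge_list_t edges_out, edges_in;
+ *     double left, right;
+ *     int ret = tsk_diff_iter_init(&iter, tables, -1, 0);
+ *     // check ret; passing -1 for num_trees makes init count the trees.
+ *     while (tsk_diff_iter_next(&iter, &left, &right, &edges_out, &edges_in)
+ *            == TSK_TREE_OK) {
+ *         // edges_out and edges_in are linked lists of the edges removed
+ *         // from and added to the tree covering [left, right).
+ *     }
+ *     tsk_diff_iter_free(&iter);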
+ */ +int tsk_diff_iter_init(tsk_diff_iter_t *self, const tsk_table_collection_t *tables, + tsk_id_t num_trees, tsk_flags_t options); +int tsk_diff_iter_free(tsk_diff_iter_t *self); +int tsk_diff_iter_next(tsk_diff_iter_t *self, double *left, double *right, + tsk_edge_list_t *edges_out, tsk_edge_list_t *edges_in); +void tsk_diff_iter_print_state(const tsk_diff_iter_t *self, FILE *out); + #ifdef __cplusplus } #endif diff --git a/treerec/tskit/trees.c b/treerec/tskit/trees.c index c89dfe564..4604579e0 100644 --- a/treerec/tskit/trees.c +++ b/treerec/tskit/trees.c @@ -1,7 +1,7 @@ /* * MIT License * - * Copyright (c) 2019-2022 Tskit Developers + * Copyright (c) 2019-2023 Tskit Developers * Copyright (c) 2015-2018 University of Oxford * * Permission is hereby granted, free of charge, to any person obtaining a copy @@ -363,6 +363,13 @@ tsk_treeseq_init_mutations(tsk_treeseq_t *self) = discrete_times && (is_discrete(time[j]) || tsk_is_unknown_time(time[j])); } self->discrete_time = self->discrete_time && discrete_times; + + for (j = 0; j < num_mutations; j++) { + if (!tsk_is_unknown_time(time[j])) { + self->min_time = TSK_MIN(self->min_time, time[j]); + self->max_time = TSK_MAX(self->max_time, time[j]); + } + } } static int @@ -405,6 +412,13 @@ tsk_treeseq_init_nodes(tsk_treeseq_t *self) = discrete_times && (is_discrete(time[j]) || tsk_is_unknown_time(time[j])); } self->discrete_time = self->discrete_time && discrete_times; + + for (j = 0; j < num_nodes; j++) { + if (!tsk_is_unknown_time(time[j])) { + self->min_time = TSK_MIN(self->min_time, time[j]); + self->max_time = TSK_MAX(self->max_time, time[j]); + } + } out: return ret; } @@ -452,6 +466,8 @@ tsk_treeseq_init( self->num_trees = (tsk_size_t) num_trees; self->discrete_genome = true; self->discrete_time = true; + self->min_time = INFINITY; + self->max_time = -INFINITY; ret = tsk_treeseq_init_nodes(self); if (ret != 0) { goto out; @@ -710,12 +726,86 @@ tsk_treeseq_get_discrete_time(const tsk_treeseq_t *self) return self->discrete_time; } +double +tsk_treeseq_get_min_time(const tsk_treeseq_t *self) +{ + return self->min_time; +} + +double +tsk_treeseq_get_max_time(const tsk_treeseq_t *self) +{ + return self->max_time; +} + bool tsk_treeseq_has_reference_sequence(const tsk_treeseq_t *self) { return tsk_table_collection_has_reference_sequence(self->tables); } +int +tsk_treeseq_get_individuals_population(const tsk_treeseq_t *self, tsk_id_t *output) +{ + int ret = 0; + tsk_size_t i, j; + tsk_individual_t ind; + tsk_id_t ind_pop; + const tsk_id_t *node_population = self->tables->nodes.population; + const tsk_size_t num_individuals = self->tables->individuals.num_rows; + + tsk_memset(output, TSK_NULL, num_individuals * sizeof(*output)); + + for (i = 0; i < num_individuals; i++) { + ret = tsk_treeseq_get_individual(self, (tsk_id_t) i, &ind); + tsk_bug_assert(ret == 0); + if (ind.nodes_length > 0) { + ind_pop = -2; + for (j = 0; j < ind.nodes_length; j++) { + if (ind_pop == -2) { + ind_pop = node_population[ind.nodes[j]]; + } else if (ind_pop != node_population[ind.nodes[j]]) { + ret = TSK_ERR_INDIVIDUAL_POPULATION_MISMATCH; + goto out; + } + } + output[ind.id] = ind_pop; + } + } +out: + return ret; +} + +int +tsk_treeseq_get_individuals_time(const tsk_treeseq_t *self, double *output) +{ + int ret = 0; + tsk_size_t i, j; + tsk_individual_t ind; + double ind_time; + const double *node_time = self->tables->nodes.time; + const tsk_size_t num_individuals = self->tables->individuals.num_rows; + + for (i = 0; i < num_individuals; i++) { + ret = 
tsk_treeseq_get_individual(self, (tsk_id_t) i, &ind); + tsk_bug_assert(ret == 0); + /* the default is UNKNOWN_TIME, but nodes cannot have + * UNKNOWN _TIME so this is safe. */ + ind_time = TSK_UNKNOWN_TIME; + for (j = 0; j < ind.nodes_length; j++) { + if (j == 0) { + ind_time = node_time[ind.nodes[j]]; + } else if (ind_time != node_time[ind.nodes[j]]) { + ret = TSK_ERR_INDIVIDUAL_TIME_MISMATCH; + goto out; + } + } + output[ind.id] = ind_time; + } +out: + return ret; +} + /* Stats functions */ #define GET_2D_ROW(array, row_len, row) (array + (((size_t)(row_len)) * (size_t) row)) @@ -1174,7 +1264,7 @@ tsk_treeseq_branch_general_stat(const tsk_treeseq_t *self, tsk_size_t state_dim, { int ret = 0; tsk_id_t u, v; - tsk_size_t j, k, tree_index, window_index; + tsk_size_t j, k, window_index; tsk_size_t num_nodes = self->tables->nodes.num_rows; const tsk_id_t num_edges = (tsk_id_t) self->tables->edges.num_rows; const tsk_id_t *restrict I = self->tables->indexes.edge_insertion_order; @@ -1225,7 +1315,6 @@ tsk_treeseq_branch_general_stat(const tsk_treeseq_t *self, tsk_size_t state_dim, tj = 0; tk = 0; t_left = 0; - tree_index = 0; window_index = 0; while (tj < num_edges || t_left < sequence_length) { while (tk < num_edges && edge_right[O[tk]] == t_left) { @@ -1310,7 +1399,6 @@ tsk_treeseq_branch_general_stat(const tsk_treeseq_t *self, tsk_size_t state_dim, } /* Move to the next tree */ t_left = t_right; - tree_index++; } tsk_bug_assert(window_index == num_windows); out: @@ -1324,9 +1412,6 @@ tsk_treeseq_branch_general_stat(const tsk_treeseq_t *self, tsk_size_t state_dim, tsk_safe_free(state); tsk_safe_free(summary); tsk_safe_free(running_sum); - - (void)tree_index; // BCH: suppress "variable set but not used" warning; I submitted an issue about that, so this change can hopefully get overwritten next merge - return ret; } @@ -3256,6 +3341,121 @@ tsk_treeseq_simplify(const tsk_treeseq_t *self, const tsk_id_t *samples, return ret; } +int TSK_WARN_UNUSED +tsk_treeseq_split_edges(const tsk_treeseq_t *self, double time, tsk_flags_t flags, + tsk_id_t population, const char *metadata, tsk_size_t metadata_length, + tsk_flags_t TSK_UNUSED(options), tsk_treeseq_t *output) +{ + int ret = 0; + tsk_table_collection_t *tables = tsk_malloc(sizeof(*tables)); + const double *restrict node_time = self->tables->nodes.time; + const tsk_size_t num_edges = self->tables->edges.num_rows; + const tsk_size_t num_mutations = self->tables->mutations.num_rows; + tsk_id_t *split_edge = tsk_malloc(num_edges * sizeof(*split_edge)); + tsk_id_t j, u, mapped_node, ret_id; + double mutation_time; + tsk_edge_t edge; + tsk_mutation_t mutation; + tsk_bookmark_t sort_start; + + memset(output, 0, sizeof(*output)); + if (split_edge == NULL) { + ret = TSK_ERR_NO_MEMORY; + goto out; + } + ret = tsk_treeseq_copy_tables(self, tables, 0); + if (ret != 0) { + goto out; + } + if (tables->migrations.num_rows > 0) { + ret = TSK_ERR_MIGRATIONS_NOT_SUPPORTED; + goto out; + } + /* We could catch this below in add_row, but it's simpler to guarantee + * that we always catch the error in corner cases where the values + * aren't used. 
*/ + if (population < -1 || population >= (tsk_id_t) self->tables->populations.num_rows) { + ret = TSK_ERR_POPULATION_OUT_OF_BOUNDS; + goto out; + } + if (!tsk_isfinite(time)) { + ret = TSK_ERR_TIME_NONFINITE; + goto out; + } + + tsk_edge_table_clear(&tables->edges); + tsk_memset(split_edge, TSK_NULL, num_edges * sizeof(*split_edge)); + + for (j = 0; j < (tsk_id_t) num_edges; j++) { + /* Would prefer to use tsk_edge_table_get_row_unsafe, but it's + * currently static to tables.c */ + ret = tsk_edge_table_get_row(&self->tables->edges, j, &edge); + tsk_bug_assert(ret == 0); + if (node_time[edge.child] < time && time < node_time[edge.parent]) { + u = tsk_node_table_add_row(&tables->nodes, flags, time, population, TSK_NULL, + metadata, metadata_length); + if (u < 0) { + ret = (int) u; + goto out; + } + ret_id = tsk_edge_table_add_row(&tables->edges, edge.left, edge.right, u, + edge.child, edge.metadata, edge.metadata_length); + if (ret_id < 0) { + ret = (int) ret_id; + goto out; + } + edge.child = u; + split_edge[j] = u; + } + ret_id = tsk_edge_table_add_row(&tables->edges, edge.left, edge.right, + edge.parent, edge.child, edge.metadata, edge.metadata_length); + if (ret_id < 0) { + ret = (int) ret_id; + goto out; + } + } + + for (j = 0; j < (tsk_id_t) num_mutations; j++) { + /* Note: we could speed this up a bit by accessing the local + * memory for mutations directly. */ + ret = tsk_treeseq_get_mutation(self, j, &mutation); + tsk_bug_assert(ret == 0); + mapped_node = TSK_NULL; + if (mutation.edge != TSK_NULL) { + mapped_node = split_edge[mutation.edge]; + } + mutation_time = tsk_is_unknown_time(mutation.time) ? node_time[mutation.node] + : mutation.time; + if (mapped_node != TSK_NULL && mutation_time >= time) { + /* Update the column in-place to save a bit of time. */ + tables->mutations.node[j] = mapped_node; + } + } + + /* Skip mutations and sites as they haven't been altered */ + /* Note we can probably optimise the edge sort a bit here also by + * reasoning about when the first edge gets altered in the table. 
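+     *
+     * Setting the bookmark offsets for sites and mutations to their current
+     * number of rows below tells tsk_table_collection_sort to leave those
+     * tables alone, so only the edge table is sorted.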
+ */ + memset(&sort_start, 0, sizeof(sort_start)); + sort_start.sites = tables->sites.num_rows; + sort_start.mutations = tables->mutations.num_rows; + ret = tsk_table_collection_sort(tables, &sort_start, 0); + if (ret != 0) { + goto out; + } + + ret = tsk_treeseq_init( + output, tables, TSK_TS_INIT_BUILD_INDEXES | TSK_TAKE_OWNERSHIP); + tables = NULL; +out: + if (tables != NULL) { + tsk_table_collection_free(tables); + tsk_safe_free(tables); + } + tsk_safe_free(split_edge); + return ret; +} + /* ======================================================== * * Tree * ======================================================== */ @@ -3287,8 +3487,11 @@ tsk_tree_init(tsk_tree_t *self, const tsk_treeseq_t *tree_sequence, tsk_flags_t self->right_child = tsk_malloc(N * sizeof(*self->right_child)); self->left_sib = tsk_malloc(N * sizeof(*self->left_sib)); self->right_sib = tsk_malloc(N * sizeof(*self->right_sib)); + self->num_children = tsk_calloc(N, sizeof(*self->num_children)); + self->edge = tsk_malloc(N * sizeof(*self->edge)); if (self->parent == NULL || self->left_child == NULL || self->right_child == NULL - || self->left_sib == NULL || self->right_sib == NULL) { + || self->left_sib == NULL || self->right_sib == NULL + || self->num_children == NULL || self->edge == NULL) { goto out; } if (!(self->options & TSK_NO_SAMPLE_COUNTS)) { @@ -3353,6 +3556,8 @@ tsk_tree_free(tsk_tree_t *self) tsk_safe_free(self->left_sample); tsk_safe_free(self->right_sample); tsk_safe_free(self->next_sample); + tsk_safe_free(self->num_children); + tsk_safe_free(self->edge); return 0; } @@ -3501,6 +3706,8 @@ tsk_tree_copy(const tsk_tree_t *self, tsk_tree_t *dest, tsk_flags_t options) tsk_memcpy(dest->right_child, self->right_child, N * sizeof(*self->right_child)); tsk_memcpy(dest->left_sib, self->left_sib, N * sizeof(*self->left_sib)); tsk_memcpy(dest->right_sib, self->right_sib, N * sizeof(*self->right_sib)); + tsk_memcpy(dest->num_children, self->num_children, N * sizeof(*self->num_children)); + tsk_memcpy(dest->edge, self->edge, N * sizeof(*self->edge)); if (!(dest->options & TSK_NO_SAMPLE_COUNTS)) { if (self->options & TSK_NO_SAMPLE_COUNTS) { ret = TSK_ERR_UNSUPPORTED_OPERATION; @@ -3699,15 +3906,7 @@ tsk_tree_get_right_root(const tsk_tree_t *self) tsk_size_t tsk_tree_get_num_roots(const tsk_tree_t *self) { - const tsk_id_t *restrict right_sib = self->right_sib; - const tsk_id_t *restrict left_child = self->left_child; - tsk_size_t num_roots = 0; - tsk_id_t u; - - for (u = left_child[self->virtual_root]; u != TSK_NULL; u = right_sib[u]) { - num_roots++; - } - return num_roots; + return (tsk_size_t) self->num_children[self->virtual_root]; } int TSK_WARN_UNUSED @@ -4021,6 +4220,7 @@ tsk_tree_remove_branch( tsk_id_t *restrict right_child = self->right_child; tsk_id_t *restrict left_sib = self->left_sib; tsk_id_t *restrict right_sib = self->right_sib; + tsk_id_t *restrict num_children = self->num_children; tsk_id_t lsib = left_sib[c]; tsk_id_t rsib = right_sib[c]; @@ -4037,6 +4237,7 @@ tsk_tree_remove_branch( parent[c] = TSK_NULL; left_sib[c] = TSK_NULL; right_sib[c] = TSK_NULL; + num_children[p]--; } static inline void @@ -4047,6 +4248,7 @@ tsk_tree_insert_branch( tsk_id_t *restrict right_child = self->right_child; tsk_id_t *restrict left_sib = self->left_sib; tsk_id_t *restrict right_sib = self->right_sib; + tsk_id_t *restrict num_children = self->num_children; tsk_id_t u; parent[c] = p; @@ -4061,6 +4263,7 @@ tsk_tree_insert_branch( right_sib[c] = TSK_NULL; } right_child[p] = c; + num_children[p]++; } static inline void 
@@ -4082,6 +4285,7 @@ tsk_tree_remove_edge(tsk_tree_t *self, tsk_id_t p, tsk_id_t c) tsk_id_t *restrict parent = self->parent; tsk_size_t *restrict num_samples = self->num_samples; tsk_size_t *restrict num_tracked_samples = self->num_tracked_samples; + tsk_id_t *restrict edge = self->edge; const tsk_size_t root_threshold = self->root_threshold; tsk_id_t u; tsk_id_t path_end = TSK_NULL; @@ -4091,6 +4295,7 @@ tsk_tree_remove_edge(tsk_tree_t *self, tsk_id_t p, tsk_id_t c) tsk_tree_remove_branch(self, p, c, parent); self->num_edges--; + edge[c] = TSK_NULL; if (!(self->options & TSK_NO_SAMPLE_COUNTS)) { u = p; @@ -4116,11 +4321,12 @@ tsk_tree_remove_edge(tsk_tree_t *self, tsk_id_t p, tsk_id_t c) } static void -tsk_tree_insert_edge(tsk_tree_t *self, tsk_id_t p, tsk_id_t c) +tsk_tree_insert_edge(tsk_tree_t *self, tsk_id_t p, tsk_id_t c, tsk_id_t edge_id) { tsk_id_t *restrict parent = self->parent; tsk_size_t *restrict num_samples = self->num_samples; tsk_size_t *restrict num_tracked_samples = self->num_tracked_samples; + tsk_id_t *restrict edge = self->edge; const tsk_size_t root_threshold = self->root_threshold; tsk_id_t u; tsk_id_t path_end = TSK_NULL; @@ -4148,6 +4354,7 @@ tsk_tree_insert_edge(tsk_tree_t *self, tsk_id_t p, tsk_id_t c) tsk_tree_insert_branch(self, p, c, parent); self->num_edges++; + edge[c] = edge_id; if (self->options & TSK_SAMPLE_LISTS) { tsk_tree_update_sample_lists(self, p, parent); @@ -4187,7 +4394,7 @@ tsk_tree_advance(tsk_tree_t *self, int direction, const double *restrict out_bre while (in >= 0 && in < num_edges && in_breakpoints[in_order[in]] == x) { k = in_order[in]; in += direction; - tsk_tree_insert_edge(self, edge_parent[k], edge_child[k]); + tsk_tree_insert_edge(self, edge_parent[k], edge_child[k], k); } self->direction = direction; @@ -4342,19 +4549,138 @@ tsk_tree_position_in_interval(const tsk_tree_t *self, double x) return self->interval.left <= x && x < self->interval.right; } -int TSK_WARN_UNUSED -tsk_tree_seek(tsk_tree_t *self, double x, tsk_flags_t TSK_UNUSED(options)) +/* NOTE: + * + * Notes from Kevin Thornton: + * + * This method inserts the edges for an arbitrary tree + * in linear time and requires no additional memory. + * + * During design, the following alternatives were tested + * (in a combination of rust + C): + * 1. Indexing edge insertion/removal locations by tree. + * The indexing can be done in O(n) time, giving O(1) + * access to the first edge in a tree. We can then add + * edges to the tree in O(e) time, where e is the number + * of edges. This apparoach requires O(n) additional memory + * and is only marginally faster than the implementation below. + * 2. Building an interval tree mapping edge id -> span. + * This approach adds a lot of complexity and wasn't any faster + * than the indexing described above. 
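+ *
+ * In outline, the code below scans the edge insertion index from the left
+ * when x falls in the first half of the sequence (or the removal index from
+ * the right otherwise), inserting every edge whose interval contains x, and
+ * then positions the insertion/removal cursors and the tree interval so that
+ * subsequent next/prev calls behave as normal.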
+ */ +static int +tsk_tree_seek_from_null(tsk_tree_t *self, double x, tsk_flags_t TSK_UNUSED(options)) { int ret = 0; + tsk_size_t edge; + tsk_id_t p, c, e, j, k, tree_index; const double L = tsk_treeseq_get_sequence_length(self->tree_sequence); - const double t_l = self->interval.left; - const double t_r = self->interval.right; - double distance_left, distance_right; + const tsk_treeseq_t *treeseq = self->tree_sequence; + const tsk_table_collection_t *tables = treeseq->tables; + const tsk_id_t *restrict edge_parent = tables->edges.parent; + const tsk_id_t *restrict edge_child = tables->edges.child; + const tsk_size_t num_edges = tables->edges.num_rows; + const tsk_size_t num_trees = self->tree_sequence->num_trees; + const double *restrict edge_left = tables->edges.left; + const double *restrict edge_right = tables->edges.right; + const double *restrict breakpoints = treeseq->breakpoints; + const tsk_id_t *restrict insertion = tables->indexes.edge_insertion_order; + const tsk_id_t *restrict removal = tables->indexes.edge_removal_order; + + // NOTE: it may be better to get the + // index first and then ask if we are + // searching in the first or last 1/2 + // of trees. + j = -1; + if (x <= L / 2.0) { + for (edge = 0; edge < num_edges; edge++) { + e = insertion[edge]; + if (edge_left[e] > x) { + j = (tsk_id_t) edge; + break; + } + if (x >= edge_left[e] && x < edge_right[e]) { + p = edge_parent[e]; + c = edge_child[e]; + tsk_tree_insert_edge(self, p, c, e); + } + } + } else { + for (edge = 0; edge < num_edges; edge++) { + e = removal[num_edges - edge - 1]; + if (edge_right[e] < x) { + j = (tsk_id_t)(num_edges - edge - 1); + while (j < (tsk_id_t) num_edges && edge_left[insertion[j]] <= x) { + j++; + } + break; + } + if (x >= edge_left[e] && x < edge_right[e]) { + p = edge_parent[e]; + c = edge_child[e]; + tsk_tree_insert_edge(self, p, c, e); + } + } + } - if (x < 0 || x >= L) { + if (j == -1) { + j = 0; + while (j < (tsk_id_t) num_edges && edge_left[insertion[j]] <= x) { + j++; + } + } + k = 0; + while (k < (tsk_id_t) num_edges && edge_right[removal[k]] <= x) { + k++; + } + + /* NOTE: tsk_search_sorted finds the first the first + * insertion locatiom >= the query point, which + * finds a RIGHT value for queries not at the left edge. 
+ */ + tree_index = (tsk_id_t) tsk_search_sorted(breakpoints, num_trees + 1, x); + if (breakpoints[tree_index] > x) { + tree_index--; + } + self->index = tree_index; + self->interval.left = breakpoints[tree_index]; + self->interval.right = breakpoints[tree_index + 1]; + self->left_index = j; + self->right_index = k; + self->direction = TSK_DIR_FORWARD; + self->num_nodes = tables->nodes.num_rows; + if (tables->sites.num_rows > 0) { + self->sites = treeseq->tree_sites[self->index]; + self->sites_length = treeseq->tree_sites_length[self->index]; + } + + return ret; +} + +int TSK_WARN_UNUSED +tsk_tree_seek_index(tsk_tree_t *self, tsk_id_t tree, tsk_flags_t options) +{ + int ret = 0; + double x; + + if (tree < 0 || tree >= (tsk_id_t) self->tree_sequence->num_trees) { ret = TSK_ERR_SEEK_OUT_OF_BOUNDS; goto out; } + x = self->tree_sequence->breakpoints[tree]; + ret = tsk_tree_seek(self, x, options); +out: + return ret; +} + +static int TSK_WARN_UNUSED +tsk_tree_seek_linear(tsk_tree_t *self, double x, tsk_flags_t TSK_UNUSED(options)) +{ + const double L = tsk_treeseq_get_sequence_length(self->tree_sequence); + const double t_l = self->interval.left; + const double t_r = self->interval.right; + int ret = 0; + double distance_left, distance_right; if (x < t_l) { /* |-----|-----|========|---------| */ @@ -4387,6 +4713,27 @@ tsk_tree_seek(tsk_tree_t *self, double x, tsk_flags_t TSK_UNUSED(options)) return ret; } +int TSK_WARN_UNUSED +tsk_tree_seek(tsk_tree_t *self, double x, tsk_flags_t options) +{ + int ret = 0; + const double L = tsk_treeseq_get_sequence_length(self->tree_sequence); + + if (x < 0 || x >= L) { + ret = TSK_ERR_SEEK_OUT_OF_BOUNDS; + goto out; + } + + if (self->index == -1) { + ret = tsk_tree_seek_from_null(self, x, options); + } else { + ret = tsk_tree_seek_linear(self, x, options); + } + +out: + return ret; +} + int TSK_WARN_UNUSED tsk_tree_clear(tsk_tree_t *self) { @@ -4411,6 +4758,8 @@ tsk_tree_clear(tsk_tree_t *self) tsk_memset(self->right_child, 0xff, N * sizeof(*self->right_child)); tsk_memset(self->left_sib, 0xff, N * sizeof(*self->left_sib)); tsk_memset(self->right_sib, 0xff, N * sizeof(*self->right_sib)); + tsk_memset(self->num_children, 0, N * sizeof(*self->num_children)); + tsk_memset(self->edge, 0xff, N * sizeof(*self->edge)); if (sample_counts) { tsk_memset(self->num_samples, 0, N * sizeof(*self->num_samples)); @@ -4735,6 +5084,208 @@ tsk_tree_sackin_index(const tsk_tree_t *self, tsk_size_t *result) return ret; } +int +tsk_tree_colless_index(const tsk_tree_t *self, tsk_size_t *result) +{ + int ret = 0; + const tsk_id_t *restrict right_child = self->right_child; + const tsk_id_t *restrict left_sib = self->left_sib; + tsk_id_t *nodes = tsk_malloc(tsk_tree_get_size_bound(self) * sizeof(*nodes)); + tsk_id_t *num_leaves = tsk_calloc(self->num_nodes, sizeof(*num_leaves)); + tsk_size_t j, num_nodes, total; + tsk_id_t num_children, u, v; + + if (nodes == NULL || num_leaves == NULL) { + ret = TSK_ERR_NO_MEMORY; + goto out; + } + if (tsk_tree_get_num_roots(self) != 1) { + ret = TSK_ERR_UNDEFINED_MULTIROOT; + goto out; + } + ret = tsk_tree_postorder(self, nodes, &num_nodes); + if (ret != 0) { + goto out; + } + + total = 0; + for (j = 0; j < num_nodes; j++) { + u = nodes[j]; + /* Cheaper to compute this on the fly than to access the num_children array. + * since we're already iterating over the children. 
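+         *
+         * The Colless index is the sum, over internal nodes, of the absolute
+         * difference in leaf counts between a node's two children, so it is
+         * only defined for binary, singly-rooted trees.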
*/ + num_children = 0; + for (v = right_child[u]; v != TSK_NULL; v = left_sib[v]) { + num_children++; + num_leaves[u] += num_leaves[v]; + } + if (num_children == 0) { + num_leaves[u] = 1; + } else if (num_children == 2) { + v = right_child[u]; + total += (tsk_size_t) llabs(num_leaves[v] - num_leaves[left_sib[v]]); + } else { + ret = TSK_ERR_UNDEFINED_NONBINARY; + goto out; + } + } + *result = total; +out: + tsk_safe_free(nodes); + tsk_safe_free(num_leaves); + return ret; +} + +int +tsk_tree_b1_index(const tsk_tree_t *self, double *result) +{ + int ret = 0; + const tsk_id_t *restrict parent = self->parent; + const tsk_id_t *restrict right_child = self->right_child; + const tsk_id_t *restrict left_sib = self->left_sib; + tsk_id_t *nodes = tsk_malloc(tsk_tree_get_size_bound(self) * sizeof(*nodes)); + tsk_size_t *max_path_length = tsk_calloc(self->num_nodes, sizeof(*max_path_length)); + tsk_size_t j, num_nodes, mpl; + double total = 0.0; + tsk_id_t u, v; + + if (nodes == NULL || max_path_length == NULL) { + ret = TSK_ERR_NO_MEMORY; + goto out; + } + ret = tsk_tree_postorder(self, nodes, &num_nodes); + if (ret != 0) { + goto out; + } + + for (j = 0; j < num_nodes; j++) { + u = nodes[j]; + if (parent[u] != TSK_NULL && right_child[u] != TSK_NULL) { + mpl = 0; + for (v = right_child[u]; v != TSK_NULL; v = left_sib[v]) { + mpl = TSK_MAX(mpl, max_path_length[v]); + } + max_path_length[u] = mpl + 1; + total += 1 / (double) max_path_length[u]; + } + } + *result = total; +out: + tsk_safe_free(nodes); + tsk_safe_free(max_path_length); + return ret; +} + +static double +general_log(double x, double base) +{ + return log(x) / log(base); +} + +int +tsk_tree_b2_index(const tsk_tree_t *self, double base, double *result) +{ + struct stack_elem { + tsk_id_t node; + double path_product; + }; + int ret = 0; + const tsk_id_t *restrict right_child = self->right_child; + const tsk_id_t *restrict left_sib = self->left_sib; + struct stack_elem *stack + = tsk_malloc(tsk_tree_get_size_bound(self) * sizeof(*stack)); + int stack_top; + double total_proba = 0; + double num_children; + tsk_id_t u; + struct stack_elem s = { .node = TSK_NULL, .path_product = 1 }; + + if (stack == NULL) { + ret = TSK_ERR_NO_MEMORY; + goto out; + } + if (tsk_tree_get_num_roots(self) != 1) { + ret = TSK_ERR_UNDEFINED_MULTIROOT; + goto out; + } + + stack_top = 0; + s.node = tsk_tree_get_left_root(self); + stack[stack_top] = s; + + while (stack_top >= 0) { + s = stack[stack_top]; + stack_top--; + u = right_child[s.node]; + if (u == TSK_NULL) { + total_proba -= s.path_product * general_log(s.path_product, base); + } else { + num_children = 0; + for (; u != TSK_NULL; u = left_sib[u]) { + num_children++; + } + s.path_product *= 1 / num_children; + for (u = right_child[s.node]; u != TSK_NULL; u = left_sib[u]) { + stack_top++; + s.node = u; + stack[stack_top] = s; + } + } + } + *result = total_proba; +out: + tsk_safe_free(stack); + return ret; +} + +int +tsk_tree_num_lineages(const tsk_tree_t *self, double t, tsk_size_t *result) +{ + int ret = 0; + const tsk_id_t *restrict right_child = self->right_child; + const tsk_id_t *restrict left_sib = self->left_sib; + const double *restrict time = self->tree_sequence->tables->nodes.time; + tsk_id_t *stack = tsk_tree_alloc_node_stack(self); + tsk_size_t num_lineages = 0; + int stack_top; + tsk_id_t u, v; + double child_time, parent_time; + + if (stack == NULL) { + ret = TSK_ERR_NO_MEMORY; + goto out; + } + if (!tsk_isfinite(t)) { + ret = TSK_ERR_TIME_NONFINITE; + goto out; + } + /* Push the roots onto the 
stack */ + stack_top = -1; + for (u = right_child[self->virtual_root]; u != TSK_NULL; u = left_sib[u]) { + stack_top++; + stack[stack_top] = u; + } + + while (stack_top >= 0) { + u = stack[stack_top]; + parent_time = time[u]; + stack_top--; + for (v = right_child[u]; v != TSK_NULL; v = left_sib[v]) { + child_time = time[v]; + /* Only traverse down the tree as far as we need to */ + if (child_time > t) { + stack_top++; + stack[stack_top] = v; + } else if (t < parent_time) { + num_lineages++; + } + } + } + *result = num_lineages; +out: + tsk_safe_free(stack); + return ret; +} + /* Parsimony methods */ static inline uint64_t @@ -4931,159 +5482,16 @@ tsk_tree_map_mutations(tsk_tree_t *self, int32_t *genotypes, return ret; } -/* ======================================================== * - * Tree diff iterator. - * ======================================================== */ - +/* Compatibility shim for initialising the diff iterator from a tree sequence. We are + * using this function in a small number of places internally, so simplest to keep it + * until a more satisfactory "diff" API comes along. + */ int TSK_WARN_UNUSED -tsk_diff_iter_init( +tsk_diff_iter_init_from_ts( tsk_diff_iter_t *self, const tsk_treeseq_t *tree_sequence, tsk_flags_t options) { - int ret = 0; - - tsk_bug_assert(tree_sequence != NULL); - tsk_memset(self, 0, sizeof(tsk_diff_iter_t)); - self->num_nodes = tsk_treeseq_get_num_nodes(tree_sequence); - self->num_edges = tsk_treeseq_get_num_edges(tree_sequence); - self->tree_sequence = tree_sequence; - self->insertion_index = 0; - self->removal_index = 0; - self->tree_left = 0; - self->tree_index = -1; - self->last_index = (tsk_id_t) tsk_treeseq_get_num_trees(tree_sequence); - if (options & TSK_INCLUDE_TERMINAL) { - self->last_index = self->last_index + 1; - } - self->edge_list_nodes = tsk_malloc(self->num_edges * sizeof(*self->edge_list_nodes)); - if (self->edge_list_nodes == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } -out: - return ret; -} - -int -tsk_diff_iter_free(tsk_diff_iter_t *self) -{ - tsk_safe_free(self->edge_list_nodes); - return 0; -} - -void -tsk_diff_iter_print_state(const tsk_diff_iter_t *self, FILE *out) -{ - fprintf(out, "tree_diff_iterator state\n"); - fprintf(out, "num_edges = %lld\n", (long long) self->num_edges); - fprintf(out, "insertion_index = %lld\n", (long long) self->insertion_index); - fprintf(out, "removal_index = %lld\n", (long long) self->removal_index); - fprintf(out, "tree_left = %f\n", self->tree_left); - fprintf(out, "tree_index = %lld\n", (long long) self->tree_index); -} - -int TSK_WARN_UNUSED -tsk_diff_iter_next(tsk_diff_iter_t *self, double *ret_left, double *ret_right, - tsk_edge_list_t *edges_out_ret, tsk_edge_list_t *edges_in_ret) -{ - int ret = 0; - tsk_id_t k; - const double sequence_length = self->tree_sequence->tables->sequence_length; - double left = self->tree_left; - double right = sequence_length; - tsk_size_t next_edge_list_node = 0; - const tsk_treeseq_t *s = self->tree_sequence; - tsk_edge_list_node_t *out_head = NULL; - tsk_edge_list_node_t *out_tail = NULL; - tsk_edge_list_node_t *in_head = NULL; - tsk_edge_list_node_t *in_tail = NULL; - tsk_edge_list_node_t *w = NULL; - tsk_edge_list_t edges_out; - tsk_edge_list_t edges_in; - const tsk_edge_table_t *edges = &s->tables->edges; - const tsk_id_t *insertion_order = s->tables->indexes.edge_insertion_order; - const tsk_id_t *removal_order = s->tables->indexes.edge_removal_order; - - tsk_memset(&edges_out, 0, sizeof(edges_out)); - tsk_memset(&edges_in, 0, 
sizeof(edges_in)); - - if (self->tree_index + 1 < self->last_index) { - /* First we remove the stale records */ - while (self->removal_index < (tsk_id_t) self->num_edges - && left == edges->right[removal_order[self->removal_index]]) { - k = removal_order[self->removal_index]; - tsk_bug_assert(next_edge_list_node < self->num_edges); - w = &self->edge_list_nodes[next_edge_list_node]; - next_edge_list_node++; - w->edge.id = k; - w->edge.left = edges->left[k]; - w->edge.right = edges->right[k]; - w->edge.parent = edges->parent[k]; - w->edge.child = edges->child[k]; - w->edge.metadata = edges->metadata + edges->metadata_offset[k]; - w->edge.metadata_length - = edges->metadata_offset[k + 1] - edges->metadata_offset[k]; - w->next = NULL; - w->prev = NULL; - if (out_head == NULL) { - out_head = w; - out_tail = w; - } else { - out_tail->next = w; - w->prev = out_tail; - out_tail = w; - } - self->removal_index++; - } - edges_out.head = out_head; - edges_out.tail = out_tail; - - /* Now insert the new records */ - while (self->insertion_index < (tsk_id_t) self->num_edges - && left == edges->left[insertion_order[self->insertion_index]]) { - k = insertion_order[self->insertion_index]; - tsk_bug_assert(next_edge_list_node < self->num_edges); - w = &self->edge_list_nodes[next_edge_list_node]; - next_edge_list_node++; - w->edge.id = k; - w->edge.left = edges->left[k]; - w->edge.right = edges->right[k]; - w->edge.parent = edges->parent[k]; - w->edge.child = edges->child[k]; - w->edge.metadata = edges->metadata + edges->metadata_offset[k]; - w->edge.metadata_length - = edges->metadata_offset[k + 1] - edges->metadata_offset[k]; - w->next = NULL; - w->prev = NULL; - if (in_head == NULL) { - in_head = w; - in_tail = w; - } else { - in_tail->next = w; - w->prev = in_tail; - in_tail = w; - } - self->insertion_index++; - } - edges_in.head = in_head; - edges_in.tail = in_tail; - - right = sequence_length; - if (self->insertion_index < (tsk_id_t) self->num_edges) { - right = TSK_MIN(right, edges->left[insertion_order[self->insertion_index]]); - } - if (self->removal_index < (tsk_id_t) self->num_edges) { - right = TSK_MIN(right, edges->right[removal_order[self->removal_index]]); - } - self->tree_index++; - ret = TSK_TREE_OK; - } - *edges_out_ret = edges_out; - *edges_in_ret = edges_in; - *ret_left = left; - *ret_right = right; - /* Set the left coordinate for the next tree */ - self->tree_left = right; - return ret; + return tsk_diff_iter_init( + self, tree_sequence->tables, (tsk_id_t) tree_sequence->num_trees, options); } /* ======================================================== * @@ -5514,7 +5922,7 @@ tsk_treeseq_kc_distance(const tsk_treeseq_t *self, const tsk_treeseq_t *other, if (ret != 0) { goto out; } - ret = tsk_diff_iter_init(&diff_iters[i], treeseqs[i], false); + ret = tsk_diff_iter_init_from_ts(&diff_iters[i], treeseqs[i], false); if (ret != 0) { goto out; } diff --git a/treerec/tskit/trees.h b/treerec/tskit/trees.h index 60b011c66..efe998007 100644 --- a/treerec/tskit/trees.h +++ b/treerec/tskit/trees.h @@ -1,7 +1,7 @@ /* * MIT License * - * Copyright (c) 2019-2022 Tskit Developers + * Copyright (c) 2019-2023 Tskit Developers * Copyright (c) 2015-2018 University of Oxford * * Permission is hereby granted, free of charge, to any person obtaining a copy @@ -59,9 +59,6 @@ extern "C" { #define TSK_DIR_FORWARD 1 #define TSK_DIR_REVERSE -1 -/* For the edge diff iterator */ -#define TSK_INCLUDE_TERMINAL (1 << 0) - /** @defgroup API_FLAGS_TS_INIT_GROUP :c:func:`tsk_treeseq_init` specific flags. 
@{ @@ -89,6 +86,9 @@ typedef struct { bool discrete_genome; /* Are all time values discrete? */ bool discrete_time; + /* Min and max time in node table and mutation table */ + double min_time; + double max_time; /* Breakpoints along the sequence, including 0 and L. */ double *breakpoints; /* If a node is a sample, map to its index in the samples list */ @@ -172,6 +172,17 @@ typedef struct { ``TSK_NULL`` if node u has no siblings to its right. */ tsk_id_t *right_sib; + /** + @brief The number of children of node u is num_children[u]. + */ + tsk_id_t *num_children; + /** + @brief Array of edge ids where ``edge[u]`` is the edge that encodes the + relationship between the child node ``u`` and its parent. Equal to + ``TSK_NULL`` if node ``u`` is a root, virtual root or is not a node in the + current tree. + */ + tsk_id_t *edge; /** @brief The total number of edges defining the topology of this tree. This is equal to the number of tree sequence edges that intersect with @@ -247,30 +258,6 @@ typedef struct { tsk_id_t right_index; } tsk_tree_t; -/* Diff iterator. */ -typedef struct _tsk_edge_list_node_t { - tsk_edge_t edge; - struct _tsk_edge_list_node_t *next; - struct _tsk_edge_list_node_t *prev; -} tsk_edge_list_node_t; - -typedef struct { - tsk_edge_list_node_t *head; - tsk_edge_list_node_t *tail; -} tsk_edge_list_t; - -typedef struct { - tsk_size_t num_nodes; - tsk_size_t num_edges; - double tree_left; - const tsk_treeseq_t *tree_sequence; - tsk_id_t insertion_index; - tsk_id_t removal_index; - tsk_id_t tree_index; - tsk_id_t last_index; - tsk_edge_list_node_t *edge_list_nodes; -} tsk_diff_iter_t; - /****************************************************************************/ /* Tree sequence.*/ /****************************************************************************/ @@ -697,7 +684,7 @@ Returns the location of each node in the list of samples or @endrst @param self A pointer to a tsk_treeseq_t object. -@return Returns the pointer to the breakpoint array. +@return Returns the pointer to the array of sample indexes. */ const tsk_id_t *tsk_treeseq_get_sample_index_map(const tsk_treeseq_t *self); @@ -737,6 +724,30 @@ then this flag will be true */ bool tsk_treeseq_get_discrete_time(const tsk_treeseq_t *self); +/** +@brief Get the min time in node table and mutation table + +@rst +The times stored in both the node and mutation tables are considered. +@endrst + +@param self A pointer to a tsk_treeseq_t object. +@return Returns the min time of all nodes and mutations. +*/ +double tsk_treeseq_get_min_time(const tsk_treeseq_t *self); + +/** +@brief Get the max time in node table and mutation table + +@rst +The times stored in both the node and mutation tables are considered. +@endrst + +@param self A pointer to a tsk_treeseq_t object. +@return Returns the max time of all nodes and mutations. 
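
Editor's usage sketch (not part of the upstream documentation), assuming an
initialised tree sequence ``ts``:

    double youngest = tsk_treeseq_get_min_time(&ts);
    double oldest = tsk_treeseq_get_max_time(&ts);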
+*/ +double tsk_treeseq_get_max_time(const tsk_treeseq_t *self); + /** @brief Get a node by its index @@ -882,8 +893,15 @@ int tsk_treeseq_simplify(const tsk_treeseq_t *self, const tsk_id_t *samples, /** @} */ +int tsk_treeseq_split_edges(const tsk_treeseq_t *self, double time, tsk_flags_t flags, + tsk_id_t population, const char *metadata, tsk_size_t metadata_length, + tsk_flags_t options, tsk_treeseq_t *output); + bool tsk_treeseq_has_reference_sequence(const tsk_treeseq_t *self); +int tsk_treeseq_get_individuals_population(const tsk_treeseq_t *self, tsk_id_t *output); +int tsk_treeseq_get_individuals_time(const tsk_treeseq_t *self, double *output); + int tsk_treeseq_kc_distance(const tsk_treeseq_t *self, const tsk_treeseq_t *other, double lambda_, double *result); @@ -1069,10 +1087,6 @@ int tsk_tree_copy(const tsk_tree_t *self, tsk_tree_t *dest, tsk_flags_t options) @{ */ -/** @brief Value returned by seeking methods when they have successfully - seeked to a non-null tree. */ -#define TSK_TREE_OK 1 - /** @brief Seek to the first tree in the sequence. @@ -1178,12 +1192,6 @@ we will have ``position < tree.interval.right``. Seeking to a position currently covered by the tree is a constant time operation. - -.. warning:: - The current implementation of ``seek`` does **not** provide efficient - random access to arbitrary positions along the genome. However, - sequentially seeking in either direction is as efficient as calling - :c:func:`tsk_tree_next` or :c:func:`tsk_tree_prev` directly. @endrst @param self A pointer to an initialised tsk_tree_t object. @@ -1194,6 +1202,22 @@ a constant time operation. */ int tsk_tree_seek(tsk_tree_t *self, double position, tsk_flags_t options); +/** +@brief Seek to a specific tree in a tree sequence. + +@rst +Set the state of this tree to reflect the tree in parent +tree sequence whose index is ``0 <= tree < num_trees``. +@endrst + +@param self A pointer to an initialised tsk_tree_t object. +@param tree The target tree index. +@param options Seek options. Currently unused. Set to 0 for compatibility + with future versions of tskit. +@return Return 0 on success or a negative value on failure. +*/ +int tsk_tree_seek_index(tsk_tree_t *self, tsk_id_t tree, tsk_flags_t options); + /** @} */ /** @@ -1676,6 +1700,15 @@ int tsk_tree_kc_distance( /* Don't document these balance metrics for now so it doesn't get in the way of * C API 1.0, but should be straightforward to document based on Python docs. */ int tsk_tree_sackin_index(const tsk_tree_t *self, tsk_size_t *result); +int tsk_tree_colless_index(const tsk_tree_t *self, tsk_size_t *result); +int tsk_tree_b1_index(const tsk_tree_t *self, double *result); +/* NOTE: if we document this as part of the C API we'll have to be more careful + * about the error behaviour on bad log bases. At the moment we're just returning + * the resulting value which can be nan, inf etc, but some surprising results + * happen like a base 0 seems to return a finite value. 
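 * Editor's usage sketch (not part of the upstream header), assuming an
 * initialised tsk_tree_t ``tree`` positioned on a single-rooted binary tree;
 * the calls match the declarations below and return 0 on success:
 *
 *     tsk_size_t colless, lineages;
 *     double b1, b2;
 *     int ret = tsk_tree_colless_index(&tree, &colless);    // binary trees only
 *     ret = tsk_tree_b1_index(&tree, &b1);
 *     ret = tsk_tree_b2_index(&tree, 10, &b2);              // base-10 logs
 *     ret = tsk_tree_num_lineages(&tree, 100.0, &lineages); // lineages at time 100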
*/ +int tsk_tree_b2_index(const tsk_tree_t *self, double base, double *result); + +int tsk_tree_num_lineages(const tsk_tree_t *self, double t, tsk_size_t *result); /* Things to consider removing: */ @@ -1688,16 +1721,8 @@ bool tsk_tree_is_sample(const tsk_tree_t *self, tsk_id_t u); */ bool tsk_tree_equals(const tsk_tree_t *self, const tsk_tree_t *other); -/****************************************************************************/ -/* Diff iterator */ -/****************************************************************************/ - -int tsk_diff_iter_init( +int tsk_diff_iter_init_from_ts( tsk_diff_iter_t *self, const tsk_treeseq_t *tree_sequence, tsk_flags_t options); -int tsk_diff_iter_free(tsk_diff_iter_t *self); -int tsk_diff_iter_next(tsk_diff_iter_t *self, double *left, double *right, - tsk_edge_list_t *edges_out, tsk_edge_list_t *edges_in); -void tsk_diff_iter_print_state(const tsk_diff_iter_t *self, FILE *out); #ifdef __cplusplus }
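
Editor's note: the sketch below is an editorial illustration of the new seek entry
points declared in this header; it is not part of the patch. It assumes the tskit
headers are on the include path, that ``ts`` has already been loaded (for example
with tsk_treeseq_load), and it shows only minimal error handling. The function name
example_seek is hypothetical.

    #include <tskit/trees.h>

    static int
    example_seek(const tsk_treeseq_t *ts)
    {
        tsk_tree_t tree;
        double x = tsk_treeseq_get_sequence_length(ts) / 2;
        int ret = tsk_tree_init(&tree, ts, 0);

        if (ret != 0) {
            goto out;
        }
        /* Seeking on a freshly initialised tree now builds the requested tree
         * directly from the edge index arrays instead of sweeping from the
         * start of the sequence. */
        ret = tsk_tree_seek(&tree, x, 0);
        if (ret != 0) {
            goto out;
        }
        /* Seek by tree index rather than by genome position. */
        ret = tsk_tree_seek_index(&tree, 0, 0);
    out:
        tsk_tree_free(&tree);
        return ret;
    }

Callers that used the diff-iterator API removed from this header (tsk_diff_iter_init,
tsk_diff_iter_next, tsk_diff_iter_free and TSK_INCLUDE_TERMINAL) will need updating:
the library itself now goes through the tsk_diff_iter_init_from_ts shim, and the
underlying iterator appears to live at the table level in this tskit version (not
shown in this excerpt).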