From 0620d3b11e2acd2cf5db06a56cd04cba675fd001 Mon Sep 17 00:00:00 2001 From: Ben Haller Date: Tue, 12 Dec 2023 21:33:31 -0500 Subject: [PATCH 1/3] add node pedigree ID check --- core/species.cpp | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ core/species.h | 1 + 2 files changed, 49 insertions(+) diff --git a/core/species.cpp b/core/species.cpp index 17851581..25db82f5 100644 --- a/core/species.cpp +++ b/core/species.cpp @@ -8398,6 +8398,51 @@ void Species::__AddMutationsFromTreeSequenceToGenomes(std::unordered_mapgenome_id_; + slim_pedigreeid_t pedigree_id = genome_id / 2; // rounds down to integer + + if (pedigree_id >= gSLiM_next_pedigree_id) + { + static bool been_here = false; + + if (!been_here) + { + // if we do see this condition, let's emit a warning for right now; I'd like to know whether this ever happens + p_interpreter->ErrorOutputStream() << "#WARNING (Species::__CheckNodePedigreeIDs): in reading the tree sequence, a node was encountered with a genome pedigree ID that was (after division by 2) greater than or equal to the next pedigree ID; this is not expected to happen." << std::endl; + been_here = true; + } + + gSLiM_next_pedigree_id = pedigree_id + 1; + } + } + } + } +} + void Species::_InstantiateSLiMObjectsFromTables(EidosInterpreter *p_interpreter, slim_tick_t p_metadata_tick, slim_tick_t p_metadata_cycle, SLiMModelType p_file_model_type, int p_file_version, SUBPOP_REMAP_HASH &p_subpop_map) { // set the tick and cycle from the provenance data @@ -8464,6 +8509,9 @@ void Species::_InstantiateSLiMObjectsFromTables(EidosInterpreter *p_interpreter, if (ret != 0) handle_error("_InstantiateSLiMObjectsFromTables tsk_treeseq_free()", ret); free(ts); + // Ensure that the next pedigree ID used will not cause a collision with any existing nodes in the node table + __CheckNodePedigreeIDs(p_interpreter); + // Set up the remembered genomes by looking though the list of nodes and their individuals if (remembered_genomes_.size() != 0) EIDOS_TERMINATION << "ERROR (Species::_InstantiateSLiMObjectsFromTables): (internal error) remembered_genomes_ is not empty." << EidosTerminate(); diff --git a/core/species.h b/core/species.h index 728ce458..2dc8c44d 100644 --- a/core/species.h +++ b/core/species.h @@ -605,6 +605,7 @@ class Species : public EidosDictionaryUnretained void __TallyMutationReferencesWithTreeSequence(std::unordered_map &p_mutMap, std::unordered_map p_nodeToGenomeMap, tsk_treeseq_t *p_ts); void __CreateMutationsFromTabulation(std::unordered_map &p_mutInfoMap, std::unordered_map &p_mutIndexMap); void __AddMutationsFromTreeSequenceToGenomes(std::unordered_map &p_mutIndexMap, std::unordered_map p_nodeToGenomeMap, tsk_treeseq_t *p_ts); + void __CheckNodePedigreeIDs(EidosInterpreter *p_interpreter); void _InstantiateSLiMObjectsFromTables(EidosInterpreter *p_interpreter, slim_tick_t p_metadata_tick, slim_tick_t p_metadata_cycle, SLiMModelType p_file_model_type, int p_file_version, SUBPOP_REMAP_HASH &p_subpop_map); // given tree-seq tables, makes individuals, genomes, and mutations slim_tick_t _InitializePopulationFromTskitTextFile(const char *p_file, EidosInterpreter *p_interpreter, SUBPOP_REMAP_HASH &p_subpop_map); // initialize the population from an tskit text file slim_tick_t _InitializePopulationFromTskitBinaryFile(const char *p_file, EidosInterpreter *p_interpreter, SUBPOP_REMAP_HASH &p_subpop_remap); // initialize the population from an tskit binary file From 8400e42bedc2d0ec57bd7c40c9b5ac249deb9478 Mon Sep 17 00:00:00 2001 From: Ben Haller Date: Wed, 13 Dec 2023 23:44:11 -0500 Subject: [PATCH 2/3] add check for duplicate genome pedigree IDs --- core/species.cpp | 75 +++++++++++++++++++++++++++++------------------- 1 file changed, 45 insertions(+), 30 deletions(-) diff --git a/core/species.cpp b/core/species.cpp index 25db82f5..43262410 100644 --- a/core/species.cpp +++ b/core/species.cpp @@ -7684,6 +7684,7 @@ void Species::__CreateSubpopulationsFromTabulation(std::unordered_map pedigree_id_check; gSLiM_next_pedigree_id = 0; @@ -7809,12 +7810,12 @@ void Species::__CreateSubpopulationsFromTabulation(std::unordered_map genome_id_check; - if (model_type_ == SLiMModelType::kModelTypeNonWF) + for (tsk_size_t j = 0; (size_t)j < node_count; j++) { - for (tsk_size_t j = 0; (size_t) j < node_count; j++) + tsk_size_t offset1 = node_table.metadata_offset[j]; + tsk_size_t offset2 = node_table.metadata_offset[j + 1]; + tsk_size_t length = (offset2 - offset1); + + if (length) // allow nodes with no metadata, which might be carryover non-SLiM nodes { - tsk_size_t offset1 = node_table.metadata_offset[j]; - tsk_size_t offset2 = node_table.metadata_offset[j + 1]; - tsk_size_t length = (offset2 - offset1); + // but if a node has metadata, it must be SLiM metadata, I think; our schema guarantees that + if (length != sizeof(GenomeMetadataRec)) + EIDOS_TERMINATION << "ERROR (Species::__CheckNodePedigreeIDs): unexpected node metadata length; this file cannot be read." << EidosTerminate(); + + // get the metadata record and check the genome pedigree ID + GenomeMetadataRec *metadata_rec = (GenomeMetadataRec *)(node_table.metadata + offset1); + slim_genomeid_t genome_id = metadata_rec->genome_id_; - if (length) // allow nodes with no metadata, which might be carryover non-SLiM nodes + genome_id_check.emplace_back(genome_id); // we will test for collisions below + + slim_pedigreeid_t pedigree_id = genome_id / 2; // rounds down to integer + + if (pedigree_id >= gSLiM_next_pedigree_id) { - // but if a node has metadata, it must be SLiM metadata, I think; our schema guarantees that - if (length != sizeof(GenomeMetadataRec)) - EIDOS_TERMINATION << "ERROR (Species::__CheckNodePedigreeIDs): unexpected node metadata length; this file cannot be read." << EidosTerminate(); - - // get the metadata record and check the genome pedigree ID - GenomeMetadataRec *metadata_rec = (GenomeMetadataRec *)(node_table.metadata + offset1); - slim_genomeid_t genome_id = metadata_rec->genome_id_; - slim_pedigreeid_t pedigree_id = genome_id / 2; // rounds down to integer + static bool been_here = false; - if (pedigree_id >= gSLiM_next_pedigree_id) + if (!been_here) { - static bool been_here = false; - - if (!been_here) - { - // if we do see this condition, let's emit a warning for right now; I'd like to know whether this ever happens - p_interpreter->ErrorOutputStream() << "#WARNING (Species::__CheckNodePedigreeIDs): in reading the tree sequence, a node was encountered with a genome pedigree ID that was (after division by 2) greater than or equal to the next pedigree ID; this is not expected to happen." << std::endl; - been_here = true; - } - - gSLiM_next_pedigree_id = pedigree_id + 1; + // not sure whether this ought to be a warning at all; but the warning message can try to make it clear that it isn't necessarily an error... + p_interpreter->ErrorOutputStream() << "#WARNING (Species::__CheckNodePedigreeIDs): in reading the tree sequence, a node was encountered with a genome pedigree ID that was (after division by 2) greater than the largest individual pedigree ID in the tree sequence. This is not necessarily an error, but it is highly unusual, and could indicate that the tree sequence file is corrupted. It typically only happens if an unsimplified tree sequence is being loaded, and even then only under certain conditions." << std::endl; + been_here = true; } + + gSLiM_next_pedigree_id = pedigree_id + 1; } } } + + // Check for genome pedigree ID collisions by sorting and looking for duplicates + std::sort(genome_id_check.begin(), genome_id_check.end()); + const auto duplicate = std::adjacent_find(genome_id_check.begin(), genome_id_check.end()); + + if (duplicate != genome_id_check.end()) + EIDOS_TERMINATION << "ERROR (Species::__CheckNodePedigreeIDs): the genome pedigree ID value " << *duplicate << " was used more than once; genome pedigree IDs must be unique." << EidosTerminate(); } void Species::_InstantiateSLiMObjectsFromTables(EidosInterpreter *p_interpreter, slim_tick_t p_metadata_tick, slim_tick_t p_metadata_cycle, SLiMModelType p_file_model_type, int p_file_version, SUBPOP_REMAP_HASH &p_subpop_map) @@ -8509,7 +8523,8 @@ void Species::_InstantiateSLiMObjectsFromTables(EidosInterpreter *p_interpreter, if (ret != 0) handle_error("_InstantiateSLiMObjectsFromTables tsk_treeseq_free()", ret); free(ts); - // Ensure that the next pedigree ID used will not cause a collision with any existing nodes in the node table + // Ensure that the next pedigree ID used will not cause a collision with any existing nodes in the node table, + // and that there are no duplicate node pedigree IDs in the input file (whether in use or not). __CheckNodePedigreeIDs(p_interpreter); // Set up the remembered genomes by looking though the list of nodes and their individuals From 74354bf1272a02ee025888f8057d79543f569d3e Mon Sep 17 00:00:00 2001 From: Ben Haller Date: Thu, 14 Dec 2023 19:06:00 -0500 Subject: [PATCH 3/3] final node pedigree policy on metadata, id warning --- core/species.cpp | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/core/species.cpp b/core/species.cpp index 43262410..7d177c34 100644 --- a/core/species.cpp +++ b/core/species.cpp @@ -8419,12 +8419,9 @@ void Species::__CheckNodePedigreeIDs(EidosInterpreter *p_interpreter) tsk_size_t offset2 = node_table.metadata_offset[j + 1]; tsk_size_t length = (offset2 - offset1); - if (length) // allow nodes with no metadata, which might be carryover non-SLiM nodes + // allow nodes with other types of metadata; but if the metadata length metches ours, we have to assume it's ours + if (length == sizeof(GenomeMetadataRec)) { - // but if a node has metadata, it must be SLiM metadata, I think; our schema guarantees that - if (length != sizeof(GenomeMetadataRec)) - EIDOS_TERMINATION << "ERROR (Species::__CheckNodePedigreeIDs): unexpected node metadata length; this file cannot be read." << EidosTerminate(); - // get the metadata record and check the genome pedigree ID GenomeMetadataRec *metadata_rec = (GenomeMetadataRec *)(node_table.metadata + offset1); slim_genomeid_t genome_id = metadata_rec->genome_id_; @@ -8439,8 +8436,8 @@ void Species::__CheckNodePedigreeIDs(EidosInterpreter *p_interpreter) if (!been_here) { - // not sure whether this ought to be a warning at all; but the warning message can try to make it clear that it isn't necessarily an error... - p_interpreter->ErrorOutputStream() << "#WARNING (Species::__CheckNodePedigreeIDs): in reading the tree sequence, a node was encountered with a genome pedigree ID that was (after division by 2) greater than the largest individual pedigree ID in the tree sequence. This is not necessarily an error, but it is highly unusual, and could indicate that the tree sequence file is corrupted. It typically only happens if an unsimplified tree sequence is being loaded, and even then only under certain conditions." << std::endl; + // decided to keep this as a warning; this circumstance is not necessarily pathological, but it probably usually is... + p_interpreter->ErrorOutputStream() << "#WARNING (Species::__CheckNodePedigreeIDs): in reading the tree sequence, a node was encountered with a genome pedigree ID that was (after division by 2) greater than the largest individual pedigree ID in the tree sequence. This is not necessarily an error, but it is highly unusual, and could indicate that the tree sequence file is corrupted. It may happen due to external manipulations of a tree sequence, or perhaps if an unsimplified tree sequence produced by SLiM is being loaded." << std::endl; been_here = true; }