Skip to content

Commit

Permalink
Merge pull request #3676 from vgteam/distance-minimizer-caching
Browse files Browse the repository at this point in the history
Distance Index 2
  • Loading branch information
xchang1 authored Jun 1, 2022
2 parents 096bfdc + 34e8eba commit 2c36d2c
Show file tree
Hide file tree
Showing 32 changed files with 15,823 additions and 2,303 deletions.
2 changes: 1 addition & 1 deletion deps/structures
28 changes: 21 additions & 7 deletions src/index_registry.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
#include "transcriptome.hpp"
#include "integrated_snarl_finder.hpp"
#include "min_distance.hpp"
#include "snarl_distance_index.hpp"
#include "gfa.hpp"
#include "job_schedule.hpp"
#include "path.hpp"
Expand Down Expand Up @@ -435,7 +436,7 @@ IndexRegistry VGIndexes::get_vg_index_registry() {
registry.register_index("Giraffe Snarls", "snarls"); // snarls that may reflect the GFA->GBZ node IDs
registry.register_index("Spliced Snarls", "spliced.snarls");

registry.register_index("Giraffe Distance Index", "dist"); // distances that may reflect the GFA->GBZ node IDs
registry.register_index("Giraffe Distance Index", "dist");
registry.register_index("Spliced Distance Index", "spliced.dist");

registry.register_index("GBWTGraph", "gg");
Expand Down Expand Up @@ -3358,6 +3359,7 @@ IndexRegistry VGIndexes::get_vg_index_registry() {
// Distance Index Recipes
////////////////////////////////////


// meta-recipe to make distance index
auto make_distance_index = [](const HandleGraph& graph,
const IndexFile* snarl_input,
Expand Down Expand Up @@ -3630,22 +3632,34 @@ IndexRegistry VGIndexes::get_vg_index_registry() {
auto minimizer_output = *constructing.begin();
auto& output_names = all_outputs[0];

ifstream infile_dist;
init_in(infile_dist, dist_filename);
auto dist_index = vg::io::VPKG::load_one<MinimumDistanceIndex>(infile_dist);

ifstream infile_gbz;
init_in(infile_gbz, gbz_filename);
auto gbz = vg::io::VPKG::load_one<gbwtgraph::GBZ>(infile_gbz);


ifstream infile_dist;
init_in(infile_dist, dist_filename);
SnarlDistanceIndex new_distance_index;
std::unique_ptr<MinimumDistanceIndex> old_distance_index;
bool use_new_distance_index = false;
if (vg::io::MessageIterator::sniff_tag(infile_dist) == "DISTANCE") {
old_distance_index = vg::io::VPKG::load_one<MinimumDistanceIndex>(infile_dist);
} else {
new_distance_index.deserialize(dist_filename);
use_new_distance_index = true;
}
gbwtgraph::DefaultMinimizerIndex minimizers(IndexingParameters::minimizer_k,
IndexingParameters::use_bounded_syncmers ?
IndexingParameters::minimizer_s :
IndexingParameters::minimizer_w,
IndexingParameters::use_bounded_syncmers);

gbwtgraph::index_haplotypes(gbz->graph, minimizers, [&](const pos_t& pos) -> gbwtgraph::payload_type {
return MIPayload::encode(dist_index->get_minimizer_distances(pos));
if (use_new_distance_index) {
return MIPayload::encode(get_minimizer_distances(new_distance_index, pos));
} else {
return MinimumDistanceIndex::MIPayload::encode(old_distance_index->get_minimizer_distances(pos));
}
});

string output_name = plan->output_filepath(minimizer_output);
Expand Down Expand Up @@ -3681,8 +3695,8 @@ vector<IndexName> VGIndexes::get_default_mpmap_indexes() {

vector<IndexName> VGIndexes::get_default_giraffe_indexes() {
vector<IndexName> indexes{
"Giraffe GBZ",
"Giraffe Distance Index",
"Giraffe GBZ",
"Minimizers"
};
return indexes;
Expand Down
23 changes: 12 additions & 11 deletions src/min_distance.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,8 @@ MinimumDistanceIndex::MinimumDistanceIndex (istream& in) : MinimumDistanceIndex(

void MinimumDistanceIndex::load(istream& in){
//Load serialized index from an istream



//Check the file's header to make sure it's the correct version
if (!in) {
throw runtime_error("Could not load distance index");
Expand All @@ -278,13 +279,15 @@ void MinimumDistanceIndex::load(istream& in){
size_t char_index = 0;
//TODO: We're only checking up to the last two so if the header changes this needs to change too
while (in.peek() != EOF && char_index < file_header.size()-2) {
if ( (char) in.get() != file_header[char_index]) {
char next = (char) in.get();
if ( next != file_header[char_index]) {
throw runtime_error ("Distance index file is outdated");
}
char_index ++;
}
if (in.peek() == '.') {
if ((char) in.get() != '.' || (char)in.get() != '2') {
char next = (char) in.get();
if (next != '.' || (char)in.get() != '2') {
throw runtime_error ("Distance index file is outdated");
}
include_component = true;
Expand Down Expand Up @@ -2748,7 +2751,7 @@ tuple<bool, size_t, size_t, bool, size_t, size_t, size_t, size_t, bool> MinimumD
size_t offset = chain_rank == 0 ? 0 : chain_indexes[component_to_chain_index[component-1]].prefix_sum[chain_rank] - 1;

return tuple<bool, size_t, size_t, bool, size_t, size_t, size_t, size_t, bool>(true, component, offset + node_offset,
false, MIPayload::NO_VALUE, MIPayload::NO_VALUE, MIPayload::NO_VALUE, MIPayload::NO_VALUE, false);
false, MinimumDistanceIndex::MIPayload::NO_VALUE, MinimumDistanceIndex::MIPayload::NO_VALUE, MinimumDistanceIndex::MIPayload::NO_VALUE, MinimumDistanceIndex::MIPayload::NO_VALUE, false);

} else if (component != 0 && snarl_index.depth == 0 && snarl_index.in_chain && snarl_index.is_simple_snarl && !chain_indexes[get_chain_assignment(snarl_index.id_in_parent)].is_looping_chain ) {
//This node is on a top-level simple snarl
Expand All @@ -2759,13 +2762,13 @@ tuple<bool, size_t, size_t, bool, size_t, size_t, size_t, size_t, bool> MinimumD
size_t node_len = snarl_index.node_length(snarl_rank);
bool rev_in_snarl = snarl_index.snarl_distance(0, snarl_rank) == -1;
bool is_rev = rev_in_snarl ? !snarl_index.rev_in_parent : snarl_index.rev_in_parent;
return tuple<bool, size_t, size_t, bool, size_t, size_t, size_t, size_t, bool>(false, MIPayload::NO_VALUE, MIPayload::NO_VALUE,
return tuple<bool, size_t, size_t, bool, size_t, size_t, size_t, size_t, bool>(false, MinimumDistanceIndex::MIPayload::NO_VALUE, MinimumDistanceIndex::MIPayload::NO_VALUE,
true, chain_rank, start_len, end_len, node_len, is_rev);

} else {
//If this is a nested position
return tuple<bool, size_t, size_t, bool, size_t, size_t, size_t, size_t, bool>(false, MIPayload::NO_VALUE, MIPayload::NO_VALUE,
false, MIPayload::NO_VALUE, MIPayload::NO_VALUE, MIPayload::NO_VALUE, MIPayload::NO_VALUE, false);
return tuple<bool, size_t, size_t, bool, size_t, size_t, size_t, size_t, bool>(false, MinimumDistanceIndex::MIPayload::NO_VALUE, MinimumDistanceIndex::MIPayload::NO_VALUE,
false, MinimumDistanceIndex::MIPayload::NO_VALUE, MinimumDistanceIndex::MIPayload::NO_VALUE, MinimumDistanceIndex::MIPayload::NO_VALUE, MinimumDistanceIndex::MIPayload::NO_VALUE, false);
}
}

Expand All @@ -2783,9 +2786,7 @@ size_t MinimumDistanceIndex::get_connected_component(id_t node_id) {
return node_to_component[node_id-min_node_id];
}

constexpr MIPayload::code_type MIPayload::NO_CODE;
constexpr size_t MIPayload::NO_VALUE;
constexpr size_t MIPayload::ID_OFFSET;
constexpr MIPayload::code_type MIPayload::OFFSET_MASK;
constexpr MinimumDistanceIndex::MIPayload::code_type MinimumDistanceIndex::MIPayload::NO_CODE;
constexpr size_t MinimumDistanceIndex::MIPayload::NO_VALUE;

}
26 changes: 14 additions & 12 deletions src/min_distance.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -509,8 +509,9 @@ class MinimumDistanceIndex {
friend class ChainIndex;
friend class SnarlSeedClusterer;

public:
/******************* MIPayload redefinded ***********************/

};

/**
* The encoding of distances for positions in top-level chains or top-level simple bubbles.
Expand All @@ -519,17 +520,15 @@ class MinimumDistanceIndex {
* We store this information in the minimizer index.
*/
/*
Simple bubble:
Simple bubble:
8 bit | 1 | 24 | 10 | 10 | 10 | 1
--- | is rev | snarl rank in chain | start len | end len | node len | is_node
Top level chain
31 bit | 32 | 1
component id | offset | is_node
Top level chain
31 bit | 32 | 1
component id | offset | is_node
is_node is true if it is a top-level chain node, false if it is a simple bubble
*/

Expand All @@ -545,14 +544,14 @@ struct MIPayload {
constexpr static size_t RANK_OFFSET = 31;
constexpr static size_t REV_OFFSET = 55;


constexpr static size_t LENGTH_WIDTH = 10;
constexpr static size_t RANK_WIDTH = 24;
constexpr static code_type LENGTH_MASK = (static_cast<code_type>(1) << LENGTH_WIDTH) - 1;
constexpr static code_type RANK_MASK = (static_cast<code_type>(1) << RANK_WIDTH) - 1;





constexpr static size_t ID_OFFSET = 33;
constexpr static size_t ID_WIDTH = 31;
constexpr static size_t OFFSET_WIDTH = 32;
Expand All @@ -570,7 +569,7 @@ struct MIPayload {
} else if (is_top_level_node) {
//Top level node in chain

if (component >= (static_cast<code_type>(1) << 31) - 1
if (component >= (static_cast<code_type>(1) << 31) - 1
|| offset >= static_cast<size_t>(OFFSET_MASK) ) {
//If the values are too large to be stored
return NO_CODE;
Expand Down Expand Up @@ -613,6 +612,9 @@ struct MIPayload {
}
};


};

}

#endif
Loading

2 comments on commit 2c36d2c

@adamnovak
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for merge to master. View the full report here.

13 tests passed, 0 tests failed and 0 tests skipped in 6134 seconds

@adamnovak
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for branch v1.41.0. View the full report here.

11 tests passed, 0 tests failed and 0 tests skipped in 4852 seconds

Please sign in to comment.