From fd9766e17cd5f0a2a957d47bb542310beb5a2db3 Mon Sep 17 00:00:00 2001 From: Jordan Eizenga Date: Mon, 16 Dec 2024 15:26:22 -0500 Subject: [PATCH 1/4] add subcommand to mask sequences --- deps/libbdsg | 2 +- deps/libhandlegraph | 2 +- src/masker.cpp | 393 +++++++++++++++++++++++++++++++++++ src/masker.hpp | 57 +++++ src/subcommand/mask_main.cpp | 177 ++++++++++++++++ src/unittest/masker.cpp | 388 ++++++++++++++++++++++++++++++++++ test/t/55_vg_mask.t | 23 ++ 7 files changed, 1040 insertions(+), 2 deletions(-) create mode 100644 src/masker.cpp create mode 100644 src/masker.hpp create mode 100644 src/subcommand/mask_main.cpp create mode 100644 src/unittest/masker.cpp create mode 100644 test/t/55_vg_mask.t diff --git a/deps/libbdsg b/deps/libbdsg index 6cff67667e..076e21a58c 160000 --- a/deps/libbdsg +++ b/deps/libbdsg @@ -1 +1 @@ -Subproject commit 6cff67667eefaf1a573cd3b64777d2d78ad2a572 +Subproject commit 076e21a58cb76fcb4a8b58d1322786a6e0cfc1b3 diff --git a/deps/libhandlegraph b/deps/libhandlegraph index ccc1145901..e53d64fbef 160000 --- a/deps/libhandlegraph +++ b/deps/libhandlegraph @@ -1 +1 @@ -Subproject commit ccc11459019680ae53b65bcc19077d41ecff3075 +Subproject commit e53d64fbef0e729414613be3456057c0523534c4 diff --git a/src/masker.cpp b/src/masker.cpp new file mode 100644 index 0000000000..675b4a6f8d --- /dev/null +++ b/src/masker.cpp @@ -0,0 +1,393 @@ +#include "masker.hpp" + +#include +#include + +#include "integrated_snarl_finder.hpp" + +//#define debug + +namespace vg { + +Masker::Masker(gbwtgraph::GBWTGraph& _graph, const SnarlManager* _snarl_manager) : graph(&_graph), snarl_manager(_snarl_manager) { + if (!snarl_manager) { + init_snarls(); + } +} + +Masker::Masker(MutablePathDeletableHandleGraph& _graph, const SnarlManager* _snarl_manager) : graph(&_graph), snarl_manager(_snarl_manager) { + if (!snarl_manager) { + init_snarls(); + } +} + +void Masker::init_snarls() { + + // construct our own snarl finder + IntegratedSnarlFinder snarl_finder(*graph); + own_snarl_manager.reset(new SnarlManager(std::move(snarl_finder.find_snarls_parallel()))); + snarl_manager = own_snarl_manager.get(); +} + +void Masker::mask_sequences(const std::vector>& regions) const { + +#ifdef debug + std::cerr << "input regions:" << std::endl; + for (const auto& region : regions) { + std::cerr << '\t' << std::get<0>(region) << '\t' << std::get<1>(region) << '\t' << std::get<2>(region) << std::endl; + } +#endif + + auto sorted_regions = regions; + std::sort(sorted_regions.begin(), sorted_regions.end()); + + // consolidate the regions in place + size_t i = 0; + for (size_t j = 1; j < sorted_regions.size(); ++j) { + auto& into = sorted_regions[i]; + auto& here = sorted_regions[j]; + if (std::get<0>(here) != std::get<0>(into) || std::get<1>(here) > std::get<2>(into)) { + // they do not overlap + ++i; + if (i != j) { + // move merged the next interval into the prefix of the set + sorted_regions[i] = std::move(here); + } + } + else { + // they overlap + std::get<2>(into) = std::max(std::get<2>(into), std::get<2>(here)); + } + } + // trim off remnants of the intervals that were merged away + if (i + 1 < sorted_regions.size()) { + sorted_regions.resize(i + 1); + } + +#ifdef debug + std::cerr << "sorted and consolidated regions:" << std::endl; + for (const auto& region : sorted_regions) { + std::cerr << '\t' << std::get<0>(region) << '\t' << std::get<1>(region) << '\t' << std::get<2>(region) << std::endl; + } +#endif + + // collect the path names intervals associated with each path handle + std::vector> path_intervals; + graph->for_each_path_matching({PathSense::REFERENCE, PathSense::GENERIC}, {}, {}, [&](const path_handle_t& path_handle) { + + std::string annotated_path_name = graph->get_path_name(path_handle); + PathSense sense; + std::string sample; + std::string locus; + size_t haplotype; + size_t phase_block; + subrange_t subrange; + graph->parse_path_name(annotated_path_name, sense, sample, locus, haplotype, phase_block, subrange); + + auto non_range_path_name = graph->create_path_name(sense, sample, locus, haplotype, phase_block, + PathMetadata::NO_SUBRANGE); + + path_intervals.emplace_back(non_range_path_name, + subrange == PathMetadata::NO_SUBRANGE ? 0 : subrange.first, + path_handle); + }); + std::sort(path_intervals.begin(), path_intervals.end()); + +#ifdef debug + std::cerr << "path intervals:" << std::endl; + for (const auto& interval : path_intervals) { + std::cerr << '\t' << std::get<0>(interval) << '\t' << std::get<1>(interval) << std::endl; + } +#endif + + // records of (first step, offset on first, last step, offset on last) + std::vector> step_intervals; + + // iterate along both lists to find the bounding steps where they overlap + size_t p_idx = 0, r_idx = 0; + while (r_idx < sorted_regions.size() && p_idx < path_intervals.size()) { + const auto& region_here = sorted_regions[r_idx]; + const auto& path_interval_here = path_intervals[p_idx]; + // check if they are on different paths + if (std::get<0>(region_here) < std::get<0>(path_interval_here)) { + ++r_idx; + } + else if (std::get<0>(region_here) > std::get<0>(path_interval_here)) { + ++p_idx; + } + else { + size_t p_end = std::numeric_limits::max(); + if (p_idx + 1 < path_intervals.size() && + std::get<0>(path_intervals[p_idx + 1]) == std::get<0>(path_interval_here)) { + p_end = std::get<1>(path_intervals[p_idx + 1]); + } + // check if they are on disjoint intervals + if (std::get<2>(region_here) < std::get<1>(path_interval_here)) { + ++r_idx; + } + else if (p_end < std::get<1>(region_here)) { + ++p_idx; + } + else { +#ifdef debug + std::cerr << "possible overlap at path interval " << p_idx << " and region " << r_idx << std::endl ; +#endif + // we may have reached an overlap, scan the path + size_t offset = std::get<1>(path_interval_here); + bool inside_region = false; + auto step = graph->path_begin(std::get<2>(path_interval_here)); + auto end = graph->path_end(std::get<2>(path_interval_here)); + while (step != end && r_idx < sorted_regions.size() && std::get<1>(sorted_regions[r_idx]) < p_end) { + + size_t end_offset = offset + graph->get_length(graph->get_handle_of_step(step)); + + if (!inside_region && + ((offset <= std::get<1>(sorted_regions[r_idx]) && std::get<1>(sorted_regions[r_idx]) < end_offset) || + (std::get<1>(sorted_regions[r_idx]) <= offset && offset < std::get<2>(sorted_regions[r_idx])))) { + // we're overlapping a new region for this path interval + + size_t node_offset = std::get<1>(sorted_regions[r_idx]) >= offset ? std::get<1>(sorted_regions[r_idx]) - offset : 0; + + // the latter two are placeholders for now + step_intervals.emplace_back(step, node_offset, step, -1); + + inside_region = true; + } + + if (inside_region && std::get<2>(sorted_regions[r_idx]) <= end_offset) { + // we're exiting the region we've been traversing + std::get<2>(step_intervals.back()) = step; + std::get<3>(step_intervals.back()) = std::get<2>(sorted_regions[r_idx]) - offset; + + inside_region = false; + ++r_idx; + } + else { + step = graph->get_next_step(step); + offset = end_offset; + } + } + + // we never left and closed out this region (it must end after this path fragment) + if (inside_region) { + std::get<2>(step_intervals.back()) = graph->get_previous_step(end); + std::get<3>(step_intervals.back()) = offset; + } + + ++p_idx; + } + } + } + +#ifdef debug + std::cerr << "step intervals:" << std::endl; + for (const auto& interval : step_intervals) { + std::cerr << '\t' << graph->get_id(graph->get_handle_of_step(std::get<0>(interval))) << '\t' << std::get<1>(interval) << '\t' << graph->get_id(graph->get_handle_of_step(std::get<2>(interval))) << '\t' << std::get<3>(interval) << std::endl; + } +#endif + + MutablePathDeletableHandleGraph* mutable_graph = dynamic_cast(graph); + gbwtgraph::GBWTGraph* gbwt_graph = dynamic_cast(graph); + if (mutable_graph) { + // lambda to change sequence of a mutable handle graph + std::function apply = [&](handle_t handle, const std::string& sequence) { + mutable_graph->change_sequence(handle, sequence); + }; + apply_mask_sequences(step_intervals, apply); + } + else if (gbwt_graph) { + // lambda to change sequence of a gbwt graph + std::function apply = [&](handle_t handle, const std::string& sequence) { + + auto rev_sequence = reverse_complement(sequence); + + auto fwd_handle = gbwt_graph->forward(handle); + auto rev_handle = gbwt_graph->flip(fwd_handle); + // GBWT graph stores sequence in both orientations, so we have to modify both + for (bool fwd : {true, false}) { + auto local_handle = fwd ? fwd_handle : rev_handle; + const auto& local_seq = fwd ? sequence : rev_sequence; + // FIXME: have to replicate GBWTGraph::node_offset's function because it's private + size_t node_offset = gbwt_graph->handle_to_node(local_handle) - gbwt_graph->index->firstNode(); + size_t seq_offset = gbwt_graph->sequences.index[node_offset]; + for (size_t i = 0; i < sequence.size(); ++i) { + gbwt_graph->sequences.strings[seq_offset + i] = local_seq[i]; + } + } + + + }; + + apply_mask_sequences(step_intervals, apply); + } + else { + std::cerr << "error: could not identify input graph to Masker as a GBZ/GBWTGraph or a DeletableHandleGraph" << std::endl; + exit(1); + } +} + +void Masker::apply_mask_sequences(const std::vector>& mask_intervals, + const std::function& apply_mask) const { + + // lambda to mask out part of a node, with an interval given in the handle's orientation + auto partial_mask = [&](handle_t handle, size_t begin, size_t end) { + auto sequence = graph->get_sequence(graph->forward(handle)); + size_t i, n; + if (graph->get_is_reverse(handle)) { + i = sequence.size() - end; + n = sequence.size() - begin; + } + else { + i = begin; + n = end; + } + for (; i < n; ++i) { + sequence[i] = 'N'; + } + apply_mask(handle, sequence); + }; + + for (const auto& mask_interval : mask_intervals) { + + if (std::get<0>(mask_interval) == std::get<2>(mask_interval)) { + // the entire interval is contained within one node + + partial_mask(graph->get_handle_of_step(std::get<0>(mask_interval)), + std::get<1>(mask_interval), std::get<3>(mask_interval)); + } + else { + + // returns true if the handle points into a non-trivial snarl + auto into_nontrivial_snarl = [&](handle_t handle) -> bool { + if (graph->get_degree(handle, false) < 2) { + // any snarl that this points into will be trivial + return false; + } + return snarl_manager->into_which_snarl(graph->get_id(handle), graph->get_is_reverse(handle)); + }; + + // the depth (relative to where we enter) in the snarl tree + int64_t level = 0; + // the prefix of recorded snarls that cannot be contained in a future snarl + size_t uncontained_prefix = 0; + // snarls for which both ends lie on this path interval + std::vector> snarls_discovered; + // stack of the snarl entrypoints we've passed + std::vector snarl_source_stack; + + // handle snarls that we may be exiting + auto process_incoming = [&](handle_t handle) { + if (into_nontrivial_snarl(graph->flip(handle))) { + --level; + if (!snarl_source_stack.empty()) { + // clear out any snarls contained in this one + while (snarls_discovered.size() > uncontained_prefix && std::get<0>(snarls_discovered.back()) > level) { + snarls_discovered.pop_back(); + } + // record the snarl + snarls_discovered.emplace_back(level, snarl_source_stack.back(), handle); + snarl_source_stack.pop_back(); +#ifdef debug + std::cerr << "found traversed snarl between " << graph->get_id(std::get<1>(snarls_discovered.back())) << " and " << graph->get_id(std::get<2>(snarls_discovered.back())) << '\n'; +#endif + + if (snarl_source_stack.empty()) { + // nothing can bracket this snarl on both sides anymore + uncontained_prefix = snarls_discovered.size(); + } + } + } + }; + + // handle snarls we may be entering + auto process_outgoing = [&](handle_t handle) { + if (into_nontrivial_snarl(handle)) { + ++level; + snarl_source_stack.push_back(handle); + } + }; + + std::unordered_set to_mask; + + // walk the path interval + +#ifdef debug + std::cerr << "starting interval traversal on node " << graph->get_id(graph->get_handle_of_step(std::get<0>(mask_interval))) << '\n'; +#endif + + // first partial node + partial_mask(graph->get_handle_of_step(std::get<0>(mask_interval)), + std::get<1>(mask_interval), + graph->get_length(graph->get_handle_of_step(std::get<0>(mask_interval)))); + process_outgoing(graph->get_handle_of_step(std::get<0>(mask_interval))); + + // full nodes + for (auto step = graph->get_next_step(std::get<0>(mask_interval)); + step != std::get<2>(mask_interval); step = graph->get_next_step(step)) { + + handle_t handle = graph->get_handle_of_step(step); + to_mask.insert(graph->forward(handle)); + +#ifdef debug + std::cerr << "at node " << graph->get_id(handle) << '\n'; + std::cerr << "snarl stack state:\n"; + for (auto handle : snarl_source_stack) { + std::cerr << '\t' << graph->get_id(handle) << '\n'; + } +#endif + + process_incoming(handle); + process_outgoing(handle); + } + +#ifdef debug + std::cerr << "ending interval traversal on node " << graph->get_id(graph->get_handle_of_step(std::get<2>(mask_interval))) << '\n'; +#endif + + // last partial node + process_incoming(graph->get_handle_of_step(std::get<2>(mask_interval))); + partial_mask(graph->get_handle_of_step(std::get<2>(mask_interval)), + 0, std::get<3>(mask_interval)); + + // traverse the snarls + for (const auto& snarl : snarls_discovered) { + + // DFS starting from one end + std::unordered_set seen{graph->forward(std::get<1>(snarl)), graph->forward(std::get<2>(snarl))}; + std::vector stack; + auto lambda = [&](const handle_t& next) { + auto fwd = graph->forward(next); + bool is_new = seen.insert(fwd).second; + if (is_new) { + stack.push_back(fwd); + to_mask.insert(fwd); +#ifdef debug + std::cerr << "inserting mask node from snarl traversal: " << graph->get_id(fwd) << '\n'; +#endif + } + }; + graph->follow_edges(std::get<1>(snarl), false, lambda); + while (!stack.empty()) { + handle_t here = stack.back(); + stack.pop_back(); + for (bool to_left : {true, false}) { + graph->follow_edges(here, to_left, lambda); + } + } + } + +#ifdef debug + std::cerr << "final masking nodes:\n"; + for (auto handle : to_mask) { + std::cerr << '\t' << graph->get_id(handle) << '\n'; + } +#endif + + for (auto handle : to_mask) { + apply_mask(handle, std::string(graph->get_length(handle), 'N')); + } + } + } +} + +} diff --git a/src/masker.hpp b/src/masker.hpp new file mode 100644 index 0000000000..3fa0e9b992 --- /dev/null +++ b/src/masker.hpp @@ -0,0 +1,57 @@ +#ifndef VG_MASKER_HPP_INCLUDED +#define VG_MASKER_HPP_INCLUDED + +#include + +#include + +#include "handle.hpp" +#include "snarls.hpp" +#include "region.hpp" + +namespace vg { + +/* + * Class to mask out graph regions with N's + */ +class Masker { + +public: + + // construct own snarl decomposition + // use external snarl decomposition + Masker(gbwtgraph::GBWTGraph& graph, const SnarlManager* snarl_manager = nullptr); + Masker(MutablePathDeletableHandleGraph& graph, const SnarlManager* snarl_manager = nullptr); + Masker() = default; + ~Masker() = default; + + // move + Masker(Masker&& other) = default; + Masker& operator=(Masker&& other) = default; + + // replace regions with N's, with regions given as tuples of (path name, region begin, region end). + // regions indexes are assumed to be 0-based and end-exclusive + void mask_sequences(const std::vector>& regions) const; + +private: + + // make our own snarl manager + void init_snarls(); + + // takes tuples of (first step, offset in first step, last step, end in last step) + // uses the lambda function provided to apply mask sequences to the graph + void apply_mask_sequences(const std::vector>& mask_intervals, + const std::function& apply_mask) const; + + PathHandleGraph* graph = nullptr; + const SnarlManager* snarl_manager = nullptr; + + // if not given a snarl manager, we construct one and make the snarl_manager + // point to it + std::unique_ptr own_snarl_manager; + +}; + +} + +#endif diff --git a/src/subcommand/mask_main.cpp b/src/subcommand/mask_main.cpp new file mode 100644 index 0000000000..b251fc182b --- /dev/null +++ b/src/subcommand/mask_main.cpp @@ -0,0 +1,177 @@ +#include "subcommand.hpp" +#include "../io/save_handle_graph.hpp" +#include "../handle.hpp" +#include "../xg.hpp" +#include "../utility.hpp" +#include "../snarls.hpp" +#include "../masker.hpp" +#include +#include +#include + +#include +#include +#include + +using namespace vg; +using namespace vg::subcommand; +using namespace vg::io; +using namespace std; + +vector> parse_bed(istream& in){ + + vector> regions; + + string line_buffer; + string token_buffer; + while (getline(in, line_buffer)) { + + if (line_buffer.empty() || line_buffer.front() == '#' || line_buffer.substr(0, 5) == "track" || line_buffer.substr(0, 7) == "browser") { + // header line + continue; + } + istringstream line_strm(line_buffer); + + regions.emplace_back(); + + getline(line_strm, token_buffer, '\t'); + get<0>(regions.back()) = token_buffer; + getline(line_strm, token_buffer, '\t'); + get<1>(regions.back()) = parse(token_buffer); + getline(line_strm, token_buffer, '\t'); + get<2>(regions.back()) = parse(token_buffer); + } + + return regions; +} + +void help_mask(char** argv) { + cerr << "usage: " << argv[0] << " [options] " << endl + << "Mask out specified regions of a graph with N's" << endl + << endl + << "input options: " << endl + << " -b, --bed FILE BED regions corresponding to path intervals of the graph to target (required)" << endl + << " -g, --gbz-input Input graph is in GBZ format" << endl + << " -s, --snarls FILE Snarls from vg snarls (computed directly if not provided)" << endl + + << endl; +} + +int main_mask(int argc, char** argv) { + + string bed_filepath; + string snarls_filepath; + bool gbz_input = false; + + int c; + optind = 2; // force optind past command positional argument + while (true) { + static struct option long_options[] = + { + {"help", no_argument, 0, 'h'}, + {"bed", required_argument, 0, 'b'}, + {"gbz-input", no_argument, 0, 'g'}, + {"snarls", required_argument, 0, 's'}, + {0, 0, 0, 0} + + }; + int option_index = 0; + c = getopt_long (argc, argv, "hb:gs:", long_options, &option_index); + + // Detect the end of the options. + if (c == -1) + break; + + switch (c) + { + + case '?': + case 'h': + help_mask(argv); + return 0; + case 'b': + bed_filepath = optarg; + break; + case 'g': + gbz_input = true; + break; + case 's': + snarls_filepath = optarg; + break; + default: + help_mask(argv); + return 1; + } + } + + + if (argc - optind != 1) { + cerr << "error: vg mask requires exactly 1 positional argument" << endl; + help_mask(argv); + return 1; + } + + if (bed_filepath.empty()) { + cerr << "error: vg mask requires an input BED file from -b / --bed" << endl; + return 1; + } + if (bed_filepath != "-") { + if (!ifstream(bed_filepath)) { + cerr << "error: could not open BED file from " << bed_filepath << endl; + return 1; + } + } + if (!snarls_filepath.empty() && snarls_filepath != "-") { + if (!ifstream(snarls_filepath)) { + cerr << "error: could not open snarls file from " << snarls_filepath << endl; + return 1; + } + } + + // load the graph + string graph_path = get_input_file_name(optind, argc, argv); + unique_ptr graph; + unique_ptr gbz; + if (gbz_input) { + gbz = vg::io::VPKG::load_one(graph_path);; + } + else { + graph = vg::io::VPKG::load_one(graph_path); + } + + // load the BED + vector> regions; + get_input_file(bed_filepath, [&](istream& in) { + regions = std::move(parse_bed(in)); + }); + + // load the snarls + unique_ptr snarl_manager; + if (!snarls_filepath.empty()) { + snarl_manager = vg::io::VPKG::load_one(snarls_filepath); + } + + Masker masker; + if (gbz.get()) { + masker = std::move(Masker(gbz->graph, snarl_manager.get())); + } + else { + masker = std::move(Masker(*graph, snarl_manager.get())); + } + + masker.mask_sequences(regions); + + // write the graph + if (gbz.get()) { + gbz->simple_sds_serialize(cout); + } + else { + vg::io::save_handle_graph(graph.get(), cout); + } + + return 0; +} + + +// Register subcommand +static Subcommand vg_mask("mask", "Mask out sequences in a graph with N's", main_mask); diff --git a/src/unittest/masker.cpp b/src/unittest/masker.cpp new file mode 100644 index 0000000000..59a3acb9b6 --- /dev/null +++ b/src/unittest/masker.cpp @@ -0,0 +1,388 @@ +/// +/// \file masker.cpp +/// +/// Unit tests for the Masker class +/// + +#include "catch.hpp" +#include "../masker.hpp" +#include "../handle.hpp" +#include "../gbwt_helper.hpp" +#include "../gbwtgraph_helper.hpp" + +#include +#include +#include +#include + +namespace vg { +namespace unittest { + +TEST_CASE( "Masker produces expected results on simple paths", "[masker]" ) { + + SECTION( "Simple masking task") { + + bdsg::HashGraph graph; + handle_t h1 = graph.create_handle("ACGT"); + handle_t h2 = graph.create_handle("TTA"); + handle_t h3 = graph.create_handle("GTAC"); + handle_t h4 = graph.create_handle("AA"); + handle_t h5 = graph.create_handle("G"); + + graph.create_edge(h1, h2); + graph.create_edge(h2, h3); + graph.create_edge(h3, h4); + graph.create_edge(h4, h5); + + path_handle_t p = graph.create_path_handle("p"); + + graph.append_step(p, h1); + graph.append_step(p, h2); + graph.append_step(p, h3); + graph.append_step(p, h4); + graph.append_step(p, h5); + + Masker masker(graph); + + std::vector> intervals{ + {"p", 4, 13} + }; + + masker.mask_sequences(intervals); + + REQUIRE(graph.get_sequence(h1) == "ACGT"); + REQUIRE(graph.get_sequence(h2) == "NNN"); + REQUIRE(graph.get_sequence(h3) == "NNNN"); + REQUIRE(graph.get_sequence(h4) == "NN"); + REQUIRE(graph.get_sequence(h5) == "G"); + } + + SECTION( "Mask on reverse strand") { + + bdsg::HashGraph graph; + handle_t h1 = graph.create_handle("ACGT"); + handle_t h2 = graph.create_handle("TTA"); + handle_t h3 = graph.create_handle("GTAC"); + handle_t h4 = graph.create_handle("AA"); + handle_t h5 = graph.create_handle("G"); + + graph.create_edge(h1, h2); + graph.create_edge(h2, h3); + graph.create_edge(h3, h4); + graph.create_edge(h4, h5); + + path_handle_t p = graph.create_path_handle("p"); + + graph.append_step(p, graph.flip(h5)); + graph.append_step(p, graph.flip(h4)); + graph.append_step(p, graph.flip(h3)); + graph.append_step(p, graph.flip(h2)); + graph.append_step(p, graph.flip(h1)); + + Masker masker(graph); + + std::vector> intervals{ + {"p", 4, 11} + }; + + masker.mask_sequences(intervals); + + REQUIRE(graph.get_sequence(h1) == "ACGN"); + REQUIRE(graph.get_sequence(h2) == "NNN"); + REQUIRE(graph.get_sequence(h3) == "NNNC"); + REQUIRE(graph.get_sequence(h4) == "AA"); + REQUIRE(graph.get_sequence(h5) == "G"); + } + + SECTION( "Mask on a single node") { + + bdsg::HashGraph graph; + handle_t h1 = graph.create_handle("ACGTACGT"); + + path_handle_t p = graph.create_path_handle("p"); + + graph.append_step(p, h1); + + Masker masker(graph); + + std::vector> intervals{ + {"p", 1, 2}, + {"p", 5, 7} + }; + + masker.mask_sequences(intervals); + + REQUIRE(graph.get_sequence(h1) == "ANGTANNT"); + } + + SECTION( "Mask across a clipped path" ) { + + bdsg::HashGraph graph; + handle_t h1 = graph.create_handle("ACGTACGT"); + handle_t h2 = graph.create_handle("ATCTCTAA"); + handle_t h3 = graph.create_handle("G"); + handle_t h4 = graph.create_handle("GGACTCA"); + + graph.create_edge(h1, h2); + graph.create_edge(h2, h3); + graph.create_edge(h3, h4); + + std::string pname1 = graph.create_path_name(PathSense::GENERIC, + PathMetadata::NO_SAMPLE_NAME, + "p", + PathMetadata::NO_HAPLOTYPE, + PathMetadata::NO_PHASE_BLOCK, + handlegraph::subrange_t(0, PathMetadata::NO_END_POSITION)); + std::string pname2 = graph.create_path_name(PathSense::GENERIC, + PathMetadata::NO_SAMPLE_NAME, + "p", + PathMetadata::NO_HAPLOTYPE, + PathMetadata::NO_PHASE_BLOCK, + handlegraph::subrange_t(17, PathMetadata::NO_END_POSITION)); + + path_handle_t p1 = graph.create_path_handle(pname1); + path_handle_t p2 = graph.create_path_handle(pname2); + + graph.append_step(p1, h1); + graph.append_step(p1, h2); + + graph.append_step(p2, h4); + + Masker masker(graph); + + std::vector> intervals{ + {"p", 12, 20}, + }; + + masker.mask_sequences(intervals); + + REQUIRE(graph.get_sequence(h1) == "ACGTACGT"); + REQUIRE(graph.get_sequence(h2) == "ATCTNNNN"); + REQUIRE(graph.get_sequence(h3) == "G"); + REQUIRE(graph.get_sequence(h4) == "NNNCTCA"); + } + + SECTION( "Masker works on GBWTGraph" ) { + + std::vector paths; + + bdsg::HashGraph graph; + handle_t h1 = graph.create_handle("ACGT"); + handle_t h2 = graph.create_handle("TTA"); + handle_t h3 = graph.create_handle("GTAC"); + handle_t h4 = graph.create_handle("AA"); + handle_t h5 = graph.create_handle("G"); + + graph.create_edge(h1, h2); + graph.create_edge(h2, h3); + graph.create_edge(h3, h4); + graph.create_edge(h4, h5); + + std::string ref = graph.create_path_name(PathSense::GENERIC, + PathMetadata::NO_SAMPLE_NAME, + "chr", + PathMetadata::NO_HAPLOTYPE, + PathMetadata::NO_PHASE_BLOCK, + PathMetadata::NO_SUBRANGE); + + auto p = graph.create_path_handle(ref); + + graph.append_step(p, h1); + graph.append_step(p, h2); + graph.append_step(p, h3); + graph.append_step(p, h4); + graph.append_step(p, h5); + + // parameters surmised from from get_gbwt() source code + gbwt::Verbosity::set(gbwt::Verbosity::SILENT); + gbwt::GBWTBuilder gbwt_builder(8, 20); + + gbwtgraph::MetadataBuilder metadata; + // stolen from gbwtgraph::GFAParsingParameters::DEFAULT_XXXXX + metadata.add_path_name_format(".*", "C", PathSense::GENERIC); + + auto jobs = gbwtgraph::gbwt_construction_jobs(graph, 10000); + std::vector> assignments = assign_paths(graph, jobs, &metadata, nullptr); + assert(assignments.size() == 1); + assert(assignments.front().size() == 1); + gbwtgraph::insert_paths(graph, assignments.front(), gbwt_builder, 0, false); + gbwt_builder.index.addMetadata(); + gbwt_builder.index.metadata = metadata.get_metadata(); + + gbwt_builder.finish(); + gbwt::GBWT gbwt_index(gbwt_builder.index); + + gbwtgraph::GBWTGraph gbwt_graph(gbwt_index, graph); + + Masker masker(gbwt_graph); + + std::vector> intervals{ + {ref, 4, 13} + }; + + masker.mask_sequences(intervals); + + REQUIRE(gbwt_graph.get_sequence(h1) == "ACGT"); + REQUIRE(gbwt_graph.get_sequence(h2) == "NNN"); + REQUIRE(gbwt_graph.get_sequence(h3) == "NNNN"); + REQUIRE(gbwt_graph.get_sequence(h4) == "NN"); + REQUIRE(gbwt_graph.get_sequence(h5) == "G"); + REQUIRE(gbwt_graph.get_sequence(graph.flip(h1)) == "ACGT"); + REQUIRE(gbwt_graph.get_sequence(graph.flip(h2)) == "NNN"); + REQUIRE(gbwt_graph.get_sequence(graph.flip(h3)) == "NNNN"); + REQUIRE(gbwt_graph.get_sequence(graph.flip(h4)) == "NN"); + REQUIRE(gbwt_graph.get_sequence(graph.flip(h5)) == "C"); + } +} + +TEST_CASE( "Masker produces expected results on paths with variation", "[masker]" ) { + + SECTION( "Simple bubble" ) { + + bdsg::HashGraph graph; + handle_t h1 = graph.create_handle("ACGT"); + handle_t h2 = graph.create_handle("TTA"); + handle_t h3 = graph.create_handle("GTAC"); + handle_t h4 = graph.create_handle("AA"); + handle_t h5 = graph.create_handle("G"); + handle_t h6 = graph.create_handle("CG"); + + graph.create_edge(h1, h2); + graph.create_edge(h2, h3); + graph.create_edge(h2, h6); + graph.create_edge(h3, h4); + graph.create_edge(h6, h4); + graph.create_edge(h4, h5); + + path_handle_t p = graph.create_path_handle("p"); + + graph.append_step(p, h1); + graph.append_step(p, h2); + graph.append_step(p, h3); + graph.append_step(p, h4); + graph.append_step(p, h5); + + Masker masker(graph); + + std::vector> intervals{ + {"p", 4, 13} + }; + + masker.mask_sequences(intervals); + + REQUIRE(graph.get_sequence(h1) == "ACGT"); + REQUIRE(graph.get_sequence(h2) == "NNN"); + REQUIRE(graph.get_sequence(h3) == "NNNN"); + REQUIRE(graph.get_sequence(h4) == "NN"); + REQUIRE(graph.get_sequence(h5) == "G"); + REQUIRE(graph.get_sequence(h6) == "NN"); + } + + SECTION( "Nested bubbles" ) { + + bdsg::HashGraph graph; + handle_t h1 = graph.create_handle("A"); + handle_t h2 = graph.create_handle("A"); + handle_t h3 = graph.create_handle("A"); + handle_t h4 = graph.create_handle("A"); + handle_t h5 = graph.create_handle("A"); + handle_t h6 = graph.create_handle("A"); + handle_t h7 = graph.create_handle("A"); + handle_t h8 = graph.create_handle("A"); + handle_t h9 = graph.create_handle("A"); + handle_t h10 = graph.create_handle("A"); + handle_t h11 = graph.create_handle("A"); + handle_t h12 = graph.create_handle("A"); + handle_t h13 = graph.create_handle("A"); + handle_t h14 = graph.create_handle("A"); + handle_t h15 = graph.create_handle("A"); + handle_t h16 = graph.create_handle("A"); + + graph.create_edge(h1, h2); + graph.create_edge(h1, h3); + graph.create_edge(h2, h7); + graph.create_edge(h3, h4); + graph.create_edge(h3, h5); + graph.create_edge(h4, h6); + graph.create_edge(h5, h6); + graph.create_edge(h6, h7); + graph.create_edge(h7, h8); + graph.create_edge(h7, h9); + graph.create_edge(h8, h16); + graph.create_edge(h9, h10); + graph.create_edge(h9, h11); + graph.create_edge(h10, h12); + graph.create_edge(h11, h12); + graph.create_edge(h12, h13); + graph.create_edge(h12, h14); + graph.create_edge(h13, h15); + graph.create_edge(h14, h15); + graph.create_edge(h15, h16); + + // some extra bubbles to make sure we get the right snarl decomposition + handle_t h17 = graph.create_handle("A"); + handle_t h18 = graph.create_handle("A"); + handle_t h19 = graph.create_handle("A"); + handle_t h20 = graph.create_handle("A"); + handle_t h21 = graph.create_handle("A"); + handle_t h22 = graph.create_handle("A"); + handle_t h23 = graph.create_handle("A"); + handle_t h24 = graph.create_handle("A"); + handle_t h25 = graph.create_handle("A"); + + graph.create_edge(h16, h17); + graph.create_edge(h16, h18); + graph.create_edge(h17, h19); + graph.create_edge(h18, h19); + graph.create_edge(h19, h20); + graph.create_edge(h19, h21); + graph.create_edge(h20, h22); + graph.create_edge(h21, h22); + graph.create_edge(h22, h23); + graph.create_edge(h22, h24); + graph.create_edge(h23, h25); + graph.create_edge(h24, h25); + + path_handle_t p = graph.create_path_handle("p"); + + graph.append_step(p, h1); + graph.append_step(p, h3); + graph.append_step(p, h4); + graph.append_step(p, h6); + graph.append_step(p, h7); + graph.append_step(p, h9); + graph.append_step(p, h10); + graph.append_step(p, h12); + graph.append_step(p, h13); + graph.append_step(p, h15); + graph.append_step(p, h16); + + Masker masker(graph); + + std::vector> intervals{ + {"p", 1, 11} + }; + + masker.mask_sequences(intervals); + + REQUIRE(graph.get_sequence(h1) == "A"); + REQUIRE(graph.get_sequence(h2) == "A"); + REQUIRE(graph.get_sequence(h3) == "N"); + REQUIRE(graph.get_sequence(h4) == "N"); + REQUIRE(graph.get_sequence(h5) == "N"); + REQUIRE(graph.get_sequence(h6) == "N"); + REQUIRE(graph.get_sequence(h7) == "N"); + REQUIRE(graph.get_sequence(h8) == "N"); + REQUIRE(graph.get_sequence(h9) == "N"); + REQUIRE(graph.get_sequence(h10) == "N"); + REQUIRE(graph.get_sequence(h11) == "N"); + REQUIRE(graph.get_sequence(h12) == "N"); + REQUIRE(graph.get_sequence(h13) == "N"); + REQUIRE(graph.get_sequence(h14) == "N"); + REQUIRE(graph.get_sequence(h15) == "N"); + REQUIRE(graph.get_sequence(h16) == "N"); + } +} + +} +} diff --git a/test/t/55_vg_mask.t b/test/t/55_vg_mask.t new file mode 100644 index 0000000000..ca34f532b8 --- /dev/null +++ b/test/t/55_vg_mask.t @@ -0,0 +1,23 @@ +#!/usr/bin/env bash + +BASH_TAP_ROOT=../deps/bash-tap +. ../deps/bash-tap/bash-tap-bootstrap + +PATH=../bin:$PATH # for vg + +plan tests 4 + +printf "GRCh38#0#chr1\t1\t4\n" > m.bed +vg gbwt -g m.gbz --gbz-format -G graphs/gfa_with_reference.gfa +vg mask -g -b m.bed m.gbz > mm.gbz + +is $(vg view mm.gbz | grep "^S" | grep 4 | cut -f 3) "NNN" "A node can be masked in a GBZ" +is $(vg view mm.gbz | grep "^S" | grep -v 4 | grep N | wc -l | sed 's/^[[:space:]]*//') "0" "Off target nodes are not masked in a GBZ" + +vg convert -g graphs/gfa_with_reference.gfa > m.vg +vg mask -b m.bed m.vg > mm.vg + +is $(vg view mm.vg | grep "^S" | grep 4 | cut -f 3) "NNN" "A node can be masked in a BDSG graph" +is $(vg view mm.vg | grep "^S" | grep -v 4 | grep N | wc -l | sed 's/^[[:space:]]*//') "0" "Off target nodes are not masked in a BDSG graph" + +rm m.bed m.gbz mm.gbz m.vg mm.vg From 4b68e899ddaa40bd30e923c9acffdc1c8e355802 Mon Sep 17 00:00:00 2001 From: Jordan Eizenga Date: Tue, 17 Dec 2024 10:53:32 -0500 Subject: [PATCH 2/4] bug fix for when an entire contig is masked --- src/masker.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/masker.cpp b/src/masker.cpp index 675b4a6f8d..03c7eacf32 100644 --- a/src/masker.cpp +++ b/src/masker.cpp @@ -130,12 +130,13 @@ void Masker::mask_sequences(const std::vectorget_path_name(std::get<2>(path_interval_here)) << ") and region " << r_idx << std::endl ; #endif // we may have reached an overlap, scan the path size_t offset = std::get<1>(path_interval_here); bool inside_region = false; auto step = graph->path_begin(std::get<2>(path_interval_here)); + auto prev_step = step; // just a placeholder for now auto end = graph->path_end(std::get<2>(path_interval_here)); while (step != end && r_idx < sorted_regions.size() && std::get<1>(sorted_regions[r_idx]) < p_end) { @@ -163,6 +164,7 @@ void Masker::mask_sequences(const std::vectorget_next_step(step); offset = end_offset; } @@ -170,8 +172,8 @@ void Masker::mask_sequences(const std::vector(step_intervals.back()) = graph->get_previous_step(end); - std::get<3>(step_intervals.back()) = offset; + std::get<2>(step_intervals.back()) = prev_step; + std::get<3>(step_intervals.back()) = graph->get_length(graph->get_handle_of_step(prev_step)); } ++p_idx; From cf13e37832f94dd8b340a534d6a7ab53f3608de2 Mon Sep 17 00:00:00 2001 From: Jordan Eizenga Date: Tue, 17 Dec 2024 11:26:39 -0500 Subject: [PATCH 3/4] update libhandlegraph --- deps/libhandlegraph | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/libhandlegraph b/deps/libhandlegraph index e53d64fbef..9f04969ea0 160000 --- a/deps/libhandlegraph +++ b/deps/libhandlegraph @@ -1 +1 @@ -Subproject commit e53d64fbef0e729414613be3456057c0523534c4 +Subproject commit 9f04969ea05a9aa3803c4ddd46f0fa76efeb300d From ff7ea9d2c027381438d10fe00ac0d9849f6ec06c Mon Sep 17 00:00:00 2001 From: Jordan Eizenga Date: Wed, 18 Dec 2024 13:32:40 -0500 Subject: [PATCH 4/4] switch to merged branch of libbdsg and libhandlegraph --- deps/libbdsg | 2 +- deps/libhandlegraph | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/deps/libbdsg b/deps/libbdsg index 076e21a58c..dfe53fd2fb 160000 --- a/deps/libbdsg +++ b/deps/libbdsg @@ -1 +1 @@ -Subproject commit 076e21a58cb76fcb4a8b58d1322786a6e0cfc1b3 +Subproject commit dfe53fd2fb8fc9b748b8c24cfc3fb7fbc6e1ee32 diff --git a/deps/libhandlegraph b/deps/libhandlegraph index 9f04969ea0..0e70dadb50 160000 --- a/deps/libhandlegraph +++ b/deps/libhandlegraph @@ -1 +1 @@ -Subproject commit 9f04969ea05a9aa3803c4ddd46f0fa76efeb300d +Subproject commit 0e70dadb5054568d8071e280b3b7b11b5658937f