Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/master' into lr-giraffe
Browse files Browse the repository at this point in the history
  • Loading branch information
adamnovak committed Dec 19, 2024
2 parents 1acaa4c + 8494a52 commit 71428eb
Show file tree
Hide file tree
Showing 7 changed files with 1,042 additions and 2 deletions.
2 changes: 1 addition & 1 deletion deps/libbdsg
Submodule libbdsg updated 38 files
+9 −4 CMakeLists.txt
+2 −2 bdsg/cmake_bindings/bdsg.cpp
+1 −1 bdsg/cmake_bindings/bdsg.sources
+77 −75 bdsg/cmake_bindings/bdsg/graph_proxy.cpp
+70 −68 bdsg/cmake_bindings/bdsg/graph_proxy_1.cpp
+76 −74 bdsg/cmake_bindings/bdsg/internal/base_packed_graph.cpp
+1 −1 bdsg/cmake_bindings/bdsg/internal/eades_algorithm.cpp
+26 −5 bdsg/cmake_bindings/bdsg/internal/hash_map.cpp
+1 −1 bdsg/cmake_bindings/bdsg/internal/is_single_stranded.cpp
+21 −21 bdsg/cmake_bindings/bdsg/internal/mapped_structs_1.cpp
+3 −3 bdsg/cmake_bindings/bdsg/overlays/packed_path_position_overlay.cpp
+6 −7 bdsg/cmake_bindings/bdsg/overlays/packed_path_position_overlay_1.cpp
+14 −3 bdsg/cmake_bindings/bdsg/overlays/path_position_overlays.cpp
+1 −3 bdsg/cmake_bindings/bdsg/overlays/path_subgraph_overlay.cpp
+1,121 −0 bdsg/cmake_bindings/bdsg/overlays/reference_path_overlay.cpp
+1 −1 bdsg/cmake_bindings/bdsg/overlays/strand_split_overlay.cpp
+516 −116 bdsg/cmake_bindings/bdsg/overlays/vectorizable_overlays.cpp
+0 −817 bdsg/cmake_bindings/bdsg/overlays/vectorizable_overlays_1.cpp
+59 −7 bdsg/cmake_bindings/bdsg/packed_graph.cpp
+42 −35 bdsg/cmake_bindings/bdsg/snarl_distance_index.cpp
+1 −1 bdsg/cmake_bindings/handlegraph/expanding_overlay_graph.cpp
+1 −1 bdsg/cmake_bindings/handlegraph/handle_graph.cpp
+15 −1 bdsg/cmake_bindings/handlegraph/mutable_path_metadata.cpp
+28 −2 bdsg/cmake_bindings/handlegraph/mutable_path_mutable_handle_graph.cpp
+1 −1 bdsg/cmake_bindings/handlegraph/path_handle_graph.cpp
+1 −1 bdsg/cmake_bindings/handlegraph/path_metadata.cpp
+1 −1 bdsg/cmake_bindings/handlegraph/path_position_handle_graph.cpp
+1 −1 bdsg/cmake_bindings/handlegraph/trivially_serializable.cpp
+4 −0 bdsg/cmake_bindings/std/bdsg/internal/binder_hook_bind.cpp
+1 −1 bdsg/deps/libhandlegraph
+6 −1 bdsg/include/bdsg/hash_graph.hpp
+36 −0 bdsg/include/bdsg/internal/base_packed_graph.hpp
+8 −0 bdsg/include/bdsg/internal/binder_hook_bind.hpp
+8 −0 bdsg/include/bdsg/internal/graph_proxy_mutable_path_deletable_handle_graph_fragment.classfragment
+70 −20 bdsg/include/bdsg/snarl_distance_index.hpp
+7 −0 bdsg/src/hash_graph.cpp
+8 −1 bdsg/src/test_libbdsg.cpp
+38 −7 make_and_run_binder.py
395 changes: 395 additions & 0 deletions src/masker.cpp

Large diffs are not rendered by default.

57 changes: 57 additions & 0 deletions src/masker.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#ifndef VG_MASKER_HPP_INCLUDED
#define VG_MASKER_HPP_INCLUDED

#include <memory>

#include <gbwtgraph/gbwtgraph.h>

#include "handle.hpp"
#include "snarls.hpp"
#include "region.hpp"

namespace vg {

/*
* Class to mask out graph regions with N's
*/
class Masker {

public:

// construct own snarl decomposition
// use external snarl decomposition
Masker(gbwtgraph::GBWTGraph& graph, const SnarlManager* snarl_manager = nullptr);
Masker(MutablePathDeletableHandleGraph& graph, const SnarlManager* snarl_manager = nullptr);
Masker() = default;
~Masker() = default;

// move
Masker(Masker&& other) = default;
Masker& operator=(Masker&& other) = default;

// replace regions with N's, with regions given as tuples of (path name, region begin, region end).
// regions indexes are assumed to be 0-based and end-exclusive
void mask_sequences(const std::vector<std::tuple<std::string, size_t, size_t>>& regions) const;

private:

// make our own snarl manager
void init_snarls();

// takes tuples of (first step, offset in first step, last step, end in last step)
// uses the lambda function provided to apply mask sequences to the graph
void apply_mask_sequences(const std::vector<std::tuple<step_handle_t, size_t, step_handle_t, size_t>>& mask_intervals,
const std::function<void(handle_t, const std::string&)>& apply_mask) const;

PathHandleGraph* graph = nullptr;
const SnarlManager* snarl_manager = nullptr;

// if not given a snarl manager, we construct one and make the snarl_manager
// point to it
std::unique_ptr<SnarlManager> own_snarl_manager;

};

}

#endif
177 changes: 177 additions & 0 deletions src/subcommand/mask_main.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
#include "subcommand.hpp"
#include "../io/save_handle_graph.hpp"
#include "../handle.hpp"
#include "../xg.hpp"
#include "../utility.hpp"
#include "../snarls.hpp"
#include "../masker.hpp"
#include <vg/io/stream.hpp>
#include <vg/io/vpkg.hpp>
#include <gbwtgraph/gbz.h>

#include <unistd.h>
#include <getopt.h>
#include <sstream>

using namespace vg;
using namespace vg::subcommand;
using namespace vg::io;
using namespace std;

vector<tuple<string, size_t, size_t>> parse_bed(istream& in){

vector<tuple<string, size_t, size_t>> regions;

string line_buffer;
string token_buffer;
while (getline(in, line_buffer)) {

if (line_buffer.empty() || line_buffer.front() == '#' || line_buffer.substr(0, 5) == "track" || line_buffer.substr(0, 7) == "browser") {
// header line
continue;
}
istringstream line_strm(line_buffer);

regions.emplace_back();

getline(line_strm, token_buffer, '\t');
get<0>(regions.back()) = token_buffer;
getline(line_strm, token_buffer, '\t');
get<1>(regions.back()) = parse<int64_t>(token_buffer);
getline(line_strm, token_buffer, '\t');
get<2>(regions.back()) = parse<int64_t>(token_buffer);
}

return regions;
}

void help_mask(char** argv) {
cerr << "usage: " << argv[0] << " [options] <graph>" << endl
<< "Mask out specified regions of a graph with N's" << endl
<< endl
<< "input options: " << endl
<< " -b, --bed FILE BED regions corresponding to path intervals of the graph to target (required)" << endl
<< " -g, --gbz-input Input graph is in GBZ format" << endl
<< " -s, --snarls FILE Snarls from vg snarls (computed directly if not provided)" << endl

<< endl;
}

int main_mask(int argc, char** argv) {

string bed_filepath;
string snarls_filepath;
bool gbz_input = false;

int c;
optind = 2; // force optind past command positional argument
while (true) {
static struct option long_options[] =
{
{"help", no_argument, 0, 'h'},
{"bed", required_argument, 0, 'b'},
{"gbz-input", no_argument, 0, 'g'},
{"snarls", required_argument, 0, 's'},
{0, 0, 0, 0}

};
int option_index = 0;
c = getopt_long (argc, argv, "hb:gs:", long_options, &option_index);

// Detect the end of the options.
if (c == -1)
break;

switch (c)
{

case '?':
case 'h':
help_mask(argv);
return 0;
case 'b':
bed_filepath = optarg;
break;
case 'g':
gbz_input = true;
break;
case 's':
snarls_filepath = optarg;
break;
default:
help_mask(argv);
return 1;
}
}


if (argc - optind != 1) {
cerr << "error: vg mask requires exactly 1 positional argument" << endl;
help_mask(argv);
return 1;
}

if (bed_filepath.empty()) {
cerr << "error: vg mask requires an input BED file from -b / --bed" << endl;
return 1;
}
if (bed_filepath != "-") {
if (!ifstream(bed_filepath)) {
cerr << "error: could not open BED file from " << bed_filepath << endl;
return 1;
}
}
if (!snarls_filepath.empty() && snarls_filepath != "-") {
if (!ifstream(snarls_filepath)) {
cerr << "error: could not open snarls file from " << snarls_filepath << endl;
return 1;
}
}

// load the graph
string graph_path = get_input_file_name(optind, argc, argv);
unique_ptr<MutablePathDeletableHandleGraph> graph;
unique_ptr<gbwtgraph::GBZ> gbz;
if (gbz_input) {
gbz = vg::io::VPKG::load_one<gbwtgraph::GBZ>(graph_path);;
}
else {
graph = vg::io::VPKG::load_one<MutablePathDeletableHandleGraph>(graph_path);
}

// load the BED
vector<tuple<string, size_t, size_t>> regions;
get_input_file(bed_filepath, [&](istream& in) {
regions = std::move(parse_bed(in));
});

// load the snarls
unique_ptr<SnarlManager> snarl_manager;
if (!snarls_filepath.empty()) {
snarl_manager = vg::io::VPKG::load_one<SnarlManager>(snarls_filepath);
}

Masker masker;
if (gbz.get()) {
masker = std::move(Masker(gbz->graph, snarl_manager.get()));
}
else {
masker = std::move(Masker(*graph, snarl_manager.get()));
}

masker.mask_sequences(regions);

// write the graph
if (gbz.get()) {
gbz->simple_sds_serialize(cout);
}
else {
vg::io::save_handle_graph(graph.get(), cout);
}

return 0;
}


// Register subcommand
static Subcommand vg_mask("mask", "Mask out sequences in a graph with N's", main_mask);
Loading

1 comment on commit 71428eb

@adamnovak
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for branch lr-giraffe. View the full report here.

16 tests passed, 0 tests failed and 0 tests skipped in 17511 seconds

Please sign in to comment.