Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Keep haplotypes in vg chunk that don't extend far enough #4501

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 78 additions & 35 deletions src/haplotype_extracter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,49 +21,65 @@ static int64_t make_side(id_t id, bool is_end) {
}


void trace_haplotypes_and_paths(const PathHandleGraph& source, const gbwt::GBWT& haplotype_database,
vg::id_t start_node, int extend_distance,
Graph& out_graph,
map<string, int>& out_thread_frequencies,
bool expand_graph) {
void trace_haplotypes(const PathHandleGraph& source, const gbwt::GBWT& haplotype_database,
const handle_t& start_handle, function<bool(const vector<gbwt::node_type>&)> stop_fn,
MutablePathMutableHandleGraph& out_graph,
map<string, int>& out_thread_frequencies) {
// get our haplotypes
handle_t n = source.get_handle(start_node, false);
vector<pair<thread_t, gbwt::SearchState> > haplotypes = list_haplotypes(source, haplotype_database, n,
[&extend_distance](const vector<gbwt::node_type>& new_thread) {
return new_thread.size() >= extend_distance;
});
vector<pair<thread_t, gbwt::SearchState> > haplotypes = list_haplotypes(source, haplotype_database, start_handle, stop_fn);

#ifdef debug
cerr << "Haplotype database " << &haplotype_database << " produced " << haplotypes.size() << " haplotypes" << endl;
#endif

if (expand_graph) {
// get our subgraph and "regular" paths by expanding forward
handle_t handle = source.get_handle(start_node);
bdsg::HashGraph extractor;
extractor.create_handle(source.get_sequence(handle), source.get_id(handle));
// TODO: is expanding only forward really the right behavior here?
algorithms::expand_context_with_paths(&source, &extractor, extend_distance, true, true, false);

// Convert to Protobuf I guess
from_path_handle_graph(extractor, out_graph);
}

// add a frequency of 1 for each normal path
for (int i = 0; i < out_graph.path_size(); ++i) {
out_thread_frequencies[out_graph.path(i).name()] = 1;
}

// add our haplotypes to the subgraph, naming ith haplotype "thread_i"
for (int i = 0; i < haplotypes.size(); ++i) {
Path p = path_from_thread_t(haplotypes[i].first, source);
p.set_name("thread_" + to_string(i));
out_thread_frequencies[p.name()] = haplotypes[i].second.size();
*(out_graph.add_path()) = move(p);
std::string path_name = "thread_" + to_string(i);
out_thread_frequencies[path_name] = haplotypes[i].second.size();
path_handle_t path_handle = out_graph.create_path_handle(path_name);
const thread_t& thread = haplotypes[i].first;
for (int j = 0; j < thread.size(); j++) {
// Copy in all the steps
if (!out_graph.has_node(gbwt::Node::id(thread[j]))) {
// It's possible for us to be extracting haplotypes that leave the
// subgraph we extracted. Maybe the end node for ID range extraction
// had an alternative with a higher ID. If we find that that is
// happening, we cut the haplotype short.
break;
}
out_graph.append_step(path_handle, out_graph.get_handle(gbwt::Node::id(thread[j]), gbwt::Node::is_reverse(thread[j])));
}
}
}

void trace_paths(const PathHandleGraph& source,
const handle_t& start_handle, int extend_distance,
MutablePathMutableHandleGraph& out_graph,
map<string, int>& out_thread_frequencies) {

// get our subgraph and "regular" paths by expanding forward
bdsg::HashGraph extractor;
extractor.create_handle(source.get_sequence(start_handle), source.get_id(start_handle));
// TODO: is expanding only forward really the right behavior here?
algorithms::expand_context_with_paths(&source, &extractor, extend_distance, true, true, false);

// Copy over nodes and edges
handlegraph::algorithms::copy_handle_graph(&extractor, &out_graph);

for (auto& sense : {PathSense::REFERENCE, PathSense::GENERIC, PathSense::HAPLOTYPE}) {
extractor.for_each_path_of_sense(sense, [&](const path_handle_t& path_handle) {
// For every non-haplotype path we pulled, give it a frequency of 1.
// Including haplotypes, since we didn't trace them to get frequencies.
out_thread_frequencies[extractor.get_path_name(path_handle)] = 1;

// Copy it over
handlegraph::algorithms::copy_path(&extractor, path_handle, &out_graph);
});
}
}



void output_haplotype_counts(ostream& annotation_ostream,
vector<pair<thread_t,int>>& haplotype_list) {
for(int i = 0; i < haplotype_list.size(); i++) {
Expand Down Expand Up @@ -200,7 +216,15 @@ vector<pair<vector<gbwt::node_type>, gbwt::SearchState> > list_haplotypes(const
#endif

if (!first_state.empty()) {
search_intermediates.push_back(make_pair(first_thread, first_state));
if (stop_fn(first_thread)) {
// We immediately want to stop the search
#ifdef debug
cerr << "\tImmediately hit limit with " << first_state.size() << " results; emitting" << endl;
#endif
search_results.push_back(make_pair(std::move(first_thread), first_state));
} else {
search_intermediates.push_back(make_pair(first_thread, first_state));
}
}

while(!search_intermediates.empty()) {
Expand All @@ -222,16 +246,23 @@ vector<pair<vector<gbwt::node_type>, gbwt::SearchState> > list_haplotypes(const
}
});

size_t continued_members = 0;

for (auto& nhs : next_handle_states) {

const handle_t& next = get<0>(nhs);
gbwt::node_type& extend_node = get<1>(nhs);
gbwt::SearchState& new_state = get<2>(nhs);

// Keep track of how many selected threads still exist in all the extensions.
continued_members += new_state.size();

vector<gbwt::node_type> new_thread;
if (&nhs == &next_handle_states.back()) {
// avoid a copy by re-using the vector for the last thread. this way simple cases
// like scanning along one path don't blow up to n^2
if (&nhs == &next_handle_states.back() && continued_members == last.second.size()) {
// Avoid a copy by re-using the vector for the last thread if
// we don't need to split off or stop any more haplotypes. This
// way simple cases like scanning along one path don't blow up
// to n^2.
new_thread = std::move(last.first);
} else {
new_thread = last.first;
Expand All @@ -251,6 +282,18 @@ vector<pair<vector<gbwt::node_type>, gbwt::SearchState> > list_haplotypes(const
search_intermediates.push_back(make_pair(std::move(new_thread), new_state));
}
}

if (continued_members != last.second.size()) {
#ifdef debug
cerr << "Stopped " << (last.second.size() - continued_members) << " results here; emitting" << endl;
#endif
// We need to make a result with *only* the stopping results.
// So we extend with a sentinel (0, false) GBWT node, which we don't put in our output vector for the haplotype.
// TODO: This is undocumented in gbwt but seems to work.
auto new_state = gbwt.extend(last.second, gbwt::Node::encode(0, false));
search_results.emplace_back(std::move(last.first), new_state);
}

}

return search_results;
Expand Down
38 changes: 30 additions & 8 deletions src/haplotype_extracter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,20 +24,42 @@ using thread_t = vector<gbwt::node_type>;
// as Paths a path with name thread_i. Each path name (including threads) is
// mapped to a frequency in out_thread_frequencies. Haplotypes will be pulled
// from the GBWT index.
void trace_haplotypes_and_paths(const PathHandleGraph& source,
const gbwt::GBWT& haplotype_database,
vg::id_t start_node, int extend_distance,
Graph& out_graph,
map<string, int>& out_thread_frequencies,
bool expand_graph = true);
// out_graph may already contain nodes and paths.
// If stop_fn returns True, the haplotype extraction is stopped for that haplotype.
// Haplotypes are emitted when they are stopped or when a sample cannot be continued anymore.
void trace_haplotypes(const PathHandleGraph& source,
const gbwt::GBWT& haplotype_database,
const handle_t& start_handle, function<bool(const vector<gbwt::node_type>&)> stop_fn,
MutablePathMutableHandleGraph& out_graph,
map<string, int>& out_thread_frequencies);

// Walk forward from a node, collecting all non-haplotype paths. Each path name is
// mapped to frequency 1 in out_thread_frequencies.
// out_graph may already contain nodes and paths.
void trace_paths(const PathHandleGraph& source,
const handle_t& start_handle, int extend_distance,
MutablePathMutableHandleGraph& out_graph,
map<string, int>& out_thread_frequencies);

// Turns a (GBWT-based) thread_t into a (vg-based) Path
Path path_from_thread_t(thread_t& t, const HandleGraph& source);

// Lists all the sub-haplotypes of nodes starting at
// node start_node from the set of haplotypes embedded in the geven GBWT
// start from the set of haplotypes embedded in the given GBWT
// haplotype database. At each step stop_fn() is called on the thread being created, and if it returns true
// then the search stops and the thread is added two the list to be returned.
// then the search stops. The search will also emit a result if any selected sample stops.
//
// No empty sub-haplotypes will be returned.
//
// For haplotypes emitted because a sample stopped, the corresponding GBWT
// search state will have been extended with the end marker. It will have the
// correct .size() for the number of haplotypes that stop there, but it will
// *not* be usable with locate() queries! If you need to do locate() queries,
// you need to filter the output and keep only results that would have
// satisfied the stop_fn predicate.
//
// TODO: Adopt a different internal format that doesn't have such a weird
// quirk.
vector<pair<vector<gbwt::node_type>, gbwt::SearchState> > list_haplotypes(const HandleGraph& graph,
const gbwt::GBWT& gbwt,
handle_t start,
Expand Down
81 changes: 52 additions & 29 deletions src/subcommand/chunk_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -757,11 +757,16 @@ int main_chunk(int argc, char** argv) {

// optionally trace our haplotypes
if (trace && subgraph && gbwt_index) {
int64_t trace_start;
handle_t trace_start, trace_end;
int64_t trace_steps = 0;
if (id_range) {
trace_start = output_regions[i].start;
trace_steps = output_regions[i].end - trace_start;
// Assume we want to look local forward probably
trace_start = graph->get_handle(output_regions[i].start, false);
trace_end = graph->get_handle(output_regions[i].end, false);
trace_steps = output_regions[i].end - output_regions[i].start;
#ifdef debug
std::cerr << "Looking for ID range " << output_regions[i].start << " to " << output_regions[i].end << ", up to " << trace_steps << " steps" << std::endl;
#endif
} else {
path_handle_t path_handle = graph->get_path_handle(output_regions[i].seq);
step_handle_t trace_start_step = graph->get_step_at_position(path_handle, output_regions[i].start);
Expand All @@ -770,36 +775,54 @@ int main_chunk(int argc, char** argv) {
if (output_regions[i].start > output_regions[i].end) {
swap(trace_start_step, trace_end_step);
}
trace_start = graph->get_id(graph->get_handle_of_step(trace_start_step));
trace_start = graph->get_handle_of_step(trace_start_step);
trace_end = graph->get_handle_of_step(trace_end_step);
if (output_regions[i].start > output_regions[i].end) {
// Actually we need to look in reverse alogn the path.
trace_start = graph->flip(trace_start);
trace_end = graph->flip(trace_end);
}
for (; trace_start_step != trace_end_step; trace_start_step = graph->get_next_step(trace_start_step)) {
// Count the number of steps along the backbone path, which we use to limit graph expansion.
++trace_steps;
}
// haplotype_extender is forward only. until it's made bidirectional, try to
// detect backward paths and trace them backwards. this will not cover all possible cases though.
if (graph->get_is_reverse(graph->get_handle_of_step(trace_start_step)) &&
graph->get_is_reverse(graph->get_handle_of_step(trace_end_step))) {
trace_start = graph->get_id(graph->get_handle_of_step(trace_end_step));
}
}
Graph g;
trace_haplotypes_and_paths(*graph, *gbwt_index, trace_start, trace_steps,
g, trace_thread_frequencies, false);
subgraph->for_each_path_handle([&trace_thread_frequencies, &subgraph](path_handle_t path_handle) {
trace_thread_frequencies[subgraph->get_path_name(path_handle)] = 1;});
VG* vg_subgraph = dynamic_cast<VG*>(subgraph.get());
if (vg_subgraph != nullptr) {
// our graph is in vg format, just extend it
vg_subgraph->extend(g);
} else {
// our graph is not in vg format. covert it, extend it, convert it back
// this can eventually be avoided by handlifying the haplotype tracer
VG vg;
handlealgs::copy_path_handle_graph(subgraph.get(), &vg);
subgraph.reset();
vg.extend(g);
subgraph = vg::io::new_output_graph<MutablePathMutableHandleGraph>(output_format);
handlealgs::copy_path_handle_graph(&vg, subgraph.get());
#ifdef debug
std::cerr << "Looking for position range " << output_regions[i].start << " to " << output_regions[i].end << ", nodes " << graph->get_id(trace_start) << " to " << graph->get_id(trace_end) << ", up to " << trace_steps << " steps" << std::endl;
#endif
}

// Stop extending haplotypes when either they get too long or they arrive at the end.
auto stop_function = [&](const vector<gbwt::node_type>& candidate) {
if (candidate.size() >= trace_steps) {
// If you go more steps than the backbone path, stop there.
// TODO: We should add some padding here to account for insertions of manageable size.
#ifdef debug
std::cerr << "Stop because " << candidate.size() << " is enough steps" << std::endl;
#endif
return true;
}
if (gbwt::Node::id(candidate.back()) == graph->get_id(trace_end) &&
gbwt::Node::is_reverse(candidate.back()) == graph->get_is_reverse(trace_end)) {

// The path being traced has reached the last node on our
// extracted path region, so stop here and avoid leaving
// the region.
#ifdef debug
std::cerr << "Stop because node " << gbwt::Node::id(candidate.back()) << " orientation " << gbwt::Node::is_reverse(candidate.back()) << " is our end node" << std::endl;
#endif
return true;
}
#ifdef debug
std::cerr << "Continue because node " << gbwt::Node::id(candidate.back()) << " orientation " << gbwt::Node::is_reverse(candidate.back()) << " is not our end node " << graph->get_id(trace_end) << " orientation " << graph->get_is_reverse(trace_end) << std::endl;
#endif
return false;
};

trace_haplotypes(*graph, *gbwt_index, trace_start, stop_function,
*subgraph, trace_thread_frequencies);
#ifdef debug
std::cerr << "Traced " << trace_thread_frequencies.size() << " distinct threads" << std::endl;
#endif
}

ofstream out_file;
Expand Down
27 changes: 20 additions & 7 deletions src/subcommand/trace_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -133,18 +133,31 @@ int main_trace(int argc, char** argv) {
}

// trace out our graph and paths from the start node
Graph trace_graph;
VG trace_graph;
map<string, int> haplotype_frequences;
trace_haplotypes_and_paths(*xindex, *gbwt_index, start_node, extend_distance,
trace_graph, haplotype_frequences);

if (!xindex->has_node(start_node)) {
cerr << "error:[vg trace] unable to find node " << start_node << " in graph" << endl;
exit(1);
}

handle_t start_handle = xindex->get_handle(start_node, false);
trace_paths(*xindex, start_handle, extend_distance,
trace_graph, haplotype_frequences);

auto stop_haplotype = [&extend_distance](const vector<gbwt::node_type>& new_thread) {
// Stop haplotypes at the given length, ignoring any possible stopping nodes.
return new_thread.size() >= extend_distance;
};

trace_haplotypes(*xindex, *gbwt_index, start_handle, stop_haplotype,
trace_graph, haplotype_frequences);

// dump our graph to stdout
if (json) {
cout << pb2json(trace_graph);
cout << pb2json(trace_graph.graph);
} else {
VG vg_graph;
vg_graph.extend(trace_graph);
vg_graph.serialize_to_ostream(cout);
trace_graph.serialize_to_ostream(cout);
}

// if requested, write thread frequencies to a file
Expand Down
Loading
Loading