Skip to content

Commit

Permalink
Merge pull request #3431 from vgteam/splice-faster
Browse files Browse the repository at this point in the history
Try to reverse speed regression in spliced alignment
  • Loading branch information
jeizenga authored Sep 27, 2021
2 parents 29d7666 + 89cc9aa commit a214924
Show file tree
Hide file tree
Showing 2 changed files with 145 additions and 46 deletions.
41 changes: 35 additions & 6 deletions src/algorithms/ref_path_distance.cpp
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
/** \file
* Implementsthe reference path distance function
* Implements the reference path distance function
*/

#include "ref_path_distance.hpp"

//#define debug_ref_path_distance

namespace vg {
namespace algorithms {

Expand All @@ -22,12 +24,15 @@ int64_t ref_path_distance(const PathPositionHandleGraph* graph, const pos_t& pos
// intialize in both directions from both positions
handle_t handle_1 = graph->get_handle(id(pos_1), is_rev(pos_1));
handle_t handle_2 = graph->get_handle(id(pos_2), is_rev(pos_2));
queue.push_or_reprioritize(make_tuple(handle_1, true, true),
offset(pos_1) - graph->get_length(handle_1));

// deactivating the ones because they can cause positions that cannot reach each other
// to be assigned an approximate distance that is positive
// queue.push_or_reprioritize(make_tuple(handle_1, true, true),
// offset(pos_1) - graph->get_length(handle_1));
queue.push_or_reprioritize(make_tuple(handle_1, false, true), -offset(pos_1));
queue.push_or_reprioritize(make_tuple(handle_2, true, true),
queue.push_or_reprioritize(make_tuple(handle_2, true, false),
offset(pos_2) - graph->get_length(handle_2));
queue.push_or_reprioritize(make_tuple(handle_2, false, true), -offset(pos_2));
// queue.push_or_reprioritize(make_tuple(handle_2, false, false), -offset(pos_2));

bool found_shared = false;
while (!queue.empty()) {
Expand All @@ -40,9 +45,15 @@ int64_t ref_path_distance(const PathPositionHandleGraph* graph, const pos_t& pos
tie(handle, go_left, from_pos_1) = top.first;
int64_t dist = top.second;

#ifdef debug_ref_path_distance
cerr << "[ref_path_distance] dequeue " << graph->get_id(handle) << (graph->get_is_reverse(handle) ? "-" : "+") << ", left? " << go_left << " from pos " << (from_pos_1 ? 1 : 2) << endl;
#endif
// after the min search distance, we can quit as soon as we find a
// shared path
if (found_shared && dist > min_search_dist) {
#ifdef debug_ref_path_distance
cerr << "[ref_path_distance] over min search, breaking" << endl;
#endif
break;
}

Expand All @@ -62,7 +73,9 @@ int64_t ref_path_distance(const PathPositionHandleGraph* graph, const pos_t& pos
graph->for_each_step_on_handle(handle, [&](const step_handle_t& step) {
pair<path_handle_t, bool> oriented_path(graph->get_path_handle_of_step(step),
graph->get_handle_of_step(step) != handle);

#ifdef debug_ref_path_distance
cerr << "[ref_path_distance] on path " << graph->get_path_name(oriented_path.first) << ", on rev? " << oriented_path.second << ", path offset " << graph->get_position_of_step(step) << endl;
#endif
if (!nearby_paths->count(oriented_path)) {

int64_t path_offset;
Expand All @@ -84,6 +97,10 @@ int64_t ref_path_distance(const PathPositionHandleGraph* graph, const pos_t& pos
// traverse right, forward strand of path
path_offset = graph->get_position_of_step(step) - dist;
}

#ifdef debug_ref_path_distance
cerr << "[ref_path_distance] first encounter, recording path offset of " << path_offset << endl;
#endif

(*nearby_paths)[oriented_path] = path_offset;
found_shared = found_shared || other_nearby_paths->count(oriented_path);
Expand All @@ -98,12 +115,20 @@ int64_t ref_path_distance(const PathPositionHandleGraph* graph, const pos_t& pos
}
}

#ifdef debug_ref_path_distance
cerr << "[ref_path_distance] choosing max absolute distance on shared paths" << endl;
#endif

// check all shared paths that we found
int64_t approx_ref_dist = numeric_limits<int64_t>::max();
for (const auto& path_record_1 : nearby_paths_1) {
auto it = nearby_paths_2.find(path_record_1.first);
if (it != nearby_paths_2.end()) {
int64_t dist = it->second - path_record_1.second;

#ifdef debug_ref_path_distance
cerr << "[ref_path_distance] distance on path " << graph->get_path_name(path_record_1.first.first) << " is " << dist << endl;
#endif
// we'll assume that the reference distance is the longest distance along
// a shared path (because others may correspond to spliced transcripts)
if (approx_ref_dist == numeric_limits<int64_t>::max() ||
Expand All @@ -113,6 +138,10 @@ int64_t ref_path_distance(const PathPositionHandleGraph* graph, const pos_t& pos
}
}

#ifdef debug_ref_path_distance
cerr << "[ref_path_distance] max absolute distance is " << approx_ref_dist << endl;
#endif

return approx_ref_dist;
}

Expand Down
150 changes: 110 additions & 40 deletions src/multipath_mapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
//#define debug_report_startup_training
//#define debug_pretty_print_alignments
//#define debug_time_phases
//#define debug_log_splice_align_stats

#ifdef debug_time_phases
#include <ctime>
Expand Down Expand Up @@ -126,7 +127,7 @@ namespace vg {
multipath_alns_out.back().clear_start();
}

if (do_spliced_alignment) {
if (do_spliced_alignment && !likely_mismapping(multipath_alns_out.front())) {
find_spliced_alignments(alignment, multipath_alns_out, multiplicities, cluster_idxs,
mems, cluster_graphs, fanouts.get());
}
Expand Down Expand Up @@ -1431,12 +1432,17 @@ namespace vg {
}

// find splices independently, also use the mate to rescue missing splice segments
bool did_splice_1 = find_spliced_alignments(alignment1, multipath_alns_1, multiplicities_1, cluster_idxs_1,
mems1, cluster_graphs1, fanouts1,
rescue_anchor_2, true, rescue_multiplicity_2);
bool did_splice_2 = find_spliced_alignments(alignment2, multipath_alns_2, multiplicities_2, cluster_idxs_2,
mems2, cluster_graphs2, fanouts2,
rescue_anchor_1, false, rescue_multiplicity_1);
bool did_splice_1 = false, did_splice_2 = false;
if (!multipath_alns_1.empty() && !likely_mismapping(multipath_alns_1.front())) {
did_splice_1 = find_spliced_alignments(alignment1, multipath_alns_1, multiplicities_1, cluster_idxs_1,
mems1, cluster_graphs1, fanouts1,
rescue_anchor_2, true, rescue_multiplicity_2);
}
if (!multipath_alns_2.empty() && !likely_mismapping(multipath_alns_2.front())) {
did_splice_2 = find_spliced_alignments(alignment2, multipath_alns_2, multiplicities_2, cluster_idxs_2,
mems2, cluster_graphs2, fanouts2,
rescue_anchor_1, false, rescue_multiplicity_1);
}

if (did_splice_1 || did_splice_2) {
// it may now be possible to identify some pairs as properly paired using the spliced alignment
Expand Down Expand Up @@ -2221,7 +2227,7 @@ namespace vg {
const PrejoinSide& left, const PrejoinSide& right,
const tuple<handle_t, size_t, int64_t>& left_location,
const tuple<handle_t, size_t, int64_t>& right_location,
int64_t estimated_intron_length, size_t motif_idx)
size_t motif_idx)
: joined_graph(graph, left.splice_region->get_subgraph(),
get<0>(left_location), get<1>(left_location),
right.splice_region->get_subgraph(),
Expand All @@ -2232,8 +2238,8 @@ namespace vg {
right_clip_length(right.clip_length),
left_candidate_idx(left.candidate_idx),
right_candidate_idx(right.candidate_idx),
estimated_intron_length(estimated_intron_length),
intron_score(splice_stats.intron_length_score(estimated_intron_length)),
estimated_intron_length(-1),
intron_score(0),
motif_idx(motif_idx),
untrimmed_score(left.untrimmed_score + right.untrimmed_score)
{
Expand All @@ -2248,18 +2254,28 @@ namespace vg {
int64_t right_clip_length;
size_t left_candidate_idx;
size_t right_candidate_idx;
int64_t estimated_intron_length;
int32_t intron_score;
size_t motif_idx;
int32_t max_score;
int32_t untrimmed_score;
size_t motif_idx;
// intron stats start uninitialized until measurign length
int32_t intron_score;
int64_t estimated_intron_length;
// these two filled out after doing alignment
Alignment connecting_aln;
size_t splice_idx;

int32_t fixed_score_components(const SpliceStats& splice_stats,
const Alignment& opt) {
return splice_stats.motif_score(motif_idx) + untrimmed_score - opt.score() + intron_score;
return splice_stats.motif_score(motif_idx) + untrimmed_score - opt.score();
}

void set_intron_length(int64_t estimated_intron_length,
const SpliceStats& splice_stats) {

estimated_intron_length = estimated_intron_length;
intron_score = splice_stats.intron_length_score(estimated_intron_length);
// memoize the max score again
max_score += intron_score;
}

int32_t pre_align_max_score(const GSSWAligner& aligner,
Expand All @@ -2284,7 +2300,7 @@ namespace vg {

int32_t post_align_net_score(const SpliceStats& splice_stats,
const Alignment& opt) {
return fixed_score_components(splice_stats, opt) + connecting_aln.score();
return fixed_score_components(splice_stats, opt) + connecting_aln.score() + intron_score;
}
};

Expand All @@ -2295,6 +2311,47 @@ namespace vg {
#endif
}

// we'll memoize the relatively expensive reference distance computations, since often
// there are multiple splice motifs on the same node
unordered_map<tuple<nid_t, bool, nid_t, bool>, int64_t> ref_length_memo;
auto get_reference_dist = [&](const pos_t& pos_1, const pos_t& pos_2) -> int64_t {

tuple<nid_t, bool, nid_t, bool> key(id(pos_1), is_rev(pos_1), id(pos_2), is_rev(pos_2));

auto it = ref_length_memo.find(key);
if (it != ref_length_memo.end()) {
// the reference distance of these nodes is already memoized
return it->second - offset(pos_1) + offset(pos_2);
}
else {
int64_t dist = numeric_limits<int64_t>::max();
if (xindex->get_path_count() != 0) {
// estimate the distance using the reference path
dist = algorithms::ref_path_distance(xindex, pos_1, pos_2,
min_splice_ref_search_length,
max_splice_ref_search_length);
}

if (distance_index && (dist < 0 || dist == numeric_limits<int64_t>::max())) {
// they're probably still reachable if they got this far, get a worse estimate of the
// distance from the distance index
int64_t min_dist = distance_index->min_distance(pos_1, pos_2);
if (min_dist >= 0) {
dist = min_dist;
}
}

if (dist != numeric_limits<int64_t>::max()) {
// not memoizing unreachable distances, since distance index should
// filter out most of those anyway, and they actually might change on
// different positions on the node

ref_length_memo[key] = dist + offset(pos_1) - offset(pos_2);
}
return dist;
}
};


#ifdef debug_multipath_mapper
cerr << "testing splice candidates for mp aln: " << endl;
Expand Down Expand Up @@ -2435,41 +2492,48 @@ namespace vg {
#ifdef debug_multipath_mapper
cerr << "\tchecking shared motif " << j << " with has positions " << l_pos << ", and " << r_pos << endl;
#endif
int64_t dist = numeric_limits<int64_t>::max();
if (xindex->get_path_count() != 0) {
// estimate the distance using the reference path
dist = algorithms::ref_path_distance(xindex, l_pos, r_pos,
min_splice_ref_search_length,
max_splice_ref_search_length);

putative_joins.emplace_back(*xindex, splice_stats, opt,
*get_aligner(!alignment.quality().empty()),
left_prejoin_side, right_prejoin_side,
left_location, right_location, j);

if (putative_joins.back().max_score < no_splice_log_odds) {
#ifdef debug_multipath_mapper
cerr << "\tscore bound of " << putative_joins.back().max_score << " ensures insigificant spliced alignment against prior log odds " << no_splice_log_odds << " before measuring intron length" << endl;
#endif

// this has no chance of becoming significant, let's skip it
putative_joins.pop_back();
continue;
}
if (distance_index && (dist < 0 || dist == numeric_limits<int64_t>::max())) {
// they're probably still reachable if they got this far, get a worse estimate of the
// distance from the distance index
dist = distance_index->min_distance(l_pos, r_pos);

// measure the intron length
int64_t dist = get_reference_dist(l_pos, r_pos);
if (dist <= 0 || dist > max_intron_length || dist == numeric_limits<int64_t>::max()) {
#ifdef debug_multipath_mapper
cerr << "\tinconsistent intron length " << dist << ", skipping putative join" << endl;
#endif
putative_joins.pop_back();
continue;
}

putative_joins.back().set_intron_length(dist, splice_stats);

// TODO: enforce pairing constraints?

if (dist > 0 && dist != numeric_limits<int64_t>::max() && dist < max_intron_length) {


// the positions can reach each other in under the max length, make a join
putative_joins.emplace_back(*xindex, splice_stats, opt,
*get_aligner(!alignment.quality().empty()),
left_prejoin_side, right_prejoin_side,
left_location, right_location, dist, j);
#ifdef debug_multipath_mapper
cerr << "\tshared motif has a spliceable path of length " << dist << " (intron score: " << putative_joins.back().intron_score << "), adding as a putative join with score bound " << putative_joins.back().max_score << endl;
cerr << "\tshared motif has a spliceable path of length " << dist << " (intron score: " << putative_joins.back().intron_score << "), adding as a putative join with score bound " << putative_joins.back().max_score << endl;
#endif
if (putative_joins.back().max_score < no_splice_log_odds) {
if (putative_joins.back().max_score < no_splice_log_odds) {
#ifdef debug_multipath_mapper
cerr << "\tscore bound of " << putative_joins.back().max_score << " ensures insigificant spliced alignment against prior log odds " << no_splice_log_odds << endl;
cerr << "\tscore bound of " << putative_joins.back().max_score << " ensures insigificant spliced alignment against prior log odds " << no_splice_log_odds << " before doing alignment" << endl;
#endif

// this has no chance of becoming significant, let's skip it
putative_joins.pop_back();
}

// this has no chance of becoming significant, let's skip it
putative_joins.pop_back();
}

}
}
}
Expand Down Expand Up @@ -3343,6 +3407,12 @@ namespace vg {
rescue_left, interval);
}

#ifdef debug_log_splice_align_stats
string line = alignment.name() + '\t' + to_string(did_splice) + '\t' + to_string(interval.first) + '\t' + to_string(interval.second) + '\t' + to_string(do_left) + '\t' + to_string(mp_aln_candidates.size()) + '\t' + to_string(cluster_candidates.size()) + '\t' + to_string(hit_candidates.size()) + '\n';
#pragma omp critical
cerr << line;
#endif

if (did_splice) {
// we may need to update the multiplicity based on the splicing
multiplicities[current_index[j]] = anchor_multiplicity;
Expand Down

1 comment on commit a214924

@adamnovak
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for branch v1.35.0. View the full report here.

16 tests passed, 0 tests failed and 0 tests skipped in 10841 seconds

Please sign in to comment.