Skip to content

Commit

Permalink
Merge pull request #3892 from vgteam/support-matteo
Browse files Browse the repository at this point in the history
Improve Surject Validation and Error Reporting
  • Loading branch information
adamnovak authored Mar 23, 2023
2 parents ce2c139 + 1916a65 commit d577e3d
Show file tree
Hide file tree
Showing 8 changed files with 210 additions and 39 deletions.
32 changes: 20 additions & 12 deletions src/alignment.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2081,24 +2081,32 @@ void alignment_set_distance_to_correct(Alignment& aln, const map<string ,vector<
}
}

bool alignment_is_valid(Alignment& aln, const HandleGraph* hgraph) {
AlignmentValidity alignment_is_valid(const Alignment& aln, const HandleGraph* hgraph) {
for (size_t i = 0; i < aln.path().mapping_size(); ++i) {
const Mapping& mapping = aln.path().mapping(i);
if (!hgraph->has_node(mapping.position().node_id())) {
cerr << "Invalid Alignment:\n" << pb2json(aln) <<"\nNode " << mapping.position().node_id()
<< " not found in graph" << endl;
return false;
std::stringstream ss;
ss << "Node " << mapping.position().node_id() << " not found in graph";
return {
AlignmentValidity::NODE_MISSING,
i,
ss.str()
};
}
size_t node_len = hgraph->get_length(hgraph->get_handle(mapping.position().node_id()));
if (mapping_from_length(mapping) + mapping.position().offset() > node_len) {
cerr << "Invalid Alignment:\n" << pb2json(aln) << "\nLength of node "
<< mapping.position().node_id() << " (" << node_len << ") exceeded by Mapping with offset "
<< mapping.position().offset() << " and from-length " << mapping_from_length(mapping) << ":\n"
<< pb2json(mapping) << endl;
return false;
}
}
return true;
std::stringstream ss;
ss << "Length of node "
<< mapping.position().node_id() << " (" << node_len << ") exceeded by Mapping with offset "
<< mapping.position().offset() << " and from-length " << mapping_from_length(mapping);
return {
AlignmentValidity::NODE_TOO_SHORT,
i,
ss.str()
};
}
}
return {AlignmentValidity::OK};
}

Alignment target_alignment(const PathPositionHandleGraph* graph, const path_handle_t& path, size_t pos1, size_t pos2,
Expand Down
30 changes: 28 additions & 2 deletions src/alignment.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -310,8 +310,34 @@ map<string ,vector<pair<size_t, bool> > > alignment_refpos_to_path_offsets(const
void alignment_set_distance_to_correct(Alignment& aln, const Alignment& base, const unordered_map<string, string>* translation = nullptr);
void alignment_set_distance_to_correct(Alignment& aln, const map<string, vector<pair<size_t, bool>>>& base_offsets, const unordered_map<string, string>* translation = nullptr);

/// check to make sure edits on the alignment's path don't assume incorrect node lengths or ids
bool alignment_is_valid(Alignment& aln, const HandleGraph* hgraph);
/**
* Represents a report on whether an alignment makes sense in the context of a graph.
*/
struct AlignmentValidity {
/// The different kinds of possible problems with alignments
enum Problem {
OK,
NODE_MISSING,
NODE_TOO_SHORT
};

/// The kind of problem with the alignment.
Problem problem = OK;
/// The mapping in the alignment's path at which the problem was encountered.
size_t bad_mapping_index = 0;
/// An explanation for the problem.
std::string message = "";

/// We are truthy if the alignment has no problem, and falsey otherwise.
inline operator bool() const {
return problem == OK;
}
};

/// Check to make sure edits on the alignment's path don't assume incorrect
/// node lengths or ids. Result can be used like a bool or inspected for
/// further details. Does not log anything itself about bad alignments.
AlignmentValidity alignment_is_valid(const Alignment& aln, const HandleGraph* hgraph);

/// Make an Alignment corresponding to a subregion of a stored path.
/// Positions are 0-based, and pos2 is excluded.
Expand Down
25 changes: 23 additions & 2 deletions src/hts_alignment_emitter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,9 @@ vector<tuple<path_handle_t, size_t, size_t>> get_sequence_dictionary(const strin
// and filled in later
vector<pair<string, int64_t>> input_names_lengths;

// Should we print path subrange warnings?
bool print_subrange_warnings = true;

// When we get a sequence and possibly its length (or -1 if no length is
// known), put it in the dictionary.
// Can optionally provide a file name for error reporting.
Expand All @@ -223,6 +226,24 @@ vector<tuple<path_handle_t, size_t, size_t>> get_sequence_dictionary(const strin
cerr << endl;
exit(1);
}

if (print_subrange_warnings) {
subrange_t subrange;
std::string base_path_name = Paths::strip_subrange(sequence_name, &subrange);
if (subrange != PathMetadata::NO_SUBRANGE) {
// The user is asking explicitly to surject to a path that is a
// subrange of some other logical path, like
// GRCh38#0#chr1[1000-2000]. That's weird. Warn.
cerr << "warning:[vg::get_sequence_dictionary] Path " << sequence_name;
if (filename) {
// Report the source file.
cerr << " from sequence dictionary in " << *filename;
}
cerr << " looks like part of a path. Output coordinates will be in " << base_path_name << " instead. Suppressing further warnings." << endl;
print_subrange_warnings = false;
}
}

// Remember the path
base_path_to_subpaths[sequence_name].push_back(path);
} else {
Expand All @@ -237,10 +258,10 @@ vector<tuple<path_handle_t, size_t, size_t>> get_sequence_dictionary(const strin
});
if (!base_path_to_subpaths.count(sequence_name)) {
// We didn't find any subpaths for this path as a base path either.
cerr << "error:[vg::get_sequence_dictionary] Graph does not have a path named " << sequence_name;
cerr << "error:[vg::get_sequence_dictionary] Graph does not have the entirety or any pieces of a path named " << sequence_name;
if (filename) {
// Report the source file.
cerr << " which was indicated in " << filename;
cerr << " which was indicated in " << *filename;
}
cerr << endl;
exit(1);
Expand Down
70 changes: 63 additions & 7 deletions src/subcommand/gbwt_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ using namespace vg;

struct GBWTConfig {
// Build mode also defines the type of input args.
enum build_mode { build_none, build_vcf, build_gfa, build_paths, build_alignments, build_gbz };
enum build_mode { build_none, build_vcf, build_gfa, build_paths, build_alignments, build_gbz, build_gbwtgraph };
enum merge_mode { merge_none, merge_insert, merge_fast, merge_parallel };
enum path_cover_mode { path_cover_none, path_cover_augment, path_cover_local, path_cover_greedy };

Expand Down Expand Up @@ -66,6 +66,7 @@ struct GBWTConfig {
std::vector<std::string> input_filenames;
std::string gbwt_name; // There is a single input GBWT to load.
std::string graph_name;
std::string gbwtgraph_name;

// File names.
std::string gbwt_output; // Output GBWT.
Expand Down Expand Up @@ -104,7 +105,7 @@ struct GBWTConfig {
};

struct GraphHandler {
enum graph_type { graph_none, graph_path, graph_source, graph_gbz };
enum graph_type { graph_none, graph_path, graph_source, graph_gbz, graph_gbwtgraph };

std::unique_ptr<PathHandleGraph> path_graph = nullptr;
std::unique_ptr<gbwtgraph::SequenceSource> sequence_source = nullptr;
Expand All @@ -123,6 +124,11 @@ struct GraphHandler {
// graph in this handler.
// NOTE: The graph will become invalid if the GBWT in the GBWTHandler changes.
void load_gbz(GBWTHandler& gbwts, GBWTConfig& config);

// Load the GBWTGraph specified in the config, store the GBWT in the
// GBWTHandler and the graph in this handler.
// NOTE: The graph will become invalid if the GBWT in the GBWTHandler changes.
void load_gbwtgraph(GBWTHandler& gbwts, GBWTConfig& config);

void clear();

Expand Down Expand Up @@ -268,6 +274,7 @@ void help_gbwt(char** argv) {
std::cerr << " --translation FILE write the segment to node translation table to FILE" << std::endl;
std::cerr << " -Z, --gbz-input extract GBWT and GBWTGraph from GBZ input (one input arg)" << std::endl;
std::cerr << " --translation FILE write the segment to node translation table to FILE" << std::endl;
std::cerr << " -I, --gg-in FILE load GBWTGraph from FILE and GBWT from input (one input arg) " << std::endl;
std::cerr << " -E, --index-paths index the embedded non-alt paths in the graph (requires -x, no input args)" << std::endl;
std::cerr << " -A, --alignment-input index the alignments in the GAF files specified in input args (requires -x)" << std::endl;
std::cerr << " --gam-format the input files are in GAM format instead of GAF format" << std::endl;
Expand Down Expand Up @@ -428,6 +435,9 @@ GBWTConfig parse_gbwt_config(int argc, char** argv) {

// Input GBWT construction: GBZ
{ "gbz-input", no_argument, 0, 'Z' },

// Input GBWT construction: GBWTGraph and GBWT
{ "gg-in", required_argument, 0, 'I' },

// Input GBWT construction: paths
{ "index-paths", no_argument, 0, 'E' },
Expand Down Expand Up @@ -487,7 +497,7 @@ GBWTConfig parse_gbwt_config(int argc, char** argv) {
GBWTConfig config;
while (true) {
int option_index = 0;
c = getopt_long(argc, argv, "x:o:d:pvGZEAmfbR:alPn:k:g:r:MCHSLTce:h?", long_options, &option_index);
c = getopt_long(argc, argv, "x:o:d:pvGZI:EAmfbR:alPn:k:g:r:MCHSLTce:h?", long_options, &option_index);

/* Detect the end of the options. */
if (c == -1)
Expand Down Expand Up @@ -645,6 +655,14 @@ GBWTConfig parse_gbwt_config(int argc, char** argv) {
config.build = GBWTConfig::build_gbz;
config.produces_one_gbwt = true;
break;

// Input GBWT construction: GBWTGraph and GBWT
case 'I':
no_multiple_input_types(config);
config.build = GBWTConfig::build_gbwtgraph;
config.gbwtgraph_name = optarg;
config.produces_one_gbwt = true;
break;

// Input GBWT construction: Paths
case 'E':
Expand Down Expand Up @@ -831,8 +849,8 @@ void validate_gbwt_config(GBWTConfig& config) {
// We have one input GBWT after steps 1-4.
bool one_input_gbwt = config.input_filenames.size() == 1 || config.produces_one_gbwt;

// We can load a PathHandleGraph from a file, get a SequenceSource from parsing GFA, or get a GBWTGraph from GBZ.
bool has_graph_input = !config.graph_name.empty() || config.build == GBWTConfig::build_gfa || config.build == GBWTConfig::build_gbz;
// We can load a PathHandleGraph from a file, get a SequenceSource from parsing GFA, or get a GBWTGraph from GBZ or GG/GBWT.
bool has_graph_input = !config.graph_name.empty() || config.build == GBWTConfig::build_gfa || config.build == GBWTConfig::build_gbz || config.build == GBWTConfig::build_gbwtgraph;

if (config.build == GBWTConfig::build_gbz) {
// If we "build" a GBWT by loading it from a GBZ, we just need to make
Expand All @@ -845,6 +863,17 @@ void validate_gbwt_config(GBWTConfig& config) {
std::cerr << "error: [vg gbwt] GBZ input requires one input arg" << std::endl;
std::exit(EXIT_FAILURE);
}
} else if (config.build == GBWTConfig::build_gbwtgraph) {
// If we "build" a GBWT by loading it from a GG and a GBWT, we just need to make
// sure that we know enough to actually load it.
if (!config.graph_name.empty()) {
std::cerr << "error: [vg gbwt] GBWTGraph input does not use -x" << std::endl;
std::exit(EXIT_FAILURE);
}
if (config.input_filenames.size() != 1) {
std::cerr << "error: [vg gbwt] GBWTGraph input requires one input arg" << std::endl;
std::exit(EXIT_FAILURE);
}
} else if (config.build != GBWTConfig::build_none) {
if (!has_gbwt_output) {
// If we build our GBWT by doing anything other than loading it
Expand Down Expand Up @@ -897,6 +926,9 @@ void validate_gbwt_config(GBWTConfig& config) {
if (config.build == GBWTConfig::build_gbz) {
std::cerr << "error: [vg gbwt] the GBWT extracted from GBZ cannot have paths modified" << std::endl;
}
if (config.build == GBWTConfig::build_gbwtgraph) {
std::cerr << "error: [vg gbwt] the GBWT loaded with a GBWTGraph cannot have paths modified" << std::endl;
}
if (!(config.input_filenames.size() == 1 || config.merge != GBWTConfig::merge_none) || !has_gbwt_output) {
std::cerr << "error: [vg gbwt] removing a sample requires one input GBWT and output GBWT" << std::endl;
std::exit(EXIT_FAILURE);
Expand Down Expand Up @@ -1177,7 +1209,7 @@ void step_1_build_gbwts(GBWTHandler& gbwts, GraphHandler& graphs, GBWTConfig& co
std::cerr << "Building input GBWTs" << std::endl;
}
gbwts.unbacked(); // We will build a new GBWT.
if (config.build != GBWTConfig::build_gfa && config.build != GBWTConfig::build_gbz) {
if (config.build != GBWTConfig::build_gfa && config.build != GBWTConfig::build_gbz && config.build != GBWTConfig::build_gbwtgraph) {
graphs.get_graph(config);
}

Expand Down Expand Up @@ -1242,6 +1274,11 @@ void step_1_build_gbwts(GBWTHandler& gbwts, GraphHandler& graphs, GBWTConfig& co
std::cerr << "Input type: GBZ" << std::endl;
}
graphs.load_gbz(gbwts, config);
} else if (config.build == GBWTConfig::build_gbwtgraph) {
if(config.show_progress) {
std::cerr << "Input type: GBWTGraph" << std::endl;
}
graphs.load_gbwtgraph(gbwts, config);
} else if (config.build == GBWTConfig::build_paths) {
if(config.show_progress) {
std::cerr << "Input type: embedded paths" << std::endl;
Expand Down Expand Up @@ -1462,7 +1499,7 @@ void step_5_gbwtgraph(GBWTHandler& gbwts, GraphHandler& graphs, GBWTConfig& conf
gbwtgraph::GBWTGraph graph;
if (graphs.in_use == GraphHandler::graph_source) {
graph = gbwtgraph::GBWTGraph(gbwts.compressed, *(graphs.sequence_source));
} else if (graphs.in_use == GraphHandler::graph_gbz) {
} else if (graphs.in_use == GraphHandler::graph_gbz || graphs.in_use == GraphHandler::graph_gbwtgraph) {
graph = std::move(*(graphs.gbwt_graph));
graphs.clear();
} else {
Expand Down Expand Up @@ -1653,6 +1690,25 @@ void GraphHandler::load_gbz(GBWTHandler& gbwts, GBWTConfig& config) {
}
}

void GraphHandler::load_gbwtgraph(GBWTHandler& gbwts, GBWTConfig& config) {
if (this->in_use == graph_gbwtgraph) {
return;
} else {
this->clear();
// Load the GBWT
gbwt::GBWT input_gbwt;
vg::load_gbwt(input_gbwt, config.input_filenames.front(), config.show_progress);
gbwts.use(input_gbwt);

// Then load the GBWTGraph
this->gbwt_graph = std::make_unique<gbwtgraph::GBWTGraph>();
vg::load_gbwtgraph(*this->gbwt_graph, config.gbwtgraph_name, config.show_progress);
// And connect it
this->gbwt_graph->set_gbwt(gbwts.compressed);
this->in_use = graph_gbwtgraph;
}
}

void GraphHandler::clear() {
this->path_graph.reset();
this->sequence_source.reset();
Expand Down
36 changes: 33 additions & 3 deletions src/subcommand/surject_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,23 @@ void help_surject(char** argv) {
<< " -R, --read-group NAME set this read group for all reads" << endl
<< " -f, --max-frag-len N reads with fragment lengths greater than N will not be marked properly paired in SAM/BAM/CRAM" << endl
<< " -L, --list-all-paths annotate SAM records with a list of all attempted re-alignments to paths in SS tag" << endl
<< " -C, --compression N level for compression [0-9]" << endl;
<< " -C, --compression N level for compression [0-9]" << endl
<< " -V, --no-validate skip checking whether alignments plausibly are against the provided graph" << endl;
}

/// If the given alignment doesn't make sense against the given graph (i.e.
/// doesn't agree with the nodes in the graph), print a message and stop the
/// program. Is thread-safe.
static void ensure_alignment_is_for_graph(const Alignment& aln, const HandleGraph& graph) {
AlignmentValidity validity = alignment_is_valid(aln, &graph);
if (!validity) {
#pragma omp critical (cerr)
{
std::cerr << "error:[vg surject] Alignment " << aln.name() << " cannot be interpreted against this graph: " << validity.message << std::endl;
std::cerr << "Make sure that you are using the same graph that the reads were mapped to!" << std::endl;
}
exit(1);
}
}

int main_surject(int argc, char** argv) {
Expand All @@ -79,6 +95,7 @@ int main_surject(int argc, char** argv) {
bool prune_anchors = false;
bool annotate_with_all_path_scores = false;
bool multimap = false;
bool validate = true;

int c;
optind = 2; // force optind past command positional argument
Expand Down Expand Up @@ -107,11 +124,12 @@ int main_surject(int argc, char** argv) {
{"max-frag-len", required_argument, 0, 'f'},
{"list-all-paths", no_argument, 0, 'L'},
{"compress", required_argument, 0, 'C'},
{"no-validate", required_argument, 0, 'V'},
{0, 0, 0, 0}
};

int option_index = 0;
c = getopt_long (argc, argv, "hx:p:F:liGmcbsN:R:f:C:t:SPALM",
c = getopt_long (argc, argv, "hx:p:F:liGmcbsN:R:f:C:t:SPALMV",
long_options, &option_index);

// Detect the end of the options.
Expand Down Expand Up @@ -194,6 +212,10 @@ int main_surject(int argc, char** argv) {
compress_level = parse<int>(optarg);
break;

case 'V':
validate = false;
break;

case 't':
omp_set_num_threads(parse<int>(optarg));
break;
Expand Down Expand Up @@ -276,7 +298,7 @@ int main_surject(int argc, char** argv) {
if (src.has_path() && src.sequence().empty()) {
#pragma omp critical
{
cerr << "error:[surject] Read " << src.name() << " is aligned but does not have a sequence and therefore cannot be surjected. Was it derived from a GAF without a base-level alignment? or a GAF with a CIGAR string in the 'cg' tag (which does not provide enough information to reconstruct the sequence)?" << endl;
cerr << "error:[surject] Read " << src.name() << " is aligned but does not have a sequence and therefore cannot be surjected. Was it derived from a GAF without a base-level alignment? Or a GAF with a CIGAR string in the 'cg' tag (which does not provide enough information to reconstruct the sequence)?" << endl;
exit(1);
}
}
Expand Down Expand Up @@ -332,6 +354,10 @@ int main_surject(int argc, char** argv) {
exit(1);
}

if (validate) {
ensure_alignment_is_for_graph(src1, *xgidx);
ensure_alignment_is_for_graph(src2, *xgidx);
}

// Preprocess read to set metadata before surjection
set_metadata(src1);
Expand Down Expand Up @@ -399,6 +425,10 @@ int main_surject(int argc, char** argv) {
// TODO: We don't preserve order relationships (like primary/secondary).
function<void(Alignment&)> lambda = [&](Alignment& src) {

if (validate) {
ensure_alignment_is_for_graph(src, *xgidx);
}

// Preprocess read to set metadata before surjection
set_metadata(src);

Expand Down
Loading

2 comments on commit d577e3d

@adamnovak
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for merge to master. View the full report here.

16 tests passed, 0 tests failed and 0 tests skipped in 10841 seconds

@adamnovak
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for branch v1.47.0. View the full report here.

16 tests passed, 0 tests failed and 0 tests skipped in 11280 seconds

Please sign in to comment.