diff --git a/src/alignment.cpp b/src/alignment.cpp index 3a9b39ff742..6b464fe38b1 100644 --- a/src/alignment.cpp +++ b/src/alignment.cpp @@ -2081,24 +2081,32 @@ void alignment_set_distance_to_correct(Alignment& aln, const maphas_node(mapping.position().node_id())) { - cerr << "Invalid Alignment:\n" << pb2json(aln) <<"\nNode " << mapping.position().node_id() - << " not found in graph" << endl; - return false; + std::stringstream ss; + ss << "Node " << mapping.position().node_id() << " not found in graph"; + return { + AlignmentValidity::NODE_MISSING, + i, + ss.str() + }; } size_t node_len = hgraph->get_length(hgraph->get_handle(mapping.position().node_id())); if (mapping_from_length(mapping) + mapping.position().offset() > node_len) { - cerr << "Invalid Alignment:\n" << pb2json(aln) << "\nLength of node " - << mapping.position().node_id() << " (" << node_len << ") exceeded by Mapping with offset " - << mapping.position().offset() << " and from-length " << mapping_from_length(mapping) << ":\n" - << pb2json(mapping) << endl; - return false; - } - } - return true; + std::stringstream ss; + ss << "Length of node " + << mapping.position().node_id() << " (" << node_len << ") exceeded by Mapping with offset " + << mapping.position().offset() << " and from-length " << mapping_from_length(mapping); + return { + AlignmentValidity::NODE_TOO_SHORT, + i, + ss.str() + }; + } + } + return {AlignmentValidity::OK}; } Alignment target_alignment(const PathPositionHandleGraph* graph, const path_handle_t& path, size_t pos1, size_t pos2, diff --git a/src/alignment.hpp b/src/alignment.hpp index 9069f2e4bbc..1e7f2a11bdb 100644 --- a/src/alignment.hpp +++ b/src/alignment.hpp @@ -310,8 +310,34 @@ map > > alignment_refpos_to_path_offsets(const void alignment_set_distance_to_correct(Alignment& aln, const Alignment& base, const unordered_map* translation = nullptr); void alignment_set_distance_to_correct(Alignment& aln, const map>>& base_offsets, const unordered_map* translation = nullptr); -/// check to make sure edits on the alignment's path don't assume incorrect node lengths or ids -bool alignment_is_valid(Alignment& aln, const HandleGraph* hgraph); +/** + * Represents a report on whether an alignment makes sense in the context of a graph. + */ +struct AlignmentValidity { + /// The different kinds of possible problems with alignments + enum Problem { + OK, + NODE_MISSING, + NODE_TOO_SHORT + }; + + /// The kind of problem with the alignment. + Problem problem = OK; + /// The mapping in the alignment's path at which the problem was encountered. + size_t bad_mapping_index = 0; + /// An explanation for the problem. + std::string message = ""; + + /// We are truthy if the alignment has no problem, and falsey otherwise. + inline operator bool() const { + return problem == OK; + } +}; + +/// Check to make sure edits on the alignment's path don't assume incorrect +/// node lengths or ids. Result can be used like a bool or inspected for +/// further details. Does not log anything itself about bad alignments. +AlignmentValidity alignment_is_valid(const Alignment& aln, const HandleGraph* hgraph); /// Make an Alignment corresponding to a subregion of a stored path. /// Positions are 0-based, and pos2 is excluded. diff --git a/src/hts_alignment_emitter.cpp b/src/hts_alignment_emitter.cpp index aaaf2f4a0b5..fc0c4d79705 100644 --- a/src/hts_alignment_emitter.cpp +++ b/src/hts_alignment_emitter.cpp @@ -201,6 +201,9 @@ vector> get_sequence_dictionary(const strin // and filled in later vector> input_names_lengths; + // Should we print path subrange warnings? + bool print_subrange_warnings = true; + // When we get a sequence and possibly its length (or -1 if no length is // known), put it in the dictionary. // Can optionally provide a file name for error reporting. @@ -223,6 +226,24 @@ vector> get_sequence_dictionary(const strin cerr << endl; exit(1); } + + if (print_subrange_warnings) { + subrange_t subrange; + std::string base_path_name = Paths::strip_subrange(sequence_name, &subrange); + if (subrange != PathMetadata::NO_SUBRANGE) { + // The user is asking explicitly to surject to a path that is a + // subrange of some other logical path, like + // GRCh38#0#chr1[1000-2000]. That's weird. Warn. + cerr << "warning:[vg::get_sequence_dictionary] Path " << sequence_name; + if (filename) { + // Report the source file. + cerr << " from sequence dictionary in " << *filename; + } + cerr << " looks like part of a path. Output coordinates will be in " << base_path_name << " instead. Suppressing further warnings." << endl; + print_subrange_warnings = false; + } + } + // Remember the path base_path_to_subpaths[sequence_name].push_back(path); } else { @@ -237,10 +258,10 @@ vector> get_sequence_dictionary(const strin }); if (!base_path_to_subpaths.count(sequence_name)) { // We didn't find any subpaths for this path as a base path either. - cerr << "error:[vg::get_sequence_dictionary] Graph does not have a path named " << sequence_name; + cerr << "error:[vg::get_sequence_dictionary] Graph does not have the entirety or any pieces of a path named " << sequence_name; if (filename) { // Report the source file. - cerr << " which was indicated in " << filename; + cerr << " which was indicated in " << *filename; } cerr << endl; exit(1); diff --git a/src/subcommand/gbwt_main.cpp b/src/subcommand/gbwt_main.cpp index 577938ba01e..22bd92cae69 100644 --- a/src/subcommand/gbwt_main.cpp +++ b/src/subcommand/gbwt_main.cpp @@ -28,7 +28,7 @@ using namespace vg; struct GBWTConfig { // Build mode also defines the type of input args. - enum build_mode { build_none, build_vcf, build_gfa, build_paths, build_alignments, build_gbz }; + enum build_mode { build_none, build_vcf, build_gfa, build_paths, build_alignments, build_gbz, build_gbwtgraph }; enum merge_mode { merge_none, merge_insert, merge_fast, merge_parallel }; enum path_cover_mode { path_cover_none, path_cover_augment, path_cover_local, path_cover_greedy }; @@ -66,6 +66,7 @@ struct GBWTConfig { std::vector input_filenames; std::string gbwt_name; // There is a single input GBWT to load. std::string graph_name; + std::string gbwtgraph_name; // File names. std::string gbwt_output; // Output GBWT. @@ -104,7 +105,7 @@ struct GBWTConfig { }; struct GraphHandler { - enum graph_type { graph_none, graph_path, graph_source, graph_gbz }; + enum graph_type { graph_none, graph_path, graph_source, graph_gbz, graph_gbwtgraph }; std::unique_ptr path_graph = nullptr; std::unique_ptr sequence_source = nullptr; @@ -123,6 +124,11 @@ struct GraphHandler { // graph in this handler. // NOTE: The graph will become invalid if the GBWT in the GBWTHandler changes. void load_gbz(GBWTHandler& gbwts, GBWTConfig& config); + + // Load the GBWTGraph specified in the config, store the GBWT in the + // GBWTHandler and the graph in this handler. + // NOTE: The graph will become invalid if the GBWT in the GBWTHandler changes. + void load_gbwtgraph(GBWTHandler& gbwts, GBWTConfig& config); void clear(); @@ -268,6 +274,7 @@ void help_gbwt(char** argv) { std::cerr << " --translation FILE write the segment to node translation table to FILE" << std::endl; std::cerr << " -Z, --gbz-input extract GBWT and GBWTGraph from GBZ input (one input arg)" << std::endl; std::cerr << " --translation FILE write the segment to node translation table to FILE" << std::endl; + std::cerr << " -I, --gg-in FILE load GBWTGraph from FILE and GBWT from input (one input arg) " << std::endl; std::cerr << " -E, --index-paths index the embedded non-alt paths in the graph (requires -x, no input args)" << std::endl; std::cerr << " -A, --alignment-input index the alignments in the GAF files specified in input args (requires -x)" << std::endl; std::cerr << " --gam-format the input files are in GAM format instead of GAF format" << std::endl; @@ -428,6 +435,9 @@ GBWTConfig parse_gbwt_config(int argc, char** argv) { // Input GBWT construction: GBZ { "gbz-input", no_argument, 0, 'Z' }, + + // Input GBWT construction: GBWTGraph and GBWT + { "gg-in", required_argument, 0, 'I' }, // Input GBWT construction: paths { "index-paths", no_argument, 0, 'E' }, @@ -487,7 +497,7 @@ GBWTConfig parse_gbwt_config(int argc, char** argv) { GBWTConfig config; while (true) { int option_index = 0; - c = getopt_long(argc, argv, "x:o:d:pvGZEAmfbR:alPn:k:g:r:MCHSLTce:h?", long_options, &option_index); + c = getopt_long(argc, argv, "x:o:d:pvGZI:EAmfbR:alPn:k:g:r:MCHSLTce:h?", long_options, &option_index); /* Detect the end of the options. */ if (c == -1) @@ -645,6 +655,14 @@ GBWTConfig parse_gbwt_config(int argc, char** argv) { config.build = GBWTConfig::build_gbz; config.produces_one_gbwt = true; break; + + // Input GBWT construction: GBWTGraph and GBWT + case 'I': + no_multiple_input_types(config); + config.build = GBWTConfig::build_gbwtgraph; + config.gbwtgraph_name = optarg; + config.produces_one_gbwt = true; + break; // Input GBWT construction: Paths case 'E': @@ -831,8 +849,8 @@ void validate_gbwt_config(GBWTConfig& config) { // We have one input GBWT after steps 1-4. bool one_input_gbwt = config.input_filenames.size() == 1 || config.produces_one_gbwt; - // We can load a PathHandleGraph from a file, get a SequenceSource from parsing GFA, or get a GBWTGraph from GBZ. - bool has_graph_input = !config.graph_name.empty() || config.build == GBWTConfig::build_gfa || config.build == GBWTConfig::build_gbz; + // We can load a PathHandleGraph from a file, get a SequenceSource from parsing GFA, or get a GBWTGraph from GBZ or GG/GBWT. + bool has_graph_input = !config.graph_name.empty() || config.build == GBWTConfig::build_gfa || config.build == GBWTConfig::build_gbz || config.build == GBWTConfig::build_gbwtgraph; if (config.build == GBWTConfig::build_gbz) { // If we "build" a GBWT by loading it from a GBZ, we just need to make @@ -845,6 +863,17 @@ void validate_gbwt_config(GBWTConfig& config) { std::cerr << "error: [vg gbwt] GBZ input requires one input arg" << std::endl; std::exit(EXIT_FAILURE); } + } else if (config.build == GBWTConfig::build_gbwtgraph) { + // If we "build" a GBWT by loading it from a GG and a GBWT, we just need to make + // sure that we know enough to actually load it. + if (!config.graph_name.empty()) { + std::cerr << "error: [vg gbwt] GBWTGraph input does not use -x" << std::endl; + std::exit(EXIT_FAILURE); + } + if (config.input_filenames.size() != 1) { + std::cerr << "error: [vg gbwt] GBWTGraph input requires one input arg" << std::endl; + std::exit(EXIT_FAILURE); + } } else if (config.build != GBWTConfig::build_none) { if (!has_gbwt_output) { // If we build our GBWT by doing anything other than loading it @@ -897,6 +926,9 @@ void validate_gbwt_config(GBWTConfig& config) { if (config.build == GBWTConfig::build_gbz) { std::cerr << "error: [vg gbwt] the GBWT extracted from GBZ cannot have paths modified" << std::endl; } + if (config.build == GBWTConfig::build_gbwtgraph) { + std::cerr << "error: [vg gbwt] the GBWT loaded with a GBWTGraph cannot have paths modified" << std::endl; + } if (!(config.input_filenames.size() == 1 || config.merge != GBWTConfig::merge_none) || !has_gbwt_output) { std::cerr << "error: [vg gbwt] removing a sample requires one input GBWT and output GBWT" << std::endl; std::exit(EXIT_FAILURE); @@ -1177,7 +1209,7 @@ void step_1_build_gbwts(GBWTHandler& gbwts, GraphHandler& graphs, GBWTConfig& co std::cerr << "Building input GBWTs" << std::endl; } gbwts.unbacked(); // We will build a new GBWT. - if (config.build != GBWTConfig::build_gfa && config.build != GBWTConfig::build_gbz) { + if (config.build != GBWTConfig::build_gfa && config.build != GBWTConfig::build_gbz && config.build != GBWTConfig::build_gbwtgraph) { graphs.get_graph(config); } @@ -1242,6 +1274,11 @@ void step_1_build_gbwts(GBWTHandler& gbwts, GraphHandler& graphs, GBWTConfig& co std::cerr << "Input type: GBZ" << std::endl; } graphs.load_gbz(gbwts, config); + } else if (config.build == GBWTConfig::build_gbwtgraph) { + if(config.show_progress) { + std::cerr << "Input type: GBWTGraph" << std::endl; + } + graphs.load_gbwtgraph(gbwts, config); } else if (config.build == GBWTConfig::build_paths) { if(config.show_progress) { std::cerr << "Input type: embedded paths" << std::endl; @@ -1462,7 +1499,7 @@ void step_5_gbwtgraph(GBWTHandler& gbwts, GraphHandler& graphs, GBWTConfig& conf gbwtgraph::GBWTGraph graph; if (graphs.in_use == GraphHandler::graph_source) { graph = gbwtgraph::GBWTGraph(gbwts.compressed, *(graphs.sequence_source)); - } else if (graphs.in_use == GraphHandler::graph_gbz) { + } else if (graphs.in_use == GraphHandler::graph_gbz || graphs.in_use == GraphHandler::graph_gbwtgraph) { graph = std::move(*(graphs.gbwt_graph)); graphs.clear(); } else { @@ -1653,6 +1690,25 @@ void GraphHandler::load_gbz(GBWTHandler& gbwts, GBWTConfig& config) { } } +void GraphHandler::load_gbwtgraph(GBWTHandler& gbwts, GBWTConfig& config) { + if (this->in_use == graph_gbwtgraph) { + return; + } else { + this->clear(); + // Load the GBWT + gbwt::GBWT input_gbwt; + vg::load_gbwt(input_gbwt, config.input_filenames.front(), config.show_progress); + gbwts.use(input_gbwt); + + // Then load the GBWTGraph + this->gbwt_graph = std::make_unique(); + vg::load_gbwtgraph(*this->gbwt_graph, config.gbwtgraph_name, config.show_progress); + // And connect it + this->gbwt_graph->set_gbwt(gbwts.compressed); + this->in_use = graph_gbwtgraph; + } +} + void GraphHandler::clear() { this->path_graph.reset(); this->sequence_source.reset(); diff --git a/src/subcommand/surject_main.cpp b/src/subcommand/surject_main.cpp index d4ffb8ca4b7..f079f93dfea 100644 --- a/src/subcommand/surject_main.cpp +++ b/src/subcommand/surject_main.cpp @@ -52,7 +52,23 @@ void help_surject(char** argv) { << " -R, --read-group NAME set this read group for all reads" << endl << " -f, --max-frag-len N reads with fragment lengths greater than N will not be marked properly paired in SAM/BAM/CRAM" << endl << " -L, --list-all-paths annotate SAM records with a list of all attempted re-alignments to paths in SS tag" << endl - << " -C, --compression N level for compression [0-9]" << endl; + << " -C, --compression N level for compression [0-9]" << endl + << " -V, --no-validate skip checking whether alignments plausibly are against the provided graph" << endl; +} + +/// If the given alignment doesn't make sense against the given graph (i.e. +/// doesn't agree with the nodes in the graph), print a message and stop the +/// program. Is thread-safe. +static void ensure_alignment_is_for_graph(const Alignment& aln, const HandleGraph& graph) { + AlignmentValidity validity = alignment_is_valid(aln, &graph); + if (!validity) { + #pragma omp critical (cerr) + { + std::cerr << "error:[vg surject] Alignment " << aln.name() << " cannot be interpreted against this graph: " << validity.message << std::endl; + std::cerr << "Make sure that you are using the same graph that the reads were mapped to!" << std::endl; + } + exit(1); + } } int main_surject(int argc, char** argv) { @@ -79,6 +95,7 @@ int main_surject(int argc, char** argv) { bool prune_anchors = false; bool annotate_with_all_path_scores = false; bool multimap = false; + bool validate = true; int c; optind = 2; // force optind past command positional argument @@ -107,11 +124,12 @@ int main_surject(int argc, char** argv) { {"max-frag-len", required_argument, 0, 'f'}, {"list-all-paths", no_argument, 0, 'L'}, {"compress", required_argument, 0, 'C'}, + {"no-validate", required_argument, 0, 'V'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "hx:p:F:liGmcbsN:R:f:C:t:SPALM", + c = getopt_long (argc, argv, "hx:p:F:liGmcbsN:R:f:C:t:SPALMV", long_options, &option_index); // Detect the end of the options. @@ -194,6 +212,10 @@ int main_surject(int argc, char** argv) { compress_level = parse(optarg); break; + case 'V': + validate = false; + break; + case 't': omp_set_num_threads(parse(optarg)); break; @@ -276,7 +298,7 @@ int main_surject(int argc, char** argv) { if (src.has_path() && src.sequence().empty()) { #pragma omp critical { - cerr << "error:[surject] Read " << src.name() << " is aligned but does not have a sequence and therefore cannot be surjected. Was it derived from a GAF without a base-level alignment? or a GAF with a CIGAR string in the 'cg' tag (which does not provide enough information to reconstruct the sequence)?" << endl; + cerr << "error:[surject] Read " << src.name() << " is aligned but does not have a sequence and therefore cannot be surjected. Was it derived from a GAF without a base-level alignment? Or a GAF with a CIGAR string in the 'cg' tag (which does not provide enough information to reconstruct the sequence)?" << endl; exit(1); } } @@ -332,6 +354,10 @@ int main_surject(int argc, char** argv) { exit(1); } + if (validate) { + ensure_alignment_is_for_graph(src1, *xgidx); + ensure_alignment_is_for_graph(src2, *xgidx); + } // Preprocess read to set metadata before surjection set_metadata(src1); @@ -399,6 +425,10 @@ int main_surject(int argc, char** argv) { // TODO: We don't preserve order relationships (like primary/secondary). function lambda = [&](Alignment& src) { + if (validate) { + ensure_alignment_is_for_graph(src, *xgidx); + } + // Preprocess read to set metadata before surjection set_metadata(src); diff --git a/src/subcommand/validate_main.cpp b/src/subcommand/validate_main.cpp index fe470e002cd..12125c6218c 100644 --- a/src/subcommand/validate_main.cpp +++ b/src/subcommand/validate_main.cpp @@ -27,7 +27,8 @@ void help_validate(char** argv) { << "options:" << endl << " default: check all aspects of the graph, if options are specified do only those" << endl << " -o, --orphans verify that all nodes have edges" << endl - << " -a, --gam FILE verify that edits in the alignment fit on nodes in the graph" << endl; + << " -a, --gam FILE verify that edits in the alignment fit on nodes in the graph" << endl + << " -A, --gam-only do not verify the graph itself, only the alignment" << endl; } int main_validate(int argc, char** argv) { @@ -39,6 +40,7 @@ int main_validate(int argc, char** argv) { bool check_orphans = false; string gam_path; + bool gam_only = false; int c; optind = 2; // force optind past command positional argument @@ -48,11 +50,12 @@ int main_validate(int argc, char** argv) { {"help", no_argument, 0, 'h'}, {"orphans", no_argument, 0, 'o'}, {"gam", required_argument, 0, 'a'}, + {"gam-only", no_argument, 0, 'A'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "hoa:", + c = getopt_long (argc, argv, "hoa:A", long_options, &option_index); // Detect the end of the options. @@ -68,6 +71,10 @@ int main_validate(int argc, char** argv) { case 'a': gam_path = optarg; break; + + case 'A': + gam_only = true; + break; case 'h': case '?': @@ -90,7 +97,15 @@ int main_validate(int argc, char** argv) { if (!gam_path.empty()) { get_input_file(gam_path, [&](istream& in) { vg::io::for_each(in, [&](Alignment& aln) { - if (!alignment_is_valid(aln, graph.get())) { + AlignmentValidity validity = alignment_is_valid(aln, graph.get()); + if (!validity) { + // Complain about this alignment + cerr << "Invalid Alignment:\n" << pb2json(aln) << "\n" << validity.message; + if (validity.problem == AlignmentValidity::NODE_TOO_SHORT) { + // If a node is too short, report the whole mapping again. + cerr << ":\n" << pb2json(aln.path().mapping(validity.bad_mapping_index)); + } + cerr << endl; valid_aln = false; } }); @@ -99,14 +114,17 @@ int main_validate(int argc, char** argv) { // VG's a little less structured, so try its own logic bool valid_graph = true; - VG* vg_graph = dynamic_cast(graph.get()); - if (vg_graph != nullptr) { - if (!vg_graph->is_valid(true, true, check_orphans, true)) { - valid_graph = false; + + if (!gam_only) { + VG* vg_graph = dynamic_cast(graph.get()); + if (vg_graph != nullptr) { + if (!vg_graph->is_valid(true, true, check_orphans, true)) { + valid_graph = false; + } } } - if (valid_graph) { + if (!gam_only && valid_graph) { // I don't think this is possible with any libbdsg implementations, but check edges just in case graph->for_each_edge([&](const edge_t& edge) { if (!graph->has_node(graph->get_id(edge.first))) { @@ -163,7 +181,9 @@ int main_validate(int argc, char** argv) { if (!gam_path.empty()) { cerr << "alignment: " << (valid_aln ? "valid" : "invalid") << endl; } - cerr << "graph: " << (valid_graph ? "valid" : "invalid") << endl; + if (!gam_only) { + cerr << "graph: " << (valid_graph ? "valid" : "invalid") << endl; + } return valid_aln && valid_graph ? 0 : 1; } diff --git a/src/surjector.cpp b/src/surjector.cpp index 770e9d4e202..b255b2fca83 100644 --- a/src/surjector.cpp +++ b/src/surjector.cpp @@ -3133,13 +3133,17 @@ using namespace std; } if (all_path_ranges_out) { if (all_path_ranges_out->empty()) { - cerr << "error: couldn't identify a path corresponding to surjected read " << source.name() << endl; + cerr << "error: couldn't identify a path range corresponding to surjected read " << source.name() + << " because there are no path ranges on path " << graph->get_path_name(path_handle) << endl; + cerr << "Surjected read dump: " << pb2json(surjected) << endl; exit(1); } path_range_out = all_path_ranges_out->front(); } else if (mappings_matched != surj_path.mapping_size()) { - cerr << "error: couldn't identify a path corresponding to surjected read " << source.name() << endl; + cerr << "error: couldn't identify a path range corresponding to surjected read " << source.name() + << " because " << mappings_matched << "/" << surj_path.mapping_size() << " mappings were matched on path " << graph->get_path_name(path_handle) << endl; + cerr << "Surjected read dump: " << pb2json(surjected) << endl; exit(1); } } diff --git a/test/t/37_vg_gbwt.t b/test/t/37_vg_gbwt.t index 73469a17fab..d79526c6321 100644 --- a/test/t/37_vg_gbwt.t +++ b/test/t/37_vg_gbwt.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 145 +plan tests 147 # Build vg graphs for two chromosomes @@ -243,6 +243,12 @@ is $? 0 "GBZ construction from VCF" cmp x.gbz x2.gbz is $? 0 "Identical construction results from GBWT and VCF" +# Build and serialize GBZ from an existing GBWT and GBWTGraph +vg gbwt -I x.gg -g x3.gbz --gbz-format x.gbwt +is $? 0 "GBZ construction from GBWTGraph" +cmp x.gbz x3.gbz +is $? 0 "Identical construction results from XG and GBWTGraph" + # Extract GBWT from GBZ vg gbwt -o extracted.gbwt -Z x.gbz is $? 0 "GBWT extraction from GBZ" @@ -257,7 +263,7 @@ is $? 0 "Identical GBWT indexes" cmp x.gg extracted2.gg is $? 0 "Identical GBWTGraphs" -rm -f x.gbwt x.gg x.gbz x2.gbz +rm -f x.gbwt x.gg x.gbz x2.gbz x3.gbz rm -f extracted.gbwt extracted2.gbwt extracted2.gg