Merge pull request #3892 from vgteam/support-matteo

Improve Surject Validation and Error Reporting
vgteam · Mar 23, 2023 · d577e3d · d577e3d · adamnovak · Mar 24, 2023
2 parents ce2c139 + 1916a65
commit d577e3d
Show file tree

Hide file tree

Showing 8 changed files with 210 additions and 39 deletions.
diff --git a/src/alignment.cpp b/src/alignment.cpp
@@ -2081,24 +2081,32 @@ void alignment_set_distance_to_correct(Alignment& aln, const map<string ,vector<
     }
 }
 
-bool alignment_is_valid(Alignment& aln, const HandleGraph* hgraph) {
+AlignmentValidity alignment_is_valid(const Alignment& aln, const HandleGraph* hgraph) {
     for (size_t i = 0; i < aln.path().mapping_size(); ++i) {
         const Mapping& mapping = aln.path().mapping(i);
         if (!hgraph->has_node(mapping.position().node_id())) {
-            cerr << "Invalid Alignment:\n" << pb2json(aln) <<"\nNode " << mapping.position().node_id()
-                 << " not found in graph" << endl;
-            return false;
+            std::stringstream ss;
+            ss << "Node " << mapping.position().node_id() << " not found in graph";
+            return {
+                AlignmentValidity::NODE_MISSING,
+                i,
+                ss.str()
+            };
         }
         size_t node_len = hgraph->get_length(hgraph->get_handle(mapping.position().node_id()));
         if (mapping_from_length(mapping) + mapping.position().offset() > node_len) {
-            cerr << "Invalid Alignment:\n" << pb2json(aln) << "\nLength of node "
-                 << mapping.position().node_id() << " (" << node_len << ") exceeded by Mapping with offset "
-                 << mapping.position().offset() << " and from-length " << mapping_from_length(mapping) << ":\n"
-                 << pb2json(mapping) << endl;
-            return false;
-        }
-    }
-    return true;
+            std::stringstream ss;
+            ss << "Length of node "
+               << mapping.position().node_id() << " (" << node_len << ") exceeded by Mapping with offset "
+               << mapping.position().offset() << " and from-length " << mapping_from_length(mapping);
+            return {
+                AlignmentValidity::NODE_TOO_SHORT,
+                i,
+                ss.str()
+            };
+        }
+    }
+    return {AlignmentValidity::OK};
 }
 
 Alignment target_alignment(const PathPositionHandleGraph* graph, const path_handle_t& path, size_t pos1, size_t pos2,

diff --git a/src/alignment.hpp b/src/alignment.hpp
@@ -310,8 +310,34 @@ map<string ,vector<pair<size_t, bool> > > alignment_refpos_to_path_offsets(const
 void alignment_set_distance_to_correct(Alignment& aln, const Alignment& base, const unordered_map<string, string>* translation = nullptr);
 void alignment_set_distance_to_correct(Alignment& aln, const map<string, vector<pair<size_t, bool>>>& base_offsets, const unordered_map<string, string>* translation = nullptr);
 
-/// check to make sure edits on the alignment's path don't assume incorrect node lengths or ids
-bool alignment_is_valid(Alignment& aln, const HandleGraph* hgraph);
+/**
+ * Represents a report on whether an alignment makes sense in the context of a graph.
+ */
+struct AlignmentValidity {
+    /// The different kinds of possible problems with alignments
+    enum Problem {
+        OK,
+        NODE_MISSING,
+        NODE_TOO_SHORT
+    };
+
+    /// The kind of problem with the alignment.
+    Problem problem = OK;
+    /// The mapping in the alignment's path at which the problem was encountered.
+    size_t bad_mapping_index = 0;
+    /// An explanation for the problem.
+    std::string message = "";
+
+    /// We are truthy if the alignment has no problem, and falsey otherwise.
+    inline operator bool() const {
+        return problem == OK;
+    }
+};
+
+/// Check to make sure edits on the alignment's path don't assume incorrect
+/// node lengths or ids. Result can be used like a bool or inspected for
+/// further details. Does not log anything itself about bad alignments.
+AlignmentValidity alignment_is_valid(const Alignment& aln, const HandleGraph* hgraph);
 
 /// Make an Alignment corresponding to a subregion of a stored path.
 /// Positions are 0-based, and pos2 is excluded.

diff --git a/src/hts_alignment_emitter.cpp b/src/hts_alignment_emitter.cpp
@@ -201,6 +201,9 @@ vector<tuple<path_handle_t, size_t, size_t>> get_sequence_dictionary(const strin
     // and filled in later
     vector<pair<string, int64_t>> input_names_lengths;
 
+    // Should we print path subrange warnings?
+    bool print_subrange_warnings = true;
+
     // When we get a sequence and possibly its length (or -1 if no length is
     // known), put it in the dictionary.
     // Can optionally provide a file name for error reporting.
@@ -223,6 +226,24 @@ vector<tuple<path_handle_t, size_t, size_t>> get_sequence_dictionary(const strin
                 cerr << endl;
                 exit(1);
             }
+
+            if (print_subrange_warnings) {
+                subrange_t subrange;
+                std::string base_path_name = Paths::strip_subrange(sequence_name, &subrange);
+                if (subrange != PathMetadata::NO_SUBRANGE) {
+                    // The user is asking explicitly to surject to a path that is a
+                    // subrange of some other logical path, like
+                    // GRCh38#0#chr1[1000-2000]. That's weird. Warn.
+                    cerr << "warning:[vg::get_sequence_dictionary] Path " << sequence_name;
+                    if (filename) {
+                        // Report the source file.
+                        cerr << " from sequence dictionary in " << *filename;
+                    }
+                    cerr << " looks like part of a path. Output coordinates will be in " << base_path_name << " instead. Suppressing further warnings." << endl;
+                    print_subrange_warnings = false;
+                }
+            }
+
             // Remember the path
             base_path_to_subpaths[sequence_name].push_back(path);
         } else {
@@ -237,10 +258,10 @@ vector<tuple<path_handle_t, size_t, size_t>> get_sequence_dictionary(const strin
             });
             if (!base_path_to_subpaths.count(sequence_name)) {
                 // We didn't find any subpaths for this path as a base path either.
-                cerr << "error:[vg::get_sequence_dictionary] Graph does not have a path named " << sequence_name;
+                cerr << "error:[vg::get_sequence_dictionary] Graph does not have the entirety or any pieces of a path named " << sequence_name;
                 if (filename) {
                     // Report the source file.
-                    cerr << " which was indicated in " << filename;
+                    cerr << " which was indicated in " << *filename;
                 }
                 cerr << endl;
                 exit(1);

diff --git a/src/subcommand/gbwt_main.cpp b/src/subcommand/gbwt_main.cpp
@@ -28,7 +28,7 @@ using namespace vg;
 
 struct GBWTConfig {
     // Build mode also defines the type of input args.
-    enum build_mode { build_none, build_vcf, build_gfa, build_paths, build_alignments, build_gbz };
+    enum build_mode { build_none, build_vcf, build_gfa, build_paths, build_alignments, build_gbz, build_gbwtgraph };
     enum merge_mode { merge_none, merge_insert, merge_fast, merge_parallel };
     enum path_cover_mode { path_cover_none, path_cover_augment, path_cover_local, path_cover_greedy };
 
@@ -66,6 +66,7 @@ struct GBWTConfig {
     std::vector<std::string> input_filenames;
     std::string gbwt_name; // There is a single input GBWT to load.
     std::string graph_name;
+    std::string gbwtgraph_name;
 
     // File names.
     std::string gbwt_output; // Output GBWT.
@@ -104,7 +105,7 @@ struct GBWTConfig {
 };
 
 struct GraphHandler {
-    enum graph_type { graph_none, graph_path, graph_source, graph_gbz };
+    enum graph_type { graph_none, graph_path, graph_source, graph_gbz, graph_gbwtgraph };
 
     std::unique_ptr<PathHandleGraph> path_graph = nullptr;
     std::unique_ptr<gbwtgraph::SequenceSource> sequence_source = nullptr;
@@ -123,6 +124,11 @@ struct GraphHandler {
     // graph in this handler.
     // NOTE: The graph will become invalid if the GBWT in the GBWTHandler changes.
     void load_gbz(GBWTHandler& gbwts, GBWTConfig& config);
+
+    // Load the GBWTGraph specified in the config, store the GBWT in the
+    // GBWTHandler and the graph in this handler.
+    // NOTE: The graph will become invalid if the GBWT in the GBWTHandler changes.
+    void load_gbwtgraph(GBWTHandler& gbwts, GBWTConfig& config);
 
     void clear();
 
@@ -268,6 +274,7 @@ void help_gbwt(char** argv) {
     std::cerr << "        --translation FILE  write the segment to node translation table to FILE" << std::endl;
     std::cerr << "    -Z, --gbz-input         extract GBWT and GBWTGraph from GBZ input (one input arg)" << std::endl;
     std::cerr << "        --translation FILE  write the segment to node translation table to FILE" << std::endl;
+    std::cerr << "    -I, --gg-in FILE        load GBWTGraph from FILE and GBWT from input (one input arg) " << std::endl;
     std::cerr << "    -E, --index-paths       index the embedded non-alt paths in the graph (requires -x, no input args)" << std::endl;
     std::cerr << "    -A, --alignment-input   index the alignments in the GAF files specified in input args (requires -x)" << std::endl;
     std::cerr << "        --gam-format        the input files are in GAM format instead of GAF format" << std::endl;
@@ -428,6 +435,9 @@ GBWTConfig parse_gbwt_config(int argc, char** argv) {
 
         // Input GBWT construction: GBZ
         { "gbz-input", no_argument, 0, 'Z' },
+
+        // Input GBWT construction: GBWTGraph and GBWT
+        { "gg-in", required_argument, 0, 'I' },
 
         // Input GBWT construction: paths
         { "index-paths", no_argument, 0, 'E' },
@@ -487,7 +497,7 @@ GBWTConfig parse_gbwt_config(int argc, char** argv) {
     GBWTConfig config;
     while (true) {
         int option_index = 0;
-        c = getopt_long(argc, argv, "x:o:d:pvGZEAmfbR:alPn:k:g:r:MCHSLTce:h?", long_options, &option_index);
+        c = getopt_long(argc, argv, "x:o:d:pvGZI:EAmfbR:alPn:k:g:r:MCHSLTce:h?", long_options, &option_index);
 
         /* Detect the end of the options. */
         if (c == -1)
@@ -645,6 +655,14 @@ GBWTConfig parse_gbwt_config(int argc, char** argv) {
             config.build = GBWTConfig::build_gbz;
             config.produces_one_gbwt = true;
             break;
+
+        // Input GBWT construction: GBWTGraph and GBWT
+        case 'I':
+            no_multiple_input_types(config);
+            config.build = GBWTConfig::build_gbwtgraph;
+            config.gbwtgraph_name = optarg;
+            config.produces_one_gbwt = true;
+            break;
 
         // Input GBWT construction: Paths
         case 'E':
@@ -831,8 +849,8 @@ void validate_gbwt_config(GBWTConfig& config) {
     // We have one input GBWT after steps 1-4.
     bool one_input_gbwt = config.input_filenames.size() == 1 || config.produces_one_gbwt;
 
-    // We can load a PathHandleGraph from a file, get a SequenceSource from parsing GFA, or get a GBWTGraph from GBZ.
-    bool has_graph_input = !config.graph_name.empty() || config.build == GBWTConfig::build_gfa || config.build == GBWTConfig::build_gbz;
+    // We can load a PathHandleGraph from a file, get a SequenceSource from parsing GFA, or get a GBWTGraph from GBZ or GG/GBWT.
+    bool has_graph_input = !config.graph_name.empty() || config.build == GBWTConfig::build_gfa || config.build == GBWTConfig::build_gbz || config.build == GBWTConfig::build_gbwtgraph;
 
     if (config.build == GBWTConfig::build_gbz) {
         // If we "build" a GBWT by loading it from a GBZ, we just need to make
@@ -845,6 +863,17 @@ void validate_gbwt_config(GBWTConfig& config) {
             std::cerr << "error: [vg gbwt] GBZ input requires one input arg" << std::endl;
             std::exit(EXIT_FAILURE);
         }
+    } else if (config.build == GBWTConfig::build_gbwtgraph) {
+        // If we "build" a GBWT by loading it from a GG and a GBWT, we just need to make
+        // sure that we know enough to actually load it.  
+        if (!config.graph_name.empty()) {
+            std::cerr << "error: [vg gbwt] GBWTGraph input does not use -x" << std::endl;
+            std::exit(EXIT_FAILURE);
+        }
+        if (config.input_filenames.size() != 1) {
+            std::cerr << "error: [vg gbwt] GBWTGraph input requires one input arg" << std::endl;
+            std::exit(EXIT_FAILURE);
+        }
     } else if (config.build != GBWTConfig::build_none) {
         if (!has_gbwt_output) {
             // If we build our GBWT by doing anything other than loading it
@@ -897,6 +926,9 @@ void validate_gbwt_config(GBWTConfig& config) {
         if (config.build == GBWTConfig::build_gbz) {
             std::cerr << "error: [vg gbwt] the GBWT extracted from GBZ cannot have paths modified" << std::endl;
         }
+        if (config.build == GBWTConfig::build_gbwtgraph) {
+            std::cerr << "error: [vg gbwt] the GBWT loaded with a GBWTGraph cannot have paths modified" << std::endl;
+        }
         if (!(config.input_filenames.size() == 1 || config.merge != GBWTConfig::merge_none) || !has_gbwt_output) {
             std::cerr << "error: [vg gbwt] removing a sample requires one input GBWT and output GBWT" << std::endl;
             std::exit(EXIT_FAILURE);
@@ -1177,7 +1209,7 @@ void step_1_build_gbwts(GBWTHandler& gbwts, GraphHandler& graphs, GBWTConfig& co
         std::cerr << "Building input GBWTs" << std::endl;
     }
     gbwts.unbacked(); // We will build a new GBWT.
-    if (config.build != GBWTConfig::build_gfa && config.build != GBWTConfig::build_gbz) {
+    if (config.build != GBWTConfig::build_gfa && config.build != GBWTConfig::build_gbz && config.build != GBWTConfig::build_gbwtgraph) {
         graphs.get_graph(config);
     }
 
@@ -1242,6 +1274,11 @@ void step_1_build_gbwts(GBWTHandler& gbwts, GraphHandler& graphs, GBWTConfig& co
             std::cerr << "Input type: GBZ" << std::endl;
         }
         graphs.load_gbz(gbwts, config);
+    } else if (config.build == GBWTConfig::build_gbwtgraph) {
+        if(config.show_progress) {
+            std::cerr << "Input type: GBWTGraph" << std::endl;
+        }
+        graphs.load_gbwtgraph(gbwts, config);
     } else if (config.build == GBWTConfig::build_paths) {
         if(config.show_progress) {
             std::cerr << "Input type: embedded paths" << std::endl;
@@ -1462,7 +1499,7 @@ void step_5_gbwtgraph(GBWTHandler& gbwts, GraphHandler& graphs, GBWTConfig& conf
     gbwtgraph::GBWTGraph graph;
     if (graphs.in_use == GraphHandler::graph_source) {
         graph = gbwtgraph::GBWTGraph(gbwts.compressed, *(graphs.sequence_source));
-    } else if (graphs.in_use == GraphHandler::graph_gbz) {
+    } else if (graphs.in_use == GraphHandler::graph_gbz || graphs.in_use == GraphHandler::graph_gbwtgraph) {
         graph = std::move(*(graphs.gbwt_graph));
         graphs.clear();
     } else {
@@ -1653,6 +1690,25 @@ void GraphHandler::load_gbz(GBWTHandler& gbwts, GBWTConfig& config) {
     }
 }
 
+void GraphHandler::load_gbwtgraph(GBWTHandler& gbwts, GBWTConfig& config) {
+    if (this->in_use == graph_gbwtgraph) {
+        return;
+    } else {
+        this->clear();
+        // Load the GBWT
+        gbwt::GBWT input_gbwt;
+        vg::load_gbwt(input_gbwt, config.input_filenames.front(), config.show_progress);
+        gbwts.use(input_gbwt);
+
+        // Then load the GBWTGraph
+        this->gbwt_graph = std::make_unique<gbwtgraph::GBWTGraph>();
+        vg::load_gbwtgraph(*this->gbwt_graph, config.gbwtgraph_name, config.show_progress);
+        // And connect it
+        this->gbwt_graph->set_gbwt(gbwts.compressed);
+        this->in_use = graph_gbwtgraph;
+    }
+}
+
 void GraphHandler::clear() {
     this->path_graph.reset();
     this->sequence_source.reset();

diff --git a/src/subcommand/surject_main.cpp b/src/subcommand/surject_main.cpp
@@ -52,7 +52,23 @@ void help_surject(char** argv) {
          << "  -R, --read-group NAME   set this read group for all reads" << endl
          << "  -f, --max-frag-len N    reads with fragment lengths greater than N will not be marked properly paired in SAM/BAM/CRAM" << endl
          << "  -L, --list-all-paths    annotate SAM records with a list of all attempted re-alignments to paths in SS tag" << endl
-         << "  -C, --compression N     level for compression [0-9]" << endl;
+         << "  -C, --compression N     level for compression [0-9]" << endl
+         << "  -V, --no-validate       skip checking whether alignments plausibly are against the provided graph" << endl;
+}
+
+/// If the given alignment doesn't make sense against the given graph (i.e.
+/// doesn't agree with the nodes in the graph), print a message and stop the
+/// program. Is thread-safe.
+static void ensure_alignment_is_for_graph(const Alignment& aln, const HandleGraph& graph) {
+    AlignmentValidity validity = alignment_is_valid(aln, &graph);
+    if (!validity) {
+        #pragma omp critical (cerr)
+        {
+            std::cerr << "error:[vg surject] Alignment " << aln.name() << " cannot be interpreted against this graph: " << validity.message << std::endl;
+            std::cerr << "Make sure that you are using the same graph that the reads were mapped to!" << std::endl;
+        }
+        exit(1);
+    }
 }
 
 int main_surject(int argc, char** argv) {
@@ -79,6 +95,7 @@ int main_surject(int argc, char** argv) {
     bool prune_anchors = false;
     bool annotate_with_all_path_scores = false;
     bool multimap = false;
+    bool validate = true;
 
     int c;
     optind = 2; // force optind past command positional argument
@@ -107,11 +124,12 @@ int main_surject(int argc, char** argv) {
             {"max-frag-len", required_argument, 0, 'f'},
             {"list-all-paths", no_argument, 0, 'L'},
             {"compress", required_argument, 0, 'C'},
+            {"no-validate", required_argument, 0, 'V'},
             {0, 0, 0, 0}
         };
 
         int option_index = 0;
-        c = getopt_long (argc, argv, "hx:p:F:liGmcbsN:R:f:C:t:SPALM",
+        c = getopt_long (argc, argv, "hx:p:F:liGmcbsN:R:f:C:t:SPALMV",
                 long_options, &option_index);
 
         // Detect the end of the options.
@@ -194,6 +212,10 @@ int main_surject(int argc, char** argv) {
             compress_level = parse<int>(optarg);
             break;
 
+        case 'V':
+            validate = false;
+            break;
+
         case 't':
             omp_set_num_threads(parse<int>(optarg));
             break;
@@ -276,7 +298,7 @@ int main_surject(int argc, char** argv) {
             if (src.has_path() && src.sequence().empty()) {
 #pragma omp critical
                 {
-                    cerr << "error:[surject] Read " << src.name() << " is aligned but does not have a sequence and therefore cannot be surjected. Was it derived from a GAF without a base-level alignment? or a GAF with a CIGAR string in the 'cg' tag (which does not provide enough information to reconstruct the sequence)?" << endl;
+                    cerr << "error:[surject] Read " << src.name() << " is aligned but does not have a sequence and therefore cannot be surjected. Was it derived from a GAF without a base-level alignment? Or a GAF with a CIGAR string in the 'cg' tag (which does not provide enough information to reconstruct the sequence)?" << endl;
                     exit(1);
                 }
             }
@@ -332,6 +354,10 @@ int main_surject(int argc, char** argv) {
                     exit(1);
                 }
 
+                if (validate) {
+                    ensure_alignment_is_for_graph(src1, *xgidx);
+                    ensure_alignment_is_for_graph(src2, *xgidx);
+                }
 
                 // Preprocess read to set metadata before surjection
                 set_metadata(src1);
@@ -399,6 +425,10 @@ int main_surject(int argc, char** argv) {
             // TODO: We don't preserve order relationships (like primary/secondary).
             function<void(Alignment&)> lambda = [&](Alignment& src) {
 
+                if (validate) {
+                    ensure_alignment_is_for_graph(src, *xgidx);
+                }
+
                 // Preprocess read to set metadata before surjection
                 set_metadata(src);