From 9a6206e4667167257ec918802e89675e5aaebd5d Mon Sep 17 00:00:00 2001
From: Jouni Siren <jouni.siren@iki.fi>
Date: Tue, 13 Aug 2024 14:24:09 +0300
Subject: [PATCH 01/12] Better error handling in vg gbwt

---
 src/subcommand/gbwt_main.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/subcommand/gbwt_main.cpp b/src/subcommand/gbwt_main.cpp
index db1175b426e..b22f0c0ce8e 100644
--- a/src/subcommand/gbwt_main.cpp
+++ b/src/subcommand/gbwt_main.cpp
@@ -999,9 +999,11 @@ void validate_gbwt_config(GBWTConfig& config) {
     if (!config.to_remove.empty()) {
         if (config.build == GBWTConfig::build_gbz) {
             std::cerr << "error: [vg gbwt] the GBWT extracted from GBZ cannot have paths modified" << std::endl;
+            std::exit(EXIT_FAILURE);
         }
         if (config.build == GBWTConfig::build_gbwtgraph) {
             std::cerr << "error: [vg gbwt] the GBWT loaded with a GBWTGraph cannot have paths modified" << std::endl;
+            std::exit(EXIT_FAILURE);
         }
         if (!(config.input_filenames.size() == 1 || config.merge != GBWTConfig::merge_none) || !has_gbwt_output) {
             std::cerr << "error: [vg gbwt] removing a sample requires one input GBWT and output GBWT" << std::endl;

From a57b473efb79f6986b03eb5d1db394df38309c3f Mon Sep 17 00:00:00 2001
From: Jouni Siren <jouni.siren@iki.fi>
Date: Thu, 31 Oct 2024 20:59:09 -0700
Subject: [PATCH 02/12] Start getting rid of the old path/thread terminology

---
 src/subcommand/gbwt_main.cpp | 75 ++++++++++++++++++------------------
 test/t/37_vg_gbwt.t          | 62 ++++++++++++++---------------
 2 files changed, 69 insertions(+), 68 deletions(-)

diff --git a/src/subcommand/gbwt_main.cpp b/src/subcommand/gbwt_main.cpp
index b22f0c0ce8e..af88d10f2e9 100644
--- a/src/subcommand/gbwt_main.cpp
+++ b/src/subcommand/gbwt_main.cpp
@@ -37,7 +37,7 @@ struct GBWTConfig {
     build_mode build = build_none;
     merge_mode merge = merge_none;
     path_cover_mode path_cover = path_cover_none;
-    bool metadata_mode = false, thread_mode = false;
+    bool metadata_mode = false, path_mode = false;
 
     // Input GBWT construction.
     HaplotypeIndexer haplotype_indexer;
@@ -55,8 +55,8 @@ struct GBWTConfig {
 
     // Other parameters and flags.
     bool show_progress = false;
-    bool count_threads = false;
-    bool metadata = false, contigs = false, haplotypes = false, samples = false, list_names = false, thread_names = false, tags = false;
+    bool count_paths = false;
+    bool metadata = false, contigs = false, haplotypes = false, samples = false, list_names = false, path_names = false, tags = false;
     bool include_named_paths = false;
     size_t num_paths = default_num_paths(), context_length = default_context_length();
     bool num_paths_set = false;
@@ -70,7 +70,7 @@ struct GBWTConfig {
 
     // File names.
     std::string gbwt_output; // Output GBWT.
-    std::string thread_output; // Threads in SDSL format.
+    std::string path_output; // Paths in SDSL format.
     std::string graph_output; // Output GBWTGraph.
     std::string segment_translation; // Segment to node translation output.
     std::string r_index_name; // Output r-index.
@@ -163,7 +163,7 @@ void step_4_path_cover(GBWTHandler& gbwts, GraphHandler& graphs, GBWTConfig& con
 void step_5_gbwtgraph(GBWTHandler& gbwts, GraphHandler& graphs, GBWTConfig& config);
 void step_6_r_index(GBWTHandler& gbwts, GBWTConfig& config);
 void step_7_metadata(GBWTHandler& gbwts, GBWTConfig& config);
-void step_8_threads(GBWTHandler& gbwts, GBWTConfig& config);
+void step_8_paths(GBWTHandler& gbwts, GBWTConfig& config);
 
 void report_time_memory(const std::string& what, double start_time, const GBWTConfig& config);
 void print_metadata(std::ostream& out, const GBWTHandler& gbwts);
@@ -233,9 +233,9 @@ int main_gbwt(int argc, char** argv) {
         step_7_metadata(gbwts, config);
     }
 
-    // Thread options.
-    if (config.thread_mode) {
-        step_8_threads(gbwts, config);
+    // Path options.
+    if (config.path_mode) {
+        step_8_paths(gbwts, config);
     }
 
     return 0;
@@ -260,7 +260,7 @@ void help_gbwt(char** argv) {
     std::cerr << "        --id-interval N     store path ids at one out of N positions (default " << gbwt::DynamicGBWT::SAMPLE_INTERVAL << ")" << std::endl;
     std::cerr << std::endl;
     std::cerr << "Multithreading:" << std::endl;
-    std::cerr << "        --num-jobs N        use at most N parallel build jobs (for -v, -G, -l, -P; default " << GBWTConfig::default_build_jobs() << ")" << std::endl;
+    std::cerr << "        --num-jobs N        use at most N parallel build jobs (for -v, -G, -A, -l, -P; default " << GBWTConfig::default_build_jobs() << ")" << std::endl;
     std::cerr << "        --num-threads N     use N parallel search threads (for -b and -r; default " << omp_get_max_threads() << ")" << std::endl;
     std::cerr << std::endl;
     std::cerr << "Step 1: GBWT construction (requires -o and one of { -v, -G, -Z, -E, A }):" << std::endl;
@@ -330,12 +330,12 @@ void help_gbwt(char** argv) {
     std::cerr << "    -H, --haplotypes        print the number of haplotypes" << std::endl;
     std::cerr << "    -S, --samples           print the number of samples" << std::endl;
     std::cerr << "    -L, --list-names        list contig/sample names (use with -C or -S)" << std::endl;
-    std::cerr << "    -T, --thread-names      list thread names" << std::endl;
+    std::cerr << "    -T, --thread-names      list path names" << std::endl; // FIXME change to --path-names; make --thread-names a deprecated alias
     std::cerr << "        --tags              list GBWT tags" << std::endl;
     std::cerr << std::endl;
-    std::cerr << "Step 8: Threads (one input GBWT):" << std::endl;
-    std::cerr << "    -c, --count-threads     print the number of threads" << std::endl;
-    std::cerr << "    -e, --extract FILE      extract threads in SDSL format to FILE" << std::endl;
+    std::cerr << "Step 8: Paths (one input GBWT):" << std::endl;
+    std::cerr << "    -c, --count-threads     print the number of paths" << std::endl; // FIXME also here, change to --count-paths
+    std::cerr << "    -e, --extract FILE      extract paths in SDSL format to FILE" << std::endl;
     std::cerr << std::endl;
 }
 
@@ -519,11 +519,11 @@ GBWTConfig parse_gbwt_config(int argc, char** argv) {
         { "haplotypes", no_argument, 0, 'H' },
         { "samples", no_argument, 0, 'S' },
         { "list-names", no_argument, 0, 'L' },
-        { "thread-names", no_argument, 0, 'T' },
+        { "thread-names", no_argument, 0, 'T' }, // FIXME
         { "tags", no_argument, 0, OPT_TAGS },
 
-        // Threads
-        { "count-threads", no_argument, 0, 'c' },
+        // Paths
+        { "count-threads", no_argument, 0, 'c' }, // FIXME
         { "extract", required_argument, 0, 'e' },
 
         { "help", no_argument, 0, 'h' },
@@ -864,7 +864,7 @@ GBWTConfig parse_gbwt_config(int argc, char** argv) {
             config.metadata_mode = true;
             break;
         case 'T':
-            config.thread_names = true;
+            config.path_names = true;
             config.metadata_mode = true;
             break;
         case OPT_TAGS:
@@ -872,14 +872,14 @@ GBWTConfig parse_gbwt_config(int argc, char** argv) {
             config.metadata_mode = true;
             break;
 
-        // Threads
+        // Paths
         case 'c':
-            config.count_threads = true;
-            config.thread_mode = true;
+            config.count_paths = true;
+            config.path_mode = true;
             break;
         case 'e':
-            config.thread_output = optarg;
-            config.thread_mode = true;
+            config.path_output = optarg;
+            config.path_mode = true;
             break;
 
         case 'h':
@@ -1077,9 +1077,9 @@ void validate_gbwt_config(GBWTConfig& config) {
         }
     }
 
-    if (config.thread_mode) {
+    if (config.path_mode) {
         if (!one_input_gbwt) {
-            std::cerr << "error: [vg gbwt] thread operations require one input GBWT" << std::endl;
+            std::cerr << "error: [vg gbwt] path operations require one input GBWT" << std::endl;
             std::exit(EXIT_FAILURE);
         }
     }
@@ -1373,6 +1373,7 @@ void step_1_build_gbwts(GBWTHandler& gbwts, GraphHandler& graphs, GBWTConfig& co
         if (config.show_progress) {
             std::cerr << "Input type: " << (config.gam_format ? "GAM" : "GAF") << std::endl;
         }
+        // FIXME: Parallelize this.
         std::unique_ptr<gbwt::DynamicGBWT> temp = config.haplotype_indexer.build_gbwt(*(graphs.path_graph), config.input_filenames, (config.gam_format ? "GAM" : "GAF"));
         gbwts.use(*temp);
     }
@@ -1458,7 +1459,7 @@ void remove_samples(GBWTHandler& gbwts, GBWTConfig& config) {
 
     gbwts.use_dynamic();
     if (!(gbwts.dynamic.hasMetadata() && gbwts.dynamic.metadata.hasPathNames() && gbwts.dynamic.metadata.hasSampleNames())) {
-        std::cerr << "error: [vg gbwt] the index does not contain metadata with thread and sample names" << std::endl;
+        std::cerr << "error: [vg gbwt] the index does not contain metadata with path and sample names" << std::endl;
         std::exit(EXIT_FAILURE);
     }
 
@@ -1471,11 +1472,11 @@ void remove_samples(GBWTHandler& gbwts, GBWTConfig& config) {
         }
         std::vector<gbwt::size_type> path_ids = gbwts.dynamic.metadata.removeSample(sample_id);
         if (path_ids.empty()) {
-            std::cerr << "warning: [vg gbwt] no threads associated with sample " << sample_name << std::endl;
+            std::cerr << "warning: [vg gbwt] no paths associated with sample " << sample_name << std::endl;
             continue;
         }
         if (config.show_progress) {
-            std::cerr << "Removing " << path_ids.size() << " threads for sample " << sample_name << std::endl;
+            std::cerr << "Removing " << path_ids.size() << " paths for sample " << sample_name << std::endl;
         }
         gbwts.dynamic.remove(path_ids);
     }
@@ -1666,7 +1667,7 @@ void step_7_metadata(GBWTHandler& gbwts, GBWTConfig& config) {
         }
     }
 
-    if (config.thread_names) {
+    if (config.path_names) {
         auto& metadata = get_metadata();
         if (metadata.hasPathNames()) {
             // Precompute some metadata
@@ -1676,7 +1677,7 @@ void step_7_metadata(GBWTHandler& gbwts, GBWTConfig& config) {
                 std::cout << gbwtgraph::compose_path_name(gbwts.compressed, i, sense) << std::endl;
             }
         } else {
-            std::cerr << "error: [vg gbwt] the metadata does not contain thread names" << std::endl;
+            std::cerr << "error: [vg gbwt] the metadata does not contain path names" << std::endl;
         }
     }
     
@@ -1692,19 +1693,19 @@ void step_7_metadata(GBWTHandler& gbwts, GBWTConfig& config) {
 
 //----------------------------------------------------------------------------
 
-void step_8_threads(GBWTHandler& gbwts, GBWTConfig& config) {
-    // Extract threads in SDSL format.
-    if (!config.thread_output.empty()) {
+void step_8_paths(GBWTHandler& gbwts, GBWTConfig& config) {
+    // Extract paths in SDSL format.
+    if (!config.path_output.empty()) {
         double start = gbwt::readTimer();
         if (config.show_progress) {
-            std::cerr << "Extracting threads to " << config.thread_output << std::endl;
+            std::cerr << "Extracting paths to " << config.path_output << std::endl;
         }
         gbwts.use_compressed();
         if (config.show_progress) {
             std::cerr << "Starting the extraction" << std::endl;
         }
         gbwt::size_type node_width = gbwt::bit_length(gbwts.compressed.sigma() - 1);
-        gbwt::text_buffer_type out(config.thread_output, std::ios::out, gbwt::MEGABYTE, node_width);
+        gbwt::text_buffer_type out(config.path_output, std::ios::out, gbwt::MEGABYTE, node_width);
         for (gbwt::size_type id = 0; id < gbwts.compressed.sequences(); id += 2) { // Ignore reverse complements.
             gbwt::vector_type sequence = gbwts.compressed.extract(id);
             for (auto node : sequence) {
@@ -1713,11 +1714,11 @@ void step_8_threads(GBWTHandler& gbwts, GBWTConfig& config) {
             out.push_back(gbwt::ENDMARKER);
         }
         out.close();
-        report_time_memory("Threads extracted", start, config);
+        report_time_memory("Paths extracted", start, config);
     }
 
-    // There are two sequences for each thread.
-    if (config.count_threads) {
+    // There are two sequences for each path.
+    if (config.count_paths) {
         gbwts.use_compressed();
         std::cout << (gbwts.compressed.sequences() / 2) << std::endl;
     }
diff --git a/test/t/37_vg_gbwt.t b/test/t/37_vg_gbwt.t
index 6c11196fe7a..9fdc0c8a4fb 100644
--- a/test/t/37_vg_gbwt.t
+++ b/test/t/37_vg_gbwt.t
@@ -33,11 +33,11 @@ cmp x.gbwt parse_x.gbwt
 is $? 0 "identical construction results with vg gbwt and from VCF parse"
 
 # Single chromosome: metadata for haplotypes
-is $(vg gbwt -c x.gbwt) 2 "chromosome X: 2 threads"
+is $(vg gbwt -c x.gbwt) 2 "chromosome X: 2 paths"
 is $(vg gbwt -C x.gbwt) 1 "chromosome X: 1 contig"
 is $(vg gbwt -H x.gbwt) 2 "chromosome X: 2 haplotypes"
 is $(vg gbwt -S x.gbwt) 1 "chromosome X: 1 sample"
-is $(vg gbwt -T x.gbwt | wc -l) 2 "chromosome X: 2 thread names"
+is $(vg gbwt -T x.gbwt | wc -l) 2 "chromosome X: 2 path names"
 is $(vg gbwt -C -L x.gbwt | wc -l) 1 "chromosome X: 1 contig name"
 is $(vg gbwt -S -L x.gbwt | wc -l) 1 "chromosome X: 1 sample name"
 
@@ -52,7 +52,7 @@ vg index -G x2.ref.gbwt -T x.vg
 is $? 0 "chromosome X reference GBWT with vg index"
 cmp x.ref.gbwt x2.ref.gbwt
 is $? 0 "identical construction results with vg gbwt and vg index"
-is $(vg gbwt -c x.ref.gbwt) 1 "chromosome X reference: 1 thread"
+is $(vg gbwt -c x.ref.gbwt) 1 "chromosome X reference: 1 path"
 
 rm -f x.ref.gbwt x2.ref.gbwt
 
@@ -115,7 +115,7 @@ vg gbwt -x xy-alt.xg -o xy.1000gp.gbwt --preset 1000gp -v small/xy2.vcf.gz
 is $? 0 "construction preset: 1000gp"
 
 # Multiple chromosomes: metadata for haplotypes
-is $(vg gbwt -c xy.merge.gbwt) 4 "multiple chromosomes: 4 threads"
+is $(vg gbwt -c xy.merge.gbwt) 4 "multiple chromosomes: 4 paths"
 is $(vg gbwt -C xy.merge.gbwt) 2 "multiple chromosomes: 2 contigs"
 is $(vg gbwt -H xy.merge.gbwt) 2 "multiple chromosomes: 2 haplotypes"
 is $(vg gbwt -S xy.merge.gbwt) 1 "multiple chromosomes: 1 sample"
@@ -131,7 +131,7 @@ vg index -G xy2.contigs.gbwt -T xy.xg
 is $? 0 "paths as contigs with vg index"
 cmp xy.contigs.gbwt xy2.contigs.gbwt
 is $? 0 "identical construction results with vg gbwt and vg index"
-is $(vg gbwt -c xy.contigs.gbwt) 2 "paths as contigs: 2 threads"
+is $(vg gbwt -c xy.contigs.gbwt) 2 "paths as contigs: 2 paths"
 is $(vg gbwt -C xy.contigs.gbwt) 2 "paths as contigs: 2 contigs"
 is $(vg gbwt -H xy.contigs.gbwt) 1 "paths as contigs: 1 haplotype"
 is $(vg gbwt -S xy.contigs.gbwt) 1 "paths as contigs: 1 sample"
@@ -194,35 +194,35 @@ rm -f x.gbwt empty.gbwt x2.gbwt
 vg gbwt -x xy-alt.xg -o xy.gbwt -v small/xy2.vcf.gz
 vg gbwt -E -o xy.ref.gbwt -x xy.xg
 vg gbwt -m -o xy.both.gbwt xy.gbwt xy.ref.gbwt
-is $(vg gbwt -c xy.both.gbwt) 6 "haplotypes and paths: 6 threads"
+is $(vg gbwt -c xy.both.gbwt) 6 "haplotypes and paths: 6 paths"
 
 # Remove the reference sample that GBWTs use for paths
 vg gbwt -R _gbwt_ref -o xy.removed.gbwt xy.both.gbwt
 is $? 0 "samples can be removed from a GBWT index"
-is $(vg gbwt -c xy.removed.gbwt) 4 "haplotypes only: 4 threads"
+is $(vg gbwt -c xy.removed.gbwt) 4 "haplotypes only: 4 paths"
 
 rm -f xy.gbwt xy.ref.gbwt xy.both.gbwt xy.removed.gbwt
 
 
 # Build a three-sample GBWT from a simple GFA
 vg gbwt -o all.gbwt -G graphs/three_samples.gfa
-is $(vg gbwt -c all.gbwt) 12 "all samples: 12 threads"
+is $(vg gbwt -c all.gbwt) 12 "all samples: 12 paths"
 is $(vg gbwt -H all.gbwt) 6 "all samples: 6 haplotypes"
 
 # Remove samples 1 and 3
 vg gbwt -R sample1 -R sample3 -o removed.gbwt all.gbwt
 is $? 0 "multiple samples can be removed from a GBWT index"
-is $(vg gbwt -c removed.gbwt) 4 "sample 2: 4 threads"
+is $(vg gbwt -c removed.gbwt) 4 "sample 2: 4 paths"
 is $(vg gbwt -H removed.gbwt) 2 "sample 2: 2 haplotypes"
 
 rm -f all.gbwt removed.gbwt
 
 
-# Extract threads from GBWT
+# Extract paths from GBWT
 vg gbwt -x x.vg -o x.gbwt -v small/xy2.vcf.gz
 vg gbwt -e x.extract x.gbwt
-is $? 0 "threads can be extracted from GBWT"
-is $(cat x.extract | wc -c) 121 "correct size for the thread file"
+is $? 0 "paths can be extracted from GBWT"
+is $(cat x.extract | wc -c) 121 "correct size for the paths file"
 
 rm -f x.gbwt x.extract
 
@@ -271,7 +271,7 @@ rm -f extracted.gbwt extracted2.gbwt extracted2.gg
 vg gbwt -P -n 16 -x xy.xg -g xy.cover.gg -o xy.cover.gbwt
 is $? 0 "Path cover GBWTGraph construction"
 is $(md5sum xy.cover.gg | cut -f 1 -d\ ) 6a2738f51472e0ba1553a815a005b157 "GBWTGraph was serialized correctly"
-is $(vg gbwt -c xy.cover.gbwt) 32 "path cover: 32 threads"
+is $(vg gbwt -c xy.cover.gbwt) 32 "path cover: 32 paths"
 is $(vg gbwt -C xy.cover.gbwt) 2 "path cover: 2 contigs"
 is $(vg gbwt -H xy.cover.gbwt) 16 "path cover: 16 haplotypes"
 is $(vg gbwt -S xy.cover.gbwt) 16 "path cover: 16 samples"
@@ -282,10 +282,10 @@ rm -f xy.cover.gg xy.cover.gbwt
 vg gbwt -P -n 16 -x xy.xg -g xy.cover.gg -o xy.cover.gbwt --pass-paths
 is $? 0 "Path cover GBWTGraph construction"
 is $(md5sum xy.cover.gg | cut -f 1 -d\ ) 6a2738f51472e0ba1553a815a005b157 "GBWTGraph was serialized correctly"
-is $(vg gbwt -c xy.cover.gbwt) 34 "path cover w/ paths: 34 threads"
-is $(vg gbwt -C xy.cover.gbwt) 2 "path cover w/ paths: 2 contigs"
-is $(vg gbwt -H xy.cover.gbwt) 17 "path cover w/ paths: 17 haplotypes"
-is $(vg gbwt -S xy.cover.gbwt) 17 "path cover w/ paths: 17 samples"
+is $(vg gbwt -c xy.cover.gbwt) 34 "path cover w/ reference paths: 34 paths"
+is $(vg gbwt -C xy.cover.gbwt) 2 "path cover w/ reference paths: 2 contigs"
+is $(vg gbwt -H xy.cover.gbwt) 17 "path cover w/ reference paths: 17 haplotypes"
+is $(vg gbwt -S xy.cover.gbwt) 17 "path cover w/ reference paths: 17 samples"
 
 rm -f xy.cover.gg xy.cover.gbwt
 
@@ -293,7 +293,7 @@ rm -f xy.cover.gg xy.cover.gbwt
 vg gbwt -x xy-alt.xg -g xy.local.gg -l -n 16 -o xy.local.gbwt -v small/xy2.vcf.gz
 is $? 0 "Local haplotypes GBWTGraph construction"
 is $(md5sum xy.local.gg | cut -f 1 -d\ ) 00429586246711abcf1367a97d3c468c "GBWTGraph was serialized correctly"
-is $(vg gbwt -c xy.local.gbwt) 32 "local haplotypes: 32 threads"
+is $(vg gbwt -c xy.local.gbwt) 32 "local haplotypes: 32 paths"
 is $(vg gbwt -C xy.local.gbwt) 2 "local haplotypes: 2 contigs"
 is $(vg gbwt -H xy.local.gbwt) 16 "local haplotypes: 16 haplotypes"
 is $(vg gbwt -S xy.local.gbwt) 16 "local haplotypes: 16 samples"
@@ -314,10 +314,10 @@ rm -f xy.local.gg xy.local.gbwt xy.local.gbz
 vg gbwt -x xy-alt.xg -g xy.local.gg -l -n 16 -o xy.local.gbwt -v small/xy2.vcf.gz --pass-paths
 is $? 0 "Local haplotypes GBWTGraph construction"
 is $(md5sum xy.local.gg | cut -f 1 -d\ ) 6a2738f51472e0ba1553a815a005b157 "GBWTGraph was serialized correctly"
-is $(vg gbwt -c xy.local.gbwt) 34 "local haplotypes w/ paths: 34 threads"
-is $(vg gbwt -C xy.local.gbwt) 2 "local haplotypes w/ paths: 2 contigs"
-is $(vg gbwt -H xy.local.gbwt) 17 "local haplotypes w/ paths: 17 haplotypes"
-is $(vg gbwt -S xy.local.gbwt) 17 "local haplotypes w/ paths: 17 samples"
+is $(vg gbwt -c xy.local.gbwt) 34 "local haplotypes w/ reference paths: 34 paths"
+is $(vg gbwt -C xy.local.gbwt) 2 "local haplotypes w/ reference paths: 2 contigs"
+is $(vg gbwt -H xy.local.gbwt) 17 "local haplotypes w/ reference paths: 17 haplotypes"
+is $(vg gbwt -S xy.local.gbwt) 17 "local haplotypes w/ reference paths: 17 samples"
 
 rm -f xy.local.gg xy.local.gbwt
 
@@ -325,11 +325,11 @@ rm -f xy.local.gg xy.local.gbwt
 vg gbwt -G haplotype-sampling/micb-kir3dl1.gfa -g large.gbz --gbz-format
 vg gbwt -Z large.gbz -l -n 16 --pass-paths -o large.local.gbwt
 is $? 0 "Local haplotypes with reference paths from a larger GBZ"
-is $(vg gbwt -c large.local.gbwt) 36 "local haplotypes w/ paths: 36 threads"
-is $(vg gbwt -C large.local.gbwt) 2 "local haplotypes w/ paths: 2 contigs"
-is $(vg gbwt -H large.local.gbwt) 18 "local haplotypes w/ paths: 18 haplotypes"
-is $(vg gbwt -S large.local.gbwt) 18 "local haplotypes w/ paths: 18 samples"
-is $(vg gbwt --tags large.local.gbwt | grep -c reference_samples) 1 "local haplotypes w/ paths: reference_samples set"
+is $(vg gbwt -c large.local.gbwt) 36 "local haplotypes w/ reference paths: 36 paths"
+is $(vg gbwt -C large.local.gbwt) 2 "local haplotypes w/ reference paths: 2 contigs"
+is $(vg gbwt -H large.local.gbwt) 18 "local haplotypes w/ reference paths: 18 haplotypes"
+is $(vg gbwt -S large.local.gbwt) 18 "local haplotypes w/ reference paths: 18 samples"
+is $(vg gbwt --tags large.local.gbwt | grep -c reference_samples) 1 "local haplotypes w/ reference paths: reference_samples set"
 
 rm -f large.gbz large.local.gbwt
 
@@ -339,7 +339,7 @@ vg gbwt -x x.vg -o x.gbwt -v small/xy2.vcf.gz
 vg gbwt -a -n 16 -x xy.xg -g augmented.gg -o augmented.gbwt x.gbwt
 is $? 0 "Augmented GBWTGraph construction"
 is $(md5sum augmented.gg | cut -f 1 -d\ ) 00429586246711abcf1367a97d3c468c "GBWTGraph was serialized correctly"
-is $(vg gbwt -c augmented.gbwt) 18 "augmented: 18 threads"
+is $(vg gbwt -c augmented.gbwt) 18 "augmented: 18 paths"
 is $(vg gbwt -C augmented.gbwt) 2 "augmented: 2 contigs"
 is $(vg gbwt -H augmented.gbwt) 18 "augmented: 18 haplotypes"
 is $(vg gbwt -S augmented.gbwt) 17 "augmented: 17 samples"
@@ -355,7 +355,7 @@ rm -f x.vg y.vg x.xg xy.xg xy-alt.xg
 vg gbwt -o gfa.gbwt -G graphs/components_walks.gfa
 is $? 0 "GBWT construction from GFA"
 is $(md5sum gfa.gbwt | cut -f 1 -d\ ) 44c27c37c7af6911c26aea2a41008460 "GBWT was serialized correctly"
-is $(vg gbwt -c gfa.gbwt) 4 "gfa: 4 threads"
+is $(vg gbwt -c gfa.gbwt) 4 "gfa: 4 paths"
 is $(vg gbwt -C gfa.gbwt) 2 "gfa: 2 contigs"
 is $(vg gbwt -H gfa.gbwt) 2 "gfa: 2 haplotypes"
 is $(vg gbwt -S gfa.gbwt) 1 "gfa: 1 sample"
@@ -376,7 +376,7 @@ is $(md5sum gfa2.gbz | cut -f 1 -d\ ) ab241a3f79a781a367b701cb8888bf01 "GBZ was
 # Build GBWT and GBWTGraph from GFA with node chopping
 vg gbwt -o chopping.gbwt -g chopping.gg --translation chopping.trans --max-node 2 -G graphs/chopping_walks.gfa
 is $? 0 "GBWT+GBWTGraph construction from GFA with chopping"
-is $(vg gbwt -c chopping.gbwt) 3 "chopping: 3 threads"
+is $(vg gbwt -c chopping.gbwt) 3 "chopping: 3 paths"
 is $(vg gbwt -C chopping.gbwt) 1 "chopping: 1 contig"
 is $(vg gbwt -H chopping.gbwt) 3 "chopping: 3 haplotypes"
 is $(vg gbwt -S chopping.gbwt) 2 "chopping: 2 samples"
@@ -391,7 +391,7 @@ is $(wc -l < from_gbz.trans) 8 "from GBZ: 8 translations"
 # Build GBWT and GBWTGraph from GFA with both paths and walks
 vg gbwt -o ref_paths.gbwt -g ref_paths.gg --translation ref_paths.trans -G graphs/components_paths_walks.gfa
 is $? 0 "GBWT+GBWTGraph construction from GFA with reference paths"
-is $(vg gbwt -c ref_paths.gbwt) 6 "ref paths: 6 threads"
+is $(vg gbwt -c ref_paths.gbwt) 6 "ref paths: 6 paths"
 is $(vg gbwt -C ref_paths.gbwt) 2 "ref paths: 2 contigs"
 is $(vg gbwt -H ref_paths.gbwt) 3 "ref paths: 3 haplotypes"
 is $(vg gbwt -S ref_paths.gbwt) 2 "ref paths: 2 samples"

From 6377996ef9f148b02e6e890e023d32dbf01ad96c Mon Sep 17 00:00:00 2001
From: Jouni Siren <jouni.siren@iki.fi>
Date: Thu, 31 Oct 2024 21:13:18 -0700
Subject: [PATCH 03/12] Rename some long options

---
 src/subcommand/gbwt_main.cpp | 26 +++++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/src/subcommand/gbwt_main.cpp b/src/subcommand/gbwt_main.cpp
index af88d10f2e9..5e55928a4fe 100644
--- a/src/subcommand/gbwt_main.cpp
+++ b/src/subcommand/gbwt_main.cpp
@@ -330,11 +330,11 @@ void help_gbwt(char** argv) {
     std::cerr << "    -H, --haplotypes        print the number of haplotypes" << std::endl;
     std::cerr << "    -S, --samples           print the number of samples" << std::endl;
     std::cerr << "    -L, --list-names        list contig/sample names (use with -C or -S)" << std::endl;
-    std::cerr << "    -T, --thread-names      list path names" << std::endl; // FIXME change to --path-names; make --thread-names a deprecated alias
+    std::cerr << "    -T, --path-names        list path names" << std::endl;
     std::cerr << "        --tags              list GBWT tags" << std::endl;
     std::cerr << std::endl;
     std::cerr << "Step 8: Paths (one input GBWT):" << std::endl;
-    std::cerr << "    -c, --count-threads     print the number of paths" << std::endl; // FIXME also here, change to --count-paths
+    std::cerr << "    -c, --count-paths       print the number of paths" << std::endl;
     std::cerr << "    -e, --extract FILE      extract paths in SDSL format to FILE" << std::endl;
     std::cerr << std::endl;
 }
@@ -424,7 +424,11 @@ GBWTConfig parse_gbwt_config(int argc, char** argv) {
     constexpr int OPT_PASS_PATHS = 1400;
     constexpr int OPT_GBZ_FORMAT = 1500;
     constexpr int OPT_TAGS = 1700;
-    
+
+    // Deprecated options.
+    constexpr int OPT_THREAD_NAMES = 2000;
+    constexpr int OPT_COUNT_THREADS = 2001;
+
     // Make a collection of all the known tags and their descriptions. Use an ordered map so that we can do some typo guessing.
     // Values are description and list of prohibited characters.
     const std::map<std::string, std::pair<std::string, std::unordered_set<char>>> KNOWN_TAGS = {
@@ -519,11 +523,13 @@ GBWTConfig parse_gbwt_config(int argc, char** argv) {
         { "haplotypes", no_argument, 0, 'H' },
         { "samples", no_argument, 0, 'S' },
         { "list-names", no_argument, 0, 'L' },
-        { "thread-names", no_argument, 0, 'T' }, // FIXME
+        { "path-names", no_argument, 0, 'T' },
+        { "thread-names", no_argument, 0, OPT_THREAD_NAMES },
         { "tags", no_argument, 0, OPT_TAGS },
 
         // Paths
-        { "count-threads", no_argument, 0, 'c' }, // FIXME
+        { "count-paths", no_argument, 0, 'c' },
+        { "count-threads", no_argument, 0, OPT_COUNT_THREADS },
         { "extract", required_argument, 0, 'e' },
 
         { "help", no_argument, 0, 'h' },
@@ -867,6 +873,11 @@ GBWTConfig parse_gbwt_config(int argc, char** argv) {
             config.path_names = true;
             config.metadata_mode = true;
             break;
+        case OPT_THREAD_NAMES:
+            std::cerr << "warning: [vg gbwt] option --thread-names is deprecated; use --path-names instead" << std::endl;
+            config.path_names = true;
+            config.metadata_mode = true;
+            break;
         case OPT_TAGS:
             config.tags = true;
             config.metadata_mode = true;
@@ -877,6 +888,11 @@ GBWTConfig parse_gbwt_config(int argc, char** argv) {
             config.count_paths = true;
             config.path_mode = true;
             break;
+        case OPT_COUNT_THREADS:
+            std::cerr << "warning: [vg gbwt] option --count-threads is deprecated; use --count-paths instead" << std::endl;
+            config.count_paths = true;
+            config.path_mode = true;
+            break;
         case 'e':
             config.path_output = optarg;
             config.path_mode = true;

From 7c6b977c989b90e538cd7b1ab1ff422b718473d2 Mon Sep 17 00:00:00 2001
From: Jouni Siren <jouni.siren@iki.fi>
Date: Thu, 31 Oct 2024 22:28:58 -0700
Subject: [PATCH 04/12] Remove GBWT options from vg index

---
 src/subcommand/index_main.cpp | 218 ++++------------------------------
 test/t/05_vg_find.t           |   4 +-
 test/t/06_vg_index.t          |  73 ++----------
 test/t/07_vg_map.t            |   3 +-
 test/t/11_vg_paths.t          |   3 +-
 test/t/13_vg_sim.t            |   4 +-
 test/t/18_vg_call.t           |   2 +-
 test/t/26_deconstruct.t       |   3 +-
 test/t/33_vg_mpmap.t          |   3 +-
 test/t/37_vg_gbwt.t           |  32 +----
 test/t/38_vg_prune.t          |   6 +-
 test/t/46_vg_minimizer.t      |   6 +-
 12 files changed, 62 insertions(+), 295 deletions(-)

diff --git a/src/subcommand/index_main.cpp b/src/subcommand/index_main.cpp
index 26a761162f2..731fdcff26d 100644
--- a/src/subcommand/index_main.cpp
+++ b/src/subcommand/index_main.cpp
@@ -1,4 +1,4 @@
-// index.cpp: define the "vg index" subcommand, which makes xg, GCSA2, and GBWT indexes
+// index.cpp: define the "vg index" subcommand, which makes xg, GCSA2, and distance indexes
 
 #include <omp.h>
 #include <unistd.h>
@@ -11,7 +11,6 @@
 #include "subcommand.hpp"
 
 #include "../vg.hpp"
-#include "../haplotype_indexer.hpp"
 #include "xg.hpp"
 #include <vg/io/stream.hpp>
 #include <vg/io/vpkg.hpp>
@@ -23,12 +22,10 @@
 #include "../integrated_snarl_finder.hpp"
 #include "../snarl_distance_index.hpp"
 #include "../source_sink_overlay.hpp"
-#include "../gbwt_helper.hpp"
 #include "../gbwtgraph_helper.hpp"
 #include "../gcsa_helper.hpp"
 
 #include <gcsa/algorithms.h>
-#include <gbwt/variants.h>
 #include <bdsg/overlays/packed_subgraph_overlay.hpp>
 #include <handlegraph/algorithms/weakly_connected_components.hpp>
 
@@ -47,24 +44,6 @@ void help_index(char** argv) {
          << "xg options:" << endl
          << "    -x, --xg-name FILE     use this file to store a succinct, queryable version of the graph(s), or read for GCSA or distance indexing" << endl
          << "    -L, --xg-alts          include alt paths in xg" << endl
-         << "gbwt options (more in vg gbwt):" << endl
-         << "    -v, --vcf-phasing FILE generate threads from the haplotypes in the VCF file FILE" << endl
-         << "    -W, --ignore-missing   don't warn when variants in the VCF are missing from the graph; silently skip them" << endl
-         << "    -T, --store-threads    generate threads from the embedded paths" << endl
-         << "    -M, --store-gam FILE   generate threads from the alignments in gam FILE (many allowed)" << endl
-         << "    -F, --store-gaf FILE   generate threads from the alignments in gaf FILE (many allowed)" << endl
-         << "    -G, --gbwt-name FILE   store the threads as GBWT in FILE" << endl
-         << "    -z, --actual-phasing   do not make unphased homozygous genotypes phased"<< endl
-         << "    -P, --force-phasing    replace unphased genotypes with randomly phased ones" << endl
-         << "    -o, --discard-overlaps skip overlapping alternate alleles if the overlap cannot be resolved" << endl
-         << "    -B, --batch-size N     number of samples per batch (default 200)" << endl
-         << "    -u, --buffer-size N    GBWT construction buffer size in millions of nodes (default 100)" << endl
-         << "    -n, --id-interval N    store haplotype ids at one out of N positions (default 1024)" << endl
-         << "    -R, --range X..Y       process samples X to Y (inclusive)" << endl
-         << "    -r, --rename V=P       rename contig V in the VCFs to path P in the graph (may repeat)" << endl
-         << "    --rename-variants      when renaming contigs, find variants in the graph based on the new name" << endl
-         << "    -I, --region C:S-E     operate on only the given 1-based region of the given VCF contig (may repeat)" << endl
-         << "    -E, --exclude SAMPLE   exclude any samples with the given name from haplotype indexing" << endl
          << "gcsa options:" << endl
          << "    -g, --gcsa-out FILE    output a GCSA2 index to the given file" << endl
          //<< "    -i, --dbg-in FILE      use kmers from FILE instead of input VG (may repeat)" << endl
@@ -83,12 +62,6 @@ void help_index(char** argv) {
          << "                           if N is 0 then don't store distances, only the snarl tree" << endl;
 }
 
-void multiple_thread_sources() {
-    std::cerr << "error: [vg index] cannot generate threads from multiple sources (VCF, GAM, GAF, paths)" << std::endl;
-    std::cerr << "error: [vg index] GBWT indexes can be built separately and merged with vg gbwt" << std::endl;
-    std::exit(EXIT_FAILURE);
-}
-
 int main_index(int argc, char** argv) {
 
     if (argc == 2) {
@@ -101,25 +74,18 @@ int main_index(int argc, char** argv) {
     #define OPT_DISTANCE_SNARL_LIMIT 1002
 
     // Which indexes to build.
-    bool build_xg = false, build_gbwt = false, build_gcsa = false, build_dist = false;
+    bool build_xg = false, build_gcsa = false, build_dist = false;
 
     // Files we should read.
     string vcf_name, mapping_name;
     vector<string> dbg_names;
 
     // Files we should write.
-    string xg_name, gbwt_name, gcsa_name, dist_name;
-
+    string xg_name, gcsa_name, dist_name;
 
     // General
     bool show_progress = false;
 
-    // GBWT
-    HaplotypeIndexer haplotype_indexer;
-    enum thread_source_type { thread_source_none, thread_source_vcf, thread_source_paths, thread_source_gam, thread_source_gaf };
-    thread_source_type thread_source = thread_source_none;
-    vector<string> aln_file_names;
-
     // GCSA
     gcsa::size_type kmer_size = gcsa::Key::MAX_LENGTH;
     gcsa::ConstructionParameters params;
@@ -152,7 +118,7 @@ int main_index(int argc, char** argv) {
             {"thread-db", required_argument, 0, 'F'},
             {"xg-alts", no_argument, 0, 'L'},
 
-            // GBWT
+            // GBWT. These have been removed and will return an error.
             {"vcf-phasing", required_argument, 0, 'v'},
             {"ignore-missing", no_argument, 0, 'W'},
             {"store-threads", no_argument, 0, 'T'},
@@ -211,7 +177,6 @@ int main_index(int argc, char** argv) {
             break;
         case 'p':
             show_progress = true;
-            haplotype_indexer.show_progress = true;
             break;
 
         // XG
@@ -223,111 +188,26 @@ int main_index(int argc, char** argv) {
             xg_alts = true;
             break;
 
-        // GBWT
-        case 'v':
-            if (thread_source != thread_source_none) {
-                multiple_thread_sources();
-            }
-            thread_source = thread_source_vcf;
-            vcf_name = optarg;
-            break;
-        case 'W':
-            haplotype_indexer.warn_on_missing_variants = false;
-            break;
-        case 'T':
-            if (thread_source != thread_source_none) {
-                multiple_thread_sources();
-            }
-            thread_source = thread_source_paths;
-            break;
-        case 'M':
-            if (thread_source != thread_source_none && thread_source != thread_source_gam) {
-                multiple_thread_sources();
-            }
-            thread_source = thread_source_gam;
-            build_gbwt = true;
-            aln_file_names.push_back(optarg);
-            break;
-        case 'F':
-            if (thread_source != thread_source_none && thread_source != thread_source_gaf) {
-                multiple_thread_sources();
-            }
-            thread_source = thread_source_gaf;
-            build_gbwt = true;
-            aln_file_names.push_back(optarg);
-            break;
-        case 'G':
-            build_gbwt = true;
-            gbwt_name = optarg;
-            break;
-        case 'z':
-            haplotype_indexer.phase_homozygous = false;
-            break;
-        case 'P':
-            haplotype_indexer.force_phasing = true;
-            break;
-        case 'o':
-            haplotype_indexer.discard_overlaps = true;
-            break;
-        case 'B':
-            haplotype_indexer.samples_in_batch = std::max(parse<size_t>(optarg), 1ul);
-            break;
-        case 'u':
-            haplotype_indexer.gbwt_buffer_size = std::max(parse<size_t>(optarg), 1ul);
-            break;
-        case 'n':
-            haplotype_indexer.id_interval = parse<size_t>(optarg);
-            break;
-        case 'R':
-            {
-                // Parse first..last
-                string temp(optarg);
-                size_t found = temp.find("..");
-                if(found == string::npos || found == 0 || found + 2 == temp.size()) {
-                    cerr << "error: [vg index] could not parse range " << temp << endl;
-                    exit(1);
-                }
-                haplotype_indexer.sample_range.first = parse<size_t>(temp.substr(0, found));
-                haplotype_indexer.sample_range.second = parse<size_t>(temp.substr(found + 2)) + 1;
-            }
-            break;
-        case 'r':
-            {
-                // Parse the rename old=new
-                string key_value(optarg);
-                auto found = key_value.find('=');
-                if (found == string::npos || found == 0 || found + 1 == key_value.size()) {
-                    cerr << "error: [vg index] could not parse rename " << key_value << endl;
-                    exit(1);
-                }
-                // Parse out the two parts
-                string vcf_contig = key_value.substr(0, found);
-                string graph_contig = key_value.substr(found + 1);
-                // Add the name mapping
-                haplotype_indexer.path_to_vcf[graph_contig] = vcf_contig;
-            }
-            break;
-        case OPT_RENAME_VARIANTS:
-            haplotype_indexer.rename_variants = true;
-            break;
-        case 'I':
-            {
-                // We want to parse this region specifier
-                string region(optarg);
-                
-                Region parsed;
-                parse_region(region, parsed);
-                if (parsed.start <= 0 || parsed.end <= 0) {
-                    // We need both range bounds, and we can't accept 0 since input is 1-based.
-                    cerr << "error: [vg index] could not parse 1-based region " << optarg << endl;
-                }
-                
-                // Make sure to correct the coordinates to 0-based exclusive-end, from 1-based inclusive-end
-                haplotype_indexer.regions[parsed.seq] = make_pair((size_t) (parsed.start - 1), (size_t) parsed.end);
-            }
-            break;
+        // GBWT. The options remain, but they are no longer supported.
+        case 'v': // Fall through
+        case 'W': // Fall through
+        case 'T': // Fall through
+        case 'M': // Fall through
+        case 'F': // Fall through
+        case 'G': // Fall through
+        case 'z': // Fall through
+        case 'P': // Fall through
+        case 'o': // Fall through
+        case 'B': // Fall through
+        case 'u': // Fall through
+        case 'n': // Fall through
+        case 'R': // Fall through
+        case 'r': // Fall through
+        case OPT_RENAME_VARIANTS: // Fall through
+        case 'I': // Fall through
         case 'E':
-            haplotype_indexer.excluded_samples.insert(optarg);
+            std::cerr << "error: [vg index] GBWT construction options have been removed; use vg gbwt instead" << std::endl;
+            std::exit(EXIT_FAILURE);
             break;
 
         // GCSA
@@ -391,37 +271,11 @@ int main_index(int argc, char** argv) {
     }
 
 
-    if (xg_name.empty() && gbwt_name.empty() &&
-        gcsa_name.empty() && !build_gai_index && !build_vgi_index && dist_name.empty()) {
+    if (xg_name.empty() && gcsa_name.empty() && !build_gai_index && !build_vgi_index && dist_name.empty()) {
         cerr << "error: [vg index] index type not specified" << endl;
         return 1;
     }
 
-    if (build_gbwt && thread_source == thread_source_none) {
-        cerr << "error: [vg index] cannot build GBWT without threads" << endl;
-        return 1;
-    }
-
-    if (thread_source != thread_source_none && !build_gbwt) {
-        cerr << "error: [vg index] no GBWT output specified for the threads" << endl;
-        return 1;
-    }
-
-    if (thread_source == thread_source_gam || thread_source == thread_source_gaf) {
-        for (const auto& name : aln_file_names) {
-            if (name == "-") {
-                cerr << "error: [vg index] GAM (-M) and GAF (-F) input files cannot be read from stdin (-)" << endl;
-                return 1;
-            }
-        }
-    }
-
-    if (thread_source != thread_source_none && file_names.size() != 1) {
-        cerr << "error: [vg index] exactly one graph required for generating threads" << std::endl;
-        cerr << "error: [vg index] you may combine the graphs with vg index -x combined.xg --xg-alts" << std::endl;
-        return 1;
-    }
-
     if (file_names.size() <= 0 && dbg_names.empty()){
         //cerr << "No graph provided for indexing. Please provide a .vg file or GCSA2-format deBruijn graph to index." << endl;
         //return 1;
@@ -481,30 +335,6 @@ int main_index(int argc, char** argv) {
         vg::io::save_handle_graph(&xg_index, xg_name);
     }
 
-    // Generate threads
-    if (thread_source != thread_source_none) {
-
-        // Load the only input graph.
-        unique_ptr<PathHandleGraph> path_handle_graph;
-        path_handle_graph = vg::io::VPKG::load_one<PathHandleGraph>(file_names[0]);
-
-        std::unique_ptr<gbwt::DynamicGBWT> gbwt_index(nullptr);
-        if (thread_source == thread_source_vcf) {
-            std::vector<std::string> parse_files = haplotype_indexer.parse_vcf(vcf_name, *path_handle_graph);
-            path_handle_graph.reset(); // Save memory by deleting the graph.
-            gbwt_index = haplotype_indexer.build_gbwt(parse_files);
-        } else if (thread_source == thread_source_paths) {
-            gbwt_index = haplotype_indexer.build_gbwt(*path_handle_graph);
-        } else if (thread_source == thread_source_gam) {
-            gbwt_index = haplotype_indexer.build_gbwt(*path_handle_graph, aln_file_names, "GAM");
-        } else if (thread_source == thread_source_gaf) {
-            gbwt_index = haplotype_indexer.build_gbwt(*path_handle_graph, aln_file_names, "GAF");
-        }
-        if (build_gbwt && gbwt_index.get() != nullptr) {
-            save_gbwt(*gbwt_index, gbwt_name, show_progress);
-        }
-    } // End of thread indexing.
-
     // Build GCSA
     if (build_gcsa) {
 
diff --git a/test/t/05_vg_find.t b/test/t/05_vg_find.t
index f7c54e36eee..67a51741da1 100644
--- a/test/t/05_vg_find.t
+++ b/test/t/05_vg_find.t
@@ -112,7 +112,7 @@ rm -f t.xg t.vg t.x:30:35.vg t.x:10:20.vg q.x:30:35.vg q.x:10:20.vg t.bed
 
 vg construct -r small/xy.fa -v small/xy2.vcf.gz -R x -C -a > x.vg 2> /dev/null
 vg index -x x.xg x.vg
-vg index -G x.gbwt -v small/xy2.vcf.gz x.vg
+vg gbwt -v small/xy2.vcf.gz -o x.gbwt -x x.vg
 is $(vg find -p x -x x.xg -K 16 -H x.gbwt | cut -f 5 | sort | uniq -c  | tail -n 1 | awk '{ print $1 }') 1510 "we find the expected number of kmers with haplotype frequency equal to 2"
 rm -f x.vg x.xg x.gbwt
 
@@ -124,7 +124,7 @@ is $? 0 "GFA i/o for find -n consistent with converting both ways"
 
 # Find nodes that map to the provided ids
 vg construct -m 32 -r small/xy.fa -v small/xy2.vcf.gz -R x -C -a > x.vg 2> /dev/null
-vg index -G x.gbwt -v small/xy2.vcf.gz x.vg
+vg gbwt -v small/xy2.vcf.gz -o x.gbwt -x x.vg
 vg prune -u -m x.mapping -g x.gbwt -e 1 x.vg > x.unfolded.vg
 
 rm -f expected.gfa
diff --git a/test/t/06_vg_index.t b/test/t/06_vg_index.t
index a8ac6add8dc..0e3edf78dbc 100644
--- a/test/t/06_vg_index.t
+++ b/test/t/06_vg_index.t
@@ -7,7 +7,7 @@ PATH=../bin:$PATH # for vg
 
 export LC_ALL="en_US.utf8" # force ekg's favorite sort order
 
-plan tests 54
+plan tests 45
 
 # Single graph without haplotypes
 vg construct -r small/x.fa -v small/x.vcf.gz > x.vg
@@ -39,9 +39,6 @@ rm -f x3.gcsa x3.gcsa.lcp
 # Single graph with haplotypes
 vg construct -r small/x.fa -v small/x.vcf.gz -a > x.vg
 
-vg index -G x.gbwt -v small/x.vcf.gz x.vg
-is $? 0 "building a GBWT index of a graph with haplotypes"
-
 vg index -x x.xg x.vg
 is $? 0 "building an XG index of a graph with haplotypes"
 
@@ -55,51 +52,25 @@ is $(vg paths -x x-ap.xg -L | wc -l) $(vg paths -v x.vg -L | wc -l) "xg index do
 vg index -g x.gcsa x.vg
 is $? 0 "building a GCSA index of a graph with haplotypes"
 
-vg index -x x2.xg -G x2.gbwt -v small/x.vcf.gz -g x2.gcsa x.vg
+vg index -x x2.xg -g x2.gcsa x.vg
 is $? 0 "building all indexes at once"
 
-cmp x.xg x2.xg && cmp x.gbwt x2.gbwt && cmp x.gcsa x2.gcsa && cmp x.gcsa.lcp x2.gcsa.lcp
+cmp x.xg x2.xg && cmp x.gcsa x2.gcsa && cmp x.gcsa.lcp x2.gcsa.lcp
 is $? 0 "the indexes are identical"
 
-vg index -x x2-ap.xg -G x2-ap.gbwt -v small/x.vcf.gz -g x2-ap.gcsa x.vg -L
+vg index -x x2-ap.xg -g x2-ap.gcsa x.vg -L
 is $? 0 "building all indexes at once, while leaving alt paths in xg"
 
-cmp x.gbwt x2-ap.gbwt && cmp x.gcsa x2-ap.gcsa && cmp x.gcsa.lcp x2-ap.gcsa.lcp
+cmp x.gcsa x2-ap.gcsa && cmp x.gcsa.lcp x2-ap.gcsa.lcp
 is $? 0 "the indexes are identical with -L"
 
 is $(vg paths -x x2-ap.xg -L | wc -l) $(vg paths -v x.vg -L | wc -l) "xg index does contains alt paths with index -L all at once"
 
-# Exclude a sample from the GBWT index
-vg index -G empty.gbwt -v small/x.vcf.gz --exclude 1 x.vg
-is $? 0 "samples can be excluded from haplotype indexing"
-is $(vg gbwt -c empty.gbwt) 0 "excluded samples were not included in the GBWT index"
-
-# Make GBWT from GAM
-vg paths -v x.vg -X -Q _alt > x-alts.gam
-vg index x.vg -M x-alts.gam -G x-gam.gbwt
-# Make GBWT from GAF
-vg convert x.vg -G x-alts.gam > x-alts.gaf
-vg index x.vg -F x-alts.gaf -G x-gaf.gbwt
-cmp x-gaf.gbwt x-gam.gbwt
-is $? 0 "GBWT from GAF same as from GAM"
-
 rm -f x.vg
-rm -f x.xg x-ap.xg x.gbwtx.gcsa x.gcsa.lcp
-rm -f x2.xg x2.gbwt x2.gcsa x2.gcsa.lcp
-rm -f x2-ap.xg x2-ap.gbwt x2-ap.gcsa x2-ap.gcsa.lcp
-rm -f empty.gbwt
-rm -f x-alts.gam x-alts.gaf x-gam.gbwt x-gaf.gbwt
-
-
-# Subregion graph with haplotypes
-vg construct -r small/x.fa -v small/x.vcf.gz -a --region x:100-200 > x.part.vg
-
-vg index -x x.part.xg -G x.part.gbwt --region x:100-200 -v small/x.vcf.gz x.part.vg 2>log.txt
-is $? 0 "building GBWT index for a regional graph"
-
-is "$(cat log.txt | wc -c)" "0" "no warnings about missing variants produced"
-
-rm -f x.part.vg x.part.xg x.part.gbwt log.txt
+rm -f x.xg x-ap.xg x.gcsa x.gcsa.lcp
+rm -f x2.xg x2.gcsa x2.gcsa.lcp
+rm -f x2-ap.xg x2-ap.gcsa x2-ap.gcsa.lcp
+rm -f x-alts.gam x-alts.gaf
 
 
 # Multiple graphs without haplotypes
@@ -129,9 +100,6 @@ vg construct -r small/xy.fa -v small/xy2.vcf.gz -R x -C -a > x.vg 2> /dev/null
 vg construct -r small/xy.fa -v small/xy2.vcf.gz -R y -C -a > y.vg 2> /dev/null
 vg ids -j x.vg y.vg
 
-vg index -G x.gbwt -v small/xy2.vcf.gz x.vg && vg index -G y.gbwt -v small/xy2.vcf.gz y.vg && vg gbwt -m -f -o xy.gbwt x.gbwt y.gbwt
-is $? 0 "building a GBWT index of multiple graphs with haplotypes"
-
 vg index -x xy.xg x.vg y.vg
 is $? 0 "building an XG index of multiple graphs with haplotypes"
 
@@ -144,31 +112,14 @@ is $? 0 "building XG and GCSA indexes at once"
 vg index -x xy-alt.xg -L x.vg y.vg
 is $? 0 "building an XG index with alt paths"
 
-vg index -G xy2.gbwt -v small/xy2.vcf.gz xy-alt.xg
-is $? 0 "building a GBWT index from an XG index"
-
-cmp xy.xg xy2.xg && cmp xy.gcsa xy2.gcsa && cmp xy.gcsa.lcp xy2.gcsa.lcp && cmp xy.gbwt xy2.gbwt
+cmp xy.xg xy2.xg && cmp xy.gcsa xy2.gcsa && cmp xy.gcsa.lcp xy2.gcsa.lcp
 is $? 0 "the indexes are identical"
 
 rm -f x.vg y.vg
-rm -f x.gbwt y.gbwt
-rm -f xy.xg xy.gbwt xy.gcsa xy.gcsa.lcp
-rm -f xy2.xg xy2.gbwt xy2.gcsa xy2.gcsa.lcp
+rm -f xy.xg xy.gcsa xy.gcsa.lcp
+rm -f xy2.xg xy2.gcsa xy2.gcsa.lcp
 rm -f xy-alt.xg
 
-
-# GBWT construction options
-vg construct -r small/xy.fa -v small/xy2.vcf.gz -R x -C -a > x.vg 2> /dev/null
-
-vg index -G x_ref.gbwt -T x.vg
-is $? 0 "GBWT can be built for paths"
-
-rm -f x_ref.gbwt
-
-# We do not test GBWT construction parameters (-B, -u, -n) because they matter only for large inputs.
-# We do not test chromosome-length path generation (-P, -o) for the same reason.
-
-
 # Other tests
 vg construct -m 1000 -r small/x.fa -v small/x.vcf.gz >x.vg
 vg index -x x.xg x.vg bogus123.vg 2>/dev/null
diff --git a/test/t/07_vg_map.t b/test/t/07_vg_map.t
index 08a99edc398..a7c1e41c9a8 100644
--- a/test/t/07_vg_map.t
+++ b/test/t/07_vg_map.t
@@ -152,7 +152,8 @@ is $? 1 "error on vg map -f <nonexistent-file> (paired, RHS)"
 
 # Now do the GBWT
 vg construct -a -r small/x.fa -v small/x.vcf.gz >x.vg
-vg index -x x.xg -g x.gcsa -v small/x.vcf.gz --gbwt-name x.gbwt -k 16 x.vg
+vg index -x x.xg -g x.gcsa -k 16 x.vg
+vg gbwt -v small/x.vcf.gz -o x.gbwt -x x.vg
 
 # This read is all ref which matches no haplotype in x.vcf.gz and visits some unused nodes
 is "$(vg map -x x.xg -g x.gcsa --gbwt-name x.gbwt --hap-exp 1 --full-l-bonus 0 -f reads/x.unvisited.fq -j | jq -r '.score')" "36" "mapping a read that touches unused nodes gets the base score"
diff --git a/test/t/11_vg_paths.t b/test/t/11_vg_paths.t
index e21a8fe2c09..e83fa8de659 100644
--- a/test/t/11_vg_paths.t
+++ b/test/t/11_vg_paths.t
@@ -11,7 +11,8 @@ plan tests 26
 
 vg construct -r small/x.fa -v small/x.vcf.gz -a > x.vg
 vg construct -r small/x.fa -v small/x.vcf.gz > x2.vg
-vg index -x x.xg -G x.gbwt -v small/x.vcf.gz x.vg
+vg index -x x.xg x.vg
+vg gbwt -v small/x.vcf.gz -o x.gbwt -x x.vg
 
 # List path/thread names from various input formats
 is "$(vg paths --list -v x2.vg)" "x" "path listing works from vg"
diff --git a/test/t/13_vg_sim.t b/test/t/13_vg_sim.t
index 0481a82e463..ecc0799eeda 100644
--- a/test/t/13_vg_sim.t
+++ b/test/t/13_vg_sim.t
@@ -11,7 +11,7 @@ plan tests 36
 vg construct -r small/x.fa -v small/x.vcf.gz >x.vg
 vg construct -r small/x.fa -v small/x.vcf.gz -a >x2.vg
 vg index -x x.xg x.vg
-vg index -G x.gbwt -v small/x.vcf.gz x2.vg
+vg gbwt -o x.gbwt -v small/x.vcf.gz -x x2.vg
 
 is $(vg sim -l 100 -n 100 -x x.xg | wc -l) 100 \
     "vg sim creates the correct number of reads"
@@ -58,7 +58,7 @@ rm -f x.vg x2.vg x.xg x.gbwt n.vg n.fa n.xg
 
 vg construct -r small/xy.fa -v small/x.vcf.gz -a >xy.vg
 vg index -x xy.xg xy.vg
-vg index -G xy.gbwt -v small/x.vcf.gz xy.vg
+vg gbwt -o xy.gbwt -v small/x.vcf.gz -x xy.vg
 
 vg sim -s 12345 -n 1000 -l 2 -e 0.1 -x xy.xg -g xy.gbwt --sample-name 1 --any-path >/dev/null
 is $? "0" "Sample simulation works along with --any-path"
diff --git a/test/t/18_vg_call.t b/test/t/18_vg_call.t
index 6a457f4eb9f..b97cee3fb08 100644
--- a/test/t/18_vg_call.t
+++ b/test/t/18_vg_call.t
@@ -89,7 +89,7 @@ is "${REF_COUNT_V}" "${REF_COUNT_A}" "Same number of reference calls with -a as
 
 # Output snarl traversals into a GBWT then genotype that
 vg call HGSVC_alts.xg -k HGSVC_alts.pack -s HG00514 -T | gzip > HGSVC_travs.gaf.gz
-vg index HGSVC_alts.xg -F HGSVC_travs.gaf.gz -G HGSVC_travs.gbwt
+vg gbwt -o HGSVC_travs.gbwt -x HGSVC_alts.xg -A HGSVC_travs.gaf.gz
 vg call HGSVC_alts.xg -k HGSVC_alts.pack  -g HGSVC_travs.gbwt -s HG00514 > HGSVC_travs.vcf
 vg call HGSVC_alts.xg -k HGSVC_alts.pack -s HG00514 > HGSVC_direct.vcf
 # extract the called genotypes
diff --git a/test/t/26_deconstruct.t b/test/t/26_deconstruct.t
index 03c7f8f3a87..47a28c4ae35 100644
--- a/test/t/26_deconstruct.t
+++ b/test/t/26_deconstruct.t
@@ -121,7 +121,8 @@ is "$?" 0 "deconstructing vg graph gives same output as xg graph"
 rm -f tiny_names.gfa tiny_names.vg tiny_names.xg tiny_names_decon.vcf tiny_names_decon_vg.vcf
 
 vg construct -r small/x.fa -v small/x.vcf.gz -a > x.vg
-vg index -x x.xg -G x.gbwt -v small/x.vcf.gz x.vg
+vg index -x x.xg x.vg
+vg gbwt -v small/x.vcf.gz -o x.gbwt -x x.vg
 vg deconstruct x.xg -g x.gbwt | bgzip > x.decon.vcf.gz
 tabix -f -p vcf  x.decon.vcf.gz
 cat small/x.fa |  bcftools consensus small/x.vcf.gz -s 1 -H 1 > small.s1.h1.fa
diff --git a/test/t/33_vg_mpmap.t b/test/t/33_vg_mpmap.t
index 16af8c4cef9..e569b2b9ed6 100644
--- a/test/t/33_vg_mpmap.t
+++ b/test/t/33_vg_mpmap.t
@@ -11,7 +11,8 @@ plan tests 21
 # Exercise the GBWT
 # Index a couple of nearly identical contigs
 vg construct -m 1000 -a -r small/xy.fa -v small/xy2.vcf.gz >xy2.vg
-vg index -x xy2.xg -g xy2.gcsa -v small/xy2.vcf.gz --gbwt-name xy2.gbwt -k 16 xy2.vg
+vg index -x xy2.xg -g xy2.gcsa -k 16 xy2.vg
+vg gbwt -v small/xy2.vcf.gz -o xy2.gbwt -x xy2.vg
 
 # We turn off the background model calibration with -B and ignore it with -P 1
 
diff --git a/test/t/37_vg_gbwt.t b/test/t/37_vg_gbwt.t
index 9fdc0c8a4fb..870b5d9674a 100644
--- a/test/t/37_vg_gbwt.t
+++ b/test/t/37_vg_gbwt.t
@@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap
 
 PATH=../bin:$PATH # for vg
 
-plan tests 159
+plan tests 149
 
 
 # Build vg graphs for two chromosomes
@@ -21,10 +21,6 @@ vg index -x xy-alt.xg -L x.vg y.vg
 # Single chromosome: haplotypes
 vg gbwt -x x.vg -o x.gbwt -v small/xy2.vcf.gz
 is $? 0 "chromosome X GBWT with vg gbwt"
-vg index -G x2.gbwt -v small/xy2.vcf.gz x.vg
-is $? 0 "chromosome X GBWT with vg index"
-cmp x.gbwt x2.gbwt
-is $? 0 "identical construction results with vg gbwt and vg index"
 vg gbwt -x x.vg -o parse --parse-only -v small/xy2.vcf.gz
 is $? 0 "chromosome X VCF parse"
 ../deps/gbwt/bin/build_gbwt -p -r parse_x > /dev/null 2> /dev/null
@@ -41,20 +37,16 @@ is $(vg gbwt -T x.gbwt | wc -l) 2 "chromosome X: 2 path names"
 is $(vg gbwt -C -L x.gbwt | wc -l) 1 "chromosome X: 1 contig name"
 is $(vg gbwt -S -L x.gbwt | wc -l) 1 "chromosome X: 1 sample name"
 
-rm -f x.gbwt x2.gbwt parse_x.gbwt
+rm -f x.gbwt parse_x.gbwt
 rm -f parse_x parse_x_0_1
 
 
 # Single chromosome: paths
 vg gbwt -E -o x.ref.gbwt -x x.vg
 is $? 0 "chromosome X reference GBWT with vg gbwt"
-vg index -G x2.ref.gbwt -T x.vg
-is $? 0 "chromosome X reference GBWT with vg index"
-cmp x.ref.gbwt x2.ref.gbwt
-is $? 0 "identical construction results with vg gbwt and vg index"
 is $(vg gbwt -c x.ref.gbwt) 1 "chromosome X reference: 1 path"
 
-rm -f x.ref.gbwt x2.ref.gbwt
+rm -f x.ref.gbwt
 
 
 # Single chromosome: alignments
@@ -62,17 +54,13 @@ vg paths -v x.vg -X -Q _alt > x.alts.gam
 vg convert -G x.alts.gam x.vg > x.alts.gaf
 vg gbwt -A -o x.alts.gaf.gbwt -x x.vg x.alts.gaf
 is $? 0 "chromosome X GAF with vg gbwt"
-vg index -F x.alts.gaf -G x2.alts.gaf.gbwt x.vg
-is $? 0 "chromosome X GAF with vg index"
-cmp x.alts.gaf.gbwt x2.alts.gaf.gbwt
-is $? 0 "identical construction results with vg gbwt and vg index"
 vg gbwt -A --gam-format -o x.alts.gam.gbwt -x x.vg x.alts.gam
 is $? 0 "chromosome X GAM with vg gbwt"
 cmp x.alts.gaf.gbwt x.alts.gaf.gbwt
 is $? 0 "identical construction results from GAF and GAM"
 
 rm -f x.alts.gam x.alts.gaf
-rm -f x.alts.gaf.gbwt x2.alts.gaf.gbwt x.alts.gam.gbwt
+rm -f x.alts.gaf.gbwt x.alts.gam.gbwt
 
 
 # Graph region: haplotypes
@@ -80,12 +68,8 @@ vg construct -r small/x.fa -v small/x.vcf.gz -a --region x:100-200 > x.part.vg
 vg gbwt -x x.part.vg -o x.part.gbwt --vcf-region x:100-200 -v small/x.vcf.gz 2> log.txt
 is $? 0 "chromosome X subgraph GBWT with vg gbwt"
 is "$(cat log.txt | wc -c)" 0 "no warnings about missing variants"
-vg index -G x2.part.gbwt --region x:100-200 -v small/x.vcf.gz x.part.vg 2> log.txt
-is $? 0 "chromosome X subgraph GBWT with vg index"
-cmp x.part.gbwt x2.part.gbwt
-is $? 0 "identical construction results with vg gbwt and vg index"
 
-rm -f x.part.vg x.part.gbwt x2.part.gbwt log.txt
+rm -f x.part.vg x.part.gbwt log.txt
 
 
 # Multiple chromosomes: haplotypes
@@ -127,16 +111,12 @@ rm -f xy.1000gp.gbwt
 # Multiple chromosomes: paths as contigs
 vg gbwt -E -o xy.contigs.gbwt -x xy.xg
 is $? 0 "paths as contigs with vg gbwt"
-vg index -G xy2.contigs.gbwt -T xy.xg
-is $? 0 "paths as contigs with vg index"
-cmp xy.contigs.gbwt xy2.contigs.gbwt
-is $? 0 "identical construction results with vg gbwt and vg index"
 is $(vg gbwt -c xy.contigs.gbwt) 2 "paths as contigs: 2 paths"
 is $(vg gbwt -C xy.contigs.gbwt) 2 "paths as contigs: 2 contigs"
 is $(vg gbwt -H xy.contigs.gbwt) 1 "paths as contigs: 1 haplotype"
 is $(vg gbwt -S xy.contigs.gbwt) 1 "paths as contigs: 1 sample"
 
-rm -f xy.contigs.gbwt xy2.contigs.gbwt 
+rm -f xy.contigs.gbwt
 
 
 # Build an r-index
diff --git a/test/t/38_vg_prune.t b/test/t/38_vg_prune.t
index 3cd5db9cde0..1aa7533c260 100644
--- a/test/t/38_vg_prune.t
+++ b/test/t/38_vg_prune.t
@@ -10,7 +10,7 @@ plan tests 21
 
 # Build a graph with one path and two threads
 vg construct -m 32 -r small/xy.fa -v small/xy2.vcf.gz -R x -C -a > x.vg 2> /dev/null
-vg index -G x.gbwt -v small/xy2.vcf.gz x.vg
+vg gbwt -o x.gbwt -v small/xy2.vcf.gz -x x.vg
 
 # Basic pruning: 5 components, 51 nodes, 51 edges
 vg prune -e 1 x.vg > y.vg
@@ -54,8 +54,8 @@ rm -f x.vg x.gbwt
 vg construct -m 32 -r small/xy.fa -v small/xy2.vcf.gz -R x -C -a > x.vg 2> /dev/null
 vg construct -m 32 -r small/xy.fa -v small/xy2.vcf.gz -R y -C -a > y.vg 2> /dev/null
 vg ids -j -m xy.mapping x.vg y.vg
-vg index -G x.gbwt -v small/xy2.vcf.gz x.vg
-vg index -G y.gbwt -v small/xy2.vcf.gz y.vg
+vg gbwt -o x.gbwt -v small/xy2.vcf.gz -x x.vg
+vg gbwt -o y.gbwt -v small/xy2.vcf.gz -x y.vg
 
 # Prune a single-chromosome graph using multi-chromosome GBWT
 vg gbwt -m -o xy.gbwt x.gbwt y.gbwt
diff --git a/test/t/46_vg_minimizer.t b/test/t/46_vg_minimizer.t
index 06f9133f09d..e4090b54c9d 100644
--- a/test/t/46_vg_minimizer.t
+++ b/test/t/46_vg_minimizer.t
@@ -62,8 +62,10 @@ rm -f x.vg x.xg x.gbwt x.snarls x.dist x.mi x.gg x.gbz
 vg construct -r small/xy.fa -v small/xy2.vcf.gz -R x -C -a > x.vg 2> /dev/null
 vg construct -r small/xy.fa -v small/xy2.vcf.gz -R y -C -a > y.vg 2> /dev/null
 vg ids -j x.vg y.vg
-vg index -x x.xg -G x.gbwt -v small/xy2.vcf.gz x.vg
-vg index -x y.xg -G y.gbwt -v small/xy2.vcf.gz y.vg
+vg index -x x.xg x.vg
+vg gbwt -o x.gbwt -x x.vg -v small/xy2.vcf.gz
+vg index -x y.xg y.vg
+vg gbwt -o y.gbwt -x y.vg -v small/xy2.vcf.gz
 
 # Appending to the index
 vg minimizer --no-dist -t 1 -o x.mi -g x.gbwt x.xg

From 5a2c3af1fda989c2e31ad9da5c1897ec999ac91a Mon Sep 17 00:00:00 2001
From: Jouni Siren <jouni.siren@iki.fi>
Date: Thu, 31 Oct 2024 23:01:56 -0700
Subject: [PATCH 05/12] Update SDSL, GCSA, GBWT, GBWTGraph

---
 deps/gbwt      | 2 +-
 deps/gbwtgraph | 2 +-
 deps/gcsa2     | 2 +-
 deps/sdsl-lite | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/deps/gbwt b/deps/gbwt
index 029c2b45667..2aab62f0664 160000
--- a/deps/gbwt
+++ b/deps/gbwt
@@ -1 +1 @@
-Subproject commit 029c2b456675eab0fbb55de21c8002e1660cb0d3
+Subproject commit 2aab62f0664b2ce8eb4cffd6b3872c89b85fdfa6
diff --git a/deps/gbwtgraph b/deps/gbwtgraph
index de61d340695..d2a423b5eb5 160000
--- a/deps/gbwtgraph
+++ b/deps/gbwtgraph
@@ -1 +1 @@
-Subproject commit de61d340695f64b8aa978816b195d6687e7fda5a
+Subproject commit d2a423b5eb54c3c2f0676cb2925b46ad0ba7e189
diff --git a/deps/gcsa2 b/deps/gcsa2
index 7dbf9305817..8b6b049ab64 160000
--- a/deps/gcsa2
+++ b/deps/gcsa2
@@ -1 +1 @@
-Subproject commit 7dbf93058171b5af420f1b6166d4d2bf54cd8748
+Subproject commit 8b6b049ab6444e891bb7c38ec8f38169ce62e5fb
diff --git a/deps/sdsl-lite b/deps/sdsl-lite
index 86fa3534c1b..115dfd5b737 160000
--- a/deps/sdsl-lite
+++ b/deps/sdsl-lite
@@ -1 +1 @@
-Subproject commit 86fa3534c1bf02b5468bbe58e2e0b8f6ae2d6fa4
+Subproject commit 115dfd5b7371ef99a1e0611389b956bb88bf9fc4

From ba0ed68ebff2d05dc23e6190b572284d7f750527 Mon Sep 17 00:00:00 2001
From: Jouni Siren <jouni.siren@iki.fi>
Date: Sat, 2 Nov 2024 04:09:56 -0700
Subject: [PATCH 06/12] Multithreaded GAF/GAM to GBWT

---
 src/gbwt_helper.cpp          |  20 -----
 src/gbwt_helper.hpp          |   7 --
 src/haplotype_indexer.cpp    | 146 +++++++++++++++++++++++++++--------
 src/haplotype_indexer.hpp    |  11 ++-
 src/subcommand/gbwt_main.cpp |   4 +-
 test/t/37_vg_gbwt.t          |   4 +-
 6 files changed, 126 insertions(+), 66 deletions(-)

diff --git a/src/gbwt_helper.cpp b/src/gbwt_helper.cpp
index 0958ba45533..19306a7dfe3 100644
--- a/src/gbwt_helper.cpp
+++ b/src/gbwt_helper.cpp
@@ -88,26 +88,6 @@ gbwt::size_type gbwt_node_width(const HandleGraph& graph) {
     return gbwt::bit_length(gbwt::Node::encode(graph.max_node_id(), true));
 }
 
-void finish_gbwt_constuction(gbwt::GBWTBuilder& builder,
-    const std::vector<std::string>& sample_names,
-    const std::vector<std::string>& contig_names,
-    size_t haplotype_count, bool print_metadata,
-    const std::string& header) {
-
-    builder.finish();
-    builder.index.metadata.setSamples(sample_names);
-    builder.index.metadata.setHaplotypes(haplotype_count);
-    builder.index.metadata.setContigs(contig_names);
-    if (print_metadata) {
-        #pragma omp critical
-        {
-            std::cerr << header << ": ";
-            gbwt::operator<<(std::cerr, builder.index.metadata);
-            std::cerr << std::endl;
-        }
-    }
-}
-
 //------------------------------------------------------------------------------
 
 void load_gbwt(gbwt::GBWT& index, const std::string& filename, bool show_progress) {
diff --git a/src/gbwt_helper.hpp b/src/gbwt_helper.hpp
index 33e1545066c..57b819a579b 100644
--- a/src/gbwt_helper.hpp
+++ b/src/gbwt_helper.hpp
@@ -67,13 +67,6 @@ gbwt::vector_type path_predecessors(const PathHandleGraph& graph, const std::str
 /// Determine the node width in bits for the GBWT nodes based on the given graph.
 gbwt::size_type gbwt_node_width(const HandleGraph& graph);
 
-/// Finish GBWT construction and optionally print the metadata.
-void finish_gbwt_constuction(gbwt::GBWTBuilder& builder,
-    const std::vector<std::string>& sample_names,
-    const std::vector<std::string>& contig_names,
-    size_t haplotype_count, bool print_metadata,
-    const std::string& header = "GBWT");
-
 //------------------------------------------------------------------------------
 
 /*
diff --git a/src/haplotype_indexer.cpp b/src/haplotype_indexer.cpp
index 528160485cd..b7c935d7e05 100644
--- a/src/haplotype_indexer.cpp
+++ b/src/haplotype_indexer.cpp
@@ -4,10 +4,12 @@
 
 #include <iostream>
 #include <map>
+#include <mutex>
 #include <vector>
 #include <string>
 
 #include <vg/io/stream.hpp>
+#include <gbwtgraph/algorithms.h>
 #include <gbwtgraph/path_cover.h>
 
 #include "gbwt_helper.hpp"
@@ -394,7 +396,7 @@ std::unique_ptr<gbwt::DynamicGBWT> HaplotypeIndexer::build_gbwt(const std::vecto
         // And add every path that passes the filter (including haplotype paths) from the source graph.
         gbwtgraph::store_paths(builder, *graph, {PathSense::GENERIC, PathSense::REFERENCE, PathSense::HAPLOTYPE}, &path_filter);
 
-        // Finish the construction for this set of threads and put the index back.
+        // Finish the construction for this set of paths and put the index back.
         builder.finish();
         builder.swapIndex(*index);
     }
@@ -413,24 +415,43 @@ std::unique_ptr<gbwt::DynamicGBWT> HaplotypeIndexer::build_gbwt(const PathHandle
     return build_gbwt({}, "GBWT", &graph);
 }
 
-std::unique_ptr<gbwt::DynamicGBWT> HaplotypeIndexer::build_gbwt(const PathHandleGraph& graph,
-    const std::vector<std::string>& aln_filenames, const std::string& aln_format) const {
+std::unique_ptr<gbwt::GBWT> HaplotypeIndexer::build_gbwt(const HandleGraph& graph,
+    const std::vector<std::string>& aln_filenames, const std::string& aln_format, size_t parallel_jobs) const {
 
-    // GBWT metadata.
-    std::vector<std::string> sample_names, contig_names;
-    std::map<std::string, std::pair<size_t, size_t>> sample_info; // name -> (id, count)
-    contig_names.push_back("0"); // An artificial contig.
-    size_t haplotype_count = 0;
+    // Handle multithreading and parallel jobs.
+    parallel_jobs = std::max<size_t>(1, parallel_jobs);
+    int old_threads = omp_get_max_threads();
+    omp_set_num_threads(parallel_jobs);
+
+    // Partition the graph into construction jobs.
+    if (this->show_progress) {
+        #pragma omp critical
+        {
+            std::cerr << "Partitioning the graph into GBWT construction jobs" << std::endl;
+        }
+    }
+    size_t target_size = graph.get_node_count() / parallel_jobs;
+    gbwtgraph::ConstructionJobs jobs = gbwtgraph::gbwt_construction_jobs(graph, target_size);
+    if (this->show_progress) {
+        #pragma omp critical
+        {
+            std::cerr << "Created " << jobs.size() << " parallel construction jobs" << std::endl;
+        }
+    }
 
     // GBWT construction.
-    gbwt::GBWTBuilder builder(gbwt_node_width(graph), this->gbwt_buffer_size * gbwt::MILLION, this->id_interval);
-    builder.index.addMetadata();
+    std::vector<std::mutex> builder_mutexes(jobs.size());
+    std::vector<std::unique_ptr<gbwt::GBWTBuilder>> builders(jobs.size());
+    std::vector<std::vector<std::string>> read_names(jobs.size()); // TODO: Concatenated strings + starting offsets may save space.
+    for (size_t i = 0; i < jobs.size(); i++) {
+        builders[i].reset(new gbwt::GBWTBuilder(gbwt_node_width(graph), this->gbwt_buffer_size * gbwt::MILLION, this->id_interval));
+    }
 
     // Actual work.
     if (this->show_progress) {
         #pragma omp critical
         {
-            std::cerr << "Converting " << aln_format << " to threads" << std::endl;
+            std::cerr << "Converting " << aln_format << " to GBWT paths" << std::endl;
         }
     }
     std::function<void(Alignment&)> lambda = [&](Alignment& aln) {
@@ -438,37 +459,98 @@ std::unique_ptr<gbwt::DynamicGBWT> HaplotypeIndexer::build_gbwt(const PathHandle
         for (auto& m : aln.path().mapping()) {
             buffer.push_back(mapping_to_gbwt(m));
         }
-        builder.insert(buffer, true); // Insert in both orientations.
-        size_t sample_id = 0, sample_count = 0;
-        auto iter = sample_info.find(aln.name());
-        if (iter == sample_info.end()) {
-            sample_id = sample_names.size();
-            sample_names.push_back(aln.name());
-            sample_info[aln.name()] = std::pair<size_t, size_t>(sample_id, sample_count);
-            haplotype_count++;
-        } else {
-            sample_id = iter->second.first;
-            sample_count = iter->second.second;
-            iter->second.second++;
+        size_t job_id = 0;
+        if (buffer.size() > 0) {
+            job_id = jobs.job(gbwt::Node::id(buffer.front()));
+            if (job_id >= jobs.size()) {
+                job_id = 0;
+            }
+        }
+        {
+            // Insert the path into the appropriate builder and record the read name.
+            std::lock_guard<std::mutex> lock(builder_mutexes[job_id]);
+            builders[job_id]->insert(buffer, true);
+            read_names[job_id].push_back(aln.name());
         }
-        builder.index.metadata.addPath(sample_id, 0, 0, sample_count);
     };
     for (auto& file_name : aln_filenames) {
         if (aln_format == "GAM") {
             get_input_file(file_name, [&](istream& in) {
-                vg::io::for_each(in, lambda);
+                vg::io::for_each_parallel(in, lambda);
             });
         } else {
             assert(aln_format == "GAF");
-            vg::io::gaf_unpaired_for_each(graph, file_name, lambda);
+            vg::io::gaf_unpaired_for_each_parallel(graph, file_name, lambda);
         }
     }
-        
-    // Finish the construction and extract the index.
-    finish_gbwt_constuction(builder, sample_names, contig_names, haplotype_count, this->show_progress);
-    std::unique_ptr<gbwt::DynamicGBWT> built(new gbwt::DynamicGBWT());
-    builder.swapIndex(*built);
-    return built;
+
+    // Finish the construction and convert to compressed GBWT.
+    std::vector<gbwt::GBWT> partial_indexes(jobs.size());
+    #pragma omp parallel for schedule(dynamic, 1)
+    for (size_t i = 0; i < jobs.size(); i++) {
+        builders[i]->finish();
+        partial_indexes[i] = gbwt::GBWT(builders[i]->index);
+        builders[i].reset();
+    }
+
+    // Merge the partial indexes.
+    if (this->show_progress) {
+        #pragma omp critical
+        {
+            std::cerr << "Merging the partial GBWTs" << std::endl;
+        }
+    }
+    std::unique_ptr<gbwt::GBWT> result(new gbwt::GBWT(partial_indexes));
+    partial_indexes.clear();
+
+    // Create the metadata.
+    // TODO: This is quite slow.
+    if (this->show_progress) {
+        #pragma omp critical
+        {
+            std::cerr << "Creating metadata" << std::endl;
+        }
+    }
+    result->addMetadata();
+    result->metadata.setContigs({ "unknown" });
+    {
+        std::map<std::string, std::pair<size_t, size_t>> read_info; // name -> (sample id, fragment count)
+        for (auto& names : read_names) {
+            for (const std::string& name : names) {
+                size_t sample_id = 0, fragment_count = 0;
+                auto iter = read_info.find(name);
+                if (iter == read_info.end()) {
+                    sample_id = read_info.size();
+                    read_info[name] = std::pair<size_t, size_t>(sample_id, fragment_count);
+                } else {
+                    sample_id = iter->second.first;
+                    fragment_count = iter->second.second;
+                    iter->second.second++;
+                }
+                result->metadata.addPath(sample_id, 0, 0, fragment_count);
+            }
+            names = std::vector<std::string>();
+        }
+        std::vector<std::string> sample_names(read_info.size());
+        for (auto& p : read_info) {
+            sample_names[p.second.first] = p.first;
+        }
+        read_info.clear();
+        result->metadata.setSamples(sample_names);
+        result->metadata.setHaplotypes(sample_names.size());
+    }
+    if (this->show_progress) {
+        #pragma omp critical
+        {
+            std::cerr << "GBWT: ";
+            gbwt::operator<<(std::cerr, result->metadata);
+            std::cerr << std::endl;
+        }
+    }
+
+    // Restore the number of threads.
+    omp_set_num_threads(old_threads);
+    return result;
 }
 
 }
diff --git a/src/haplotype_indexer.hpp b/src/haplotype_indexer.hpp
index f555aec947b..ca212a94267 100644
--- a/src/haplotype_indexer.hpp
+++ b/src/haplotype_indexer.hpp
@@ -135,10 +135,15 @@ class HaplotypeIndexer : public Progressive {
      * the same name, the corresponding GBWT path names will have the same
      * sample identifier but different values in the count field.
      *
-     * aln_format can be "GAM" or "GAF"
+     * aln_format can be "GAM" or "GAF".
+     *
+     * Runs approximately the given number of jobs in parallel. The exact
+     * number depends on the sizes of weakly connected components in the graph.
+     * Each job uses at most 2 threads.
      */
-    std::unique_ptr<gbwt::DynamicGBWT> build_gbwt(const PathHandleGraph& graph,
-        const std::vector<std::string>& aln_filenames, const std::string& aln_format) const;
+    std::unique_ptr<gbwt::GBWT> build_gbwt(const HandleGraph& graph,
+        const std::vector<std::string>& aln_filenames, const std::string& aln_format,
+        size_t parallel_jobs = 1) const;
 };
 
 }
diff --git a/src/subcommand/gbwt_main.cpp b/src/subcommand/gbwt_main.cpp
index 5e55928a4fe..4eca4e47a81 100644
--- a/src/subcommand/gbwt_main.cpp
+++ b/src/subcommand/gbwt_main.cpp
@@ -1389,8 +1389,8 @@ void step_1_build_gbwts(GBWTHandler& gbwts, GraphHandler& graphs, GBWTConfig& co
         if (config.show_progress) {
             std::cerr << "Input type: " << (config.gam_format ? "GAM" : "GAF") << std::endl;
         }
-        // FIXME: Parallelize this.
-        std::unique_ptr<gbwt::DynamicGBWT> temp = config.haplotype_indexer.build_gbwt(*(graphs.path_graph), config.input_filenames, (config.gam_format ? "GAM" : "GAF"));
+        std::unique_ptr<gbwt::GBWT> temp =
+            config.haplotype_indexer.build_gbwt(*(graphs.path_graph), config.input_filenames, (config.gam_format ? "GAM" : "GAF"), config.build_jobs);
         gbwts.use(*temp);
     }
 
diff --git a/test/t/37_vg_gbwt.t b/test/t/37_vg_gbwt.t
index 870b5d9674a..1046da788d3 100644
--- a/test/t/37_vg_gbwt.t
+++ b/test/t/37_vg_gbwt.t
@@ -52,9 +52,9 @@ rm -f x.ref.gbwt
 # Single chromosome: alignments
 vg paths -v x.vg -X -Q _alt > x.alts.gam
 vg convert -G x.alts.gam x.vg > x.alts.gaf
-vg gbwt -A -o x.alts.gaf.gbwt -x x.vg x.alts.gaf
+vg gbwt -A --num-jobs 1 -o x.alts.gaf.gbwt -x x.vg x.alts.gaf
 is $? 0 "chromosome X GAF with vg gbwt"
-vg gbwt -A --gam-format -o x.alts.gam.gbwt -x x.vg x.alts.gam
+vg gbwt -A --num-jobs 1 --gam-format -o x.alts.gam.gbwt -x x.vg x.alts.gam
 is $? 0 "chromosome X GAM with vg gbwt"
 cmp x.alts.gaf.gbwt x.alts.gaf.gbwt
 is $? 0 "identical construction results from GAF and GAM"

From a585bbcffa162f08dbe2ae9a745e37dde8bb5595 Mon Sep 17 00:00:00 2001
From: Jouni Siren <jouni.siren@iki.fi>
Date: Sun, 3 Nov 2024 01:31:20 -0800
Subject: [PATCH 07/12] Faster metadata construction for GAM/GAF GBWT

---
 src/haplotype_indexer.cpp | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/src/haplotype_indexer.cpp b/src/haplotype_indexer.cpp
index b7c935d7e05..75399a86e64 100644
--- a/src/haplotype_indexer.cpp
+++ b/src/haplotype_indexer.cpp
@@ -2,8 +2,9 @@
  * \file haplotype_indexer.cpp: implementations of haplotype indexing with the GBWT
  */
 
+#include "haplotype_indexer.hpp"
+
 #include <iostream>
-#include <map>
 #include <mutex>
 #include <vector>
 #include <string>
@@ -14,10 +15,9 @@
 
 #include "gbwt_helper.hpp"
 
-#include "haplotype_indexer.hpp"
-
-#include "path.hpp"
 #include "alignment.hpp"
+#include "hash_map.hpp"
+#include "path.hpp"
 
 using namespace std;
 
@@ -442,7 +442,9 @@ std::unique_ptr<gbwt::GBWT> HaplotypeIndexer::build_gbwt(const HandleGraph& grap
     // GBWT construction.
     std::vector<std::mutex> builder_mutexes(jobs.size());
     std::vector<std::unique_ptr<gbwt::GBWTBuilder>> builders(jobs.size());
-    std::vector<std::vector<std::string>> read_names(jobs.size()); // TODO: Concatenated strings + starting offsets may save space.
+    // This is a bit inefficient, as read names are often longer than the SSO threshold for GCC (but not for Clang).
+    // TODO: Maybe use concatenated 0-terminated names?
+    std::vector<std::vector<std::string>> read_names(jobs.size());
     for (size_t i = 0; i < jobs.size(); i++) {
         builders[i].reset(new gbwt::GBWTBuilder(gbwt_node_width(graph), this->gbwt_buffer_size * gbwt::MILLION, this->id_interval));
     }
@@ -504,7 +506,6 @@ std::unique_ptr<gbwt::GBWT> HaplotypeIndexer::build_gbwt(const HandleGraph& grap
     partial_indexes.clear();
 
     // Create the metadata.
-    // TODO: This is quite slow.
     if (this->show_progress) {
         #pragma omp critical
         {
@@ -514,14 +515,15 @@ std::unique_ptr<gbwt::GBWT> HaplotypeIndexer::build_gbwt(const HandleGraph& grap
     result->addMetadata();
     result->metadata.setContigs({ "unknown" });
     {
-        std::map<std::string, std::pair<size_t, size_t>> read_info; // name -> (sample id, fragment count)
+        // We can use 32-bit values, as GBWT metadata uses them as well.
+        string_hash_map<std::string, std::pair<std::uint32_t, std::uint32_t>> read_info; // name -> (sample id, fragment count)
         for (auto& names : read_names) {
             for (const std::string& name : names) {
-                size_t sample_id = 0, fragment_count = 0;
+                std::uint32_t sample_id = 0, fragment_count = 0;
                 auto iter = read_info.find(name);
                 if (iter == read_info.end()) {
                     sample_id = read_info.size();
-                    read_info[name] = std::pair<size_t, size_t>(sample_id, fragment_count);
+                    read_info[name] = std::make_pair(sample_id, fragment_count);
                 } else {
                     sample_id = iter->second.first;
                     fragment_count = iter->second.second;
@@ -535,7 +537,7 @@ std::unique_ptr<gbwt::GBWT> HaplotypeIndexer::build_gbwt(const HandleGraph& grap
         for (auto& p : read_info) {
             sample_names[p.second.first] = p.first;
         }
-        read_info.clear();
+        read_info = string_hash_map<std::string, std::pair<std::uint32_t, std::uint32_t>>();
         result->metadata.setSamples(sample_names);
         result->metadata.setHaplotypes(sample_names.size());
     }

From c1b58030cd2bc4079ff12ea6ec441e3d19cc791f Mon Sep 17 00:00:00 2001
From: Jouni Siren <jouni.siren@iki.fi>
Date: Mon, 4 Nov 2024 00:12:53 -0800
Subject: [PATCH 08/12] Tests for multi-chromosome GAM/GAF GBWT

---
 test/t/37_vg_gbwt.t | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/test/t/37_vg_gbwt.t b/test/t/37_vg_gbwt.t
index 1046da788d3..a5849bc47a5 100644
--- a/test/t/37_vg_gbwt.t
+++ b/test/t/37_vg_gbwt.t
@@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap
 
 PATH=../bin:$PATH # for vg
 
-plan tests 149
+plan tests 155
 
 
 # Build vg graphs for two chromosomes
@@ -50,7 +50,7 @@ rm -f x.ref.gbwt
 
 
 # Single chromosome: alignments
-vg paths -v x.vg -X -Q _alt > x.alts.gam
+vg paths -x x.vg -X -Q _alt > x.alts.gam
 vg convert -G x.alts.gam x.vg > x.alts.gaf
 vg gbwt -A --num-jobs 1 -o x.alts.gaf.gbwt -x x.vg x.alts.gaf
 is $? 0 "chromosome X GAF with vg gbwt"
@@ -108,6 +108,25 @@ rm -f x.gbwt y.gbwt xy.merge.gbwt xy.fast.gbwt xy.parallel.gbwt xy.direct.gbwt x
 rm -f xy.1000gp.gbwt
 
 
+# Multiple chromosomes: alignments
+vg paths -x xy-alt.xg -X -Q _alt > xy.alts.gam
+vg convert -G xy.alts.gam xy.xg > xy.alts.gaf
+vg gbwt -A --num-jobs 1 -o xy.alts.gaf.gbwt -x xy.xg xy.alts.gaf
+is $? 0 "multi-chromosome GAF with vg gbwt"
+vg gbwt -A --num-jobs 1 --gam-format -o xy.alts.gam.gbwt -x xy.xg xy.alts.gam
+is $? 0 "multi-chromosome GAM with vg gbwt"
+cmp xy.alts.gaf.gbwt xy.alts.gaf.gbwt
+is $? 0 "identical construction results from GAF and GAM"
+
+vg gbwt -A --num-jobs 2 -o multi.gbwt -x xy.xg xy.alts.gaf
+is $? 0 "multi-chromosome GAF with vg gbwt using multiple jobs"
+is $(vg gbwt -c xy.alts.gaf.gbwt) 58 "single job: 58 paths"
+is $(vg gbwt -c multi.gbwt) 58 "multiple jobs: 58 paths"
+
+rm -f xy.alts.gam xy.alts.gaf
+rm -f xy.alts.gaf.gbwt xy.alts.gam.gbwt multi.gbwt
+
+
 # Multiple chromosomes: paths as contigs
 vg gbwt -E -o xy.contigs.gbwt -x xy.xg
 is $? 0 "paths as contigs with vg gbwt"

From 6957377dbfbe6a1a65567b964269994b663f14c2 Mon Sep 17 00:00:00 2001
From: Jouni Siren <jouni.siren@iki.fi>
Date: Mon, 4 Nov 2024 00:35:38 -0800
Subject: [PATCH 09/12] Maybe SDSL will now point to the right hash

---
 deps/sdsl-lite | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deps/sdsl-lite b/deps/sdsl-lite
index 115dfd5b737..4b4b1b8951b 160000
--- a/deps/sdsl-lite
+++ b/deps/sdsl-lite
@@ -1 +1 @@
-Subproject commit 115dfd5b7371ef99a1e0611389b956bb88bf9fc4
+Subproject commit 4b4b1b8951bf7fbdb5e050f98b5e61453964c167

From a4e7ddd41774d8431160490db26670514c3fa921 Mon Sep 17 00:00:00 2001
From: Jouni Siren <jouni.siren@iki.fi>
Date: Mon, 4 Nov 2024 02:39:48 -0800
Subject: [PATCH 10/12] vg was apparently using another branch of SDSL

---
 deps/sdsl-lite | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deps/sdsl-lite b/deps/sdsl-lite
index 4b4b1b8951b..ef23c5fe989 160000
--- a/deps/sdsl-lite
+++ b/deps/sdsl-lite
@@ -1 +1 @@
-Subproject commit 4b4b1b8951bf7fbdb5e050f98b5e61453964c167
+Subproject commit ef23c5fe9899f2b0afa53a32162ed0b06aff0e89

From 612d446061ce17ec14807cb883f33ba9f8895942 Mon Sep 17 00:00:00 2001
From: Adam Novak <anovak@soe.ucsc.edu>
Date: Mon, 4 Nov 2024 17:49:53 -0500
Subject: [PATCH 11/12] Use toil-vg that uses vg gbwt to make gbwt

---
 vgci/vgci.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vgci/vgci.sh b/vgci/vgci.sh
index 9a5e53b08f3..ca04cfbe7b4 100755
--- a/vgci/vgci.sh
+++ b/vgci/vgci.sh
@@ -30,7 +30,7 @@ KEEP_INTERMEDIATE_FILES=0
 # Should we show stdout and stderr from tests? If so, set to "-s".
 SHOW_OPT=""
 # What toil-vg should we install?
-TOIL_VG_PACKAGE="git+https://github.com/vgteam/toil-vg.git@c9bd6414f935e6095574a41a34addbb8d87b41a6"
+TOIL_VG_PACKAGE="git+https://github.com/vgteam/toil-vg.git@d16da00b92c491f90433e151cb4f5a89a44395b8"
 # What toil should we install?
 # Could be something like "toil[aws,mesos]==3.20.0"
 # or "git+https://github.com/DataBiosphere/toil.git@3ab74776a3adebd6db75de16985ce9d734f60743#egg=toil[aws,mesos]"

From 8a12486abcefc7644f00180f2c4b29916c22384c Mon Sep 17 00:00:00 2001
From: Adam Novak <anovak@soe.ucsc.edu>
Date: Tue, 5 Nov 2024 10:51:24 -0500
Subject: [PATCH 12/12] Use toil-vg that knows about DI2 and uses outfile
 correctly

---
 vgci/vgci.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vgci/vgci.sh b/vgci/vgci.sh
index ca04cfbe7b4..c48ddbcf553 100755
--- a/vgci/vgci.sh
+++ b/vgci/vgci.sh
@@ -30,7 +30,7 @@ KEEP_INTERMEDIATE_FILES=0
 # Should we show stdout and stderr from tests? If so, set to "-s".
 SHOW_OPT=""
 # What toil-vg should we install?
-TOIL_VG_PACKAGE="git+https://github.com/vgteam/toil-vg.git@d16da00b92c491f90433e151cb4f5a89a44395b8"
+TOIL_VG_PACKAGE="git+https://github.com/vgteam/toil-vg.git@45782c7ba5a372e7c3587ac1c63f4895176fc828"
 # What toil should we install?
 # Could be something like "toil[aws,mesos]==3.20.0"
 # or "git+https://github.com/DataBiosphere/toil.git@3ab74776a3adebd6db75de16985ce9d734f60743#egg=toil[aws,mesos]"