From 9a6206e4667167257ec918802e89675e5aaebd5d Mon Sep 17 00:00:00 2001 From: Jouni Siren Date: Tue, 13 Aug 2024 14:24:09 +0300 Subject: [PATCH 01/12] Better error handling in vg gbwt --- src/subcommand/gbwt_main.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/subcommand/gbwt_main.cpp b/src/subcommand/gbwt_main.cpp index db1175b426e..b22f0c0ce8e 100644 --- a/src/subcommand/gbwt_main.cpp +++ b/src/subcommand/gbwt_main.cpp @@ -999,9 +999,11 @@ void validate_gbwt_config(GBWTConfig& config) { if (!config.to_remove.empty()) { if (config.build == GBWTConfig::build_gbz) { std::cerr << "error: [vg gbwt] the GBWT extracted from GBZ cannot have paths modified" << std::endl; + std::exit(EXIT_FAILURE); } if (config.build == GBWTConfig::build_gbwtgraph) { std::cerr << "error: [vg gbwt] the GBWT loaded with a GBWTGraph cannot have paths modified" << std::endl; + std::exit(EXIT_FAILURE); } if (!(config.input_filenames.size() == 1 || config.merge != GBWTConfig::merge_none) || !has_gbwt_output) { std::cerr << "error: [vg gbwt] removing a sample requires one input GBWT and output GBWT" << std::endl; From a57b473efb79f6986b03eb5d1db394df38309c3f Mon Sep 17 00:00:00 2001 From: Jouni Siren Date: Thu, 31 Oct 2024 20:59:09 -0700 Subject: [PATCH 02/12] Start getting rid of the old path/thread terminology --- src/subcommand/gbwt_main.cpp | 75 ++++++++++++++++++------------------ test/t/37_vg_gbwt.t | 62 ++++++++++++++--------------- 2 files changed, 69 insertions(+), 68 deletions(-) diff --git a/src/subcommand/gbwt_main.cpp b/src/subcommand/gbwt_main.cpp index b22f0c0ce8e..af88d10f2e9 100644 --- a/src/subcommand/gbwt_main.cpp +++ b/src/subcommand/gbwt_main.cpp @@ -37,7 +37,7 @@ struct GBWTConfig { build_mode build = build_none; merge_mode merge = merge_none; path_cover_mode path_cover = path_cover_none; - bool metadata_mode = false, thread_mode = false; + bool metadata_mode = false, path_mode = false; // Input GBWT construction. HaplotypeIndexer haplotype_indexer; @@ -55,8 +55,8 @@ struct GBWTConfig { // Other parameters and flags. bool show_progress = false; - bool count_threads = false; - bool metadata = false, contigs = false, haplotypes = false, samples = false, list_names = false, thread_names = false, tags = false; + bool count_paths = false; + bool metadata = false, contigs = false, haplotypes = false, samples = false, list_names = false, path_names = false, tags = false; bool include_named_paths = false; size_t num_paths = default_num_paths(), context_length = default_context_length(); bool num_paths_set = false; @@ -70,7 +70,7 @@ struct GBWTConfig { // File names. std::string gbwt_output; // Output GBWT. - std::string thread_output; // Threads in SDSL format. + std::string path_output; // Paths in SDSL format. std::string graph_output; // Output GBWTGraph. std::string segment_translation; // Segment to node translation output. std::string r_index_name; // Output r-index. @@ -163,7 +163,7 @@ void step_4_path_cover(GBWTHandler& gbwts, GraphHandler& graphs, GBWTConfig& con void step_5_gbwtgraph(GBWTHandler& gbwts, GraphHandler& graphs, GBWTConfig& config); void step_6_r_index(GBWTHandler& gbwts, GBWTConfig& config); void step_7_metadata(GBWTHandler& gbwts, GBWTConfig& config); -void step_8_threads(GBWTHandler& gbwts, GBWTConfig& config); +void step_8_paths(GBWTHandler& gbwts, GBWTConfig& config); void report_time_memory(const std::string& what, double start_time, const GBWTConfig& config); void print_metadata(std::ostream& out, const GBWTHandler& gbwts); @@ -233,9 +233,9 @@ int main_gbwt(int argc, char** argv) { step_7_metadata(gbwts, config); } - // Thread options. - if (config.thread_mode) { - step_8_threads(gbwts, config); + // Path options. + if (config.path_mode) { + step_8_paths(gbwts, config); } return 0; @@ -260,7 +260,7 @@ void help_gbwt(char** argv) { std::cerr << " --id-interval N store path ids at one out of N positions (default " << gbwt::DynamicGBWT::SAMPLE_INTERVAL << ")" << std::endl; std::cerr << std::endl; std::cerr << "Multithreading:" << std::endl; - std::cerr << " --num-jobs N use at most N parallel build jobs (for -v, -G, -l, -P; default " << GBWTConfig::default_build_jobs() << ")" << std::endl; + std::cerr << " --num-jobs N use at most N parallel build jobs (for -v, -G, -A, -l, -P; default " << GBWTConfig::default_build_jobs() << ")" << std::endl; std::cerr << " --num-threads N use N parallel search threads (for -b and -r; default " << omp_get_max_threads() << ")" << std::endl; std::cerr << std::endl; std::cerr << "Step 1: GBWT construction (requires -o and one of { -v, -G, -Z, -E, A }):" << std::endl; @@ -330,12 +330,12 @@ void help_gbwt(char** argv) { std::cerr << " -H, --haplotypes print the number of haplotypes" << std::endl; std::cerr << " -S, --samples print the number of samples" << std::endl; std::cerr << " -L, --list-names list contig/sample names (use with -C or -S)" << std::endl; - std::cerr << " -T, --thread-names list thread names" << std::endl; + std::cerr << " -T, --thread-names list path names" << std::endl; // FIXME change to --path-names; make --thread-names a deprecated alias std::cerr << " --tags list GBWT tags" << std::endl; std::cerr << std::endl; - std::cerr << "Step 8: Threads (one input GBWT):" << std::endl; - std::cerr << " -c, --count-threads print the number of threads" << std::endl; - std::cerr << " -e, --extract FILE extract threads in SDSL format to FILE" << std::endl; + std::cerr << "Step 8: Paths (one input GBWT):" << std::endl; + std::cerr << " -c, --count-threads print the number of paths" << std::endl; // FIXME also here, change to --count-paths + std::cerr << " -e, --extract FILE extract paths in SDSL format to FILE" << std::endl; std::cerr << std::endl; } @@ -519,11 +519,11 @@ GBWTConfig parse_gbwt_config(int argc, char** argv) { { "haplotypes", no_argument, 0, 'H' }, { "samples", no_argument, 0, 'S' }, { "list-names", no_argument, 0, 'L' }, - { "thread-names", no_argument, 0, 'T' }, + { "thread-names", no_argument, 0, 'T' }, // FIXME { "tags", no_argument, 0, OPT_TAGS }, - // Threads - { "count-threads", no_argument, 0, 'c' }, + // Paths + { "count-threads", no_argument, 0, 'c' }, // FIXME { "extract", required_argument, 0, 'e' }, { "help", no_argument, 0, 'h' }, @@ -864,7 +864,7 @@ GBWTConfig parse_gbwt_config(int argc, char** argv) { config.metadata_mode = true; break; case 'T': - config.thread_names = true; + config.path_names = true; config.metadata_mode = true; break; case OPT_TAGS: @@ -872,14 +872,14 @@ GBWTConfig parse_gbwt_config(int argc, char** argv) { config.metadata_mode = true; break; - // Threads + // Paths case 'c': - config.count_threads = true; - config.thread_mode = true; + config.count_paths = true; + config.path_mode = true; break; case 'e': - config.thread_output = optarg; - config.thread_mode = true; + config.path_output = optarg; + config.path_mode = true; break; case 'h': @@ -1077,9 +1077,9 @@ void validate_gbwt_config(GBWTConfig& config) { } } - if (config.thread_mode) { + if (config.path_mode) { if (!one_input_gbwt) { - std::cerr << "error: [vg gbwt] thread operations require one input GBWT" << std::endl; + std::cerr << "error: [vg gbwt] path operations require one input GBWT" << std::endl; std::exit(EXIT_FAILURE); } } @@ -1373,6 +1373,7 @@ void step_1_build_gbwts(GBWTHandler& gbwts, GraphHandler& graphs, GBWTConfig& co if (config.show_progress) { std::cerr << "Input type: " << (config.gam_format ? "GAM" : "GAF") << std::endl; } + // FIXME: Parallelize this. std::unique_ptr temp = config.haplotype_indexer.build_gbwt(*(graphs.path_graph), config.input_filenames, (config.gam_format ? "GAM" : "GAF")); gbwts.use(*temp); } @@ -1458,7 +1459,7 @@ void remove_samples(GBWTHandler& gbwts, GBWTConfig& config) { gbwts.use_dynamic(); if (!(gbwts.dynamic.hasMetadata() && gbwts.dynamic.metadata.hasPathNames() && gbwts.dynamic.metadata.hasSampleNames())) { - std::cerr << "error: [vg gbwt] the index does not contain metadata with thread and sample names" << std::endl; + std::cerr << "error: [vg gbwt] the index does not contain metadata with path and sample names" << std::endl; std::exit(EXIT_FAILURE); } @@ -1471,11 +1472,11 @@ void remove_samples(GBWTHandler& gbwts, GBWTConfig& config) { } std::vector path_ids = gbwts.dynamic.metadata.removeSample(sample_id); if (path_ids.empty()) { - std::cerr << "warning: [vg gbwt] no threads associated with sample " << sample_name << std::endl; + std::cerr << "warning: [vg gbwt] no paths associated with sample " << sample_name << std::endl; continue; } if (config.show_progress) { - std::cerr << "Removing " << path_ids.size() << " threads for sample " << sample_name << std::endl; + std::cerr << "Removing " << path_ids.size() << " paths for sample " << sample_name << std::endl; } gbwts.dynamic.remove(path_ids); } @@ -1666,7 +1667,7 @@ void step_7_metadata(GBWTHandler& gbwts, GBWTConfig& config) { } } - if (config.thread_names) { + if (config.path_names) { auto& metadata = get_metadata(); if (metadata.hasPathNames()) { // Precompute some metadata @@ -1676,7 +1677,7 @@ void step_7_metadata(GBWTHandler& gbwts, GBWTConfig& config) { std::cout << gbwtgraph::compose_path_name(gbwts.compressed, i, sense) << std::endl; } } else { - std::cerr << "error: [vg gbwt] the metadata does not contain thread names" << std::endl; + std::cerr << "error: [vg gbwt] the metadata does not contain path names" << std::endl; } } @@ -1692,19 +1693,19 @@ void step_7_metadata(GBWTHandler& gbwts, GBWTConfig& config) { //---------------------------------------------------------------------------- -void step_8_threads(GBWTHandler& gbwts, GBWTConfig& config) { - // Extract threads in SDSL format. - if (!config.thread_output.empty()) { +void step_8_paths(GBWTHandler& gbwts, GBWTConfig& config) { + // Extract paths in SDSL format. + if (!config.path_output.empty()) { double start = gbwt::readTimer(); if (config.show_progress) { - std::cerr << "Extracting threads to " << config.thread_output << std::endl; + std::cerr << "Extracting paths to " << config.path_output << std::endl; } gbwts.use_compressed(); if (config.show_progress) { std::cerr << "Starting the extraction" << std::endl; } gbwt::size_type node_width = gbwt::bit_length(gbwts.compressed.sigma() - 1); - gbwt::text_buffer_type out(config.thread_output, std::ios::out, gbwt::MEGABYTE, node_width); + gbwt::text_buffer_type out(config.path_output, std::ios::out, gbwt::MEGABYTE, node_width); for (gbwt::size_type id = 0; id < gbwts.compressed.sequences(); id += 2) { // Ignore reverse complements. gbwt::vector_type sequence = gbwts.compressed.extract(id); for (auto node : sequence) { @@ -1713,11 +1714,11 @@ void step_8_threads(GBWTHandler& gbwts, GBWTConfig& config) { out.push_back(gbwt::ENDMARKER); } out.close(); - report_time_memory("Threads extracted", start, config); + report_time_memory("Paths extracted", start, config); } - // There are two sequences for each thread. - if (config.count_threads) { + // There are two sequences for each path. + if (config.count_paths) { gbwts.use_compressed(); std::cout << (gbwts.compressed.sequences() / 2) << std::endl; } diff --git a/test/t/37_vg_gbwt.t b/test/t/37_vg_gbwt.t index 6c11196fe7a..9fdc0c8a4fb 100644 --- a/test/t/37_vg_gbwt.t +++ b/test/t/37_vg_gbwt.t @@ -33,11 +33,11 @@ cmp x.gbwt parse_x.gbwt is $? 0 "identical construction results with vg gbwt and from VCF parse" # Single chromosome: metadata for haplotypes -is $(vg gbwt -c x.gbwt) 2 "chromosome X: 2 threads" +is $(vg gbwt -c x.gbwt) 2 "chromosome X: 2 paths" is $(vg gbwt -C x.gbwt) 1 "chromosome X: 1 contig" is $(vg gbwt -H x.gbwt) 2 "chromosome X: 2 haplotypes" is $(vg gbwt -S x.gbwt) 1 "chromosome X: 1 sample" -is $(vg gbwt -T x.gbwt | wc -l) 2 "chromosome X: 2 thread names" +is $(vg gbwt -T x.gbwt | wc -l) 2 "chromosome X: 2 path names" is $(vg gbwt -C -L x.gbwt | wc -l) 1 "chromosome X: 1 contig name" is $(vg gbwt -S -L x.gbwt | wc -l) 1 "chromosome X: 1 sample name" @@ -52,7 +52,7 @@ vg index -G x2.ref.gbwt -T x.vg is $? 0 "chromosome X reference GBWT with vg index" cmp x.ref.gbwt x2.ref.gbwt is $? 0 "identical construction results with vg gbwt and vg index" -is $(vg gbwt -c x.ref.gbwt) 1 "chromosome X reference: 1 thread" +is $(vg gbwt -c x.ref.gbwt) 1 "chromosome X reference: 1 path" rm -f x.ref.gbwt x2.ref.gbwt @@ -115,7 +115,7 @@ vg gbwt -x xy-alt.xg -o xy.1000gp.gbwt --preset 1000gp -v small/xy2.vcf.gz is $? 0 "construction preset: 1000gp" # Multiple chromosomes: metadata for haplotypes -is $(vg gbwt -c xy.merge.gbwt) 4 "multiple chromosomes: 4 threads" +is $(vg gbwt -c xy.merge.gbwt) 4 "multiple chromosomes: 4 paths" is $(vg gbwt -C xy.merge.gbwt) 2 "multiple chromosomes: 2 contigs" is $(vg gbwt -H xy.merge.gbwt) 2 "multiple chromosomes: 2 haplotypes" is $(vg gbwt -S xy.merge.gbwt) 1 "multiple chromosomes: 1 sample" @@ -131,7 +131,7 @@ vg index -G xy2.contigs.gbwt -T xy.xg is $? 0 "paths as contigs with vg index" cmp xy.contigs.gbwt xy2.contigs.gbwt is $? 0 "identical construction results with vg gbwt and vg index" -is $(vg gbwt -c xy.contigs.gbwt) 2 "paths as contigs: 2 threads" +is $(vg gbwt -c xy.contigs.gbwt) 2 "paths as contigs: 2 paths" is $(vg gbwt -C xy.contigs.gbwt) 2 "paths as contigs: 2 contigs" is $(vg gbwt -H xy.contigs.gbwt) 1 "paths as contigs: 1 haplotype" is $(vg gbwt -S xy.contigs.gbwt) 1 "paths as contigs: 1 sample" @@ -194,35 +194,35 @@ rm -f x.gbwt empty.gbwt x2.gbwt vg gbwt -x xy-alt.xg -o xy.gbwt -v small/xy2.vcf.gz vg gbwt -E -o xy.ref.gbwt -x xy.xg vg gbwt -m -o xy.both.gbwt xy.gbwt xy.ref.gbwt -is $(vg gbwt -c xy.both.gbwt) 6 "haplotypes and paths: 6 threads" +is $(vg gbwt -c xy.both.gbwt) 6 "haplotypes and paths: 6 paths" # Remove the reference sample that GBWTs use for paths vg gbwt -R _gbwt_ref -o xy.removed.gbwt xy.both.gbwt is $? 0 "samples can be removed from a GBWT index" -is $(vg gbwt -c xy.removed.gbwt) 4 "haplotypes only: 4 threads" +is $(vg gbwt -c xy.removed.gbwt) 4 "haplotypes only: 4 paths" rm -f xy.gbwt xy.ref.gbwt xy.both.gbwt xy.removed.gbwt # Build a three-sample GBWT from a simple GFA vg gbwt -o all.gbwt -G graphs/three_samples.gfa -is $(vg gbwt -c all.gbwt) 12 "all samples: 12 threads" +is $(vg gbwt -c all.gbwt) 12 "all samples: 12 paths" is $(vg gbwt -H all.gbwt) 6 "all samples: 6 haplotypes" # Remove samples 1 and 3 vg gbwt -R sample1 -R sample3 -o removed.gbwt all.gbwt is $? 0 "multiple samples can be removed from a GBWT index" -is $(vg gbwt -c removed.gbwt) 4 "sample 2: 4 threads" +is $(vg gbwt -c removed.gbwt) 4 "sample 2: 4 paths" is $(vg gbwt -H removed.gbwt) 2 "sample 2: 2 haplotypes" rm -f all.gbwt removed.gbwt -# Extract threads from GBWT +# Extract paths from GBWT vg gbwt -x x.vg -o x.gbwt -v small/xy2.vcf.gz vg gbwt -e x.extract x.gbwt -is $? 0 "threads can be extracted from GBWT" -is $(cat x.extract | wc -c) 121 "correct size for the thread file" +is $? 0 "paths can be extracted from GBWT" +is $(cat x.extract | wc -c) 121 "correct size for the paths file" rm -f x.gbwt x.extract @@ -271,7 +271,7 @@ rm -f extracted.gbwt extracted2.gbwt extracted2.gg vg gbwt -P -n 16 -x xy.xg -g xy.cover.gg -o xy.cover.gbwt is $? 0 "Path cover GBWTGraph construction" is $(md5sum xy.cover.gg | cut -f 1 -d\ ) 6a2738f51472e0ba1553a815a005b157 "GBWTGraph was serialized correctly" -is $(vg gbwt -c xy.cover.gbwt) 32 "path cover: 32 threads" +is $(vg gbwt -c xy.cover.gbwt) 32 "path cover: 32 paths" is $(vg gbwt -C xy.cover.gbwt) 2 "path cover: 2 contigs" is $(vg gbwt -H xy.cover.gbwt) 16 "path cover: 16 haplotypes" is $(vg gbwt -S xy.cover.gbwt) 16 "path cover: 16 samples" @@ -282,10 +282,10 @@ rm -f xy.cover.gg xy.cover.gbwt vg gbwt -P -n 16 -x xy.xg -g xy.cover.gg -o xy.cover.gbwt --pass-paths is $? 0 "Path cover GBWTGraph construction" is $(md5sum xy.cover.gg | cut -f 1 -d\ ) 6a2738f51472e0ba1553a815a005b157 "GBWTGraph was serialized correctly" -is $(vg gbwt -c xy.cover.gbwt) 34 "path cover w/ paths: 34 threads" -is $(vg gbwt -C xy.cover.gbwt) 2 "path cover w/ paths: 2 contigs" -is $(vg gbwt -H xy.cover.gbwt) 17 "path cover w/ paths: 17 haplotypes" -is $(vg gbwt -S xy.cover.gbwt) 17 "path cover w/ paths: 17 samples" +is $(vg gbwt -c xy.cover.gbwt) 34 "path cover w/ reference paths: 34 paths" +is $(vg gbwt -C xy.cover.gbwt) 2 "path cover w/ reference paths: 2 contigs" +is $(vg gbwt -H xy.cover.gbwt) 17 "path cover w/ reference paths: 17 haplotypes" +is $(vg gbwt -S xy.cover.gbwt) 17 "path cover w/ reference paths: 17 samples" rm -f xy.cover.gg xy.cover.gbwt @@ -293,7 +293,7 @@ rm -f xy.cover.gg xy.cover.gbwt vg gbwt -x xy-alt.xg -g xy.local.gg -l -n 16 -o xy.local.gbwt -v small/xy2.vcf.gz is $? 0 "Local haplotypes GBWTGraph construction" is $(md5sum xy.local.gg | cut -f 1 -d\ ) 00429586246711abcf1367a97d3c468c "GBWTGraph was serialized correctly" -is $(vg gbwt -c xy.local.gbwt) 32 "local haplotypes: 32 threads" +is $(vg gbwt -c xy.local.gbwt) 32 "local haplotypes: 32 paths" is $(vg gbwt -C xy.local.gbwt) 2 "local haplotypes: 2 contigs" is $(vg gbwt -H xy.local.gbwt) 16 "local haplotypes: 16 haplotypes" is $(vg gbwt -S xy.local.gbwt) 16 "local haplotypes: 16 samples" @@ -314,10 +314,10 @@ rm -f xy.local.gg xy.local.gbwt xy.local.gbz vg gbwt -x xy-alt.xg -g xy.local.gg -l -n 16 -o xy.local.gbwt -v small/xy2.vcf.gz --pass-paths is $? 0 "Local haplotypes GBWTGraph construction" is $(md5sum xy.local.gg | cut -f 1 -d\ ) 6a2738f51472e0ba1553a815a005b157 "GBWTGraph was serialized correctly" -is $(vg gbwt -c xy.local.gbwt) 34 "local haplotypes w/ paths: 34 threads" -is $(vg gbwt -C xy.local.gbwt) 2 "local haplotypes w/ paths: 2 contigs" -is $(vg gbwt -H xy.local.gbwt) 17 "local haplotypes w/ paths: 17 haplotypes" -is $(vg gbwt -S xy.local.gbwt) 17 "local haplotypes w/ paths: 17 samples" +is $(vg gbwt -c xy.local.gbwt) 34 "local haplotypes w/ reference paths: 34 paths" +is $(vg gbwt -C xy.local.gbwt) 2 "local haplotypes w/ reference paths: 2 contigs" +is $(vg gbwt -H xy.local.gbwt) 17 "local haplotypes w/ reference paths: 17 haplotypes" +is $(vg gbwt -S xy.local.gbwt) 17 "local haplotypes w/ reference paths: 17 samples" rm -f xy.local.gg xy.local.gbwt @@ -325,11 +325,11 @@ rm -f xy.local.gg xy.local.gbwt vg gbwt -G haplotype-sampling/micb-kir3dl1.gfa -g large.gbz --gbz-format vg gbwt -Z large.gbz -l -n 16 --pass-paths -o large.local.gbwt is $? 0 "Local haplotypes with reference paths from a larger GBZ" -is $(vg gbwt -c large.local.gbwt) 36 "local haplotypes w/ paths: 36 threads" -is $(vg gbwt -C large.local.gbwt) 2 "local haplotypes w/ paths: 2 contigs" -is $(vg gbwt -H large.local.gbwt) 18 "local haplotypes w/ paths: 18 haplotypes" -is $(vg gbwt -S large.local.gbwt) 18 "local haplotypes w/ paths: 18 samples" -is $(vg gbwt --tags large.local.gbwt | grep -c reference_samples) 1 "local haplotypes w/ paths: reference_samples set" +is $(vg gbwt -c large.local.gbwt) 36 "local haplotypes w/ reference paths: 36 paths" +is $(vg gbwt -C large.local.gbwt) 2 "local haplotypes w/ reference paths: 2 contigs" +is $(vg gbwt -H large.local.gbwt) 18 "local haplotypes w/ reference paths: 18 haplotypes" +is $(vg gbwt -S large.local.gbwt) 18 "local haplotypes w/ reference paths: 18 samples" +is $(vg gbwt --tags large.local.gbwt | grep -c reference_samples) 1 "local haplotypes w/ reference paths: reference_samples set" rm -f large.gbz large.local.gbwt @@ -339,7 +339,7 @@ vg gbwt -x x.vg -o x.gbwt -v small/xy2.vcf.gz vg gbwt -a -n 16 -x xy.xg -g augmented.gg -o augmented.gbwt x.gbwt is $? 0 "Augmented GBWTGraph construction" is $(md5sum augmented.gg | cut -f 1 -d\ ) 00429586246711abcf1367a97d3c468c "GBWTGraph was serialized correctly" -is $(vg gbwt -c augmented.gbwt) 18 "augmented: 18 threads" +is $(vg gbwt -c augmented.gbwt) 18 "augmented: 18 paths" is $(vg gbwt -C augmented.gbwt) 2 "augmented: 2 contigs" is $(vg gbwt -H augmented.gbwt) 18 "augmented: 18 haplotypes" is $(vg gbwt -S augmented.gbwt) 17 "augmented: 17 samples" @@ -355,7 +355,7 @@ rm -f x.vg y.vg x.xg xy.xg xy-alt.xg vg gbwt -o gfa.gbwt -G graphs/components_walks.gfa is $? 0 "GBWT construction from GFA" is $(md5sum gfa.gbwt | cut -f 1 -d\ ) 44c27c37c7af6911c26aea2a41008460 "GBWT was serialized correctly" -is $(vg gbwt -c gfa.gbwt) 4 "gfa: 4 threads" +is $(vg gbwt -c gfa.gbwt) 4 "gfa: 4 paths" is $(vg gbwt -C gfa.gbwt) 2 "gfa: 2 contigs" is $(vg gbwt -H gfa.gbwt) 2 "gfa: 2 haplotypes" is $(vg gbwt -S gfa.gbwt) 1 "gfa: 1 sample" @@ -376,7 +376,7 @@ is $(md5sum gfa2.gbz | cut -f 1 -d\ ) ab241a3f79a781a367b701cb8888bf01 "GBZ was # Build GBWT and GBWTGraph from GFA with node chopping vg gbwt -o chopping.gbwt -g chopping.gg --translation chopping.trans --max-node 2 -G graphs/chopping_walks.gfa is $? 0 "GBWT+GBWTGraph construction from GFA with chopping" -is $(vg gbwt -c chopping.gbwt) 3 "chopping: 3 threads" +is $(vg gbwt -c chopping.gbwt) 3 "chopping: 3 paths" is $(vg gbwt -C chopping.gbwt) 1 "chopping: 1 contig" is $(vg gbwt -H chopping.gbwt) 3 "chopping: 3 haplotypes" is $(vg gbwt -S chopping.gbwt) 2 "chopping: 2 samples" @@ -391,7 +391,7 @@ is $(wc -l < from_gbz.trans) 8 "from GBZ: 8 translations" # Build GBWT and GBWTGraph from GFA with both paths and walks vg gbwt -o ref_paths.gbwt -g ref_paths.gg --translation ref_paths.trans -G graphs/components_paths_walks.gfa is $? 0 "GBWT+GBWTGraph construction from GFA with reference paths" -is $(vg gbwt -c ref_paths.gbwt) 6 "ref paths: 6 threads" +is $(vg gbwt -c ref_paths.gbwt) 6 "ref paths: 6 paths" is $(vg gbwt -C ref_paths.gbwt) 2 "ref paths: 2 contigs" is $(vg gbwt -H ref_paths.gbwt) 3 "ref paths: 3 haplotypes" is $(vg gbwt -S ref_paths.gbwt) 2 "ref paths: 2 samples" From 6377996ef9f148b02e6e890e023d32dbf01ad96c Mon Sep 17 00:00:00 2001 From: Jouni Siren Date: Thu, 31 Oct 2024 21:13:18 -0700 Subject: [PATCH 03/12] Rename some long options --- src/subcommand/gbwt_main.cpp | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/src/subcommand/gbwt_main.cpp b/src/subcommand/gbwt_main.cpp index af88d10f2e9..5e55928a4fe 100644 --- a/src/subcommand/gbwt_main.cpp +++ b/src/subcommand/gbwt_main.cpp @@ -330,11 +330,11 @@ void help_gbwt(char** argv) { std::cerr << " -H, --haplotypes print the number of haplotypes" << std::endl; std::cerr << " -S, --samples print the number of samples" << std::endl; std::cerr << " -L, --list-names list contig/sample names (use with -C or -S)" << std::endl; - std::cerr << " -T, --thread-names list path names" << std::endl; // FIXME change to --path-names; make --thread-names a deprecated alias + std::cerr << " -T, --path-names list path names" << std::endl; std::cerr << " --tags list GBWT tags" << std::endl; std::cerr << std::endl; std::cerr << "Step 8: Paths (one input GBWT):" << std::endl; - std::cerr << " -c, --count-threads print the number of paths" << std::endl; // FIXME also here, change to --count-paths + std::cerr << " -c, --count-paths print the number of paths" << std::endl; std::cerr << " -e, --extract FILE extract paths in SDSL format to FILE" << std::endl; std::cerr << std::endl; } @@ -424,7 +424,11 @@ GBWTConfig parse_gbwt_config(int argc, char** argv) { constexpr int OPT_PASS_PATHS = 1400; constexpr int OPT_GBZ_FORMAT = 1500; constexpr int OPT_TAGS = 1700; - + + // Deprecated options. + constexpr int OPT_THREAD_NAMES = 2000; + constexpr int OPT_COUNT_THREADS = 2001; + // Make a collection of all the known tags and their descriptions. Use an ordered map so that we can do some typo guessing. // Values are description and list of prohibited characters. const std::map>> KNOWN_TAGS = { @@ -519,11 +523,13 @@ GBWTConfig parse_gbwt_config(int argc, char** argv) { { "haplotypes", no_argument, 0, 'H' }, { "samples", no_argument, 0, 'S' }, { "list-names", no_argument, 0, 'L' }, - { "thread-names", no_argument, 0, 'T' }, // FIXME + { "path-names", no_argument, 0, 'T' }, + { "thread-names", no_argument, 0, OPT_THREAD_NAMES }, { "tags", no_argument, 0, OPT_TAGS }, // Paths - { "count-threads", no_argument, 0, 'c' }, // FIXME + { "count-paths", no_argument, 0, 'c' }, + { "count-threads", no_argument, 0, OPT_COUNT_THREADS }, { "extract", required_argument, 0, 'e' }, { "help", no_argument, 0, 'h' }, @@ -867,6 +873,11 @@ GBWTConfig parse_gbwt_config(int argc, char** argv) { config.path_names = true; config.metadata_mode = true; break; + case OPT_THREAD_NAMES: + std::cerr << "warning: [vg gbwt] option --thread-names is deprecated; use --path-names instead" << std::endl; + config.path_names = true; + config.metadata_mode = true; + break; case OPT_TAGS: config.tags = true; config.metadata_mode = true; @@ -877,6 +888,11 @@ GBWTConfig parse_gbwt_config(int argc, char** argv) { config.count_paths = true; config.path_mode = true; break; + case OPT_COUNT_THREADS: + std::cerr << "warning: [vg gbwt] option --count-threads is deprecated; use --count-paths instead" << std::endl; + config.count_paths = true; + config.path_mode = true; + break; case 'e': config.path_output = optarg; config.path_mode = true; From 7c6b977c989b90e538cd7b1ab1ff422b718473d2 Mon Sep 17 00:00:00 2001 From: Jouni Siren Date: Thu, 31 Oct 2024 22:28:58 -0700 Subject: [PATCH 04/12] Remove GBWT options from vg index --- src/subcommand/index_main.cpp | 218 ++++------------------------------ test/t/05_vg_find.t | 4 +- test/t/06_vg_index.t | 73 ++---------- test/t/07_vg_map.t | 3 +- test/t/11_vg_paths.t | 3 +- test/t/13_vg_sim.t | 4 +- test/t/18_vg_call.t | 2 +- test/t/26_deconstruct.t | 3 +- test/t/33_vg_mpmap.t | 3 +- test/t/37_vg_gbwt.t | 32 +---- test/t/38_vg_prune.t | 6 +- test/t/46_vg_minimizer.t | 6 +- 12 files changed, 62 insertions(+), 295 deletions(-) diff --git a/src/subcommand/index_main.cpp b/src/subcommand/index_main.cpp index 26a761162f2..731fdcff26d 100644 --- a/src/subcommand/index_main.cpp +++ b/src/subcommand/index_main.cpp @@ -1,4 +1,4 @@ -// index.cpp: define the "vg index" subcommand, which makes xg, GCSA2, and GBWT indexes +// index.cpp: define the "vg index" subcommand, which makes xg, GCSA2, and distance indexes #include #include @@ -11,7 +11,6 @@ #include "subcommand.hpp" #include "../vg.hpp" -#include "../haplotype_indexer.hpp" #include "xg.hpp" #include #include @@ -23,12 +22,10 @@ #include "../integrated_snarl_finder.hpp" #include "../snarl_distance_index.hpp" #include "../source_sink_overlay.hpp" -#include "../gbwt_helper.hpp" #include "../gbwtgraph_helper.hpp" #include "../gcsa_helper.hpp" #include -#include #include #include @@ -47,24 +44,6 @@ void help_index(char** argv) { << "xg options:" << endl << " -x, --xg-name FILE use this file to store a succinct, queryable version of the graph(s), or read for GCSA or distance indexing" << endl << " -L, --xg-alts include alt paths in xg" << endl - << "gbwt options (more in vg gbwt):" << endl - << " -v, --vcf-phasing FILE generate threads from the haplotypes in the VCF file FILE" << endl - << " -W, --ignore-missing don't warn when variants in the VCF are missing from the graph; silently skip them" << endl - << " -T, --store-threads generate threads from the embedded paths" << endl - << " -M, --store-gam FILE generate threads from the alignments in gam FILE (many allowed)" << endl - << " -F, --store-gaf FILE generate threads from the alignments in gaf FILE (many allowed)" << endl - << " -G, --gbwt-name FILE store the threads as GBWT in FILE" << endl - << " -z, --actual-phasing do not make unphased homozygous genotypes phased"<< endl - << " -P, --force-phasing replace unphased genotypes with randomly phased ones" << endl - << " -o, --discard-overlaps skip overlapping alternate alleles if the overlap cannot be resolved" << endl - << " -B, --batch-size N number of samples per batch (default 200)" << endl - << " -u, --buffer-size N GBWT construction buffer size in millions of nodes (default 100)" << endl - << " -n, --id-interval N store haplotype ids at one out of N positions (default 1024)" << endl - << " -R, --range X..Y process samples X to Y (inclusive)" << endl - << " -r, --rename V=P rename contig V in the VCFs to path P in the graph (may repeat)" << endl - << " --rename-variants when renaming contigs, find variants in the graph based on the new name" << endl - << " -I, --region C:S-E operate on only the given 1-based region of the given VCF contig (may repeat)" << endl - << " -E, --exclude SAMPLE exclude any samples with the given name from haplotype indexing" << endl << "gcsa options:" << endl << " -g, --gcsa-out FILE output a GCSA2 index to the given file" << endl //<< " -i, --dbg-in FILE use kmers from FILE instead of input VG (may repeat)" << endl @@ -83,12 +62,6 @@ void help_index(char** argv) { << " if N is 0 then don't store distances, only the snarl tree" << endl; } -void multiple_thread_sources() { - std::cerr << "error: [vg index] cannot generate threads from multiple sources (VCF, GAM, GAF, paths)" << std::endl; - std::cerr << "error: [vg index] GBWT indexes can be built separately and merged with vg gbwt" << std::endl; - std::exit(EXIT_FAILURE); -} - int main_index(int argc, char** argv) { if (argc == 2) { @@ -101,25 +74,18 @@ int main_index(int argc, char** argv) { #define OPT_DISTANCE_SNARL_LIMIT 1002 // Which indexes to build. - bool build_xg = false, build_gbwt = false, build_gcsa = false, build_dist = false; + bool build_xg = false, build_gcsa = false, build_dist = false; // Files we should read. string vcf_name, mapping_name; vector dbg_names; // Files we should write. - string xg_name, gbwt_name, gcsa_name, dist_name; - + string xg_name, gcsa_name, dist_name; // General bool show_progress = false; - // GBWT - HaplotypeIndexer haplotype_indexer; - enum thread_source_type { thread_source_none, thread_source_vcf, thread_source_paths, thread_source_gam, thread_source_gaf }; - thread_source_type thread_source = thread_source_none; - vector aln_file_names; - // GCSA gcsa::size_type kmer_size = gcsa::Key::MAX_LENGTH; gcsa::ConstructionParameters params; @@ -152,7 +118,7 @@ int main_index(int argc, char** argv) { {"thread-db", required_argument, 0, 'F'}, {"xg-alts", no_argument, 0, 'L'}, - // GBWT + // GBWT. These have been removed and will return an error. {"vcf-phasing", required_argument, 0, 'v'}, {"ignore-missing", no_argument, 0, 'W'}, {"store-threads", no_argument, 0, 'T'}, @@ -211,7 +177,6 @@ int main_index(int argc, char** argv) { break; case 'p': show_progress = true; - haplotype_indexer.show_progress = true; break; // XG @@ -223,111 +188,26 @@ int main_index(int argc, char** argv) { xg_alts = true; break; - // GBWT - case 'v': - if (thread_source != thread_source_none) { - multiple_thread_sources(); - } - thread_source = thread_source_vcf; - vcf_name = optarg; - break; - case 'W': - haplotype_indexer.warn_on_missing_variants = false; - break; - case 'T': - if (thread_source != thread_source_none) { - multiple_thread_sources(); - } - thread_source = thread_source_paths; - break; - case 'M': - if (thread_source != thread_source_none && thread_source != thread_source_gam) { - multiple_thread_sources(); - } - thread_source = thread_source_gam; - build_gbwt = true; - aln_file_names.push_back(optarg); - break; - case 'F': - if (thread_source != thread_source_none && thread_source != thread_source_gaf) { - multiple_thread_sources(); - } - thread_source = thread_source_gaf; - build_gbwt = true; - aln_file_names.push_back(optarg); - break; - case 'G': - build_gbwt = true; - gbwt_name = optarg; - break; - case 'z': - haplotype_indexer.phase_homozygous = false; - break; - case 'P': - haplotype_indexer.force_phasing = true; - break; - case 'o': - haplotype_indexer.discard_overlaps = true; - break; - case 'B': - haplotype_indexer.samples_in_batch = std::max(parse(optarg), 1ul); - break; - case 'u': - haplotype_indexer.gbwt_buffer_size = std::max(parse(optarg), 1ul); - break; - case 'n': - haplotype_indexer.id_interval = parse(optarg); - break; - case 'R': - { - // Parse first..last - string temp(optarg); - size_t found = temp.find(".."); - if(found == string::npos || found == 0 || found + 2 == temp.size()) { - cerr << "error: [vg index] could not parse range " << temp << endl; - exit(1); - } - haplotype_indexer.sample_range.first = parse(temp.substr(0, found)); - haplotype_indexer.sample_range.second = parse(temp.substr(found + 2)) + 1; - } - break; - case 'r': - { - // Parse the rename old=new - string key_value(optarg); - auto found = key_value.find('='); - if (found == string::npos || found == 0 || found + 1 == key_value.size()) { - cerr << "error: [vg index] could not parse rename " << key_value << endl; - exit(1); - } - // Parse out the two parts - string vcf_contig = key_value.substr(0, found); - string graph_contig = key_value.substr(found + 1); - // Add the name mapping - haplotype_indexer.path_to_vcf[graph_contig] = vcf_contig; - } - break; - case OPT_RENAME_VARIANTS: - haplotype_indexer.rename_variants = true; - break; - case 'I': - { - // We want to parse this region specifier - string region(optarg); - - Region parsed; - parse_region(region, parsed); - if (parsed.start <= 0 || parsed.end <= 0) { - // We need both range bounds, and we can't accept 0 since input is 1-based. - cerr << "error: [vg index] could not parse 1-based region " << optarg << endl; - } - - // Make sure to correct the coordinates to 0-based exclusive-end, from 1-based inclusive-end - haplotype_indexer.regions[parsed.seq] = make_pair((size_t) (parsed.start - 1), (size_t) parsed.end); - } - break; + // GBWT. The options remain, but they are no longer supported. + case 'v': // Fall through + case 'W': // Fall through + case 'T': // Fall through + case 'M': // Fall through + case 'F': // Fall through + case 'G': // Fall through + case 'z': // Fall through + case 'P': // Fall through + case 'o': // Fall through + case 'B': // Fall through + case 'u': // Fall through + case 'n': // Fall through + case 'R': // Fall through + case 'r': // Fall through + case OPT_RENAME_VARIANTS: // Fall through + case 'I': // Fall through case 'E': - haplotype_indexer.excluded_samples.insert(optarg); + std::cerr << "error: [vg index] GBWT construction options have been removed; use vg gbwt instead" << std::endl; + std::exit(EXIT_FAILURE); break; // GCSA @@ -391,37 +271,11 @@ int main_index(int argc, char** argv) { } - if (xg_name.empty() && gbwt_name.empty() && - gcsa_name.empty() && !build_gai_index && !build_vgi_index && dist_name.empty()) { + if (xg_name.empty() && gcsa_name.empty() && !build_gai_index && !build_vgi_index && dist_name.empty()) { cerr << "error: [vg index] index type not specified" << endl; return 1; } - if (build_gbwt && thread_source == thread_source_none) { - cerr << "error: [vg index] cannot build GBWT without threads" << endl; - return 1; - } - - if (thread_source != thread_source_none && !build_gbwt) { - cerr << "error: [vg index] no GBWT output specified for the threads" << endl; - return 1; - } - - if (thread_source == thread_source_gam || thread_source == thread_source_gaf) { - for (const auto& name : aln_file_names) { - if (name == "-") { - cerr << "error: [vg index] GAM (-M) and GAF (-F) input files cannot be read from stdin (-)" << endl; - return 1; - } - } - } - - if (thread_source != thread_source_none && file_names.size() != 1) { - cerr << "error: [vg index] exactly one graph required for generating threads" << std::endl; - cerr << "error: [vg index] you may combine the graphs with vg index -x combined.xg --xg-alts" << std::endl; - return 1; - } - if (file_names.size() <= 0 && dbg_names.empty()){ //cerr << "No graph provided for indexing. Please provide a .vg file or GCSA2-format deBruijn graph to index." << endl; //return 1; @@ -481,30 +335,6 @@ int main_index(int argc, char** argv) { vg::io::save_handle_graph(&xg_index, xg_name); } - // Generate threads - if (thread_source != thread_source_none) { - - // Load the only input graph. - unique_ptr path_handle_graph; - path_handle_graph = vg::io::VPKG::load_one(file_names[0]); - - std::unique_ptr gbwt_index(nullptr); - if (thread_source == thread_source_vcf) { - std::vector parse_files = haplotype_indexer.parse_vcf(vcf_name, *path_handle_graph); - path_handle_graph.reset(); // Save memory by deleting the graph. - gbwt_index = haplotype_indexer.build_gbwt(parse_files); - } else if (thread_source == thread_source_paths) { - gbwt_index = haplotype_indexer.build_gbwt(*path_handle_graph); - } else if (thread_source == thread_source_gam) { - gbwt_index = haplotype_indexer.build_gbwt(*path_handle_graph, aln_file_names, "GAM"); - } else if (thread_source == thread_source_gaf) { - gbwt_index = haplotype_indexer.build_gbwt(*path_handle_graph, aln_file_names, "GAF"); - } - if (build_gbwt && gbwt_index.get() != nullptr) { - save_gbwt(*gbwt_index, gbwt_name, show_progress); - } - } // End of thread indexing. - // Build GCSA if (build_gcsa) { diff --git a/test/t/05_vg_find.t b/test/t/05_vg_find.t index f7c54e36eee..67a51741da1 100644 --- a/test/t/05_vg_find.t +++ b/test/t/05_vg_find.t @@ -112,7 +112,7 @@ rm -f t.xg t.vg t.x:30:35.vg t.x:10:20.vg q.x:30:35.vg q.x:10:20.vg t.bed vg construct -r small/xy.fa -v small/xy2.vcf.gz -R x -C -a > x.vg 2> /dev/null vg index -x x.xg x.vg -vg index -G x.gbwt -v small/xy2.vcf.gz x.vg +vg gbwt -v small/xy2.vcf.gz -o x.gbwt -x x.vg is $(vg find -p x -x x.xg -K 16 -H x.gbwt | cut -f 5 | sort | uniq -c | tail -n 1 | awk '{ print $1 }') 1510 "we find the expected number of kmers with haplotype frequency equal to 2" rm -f x.vg x.xg x.gbwt @@ -124,7 +124,7 @@ is $? 0 "GFA i/o for find -n consistent with converting both ways" # Find nodes that map to the provided ids vg construct -m 32 -r small/xy.fa -v small/xy2.vcf.gz -R x -C -a > x.vg 2> /dev/null -vg index -G x.gbwt -v small/xy2.vcf.gz x.vg +vg gbwt -v small/xy2.vcf.gz -o x.gbwt -x x.vg vg prune -u -m x.mapping -g x.gbwt -e 1 x.vg > x.unfolded.vg rm -f expected.gfa diff --git a/test/t/06_vg_index.t b/test/t/06_vg_index.t index a8ac6add8dc..0e3edf78dbc 100644 --- a/test/t/06_vg_index.t +++ b/test/t/06_vg_index.t @@ -7,7 +7,7 @@ PATH=../bin:$PATH # for vg export LC_ALL="en_US.utf8" # force ekg's favorite sort order -plan tests 54 +plan tests 45 # Single graph without haplotypes vg construct -r small/x.fa -v small/x.vcf.gz > x.vg @@ -39,9 +39,6 @@ rm -f x3.gcsa x3.gcsa.lcp # Single graph with haplotypes vg construct -r small/x.fa -v small/x.vcf.gz -a > x.vg -vg index -G x.gbwt -v small/x.vcf.gz x.vg -is $? 0 "building a GBWT index of a graph with haplotypes" - vg index -x x.xg x.vg is $? 0 "building an XG index of a graph with haplotypes" @@ -55,51 +52,25 @@ is $(vg paths -x x-ap.xg -L | wc -l) $(vg paths -v x.vg -L | wc -l) "xg index do vg index -g x.gcsa x.vg is $? 0 "building a GCSA index of a graph with haplotypes" -vg index -x x2.xg -G x2.gbwt -v small/x.vcf.gz -g x2.gcsa x.vg +vg index -x x2.xg -g x2.gcsa x.vg is $? 0 "building all indexes at once" -cmp x.xg x2.xg && cmp x.gbwt x2.gbwt && cmp x.gcsa x2.gcsa && cmp x.gcsa.lcp x2.gcsa.lcp +cmp x.xg x2.xg && cmp x.gcsa x2.gcsa && cmp x.gcsa.lcp x2.gcsa.lcp is $? 0 "the indexes are identical" -vg index -x x2-ap.xg -G x2-ap.gbwt -v small/x.vcf.gz -g x2-ap.gcsa x.vg -L +vg index -x x2-ap.xg -g x2-ap.gcsa x.vg -L is $? 0 "building all indexes at once, while leaving alt paths in xg" -cmp x.gbwt x2-ap.gbwt && cmp x.gcsa x2-ap.gcsa && cmp x.gcsa.lcp x2-ap.gcsa.lcp +cmp x.gcsa x2-ap.gcsa && cmp x.gcsa.lcp x2-ap.gcsa.lcp is $? 0 "the indexes are identical with -L" is $(vg paths -x x2-ap.xg -L | wc -l) $(vg paths -v x.vg -L | wc -l) "xg index does contains alt paths with index -L all at once" -# Exclude a sample from the GBWT index -vg index -G empty.gbwt -v small/x.vcf.gz --exclude 1 x.vg -is $? 0 "samples can be excluded from haplotype indexing" -is $(vg gbwt -c empty.gbwt) 0 "excluded samples were not included in the GBWT index" - -# Make GBWT from GAM -vg paths -v x.vg -X -Q _alt > x-alts.gam -vg index x.vg -M x-alts.gam -G x-gam.gbwt -# Make GBWT from GAF -vg convert x.vg -G x-alts.gam > x-alts.gaf -vg index x.vg -F x-alts.gaf -G x-gaf.gbwt -cmp x-gaf.gbwt x-gam.gbwt -is $? 0 "GBWT from GAF same as from GAM" - rm -f x.vg -rm -f x.xg x-ap.xg x.gbwtx.gcsa x.gcsa.lcp -rm -f x2.xg x2.gbwt x2.gcsa x2.gcsa.lcp -rm -f x2-ap.xg x2-ap.gbwt x2-ap.gcsa x2-ap.gcsa.lcp -rm -f empty.gbwt -rm -f x-alts.gam x-alts.gaf x-gam.gbwt x-gaf.gbwt - - -# Subregion graph with haplotypes -vg construct -r small/x.fa -v small/x.vcf.gz -a --region x:100-200 > x.part.vg - -vg index -x x.part.xg -G x.part.gbwt --region x:100-200 -v small/x.vcf.gz x.part.vg 2>log.txt -is $? 0 "building GBWT index for a regional graph" - -is "$(cat log.txt | wc -c)" "0" "no warnings about missing variants produced" - -rm -f x.part.vg x.part.xg x.part.gbwt log.txt +rm -f x.xg x-ap.xg x.gcsa x.gcsa.lcp +rm -f x2.xg x2.gcsa x2.gcsa.lcp +rm -f x2-ap.xg x2-ap.gcsa x2-ap.gcsa.lcp +rm -f x-alts.gam x-alts.gaf # Multiple graphs without haplotypes @@ -129,9 +100,6 @@ vg construct -r small/xy.fa -v small/xy2.vcf.gz -R x -C -a > x.vg 2> /dev/null vg construct -r small/xy.fa -v small/xy2.vcf.gz -R y -C -a > y.vg 2> /dev/null vg ids -j x.vg y.vg -vg index -G x.gbwt -v small/xy2.vcf.gz x.vg && vg index -G y.gbwt -v small/xy2.vcf.gz y.vg && vg gbwt -m -f -o xy.gbwt x.gbwt y.gbwt -is $? 0 "building a GBWT index of multiple graphs with haplotypes" - vg index -x xy.xg x.vg y.vg is $? 0 "building an XG index of multiple graphs with haplotypes" @@ -144,31 +112,14 @@ is $? 0 "building XG and GCSA indexes at once" vg index -x xy-alt.xg -L x.vg y.vg is $? 0 "building an XG index with alt paths" -vg index -G xy2.gbwt -v small/xy2.vcf.gz xy-alt.xg -is $? 0 "building a GBWT index from an XG index" - -cmp xy.xg xy2.xg && cmp xy.gcsa xy2.gcsa && cmp xy.gcsa.lcp xy2.gcsa.lcp && cmp xy.gbwt xy2.gbwt +cmp xy.xg xy2.xg && cmp xy.gcsa xy2.gcsa && cmp xy.gcsa.lcp xy2.gcsa.lcp is $? 0 "the indexes are identical" rm -f x.vg y.vg -rm -f x.gbwt y.gbwt -rm -f xy.xg xy.gbwt xy.gcsa xy.gcsa.lcp -rm -f xy2.xg xy2.gbwt xy2.gcsa xy2.gcsa.lcp +rm -f xy.xg xy.gcsa xy.gcsa.lcp +rm -f xy2.xg xy2.gcsa xy2.gcsa.lcp rm -f xy-alt.xg - -# GBWT construction options -vg construct -r small/xy.fa -v small/xy2.vcf.gz -R x -C -a > x.vg 2> /dev/null - -vg index -G x_ref.gbwt -T x.vg -is $? 0 "GBWT can be built for paths" - -rm -f x_ref.gbwt - -# We do not test GBWT construction parameters (-B, -u, -n) because they matter only for large inputs. -# We do not test chromosome-length path generation (-P, -o) for the same reason. - - # Other tests vg construct -m 1000 -r small/x.fa -v small/x.vcf.gz >x.vg vg index -x x.xg x.vg bogus123.vg 2>/dev/null diff --git a/test/t/07_vg_map.t b/test/t/07_vg_map.t index 08a99edc398..a7c1e41c9a8 100644 --- a/test/t/07_vg_map.t +++ b/test/t/07_vg_map.t @@ -152,7 +152,8 @@ is $? 1 "error on vg map -f (paired, RHS)" # Now do the GBWT vg construct -a -r small/x.fa -v small/x.vcf.gz >x.vg -vg index -x x.xg -g x.gcsa -v small/x.vcf.gz --gbwt-name x.gbwt -k 16 x.vg +vg index -x x.xg -g x.gcsa -k 16 x.vg +vg gbwt -v small/x.vcf.gz -o x.gbwt -x x.vg # This read is all ref which matches no haplotype in x.vcf.gz and visits some unused nodes is "$(vg map -x x.xg -g x.gcsa --gbwt-name x.gbwt --hap-exp 1 --full-l-bonus 0 -f reads/x.unvisited.fq -j | jq -r '.score')" "36" "mapping a read that touches unused nodes gets the base score" diff --git a/test/t/11_vg_paths.t b/test/t/11_vg_paths.t index e21a8fe2c09..e83fa8de659 100644 --- a/test/t/11_vg_paths.t +++ b/test/t/11_vg_paths.t @@ -11,7 +11,8 @@ plan tests 26 vg construct -r small/x.fa -v small/x.vcf.gz -a > x.vg vg construct -r small/x.fa -v small/x.vcf.gz > x2.vg -vg index -x x.xg -G x.gbwt -v small/x.vcf.gz x.vg +vg index -x x.xg x.vg +vg gbwt -v small/x.vcf.gz -o x.gbwt -x x.vg # List path/thread names from various input formats is "$(vg paths --list -v x2.vg)" "x" "path listing works from vg" diff --git a/test/t/13_vg_sim.t b/test/t/13_vg_sim.t index 0481a82e463..ecc0799eeda 100644 --- a/test/t/13_vg_sim.t +++ b/test/t/13_vg_sim.t @@ -11,7 +11,7 @@ plan tests 36 vg construct -r small/x.fa -v small/x.vcf.gz >x.vg vg construct -r small/x.fa -v small/x.vcf.gz -a >x2.vg vg index -x x.xg x.vg -vg index -G x.gbwt -v small/x.vcf.gz x2.vg +vg gbwt -o x.gbwt -v small/x.vcf.gz -x x2.vg is $(vg sim -l 100 -n 100 -x x.xg | wc -l) 100 \ "vg sim creates the correct number of reads" @@ -58,7 +58,7 @@ rm -f x.vg x2.vg x.xg x.gbwt n.vg n.fa n.xg vg construct -r small/xy.fa -v small/x.vcf.gz -a >xy.vg vg index -x xy.xg xy.vg -vg index -G xy.gbwt -v small/x.vcf.gz xy.vg +vg gbwt -o xy.gbwt -v small/x.vcf.gz -x xy.vg vg sim -s 12345 -n 1000 -l 2 -e 0.1 -x xy.xg -g xy.gbwt --sample-name 1 --any-path >/dev/null is $? "0" "Sample simulation works along with --any-path" diff --git a/test/t/18_vg_call.t b/test/t/18_vg_call.t index 6a457f4eb9f..b97cee3fb08 100644 --- a/test/t/18_vg_call.t +++ b/test/t/18_vg_call.t @@ -89,7 +89,7 @@ is "${REF_COUNT_V}" "${REF_COUNT_A}" "Same number of reference calls with -a as # Output snarl traversals into a GBWT then genotype that vg call HGSVC_alts.xg -k HGSVC_alts.pack -s HG00514 -T | gzip > HGSVC_travs.gaf.gz -vg index HGSVC_alts.xg -F HGSVC_travs.gaf.gz -G HGSVC_travs.gbwt +vg gbwt -o HGSVC_travs.gbwt -x HGSVC_alts.xg -A HGSVC_travs.gaf.gz vg call HGSVC_alts.xg -k HGSVC_alts.pack -g HGSVC_travs.gbwt -s HG00514 > HGSVC_travs.vcf vg call HGSVC_alts.xg -k HGSVC_alts.pack -s HG00514 > HGSVC_direct.vcf # extract the called genotypes diff --git a/test/t/26_deconstruct.t b/test/t/26_deconstruct.t index 03c7f8f3a87..47a28c4ae35 100644 --- a/test/t/26_deconstruct.t +++ b/test/t/26_deconstruct.t @@ -121,7 +121,8 @@ is "$?" 0 "deconstructing vg graph gives same output as xg graph" rm -f tiny_names.gfa tiny_names.vg tiny_names.xg tiny_names_decon.vcf tiny_names_decon_vg.vcf vg construct -r small/x.fa -v small/x.vcf.gz -a > x.vg -vg index -x x.xg -G x.gbwt -v small/x.vcf.gz x.vg +vg index -x x.xg x.vg +vg gbwt -v small/x.vcf.gz -o x.gbwt -x x.vg vg deconstruct x.xg -g x.gbwt | bgzip > x.decon.vcf.gz tabix -f -p vcf x.decon.vcf.gz cat small/x.fa | bcftools consensus small/x.vcf.gz -s 1 -H 1 > small.s1.h1.fa diff --git a/test/t/33_vg_mpmap.t b/test/t/33_vg_mpmap.t index 16af8c4cef9..e569b2b9ed6 100644 --- a/test/t/33_vg_mpmap.t +++ b/test/t/33_vg_mpmap.t @@ -11,7 +11,8 @@ plan tests 21 # Exercise the GBWT # Index a couple of nearly identical contigs vg construct -m 1000 -a -r small/xy.fa -v small/xy2.vcf.gz >xy2.vg -vg index -x xy2.xg -g xy2.gcsa -v small/xy2.vcf.gz --gbwt-name xy2.gbwt -k 16 xy2.vg +vg index -x xy2.xg -g xy2.gcsa -k 16 xy2.vg +vg gbwt -v small/xy2.vcf.gz -o xy2.gbwt -x xy2.vg # We turn off the background model calibration with -B and ignore it with -P 1 diff --git a/test/t/37_vg_gbwt.t b/test/t/37_vg_gbwt.t index 9fdc0c8a4fb..870b5d9674a 100644 --- a/test/t/37_vg_gbwt.t +++ b/test/t/37_vg_gbwt.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 159 +plan tests 149 # Build vg graphs for two chromosomes @@ -21,10 +21,6 @@ vg index -x xy-alt.xg -L x.vg y.vg # Single chromosome: haplotypes vg gbwt -x x.vg -o x.gbwt -v small/xy2.vcf.gz is $? 0 "chromosome X GBWT with vg gbwt" -vg index -G x2.gbwt -v small/xy2.vcf.gz x.vg -is $? 0 "chromosome X GBWT with vg index" -cmp x.gbwt x2.gbwt -is $? 0 "identical construction results with vg gbwt and vg index" vg gbwt -x x.vg -o parse --parse-only -v small/xy2.vcf.gz is $? 0 "chromosome X VCF parse" ../deps/gbwt/bin/build_gbwt -p -r parse_x > /dev/null 2> /dev/null @@ -41,20 +37,16 @@ is $(vg gbwt -T x.gbwt | wc -l) 2 "chromosome X: 2 path names" is $(vg gbwt -C -L x.gbwt | wc -l) 1 "chromosome X: 1 contig name" is $(vg gbwt -S -L x.gbwt | wc -l) 1 "chromosome X: 1 sample name" -rm -f x.gbwt x2.gbwt parse_x.gbwt +rm -f x.gbwt parse_x.gbwt rm -f parse_x parse_x_0_1 # Single chromosome: paths vg gbwt -E -o x.ref.gbwt -x x.vg is $? 0 "chromosome X reference GBWT with vg gbwt" -vg index -G x2.ref.gbwt -T x.vg -is $? 0 "chromosome X reference GBWT with vg index" -cmp x.ref.gbwt x2.ref.gbwt -is $? 0 "identical construction results with vg gbwt and vg index" is $(vg gbwt -c x.ref.gbwt) 1 "chromosome X reference: 1 path" -rm -f x.ref.gbwt x2.ref.gbwt +rm -f x.ref.gbwt # Single chromosome: alignments @@ -62,17 +54,13 @@ vg paths -v x.vg -X -Q _alt > x.alts.gam vg convert -G x.alts.gam x.vg > x.alts.gaf vg gbwt -A -o x.alts.gaf.gbwt -x x.vg x.alts.gaf is $? 0 "chromosome X GAF with vg gbwt" -vg index -F x.alts.gaf -G x2.alts.gaf.gbwt x.vg -is $? 0 "chromosome X GAF with vg index" -cmp x.alts.gaf.gbwt x2.alts.gaf.gbwt -is $? 0 "identical construction results with vg gbwt and vg index" vg gbwt -A --gam-format -o x.alts.gam.gbwt -x x.vg x.alts.gam is $? 0 "chromosome X GAM with vg gbwt" cmp x.alts.gaf.gbwt x.alts.gaf.gbwt is $? 0 "identical construction results from GAF and GAM" rm -f x.alts.gam x.alts.gaf -rm -f x.alts.gaf.gbwt x2.alts.gaf.gbwt x.alts.gam.gbwt +rm -f x.alts.gaf.gbwt x.alts.gam.gbwt # Graph region: haplotypes @@ -80,12 +68,8 @@ vg construct -r small/x.fa -v small/x.vcf.gz -a --region x:100-200 > x.part.vg vg gbwt -x x.part.vg -o x.part.gbwt --vcf-region x:100-200 -v small/x.vcf.gz 2> log.txt is $? 0 "chromosome X subgraph GBWT with vg gbwt" is "$(cat log.txt | wc -c)" 0 "no warnings about missing variants" -vg index -G x2.part.gbwt --region x:100-200 -v small/x.vcf.gz x.part.vg 2> log.txt -is $? 0 "chromosome X subgraph GBWT with vg index" -cmp x.part.gbwt x2.part.gbwt -is $? 0 "identical construction results with vg gbwt and vg index" -rm -f x.part.vg x.part.gbwt x2.part.gbwt log.txt +rm -f x.part.vg x.part.gbwt log.txt # Multiple chromosomes: haplotypes @@ -127,16 +111,12 @@ rm -f xy.1000gp.gbwt # Multiple chromosomes: paths as contigs vg gbwt -E -o xy.contigs.gbwt -x xy.xg is $? 0 "paths as contigs with vg gbwt" -vg index -G xy2.contigs.gbwt -T xy.xg -is $? 0 "paths as contigs with vg index" -cmp xy.contigs.gbwt xy2.contigs.gbwt -is $? 0 "identical construction results with vg gbwt and vg index" is $(vg gbwt -c xy.contigs.gbwt) 2 "paths as contigs: 2 paths" is $(vg gbwt -C xy.contigs.gbwt) 2 "paths as contigs: 2 contigs" is $(vg gbwt -H xy.contigs.gbwt) 1 "paths as contigs: 1 haplotype" is $(vg gbwt -S xy.contigs.gbwt) 1 "paths as contigs: 1 sample" -rm -f xy.contigs.gbwt xy2.contigs.gbwt +rm -f xy.contigs.gbwt # Build an r-index diff --git a/test/t/38_vg_prune.t b/test/t/38_vg_prune.t index 3cd5db9cde0..1aa7533c260 100644 --- a/test/t/38_vg_prune.t +++ b/test/t/38_vg_prune.t @@ -10,7 +10,7 @@ plan tests 21 # Build a graph with one path and two threads vg construct -m 32 -r small/xy.fa -v small/xy2.vcf.gz -R x -C -a > x.vg 2> /dev/null -vg index -G x.gbwt -v small/xy2.vcf.gz x.vg +vg gbwt -o x.gbwt -v small/xy2.vcf.gz -x x.vg # Basic pruning: 5 components, 51 nodes, 51 edges vg prune -e 1 x.vg > y.vg @@ -54,8 +54,8 @@ rm -f x.vg x.gbwt vg construct -m 32 -r small/xy.fa -v small/xy2.vcf.gz -R x -C -a > x.vg 2> /dev/null vg construct -m 32 -r small/xy.fa -v small/xy2.vcf.gz -R y -C -a > y.vg 2> /dev/null vg ids -j -m xy.mapping x.vg y.vg -vg index -G x.gbwt -v small/xy2.vcf.gz x.vg -vg index -G y.gbwt -v small/xy2.vcf.gz y.vg +vg gbwt -o x.gbwt -v small/xy2.vcf.gz -x x.vg +vg gbwt -o y.gbwt -v small/xy2.vcf.gz -x y.vg # Prune a single-chromosome graph using multi-chromosome GBWT vg gbwt -m -o xy.gbwt x.gbwt y.gbwt diff --git a/test/t/46_vg_minimizer.t b/test/t/46_vg_minimizer.t index 06f9133f09d..e4090b54c9d 100644 --- a/test/t/46_vg_minimizer.t +++ b/test/t/46_vg_minimizer.t @@ -62,8 +62,10 @@ rm -f x.vg x.xg x.gbwt x.snarls x.dist x.mi x.gg x.gbz vg construct -r small/xy.fa -v small/xy2.vcf.gz -R x -C -a > x.vg 2> /dev/null vg construct -r small/xy.fa -v small/xy2.vcf.gz -R y -C -a > y.vg 2> /dev/null vg ids -j x.vg y.vg -vg index -x x.xg -G x.gbwt -v small/xy2.vcf.gz x.vg -vg index -x y.xg -G y.gbwt -v small/xy2.vcf.gz y.vg +vg index -x x.xg x.vg +vg gbwt -o x.gbwt -x x.vg -v small/xy2.vcf.gz +vg index -x y.xg y.vg +vg gbwt -o y.gbwt -x y.vg -v small/xy2.vcf.gz # Appending to the index vg minimizer --no-dist -t 1 -o x.mi -g x.gbwt x.xg From 5a2c3af1fda989c2e31ad9da5c1897ec999ac91a Mon Sep 17 00:00:00 2001 From: Jouni Siren Date: Thu, 31 Oct 2024 23:01:56 -0700 Subject: [PATCH 05/12] Update SDSL, GCSA, GBWT, GBWTGraph --- deps/gbwt | 2 +- deps/gbwtgraph | 2 +- deps/gcsa2 | 2 +- deps/sdsl-lite | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/deps/gbwt b/deps/gbwt index 029c2b45667..2aab62f0664 160000 --- a/deps/gbwt +++ b/deps/gbwt @@ -1 +1 @@ -Subproject commit 029c2b456675eab0fbb55de21c8002e1660cb0d3 +Subproject commit 2aab62f0664b2ce8eb4cffd6b3872c89b85fdfa6 diff --git a/deps/gbwtgraph b/deps/gbwtgraph index de61d340695..d2a423b5eb5 160000 --- a/deps/gbwtgraph +++ b/deps/gbwtgraph @@ -1 +1 @@ -Subproject commit de61d340695f64b8aa978816b195d6687e7fda5a +Subproject commit d2a423b5eb54c3c2f0676cb2925b46ad0ba7e189 diff --git a/deps/gcsa2 b/deps/gcsa2 index 7dbf9305817..8b6b049ab64 160000 --- a/deps/gcsa2 +++ b/deps/gcsa2 @@ -1 +1 @@ -Subproject commit 7dbf93058171b5af420f1b6166d4d2bf54cd8748 +Subproject commit 8b6b049ab6444e891bb7c38ec8f38169ce62e5fb diff --git a/deps/sdsl-lite b/deps/sdsl-lite index 86fa3534c1b..115dfd5b737 160000 --- a/deps/sdsl-lite +++ b/deps/sdsl-lite @@ -1 +1 @@ -Subproject commit 86fa3534c1bf02b5468bbe58e2e0b8f6ae2d6fa4 +Subproject commit 115dfd5b7371ef99a1e0611389b956bb88bf9fc4 From ba0ed68ebff2d05dc23e6190b572284d7f750527 Mon Sep 17 00:00:00 2001 From: Jouni Siren Date: Sat, 2 Nov 2024 04:09:56 -0700 Subject: [PATCH 06/12] Multithreaded GAF/GAM to GBWT --- src/gbwt_helper.cpp | 20 ----- src/gbwt_helper.hpp | 7 -- src/haplotype_indexer.cpp | 146 +++++++++++++++++++++++++++-------- src/haplotype_indexer.hpp | 11 ++- src/subcommand/gbwt_main.cpp | 4 +- test/t/37_vg_gbwt.t | 4 +- 6 files changed, 126 insertions(+), 66 deletions(-) diff --git a/src/gbwt_helper.cpp b/src/gbwt_helper.cpp index 0958ba45533..19306a7dfe3 100644 --- a/src/gbwt_helper.cpp +++ b/src/gbwt_helper.cpp @@ -88,26 +88,6 @@ gbwt::size_type gbwt_node_width(const HandleGraph& graph) { return gbwt::bit_length(gbwt::Node::encode(graph.max_node_id(), true)); } -void finish_gbwt_constuction(gbwt::GBWTBuilder& builder, - const std::vector& sample_names, - const std::vector& contig_names, - size_t haplotype_count, bool print_metadata, - const std::string& header) { - - builder.finish(); - builder.index.metadata.setSamples(sample_names); - builder.index.metadata.setHaplotypes(haplotype_count); - builder.index.metadata.setContigs(contig_names); - if (print_metadata) { - #pragma omp critical - { - std::cerr << header << ": "; - gbwt::operator<<(std::cerr, builder.index.metadata); - std::cerr << std::endl; - } - } -} - //------------------------------------------------------------------------------ void load_gbwt(gbwt::GBWT& index, const std::string& filename, bool show_progress) { diff --git a/src/gbwt_helper.hpp b/src/gbwt_helper.hpp index 33e1545066c..57b819a579b 100644 --- a/src/gbwt_helper.hpp +++ b/src/gbwt_helper.hpp @@ -67,13 +67,6 @@ gbwt::vector_type path_predecessors(const PathHandleGraph& graph, const std::str /// Determine the node width in bits for the GBWT nodes based on the given graph. gbwt::size_type gbwt_node_width(const HandleGraph& graph); -/// Finish GBWT construction and optionally print the metadata. -void finish_gbwt_constuction(gbwt::GBWTBuilder& builder, - const std::vector& sample_names, - const std::vector& contig_names, - size_t haplotype_count, bool print_metadata, - const std::string& header = "GBWT"); - //------------------------------------------------------------------------------ /* diff --git a/src/haplotype_indexer.cpp b/src/haplotype_indexer.cpp index 528160485cd..b7c935d7e05 100644 --- a/src/haplotype_indexer.cpp +++ b/src/haplotype_indexer.cpp @@ -4,10 +4,12 @@ #include #include +#include #include #include #include +#include #include #include "gbwt_helper.hpp" @@ -394,7 +396,7 @@ std::unique_ptr HaplotypeIndexer::build_gbwt(const std::vecto // And add every path that passes the filter (including haplotype paths) from the source graph. gbwtgraph::store_paths(builder, *graph, {PathSense::GENERIC, PathSense::REFERENCE, PathSense::HAPLOTYPE}, &path_filter); - // Finish the construction for this set of threads and put the index back. + // Finish the construction for this set of paths and put the index back. builder.finish(); builder.swapIndex(*index); } @@ -413,24 +415,43 @@ std::unique_ptr HaplotypeIndexer::build_gbwt(const PathHandle return build_gbwt({}, "GBWT", &graph); } -std::unique_ptr HaplotypeIndexer::build_gbwt(const PathHandleGraph& graph, - const std::vector& aln_filenames, const std::string& aln_format) const { +std::unique_ptr HaplotypeIndexer::build_gbwt(const HandleGraph& graph, + const std::vector& aln_filenames, const std::string& aln_format, size_t parallel_jobs) const { - // GBWT metadata. - std::vector sample_names, contig_names; - std::map> sample_info; // name -> (id, count) - contig_names.push_back("0"); // An artificial contig. - size_t haplotype_count = 0; + // Handle multithreading and parallel jobs. + parallel_jobs = std::max(1, parallel_jobs); + int old_threads = omp_get_max_threads(); + omp_set_num_threads(parallel_jobs); + + // Partition the graph into construction jobs. + if (this->show_progress) { + #pragma omp critical + { + std::cerr << "Partitioning the graph into GBWT construction jobs" << std::endl; + } + } + size_t target_size = graph.get_node_count() / parallel_jobs; + gbwtgraph::ConstructionJobs jobs = gbwtgraph::gbwt_construction_jobs(graph, target_size); + if (this->show_progress) { + #pragma omp critical + { + std::cerr << "Created " << jobs.size() << " parallel construction jobs" << std::endl; + } + } // GBWT construction. - gbwt::GBWTBuilder builder(gbwt_node_width(graph), this->gbwt_buffer_size * gbwt::MILLION, this->id_interval); - builder.index.addMetadata(); + std::vector builder_mutexes(jobs.size()); + std::vector> builders(jobs.size()); + std::vector> read_names(jobs.size()); // TODO: Concatenated strings + starting offsets may save space. + for (size_t i = 0; i < jobs.size(); i++) { + builders[i].reset(new gbwt::GBWTBuilder(gbwt_node_width(graph), this->gbwt_buffer_size * gbwt::MILLION, this->id_interval)); + } // Actual work. if (this->show_progress) { #pragma omp critical { - std::cerr << "Converting " << aln_format << " to threads" << std::endl; + std::cerr << "Converting " << aln_format << " to GBWT paths" << std::endl; } } std::function lambda = [&](Alignment& aln) { @@ -438,37 +459,98 @@ std::unique_ptr HaplotypeIndexer::build_gbwt(const PathHandle for (auto& m : aln.path().mapping()) { buffer.push_back(mapping_to_gbwt(m)); } - builder.insert(buffer, true); // Insert in both orientations. - size_t sample_id = 0, sample_count = 0; - auto iter = sample_info.find(aln.name()); - if (iter == sample_info.end()) { - sample_id = sample_names.size(); - sample_names.push_back(aln.name()); - sample_info[aln.name()] = std::pair(sample_id, sample_count); - haplotype_count++; - } else { - sample_id = iter->second.first; - sample_count = iter->second.second; - iter->second.second++; + size_t job_id = 0; + if (buffer.size() > 0) { + job_id = jobs.job(gbwt::Node::id(buffer.front())); + if (job_id >= jobs.size()) { + job_id = 0; + } + } + { + // Insert the path into the appropriate builder and record the read name. + std::lock_guard lock(builder_mutexes[job_id]); + builders[job_id]->insert(buffer, true); + read_names[job_id].push_back(aln.name()); } - builder.index.metadata.addPath(sample_id, 0, 0, sample_count); }; for (auto& file_name : aln_filenames) { if (aln_format == "GAM") { get_input_file(file_name, [&](istream& in) { - vg::io::for_each(in, lambda); + vg::io::for_each_parallel(in, lambda); }); } else { assert(aln_format == "GAF"); - vg::io::gaf_unpaired_for_each(graph, file_name, lambda); + vg::io::gaf_unpaired_for_each_parallel(graph, file_name, lambda); } } - - // Finish the construction and extract the index. - finish_gbwt_constuction(builder, sample_names, contig_names, haplotype_count, this->show_progress); - std::unique_ptr built(new gbwt::DynamicGBWT()); - builder.swapIndex(*built); - return built; + + // Finish the construction and convert to compressed GBWT. + std::vector partial_indexes(jobs.size()); + #pragma omp parallel for schedule(dynamic, 1) + for (size_t i = 0; i < jobs.size(); i++) { + builders[i]->finish(); + partial_indexes[i] = gbwt::GBWT(builders[i]->index); + builders[i].reset(); + } + + // Merge the partial indexes. + if (this->show_progress) { + #pragma omp critical + { + std::cerr << "Merging the partial GBWTs" << std::endl; + } + } + std::unique_ptr result(new gbwt::GBWT(partial_indexes)); + partial_indexes.clear(); + + // Create the metadata. + // TODO: This is quite slow. + if (this->show_progress) { + #pragma omp critical + { + std::cerr << "Creating metadata" << std::endl; + } + } + result->addMetadata(); + result->metadata.setContigs({ "unknown" }); + { + std::map> read_info; // name -> (sample id, fragment count) + for (auto& names : read_names) { + for (const std::string& name : names) { + size_t sample_id = 0, fragment_count = 0; + auto iter = read_info.find(name); + if (iter == read_info.end()) { + sample_id = read_info.size(); + read_info[name] = std::pair(sample_id, fragment_count); + } else { + sample_id = iter->second.first; + fragment_count = iter->second.second; + iter->second.second++; + } + result->metadata.addPath(sample_id, 0, 0, fragment_count); + } + names = std::vector(); + } + std::vector sample_names(read_info.size()); + for (auto& p : read_info) { + sample_names[p.second.first] = p.first; + } + read_info.clear(); + result->metadata.setSamples(sample_names); + result->metadata.setHaplotypes(sample_names.size()); + } + if (this->show_progress) { + #pragma omp critical + { + std::cerr << "GBWT: "; + gbwt::operator<<(std::cerr, result->metadata); + std::cerr << std::endl; + } + } + + // Restore the number of threads. + omp_set_num_threads(old_threads); + return result; } } diff --git a/src/haplotype_indexer.hpp b/src/haplotype_indexer.hpp index f555aec947b..ca212a94267 100644 --- a/src/haplotype_indexer.hpp +++ b/src/haplotype_indexer.hpp @@ -135,10 +135,15 @@ class HaplotypeIndexer : public Progressive { * the same name, the corresponding GBWT path names will have the same * sample identifier but different values in the count field. * - * aln_format can be "GAM" or "GAF" + * aln_format can be "GAM" or "GAF". + * + * Runs approximately the given number of jobs in parallel. The exact + * number depends on the sizes of weakly connected components in the graph. + * Each job uses at most 2 threads. */ - std::unique_ptr build_gbwt(const PathHandleGraph& graph, - const std::vector& aln_filenames, const std::string& aln_format) const; + std::unique_ptr build_gbwt(const HandleGraph& graph, + const std::vector& aln_filenames, const std::string& aln_format, + size_t parallel_jobs = 1) const; }; } diff --git a/src/subcommand/gbwt_main.cpp b/src/subcommand/gbwt_main.cpp index 5e55928a4fe..4eca4e47a81 100644 --- a/src/subcommand/gbwt_main.cpp +++ b/src/subcommand/gbwt_main.cpp @@ -1389,8 +1389,8 @@ void step_1_build_gbwts(GBWTHandler& gbwts, GraphHandler& graphs, GBWTConfig& co if (config.show_progress) { std::cerr << "Input type: " << (config.gam_format ? "GAM" : "GAF") << std::endl; } - // FIXME: Parallelize this. - std::unique_ptr temp = config.haplotype_indexer.build_gbwt(*(graphs.path_graph), config.input_filenames, (config.gam_format ? "GAM" : "GAF")); + std::unique_ptr temp = + config.haplotype_indexer.build_gbwt(*(graphs.path_graph), config.input_filenames, (config.gam_format ? "GAM" : "GAF"), config.build_jobs); gbwts.use(*temp); } diff --git a/test/t/37_vg_gbwt.t b/test/t/37_vg_gbwt.t index 870b5d9674a..1046da788d3 100644 --- a/test/t/37_vg_gbwt.t +++ b/test/t/37_vg_gbwt.t @@ -52,9 +52,9 @@ rm -f x.ref.gbwt # Single chromosome: alignments vg paths -v x.vg -X -Q _alt > x.alts.gam vg convert -G x.alts.gam x.vg > x.alts.gaf -vg gbwt -A -o x.alts.gaf.gbwt -x x.vg x.alts.gaf +vg gbwt -A --num-jobs 1 -o x.alts.gaf.gbwt -x x.vg x.alts.gaf is $? 0 "chromosome X GAF with vg gbwt" -vg gbwt -A --gam-format -o x.alts.gam.gbwt -x x.vg x.alts.gam +vg gbwt -A --num-jobs 1 --gam-format -o x.alts.gam.gbwt -x x.vg x.alts.gam is $? 0 "chromosome X GAM with vg gbwt" cmp x.alts.gaf.gbwt x.alts.gaf.gbwt is $? 0 "identical construction results from GAF and GAM" From a585bbcffa162f08dbe2ae9a745e37dde8bb5595 Mon Sep 17 00:00:00 2001 From: Jouni Siren Date: Sun, 3 Nov 2024 01:31:20 -0800 Subject: [PATCH 07/12] Faster metadata construction for GAM/GAF GBWT --- src/haplotype_indexer.cpp | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/haplotype_indexer.cpp b/src/haplotype_indexer.cpp index b7c935d7e05..75399a86e64 100644 --- a/src/haplotype_indexer.cpp +++ b/src/haplotype_indexer.cpp @@ -2,8 +2,9 @@ * \file haplotype_indexer.cpp: implementations of haplotype indexing with the GBWT */ +#include "haplotype_indexer.hpp" + #include -#include #include #include #include @@ -14,10 +15,9 @@ #include "gbwt_helper.hpp" -#include "haplotype_indexer.hpp" - -#include "path.hpp" #include "alignment.hpp" +#include "hash_map.hpp" +#include "path.hpp" using namespace std; @@ -442,7 +442,9 @@ std::unique_ptr HaplotypeIndexer::build_gbwt(const HandleGraph& grap // GBWT construction. std::vector builder_mutexes(jobs.size()); std::vector> builders(jobs.size()); - std::vector> read_names(jobs.size()); // TODO: Concatenated strings + starting offsets may save space. + // This is a bit inefficient, as read names are often longer than the SSO threshold for GCC (but not for Clang). + // TODO: Maybe use concatenated 0-terminated names? + std::vector> read_names(jobs.size()); for (size_t i = 0; i < jobs.size(); i++) { builders[i].reset(new gbwt::GBWTBuilder(gbwt_node_width(graph), this->gbwt_buffer_size * gbwt::MILLION, this->id_interval)); } @@ -504,7 +506,6 @@ std::unique_ptr HaplotypeIndexer::build_gbwt(const HandleGraph& grap partial_indexes.clear(); // Create the metadata. - // TODO: This is quite slow. if (this->show_progress) { #pragma omp critical { @@ -514,14 +515,15 @@ std::unique_ptr HaplotypeIndexer::build_gbwt(const HandleGraph& grap result->addMetadata(); result->metadata.setContigs({ "unknown" }); { - std::map> read_info; // name -> (sample id, fragment count) + // We can use 32-bit values, as GBWT metadata uses them as well. + string_hash_map> read_info; // name -> (sample id, fragment count) for (auto& names : read_names) { for (const std::string& name : names) { - size_t sample_id = 0, fragment_count = 0; + std::uint32_t sample_id = 0, fragment_count = 0; auto iter = read_info.find(name); if (iter == read_info.end()) { sample_id = read_info.size(); - read_info[name] = std::pair(sample_id, fragment_count); + read_info[name] = std::make_pair(sample_id, fragment_count); } else { sample_id = iter->second.first; fragment_count = iter->second.second; @@ -535,7 +537,7 @@ std::unique_ptr HaplotypeIndexer::build_gbwt(const HandleGraph& grap for (auto& p : read_info) { sample_names[p.second.first] = p.first; } - read_info.clear(); + read_info = string_hash_map>(); result->metadata.setSamples(sample_names); result->metadata.setHaplotypes(sample_names.size()); } From c1b58030cd2bc4079ff12ea6ec441e3d19cc791f Mon Sep 17 00:00:00 2001 From: Jouni Siren Date: Mon, 4 Nov 2024 00:12:53 -0800 Subject: [PATCH 08/12] Tests for multi-chromosome GAM/GAF GBWT --- test/t/37_vg_gbwt.t | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/test/t/37_vg_gbwt.t b/test/t/37_vg_gbwt.t index 1046da788d3..a5849bc47a5 100644 --- a/test/t/37_vg_gbwt.t +++ b/test/t/37_vg_gbwt.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 149 +plan tests 155 # Build vg graphs for two chromosomes @@ -50,7 +50,7 @@ rm -f x.ref.gbwt # Single chromosome: alignments -vg paths -v x.vg -X -Q _alt > x.alts.gam +vg paths -x x.vg -X -Q _alt > x.alts.gam vg convert -G x.alts.gam x.vg > x.alts.gaf vg gbwt -A --num-jobs 1 -o x.alts.gaf.gbwt -x x.vg x.alts.gaf is $? 0 "chromosome X GAF with vg gbwt" @@ -108,6 +108,25 @@ rm -f x.gbwt y.gbwt xy.merge.gbwt xy.fast.gbwt xy.parallel.gbwt xy.direct.gbwt x rm -f xy.1000gp.gbwt +# Multiple chromosomes: alignments +vg paths -x xy-alt.xg -X -Q _alt > xy.alts.gam +vg convert -G xy.alts.gam xy.xg > xy.alts.gaf +vg gbwt -A --num-jobs 1 -o xy.alts.gaf.gbwt -x xy.xg xy.alts.gaf +is $? 0 "multi-chromosome GAF with vg gbwt" +vg gbwt -A --num-jobs 1 --gam-format -o xy.alts.gam.gbwt -x xy.xg xy.alts.gam +is $? 0 "multi-chromosome GAM with vg gbwt" +cmp xy.alts.gaf.gbwt xy.alts.gaf.gbwt +is $? 0 "identical construction results from GAF and GAM" + +vg gbwt -A --num-jobs 2 -o multi.gbwt -x xy.xg xy.alts.gaf +is $? 0 "multi-chromosome GAF with vg gbwt using multiple jobs" +is $(vg gbwt -c xy.alts.gaf.gbwt) 58 "single job: 58 paths" +is $(vg gbwt -c multi.gbwt) 58 "multiple jobs: 58 paths" + +rm -f xy.alts.gam xy.alts.gaf +rm -f xy.alts.gaf.gbwt xy.alts.gam.gbwt multi.gbwt + + # Multiple chromosomes: paths as contigs vg gbwt -E -o xy.contigs.gbwt -x xy.xg is $? 0 "paths as contigs with vg gbwt" From 6957377dbfbe6a1a65567b964269994b663f14c2 Mon Sep 17 00:00:00 2001 From: Jouni Siren Date: Mon, 4 Nov 2024 00:35:38 -0800 Subject: [PATCH 09/12] Maybe SDSL will now point to the right hash --- deps/sdsl-lite | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/sdsl-lite b/deps/sdsl-lite index 115dfd5b737..4b4b1b8951b 160000 --- a/deps/sdsl-lite +++ b/deps/sdsl-lite @@ -1 +1 @@ -Subproject commit 115dfd5b7371ef99a1e0611389b956bb88bf9fc4 +Subproject commit 4b4b1b8951bf7fbdb5e050f98b5e61453964c167 From a4e7ddd41774d8431160490db26670514c3fa921 Mon Sep 17 00:00:00 2001 From: Jouni Siren Date: Mon, 4 Nov 2024 02:39:48 -0800 Subject: [PATCH 10/12] vg was apparently using another branch of SDSL --- deps/sdsl-lite | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/sdsl-lite b/deps/sdsl-lite index 4b4b1b8951b..ef23c5fe989 160000 --- a/deps/sdsl-lite +++ b/deps/sdsl-lite @@ -1 +1 @@ -Subproject commit 4b4b1b8951bf7fbdb5e050f98b5e61453964c167 +Subproject commit ef23c5fe9899f2b0afa53a32162ed0b06aff0e89 From 612d446061ce17ec14807cb883f33ba9f8895942 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 4 Nov 2024 17:49:53 -0500 Subject: [PATCH 11/12] Use toil-vg that uses vg gbwt to make gbwt --- vgci/vgci.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vgci/vgci.sh b/vgci/vgci.sh index 9a5e53b08f3..ca04cfbe7b4 100755 --- a/vgci/vgci.sh +++ b/vgci/vgci.sh @@ -30,7 +30,7 @@ KEEP_INTERMEDIATE_FILES=0 # Should we show stdout and stderr from tests? If so, set to "-s". SHOW_OPT="" # What toil-vg should we install? -TOIL_VG_PACKAGE="git+https://github.com/vgteam/toil-vg.git@c9bd6414f935e6095574a41a34addbb8d87b41a6" +TOIL_VG_PACKAGE="git+https://github.com/vgteam/toil-vg.git@d16da00b92c491f90433e151cb4f5a89a44395b8" # What toil should we install? # Could be something like "toil[aws,mesos]==3.20.0" # or "git+https://github.com/DataBiosphere/toil.git@3ab74776a3adebd6db75de16985ce9d734f60743#egg=toil[aws,mesos]" From 8a12486abcefc7644f00180f2c4b29916c22384c Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 5 Nov 2024 10:51:24 -0500 Subject: [PATCH 12/12] Use toil-vg that knows about DI2 and uses outfile correctly --- vgci/vgci.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vgci/vgci.sh b/vgci/vgci.sh index ca04cfbe7b4..c48ddbcf553 100755 --- a/vgci/vgci.sh +++ b/vgci/vgci.sh @@ -30,7 +30,7 @@ KEEP_INTERMEDIATE_FILES=0 # Should we show stdout and stderr from tests? If so, set to "-s". SHOW_OPT="" # What toil-vg should we install? -TOIL_VG_PACKAGE="git+https://github.com/vgteam/toil-vg.git@d16da00b92c491f90433e151cb4f5a89a44395b8" +TOIL_VG_PACKAGE="git+https://github.com/vgteam/toil-vg.git@45782c7ba5a372e7c3587ac1c63f4895176fc828" # What toil should we install? # Could be something like "toil[aws,mesos]==3.20.0" # or "git+https://github.com/DataBiosphere/toil.git@3ab74776a3adebd6db75de16985ce9d734f60743#egg=toil[aws,mesos]"