Skip to content

Commit 3286e13

Browse files
authored
Merge pull request #1791 from jltsiren/master
Better haplotype generation
2 parents b6b47f7 + 0956052 commit 3286e13

File tree

5 files changed

+51
-14
lines changed

5 files changed

+51
-14
lines changed

Diff for: deps/gbwt

Diff for: src/subcommand/index_main.cpp

+42-7
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@ void help_index(char** argv) {
4545
<< " -H, --write-haps FILE store the threads as sequences in FILE" << endl
4646
<< " -F, --thread-db FILE write thread database to FILE" << endl
4747
<< " -P, --force-phasing replace unphased genotypes with randomly phased ones" << endl
48+
<< " -o, --discard-overlaps skip overlapping alternate alleles if the overlap cannot be resolved" << endl
49+
<< " -O, --check-overlaps print information on overlapping variants to stderr" << endl
4850
<< " -B, --batch-size N number of samples per batch (default 200)" << endl
4951
<< " -R, --range X..Y process samples X to Y (inclusive)" << endl
5052
<< " -r, --rename V=P rename contig V in the VCFs to path P in the graph (may repeat)" << endl
@@ -137,12 +139,13 @@ int main_index(int argc, char** argv) {
137139
// GBWT
138140
bool index_haplotypes = false, index_paths = false, index_gam = false;
139141
vector<string> gam_file_names;
140-
bool force_phasing = false;
142+
bool force_phasing = false, discard_overlaps = false, check_overlaps = false;
141143
size_t samples_in_batch = 200; // Samples per batch.
142144
std::pair<size_t, size_t> sample_range(0, ~(size_t)0); // The semiopen range of samples to process.
143145
map<string, string> path_to_vcf; // Path name conversion from --rename.
144146
map<string, pair<size_t, size_t>> regions; // Region restrictions for contigs, in VCF name space, as 0-based exclusive-end ranges.
145147
unordered_set<string> excluded_samples; // Excluded sample names from --exclude.
148+
std::set<std::pair<gbwt::size_type, gbwt::size_type>> overlaps; // Unresolved overlaps in the haplotypes.
146149

147150
// GCSA
148151
gcsa::size_type kmer_size = gcsa::Key::MAX_LENGTH;
@@ -180,6 +183,8 @@ int main_index(int argc, char** argv) {
180183
{"gbwt-name", required_argument, 0, 'G'},
181184
{"write-haps", required_argument, 0, 'H'},
182185
{"force-phasing", no_argument, 0, 'P'},
186+
{"discard-overlaps", no_argument, 0, 'o'},
187+
{"check-overlaps", no_argument, 0, 'O'},
183188
{"batch-size", required_argument, 0, 'B'},
184189
{"range", required_argument, 0, 'R'},
185190
{"rename", required_argument, 0, 'r'},
@@ -207,7 +212,7 @@ int main_index(int argc, char** argv) {
207212
};
208213

209214
int option_index = 0;
210-
c = getopt_long (argc, argv, "b:t:px:F:v:TG:H:PB:R:r:I:E:g:i:f:k:X:Z:Vd:maANDP:CM:h",
215+
c = getopt_long (argc, argv, "b:t:px:F:v:TM:G:H:PoOB:R:r:I:E:g:i:f:k:X:Z:Vd:maANDCh",
211216
long_options, &option_index);
212217

213218
// Detect the end of the options.
@@ -262,6 +267,12 @@ int main_index(int argc, char** argv) {
262267
case 'P':
263268
force_phasing = true;
264269
break;
270+
case 'o':
271+
discard_overlaps = true;
272+
break;
273+
case 'O':
274+
check_overlaps = true;
275+
break;
265276
case 'B':
266277
samples_in_batch = std::max(parse<size_t>(optarg), 1ul);
267278
break;
@@ -616,6 +627,9 @@ int main_index(int argc, char** argv) {
616627

617628
// Determine the reference nodes for the current variant and create a variant site.
618629
// If the variant is not an insertion, there should be a path for the ref allele.
630+
// Otherwise the reference position can be determined from the predecessors of the
631+
// alternate alleles.
632+
// TODO: What if the reference visits the same node several times?
619633
var.position--; // Use a 0-based position to get the correct var_name.
620634
std::string var_name = make_variant_id(var);
621635
std::string ref_path_name = "_alt_" + var_name + "_0";
@@ -634,19 +648,24 @@ int main_index(int argc, char** argv) {
634648
bool found = false;
635649
for (size_t alt_index = 1; alt_index < var.alleles.size(); alt_index++) {
636650
std::string alt_path_name = "_alt_" + var_name + "_" + to_string(alt_index);
651+
size_t candidate_pos = 0;
652+
bool candidate_found = false;
637653
auto alt_path_iter = alt_paths.find(alt_path_name);
638654
if (alt_path_iter != alt_paths.end()) {
639655
gbwt::vector_type pred_nodes = predecessors(*xg_index, alt_path_iter->second);
640656
for (auto node : pred_nodes) {
641657
size_t pred_pos = variants.firstOccurrence(node);
642658
if (pred_pos != variants.invalid_position()) {
643-
ref_pos = pred_pos + 1;
659+
candidate_pos = std::max(candidate_pos, pred_pos + 1);
660+
candidate_found = true;
644661
found = true;
645-
break;
646662
}
647663
}
648-
if (found) {
649-
break;
664+
// For each alternate allele, find the rightmost reference node among
665+
// its predecessors. If multiple alleles have candidates for the
666+
// reference position, choose the leftmost one.
667+
if (candidate_found) {
668+
ref_pos = std::min(ref_pos, candidate_pos);
650669
}
651670
}
652671
}
@@ -695,6 +714,9 @@ int main_index(int argc, char** argv) {
695714
}
696715
cerr << "- Phasing information: " << gbwt::inMegabytes(phasing_bytes) << " MB" << endl;
697716
}
717+
if (check_overlaps) {
718+
gbwt::checkOverlaps(variants, cerr, true);
719+
}
698720

699721
// Save memory:
700722
// - Delete the alt paths if we no longer need them.
@@ -724,12 +746,25 @@ int main_index(int argc, char** argv) {
724746
<< "_" << haplotype.phase
725747
<< "_" << haplotype.count;
726748
store_thread(haplotype.path, sn.str());
749+
},
750+
[&](gbwt::size_type site, gbwt::size_type allele) -> bool {
751+
if (check_overlaps) {
752+
overlaps.insert(std::make_pair(site, allele));
753+
}
754+
return discard_overlaps;
727755
});
728756
if (show_progress) {
729757
cerr << "- Processed samples " << phasings[batch].offset() << " to " << (phasings[batch].offset() + phasings[batch].size() - 1) << endl;
730758
}
731759
}
732-
} // End of contig.
760+
if (check_overlaps && !overlaps.empty()) {
761+
cerr << overlaps.size() << " unresolved overlaps:" << endl;
762+
for (auto overlap : overlaps) {
763+
cerr << "- site " << overlap.first << ", allele " << overlap.second << endl;
764+
}
765+
overlaps.clear();
766+
}
767+
} // End of contigs.
733768
} // End of haplotypes.
734769

735770
// Store the thread database. Write it to disk if a filename is given,

Diff for: src/vg_set.cpp

+2
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,8 @@ void VGset::to_xg(xg::XG& index, bool store_threads, const regex& paths_to_take,
113113

114114
// Sort out all the mappings from the paths we pulled out
115115
for(Path& path : paths_taken) {
116+
mappings[path.name()] = map<int64_t, Mapping>(); // We want to include empty paths as well.
117+
116118
for(size_t i = 0; i < path.mapping_size(); i++) {
117119
// For each mapping, file it under its rank if a rank is
118120
// specified, or at the last rank otherwise.

Diff for: test/t/06_vg_index.t

+1-1
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,7 @@ rm -f xy.vg xy.xg
189189

190190
vg construct -r small/x.fa -v small/x.vcf.gz -a >x.vg
191191
vg index -x x.xg -v small/x.vcf.gz -H haps.bin x.vg
192-
is $(du -b haps.bin | cut -f 1) 345 "threads may be exported to binary for use in GBWT construction"
192+
is $(du -b haps.bin | cut -f 1) 329 "threads may be exported to binary for use in GBWT construction"
193193

194194
rm -f x.vg x.xg part.vg x.gcsa haps.bin x.gbwt
195195

Diff for: test/t/38_vg_prune.t

+5-5
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,11 @@ is $(vg stats -N y.vg) 44 "pruning with path restoring leaves the correct number
2626
is $(vg stats -E y.vg) 48 "pruning with path restoring leaves the correct number of edges"
2727
rm -f y.vg
2828

29-
# Unfold threads: 1 component, 57 nodes, 68 edges
29+
# Unfold threads: 1 component, 60 nodes, 72 edges
3030
vg prune -u -g x.gbwt -e 1 x.vg > y.vg
3131
is $(vg stats -s y.vg | wc -l) 1 "pruning with path unfolding produces the correct number of components"
32-
is $(vg stats -N y.vg) 57 "pruning with path unfolding produces the correct number of nodes"
33-
is $(vg stats -E y.vg) 68 "pruning with path unfolding produces the correct number of edges"
32+
is $(vg stats -N y.vg) 60 "pruning with path unfolding produces the correct number of nodes"
33+
is $(vg stats -E y.vg) 72 "pruning with path unfolding produces the correct number of edges"
3434
rm -f y.vg
3535

3636
rm -f x.vg x.gbwt
@@ -46,8 +46,8 @@ vg index -G y.gbwt -v small/xy2.vcf.gz y.vg
4646
# Check the range in the node mapping at each point
4747
is $(head -c 16 xy.mapping | hexdump -v -e '4/4 "%u_"') 99_0_99_0_ "empty mapping starts from the right node id"
4848
vg prune -u -g x.gbwt -e 1 -a -m xy.mapping x.vg > /dev/null
49-
is $(head -c 16 xy.mapping | hexdump -v -e '4/4 "%u_"') 99_0_125_0_ "the first unfolded graph adds the correct number of nodes to the mapping"
49+
is $(head -c 16 xy.mapping | hexdump -v -e '4/4 "%u_"') 99_0_128_0_ "the first unfolded graph adds the correct number of nodes to the mapping"
5050
vg prune -u -g y.gbwt -e 1 -a -m xy.mapping y.vg > /dev/null
51-
is $(head -c 16 xy.mapping | hexdump -v -e '4/4 "%u_"') 99_0_150_0_ "the second unfolded graph adds the correct number of nodes to the mapping"
51+
is $(head -c 16 xy.mapping | hexdump -v -e '4/4 "%u_"') 99_0_156_0_ "the second unfolded graph adds the correct number of nodes to the mapping"
5252

5353
rm -f x.vg y.vg x.gbwt y.gbwt xy.mapping

0 commit comments

Comments
 (0)