@@ -45,6 +45,8 @@ void help_index(char** argv) {
45
45
<< " -H, --write-haps FILE store the threads as sequences in FILE" << endl
46
46
<< " -F, --thread-db FILE write thread database to FILE" << endl
47
47
<< " -P, --force-phasing replace unphased genotypes with randomly phased ones" << endl
48
+ << " -o, --discard-overlaps skip overlapping alternate alleles if the overlap cannot be resolved" << endl
49
+ << " -O, --check-overlaps print information on overlapping variants to stderr" << endl
48
50
<< " -B, --batch-size N number of samples per batch (default 200)" << endl
49
51
<< " -R, --range X..Y process samples X to Y (inclusive)" << endl
50
52
<< " -r, --rename V=P rename contig V in the VCFs to path P in the graph (may repeat)" << endl
@@ -137,12 +139,13 @@ int main_index(int argc, char** argv) {
137
139
// GBWT
138
140
bool index_haplotypes = false , index_paths = false , index_gam = false ;
139
141
vector<string> gam_file_names;
140
- bool force_phasing = false ;
142
+ bool force_phasing = false , discard_overlaps = false , check_overlaps = false ;
141
143
size_t samples_in_batch = 200 ; // Samples per batch.
142
144
std::pair<size_t , size_t > sample_range (0 , ~(size_t )0 ); // The semiopen range of samples to process.
143
145
map<string, string> path_to_vcf; // Path name conversion from --rename.
144
146
map<string, pair<size_t , size_t >> regions; // Region restrictions for contigs, in VCF name space, as 0-based exclusive-end ranges.
145
147
unordered_set<string> excluded_samples; // Excluded sample names from --exclude.
148
+ std::set<std::pair<gbwt::size_type, gbwt::size_type>> overlaps; // Unresolved overlaps in the haplotypes.
146
149
147
150
// GCSA
148
151
gcsa::size_type kmer_size = gcsa::Key::MAX_LENGTH;
@@ -180,6 +183,8 @@ int main_index(int argc, char** argv) {
180
183
{" gbwt-name" , required_argument, 0 , ' G' },
181
184
{" write-haps" , required_argument, 0 , ' H' },
182
185
{" force-phasing" , no_argument, 0 , ' P' },
186
+ {" discard-overlaps" , no_argument, 0 , ' o' },
187
+ {" check-overlaps" , no_argument, 0 , ' O' },
183
188
{" batch-size" , required_argument, 0 , ' B' },
184
189
{" range" , required_argument, 0 , ' R' },
185
190
{" rename" , required_argument, 0 , ' r' },
@@ -207,7 +212,7 @@ int main_index(int argc, char** argv) {
207
212
};
208
213
209
214
int option_index = 0 ;
210
- c = getopt_long (argc, argv, " b:t:px:F:v:TG:H:PB :R:r:I:E:g:i:f:k:X:Z:Vd:maANDP:CM:h " ,
215
+ c = getopt_long (argc, argv, " b:t:px:F:v:TM:G:H:PoOB :R:r:I:E:g:i:f:k:X:Z:Vd:maANDCh " ,
211
216
long_options, &option_index);
212
217
213
218
// Detect the end of the options.
@@ -262,6 +267,12 @@ int main_index(int argc, char** argv) {
262
267
case ' P' :
263
268
force_phasing = true ;
264
269
break ;
270
+ case ' o' :
271
+ discard_overlaps = true ;
272
+ break ;
273
+ case ' O' :
274
+ check_overlaps = true ;
275
+ break ;
265
276
case ' B' :
266
277
samples_in_batch = std::max (parse<size_t >(optarg ), 1ul );
267
278
break ;
@@ -616,6 +627,9 @@ int main_index(int argc, char** argv) {
616
627
617
628
// Determine the reference nodes for the current variant and create a variant site.
618
629
// If the variant is not an insertion, there should be a path for the ref allele.
630
+ // Otherwise the reference position can be determined from the predecessors of the
631
+ // alternate alleles.
632
+ // TODO: What if the reference visits the same node several times?
619
633
var.position --; // Use a 0-based position to get the correct var_name.
620
634
std::string var_name = make_variant_id (var);
621
635
std::string ref_path_name = " _alt_" + var_name + " _0" ;
@@ -634,19 +648,24 @@ int main_index(int argc, char** argv) {
634
648
bool found = false ;
635
649
for (size_t alt_index = 1 ; alt_index < var.alleles .size (); alt_index++) {
636
650
std::string alt_path_name = " _alt_" + var_name + " _" + to_string (alt_index);
651
+ size_t candidate_pos = 0 ;
652
+ bool candidate_found = false ;
637
653
auto alt_path_iter = alt_paths.find (alt_path_name);
638
654
if (alt_path_iter != alt_paths.end ()) {
639
655
gbwt::vector_type pred_nodes = predecessors (*xg_index, alt_path_iter->second );
640
656
for (auto node : pred_nodes) {
641
657
size_t pred_pos = variants.firstOccurrence (node);
642
658
if (pred_pos != variants.invalid_position ()) {
643
- ref_pos = pred_pos + 1 ;
659
+ candidate_pos = std::max (candidate_pos, pred_pos + 1 );
660
+ candidate_found = true ;
644
661
found = true ;
645
- break ;
646
662
}
647
663
}
648
- if (found) {
649
- break ;
664
+ // For each alternate allele, find the rightmost reference node among
665
+ // its predecessors. If multiple alleles have candidates for the
666
+ // reference position, choose the leftmost one.
667
+ if (candidate_found) {
668
+ ref_pos = std::min (ref_pos, candidate_pos);
650
669
}
651
670
}
652
671
}
@@ -695,6 +714,9 @@ int main_index(int argc, char** argv) {
695
714
}
696
715
cerr << " - Phasing information: " << gbwt::inMegabytes (phasing_bytes) << " MB" << endl;
697
716
}
717
+ if (check_overlaps) {
718
+ gbwt::checkOverlaps (variants, cerr, true );
719
+ }
698
720
699
721
// Save memory:
700
722
// - Delete the alt paths if we no longer need them.
@@ -724,12 +746,25 @@ int main_index(int argc, char** argv) {
724
746
<< " _" << haplotype.phase
725
747
<< " _" << haplotype.count ;
726
748
store_thread (haplotype.path , sn.str ());
749
+ },
750
+ [&](gbwt::size_type site, gbwt::size_type allele) -> bool {
751
+ if (check_overlaps) {
752
+ overlaps.insert (std::make_pair (site, allele));
753
+ }
754
+ return discard_overlaps;
727
755
});
728
756
if (show_progress) {
729
757
cerr << " - Processed samples " << phasings[batch].offset () << " to " << (phasings[batch].offset () + phasings[batch].size () - 1 ) << endl;
730
758
}
731
759
}
732
- } // End of contig.
760
+ if (check_overlaps && !overlaps.empty ()) {
761
+ cerr << overlaps.size () << " unresolved overlaps:" << endl;
762
+ for (auto overlap : overlaps) {
763
+ cerr << " - site " << overlap.first << " , allele " << overlap.second << endl;
764
+ }
765
+ overlaps.clear ();
766
+ }
767
+ } // End of contigs.
733
768
} // End of haplotypes.
734
769
735
770
// Store the thread database. Write it to disk if a filename is given,
0 commit comments