forked from DaehwanKimLab/hisat2
-
Notifications
You must be signed in to change notification settings - Fork 6
/
group_walk.h
1649 lines (1558 loc) · 56.8 KB
/
group_walk.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
* Copyright 2011, Ben Langmead <[email protected]>
*
* This file is part of Bowtie 2.
*
* Bowtie 2 is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Bowtie 2 is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* group_walk.h
*
* Classes and routines for walking a set of BW ranges backwards from the edge
* of a seed hit with the goal of resolving the offset of each row in each
* range. Here "offset" means offset into the concatenated string of all
* references. The main class is 'GroupWalk' and an important helper is
* 'GWState'.
*
* For each combination of seed offset and orientation, there is an associated
* QVal. Each QVal describes a (possibly empty) set of suffix array ranges.
* Call these "seed range sets." Each range in the set is "backed" by a range
* of the salist, represented as a PListSlice. Such a range is the origin of a
* walk.
*
* When an offset is resolved, it is entered into the salist via the
* PListSlice. Note that other routines in this same thread might also be
* setting elements of the salist, so routines here should expect that elements
* can go from unresolved to resolved at any time.
*
* What bookkeeping do we have to do as we walk? Before the first step, we
* convert the initial QVal into a list of SATuples; the SATuples are our link
* to the corresponding ranges in the suffix array. The list of SATuples is
* then converted to a list of GWState objects; these keep track of where we
* are in our walk (e.g. what 'top' and 'bot' are, how many steps have we gone,
* etc) as well as how the elements in the current range correspond to elements
* from the original range.
*
* The user asks the GroupWalk to resolve another offset by calling advance().
* advance() can be called in various ways:
*
* (a) The user can request that the GroupWalk proceed until a
* *particular* element is resolved, then return that resolved
* element. Other elements may be resolved along the way, but
* those results are buffered and may be dispensed in future calls
* to advance().
*
* (b) The user can request that the GroupWalk select an as-yet-
* unreported element at random and proceed until that element
* is resolved and report it. Again, other elements may be
* resolved along the way but they are buffered.
*
* (c) The user can request that the GroupWalk resolve elements in a
* particular BW range (with a particular offset and orientation)
* in an order of its choosing. The GroupWalk in this case
* attempts to resolve as many offsets as possible as quickly as
* possible, and returns them as soon as they're found. The res_
* buffer is used in this case.
*
* (d) Like (c) but resolving elements at a particular offset and
* orientation instead of at a specific BW range. The res_ buffer
* is used in this case, too.
*
* There are simple ways to heuristically reduce the problem size while
* maintaining randomness. For instance, the user can put a ceiling on the
* number of elements that we walk from any given seed offset or range.
* We can then trim away random subranges to reduce the size of the
* problem. There is no need for the caller to do this for us.
*/
#ifndef GROUP_WALK_H_
#define GROUP_WALK_H_
#include <stdint.h>
#include <limits>
#include "ds.h"
#include "gfm.h"
#include "read.h"
#include "reference.h"
#include "mem_ids.h"
/**
* Encapsulate an SA range and an associated list of slots where the resolved
* offsets can be placed.
*/
template<typename T, typename index_t>
class SARangeWithOffs {
public:
    /**
     * Construct in the uninitialized state.  Every scalar member is given a
     * definite value so that copying a default-constructed object never reads
     * an indeterminate value; reset() then stamps the "uninitialized" marker.
     */
    SARangeWithOffs() :
        topf((index_t)INDEX_MAX),
        botf((index_t)INDEX_MAX),
        node_top((index_t)INDEX_MAX),
        node_bot((index_t)INDEX_MAX),
        len(0)
    {
        reset();
    }

    /**
     * Construct and initialize in one step; the arguments are forwarded
     * unchanged to init().
     */
    SARangeWithOffs(
        index_t tf,
        index_t bf,
        index_t ntf,
        index_t nbf,
        const EList<pair<index_t, index_t> >& n_iedge_count,
        size_t len,
        const T& o)
    {
        init(tf, bf, ntf, nbf, n_iedge_count, len, o);
    }

    /**
     * Set every field of the range.
     *
     * tf / bf         top and bottom of the range in the GBWT index;
     *                 tf < bf is asserted
     * ntf / nbf       top and bottom node; the node span is asserted to be
     *                 no wider than the row span (bf - tf)
     * n_iedge_count   list of pairs, copied into node_iedge_count
     * len_            length of the reference sequence involved
     * o               container of offsets, copied into 'offs'
     */
    void init(
        index_t tf,
        index_t bf,
        index_t ntf,
        index_t nbf,
        const EList<pair<index_t, index_t> >& n_iedge_count,
        size_t len_,
        const T& o)
    {
        topf = tf;
        botf = bf;
        assert_lt(topf, botf);
        node_top = ntf;
        node_bot = nbf;
        assert_leq(node_bot - node_top, botf - topf);
        node_iedge_count = n_iedge_count;
        // The original joined the next two assignments with a comma operator
        // ("len = len_,"); they are separate statements.
        len = len_;
        offs = o;
    }

    /**
     * Reset to uninitialized state.  Only 'topf' carries the marker that
     * inited() tests; the other fields are left as they are.
     */
    void reset() { topf = (index_t)INDEX_MAX; }

    /**
     * Return true if this is initialized, i.e. 'topf' does not hold the
     * INDEX_MAX marker written by reset().
     */
    bool inited() const {
        return topf != (index_t)INDEX_MAX;
    }

    /**
     * Return the number of times this reference substring occurs in the
     * reference, which is also the size of the 'offs' TSlice.
     */
    size_t size() const { return offs.size(); }

    index_t topf;     // top in GBWT index
    index_t botf;     // bottom in GBWT index
    index_t node_top; // top node
    index_t node_bot; // bottom node
    EList<pair<index_t, index_t> > node_iedge_count; // copied verbatim by init()
    size_t  len;      // length of the reference sequence involved
    T       offs;     // offsets
};
/**
* A group of per-thread state that can be shared between all the GroupWalks
* used in that thread.
*/
template <typename index_t>
struct GroupWalkState {
    /**
     * Tag every scratch list owned by this object with the memory
     * category 'cat': 'map' receives it through its constructor, each of
     * the four mask lists through setCat().
     */
    GroupWalkState(int cat) : map(cat) {
        for(int m = 0; m < 4; m++) {
            masks[m].setCat(cat);
        }
    }

    EList<bool> masks[4];   // temporary list for masks; used in GWState
    EList<index_t, 16> map; // temporary list of GWState maps
};
/**
* Encapsulates counters that encode how much work the walk-left logic
* has done.
*/
struct WalkMetrics {

    /** Start with every counter at zero. */
    WalkMetrics() { reset(); }

    /**
     * Add each counter of 'm' to the matching counter of this object.
     * A ThreadSafe guard is built from this object's mutex and 'getLock'
     * before any counter is touched; this is the only safe way to update
     * a WalkMetrics shared by many threads.
     */
    void merge(const WalkMetrics& m, bool getLock = false) {
        ThreadSafe ts(&mutex_m, getLock);
        reports     += m.reports;
        refresolves += m.refresolves;
        resolves    += m.resolves;
        branches    += m.branches;
        bwops       += m.bwops;
    }

    /** Zero all counters. */
    void reset() {
        bwops       = 0;
        branches    = 0;
        resolves    = 0;
        refresolves = 0;
        reports     = 0;
    }

    uint64_t bwops;       // Burrows-Wheeler operations
    uint64_t branches;    // BW range branch-offs
    uint64_t resolves;    // # offs resolved with BW walk-left
    uint64_t refresolves; // # resolutions caused by reference scanning
    uint64_t reports;     // # offs reported (1 can be reported many times)
    MUTEX_T mutex_m;      // handed to the ThreadSafe guard in merge()
};
/**
* Coordinates for a BW element that the GroupWalk might resolve.
*/
template <typename index_t>
struct GWElt {

    GWElt() { reset(); }

    /**
     * Return to the uninitialized state: every index field holds
     * (index_t)OFF_MASK and the strand flag is false.
     */
    void reset() {
        const index_t unset = (index_t)OFF_MASK;
        offidx = unset;
        range  = unset;
        elt    = unset;
        len    = unset;
        fw     = false;
    }

    /**
     * Fill in all coordinates of this GWElt.
     */
    void init(
        index_t oi,
        bool f,
        index_t r,
        index_t e,
        index_t l)
    {
        len    = l;
        elt    = e;
        range  = r;
        fw     = f;
        offidx = oi;
    }

    /**
     * Two GWElts are equal iff all five coordinates agree, i.e. they refer
     * to the same element.
     */
    bool operator==(const GWElt& o) const {
        if(offidx != o.offidx) return false;
        if(fw     != o.fw)     return false;
        if(range  != o.range)  return false;
        if(elt    != o.elt)    return false;
        return len == o.len;
    }

    /**
     * Negation of operator==: true iff the two GWElts refer to different
     * elements.
     */
    bool operator!=(const GWElt& o) const {
        return !operator==(o);
    }

    index_t offidx; // seed offset index
    bool    fw;     // strand
    index_t range;  // range
    index_t elt;    // element
    index_t len;    // length
};
/**
* A record encapsulating the result of looking up one BW element in
* the Bowtie index.
*/
template <typename index_t>
struct WalkResult {

    WalkResult() { reset(); }

    /**
     * Return this WalkResult to the uninitialized state: the embedded GWElt
     * is reset and both the row and the offset hold (index_t)OFF_MASK.
     */
    void reset() {
        bwrow = (index_t)OFF_MASK;
        toff  = (index_t)OFF_MASK;
        elt.reset();
    }

    /**
     * Record one resolved element: its coordinates go into 'elt', the
     * resolved row and text offset into 'bwrow' and 'toff'.
     */
    void init(
        index_t oi,  // seed offset index
        bool f,      // strand
        index_t r,   // range
        index_t e,   // element
        index_t bwr, // BW row
        index_t len, // length
        index_t to)  // text offset
    {
        bwrow = bwr;
        toff  = to;
        elt.init(oi, f, r, e, len);
    }

    GWElt<index_t> elt; // element resolved
    index_t bwrow;      // SA row resolved
    index_t toff;       // resolved offset from SA sample
};
/**
* A GW hit encapsulates an SATuple describing a reference substring
* in the cache, along with a bool indicating whether each element of
* the hit has been reported yet.
*/
template<typename index_t, typename T>
class GWHit {
public:
    /**
     * Construct an empty hit: both lists are empty, every index field holds
     * (index_t)OFF_MASK and nothing has been reported.
     */
    GWHit() :
        fmap(0, GW_CAT),
        offidx((index_t)OFF_MASK),
        fw(false),
        range((index_t)OFF_MASK),
        len((index_t)OFF_MASK),
        reported_(0, GW_CAT),
        nrep_(0)
    {
        assert(repOkBasic());
    }

    /**
     * Initialize with a new SA range.  Both 'reported_' and 'fmap' are
     * resized to one entry per element of sa.offs; every element starts out
     * unreported and with an unset (OFF_MASK, OFF_MASK) forward mapping.
     *
     * sa   SA range this hit corresponds to
     * oi   seed offset index
     * f    orientation
     * r    original range index
     */
    void init(
        SARangeWithOffs<T, index_t>& sa,
        index_t oi,
        bool f,
        index_t r)
    {
        nrep_ = 0;
        offidx = oi;
        fw = f;
        range = r;
        len = (index_t)sa.len;
        reported_.resize(sa.offs.size());
        reported_.fill(false);
        fmap.resize(sa.offs.size());
        fmap.fill(make_pair((index_t)OFF_MASK, (index_t)OFF_MASK));
    }

    /**
     * Clear the reported flags and the forward map and return all scalar
     * fields to their uninitialized values.
     */
    void reset() {
        reported_.clear();
        fmap.clear();
        nrep_ = 0;
        offidx = (index_t)OFF_MASK;
        fw = false;
        range = (index_t)OFF_MASK;
        len = (index_t)OFF_MASK;
    }

#ifndef NDEBUG
    /**
     * Check that GWHit is internally consistent with the given SA range:
     * both lists have one entry per element of sa.offs, no two
     * as-yet-unresolved elements share a forward mapping, and nrep_ equals
     * the number of set flags in reported_.
     */
    bool repOk(const SARangeWithOffs<T, index_t>& sa) const {
        assert_eq(reported_.size(), sa.offs.size());
        assert_eq(fmap.size(), sa.offs.size());
        // Shouldn't be any repeats among as-yet-unresolveds
        size_t nrep = 0;
        for(size_t i = 0; i < fmap.size(); i++) {
            if(reported_[i]) nrep++;
            if(sa.offs[i] != (index_t)OFF_MASK) {
                continue;
            }
            for(size_t j = i+1; j < fmap.size(); j++) {
                if(sa.offs[j] != (index_t)OFF_MASK) {
                    continue;
                }
                assert(fmap[i] != fmap[j]);
            }
        }
        assert_eq(nrep_, nrep);
        return true;
    }

    /**
     * Return true iff this GWHit is not obviously corrupt.  Currently there
     * is nothing to check.
     */
    bool repOkBasic() const {
        return true;
    }
#endif

    /**
     * Set the ith element to be reported.  The element must be in range and
     * must not have been reported before.
     */
    void setReported(index_t i) {
        // Bounds check first, so an out-of-range index is reported by this
        // assert rather than by the indexed access below.
        assert_lt(i, reported_.size());
        assert(!reported_[i]);
        reported_[i] = true;
        nrep_++;
    }

    /**
     * Return true iff element i has been reported.
     */
    bool reported(index_t i) const {
        assert_lt(i, reported_.size());
        return reported_[i];
    }

    /**
     * Return true iff all elements have been reported.
     */
    bool done() const {
        assert_leq(nrep_, reported_.size());
        return nrep_ == reported_.size();
    }

    EList<std::pair<index_t, index_t>, 16> fmap; // forward map; to GWState & elt
    index_t offidx; // offset idx
    bool fw;        // orientation
    index_t range;  // original range index
    index_t len;    // length of hit

protected:
    EList<bool, 16> reported_; // per-elt bool indicating whether it's been reported
    index_t nrep_;             // number of elements whose reported_ flag is set
};
/**
* Encapsulates the progress made along a particular path from the original
* range.
*/
template<typename index_t, typename T>
class GWState {
public:
GWState() : map_(0, GW_CAT) {
    // Begin in the cleared state, then sanity-check it.
    reset();
    assert(repOkBasic());
}
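/**
* Initialize this GWState with new gfm, top, bot, step, and sa.
*
* We assume map is already set up.
*
* Returns a pair of numbers: the number of resolved but unreported
* offsets, and the number of as-yet-unresolved offsets. The pair is
* produced by the other init() overload, to which this one delegates.
*/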
template<int S>
pair<int, int> init(
    const GFM<index_t>& gfm,                 // index to walk left in
    const BitPairReference& ref,             // bitpair-encoded reference
    SARangeWithOffs<T, index_t>& sa,         // SA range with offsets
    EList<GWState, S>& sts,                  // EList of GWStates for range being advanced
    GWHit<index_t, T>& hit,                  // Corresponding hit structure
    index_t range,                           // which range is this?
    bool reportList,                         // if true, "report" resolved offsets immediately by adding them to 'res' list
    EList<WalkResult<index_t>, 16>* res,     // EList where resolved offsets should be appended
    index_t tp,                              // top of range at this step
    index_t bt,                              // bot of range at this step
    index_t n_tp,                            // node at top
    index_t n_bt,                            // node at bot
    const EList<pair<index_t, index_t> >& n_iedge_count,
    index_t st,                              // # steps taken to get to this step
    WalkMetrics& met)
{
    // Caller contract: a non-empty row range and a valid state index.
    assert_gt(bt, tp);
    assert_lt(range, sts.size());

    // Adopt the node-level description of the range...
    node_iedge_count = n_iedge_count;
    node_bot = n_bt;
    node_top = n_tp;
    // ...the row-level bounds...
    bot = bt;
    top = tp;
    // ...and the number of steps already taken.
    step = st;

    // Debug bookkeeping: this overload may only run on a state that has not
    // been initialized yet, and it primes lastStep_ so that the overload
    // called below sees step == lastStep_ + 1.
    assert(!inited_);
    ASSERT_ONLY(inited_ = true);
    ASSERT_ONLY(lastStep_ = step-1);

    // The remaining work is shared with the other overload.
    return init(gfm, ref, sa, sts, hit, range, reportList, res, met);
}
/**
* Initialize this GWState.
*
* We assume map is already set up, and that 'step' is equal to the
* number of steps taken to get to the new top/bot pair *currently*
* in the top and bot fields.
*
* Returns a pair of numbers, the first being the number of
* resolved but unreported offsets found during this advance, the
* second being the number of as-yet-unresolved offsets.
*/
template<int S>
pair<int, int> init(
const GFM<index_t>& gfm, // forward Bowtie index
const BitPairReference& ref, // bitpair-encoded reference
SARangeWithOffs<T, index_t>& sa, // SA range with offsets
EList<GWState, S>& st, // EList of GWStates for advancing range
GWHit<index_t, T>& hit, // Corresponding hit structure
index_t range, // range being inited
bool reportList, // report resolutions, adding to 'res' list?
EList<WalkResult<index_t>, 16>* res, // EList to append resolutions
WalkMetrics& met) // update these metrics
{
// Overview of the phases below:
//  1. visit every element of map_ from mapi_ on, trying to resolve the
//     unresolved ones and (optionally) reporting resolved ones;
//  2. trim resolved elements off both ends of the range;
//  3. return early if nothing is left unresolved;
//  4. if any gfm._zOffs entry lies strictly inside (top, bot), split the
//     range around it, appending new GWStates to 'st';
//  5. set up tloc/bloc for the next step.
// 'ref' is only referenced inside the disabled '#if 0' block.
assert(inited_);
assert_eq(step, lastStep_+1);
ASSERT_ONLY(lastStep_++);
assert_leq((index_t)step, gfm.gh().len());
assert_lt(range, st.size());
// ret.first:  elements resolved here but not reported (only counted when
//             reportList is false)
// ret.second: elements still unresolved after this call
pair<int, int> ret = make_pair(0, 0);
// trimBegin: resolved elements seen before the first unresolved one
// trimEnd:   resolved elements seen since the last unresolved one
index_t trimBegin = 0, trimEnd = 0;
bool empty = true; // assume all resolved until proven otherwise
// Commit new information, if any, to the PListSlice. Also,
// trim and check if we're done.
assert_eq(node_bot - node_top, map_.size());
ASSERT_ONLY(index_t num_orig_iedges = 0, orig_e = 0);
// num_iedges accumulates the '.second' of every node_iedge_count entry
// whose '.first' is below the current element index; 'e' is the cursor
// into node_iedge_count.
index_t num_iedges = 0, e = 0;
for(size_t i = mapi_; i < map_.size(); i++) {
bool resolved = (off((index_t)i, sa) != (index_t)OFF_MASK);
if(!resolved) {
#ifndef NDEBUG
// Debug only: same accumulation, but against the original range's
// edge counts, keyed by the element's original index map(i).
while(orig_e < sa.node_iedge_count.size()) {
if(map((index_t)i) <= sa.node_iedge_count[orig_e].first) {
break;
}
num_orig_iedges += sa.node_iedge_count[orig_e].second;
orig_e++;
}
#endif
while(e < node_iedge_count.size()) {
if(i <= node_iedge_count[e].first) {
break;
}
num_iedges += node_iedge_count[e].second;
e++;
}
// Elt not resolved yet; try to resolve it now
index_t bwrow = (index_t)(top + i + num_iedges);
index_t node = (index_t)(node_top + i);
index_t toff = gfm.tryOffset(bwrow, node);
ASSERT_ONLY(index_t origBwRow = sa.topf + map((index_t)i) + num_orig_iedges);
ASSERT_ONLY(index_t origNode = sa.node_top + map((index_t)i));
assert_eq(bwrow, gfm.walkLeft(origBwRow, step));
if(toff != (index_t)OFF_MASK) {
// Yes, toff was resolvable
assert_eq(toff, gfm.getOffset(bwrow, node));
met.resolves++;
// Shift by the number of steps walked so the stored offset is the one
// for the original row (checked against getOffset() just below).
toff += step;
assert_eq(toff, gfm.getOffset(origBwRow, origNode));
setOff((index_t)i, toff, sa, met);
if(!reportList) ret.first++;
#if 0
// used to be #ifndef NDEBUG, but since we no longer require that the reference
// string info be included, this is no longer relevant.
// Sanity check that the reference characters under this
// hit match the seed characters in hit.satup->key.seq.
// This is NOT a check that we associated the exact right
// text offset with the BW row. This is an important
// distinction because when resolved offsets are filled in
// via reference scanning, they are not necessarily the
// exact right text offsets to associate with the
// respective BW rows but they WILL all be correct w/r/t
// the reference sequence underneath, which is what really
// matters here.
index_t tidx = (index_t)OFF_MASK, tof, tlen;
bool straddled = false;
gfm.joinedToTextOff(
hit.len, // length of seed
toff, // offset in joined reference string
tidx, // reference sequence id
tof, // offset in reference coordinates
tlen, // length of reference sequence
true, // don't reject straddlers
straddled);
if(tidx != (index_t)OFF_MASK &&
hit.satup->key.seq != std::numeric_limits<uint64_t>::max())
{
// key: 2-bit characters packed into a 64-bit word with
// the least significant bitpair corresponding to the
// rightmost character on the Watson reference strand.
uint64_t key = hit.satup->key.seq;
for(int64_t j = tof + hit.len-1; j >= tof; j--) {
// Get next reference base to the left
int c = ref.getBase(tidx, j);
assert_range(0, 3, c);
// Must equal least significant bitpair of key
if(c != (int)(key & 3)) {
// Oops; when we jump to the piece of the
// reference where the seed hit is, it doesn't
// match the seed hit. Before dying, check
// whether we have the right spot in the joined
// reference string
SString<char> jref;
gfm.restore(jref);
uint64_t key2 = hit.satup->key.seq;
for(int64_t k = toff + hit.len-1; k >= toff; k--) {
int c = jref[k];
assert_range(0, 3, c);
assert_eq(c, (int)(key2 & 3));
key2 >>= 2;
}
assert(false);
}
key >>= 2;
}
}
#endif
}
}
// Is the element resolved? We ask this regardless of how it was
// resolved (whether this function did it just now, whether it did
// it a while ago, or whether some other function outside GroupWalk
// did it).
if(off((index_t)i, sa) != (index_t)OFF_MASK) {
if(reportList && !hit.reported(map((index_t)i))) {
// Report it
index_t toff = off((index_t)i, sa);
assert(res != NULL);
res->expand();
// NOTE(review): unlike the debug-only origBwRow computed in the
// unresolved branch above, this one does not add the accumulated
// incoming-edge count -- confirm whether that is intended.
index_t origBwRow = sa.topf + map((index_t)i);
res->back().init(
hit.offidx, // offset idx
hit.fw, // orientation
hit.range, // original range index
map((index_t)i), // original element offset
origBwRow, // BW row resolved
hit.len, // hit length
toff); // text offset
hit.setReported(map((index_t)i));
met.reports++;
}
// Offset resolved
if(empty) {
// Haven't seen a non-empty entry yet, so we
// can trim this from the beginning.
trimBegin++;
} else {
trimEnd++;
}
} else {
// Offset not yet resolved
ret.second++;
// Any resolved run counted so far is not at the end after all.
trimEnd = 0;
empty = false;
// Set the forward map in the corresponding GWHit
// object to point to the appropriate element of our
// range
assert_geq(i, mapi_);
index_t bmap = map((index_t)i);
hit.fmap[bmap].first = range;
hit.fmap[bmap].second = (index_t)i;
#ifndef NDEBUG
// Debug only: no earlier unresolved element of the original range may
// already point at this same (range, i) slot.
for(size_t j = 0; j < bmap; j++) {
if(sa.offs[j] == (index_t)OFF_MASK &&
hit.fmap[j].first == range)
{
assert_neq(i, hit.fmap[j].second);
}
}
#endif
}
}
// Trim from beginning
// (this assert is trivially true if index_t is an unsigned type)
assert_geq(trimBegin, 0);
mapi_ += trimBegin;
if(trimBegin > 0) {
top += trimBegin;
// Entries whose '.first' is below trimBegin belong to trimmed elements:
// skip their rows too and drop the entries; the surviving entries are
// re-based by trimBegin.
index_t e = 0;
for(; e < node_iedge_count.size(); e++) {
if(node_iedge_count[e].first >= trimBegin) break;
assert_geq(top, node_iedge_count[e].second);
top += node_iedge_count[e].second;
}
if(e > 0) node_iedge_count.erase(0, e);
for(e = 0; e < node_iedge_count.size(); e++) {
assert_geq(node_iedge_count[e].first, trimBegin);
node_iedge_count[e].first -= trimBegin;
}
}
node_top += trimBegin;
if(trimEnd > 0) {
// Trim from end
map_.resize(map_.size() - trimEnd);
bot -= trimEnd;
index_t node_range = node_bot - node_top;
// Drop trailing entries that fall inside the trimmed tail, giving back
// their rows as well.
while(node_iedge_count.size() > 0) {
if(node_iedge_count.back().first < (node_range - trimEnd)) break;
assert_geq(bot, node_iedge_count.back().second);
bot -= node_iedge_count.back().second;
node_iedge_count.pop_back();
}
}
node_bot -= trimEnd;
#ifndef NDEBUG
// Debug only: the row span must equal the node span plus the sum of the
// remaining '.second' counts.
assert_leq(node_top, node_bot);
index_t num_nodes = node_bot - node_top;
index_t add = 0;
for(index_t e = 0; e < node_iedge_count.size(); e++) {
assert_lt(node_iedge_count[e].first, num_nodes);
add += node_iedge_count[e].second;
}
assert_eq(bot - top, num_nodes + add);
#endif
if(empty) {
assert(done());
#ifndef NDEBUG
// If range is done, all elements from map should be
// resolved
for(size_t i = mapi_; i < map_.size(); i++) {
assert_neq((index_t)OFF_MASK, off((index_t)i, sa));
}
// If this range is done, then it should be the case that
// all elements in the corresponding GWHit that point to
// this range are resolved.
for(size_t i = 0; i < hit.fmap.size(); i++) {
if(sa.offs[i] == (index_t)OFF_MASK) {
assert_neq(range, hit.fmap[i].first);
}
}
#endif
return ret;
} else {
assert(!done());
}
// Is there a dollar sign in the middle of the range?
// Collect every gfm._zOffs entry lying strictly between top and bot.
tmp_zOffs.clear();
for(index_t i = 0; i < gfm._zOffs.size(); i++) {
#ifndef NDEBUG
if(i > 0) {
assert_lt(gfm._zOffs[i-1], gfm._zOffs[i]);
}
#endif
assert_neq(top, gfm._zOffs[i]);
// assert_neq(bot-1, gfm._zOffs[i]);
if(gfm._zOffs[i] > top && gfm._zOffs[i] < bot) {
tmp_zOffs.push_back(gfm._zOffs[i]);
}
}
// Yes, the dollar sign is in the middle of this range. We
// must split it into the two ranges on either side of the
// dollar. Let 'bot' and 'top' delimit the portion of the
// range prior to the dollar.
if(tmp_zOffs.size() > 0) {
// Build tmp_gbwt_to_node: one entry per row of [top, bot) holding that
// row's node index relative to node_top. A node listed in
// node_iedge_count gets '.second' extra rows.
tmp_gbwt_to_node.clear();
index_t n = 0, e = 0;
for(index_t r = 0; r < (bot - top); r++) {
tmp_gbwt_to_node.push_back(n);
if(e < node_iedge_count.size()) {
assert_leq(n, node_iedge_count[e].first);
if(n == node_iedge_count[e].first) {
for(index_t a = 0; a < node_iedge_count[e].second; a++) {
tmp_gbwt_to_node.push_back(n);
r++;
}
e++;
}
}
n++;
}
assert_eq(bot - top, tmp_gbwt_to_node.size());
// Mark the row of each dollar (and a run of rows after it) as INDEX_MAX
// so the splitting loop below skips over them.
for(index_t i = 0; i < tmp_zOffs.size(); i++) {
assert_lt(top, tmp_zOffs[i]);
index_t diff = tmp_zOffs[i] - top;
assert_lt(diff, tmp_gbwt_to_node.size());
for(index_t j = diff + 1; j < tmp_gbwt_to_node.size(); j++) {
// NOTE(review): the left operand is indexed with 'i' (position in
// tmp_zOffs), not 'diff' (the dollar's row); this looks like it may
// have been meant to compare against the dollar row's node -- confirm.
if(tmp_gbwt_to_node[i] == tmp_gbwt_to_node[j]) {
tmp_gbwt_to_node[j] = (index_t)INDEX_MAX;
} else {
break;
}
}
tmp_gbwt_to_node[diff] = (index_t)INDEX_MAX;
}
// For each dollar, create a new GWState covering the rows between it and
// the next dollar (or bot, for the last one).
for(index_t i = 0; i < tmp_zOffs.size(); i++) {
// Note: might be able to do additional trimming off the end.
// Create a new range for the portion after the dollar.
index_t new_top = tmp_zOffs[i] + 1;
// Advance past rows that were marked INDEX_MAX above.
while(new_top - top < tmp_gbwt_to_node.size()) {
if(tmp_gbwt_to_node[new_top - top] != (index_t)INDEX_MAX) {
break;
}
new_top++;
}
assert_leq(new_top - top, tmp_gbwt_to_node.size());
// Nothing usable remains after this dollar: stop creating ranges.
if(new_top - top == tmp_gbwt_to_node.size()) {
#if 0
if(node_iedge_count.size() > 0 &&
node_iedge_count.back().first + 1 == node_bot - node_top) {
assert_gt(node_iedge_count.back().second, 0);
node_iedge_count.back().second -= 1;
if(node_iedge_count.back().second == 0) {
node_iedge_count.resize(node_iedge_count.size()- 1);
}
}
#endif
break;
}
index_t new_node_top = tmp_gbwt_to_node[new_top - top] + node_top;
assert_lt(new_node_top, node_bot);
index_t new_bot;
if(i + 1 < tmp_zOffs.size()) {
new_bot = tmp_zOffs[i+1];
} else {
new_bot = bot;
}
// new_bot2: first unmarked row at or after new_bot; its node gives the
// exclusive node bound of the new range (node_bot if no such row).
index_t new_bot2 = new_bot;
while(new_bot2 - top < tmp_gbwt_to_node.size()) {
if(tmp_gbwt_to_node[new_bot2 - top] != (index_t)INDEX_MAX) {
break;
}
new_bot2++;
}
index_t new_node_bot = node_bot;
if(new_bot2 - top < tmp_gbwt_to_node.size()) {
new_node_bot = node_top + tmp_gbwt_to_node[new_bot2 - top];
}
tmp_node_iedge_count.clear();
if(new_top >= new_bot) continue;
// Rebuild the edge-count list for the new range: every run of j2 - j
// consecutive rows with the same node value yields an entry with
// j2 - j - 1 extra rows, keyed by the node index relative to new_node_top.
for(index_t j = new_top - top; j + 1 < new_bot - top;) {
index_t n = tmp_gbwt_to_node[j];
index_t j2 = j + 1;
while(j2 < new_bot - top) {
if(n != tmp_gbwt_to_node[j2]) {
break;
}
j2++;
}
if(j + 1 < j2) {
tmp_node_iedge_count.expand();
assert_lt(node_top, new_node_top);
tmp_node_iedge_count.back().first = n - (new_node_top - node_top);
tmp_node_iedge_count.back().second = j2 - j - 1;
}
j = j2;
}
// NOTE(review): if this GWState is itself an element of 'st', expand()
// might relocate it while this function is still running -- confirm
// EList's growth behavior / that the caller reserves capacity.
st.expand();
st.back().reset();
st.back().initMap(new_node_bot - new_node_top);
// Carry the original-element indices over to the new state's map.
for(index_t j = new_node_top; j < new_node_bot; j++) {
st.back().map_[j - new_node_top] = map(j - node_top + mapi_);
}
// Initialize the new state at the same step count; it is addressed by
// its position at the back of 'st'.
st.back().init(
gfm,
ref,
sa,
st,
hit,
(index_t)st.size()-1,
reportList,
res,
new_top,
new_bot,
new_node_top,
new_node_bot,
tmp_node_iedge_count,
step,
met);
}
// Shrink this state to the portion before the first dollar.
assert_eq((index_t)map_.size(), node_bot - node_top + mapi_);
bot = tmp_zOffs[0];
assert_lt(bot - top, tmp_gbwt_to_node.size());
node_bot = tmp_gbwt_to_node[bot - top - 1] + node_top + 1;
map_.resize(node_bot - node_top + mapi_);
// Recompute the row width implied by the nodes and edge counts that
// remain, discarding entries at or beyond the new node bound.
index_t width = node_bot - node_top;
for(index_t e = 0; e < node_iedge_count.size(); e++) {
if(node_iedge_count[e].first >= node_bot - node_top) {
node_iedge_count.resize(e);
break;
}
width += node_iedge_count[e].second;
}
// If the implied width is one row too many, take one extra row away from
// the last entry (removing the entry if its count drops to zero).
if(width != bot - top) {
assert_eq(width, bot - top + 1);
assert_gt(node_iedge_count.size(), 0);
assert_gt(node_iedge_count.back().second, 0);
node_iedge_count.back().second -= 1;
if(node_iedge_count.back().second == 0) {
node_iedge_count.resize(node_iedge_count.size()- 1);
}
}
}
assert_gt(bot, top);
// Prepare SideLocus's for next step
// A single-row range only needs tloc; bloc is invalidated in that case.
if(bot-top > 1) {
SideLocus<index_t>::initFromTopBot(top, bot, gfm.gh(), gfm.gfm(), tloc, bloc);
assert(tloc.valid()); assert(tloc.repOk(gfm.gh()));
assert(bloc.valid()); assert(bloc.repOk(gfm.gh()));
} else {
tloc.initFromRow(top, gfm.gh(), gfm.gfm());
assert(tloc.valid()); assert(tloc.repOk(gfm.gh()));
bloc.invalidate();
}
return ret;
}
#ifndef NDEBUG
/**
* Check if this GWState is internally consistent.
*/
bool repOk(
const GFM<index_t>& gfm,
GWHit<index_t, T>& hit,
index_t range) const
{
// Debug-only consistency check; always returns true and relies on the
// asserts to flag problems.
// NOTE(review): this body uses hit.satup and off(i, hit), neither of which
// matches the GWHit members / off() signature visible in this file; it
// looks stale and may not compile if instantiated -- confirm.
assert(done() || bot > top);
assert(doneResolving(hit) || (tloc.valid() && tloc.repOk(gfm.gh())));
assert(doneResolving(hit) || bot == top+1 || (bloc.valid() && bloc.repOk(gfm.gh())));
assert_eq(map_.size()-mapi_, bot-top);
// Make sure that 'done' is compatible with whether we have >=
// 1 elements left to resolve.
// ('left' is counted below but never compared against anything.)
int left = 0;
for(size_t i = mapi_; i < map_.size(); i++) {
// Each live row must be the original row walked left 'step' times.
ASSERT_ONLY(index_t row = (index_t)(top + i - mapi_));
ASSERT_ONLY(index_t origRow = hit.satup->topf + map(i));
assert(step == 0 || row != origRow);
assert_eq(row, gfm.walkLeft(origRow, step));
assert_lt(map_[i], hit.satup->offs.size());
if(off(i, hit) == (index_t)OFF_MASK) left++;
}
assert(repOkMapRepeats());
assert(repOkMapInclusive(hit, range));
return true;
}
/**
* Return true iff this GWState is not obviously corrupt.
*/
bool repOkBasic() {
// The only invariant checked here: the range bounds are not inverted.
assert_geq(bot, top);
return true;
}
/**
* Check that the fmap elements pointed to by our map_ include all
* of the fmap elements that point to this range.
*/
bool repOkMapInclusive(GWHit<index_t, T>& hit, index_t range) const {
    for(size_t k = 0; k < hit.fmap.size(); k++) {
        // Only unresolved elements are of interest...
        if(hit.satup->offs[k] != (index_t)OFF_MASK) {
            continue;
        }
        // ...and among those, only the ones whose forward map names us.
        if(range != hit.fmap[k].first) {
            continue;
        }
        // Such an element must show up somewhere in the live part of map_.
        ASSERT_ONLY(bool found = false);
        size_t j = mapi_;
        while(j < map_.size()) {
            if(map(j) == k) {
                ASSERT_ONLY(found = true);
                break;
            }
            j++;
        }
        assert(found);
    }
    return true;
}
/**
* Check that no two elements in map_ are the same.
*/
bool repOkMapRepeats() const {
    // Pairwise comparison over the live part of map_ (from mapi_ onward).
    const size_t n = map_.size();
    for(size_t a = mapi_; a < n; a++) {
        for(size_t b = a + 1; b < n; b++) {
            assert_neq(map_[a], map_[b]);
        }
    }
    return true;
}
#endif
/**
* Return the offset currently assigned to the ith element. If it
* has not yet been resolved, return 0xffffffff.
*/
index_t off(
    index_t i,
    const SARangeWithOffs<T, index_t>& sa)
{
    // 'i' must address a live slot of this state's map.
    assert_geq(i, mapi_);
    assert_lt(i, map_.size());
    // Translate to the element's index in the original range, then look
    // up whatever offset is currently stored for it.
    const index_t origElt = map_[i];
    assert_lt(origElt, sa.offs.size());
    return sa.offs.get(origElt);
}
/**
* Return the offset of the element within the original range's
* PListSlice that the ith element of this range corresponds to.