@@ -220,16 +220,40 @@ STRINGLIB(_lex_search)(const STRINGLIB_CHAR *p,
220
220
{
221
221
/* Do a lexicographic search. Essentially this:
222
222
>>> max(needle[i:] for i in range(len(needle)+1))
223
- Also find the period of the right half. */
223
+ Also find the period of the right half.
224
+ Direction:
225
+ dir : {-1, 1}
226
+ if dir == -1, then the problem is reversed
227
+ In short:
228
+ _lex_search(x, -1) == _lex_search(x[::-1], 1)
229
+
230
+ Returned cut is "the size of the cut towards chosen direction".
231
+ E.g.:
232
+ >>> x = '1234'
233
+ >>> cut, period = factorize(x, dir=1) # cut = 0
234
+ >>> cut
235
+ 0
236
+ >>> cut_idx = cut
237
+ >>> x[:cut_idx], x[cut_idx:]
238
+ '', '1234'
239
+ >>> x = '4321'
240
+ >>> cut, period = factorize(x, dir=-1)
241
+ >>> cut
242
+ 0
243
+ >>> cut_idx = len(x) - cut
244
+ >>> x[:cut_idx], x[cut_idx:]
245
+ '4321', ''
246
+ */
224
247
Py_ssize_t max_suffix = 0 ;
225
248
Py_ssize_t candidate = 1 ;
226
249
Py_ssize_t k = 0 ;
227
250
// The period of the right half.
228
251
Py_ssize_t period = 1 ;
252
+ // stt is starting position from chosen direction
229
253
Py_ssize_t stt = dir == 1 ? 0 : m - 1 ;
230
254
STRINGLIB_CHAR a , b ;
231
255
while (candidate + k < m ) {
232
- // each loop increases candidate + k + max_suffix
256
+ // each loop increases (in chosen direction) candidate + k + max_suffix
233
257
a = p [stt + dir * (candidate + k )];
234
258
b = p [stt + dir * (max_suffix + k )];
235
259
// check if the suffix at candidate is better than max_suffix
@@ -306,22 +330,12 @@ STRINGLIB(_factorize)(const STRINGLIB_CHAR *p,
306
330
The length of this minimal repetition is 7, which is indeed the
307
331
period of the original string.
308
332
309
- This is how reverse direction compares to forward:
310
- returned cut is "the size of the cut from the start point". E.g.:
311
- >>> x = '1234'
312
- >>> cut, period = factorize(x, 1) # cut = 0
313
- >>> cut
314
- 0
315
- >>> cut_idx = cut
316
- >>> x[:cut_idx], x[cut_idx:]
317
- '', '1234'
318
- >>> x = '4321'
319
- >>> cut, period = factorize(x, -1)
320
- >>> cut
321
- 0
322
- >>> cut_idx = len(x) - cut
323
- >>> x[:cut_idx], x[cut_idx:]
324
- '4321', ''
333
+ Direction:
334
+ dir : {-1, 1}
335
+ if dir == -1, then the problem is reversed
336
+ In short:
337
+ _factorize(x, -1) == _factorize(x[::-1], 1)
338
+ See docstring of _lex_search if still unclear
325
339
*/
326
340
Py_ssize_t cut1 , period1 , cut2 , period2 , cut , period ;
327
341
cut1 = STRINGLIB (_lex_search )(p , m , & period1 , 0 , dir );
@@ -348,13 +362,13 @@ STRINGLIB(_factorize)(const STRINGLIB_CHAR *p,
348
362
#define TABLE_MASK (TABLE_SIZE - 1U)
349
363
350
364
typedef struct STRINGLIB (_pre ) {
351
- const STRINGLIB_CHAR * p ;
352
- Py_ssize_t m ;
353
- Py_ssize_t cut ;
354
- Py_ssize_t period ;
355
- Py_ssize_t gap ;
365
+ const STRINGLIB_CHAR * p ; // needle
366
+ Py_ssize_t m ; // length of the needle
367
+ Py_ssize_t cut ; // Critical Factorization Cut
368
+ Py_ssize_t period ; // Global Period of the string
369
+ Py_ssize_t gap ; // "Good Suffix" Last Character Gap
356
370
int is_periodic ;
357
- SHIFT_TYPE table [TABLE_SIZE ];
371
+ SHIFT_TYPE table [TABLE_SIZE ]; // Boyer-Moore "Bad Character" table
358
372
} STRINGLIB (prework );
359
373
360
374
@@ -438,8 +452,14 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *s, Py_ssize_t n,
438
452
Py_ssize_t maxcount , int mode ,
439
453
STRINGLIB (prework ) * pw , int direction )
440
454
{
441
- // Crochemore and Perrin's (1991) Two-Way algorithm.
442
- // See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260
455
+ /* Crochemore and Perrin's (1991) Two-Way algorithm.
456
+ See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260
457
+ Bi-Directional Conventions:
458
+ See docstring of horspool_find
459
+
460
+ Critical factorization reversal:
461
+ See docstring of _factorize
462
+ */
443
463
if (mode == FAST_COUNT ) {
444
464
LOG ("Two-way Count.\n" );
445
465
}
@@ -464,7 +484,6 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *s, Py_ssize_t n,
464
484
// Direction Independent
465
485
const Py_ssize_t w = n - m ;
466
486
const Py_ssize_t m_m1 = m - 1 ;
467
-
468
487
// Direction Dependent
469
488
const Py_ssize_t p_stt = dir == 1 ? 0 : m - 1 ;
470
489
const Py_ssize_t s_stt = dir == 1 ? 0 : n - 1 ;
@@ -531,6 +550,7 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *s, Py_ssize_t n,
531
550
if (j != m ) {
532
551
continue ;
533
552
}
553
+
534
554
j = Py_MIN (memory , cut ); // Needed for j == cut below to be correct
535
555
for (; j < cut ; j ++ ) {
536
556
ihits ++ ;
@@ -590,7 +610,93 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n,
590
610
int direction , int dynamic )
591
611
{
592
612
/* Boyer–Moore–Horspool algorithm
593
- with optional dynamic fallback to Two-Way algorithm */
613
+ with optional dynamic fallback to Two-Way algorithm
614
+ Bi-Directional Conventions:
615
+ stt - start index
616
+ end - end index
617
+ ss - pointer to last window index that matches last needle character
618
+ >>> dir_fwd, dir_rev = 1, -1
619
+ >>> s = [0, 1, 2, 3, 4, 5]
620
+ >>> s_stt_fwd, s_stt_rev = 0, 5
621
+ >>> s_end_fwd, s_end_rev = 5, 0
622
+ >>> p = [0, 1]
623
+ >>> m = len(p)
624
+ >>> s = 0
625
+ >>> ss_fwd = s + s_stt_fwd + dir_fwd * (m - 1)
626
+ >>> ss_rev = s + s_stt_rev + dir_rev * (m - 1)
627
+ >>> ss_fwd, ss_rev
628
+ (1, 4)
629
+
630
+ There is one more important variable here: j_off
631
+ It brings ss in alignment with a needle.
632
+ So that it stands at the first absolute index of the window
633
+
634
+ >>> i = 0 # first step
635
+ >>> p_stt_fwd, p_stt_rev = 0, 1
636
+ >>> p_end_fwd, p_end_rev = 1, 0
637
+ >>> j_off_fwd = dir_fwd * i - p_end_fwd
638
+ >>> ss_fwd + j_off_fwd
639
+ 0
640
+
641
+ such that [0, 1, 2, 3, 4, 5]
642
+ [0, 1]
643
+ * - both indices are at 0 here
644
+
645
+ >>> j_off_rev = dir_rev * i - p_end_rev
646
+ >>> ss_rev + j_off_rev
647
+ 4
648
+
649
+ such that [0, 1, 2, 3, 4, 5]
650
+ [0, 1]
651
+ * - both indices are at 0 here
652
+ Finally, which side it iterates from is determined by:
653
+ jp = p_stt + (reversed ? -j : j);
654
+
655
+ With this transformation the problem becomes direction agnostic
656
+
657
+ Dynamic mode
658
+ 'Horspool' algorithm will switch to `two_way_find` if it predicts
659
+ that `two_way_find` would solve the rest of the problem faster.
660
+
661
+ Calibration
662
+ The simple model for run time of search algorithm is as follows:
663
+ n_loops - number of main loop iterations that actually happen (not theoretical)
664
+ init_cost - initialization cost per 1 needle character in ns
665
+ loop_cost - cost of 1 main loop
666
+ hit_cost - cost of 1 false positive character check
667
+ avg_hit - average number of false positive hits per 1 loop
668
+
669
+ >>> m = len(needle)
670
+ >>> run_time = init_cost * m + n_loops * (loop_cost + hit_cost * avg_hit)
671
+
672
+ Calibrate:
673
+ 1. expose function to run without handling special cases first.
674
+ 2. set dynamic = 0
675
+ 3. Enable counter printing to know how many hits and loops happened
676
+ iloop & ihits at the end of the function
677
+
678
+ 4. init_cost = run_time(horspool_find(s='', p='*' * m)) / m
679
+
680
+ 5. `two_way` only has loop cost.
681
+ run_time(two_way_find(s='*' * 1000)) - init_cost
682
+ loop_cost = ------------------------------------------------
683
+ n_loops (from stdout)
684
+ Note, iloop & ihits of `two_way` should be the same.
685
+
686
+ 6. To get loop_cost and hit_cost of `horspool_find` solve
687
+ equation system representing 2 different runs
688
+ n_loops1 * loop_cost + n_hits1 * hit_cost = run_time(problem_1)
689
+ n_loops2 * loop_cost + n_hits2 * hit_cost = run_time(problem_2)
690
+
691
+ init_cost of `horspool` for larger problems is negligible
692
+ Furthermore, it is not used from within as it has already happened
693
+
694
+ 7. Run the above for different problems. If results differ, take averages.
695
+ Compare with current calibration constants
696
+
697
+ 8. It works well, but is not perfect.
698
+ See if you can come up with a more accurate model.
699
+ */
594
700
if (mode == FAST_COUNT ) {
595
701
LOG ("Horspool Count.\n" );
596
702
}
@@ -614,7 +720,6 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n,
614
720
const Py_ssize_t m_m1 = m - 1 ;
615
721
const Py_ssize_t m_p1 = m + 1 ;
616
722
const Py_ssize_t w = n - m ;
617
-
618
723
// Direction Dependent
619
724
const Py_ssize_t s_stt = dir == 1 ? 0 : n - 1 ;
620
725
const Py_ssize_t p_stt = dir == 1 ? 0 : m - 1 ;
@@ -688,6 +793,7 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n,
688
793
i += shift ;
689
794
continue ;
690
795
}
796
+
691
797
// assert(s_last == p_last); // true_gap
692
798
// assert((s_last & TABLE_MASK) == (p_last & TABLE_MASK)); // else
693
799
j_off = ip - p_end ;
@@ -723,6 +829,7 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n,
723
829
LOG ("Move by table gap = %ld\n" , gap );
724
830
i += gap ;
725
831
}
832
+
726
833
if (dynamic ) {
727
834
if (ihits - ihits_last < 100 && iloop - iloop_last < 100 ) {
728
835
continue ;
@@ -1001,7 +1108,8 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n,
1001
1108
return res == 0 ? 0 : -1 ;
1002
1109
}
1003
1110
}
1004
- int dyn = 1 ;
1005
- int dir = mode != FAST_RSEARCH ? 1 : -1 ;
1006
- return STRINGLIB (horspool_find )(s , n , p , m , maxcount , mode , dir , dyn );
1111
+ int dynamic = 1 ;
1112
+ int direction = mode != FAST_RSEARCH ? 1 : -1 ;
1113
+ return STRINGLIB (horspool_find )(s , n , p , m , maxcount , mode ,
1114
+ direction , dynamic );
1007
1115
}
0 commit comments