Skip to content

Commit 8c87a57

Browse files
committed
styling, comments and PR feedback
1 parent b5bd4c5 commit 8c87a57

File tree

1 file changed

+140
-32
lines changed

1 file changed

+140
-32
lines changed

Objects/stringlib/fastsearch.h

+140-32
Original file line numberDiff line numberDiff line change
@@ -220,16 +220,40 @@ STRINGLIB(_lex_search)(const STRINGLIB_CHAR *p,
220220
{
221221
/* Do a lexicographic search. Essentially this:
222222
>>> max(needle[i:] for i in range(len(needle)+1))
223-
Also find the period of the right half. */
223+
Also find the period of the right half.
224+
Direction:
225+
dir : {-1, 1}
226+
if dir == -1, then the problem is reversed
227+
In short:
228+
_lex_search(x, -1) == _lex_search(x[::-1], 1)
229+
230+
Returned cut is "the size of the cut towards the chosen direction".
231+
E.g.:
232+
>>> x = '1234'
233+
>>> cut, period = factorize(x, dir=1) # cut = 0
234+
>>> cut
235+
0
236+
>>> cut_idx = cut
237+
>>> x[:cut_idx], x[cut_idx:]
238+
'', '1234'
239+
>>> x = '4321'
240+
>>> cut, period = factorize(x, dir=-1)
241+
>>> cut
242+
0
243+
>>> cut_idx = len(x) - cut
244+
>>> x[:cut_idx], x[cut_idx:]
245+
'4321', ''
246+
*/
224247
Py_ssize_t max_suffix = 0;
225248
Py_ssize_t candidate = 1;
226249
Py_ssize_t k = 0;
227250
// The period of the right half.
228251
Py_ssize_t period = 1;
252+
// stt is starting position from chosen direction
229253
Py_ssize_t stt = dir == 1 ? 0 : m - 1;
230254
STRINGLIB_CHAR a, b;
231255
while (candidate + k < m) {
232-
// each loop increases candidate + k + max_suffix
256+
// each loop increases (in chosen direction) candidate + k + max_suffix
233257
a = p[stt + dir*(candidate + k)];
234258
b = p[stt + dir*(max_suffix + k)];
235259
// check if the suffix at candidate is better than max_suffix
@@ -306,22 +330,12 @@ STRINGLIB(_factorize)(const STRINGLIB_CHAR *p,
306330
The length of this minimal repetition is 7, which is indeed the
307331
period of the original string.
308332
309-
This is how reverse direction compares to forward:
310-
returned cut is "the size of the cut from the start point". E.g.:
311-
>>> x = '1234'
312-
>>> cut, period = factorize(x, 1) # cut = 0
313-
>>> cut
314-
0
315-
>>> cut_idx = cut
316-
>>> x[:cut_idx], x[cut_idx:]
317-
'', '1234'
318-
>>> x = '4321'
319-
>>> cut, period = factorize(x, -1)
320-
>>> cut
321-
0
322-
>>> cut_idx = len(x) - cut
323-
>>> x[:cut_idx], x[cut_idx:]
324-
'4321', ''
333+
Direction:
334+
dir : {-1, 1}
335+
if dir == -1, then the problem is reversed
336+
In short:
337+
_factorize(x, -1) == _factorize(x[::-1], 1)
338+
See docstring of _lex_search if still unclear
325339
*/
326340
Py_ssize_t cut1, period1, cut2, period2, cut, period;
327341
cut1 = STRINGLIB(_lex_search)(p, m, &period1, 0, dir);
@@ -348,13 +362,13 @@ STRINGLIB(_factorize)(const STRINGLIB_CHAR *p,
348362
#define TABLE_MASK (TABLE_SIZE - 1U)
349363

350364
typedef struct STRINGLIB(_pre) {
351-
const STRINGLIB_CHAR *p;
352-
Py_ssize_t m;
353-
Py_ssize_t cut;
354-
Py_ssize_t period;
355-
Py_ssize_t gap;
365+
const STRINGLIB_CHAR *p; // needle
366+
Py_ssize_t m; // length of the needle
367+
Py_ssize_t cut; // Critical Factorization Cut
368+
Py_ssize_t period; // Global Period of the string
369+
Py_ssize_t gap; // "Good Suffix" Last Character Gap
356370
int is_periodic;
357-
SHIFT_TYPE table[TABLE_SIZE];
371+
SHIFT_TYPE table[TABLE_SIZE]; // Boyer-Moore "Bad Character" table
358372
} STRINGLIB(prework);
359373

360374

@@ -438,8 +452,14 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *s, Py_ssize_t n,
438452
Py_ssize_t maxcount, int mode,
439453
STRINGLIB(prework) *pw, int direction)
440454
{
441-
// Crochemore and Perrin's (1991) Two-Way algorithm.
442-
// See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260
455+
/* Crochemore and Perrin's (1991) Two-Way algorithm.
456+
See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260
457+
Bi-Directional Conventions:
458+
See docstring of horspool_find
459+
460+
Critical factorization reversal:
461+
See docstring of _factorize
462+
*/
443463
if (mode == FAST_COUNT) {
444464
LOG("Two-way Count.\n");
445465
}
@@ -464,7 +484,6 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *s, Py_ssize_t n,
464484
// Direction Independent
465485
const Py_ssize_t w = n - m;
466486
const Py_ssize_t m_m1 = m - 1;
467-
468487
// Direction Dependent
469488
const Py_ssize_t p_stt = dir == 1 ? 0 : m - 1;
470489
const Py_ssize_t s_stt = dir == 1 ? 0 : n - 1;
@@ -531,6 +550,7 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *s, Py_ssize_t n,
531550
if (j != m) {
532551
continue;
533552
}
553+
534554
j = Py_MIN(memory, cut); // Needed for j == cut below to be correct
535555
for (; j < cut; j++) {
536556
ihits++;
@@ -590,7 +610,93 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n,
590610
int direction, int dynamic)
591611
{
592612
/* Boyer–Moore–Horspool algorithm
593-
with optional dynamic fallback to Two-Way algorithm */
613+
with optional dynamic fallback to Two-Way algorithm
614+
Bi-Directional Conventions:
615+
stt - start index
616+
end - end index
617+
ss - pointer to last window index that matches last needle character
618+
>>> dir_fwd, dir_rev = 1, -1
619+
>>> s = [0, 1, 2, 3, 4, 5]
620+
>>> s_stt_fwd, s_stt_rev = 0, 5
621+
>>> s_end_fwd, s_end_rev = 5, 0
622+
>>> p = [0, 1]
623+
>>> m = len(p)
624+
>>> s = 0
625+
>>> ss_fwd = s + s_stt_fwd + dir_fwd * (m - 1)
626+
>>> ss_rev = s + s_stt_rev + dir_rev * (m - 1)
627+
>>> ss_fwd, ss_rev
628+
(1, 4)
629+
630+
There is one more important variable here: j_off
631+
It brings ss in alignment with a needle.
632+
So that it stands at the first absolute index of the window
633+
634+
>>> i = 0 # first step
635+
>>> p_stt_fwd, p_stt_rev = 0, 1
636+
>>> p_end_fwd, p_end_rev = 1, 0
637+
>>> j_off_fwd = dir_fwd * i - p_end_fwd
638+
>>> ss_fwd + j_off_fwd
639+
0
640+
641+
such that [0, 1, 2, 3, 4, 5]
642+
[0, 1]
643+
* - both indices are at 0 here
644+
645+
>>> j_off_rev = dir_rev * i - p_end_rev
646+
>>> ss_rev + j_off_rev
647+
4
648+
649+
such that [0, 1, 2, 3, 4, 5]
650+
[0, 1]
651+
* - both indices are at 0 here
652+
Finally, which side it iterates from is determined by:
653+
jp = p_stt + (reversed ? -j : j);
654+
655+
With this transformation the problem becomes direction agnostic
656+
657+
Dynamic mode
658+
'Horspool' algorithm will switch to `two_way_find` if it predicts
659+
that it can solve the problem faster.
660+
661+
Calibration
662+
The simple model for run time of search algorithm is as follows:
663+
loop - actual loop that happens (not theoretical)
664+
init_cost - initialization cost per 1 needle character in ns
665+
loop_cost - cost of 1 main loop
666+
hit_cost - cost of 1 false positive character check
667+
avg_hit - average number of false positive hits per 1 loop
668+
669+
>>> m = len(needle)
670+
>>> run_time = init_cost * m + n_loops * (loop_cost + hit_cost * avg_hit)
671+
672+
Calibrate:
673+
1. expose function to run without handling special cases first.
674+
2. set dynamic = 0
675+
3. Enable counter printing to know how many hits and loops happened
676+
iloop & ihits at the end of the function
677+
678+
4. init_cost = run_time(horspool_find(s='', p='*' * m)) / m
679+
680+
5. `two_way` only has loop cost.
681+
run_time(two_way_find(s='*' * 1000)) - init_cost
682+
loop_cost = ------------------------------------------------
683+
n_loops (from stdout)
684+
Note, iloop & ihits of `two_way` should be the same.
685+
686+
6. To get loop_cost and hit_cost of `horspool_find` solve
687+
equation system representing 2 different runs
688+
n_loops1 * loop_cost + n_hits1 * hit_cost = run_time(problem_1)
689+
n_loops2 * loop_cost + n_hits2 * hit_cost = run_time(problem_2)
690+
691+
init_cost of `horspool` for larger problems is negligible
692+
Furthermore, it is not used from within the function, as initialization has already happened by then
693+
694+
7. Run the above for different problems. If results differ, take averages.
695+
Compare with current calibration constants
696+
697+
8. It works well, but is not perfect.
698+
See if you can come up with a more accurate model.
699+
*/
594700
if (mode == FAST_COUNT) {
595701
LOG("Horspool Count.\n");
596702
}
@@ -614,7 +720,6 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n,
614720
const Py_ssize_t m_m1 = m - 1;
615721
const Py_ssize_t m_p1 = m + 1;
616722
const Py_ssize_t w = n - m;
617-
618723
// Direction Dependent
619724
const Py_ssize_t s_stt = dir == 1 ? 0 : n - 1;
620725
const Py_ssize_t p_stt = dir == 1 ? 0 : m - 1;
@@ -688,6 +793,7 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n,
688793
i += shift;
689794
continue;
690795
}
796+
691797
// assert(s_last == p_last); // true_gap
692798
// assert((s_last & TABLE_MASK) == (p_last & TABLE_MASK)); // else
693799
j_off = ip - p_end;
@@ -723,6 +829,7 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n,
723829
LOG("Move by table gap = %ld\n", gap);
724830
i += gap;
725831
}
832+
726833
if (dynamic) {
727834
if (ihits - ihits_last < 100 && iloop - iloop_last < 100) {
728835
continue;
@@ -1001,7 +1108,8 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n,
10011108
return res == 0 ? 0 : -1;
10021109
}
10031110
}
1004-
int dyn = 1;
1005-
int dir = mode != FAST_RSEARCH ? 1 : -1;
1006-
return STRINGLIB(horspool_find)(s, n, p, m, maxcount, mode, dir, dyn);
1111+
int dynamic = 1;
1112+
int direction = mode != FAST_RSEARCH ? 1 : -1;
1113+
return STRINGLIB(horspool_find)(s, n, p, m, maxcount, mode,
1114+
direction, dynamic);
10071115
}

0 commit comments

Comments
 (0)