Skip to content

Commit

Permalink
Allow for a chunked regexp API by exposing low-level tools.
Browse files Browse the repository at this point in the history
  • Loading branch information
ashinn committed Mar 17, 2024
1 parent 5b27b01 commit b303bf3
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 28 deletions.
82 changes: 55 additions & 27 deletions lib/chibi/regexp.scm
Original file line number Diff line number Diff line change
Expand Up @@ -300,11 +300,9 @@
(if (not (eq? m (searcher-matches sr1)))
(searcher-matches-set! sr1 (copy-regexp-match m)))))

;; Returns whichever of SR1 and SR2 holds the preferred match.  SR1 wins
;; when SR2 is not a searcher at all (e.g. #f), or when SR1's matches
;; compare >= SR2's according to regexp-match>=?.
(define (searcher-max sr1 sr2)
  (cond
   ;; Nothing to compare against: keep the first searcher.
   ((not (searcher? sr2)) sr1)
   ;; Both are searchers: prefer sr1 on a tie or a better match.
   ((regexp-match>=? (searcher-matches sr1) (searcher-matches sr2)) sr1)
   (else sr2)))
;; True when SR1 should be preferred over SR2: either SR2 is not a
;; searcher (e.g. #f, meaning nothing recorded yet), or SR1's matches
;; compare >= SR2's according to regexp-match>=?.
(define (searcher>=? sr1 sr2)
  (if (searcher? sr2)
      (regexp-match>=? (searcher-matches sr1) (searcher-matches sr2))
      #t))

;; Returns slot 0 of the searcher's match data.
;; NOTE(review): presumably the start cursor of the overall match --
;; callers compare it with string-cursor>?; confirm against the match
;; record layout.
(define (searcher-start-match sr)
  (let ((matches (searcher-matches sr)))
    (regexp-match-ref matches 0)))
Expand Down Expand Up @@ -344,6 +342,26 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Execution

;; The intermediate state of a regexp search. Differs from a match in that a
;; match has not necessarily occurred, and includes additional information
;; needed to resume searching.

;; Mutable record with three fields; the raw constructor takes all three
;; positionally (see make-regexp-state for the optional-argument wrapper).
(define-record-type Regexp-State
(%make-regexp-state searchers accept string)
regexp-state?
;; The set of searchers still pending; regexp-advance! stores its current
;; posse here when it stops.
(searchers regexp-state-searchers regexp-state-searchers-set!)
;; The best accepting searcher recorded so far, or #f if none yet.
(accept regexp-state-accept regexp-state-accept-set!)
;; The string being searched at the time `accept' was last updated;
;; #f until an accept state has been recorded.
(string regexp-state-string regexp-state-string-set!))

;; Public constructor: (make-regexp-state [searchers [accept]]).
;; SEARCHERS defaults to a fresh empty posse and ACCEPT defaults to #f.
;; The string field always starts out as #f.
(define (make-regexp-state . o)
  (let* ((have-searchers? (pair? o))
         (have-accept? (and have-searchers? (pair? (cdr o)))))
    (%make-regexp-state
     (if have-searchers? (car o) (posse))
     (if have-accept? (cadr o) #f)
     #f)))

;; Returns the match data of the state's accepting searcher, or #f when
;; no accept state has been recorded.
(define (regexp-state-matches state)
  (let ((accepted (regexp-state-accept state)))
    (and accepted (searcher-matches accepted))))

;; A transition which doesn't advance the index.

(define (epsilon-state? st)
Expand All @@ -370,7 +388,7 @@
;; Advance epsilons together - if the State is newly added to the
;; group and is an epsilon state, recursively add the transition.

(define (posse-advance! new seen accept sr str i start end)
(define (posse-advance! new seen state sr str i start end)
(let advance! ((sr sr))
(let ((st (searcher-state sr)))
;; Update match data.
Expand All @@ -394,7 +412,10 @@
;; Follow transitions.
(cond
((state-accept? st)
(set-cdr! accept (searcher-max sr (cdr accept))))
(cond
((searcher>=? sr (regexp-state-accept state))
(regexp-state-accept-set! state sr)
(regexp-state-string-set! state str))))
((posse-ref seen sr)
=> (lambda (sr-prev) (searcher-merge! sr-prev sr)))
((epsilon-state? st)
Expand Down Expand Up @@ -424,59 +445,66 @@
;; Add new searcher.
(posse-add! new sr))))))

;; Run so long as there is more to match.

(define (regexp-run-offsets search? rx str start end)
;;> Advances the search until an optimal match is found or the end of the string
;;> is reached, and returns the resulting regexp state.
(define (regexp-advance! search? init? rx str start end . o)
(let ((rx (regexp rx))
(epsilons (posse))
(accept (list #f)))
(state (if (pair? o) (car o) (make-regexp-state)))
(epsilons (posse)))
(let lp ((i start)
(searchers1 (posse))
(searchers2 (posse)))
;; Advance initial epsilons once from the first index, or every
;; time when searching.
(cond
((or search? (string-cursor=? i start))
(posse-advance! searchers1 epsilons accept (make-start-searcher rx str)
((or search? (and init? (string-cursor=? i start)))
(posse-advance! searchers1 epsilons state (make-start-searcher rx str)
str i start end)
(posse-clear! epsilons)))
(cond
((or (string-cursor>=? i end)
(and search?
(searcher? (cdr accept))
(let ((accept-start (searcher-start-match (cdr accept))))
(searcher? (regexp-state-accept state))
(let ((accept-start (searcher-start-match (regexp-state-accept state))))
(posse-every
(lambda (searcher)
(string-cursor>? (searcher-start-match searcher)
accept-start))
searchers1)))
(and (not search?)
(posse-empty? searchers1)))
;; Terminate when the string is done or there are no more
;; searchers. If we terminate prematurely and are not
;; searching, return false.
(and (searcher? (cdr accept))
(let ((matches (searcher-matches (cdr accept))))
(and (or search? (string-cursor>=? (regexp-match-ref matches 1)
end))
(searcher-matches (cdr accept))))))
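;; Terminate when the string is done or there are no more searchers or
;; we've found an accept state which started before any pending matches.
;; The pending searchers are saved in the state, which is always returned;
;; it is up to the caller to decide whether the recorded accept state (if
;; any) counts as a match.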
(regexp-state-searchers-set! state searchers1)
state)
(else
;; Otherwise advance normally.
;; Otherwise advance normally from searchers1, storing the new state in
;; searchers2, and recurse swapping the two (to reduce garbage).
(let ((ch (string-cursor-ref str i))
(i2 (string-cursor-next str i)))
(posse-for-each ;; NOTE: non-deterministic from hash order
(posse-for-each ;; NOTE: non-deterministic from hash order
(lambda (sr)
(cond
((state-matches? (searcher-state sr) str i ch
start end (searcher-matches sr))
(searcher-state-set! sr (state-next1 (searcher-state sr)))
;; Epsilons are considered at the next position.
(posse-advance! searchers2 epsilons accept sr str i2 start end)
(posse-advance! searchers2 epsilons state sr str i2 start end)
(posse-clear! epsilons))))
searchers1)
(posse-clear! searchers1)
(lp i2 searchers2 searchers1)))))))

;; Run so long as there is more to match.

;; Runs RX over STR between the START and END cursors via regexp-advance!
;; and returns the resulting match data, or #f on failure.  When SEARCH?
;; is false the match is only accepted if slot 1 of the match data is at
;; or past END.
(define (regexp-run-offsets search? rx str start end)
  (let* ((state (regexp-advance! search? #t rx str start end))
         (accepted (regexp-state-accept state)))
    (cond
     ;; No accept state was ever reached.
     ((not (searcher? accepted)) #f)
     (else
      (let ((matches (searcher-matches accepted)))
        (if (or search?
                (string-cursor>=? (regexp-match-ref matches 1) end))
            matches
            #f))))))

;; Wrapper to determine start and end offsets.

(define (regexp-run search? rx str . o)
Expand Down
8 changes: 7 additions & 1 deletion lib/chibi/regexp.sld
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,13 @@
regexp-match? regexp-match-count
regexp-match-submatch regexp-match-submatch/list
regexp-match-submatch-start regexp-match-submatch-end
regexp-match->list regexp-match->sexp)
regexp-match->list regexp-match->sexp
;; low-level
regexp-advance! regexp-state?
make-regexp-state regexp-state-accept
regexp-state-searchers regexp-state-matches
regexp-match-ref
)
(import (srfi 69))
;; Chibi's char-set library is more factored than SRFI-14.
(cond-expand
Expand Down

0 comments on commit b303bf3

Please sign in to comment.