diff --git a/api.lisp b/api.lisp index 21550e1..965875f 100644 --- a/api.lisp +++ b/api.lisp @@ -103,8 +103,9 @@ modify its first argument \(but only if it's a parse tree).")) (when flags (setq parse-tree (list :group (cons :flags flags) parse-tree)))) (let ((*syntax-error-string* nil)) - (multiple-value-bind (regex reg-num starts-with reg-names) + (multiple-value-bind (regex reg-num starts-with reg-names subpattern-refs) (convert parse-tree) + (declare (special subpattern-refs)) ;; simplify REGEX by flattening nested SEQ and ALTERNATION ;; constructs and gathering STR objects (let ((regex (gather-strings (flatten regex)))) @@ -132,6 +133,9 @@ modify its first argument \(but only if it's a parse tree).")) ;; initialize the counters for CREATE-MATCHER-AUX (*rep-num* 0) (*zero-length-num* 0) + ;; keep track of the matcher functions of registers + ;; referenced by subpattern references + (register-matchers (cons nil nil)) ;; create the actual matcher function (which does all the ;; work of matching the regular expression) corresponding ;; to REGEX and at the same time set the special @@ -148,7 +152,10 @@ modify its first argument \(but only if it's a parse tree).")) (create-bmh-matcher (str starts-with) (case-insensitive-p starts-with)))))) - (declare (special end-string-offset end-anchored-p end-string)) + (declare (special end-string-offset + end-anchored-p + end-string + register-matchers)) ;; now create the scanner and return it (values (create-scanner-aux match-fn (regex-min-length regex) diff --git a/closures.lisp b/closures.lisp index a275078..f7d7604 100644 --- a/closures.lisp +++ b/closures.lisp @@ -86,42 +86,108 @@ such that the call to NEXT-FN after the match would succeed.")) (defmethod create-matcher-aux ((register register) next-fn) (declare #.*standard-optimize-settings*) - ;; the position of this REGISTER within the whole regex; we start to - ;; count at 0 - (let ((num (num register))) - (declare (fixnum num)) - ;; STORE-END-OF-REG is a thin wrapper around NEXT-FN which will - ;; update the corresponding values of *REGS-START* and *REGS-END* - ;; after the inner matcher has succeeded - (flet ((store-end-of-reg (start-pos) - (declare (fixnum start-pos) - (function next-fn)) - (setf (svref *reg-starts* num) (svref *regs-maybe-start* num) - (svref *reg-ends* num) start-pos) - (funcall next-fn start-pos))) + (declare (special register-matchers)) + (let ((num (num register)) + (inner-register-count (inner-register-count register)) + ;; a place to store the next function to call when we arrive + ;; here via a subpattern reference + subpattern-ref-continuations) + (declare (fixnum num inner-register-count) + (list subpattern-ref-continuations)) + (labels + ((push-registers-state (new-starts new-maybe-starts new-ends) + (declare (list new-starts new-maybe-starts new-ends)) + ;; only push the register states for this register and registers + ;; local to it + (loop for idx from num upto (+ num inner-register-count) do + (locally (declare (fixnum idx)) + (push (svref *reg-ends* idx) (svref *reg-ends-stacks* idx)) + (setf (svref *reg-ends* idx) (pop new-ends)) + (push (svref *regs-maybe-start* idx) (svref *regs-maybe-start-stacks* idx)) + (setf (svref *regs-maybe-start* idx) (pop new-maybe-starts)) + (push (svref *reg-starts* idx) (svref *reg-starts-stacks* idx)) + (setf (svref *reg-starts* idx) (pop new-starts))))) + (pop-registers-state () + ;; return the state that was destroyed by this restore + (let (old-starts old-maybe-starts old-ends) + (declare (list old-starts old-maybe-starts old-ends)) + (loop for idx from (+ num inner-register-count) downto num do + (locally (declare (fixnum idx)) + (push (svref *reg-ends* idx) old-ends) + (setf (svref *reg-ends* idx) (pop (svref *reg-ends-stacks* idx))) + (push (svref *regs-maybe-start* idx) old-maybe-starts) + (setf (svref *regs-maybe-start* idx) (pop (svref *regs-maybe-start-stacks* idx))) + (push (svref *reg-starts* idx) old-starts) + (setf (svref *reg-starts* idx) (pop (svref *reg-starts-stacks* idx))))) + (values old-starts old-maybe-starts old-ends))) + ;; STORE-END-OF-REG is a thin wrapper around NEXT-FN which + ;; will update register offsets after the inner matcher has + ;; succeded + (store-end-of-reg (start-pos) + (declare (fixnum start-pos) + (function next-fn)) + (cond + (subpattern-ref-continuations + ;; we're returning from a register that was entered + ;; through a subpattern reference; restore the registers + ;; state as it was upon entering the subpattern + ;; reference, but save the intermediary state for when + ;; we have to backtrack or unwind + (multiple-value-bind (saved-starts saved-maybe-starts saved-ends) + (pop-registers-state) + (let ((next-fn (pop subpattern-ref-continuations))) + (prog1 (funcall (the function next-fn) start-pos) + ;; un-restore the registers state so we + ;; backtrack/unwind cleanly + (push-registers-state saved-starts saved-maybe-starts saved-ends) + (push next-fn subpattern-ref-continuations))))) + (t + ;; we're returning from a register that was entered + ;; directly save the start and end positions, and match + ;; the rest of the pattern + (setf (svref *reg-starts* num) (svref *regs-maybe-start* num) + (svref *reg-ends* num) start-pos) + (funcall next-fn start-pos))))) + (declare (inline push-registers-state pop-registers-state)) ;; the inner matcher is a closure corresponding to the regex ;; wrapped by this REGISTER (let ((inner-matcher (create-matcher-aux (regex register) #'store-end-of-reg))) (declare (function inner-matcher)) - ;; here comes the actual closure for REGISTER - (lambda (start-pos) - (declare (fixnum start-pos)) - ;; remember the old values of *REGS-START* and friends in - ;; case we cannot match - (let ((old-*reg-starts* (svref *reg-starts* num)) - (old-*regs-maybe-start* (svref *regs-maybe-start* num)) - (old-*reg-ends* (svref *reg-ends* num))) - ;; we cannot use *REGS-START* here because Perl allows - ;; regular expressions like /(a|\1x)*/ - (setf (svref *regs-maybe-start* num) start-pos) - (let ((next-pos (funcall inner-matcher start-pos))) - (unless next-pos - ;; restore old values on failure - (setf (svref *reg-starts* num) old-*reg-starts* - (svref *regs-maybe-start* num) old-*regs-maybe-start* - (svref *reg-ends* num) old-*reg-ends*)) - next-pos))))))) + ;; here comes the actual closure for REGISTER; save it in a + ;; special variable so it can be called by subpattern + ;; references + (setf (getf (car register-matchers) num) + (lambda (start-pos &optional cont) + (declare (fixnum start-pos)) + (cond + (cont + ;; the presence of CONT indicates that this + ;; register has been entered via a subpattern + ;; reference closure; save the registers state, + ;; creating fresh new "bindings" for the local + ;; register offsets; restore the state before + ;; returning to the caller + (push cont subpattern-ref-continuations) + (push-registers-state nil nil nil) + (prog1 + (funcall inner-matcher start-pos) + (pop-registers-state) + (pop subpattern-ref-continuations))) + (t + (let ((old-*reg-starts* (svref *reg-starts* num)) + (old-*regs-maybe-start* (svref *regs-maybe-start* num)) + (old-*reg-ends* (svref *reg-ends* num))) + ;; we cannot use *REG-STARTS* here because Perl + ;; allows regular expressions like /(a|\1x)*/ + (setf (svref *regs-maybe-start* num) start-pos) + (let ((next-pos (funcall inner-matcher start-pos))) + (unless next-pos + ;; restore old values on failure + (setf (svref *reg-starts* num) old-*reg-starts* + (svref *regs-maybe-start* num) old-*regs-maybe-start* + (svref *reg-ends* num) old-*reg-ends*)) + next-pos)))))))))) (defmethod create-matcher-aux ((lookahead lookahead) next-fn) (declare #.*standard-optimize-settings*) @@ -427,6 +493,20 @@ against CHR-EXPR." reg-start reg-end) (funcall next-fn next-pos))))))))) +(defmethod create-matcher-aux ((subpattern-reference subpattern-reference) next-fn) + (declare #.*standard-optimize-settings*) + (declare (special register-matchers) + (function next-fn)) + ;; close over the special variable REGISTER-MATCHERS in order to + ;; reference it during the match phase + (let ((num (num subpattern-reference)) + (register-matchers register-matchers)) + (declare (fixnum num) (function next-fn)) + (lambda (start-pos) + (funcall (the function (getf (car register-matchers) (1- num))) + start-pos + next-fn)))) + (defmethod create-matcher-aux ((branch branch) next-fn) (declare #.*standard-optimize-settings*) (let* ((test (test branch)) diff --git a/convert.lisp b/convert.lisp index c2683ae..d34288b 100644 --- a/convert.lisp +++ b/convert.lisp @@ -300,6 +300,12 @@ NIL or a STR object of the same case mode. Always returns NIL." (setq accumulate-start-p nil)))) nil) +(defun has-subpattern-ref-p (parse-tree) + (declare #.*standard-optimize-settings*) + (and (consp parse-tree) + (or (eql (car parse-tree) :subpattern-reference) + (some #'has-subpattern-ref-p (cdr parse-tree))))) + (declaim (inline convert-aux)) (defun convert-aux (parse-tree) "Converts the parse tree PARSE-TREE into a REGEX object and returns @@ -312,6 +318,12 @@ it. Will also - keep track of all named registers seen in the special variable REG-NAMES - keep track of the highest backreference seen in the special variable MAX-BACK-REF, + - keep track of the highest subpattern reference seen in the special + variable MAX-SUBPATTERN-REF, + - keep track of all numbered subpattern references seen in the + special variable NUMBERED-SUBPATTERN-REFS, + - keep track of all named subpattern references seen in the special + variable NAMED-SUBPATTERN-REFS, - maintain and adher to the currently applicable modifiers in the special variable FLAGS, and - maybe even wash your car..." @@ -575,7 +587,7 @@ called with GREEDYP set to NIL as you would expect." "The case for \(:REGISTER ). Also used for named registers when NAME is not NIL." (declare #.*standard-optimize-settings*) - (declare (special flags reg-num reg-names)) + (declare (special flags reg-num reg-names has-subpattern-ref accumulate-start-p)) ;; keep the effect of modifiers local to the enclosed regex; also, ;; assign the current value of REG-NUM to the corresponding slot of ;; the REGISTER object and increase this counter afterwards; for @@ -583,15 +595,28 @@ when NAME is not NIL." ;; slot of the REGISTER object too (let ((flags (copy-list flags)) (stored-reg-num reg-num)) - (declare (special flags reg-seen named-reg-seen)) + (declare (special flags reg-seen)) + (declare (fixnum stored-reg-num)) (setq reg-seen t) - (when name (setq named-reg-seen t)) (incf (the fixnum reg-num)) (push name reg-names) + ;; while inside registers, we cannot indiscriminately accumulate + ;; into the special variable STARTS-WITH because recursive + ;; subpattern references might cause too much of the string to be + ;; skipped or endless recursion, as with: + ;; (scan "(\\([^()]*(?:(?1)|\\))\\))" "(())") + ;; for now, we set ACCUMULATE-START-P to NIL if the regex has one + ;; or more subpattern references, but it may be possible to + ;; determine which registers are referenced--either now or at the + ;; matcher generation phase--and not needlessly throw away + ;; information that may be helpful in optimization + (when has-subpattern-ref + (setq accumulate-start-p nil)) (make-instance 'register :regex (convert-aux (if name (third parse-tree) (second parse-tree))) :num stored-reg-num - :name name))) + :name name + :inner-register-count (- (the fixnum reg-num) stored-reg-num 1)))) (defmethod convert-compound-parse-tree ((token (eql :named-register)) parse-tree &key) "The case for \(:NAMED-REGISTER )." @@ -669,6 +694,37 @@ when NAME is not NIL." (t (make-back-ref backref-number)))))) +(defmethod convert-compound-parse-tree ((token (eql :subpattern-reference)) parse-tree &key) + "The case for parse trees like \(:SUBPATTERN-REFERENCE |)." + (declare #.*standard-optimize-settings*) + (declare (special max-subpattern-ref + numbered-subpattern-refs + named-subpattern-refs + accumulate-start-p)) + ;; stop accumulating into STARTS-WITH + (setq accumulate-start-p nil) + ;; subpattern references may refer to registers that come later in + ;; the regex, so we don't validate the register name/number until + ;; the entire object has been constructed + (let* ((reg (second parse-tree)) + (reg-name (and (stringp reg) reg)) + (reg-num (and (null reg-name) + (typep reg 'fixnum) + (plusp reg) + reg))) + (when (not (or reg-name reg-num)) + (signal-syntax-error "Illegal subpattern reference: ~S." parse-tree)) + (if reg-name + (pushnew reg-name named-subpattern-refs :test #'string=) + (progn + (setf max-subpattern-ref (max max-subpattern-ref reg-num)) + (pushnew reg-num numbered-subpattern-refs :test #'=))) + (make-instance 'subpattern-reference + ;; for named references, register numbers will be + ;; computed later + :num (or reg-num -1) + :name (copy-seq reg-name)))) + (defmethod convert-compound-parse-tree ((token (eql :regex)) parse-tree &key) "The case for \(:REGEX )." (declare #.*standard-optimize-settings*) @@ -844,29 +900,80 @@ parse trees which are atoms.") (convert-aux (copy-tree translation)) (signal-syntax-error "Unknown token ~A in parse tree." parse-tree)))) +(defun convert-named-subpattern-refs (converted-tree) + "Convert named subpattern references to numbered references." + (declare #.*standard-optimize-settings*) + (declare (special reg-names reg-num numbered-subpattern-refs)) + (typecase converted-tree + (subpattern-reference + (when (= -1 (num converted-tree)) + ;; find which register corresponds to the given name + (let* ((reg-name (name converted-tree)) + (this-reg-num nil)) + ;; when more than one register have the same name, a named + ;; subpattern reference refers to the first + (loop for name in reg-names + for reg-index from 0 + when (string= name reg-name) + do (setf this-reg-num (- reg-num reg-index))) + (pushnew this-reg-num numbered-subpattern-refs :test #'=) + (setf (num converted-tree) this-reg-num)))) + (seq + (mapc #'convert-named-subpattern-refs (elements converted-tree))) + (alternation + (mapc #'convert-named-subpattern-refs (choices converted-tree))) + ((or lookahead lookbehind repetition register standalone) + (convert-named-subpattern-refs (regex converted-tree))) + (branch + (mapc #'convert-named-subpattern-refs (list (then-regex converted-tree) + (else-regex converted-tree)))))) + (defun convert (parse-tree) - "Converts the parse tree PARSE-TREE into an equivalent REGEX object -and returns three values: the REGEX object, the number of registers -seen and an object the regex starts with which is either a STR object -or an EVERYTHING object \(if the regex starts with something like -\".*\") or NIL." + "Converts the parse tree PARSE-TREE into an equivalent REGEX object and +returns five values: the REGEX object; the number of registers seen; an object +the regex starts with, which is either a STR object or an EVERYTHING object \(if +the regex starts with something like \".*\") or NIL; a list of named registers +defined in the REGEX; and a list of numbers denoting the registers referred to +by subpattern references in the REGEX." (declare #.*standard-optimize-settings*) ;; this function basically just initializes the special variables ;; and then calls CONVERT-AUX to do all the work (let* ((flags (list nil nil nil)) (reg-num 0) reg-names - named-reg-seen + numbered-subpattern-refs + named-subpattern-refs (accumulate-start-p t) starts-with (max-back-ref 0) + (max-subpattern-ref 0) + (has-subpattern-ref (has-subpattern-ref-p parse-tree)) (converted-parse-tree (convert-aux parse-tree))) - (declare (special flags reg-num reg-names named-reg-seen - accumulate-start-p starts-with max-back-ref)) + (declare (special flags reg-num + reg-names + accumulate-start-p + starts-with + max-back-ref + max-subpattern-ref + numbered-subpattern-refs + named-subpattern-refs + has-subpattern-ref)) ;; make sure we don't reference registers which aren't there (when (> (the fixnum max-back-ref) (the fixnum reg-num)) (signal-syntax-error "Backreference to register ~A which has not been defined." max-back-ref)) + (when (> (the fixnum max-subpattern-ref) + (the fixnum reg-num)) + (signal-syntax-error "Subpattern reference to register ~A which has not been defined." + max-subpattern-ref)) + (when named-subpattern-refs + (let ((nonexistent-regs + (set-difference named-subpattern-refs reg-names :test #'string=))) + (when nonexistent-regs + (signal-syntax-error + "Subpattern reference to named register \"~A\" which has not been defined." + (car nonexistent-regs)))) + (convert-named-subpattern-refs converted-parse-tree)) (when (typep starts-with 'str) (setf (slot-value starts-with 'str) (coerce (slot-value starts-with 'str) @@ -875,5 +982,7 @@ or an EVERYTHING object \(if the regex starts with something like (values converted-parse-tree reg-num starts-with ;; we can't simply use *ALLOW-NAMED-REGISTERS* ;; since parse-tree syntax ignores it - (when named-reg-seen - (nreverse reg-names))))) + (when (some #'identity reg-names) + (nreverse reg-names)) + (when numbered-subpattern-refs + (nreverse numbered-subpattern-refs))))) diff --git a/doc/index.html b/doc/index.html index 0d7ee98..21fb233 100644 --- a/doc/index.html +++ b/doc/index.html @@ -207,8 +207,9 @@

Scanning

in man perlre including extended features like non-greedy repetitions, positive and negative look-ahead and look-behind -assertions, "standalone" subexpressions, and conditional -subpatterns. The following Perl features are (currently) not +assertions, "standalone" subexpressions, conditional +subpatterns, and subpattern references, including recursive +references. The following Perl features are (currently) not supported: