Skip to content
Bruce Mitchener edited this page Jan 6, 2014 · 1 revision
;; -*-*-*- START re.goo -*-*-*-

;;
;; Name: re
;; Author: Neel Krishnaswami <[email protected]>
;; Date: 24 Sep 2002
;; Version: 0.1
;; Synopsis: This module implements a regexp engine in GOO.
;
; This module implements regular expression matching. These regexps
; don't need to be compiled before they can be matched against input.
; They can also match patterns of arbitrary objects in arbitrary
; sequences. Right now the main drawback is that no subexpression
; matching is possible to get substrings.
;
; Examples:
;
; (match? (re-alt (re-seq "foo") (re-seq "bar")) "foo") => #t
; (match? (re-alt (re-seq "foo") (re-seq "bar")) "baz") => #f
; (match? (re-cat (re-star (re-sym 'foo)) (re-sym 'bar))
;         '(foo foo bar)) => #t
;
; TODO:
;  o Add macros to enable Olin Shiver's SRE regexp syntax.
;  o Add infrastructure to enable subexpressions and semantic
;    actions.
;
(use goo)
(use tests/driver)
;
;; The basic regexp classes.
;
;; Classes and constructors.
;
(dc <regexp> (<any>))
;
;; re-zero
;; never successfully matches
;
(dc <re-zero> (<regexp>))
(dv re-zero (new <re-zero>))
;
;; re-unit
;; matches the empty string
;
(dc <re-unit> (<regexp>))
(dv re-unit (new <re-unit>))
;
;; re-sym (o|<any> => <regexp>) -- a regexp that matches one object
;
(dc <re-sym>  (<regexp>))
(dp val (<re-sym> => <any>))
;
(df re-sym (o|<any> => <re-sym>)
  (new <re-sym> val o))
;
;; re-any
;; This regexp matches any single object.
;
(dc <re-any> (<regexp>))
(dv re-any (new <re-any>))
;
;; star (r|<regexp> => <regexp>)
;; construct a regexp that matches r's pattern zero or more times
;
(dc <re-star> (<regexp>))
(dp re (<re-star> => <regexp>))
;
(dg re-star (r|<regexp> => <regexp>))
(dm re-star (r|<re-unit> => <regexp>) re-unit)
(dm re-star (r|<re-zero> => <regexp>) re-unit)
(dm re-star (r|<regexp> => <regexp>)  (new <re-star> re r))
;
;; re-plus (r|<regexp> => <regexp>)
;; construct a regexp that matches r's pattern one or more times.
;
(dc <re-plus> (<regexp>))
(dp re (<re-plus> => <regexp>))
;
(dg re-plus (r|<regexp> => <regexp>))
(dm re-plus (r|<re-unit> => <regexp>) re-unit)
(dm re-plus (r|<re-zero> => <regexp>) re-zero)
(dm re-plus (r|<regexp> => <regexp>)  (new <re-plus> re r))
;
;; re-opt (r|<regexp> => <regexp>)
;; construct a regexp that matches either r or epsilon
;
(dc <re-opt> (<regexp>))
(dp re (<re-opt> => <regexp>))
;
(dg re-opt (r|<regexp> => <regexp>))
(dm re-opt (r|<re-unit> => <regexp>) re-unit)
(dm re-opt (r|<re-zero> => <regexp>) re-unit)
(dm re-opt (r|<regexp> => <regexp>)  (new <re-opt> re r))
;
;; re-cat (a|<regexp> rest|... => <regexp>)
;; Construct the regexp that matches the concatenation of a and every
;; successive regexp it receives as an argument.
;
(dc <re-cat> (<regexp>))
(dp fst (<re-cat> => <regexp>))
(dp snd (<re-cat> => <regexp>))
;
;; re-cat* (a|<regexp>  b|<regexp> => <regexp>)
;; a utility function that makes an re that matches the concatenation of a & b.
;
(dg re-cat* (a|<regexp>  b|<regexp> => <regexp>))
(dm re-cat* (a|<re-zero> b|<regexp> => <regexp>)  re-zero)
(dm re-cat* (a|<regexp>  b|<re-zero> => <regexp>) re-zero)
(dm re-cat* (a|<re-zero> b|<re-zero> => <regexp>) re-zero)
(dm re-cat* (a|<re-unit> b|<regexp> => <regexp>)  b)
(dm re-cat* (a|<regexp>  b|<re-unit> => <regexp>) a)
(dm re-cat* (a|<re-unit> b|<re-unit> => <regexp>) re-unit)
(dm re-cat* (a|<regexp>  b|<regexp> => <regexp>)  (new <re-cat> fst a snd b))
;
(df re-cat (a|<regexp> rest|... => <regexp>)
  (if (empty? rest)
      a
      (re-cat* a (rep loop ((i 0))
                   (if (= i (len rest))
                       re-unit
                       (re-cat* ?rest i (loop (+ i 1))))))))
;
;
(dc <re-alt> (<regexp>))
(dp fst (<re-alt> => <regexp>))
(dp snd (<re-alt> => <regexp>))
;
;; re-alt* (a|<regexp> b|<regexp> => <regexp>)
;; re-alt* is a utility fn creates a regexp that can match EITHER a OR b
;;
(dg re-alt* (a|<regexp> b|<regexp> => <regexp>))
(dm re-alt* (a|<re-zero> b|<re-zero> => <regexp>) re-zero)
(dm re-alt* (a|<re-zero> b|<regexp> => <regexp>)  b)
(dm re-alt* (a|<regexp>  b|<re-zero> => <regexp>) a)
(dm re-alt* (a|<regexp> b|<regexp> => <regexp>) (new <re-alt> fst a snd b))
;
;; re-alt (a|<regexp> rest|... => <regexp>)
;; re-alt takes all of the regexps it receives as arguments and
;; constructs a new regexp that can match any of them.
(df re-alt (a|<regexp> rest|... => <regexp>)
  (fold re-alt* a rest))
;
;; nullable? (r|<regexp> => <log>)
;; if (nullable? r) => #t, then r matches the zero-length string
;
(dg nullable? (r|<regexp>  => <log>))
(dm nullable? (r|<re-zero> => <log>) #f)
(dm nullable? (r|<re-unit> => <log>) #t)
(dm nullable? (r|<re-sym>  => <log>) #f)
(dm nullable? (r|<re-any>  => <log>) #f)
(dm nullable? (r|<re-star> => <log>) #t)
(dm nullable? (r|<re-plus> => <log>) (nullable? (re r)))
(dm nullable? (r|<re-opt>  => <log>) #t)
(dm nullable? (r|<re-cat>  => <log>)
  (and (nullable? (fst r)) (nullable? (snd r))))
(dm nullable? (r|<re-alt>  => <log>)
  (or (nullable? (fst r)) (nullable? (snd r))))
;
;; resid (e|<regexp> x|<any> => <regexp>))
;; return a new regexp that recognizes the language L(e*) = {w | xw \in L(e)}
;
(dg resid (e|<regexp>  x|<any> => <regexp>))
(dm resid (e|<re-zero> x|<any> => <regexp>) re-zero)
(dm resid (e|<re-unit> x|<any> => <regexp>) re-zero)
(dm resid (e|<re-sym>  x|<any> => <regexp>)
  (if (= x (val e))
      re-unit
      re-zero))
(dm resid (e|<re-any>  x|<any> => <regexp>) re-unit)
(dm resid (e|<re-star> x|<any> => <regexp>)
  (re-cat (resid (re e) x) (re-star (re e))))
(dm resid (e|<re-plus> x|<any> => <regexp>)
  (re-cat (resid (re e) x) (re-star (re e))))
(dm resid (e|<re-opt>  x|<any> => <regexp>)
  (resid (re e) x))
(dm resid (e|<re-cat>  x|<any> => <regexp>)
  (def e1 (fst e))
  (def e2 (snd e))
  (if (nullable? e1)
      (re-alt (re-cat (resid e1 x) e2)
              (resid e2 x))
      (re-cat (resid e1 x) e2)))
(dm resid (e|<re-alt>  x|<any> => <regexp>)
  (re-alt (resid (fst e) x)
          (resid (snd e) x)))
;
;; match? (r|<regexp> s|<seq> => <log>)
;; (match? r s) is true, iff s is an element of L(r).
;
(df match? (r|<regexp> s|<seq> => <log>)
  (nullable? (fold resid r s)))
;
;; re-seq (s|<seq> => <regexp>)
;; Take a sequence of objects and return a regexp that matches them in
;
(df re-seq (s|<seq> => <regexp>)
  (fold {acc x . (re-cat (re-sym x) acc)}
        re-unit
        (rev s)))
;
(df re-choose (s|<seq> => <regexp>)
  (if (zero? (len s))
      re-unit
      (rep loop ((i 0))
        (if (= i (- (len s) 1))
            (re-sym ?s i)
            (re-alt (re-sym ?s i) (loop (+ i 1)))))))
;
;;
;
(export <regexp>)
(export re-zero)
(export re-unit)
(export re-sym)
(export re-any)
(export re-opt)
(export re-star)
(export re-plus)
(export re-cat)
(export re-alt)
(export re-seq)
(export re-choose)
;
(export match?)
;
;; -*-*-*- EMD re.goo -*-*-*-
Clone this wiki locally