diff --git a/project.clj b/project.clj
index e1516a9..b65b02b 100644
--- a/project.clj
+++ b/project.clj
@@ -1,4 +1,4 @@
-(defproject carocad/parcera "0.3.0"
+(defproject carocad/parcera "0.3.1"
   :description "Grammar-based Clojure(script) parser"
   :url "https://github.com/carocad/parcera"
   :license {:name "LGPLv3"
diff --git a/src/parcera/terminals.cljc b/src/parcera/terminals.cljc
index c9788de..b9e3c69 100644
--- a/src/parcera/terminals.cljc
+++ b/src/parcera/terminals.cljc
@@ -1,15 +1,28 @@
-(ns parcera.terminals)
+(ns parcera.terminals
+  "Clojure symbols, keywords, numbers and string/regex share quite a lot
+  of matching logic. This namespace is aimed towards clearly identifying
+  those pieces and share them among the different definitions to
+  avoid recurring issues")
 
-; todo: anchor ALL to the beginning of string
-; todo: try to avoid lookahead
+;; Clojure's reader is quite permissive so we follow the motto
+;; "if it is not forbidden, it is allowed"
+(def not-allowed "\\s\\(\\)\\[\\]{}\"@~\\^;`\\\\\\/,")
+(def allowed-characters (str "[^" not-allowed "]*"))
+(def not-number "(?![+-]?\\d+)")
+(def symbol-end "(?=[\\s\"()\\[\\]{},]|$)")
+
+(defn- name-pattern
+  [restriction]
+  (let [first-character (str "[^" restriction not-allowed "]")]
+    (str "(" first-character allowed-characters "\\/)?"
+         "(\\/|(" first-character allowed-characters "))"
+         symbol-end)))
+
+
+(def symbol-pattern (str not-number (name-pattern ":#\\'")))
+(def simple-keyword (str ":" (name-pattern ":")))
+(def macro-keyword (str "::" (name-pattern ":")))
 
-;; Clojure's reader is quite permissive so we follow the motto "if it is not forbidden, it is allowed"
-; todo: dont allow /
-(def NAME "[^\\s\\(\\)\\[\\]{}\"@~\\^;`\\\\]+")
-; todo: (?!\/) do i need that ?
-;; symbols cannot start with a number, :, # nor '
-; todo: no need for negative lookahead of chars
-(def symbol-pattern (str "(?![:#\\',]|[+-]?\\d+)(" NAME "\\/)?(\\/|(" NAME "))"))
 
 (def double-suffix "(((\\.\\d*)?([eE][-+]?\\d+)?)M?)")
 (def long-suffix "((0[xX]([\\dA-Fa-f]+)|0([0-7]+)|([1-9]\\d?)[rR]([\\d\\w]+)|0\\d+)?N?)")
@@ -21,19 +34,11 @@
 ; mentioned here: https://www.regular-expressions.info/unicode.html
 ; It's cooked by this generator: http://kourge.net/projects/regexp-unicode-block
 ; ticking all 'Combining Diacritical Marks' boxes *))
-; todo: repeated pattern could be simplified
-(def unicode-char "([^\\u0300-\\u036F\\u1DC0-\\u1DFF\\u20D0-\\u20FF][\\u0300-\\u036F\\u1DC0-\\u1DFF\\u20D0-\\u20FF]*)")
+(def unicode-char "([^\\u0300-\\u036F\\u1DC0-\\u1DFF\\u20D0-\\u20FF])")
 (def named-char "(newline|return|space|tab|formfeed|backspace)")
 (def unicode "(u[\\dD-Fd-f]{4})")
-; todo: use word boundary to avoid lookahead
 (def character-pattern (str "\\\\(" unicode-char "|" named-char "|" unicode ")(?!\\w+)"))
 
-; : is not allowed as first keyword character
-; todo: no need for negative lookahead of symbol
-(def simple-keyword (str ":(?!:)" symbol-pattern))
-(def macro-keyword (str "::(?!:)" NAME))
-
-
 (def string-pattern "\"[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*\"")
 
 (def regex-pattern (str "#" string-pattern))
diff --git a/test/parcera/test/core.cljc b/test/parcera/test/core.cljc
index 766d31b..0ae9324 100644
--- a/test/parcera/test/core.cljc
+++ b/test/parcera/test/core.cljc
@@ -132,7 +132,11 @@
     (as-> "{:hello ;2} 2}" input
           (and (is (valid? input))
                (is (roundtrip input))
-               (is (clear input))))))
+               (is (clear input)))))
+  (testing "symbols"
+    (as-> "hello/world/" input (is (not (valid? input))))
+    (as-> ":hello/world/" input (is (not (valid? input))))
+    (as-> "::hello/world/" input (is (not (valid? input))))))
 
 
 (deftest macros