From a58e9e3c12e780469a65561ab8357cbacf689e42 Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Tue, 15 Oct 2019 22:13:27 +0200 Subject: [PATCH 01/10] fix: disallow / as final char in a symbol fix: a symbol can only be followed by empty spaces or a collection --- src/parcera/terminals.cljc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/parcera/terminals.cljc b/src/parcera/terminals.cljc index c9788de..2a6e3ff 100644 --- a/src/parcera/terminals.cljc +++ b/src/parcera/terminals.cljc @@ -1,15 +1,14 @@ (ns parcera.terminals) -; todo: anchor ALL to the beginning of string ; todo: try to avoid lookahead ;; Clojure's reader is quite permissive so we follow the motto "if it is not forbidden, it is allowed" ; todo: dont allow / -(def NAME "[^\\s\\(\\)\\[\\]{}\"@~\\^;`\\\\]+") +(def NAME "[^\\s\\(\\)\\[\\]{}\"@~\\^;`\\\\\\/]+") ; todo: (?!\/) do i need that ? ;; symbols cannot start with a number, :, # nor ' ; todo: no need for negative lookahead of chars -(def symbol-pattern (str "(?![:#\\',]|[+-]?\\d+)(" NAME "\\/)?(\\/|(" NAME "))")) +(def symbol-pattern (str "(?![:#\\',]|[+-]?\\d+)(" NAME "\\/)?(\\/|(" NAME "))(?=[\\s\"()\\[\\]{}]|$)")) (def double-suffix "(((\\.\\d*)?([eE][-+]?\\d+)?)M?)") (def long-suffix "((0[xX]([\\dA-Fa-f]+)|0([0-7]+)|([1-9]\\d?)[rR]([\\d\\w]+)|0\\d+)?N?)") From 902123f31a4b5c95074e1d6fefe2e3d1c6051b86 Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Tue, 15 Oct 2019 22:21:25 +0200 Subject: [PATCH 02/10] fix: dont allow , as a valid symbol char fix: ::hello/world is not a valid macro keyword --- src/parcera/terminals.cljc | 4 ++-- test/parcera/test/core.cljc | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/parcera/terminals.cljc b/src/parcera/terminals.cljc index 2a6e3ff..0cbcb44 100644 --- a/src/parcera/terminals.cljc +++ b/src/parcera/terminals.cljc @@ -4,11 +4,11 @@ ;; Clojure's reader is quite permissive so we follow the motto "if it is not forbidden, it is allowed" ; todo: dont allow / -(def NAME "[^\\s\\(\\)\\[\\]{}\"@~\\^;`\\\\\\/]+") +(def NAME "[^\\s\\(\\)\\[\\]{}\"@~\\^;`\\\\\\/,]+") ; todo: (?!\/) do i need that ? ;; symbols cannot start with a number, :, # nor ' ; todo: no need for negative lookahead of chars -(def symbol-pattern (str "(?![:#\\',]|[+-]?\\d+)(" NAME "\\/)?(\\/|(" NAME "))(?=[\\s\"()\\[\\]{}]|$)")) +(def symbol-pattern (str "(?![:#\\',]|[+-]?\\d+)(" NAME "\\/)?(\\/|(" NAME "))(?=[\\s\"()\\[\\]{},]|$)")) (def double-suffix "(((\\.\\d*)?([eE][-+]?\\d+)?)M?)") (def long-suffix "((0[xX]([\\dA-Fa-f]+)|0([0-7]+)|([1-9]\\d?)[rR]([\\d\\w]+)|0\\d+)?N?)") diff --git a/test/parcera/test/core.cljc b/test/parcera/test/core.cljc index 766d31b..1c8887e 100644 --- a/test/parcera/test/core.cljc +++ b/test/parcera/test/core.cljc @@ -203,9 +203,9 @@ (is (clear input))))) (testing "keyword" - (as-> "::hello/world [1 a \"3\"]" input (and (is (valid? input)) - (is (roundtrip input)) - (is (clear input)))) + (as-> "::hello [1 a \"3\"]" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input)))) (as-> "::hello" input (and (is (valid? input)) (is (roundtrip input)) (is (clear input))))) From 1dfea2f6e2cf6bb1c18e31866b5665a96a5863c1 Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Tue, 15 Oct 2019 22:25:42 +0200 Subject: [PATCH 03/10] fix: macro keywords follow the same pattern as symbols rollback: test case with macro keyword --- src/parcera/terminals.cljc | 2 +- test/parcera/test/core.cljc | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/parcera/terminals.cljc b/src/parcera/terminals.cljc index 0cbcb44..3bf9407 100644 --- a/src/parcera/terminals.cljc +++ b/src/parcera/terminals.cljc @@ -31,7 +31,7 @@ ; : is not allowed as first keyword character ; todo: no need for negative lookahead of symbol (def simple-keyword (str ":(?!:)" symbol-pattern)) -(def macro-keyword (str "::(?!:)" NAME)) +(def macro-keyword (str "::(?!:)" symbol-pattern)) (def string-pattern "\"[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*\"") diff --git a/test/parcera/test/core.cljc b/test/parcera/test/core.cljc index 1c8887e..766d31b 100644 --- a/test/parcera/test/core.cljc +++ b/test/parcera/test/core.cljc @@ -203,9 +203,9 @@ (is (clear input))))) (testing "keyword" - (as-> "::hello [1 a \"3\"]" input (and (is (valid? input)) - (is (roundtrip input)) - (is (clear input)))) + (as-> "::hello/world [1 a \"3\"]" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input)))) (as-> "::hello" input (and (is (valid? input)) (is (roundtrip input)) (is (clear input))))) From 24001424ff38e6635b55a2e61443c35095c478c9 Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Tue, 15 Oct 2019 22:26:24 +0200 Subject: [PATCH 04/10] version bumped [skip ci] --- project.clj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/project.clj b/project.clj index e1516a9..b65b02b 100644 --- a/project.clj +++ b/project.clj @@ -1,4 +1,4 @@ -(defproject carocad/parcera "0.3.0" +(defproject carocad/parcera "0.3.1" :description "Grammar-based Clojure(script) parser" :url "https://github.com/carocad/parcera" :license {:name "LGPLv3" From baf88390e6674fefd80cd68bd7fe5e32765b7096 Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Tue, 15 Oct 2019 22:29:15 +0200 Subject: [PATCH 05/10] cosmetic changes --- src/parcera/terminals.cljc | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/parcera/terminals.cljc b/src/parcera/terminals.cljc index 3bf9407..9bdd2a2 100644 --- a/src/parcera/terminals.cljc +++ b/src/parcera/terminals.cljc @@ -1,14 +1,13 @@ (ns parcera.terminals) -; todo: try to avoid lookahead - -;; Clojure's reader is quite permissive so we follow the motto "if it is not forbidden, it is allowed" -; todo: dont allow / +;; Clojure's reader is quite permissive so we follow the motto +;; "if it is not forbidden, it is allowed" (def NAME "[^\\s\\(\\)\\[\\]{}\"@~\\^;`\\\\\\/,]+") -; todo: (?!\/) do i need that ? ;; symbols cannot start with a number, :, # nor ' ; todo: no need for negative lookahead of chars -(def symbol-pattern (str "(?![:#\\',]|[+-]?\\d+)(" NAME "\\/)?(\\/|(" NAME "))(?=[\\s\"()\\[\\]{},]|$)")) +(def first-character "(?![:#\\',]|[+-]?\\d+)") +(def symbol-end "(?=[\\s\"()\\[\\]{},]|$)") +(def symbol-pattern (str first-character "(" NAME "\\/)?(\\/|(" NAME "))" symbol-end)) (def double-suffix "(((\\.\\d*)?([eE][-+]?\\d+)?)M?)") (def long-suffix "((0[xX]([\\dA-Fa-f]+)|0([0-7]+)|([1-9]\\d?)[rR]([\\d\\w]+)|0\\d+)?N?)") From 2266af126e7018d70aa2ebddceeff40379e74b6a Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Tue, 15 Oct 2019 22:34:36 +0200 Subject: [PATCH 06/10] extra test cases added --- test/parcera/test/core.cljc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/test/parcera/test/core.cljc b/test/parcera/test/core.cljc index 766d31b..0ae9324 100644 --- a/test/parcera/test/core.cljc +++ b/test/parcera/test/core.cljc @@ -132,7 +132,11 @@ (as-> "{:hello ;2} 2}" input (and (is (valid? input)) (is (roundtrip input)) - (is (clear input)))))) + (is (clear input))))) + (testing "symbols" + (as-> "hello/world/" input (is (not (valid? input)))) + (as-> ":hello/world/" input (is (not (valid? input)))) + (as-> "::hello/world/" input (is (not (valid? input)))))) (deftest macros From 276349398efbde9228caf39b812bb1bd0e7b9058 Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Tue, 15 Oct 2019 23:00:29 +0200 Subject: [PATCH 07/10] fix: avoid negative lookahead --- src/parcera/terminals.cljc | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/src/parcera/terminals.cljc b/src/parcera/terminals.cljc index 9bdd2a2..935a2c1 100644 --- a/src/parcera/terminals.cljc +++ b/src/parcera/terminals.cljc @@ -2,12 +2,19 @@ ;; Clojure's reader is quite permissive so we follow the motto ;; "if it is not forbidden, it is allowed" -(def NAME "[^\\s\\(\\)\\[\\]{}\"@~\\^;`\\\\\\/,]+") -;; symbols cannot start with a number, :, # nor ' -; todo: no need for negative lookahead of chars -(def first-character "(?![:#\\',]|[+-]?\\d+)") +(def not-allowed "\\s\\(\\)\\[\\]{}\"@~\\^;`\\\\\\/,") +(def allowed-characters (str "[^" not-allowed "]*")) +(def not-number "(?![+-]?\\d+)") (def symbol-end "(?=[\\s\"()\\[\\]{},]|$)") -(def symbol-pattern (str first-character "(" NAME "\\/)?(\\/|(" NAME "))" symbol-end)) + +(defn- name-pattern + [restriction] + (let [first-character (str "[^" restriction not-allowed "]")] + (str "(" first-character allowed-characters "\\/)?" + "(\\/|(" first-character allowed-characters "))" + symbol-end))) + +(def symbol-pattern (str not-number (name-pattern ":#\\'"))) (def double-suffix "(((\\.\\d*)?([eE][-+]?\\d+)?)M?)") (def long-suffix "((0[xX]([\\dA-Fa-f]+)|0([0-7]+)|([1-9]\\d?)[rR]([\\d\\w]+)|0\\d+)?N?)") @@ -29,8 +36,8 @@ ; : is not allowed as first keyword character ; todo: no need for negative lookahead of symbol -(def simple-keyword (str ":(?!:)" symbol-pattern)) -(def macro-keyword (str "::(?!:)" symbol-pattern)) +(def simple-keyword (str ":" (name-pattern ":"))) +(def macro-keyword (str "::" (name-pattern ":"))) (def string-pattern "\"[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*\"") From 2a153a331d3ba5da42ef27a4c5781008c97bec53 Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Tue, 15 Oct 2019 23:17:01 +0200 Subject: [PATCH 08/10] cosmetic changes --- src/parcera/terminals.cljc | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/parcera/terminals.cljc b/src/parcera/terminals.cljc index 935a2c1..c282e42 100644 --- a/src/parcera/terminals.cljc +++ b/src/parcera/terminals.cljc @@ -1,4 +1,8 @@ -(ns parcera.terminals) +(ns parcera.terminals + "Clojure symbols, keywords, numbers and string/regex share quite a lot + of matching logic. This namespace is aimed towards clearly identifying + those pieces and share them among the different definitions to + avoid recurring issues") ;; Clojure's reader is quite permissive so we follow the motto ;; "if it is not forbidden, it is allowed" @@ -14,7 +18,11 @@ "(\\/|(" first-character allowed-characters "))" symbol-end))) + (def symbol-pattern (str not-number (name-pattern ":#\\'"))) +(def simple-keyword (str ":" (name-pattern ":"))) +(def macro-keyword (str "::" (name-pattern ":"))) + (def double-suffix "(((\\.\\d*)?([eE][-+]?\\d+)?)M?)") (def long-suffix "((0[xX]([\\dA-Fa-f]+)|0([0-7]+)|([1-9]\\d?)[rR]([\\d\\w]+)|0\\d+)?N?)") @@ -34,11 +42,5 @@ (def character-pattern (str "\\\\(" unicode-char "|" named-char "|" unicode ")(?!\\w+)")) -; : is not allowed as first keyword character -; todo: no need for negative lookahead of symbol -(def simple-keyword (str ":" (name-pattern ":"))) -(def macro-keyword (str "::" (name-pattern ":"))) - - (def string-pattern "\"[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*\"") (def regex-pattern (str "#" string-pattern)) From 4450349989ec441a642693b6698665f825c2e3cd Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Tue, 15 Oct 2019 23:18:47 +0200 Subject: [PATCH 09/10] removed unnecessary repetition in chars --- src/parcera/terminals.cljc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/parcera/terminals.cljc b/src/parcera/terminals.cljc index c282e42..0802cc8 100644 --- a/src/parcera/terminals.cljc +++ b/src/parcera/terminals.cljc @@ -35,7 +35,7 @@ ; It's cooked by this generator: http://kourge.net/projects/regexp-unicode-block ; ticking all 'Combining Diacritical Marks' boxes *)) ; todo: repeated pattern could be simplified -(def unicode-char "([^\\u0300-\\u036F\\u1DC0-\\u1DFF\\u20D0-\\u20FF][\\u0300-\\u036F\\u1DC0-\\u1DFF\\u20D0-\\u20FF]*)") +(def unicode-char "([^\\u0300-\\u036F\\u1DC0-\\u1DFF\\u20D0-\\u20FF])") (def named-char "(newline|return|space|tab|formfeed|backspace)") (def unicode "(u[\\dD-Fd-f]{4})") ; todo: use word boundary to avoid lookahead From b4ca5c659e55f00781e37bee1dc6bb400460e307 Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Tue, 15 Oct 2019 23:27:56 +0200 Subject: [PATCH 10/10] removed todos --- src/parcera/terminals.cljc | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/parcera/terminals.cljc b/src/parcera/terminals.cljc index 0802cc8..b9e3c69 100644 --- a/src/parcera/terminals.cljc +++ b/src/parcera/terminals.cljc @@ -34,11 +34,9 @@ ; mentioned here: https://www.regular-expressions.info/unicode.html ; It's cooked by this generator: http://kourge.net/projects/regexp-unicode-block ; ticking all 'Combining Diacritical Marks' boxes *)) -; todo: repeated pattern could be simplified (def unicode-char "([^\\u0300-\\u036F\\u1DC0-\\u1DFF\\u20D0-\\u20FF])") (def named-char "(newline|return|space|tab|formfeed|backspace)") (def unicode "(u[\\dD-Fd-f]{4})") -; todo: use word boundary to avoid lookahead (def character-pattern (str "\\\\(" unicode-char "|" named-char "|" unicode ")(?!\\w+)"))