From d70aa681cb09df5d28cb45a1373983e0b9c08817 Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Wed, 9 Oct 2019 23:10:26 +0200 Subject: [PATCH 01/50] moved reusable parts to terminals ns redefined tokens as regexp --- src/parcera/core.clj | 98 +++++++++------------------------------ src/parcera/terminals.clj | 23 +++++++++ 2 files changed, 45 insertions(+), 76 deletions(-) create mode 100644 src/parcera/terminals.clj diff --git a/src/parcera/core.clj b/src/parcera/core.clj index 04557cb..a0fb07f 100644 --- a/src/parcera/core.clj +++ b/src/parcera/core.clj @@ -1,7 +1,10 @@ (ns parcera.core - (:require [instaparse.core :as instaparse])) + (:require [instaparse.core :as instaparse] + [instaparse.combinators-source :as combi] + [instaparse.cfg :as cfg] + [parcera.terminals :as terminal])) -(def grammar +(def grammar-rules "code: form*;
: whitespace ( literal @@ -26,10 +29,12 @@ map: map-namespace? <'{'> map-content <'}'> ; - map-namespace: <'#'> (keyword | auto-resolve); + map-namespace: <'#'> ( keyword | auto-resolve ); map-content: (form form)* + auto-resolve: '::'; + set: <'#{'> form* <'}'> ; : @@ -43,12 +48,6 @@ symbolic: #'##(Inf|-Inf|NaN)' - number: ( DOUBLE | RATIO | LONG ) !symbol (* remove ambiguity with symbols 1/5 - 1 -> number, / -> symbol, 5 -> number *); - - character: <'\\\\'> ( SIMPLE-CHAR | UNICODE-CHAR ) !symbol (* remove ambiguity with symbols \backspace - \b -> character, ackspace -> symbol *); - : dispatch | metadata @@ -89,57 +88,24 @@ conditional-splicing: <'#?@'> list; - string : <'\"'> #'[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*' <'\"'>; + string : #'\"[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*\"'; - symbol: !SYMBOL-HEAD name; + symbol: !number symbol-body : simple-keyword | macro-keyword ; - auto-resolve: '::' ; - - simple-keyword: <':'> !':' name; - - macro-keyword: !':' name; - - comment: <';'> #'.*'; - - (* - ;; symbols cannot start with number, :, # - ;; / is a valid symbol as long as it is not part of the name - ;; note: added ' as invalid first character due to ambiguity in #'hello - ;; -> [:tag [:symbol 'hello]] - ;; -> [:var-quote [:symbol hello]] - *) - SYMBOL-HEAD: number | ':' | '#' | '\\'' - - (* - ;; NOTE: several characters are not allowed according to clojure reference. - ;; https://clojure.org/reference/reader#_symbols - ;; EDN reader says otherwise https://github.com/edn-format/edn#symbols - ;; nil, true, false are actually symbols with special meaning ... 
not grammar rules - ;; on their own - VALID-CHARACTERS>: #'[^\\s\\(\\)\\[\\]{}\"@~\\^;`]+' - *) - : #'([^\\s\\(\\)\\[\\]{}\"@~,\\\\^;`]+\\/)?(\\/|([^\\s\\(\\)\\[\\]{}\"@~,\\\\^;`]+))(?!\\/)' + comment: #';.*';") - (* HIDDEN PARSERS ------------------------------------------------------ *) +(def grammar-terminals + {:character (combi/regexp terminal/CHARACTER) + :symbol-body (combi/hide-tag (combi/regexp terminal/SYMBOL)) + :number (combi/regexp terminal/NUMBER) + :macro-keyword (combi/regexp terminal/MACRO-KEYWORD) + :simple-keyword (combi/regexp terminal/SIMPLE-KEYWORD)}) - : #'[-+]?(\\d+(\\.\\d*)?([eE][-+]?\\d+)?)(M)?' +(combi/regexp terminal/NUMBER) - : #'[-+]?(\\d+)/(\\d+)'; - - : #'[-+]?(?:(0)|([1-9]\\d*)|0[xX]([\\dA-Fa-f]+)|0([0-7]+)|([1-9]\\d?)[rR]([\\d\\w]+)|0\\d+)(N)?'; - - : #'u[\\dD-Fd-f]{4}'; - - : - 'newline' - | 'return' - | 'space' - | 'tab' - | 'formfeed' - | 'backspace' - | #'\\P{M}\\p{M}*+'; (* https://www.regular-expressions.info/unicode.html *)") +(def grammar (merge (cfg/ebnf grammar-rules) grammar-terminals)) (def clojure @@ -154,7 +120,7 @@ For a description of all possible options, visit Instaparse's official documentation: https://github.com/Engelberg/instaparse#reference" - (instaparse/parser grammar)) + (instaparse/parser grammar :start :code)) (defn- code* @@ -193,30 +159,10 @@ (doseq [child (rest ast)] (code* child string-builder)) (. string-builder (append "}"))) - (:number :whitespace :symbolic :auto-resolve :symbol) + (:number :whitespace :symbolic :auto-resolve :symbol :simple-keyword + :macro-keyword :comment :character :string) (. string-builder (append (second ast))) - :string - (do (. string-builder (append "\"")) - (. string-builder (append (second ast))) - (. string-builder (append "\""))) - - :character - (do (. string-builder (append "\\")) - (. string-builder (append (second ast)))) - - :simple-keyword - (do (. string-builder (append ":")) - (. string-builder (append (second ast)))) - - :macro-keyword - (do (. 
string-builder (append "::")) - (. string-builder (append (second ast)))) - - :comment - (do (. string-builder (append ";")) - (. string-builder (append (second ast)))) - :metadata (do (. string-builder (append "^")) (doseq [child (rest ast)] (code* child string-builder))) diff --git a/src/parcera/terminals.clj b/src/parcera/terminals.clj new file mode 100644 index 0000000..dfdc095 --- /dev/null +++ b/src/parcera/terminals.clj @@ -0,0 +1,23 @@ +(ns parcera.terminals) + +;; Clojure's reader is quite permissive so we follow the motto "if it is not forbidden, it is allowed" +(def NAME "[^\\s\\(\\)\\[\\]{}\"@~\\^;`]+") +; todo: (?!\/) do i need that ? +;; symbols cannot start with a number, :, # nor ' +(def SYMBOL (str "(?![:#\\'])(" NAME "\\/)?(\\/|(" NAME "))")) + + +(def DOUBLE "((\\d+(\\.\\d*)?([eE][-+]?\\d+)?)(M)?)") +; todo: (0)|([1-9]\d*) is this needed ? +(def LONG "(((0)|([1-9]\\d*)|0[xX]([\\dA-Fa-f]+)|0([0-7]+)|([1-9]\\d?)[rR]([\\d\\w]+)|0\\d+)(N)?)") +(def RATIO "((\\d+)\\/(\\d+))") +(def NUMBER (str "[+-]?(" LONG "|" DOUBLE "|" RATIO ")(?![\\.\\/])")) + +(def unicode-char "(\\P{M}\\p{M}*+)") ;; https://www.regular-expressions.info/unicode.html +(def named-char "(newline|return|space|tab|formfeed|backspace)") +(def unicode "(u[\\dD-Fd-f]{4})") +(def CHARACTER (str "\\\\" unicode-char "|" named-char "|" unicode)) + +; : is not allowed as first keyword character +(def SIMPLE-KEYWORD (str ":(?!:)" SYMBOL)) +(def MACRO-KEYWORD (str "::(?!:)" NAME)) From dadff0312e06e384d301463d3008cc8c8c0a13dd Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Wed, 9 Oct 2019 23:23:22 +0200 Subject: [PATCH 02/50] fix: all number rules start with \d+ --- src/parcera/terminals.clj | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/parcera/terminals.clj b/src/parcera/terminals.clj index dfdc095..ae176f3 100644 --- a/src/parcera/terminals.clj +++ b/src/parcera/terminals.clj @@ -7,11 +7,11 @@ (def SYMBOL (str "(?![:#\\'])(" NAME "\\/)?(\\/|(" NAME "))")) 
-(def DOUBLE "((\\d+(\\.\\d*)?([eE][-+]?\\d+)?)(M)?)") +(def DOUBLE "(((\\.\\d*)?([eE][-+]?\\d+)?)M?)") ; todo: (0)|([1-9]\d*) is this needed ? -(def LONG "(((0)|([1-9]\\d*)|0[xX]([\\dA-Fa-f]+)|0([0-7]+)|([1-9]\\d?)[rR]([\\d\\w]+)|0\\d+)(N)?)") -(def RATIO "((\\d+)\\/(\\d+))") -(def NUMBER (str "[+-]?(" LONG "|" DOUBLE "|" RATIO ")(?![\\.\\/])")) +(def LONG "((0[xX]([\\dA-Fa-f]+)|0([0-7]+)|([1-9]\\d?)[rR]([\\d\\w]+)|0\\d+)?N?)") +(def RATIO "(\\/(\\d+))") +(def NUMBER (str "[+-]?\\d+(" LONG "|" DOUBLE "|" RATIO ")(?![\\.\\/])")) (def unicode-char "(\\P{M}\\p{M}*+)") ;; https://www.regular-expressions.info/unicode.html (def named-char "(newline|return|space|tab|formfeed|backspace)") From 583db51943754f547fa1de342e5580d7d5594a02 Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Wed, 9 Oct 2019 23:29:35 +0200 Subject: [PATCH 03/50] fix: a name cannot contain escape character \ --- src/parcera/terminals.clj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/parcera/terminals.clj b/src/parcera/terminals.clj index ae176f3..685e490 100644 --- a/src/parcera/terminals.clj +++ b/src/parcera/terminals.clj @@ -1,7 +1,7 @@ (ns parcera.terminals) ;; Clojure's reader is quite permissive so we follow the motto "if it is not forbidden, it is allowed" -(def NAME "[^\\s\\(\\)\\[\\]{}\"@~\\^;`]+") +(def NAME "[^\\s\\(\\)\\[\\]{}\"@~\\^;`\\\\]+") ; todo: (?!\/) do i need that ? 
;; symbols cannot start with a number, :, # nor ' (def SYMBOL (str "(?![:#\\'])(" NAME "\\/)?(\\/|(" NAME "))")) From 72ba3eb4c3f7335bf145438d4680f8f334459429 Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Wed, 9 Oct 2019 23:32:51 +0200 Subject: [PATCH 04/50] cosmetic changes --- src/parcera/core.clj | 12 +++++------- src/parcera/terminals.clj | 16 ++++++++-------- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/src/parcera/core.clj b/src/parcera/core.clj index a0fb07f..cff49c5 100644 --- a/src/parcera/core.clj +++ b/src/parcera/core.clj @@ -97,13 +97,11 @@ comment: #';.*';") (def grammar-terminals - {:character (combi/regexp terminal/CHARACTER) - :symbol-body (combi/hide-tag (combi/regexp terminal/SYMBOL)) - :number (combi/regexp terminal/NUMBER) - :macro-keyword (combi/regexp terminal/MACRO-KEYWORD) - :simple-keyword (combi/regexp terminal/SIMPLE-KEYWORD)}) - -(combi/regexp terminal/NUMBER) + {:character (combi/regexp terminal/character-pattern) + :symbol-body (combi/hide-tag (combi/regexp terminal/symbol-pattern)) + :number (combi/regexp terminal/number-pattern) + :macro-keyword (combi/regexp terminal/macro-keyword) + :simple-keyword (combi/regexp terminal/simple-keyword)}) (def grammar (merge (cfg/ebnf grammar-rules) grammar-terminals)) diff --git a/src/parcera/terminals.clj b/src/parcera/terminals.clj index 685e490..a930122 100644 --- a/src/parcera/terminals.clj +++ b/src/parcera/terminals.clj @@ -4,20 +4,20 @@ (def NAME "[^\\s\\(\\)\\[\\]{}\"@~\\^;`\\\\]+") ; todo: (?!\/) do i need that ? ;; symbols cannot start with a number, :, # nor ' -(def SYMBOL (str "(?![:#\\'])(" NAME "\\/)?(\\/|(" NAME "))")) +(def symbol-pattern (str "(?![:#\\'])(" NAME "\\/)?(\\/|(" NAME "))")) -(def DOUBLE "(((\\.\\d*)?([eE][-+]?\\d+)?)M?)") +(def double-suffix "(((\\.\\d*)?([eE][-+]?\\d+)?)M?)") ; todo: (0)|([1-9]\d*) is this needed ? 
-(def LONG "((0[xX]([\\dA-Fa-f]+)|0([0-7]+)|([1-9]\\d?)[rR]([\\d\\w]+)|0\\d+)?N?)") -(def RATIO "(\\/(\\d+))") -(def NUMBER (str "[+-]?\\d+(" LONG "|" DOUBLE "|" RATIO ")(?![\\.\\/])")) +(def long-suffix "((0[xX]([\\dA-Fa-f]+)|0([0-7]+)|([1-9]\\d?)[rR]([\\d\\w]+)|0\\d+)?N?)") +(def ratio-suffix "(\\/(\\d+))") +(def number-pattern (str "[+-]?\\d+(" long-suffix "|" double-suffix "|" ratio-suffix ")(?![\\.\\/])")) (def unicode-char "(\\P{M}\\p{M}*+)") ;; https://www.regular-expressions.info/unicode.html (def named-char "(newline|return|space|tab|formfeed|backspace)") (def unicode "(u[\\dD-Fd-f]{4})") -(def CHARACTER (str "\\\\" unicode-char "|" named-char "|" unicode)) +(def character-pattern (str "\\\\" unicode-char "|" named-char "|" unicode)) ; : is not allowed as first keyword character -(def SIMPLE-KEYWORD (str ":(?!:)" SYMBOL)) -(def MACRO-KEYWORD (str "::(?!:)" NAME)) +(def simple-keyword (str ":(?!:)" symbol-pattern)) +(def macro-keyword (str "::(?!:)" NAME)) From 575c604206ecbb2144df1df7bcfbe3c838640a39 Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Wed, 9 Oct 2019 23:48:44 +0200 Subject: [PATCH 05/50] fix: a character cannot end with more letters --- src/parcera/terminals.clj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/parcera/terminals.clj b/src/parcera/terminals.clj index a930122..81f5963 100644 --- a/src/parcera/terminals.clj +++ b/src/parcera/terminals.clj @@ -16,7 +16,7 @@ (def unicode-char "(\\P{M}\\p{M}*+)") ;; https://www.regular-expressions.info/unicode.html (def named-char "(newline|return|space|tab|formfeed|backspace)") (def unicode "(u[\\dD-Fd-f]{4})") -(def character-pattern (str "\\\\" unicode-char "|" named-char "|" unicode)) +(def character-pattern (str "\\\\(" unicode-char "|" named-char "|" unicode ")(?!\\w+)")) ; : is not allowed as first keyword character (def simple-keyword (str ":(?!:)" symbol-pattern)) From 048cfd90de9a8c63de0996fe80eee1702fb7cc4c Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Fri, 11 Oct 
2019 14:27:36 +0200 Subject: [PATCH 06/50] refactored strings and regex rules to be terminals --- src/parcera/core.cljc | 17 +++++------------ src/parcera/terminals.clj | 6 ++++++ 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/src/parcera/core.cljc b/src/parcera/core.cljc index c383246..60d9bf7 100644 --- a/src/parcera/core.cljc +++ b/src/parcera/core.cljc @@ -67,8 +67,6 @@ : ( symbol | string | keyword ); - regex: <'#'> string; - var-quote: <'#\\''> symbol; quote: <'\\''> form; @@ -89,8 +87,6 @@ conditional-splicing: <'#?@'> list; - string : #'\"[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*\"'; - symbol: !number symbol-body : simple-keyword | macro-keyword ; @@ -99,10 +95,12 @@ (def grammar-terminals {:character (combi/regexp terminal/character-pattern) + :string (combi/regexp terminal/string-pattern) :symbol-body (combi/hide-tag (combi/regexp terminal/symbol-pattern)) :number (combi/regexp terminal/number-pattern) :macro-keyword (combi/regexp terminal/macro-keyword) - :simple-keyword (combi/regexp terminal/simple-keyword)}) + :simple-keyword (combi/regexp terminal/simple-keyword) + :regex (combi/regexp terminal/regex-pattern)}) (def grammar (merge (cfg/ebnf grammar-rules) grammar-terminals)) @@ -160,7 +158,7 @@ (. string-builder (append "}"))) (:number :whitespace :symbolic :auto-resolve :symbol :simple-keyword - :macro-keyword :comment :character :string) + :macro-keyword :comment :character :string :regex) (. string-builder (append (second ast))) :metadata @@ -171,10 +169,6 @@ (do (. string-builder (append "'")) (doseq [child (rest ast)] (code* child string-builder))) - :regex - (do (. string-builder (append "#")) - (code* (second ast) string-builder)) - :var-quote (do (. 
string-builder (append "#'")) (code* (second ast) string-builder)) @@ -226,8 +220,7 @@ In general (= input (parcera/code (parcera/clojure input)))" [ast] - (let [string-builder #?(:clj (new StringBuilder) - :cljs (new StringBuffer))] + (let [string-builder (new StringBuilder)] (code* ast string-builder) (. string-builder (toString)))) diff --git a/src/parcera/terminals.clj b/src/parcera/terminals.clj index 81f5963..7ea00bc 100644 --- a/src/parcera/terminals.clj +++ b/src/parcera/terminals.clj @@ -13,11 +13,17 @@ (def ratio-suffix "(\\/(\\d+))") (def number-pattern (str "[+-]?\\d+(" long-suffix "|" double-suffix "|" ratio-suffix ")(?![\\.\\/])")) + (def unicode-char "(\\P{M}\\p{M}*+)") ;; https://www.regular-expressions.info/unicode.html (def named-char "(newline|return|space|tab|formfeed|backspace)") (def unicode "(u[\\dD-Fd-f]{4})") (def character-pattern (str "\\\\(" unicode-char "|" named-char "|" unicode ")(?!\\w+)")) + ; : is not allowed as first keyword character (def simple-keyword (str ":(?!:)" symbol-pattern)) (def macro-keyword (str "::(?!:)" NAME)) + + +(def string-pattern "\"[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*\"") +(def regex-pattern (str "#" string-pattern)) From 8b3cba2a13f14f6fabdc7af639dd562220ec1b1b Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Fri, 11 Oct 2019 14:41:52 +0200 Subject: [PATCH 07/50] fix: comments are not literal forms --- src/parcera/core.cljc | 25 +++++++++++++------------ test/parcera/test/core.cljc | 5 +++++ 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/src/parcera/core.cljc b/src/parcera/core.cljc index 60d9bf7..5093564 100644 --- a/src/parcera/core.cljc +++ b/src/parcera/core.cljc @@ -8,14 +8,16 @@ (def grammar-rules "code: form*; - : whitespace ( literal - | symbol - | collection - | reader-macro - ) - whitespace; + : comment? 
whitespace ( literal + | symbol + | collection + | reader-macro + ) + whitespace; - whitespace = #'[,\\s]*' + whitespace = #'[,\\s]*'; + + comment: #';.*'; : &#'[\\(\\[{#]' ( list | vector @@ -43,7 +45,6 @@ | string | character | keyword - | comment | symbolic ; @@ -87,11 +88,9 @@ conditional-splicing: <'#?@'> list; - symbol: !number symbol-body - - : simple-keyword | macro-keyword ; + symbol: !number symbol-body; - comment: #';.*';") + : simple-keyword | macro-keyword ;") (def grammar-terminals {:character (combi/regexp terminal/character-pattern) @@ -232,3 +231,5 @@ [clojure.data :as data] [clojure.string :as str]))) :trace true)) + +#_(instaparse/disable-tracing!) diff --git a/test/parcera/test/core.cljc b/test/parcera/test/core.cljc index 2268f2f..809df0c 100644 --- a/test/parcera/test/core.cljc +++ b/test/parcera/test/core.cljc @@ -85,6 +85,11 @@ (as-> "ϕ" input (is (and (valid? input) (roundtrip input) (clear input)))) (as-> "❤️" input (is (and (valid? input) (roundtrip input) (clear input)))))) +(deftest edge-cases + (testing "comments" + (as-> "{:hello ;2} + 2}" input (is (and (valid? input) (roundtrip input) (clear input)))))) + (deftest macros (testing "metadata" From fc4141ae45491204c5e7934acda87974c5742a98 Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Fri, 11 Oct 2019 15:48:34 +0200 Subject: [PATCH 08/50] makes comments behave as whitespace --- src/parcera/core.cljc | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/src/parcera/core.cljc b/src/parcera/core.cljc index 5093564..a37923e 100644 --- a/src/parcera/core.cljc +++ b/src/parcera/core.cljc @@ -8,16 +8,14 @@ (def grammar-rules "code: form*; - : comment? 
whitespace ( literal - | symbol - | collection - | reader-macro - ) - whitespace; + : whitespace ( literal + | symbol + | collection + | reader-macro + ) + whitespace; - whitespace = #'[,\\s]*'; - - comment: #';.*'; + whitespace = #'(;.*)?[,\\s]*'; (* we treat comments the same way as commas *) : &#'[\\(\\[{#]' ( list | vector @@ -157,7 +155,7 @@ (. string-builder (append "}"))) (:number :whitespace :symbolic :auto-resolve :symbol :simple-keyword - :macro-keyword :comment :character :string :regex) + :macro-keyword :character :string :regex) (. string-builder (append (second ast))) :metadata From 5874c6f77a09ef4c6c35a88a33bbc8b33d8b0284 Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Fri, 11 Oct 2019 16:14:07 +0200 Subject: [PATCH 09/50] fix: missing whitespace before comment --- src/parcera/core.cljc | 2 +- test/parcera/test/core.cljc | 15 +++++++++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/src/parcera/core.cljc b/src/parcera/core.cljc index a37923e..8620804 100644 --- a/src/parcera/core.cljc +++ b/src/parcera/core.cljc @@ -15,7 +15,7 @@ ) whitespace; - whitespace = #'(;.*)?[,\\s]*'; (* we treat comments the same way as commas *) + whitespace = #'[,\\s]*(;.*)?[,\\s]*' (* we treat comments the same way as commas *); : &#'[\\(\\[{#]' ( list | vector diff --git a/test/parcera/test/core.cljc b/test/parcera/test/core.cljc index 809df0c..c3dc10e 100644 --- a/test/parcera/test/core.cljc +++ b/test/parcera/test/core.cljc @@ -8,19 +8,23 @@ [instaparse.core :as instaparse]) #?(:cljs (:require-macros [parcera.slurp :refer [slurp]]))) + (defn- roundtrip "checks parcera can parse and write back the exact same input code" [input] (= input (parcera/code (parcera/clojure input)))) + (defn- valid? [input] (not (instaparse/failure? (parcera/clojure input)))) + (defn- clear [input] (= 1 (count (instaparse/parses parcera/clojure input)))) + (def validity "The grammar definition of parcera is valid for any clojure value. 
Meaning that for any clojure value, parcera can create an AST for it" @@ -42,6 +46,7 @@ (prop/for-all [input (gen/fmap pr-str gen/any)] (= 1 (count (instaparse/parses parcera/clojure input))))) + (deftest simple (testing "character literals" (as-> "\\t" input (is (= input (parcera/code (parcera/clojure input))))) @@ -53,6 +58,7 @@ (as-> "\\ï" input (is (= input (parcera/code (parcera/clojure input))))) (as-> "\\ϕ" input (is (= input (parcera/code (parcera/clojure input))))))) + (deftest data-structures (testing "grammar definitions" (let [result (tc/quick-check 200 validity)] @@ -73,6 +79,7 @@ "high accuracy\n" (with-out-str (pprint/pprint result))))))) + (deftest unit-tests (testing "names" (as-> "foo" input (is (and (valid? input) (roundtrip input) (clear input)))) @@ -85,6 +92,7 @@ (as-> "ϕ" input (is (and (valid? input) (roundtrip input) (clear input)))) (as-> "❤️" input (is (and (valid? input) (roundtrip input) (clear input)))))) + (deftest edge-cases (testing "comments" (as-> "{:hello ;2} @@ -109,8 +117,11 @@ (as-> "#_\"[a b 2]\"" input (is (and (valid? input) (roundtrip input) (clear input))))) (testing "comments" - (as-> ";[a b 2]" input (is (and (valid? input) (roundtrip input) (clear input)))) - (as-> ";; \"[a b 2]\"" input (is (and (valid? input) (roundtrip input) (clear input))))) + ; todo: should I allow a file containing only with a comment ? + #_(as-> ";[a b 2]" input (is (and (valid? input) (roundtrip input) (clear input)))) + #_(as-> ";; \"[a b 2]\"" input (is (and (valid? input) (roundtrip input) (clear input)))) + (as-> "2 ;[a b 2]" input (is (and (valid? input) (roundtrip input) (clear input)))) + (as-> " :hello ;; \"[a b 2]\"" input (is (and (valid? input) (roundtrip input) (clear input))))) (testing "var quote" (as-> "#'hello/world" input (is (and (valid? 
input) (roundtrip input) (clear input)))) From 92315ed31242d31640ddc567265028e6fcb5c09b Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Fri, 11 Oct 2019 16:51:46 +0200 Subject: [PATCH 10/50] make whitespace a valid form. Treat maps key/value pair restriction as a runtime verification not a parser one --- src/parcera/core.cljc | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/parcera/core.cljc b/src/parcera/core.cljc index 8620804..f6c33d7 100644 --- a/src/parcera/core.cljc +++ b/src/parcera/core.cljc @@ -8,14 +8,14 @@ (def grammar-rules "code: form*; - : whitespace ( literal - | symbol - | collection - | reader-macro - ) - whitespace; + : ( literal + | symbol + | collection + | reader-macro + | whitespace + ); - whitespace = #'[,\\s]*(;.*)?[,\\s]*' (* we treat comments the same way as commas *); + whitespace = #'[,\\s]*(;.*)?[,\\s]+' (* we treat comments the same way as commas *); : &#'[\\(\\[{#]' ( list | vector @@ -32,7 +32,7 @@ map-namespace: <'#'> ( keyword | auto-resolve ); - map-content: (form form)* + map-content: form* auto-resolve: '::'; From 7f727750421ca2645bda8a5438c7f20659d10179 Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Fri, 11 Oct 2019 16:52:07 +0200 Subject: [PATCH 11/50] fix: a symbol cannot start with , --- src/parcera/terminals.clj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/parcera/terminals.clj b/src/parcera/terminals.clj index 7ea00bc..dd17248 100644 --- a/src/parcera/terminals.clj +++ b/src/parcera/terminals.clj @@ -4,7 +4,7 @@ (def NAME "[^\\s\\(\\)\\[\\]{}\"@~\\^;`\\\\]+") ; todo: (?!\/) do i need that ? 
;; symbols cannot start with a number, :, # nor ' -(def symbol-pattern (str "(?![:#\\'])(" NAME "\\/)?(\\/|(" NAME "))")) +(def symbol-pattern (str "(?![:#\\',])(" NAME "\\/)?(\\/|(" NAME "))")) (def double-suffix "(((\\.\\d*)?([eE][-+]?\\d+)?)M?)") From eca43dfa21eac6e517227e19e92f697af657f7e9 Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Fri, 11 Oct 2019 17:00:16 +0200 Subject: [PATCH 12/50] fix: whitespace could be end of file --- src/parcera/core.cljc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/parcera/core.cljc b/src/parcera/core.cljc index f6c33d7..468a366 100644 --- a/src/parcera/core.cljc +++ b/src/parcera/core.cljc @@ -15,7 +15,7 @@ | whitespace ); - whitespace = #'[,\\s]*(;.*)?[,\\s]+' (* we treat comments the same way as commas *); + whitespace = #'([,\\s]*;.*)?([,\\s]+|$)' (* we treat comments the same way as commas *); : &#'[\\(\\[{#]' ( list | vector From 2330feed75e2b24632b88540f9fd779d20e8f614 Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Fri, 11 Oct 2019 17:51:25 +0200 Subject: [PATCH 13/50] symbolic as literals --- src/parcera/core.cljc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/parcera/core.cljc b/src/parcera/core.cljc index 468a366..410ab58 100644 --- a/src/parcera/core.cljc +++ b/src/parcera/core.cljc @@ -21,8 +21,7 @@ | vector | map | set - ) - ; + ); list: <'('> form* <')'> ; @@ -46,7 +45,7 @@ | symbolic ; - symbolic: #'##(Inf|-Inf|NaN)' + symbolic: '##Inf' | '##-Inf' | '##NaN' : dispatch From 95f740f2d9dc1cd4699067c1282617609e145b2d Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Fri, 11 Oct 2019 17:54:20 +0200 Subject: [PATCH 14/50] cosmetic changes --- src/parcera/core.cljc | 33 ++++++--------------------------- 1 file changed, 6 insertions(+), 27 deletions(-) diff --git a/src/parcera/core.cljc b/src/parcera/core.cljc index 410ab58..5dac5ea 100644 --- a/src/parcera/core.cljc +++ b/src/parcera/core.cljc @@ -8,20 +8,11 @@ (def grammar-rules "code: form*; - : ( literal - | 
symbol - | collection - | reader-macro - | whitespace - ); + : literal | symbol | collection | reader-macro | whitespace; whitespace = #'([,\\s]*;.*)?([,\\s]+|$)' (* we treat comments the same way as commas *); - : &#'[\\(\\[{#]' ( list - | vector - | map - | set - ); + : &#'[\\(\\[{#]' ( list | vector | map | set ); list: <'('> form* <')'> ; @@ -37,25 +28,11 @@ set: <'#{'> form* <'}'> ; - : - number - | string - | character - | keyword - | symbolic - ; + : number | string | character | keyword | symbolic; symbolic: '##Inf' | '##-Inf' | '##NaN' - : - dispatch - | metadata - | deref - | quote - | backtick - | unquote - | unquote-splicing - ; + : dispatch | metadata | deref | quote | backtick | unquote | unquote-splicing; : &'#' ( function | regex | var-quote | discard | tag | conditional | conditional-splicing); @@ -89,6 +66,7 @@ : simple-keyword | macro-keyword ;") + (def grammar-terminals {:character (combi/regexp terminal/character-pattern) :string (combi/regexp terminal/string-pattern) @@ -98,6 +76,7 @@ :simple-keyword (combi/regexp terminal/simple-keyword) :regex (combi/regexp terminal/regex-pattern)}) + (def grammar (merge (cfg/ebnf grammar-rules) grammar-terminals)) From 40cabde1c77aa110bda3379d068e87447e6ffa9d Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Fri, 11 Oct 2019 17:58:13 +0200 Subject: [PATCH 15/50] rollback symbolic as regex --- src/parcera/core.cljc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/parcera/core.cljc b/src/parcera/core.cljc index 5dac5ea..4d58b3e 100644 --- a/src/parcera/core.cljc +++ b/src/parcera/core.cljc @@ -30,7 +30,7 @@ : number | string | character | keyword | symbolic; - symbolic: '##Inf' | '##-Inf' | '##NaN' + symbolic: #'##(Inf|-Inf|NaN)' : dispatch | metadata | deref | quote | backtick | unquote | unquote-splicing; From 0908ecb075312b6e50e80dd6348628c667f15754 Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Fri, 11 Oct 2019 18:00:16 +0200 Subject: [PATCH 16/50] function as element not as list --- 
src/parcera/core.cljc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/parcera/core.cljc b/src/parcera/core.cljc index 4d58b3e..56d5259 100644 --- a/src/parcera/core.cljc +++ b/src/parcera/core.cljc @@ -36,7 +36,7 @@ : &'#' ( function | regex | var-quote | discard | tag | conditional | conditional-splicing); - function: <'#'> list; + function: <'#('> form* <')'>; metadata: <'^'> ( map | shorthand-metadata ) form; @@ -181,8 +181,9 @@ (doseq [child (rest ast)] (code* child string-builder))) :function - (do (. string-builder (append "#")) - (code* (second ast) string-builder)))) + (do (. string-builder (append "#(")) + (doseq [child (rest ast)] (code* child string-builder)) + (. string-builder (append ")"))))) (defn code From 79c4682a3ccb41b79701e2d08e5784bfa4a61f49 Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Fri, 11 Oct 2019 18:03:39 +0200 Subject: [PATCH 17/50] no need for extra rule --- src/parcera/core.cljc | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/parcera/core.cljc b/src/parcera/core.cljc index 56d5259..f230ab4 100644 --- a/src/parcera/core.cljc +++ b/src/parcera/core.cljc @@ -38,9 +38,7 @@ function: <'#('> form* <')'>; - metadata: <'^'> ( map | shorthand-metadata ) form; - - : ( symbol | string | keyword ); + metadata: <'^'> ( map | symbol | string | keyword ) form; var-quote: <'#\\''> symbol; From 3dcb428c1713f5e94621542fd57926796b33b927 Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Fri, 11 Oct 2019 18:41:15 +0200 Subject: [PATCH 18/50] restrict metadata form to the accepted elements --- src/parcera/core.cljc | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/parcera/core.cljc b/src/parcera/core.cljc index f230ab4..b5d8248 100644 --- a/src/parcera/core.cljc +++ b/src/parcera/core.cljc @@ -38,7 +38,10 @@ function: <'#('> form* <')'>; - metadata: <'^'> ( map | symbol | string | keyword ) form; + metadata: (meta-info whitespace)+ + (symbol | collection | tag | 
unquote | unquote-splicing); + + meta-info: <'^'> ( map | symbol | string | keyword ); var-quote: <'#\\''> symbol; @@ -135,8 +138,13 @@ (. string-builder (append (second ast))) :metadata - (do (. string-builder (append "^")) - (doseq [child (rest ast)] (code* child string-builder))) + (do (doseq [child (rest (butlast ast))] (code* child string-builder)) + (code* (last ast) string-builder)) + + :meta-info + (doseq [child (rest ast)] + (. string-builder (append "^")) + (code* child string-builder)) :quote (do (. string-builder (append "'")) From 06e16e8977455269deab4ccb2c5ce2ba967292a7 Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Fri, 11 Oct 2019 19:04:43 +0200 Subject: [PATCH 19/50] run benchmark for this branch --- .travis.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 7b6d7c9..83263fe 100644 --- a/.travis.yml +++ b/.travis.yml @@ -20,7 +20,8 @@ jobs: # only run the benchmark is we are on master # otherwise the build takes too long - stage: Benchmark - if: head_branch = master + #if: head_branch = master + if: branch = master script: - lein trampoline test :benchmark From 91d7c0ac549cf1ea2517ff2b17f65217d0c84986 Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Fri, 11 Oct 2019 19:08:51 +0200 Subject: [PATCH 20/50] fix: wrong extension --- src/parcera/{terminals.clj => terminals.cljc} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/parcera/{terminals.clj => terminals.cljc} (100%) diff --git a/src/parcera/terminals.clj b/src/parcera/terminals.cljc similarity index 100% rename from src/parcera/terminals.clj rename to src/parcera/terminals.cljc From f824b73b42a1cfb8b3a7e75050c4289489ec8ea6 Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Fri, 11 Oct 2019 21:26:09 +0200 Subject: [PATCH 21/50] fix: use unicode char regex from master --- src/parcera/terminals.cljc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/parcera/terminals.cljc b/src/parcera/terminals.cljc index 
dd17248..005035d 100644 --- a/src/parcera/terminals.cljc +++ b/src/parcera/terminals.cljc @@ -14,7 +14,11 @@ (def number-pattern (str "[+-]?\\d+(" long-suffix "|" double-suffix "|" ratio-suffix ")(?![\\.\\/])")) -(def unicode-char "(\\P{M}\\p{M}*+)") ;; https://www.regular-expressions.info/unicode.html +; This is supposed to be the JavaScript friendly version of #'\P{M}\p{M}*+' +; mentioned here: https://www.regular-expressions.info/unicode.html +; It's cooked by this generator: http://kourge.net/projects/regexp-unicode-block +; ticking all 'Combining Diacritical Marks' boxes *)) +(def unicode-char "([^\\u0300-\\u036F\\u1DC0-\\u1DFF\\u20D0-\\u20FF][\\u0300-\\u036F\\u1DC0-\\u1DFF\\u20D0-\\u20FF]*)") (def named-char "(newline|return|space|tab|formfeed|backspace)") (def unicode "(u[\\dD-Fd-f]{4})") (def character-pattern (str "\\\\(" unicode-char "|" named-char "|" unicode ")(?!\\w+)")) From ae4efe586b38216931f7caa2c43850b5641c00e1 Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Fri, 11 Oct 2019 22:06:07 +0200 Subject: [PATCH 22/50] moved symbol to literal rule --- src/parcera/core.cljc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/parcera/core.cljc b/src/parcera/core.cljc index b5d8248..55f8a75 100644 --- a/src/parcera/core.cljc +++ b/src/parcera/core.cljc @@ -8,7 +8,7 @@ (def grammar-rules "code: form*; - : literal | symbol | collection | reader-macro | whitespace; + : literal | collection | reader-macro | whitespace; whitespace = #'([,\\s]*;.*)?([,\\s]+|$)' (* we treat comments the same way as commas *); @@ -28,7 +28,7 @@ set: <'#{'> form* <'}'> ; - : number | string | character | keyword | symbolic; + : symbol | number | string | character | keyword | symbolic; symbolic: #'##(Inf|-Inf|NaN)' @@ -207,7 +207,8 @@ (. string-builder (toString)))) ; Successful parse. 
-; Profile: {:create-node 1651, :push-full-listener 2, :push-stack 1651, :push-listener 1689, :push-result 273, :push-message 275} +; Profile: {:create-node 1651, :push-full-listener 2, :push-stack 1651, +; :push-listener 1689, :push-result 273, :push-message 275} ; "Elapsed time: 141.452323 msecs" #_(time (clojure (str '(ns parcera.core (:require [instaparse.core :as instaparse] From 7c0ca83b2e1175138d2ae41f4e525b05312ab071 Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Fri, 11 Oct 2019 22:13:02 +0200 Subject: [PATCH 23/50] lookahead to avoid checking all macro rules --- src/parcera/core.cljc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/parcera/core.cljc b/src/parcera/core.cljc index 55f8a75..145f684 100644 --- a/src/parcera/core.cljc +++ b/src/parcera/core.cljc @@ -32,9 +32,11 @@ symbolic: #'##(Inf|-Inf|NaN)' - : dispatch | metadata | deref | quote | backtick | unquote | unquote-splicing; + : &#'[#^\\'`~@]' (dispatch | metadata | deref | quote + | backtick | unquote | unquote-splicing); - : &'#' ( function | regex | var-quote | discard | tag | conditional | conditional-splicing); + : function | regex | var-quote | discard | tag | + conditional | conditional-splicing; function: <'#('> form* <')'>; From 2b7fbca2be31a01df375e2621c27a6b27e1ba201 Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Fri, 11 Oct 2019 22:22:57 +0200 Subject: [PATCH 24/50] moved set to reader macro splitted map and namespaced-map moved namespaced map to reader macro --- src/parcera/core.cljc | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/src/parcera/core.cljc b/src/parcera/core.cljc index 145f684..88d8170 100644 --- a/src/parcera/core.cljc +++ b/src/parcera/core.cljc @@ -12,17 +12,15 @@ whitespace = #'([,\\s]*;.*)?([,\\s]+|$)' (* we treat comments the same way as commas *); - : &#'[\\(\\[{#]' ( list | vector | map | set ); + : &#'[\\(\\[{]' ( list | vector | map ); list: <'('> form* <')'> ; vector: <'['> form* <']'> ; - 
map: map-namespace? <'{'> map-content <'}'> ; + namespaced-map: <'#'> ( keyword | auto-resolve ) map - map-namespace: <'#'> ( keyword | auto-resolve ); - - map-content: form* + map: <'{'> form* <'}'>; auto-resolve: '::'; @@ -33,7 +31,8 @@ symbolic: #'##(Inf|-Inf|NaN)' : &#'[#^\\'`~@]' (dispatch | metadata | deref | quote - | backtick | unquote | unquote-splicing); + | backtick | unquote | unquote-splicing + | set | namespaced-map); : function | regex | var-quote | discard | tag | conditional | conditional-splicing; @@ -118,14 +117,11 @@ (doseq [child (rest ast)] (code* child string-builder)) (. string-builder (append "]"))) - :map - (doseq [child (rest ast)] (code* child string-builder)) - - :map-namespace + :namespaced-map (do (. string-builder (append "#")) - (code* (second ast) string-builder)) + (doseq [child (rest ast)] (code* child string-builder))) - :map-content + :map (do (. string-builder (append "{")) (doseq [child (rest ast)] (code* child string-builder)) (. string-builder (append "}"))) From 5c3b6975cd331ad4227ea4fc3e811eb5f23d3d3d Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Fri, 11 Oct 2019 22:24:55 +0200 Subject: [PATCH 25/50] cosmetic changes --- src/parcera/core.cljc | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/parcera/core.cljc b/src/parcera/core.cljc index 88d8170..7cccca1 100644 --- a/src/parcera/core.cljc +++ b/src/parcera/core.cljc @@ -30,9 +30,16 @@ symbolic: #'##(Inf|-Inf|NaN)' - : &#'[#^\\'`~@]' (dispatch | metadata | deref | quote - | backtick | unquote | unquote-splicing - | set | namespaced-map); + : &#'[#^\\'`~@]' ( dispatch + | metadata + | deref + | quote + | backtick + | unquote + | unquote-splicing + | set + | namespaced-map + ); : function | regex | var-quote | discard | tag | conditional | conditional-splicing; From e79be7838b1a4e6e727a431ac53bd6c39524eb78 Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Fri, 11 Oct 2019 22:27:02 +0200 Subject: [PATCH 26/50] moved symbolic to reader 
macro --- src/parcera/core.cljc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/parcera/core.cljc b/src/parcera/core.cljc index 7cccca1..278612a 100644 --- a/src/parcera/core.cljc +++ b/src/parcera/core.cljc @@ -26,9 +26,7 @@ set: <'#{'> form* <'}'> ; - : symbol | number | string | character | keyword | symbolic; - - symbolic: #'##(Inf|-Inf|NaN)' + : symbol | number | string | character | keyword; : &#'[#^\\'`~@]' ( dispatch | metadata @@ -39,6 +37,7 @@ | unquote-splicing | set | namespaced-map + | symbolic ); : function | regex | var-quote | discard | tag | @@ -73,7 +72,9 @@ symbol: !number symbol-body; - : simple-keyword | macro-keyword ;") + : simple-keyword | macro-keyword ; + + symbolic: #'##(Inf|-Inf|NaN)'") (def grammar-terminals From e36ae5a3f1657680e99ce7271c79efaa0959bd6d Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Fri, 11 Oct 2019 22:30:46 +0200 Subject: [PATCH 27/50] cosmetic changes --- src/parcera/core.cljc | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/src/parcera/core.cljc b/src/parcera/core.cljc index 278612a..4dd8b76 100644 --- a/src/parcera/core.cljc +++ b/src/parcera/core.cljc @@ -18,8 +18,6 @@ vector: <'['> form* <']'> ; - namespaced-map: <'#'> ( keyword | auto-resolve ) map - map: <'{'> form* <'}'>; auto-resolve: '::'; @@ -28,6 +26,10 @@ : symbol | number | string | character | keyword; + symbol: !number symbol-body; + + : simple-keyword | macro-keyword ; + : &#'[#^\\'`~@]' ( dispatch | metadata | deref @@ -40,18 +42,13 @@ | symbolic ); - : function | regex | var-quote | discard | tag | - conditional | conditional-splicing; - - function: <'#('> form* <')'>; + namespaced-map: <'#'> ( keyword | auto-resolve ) map metadata: (meta-info whitespace)+ (symbol | collection | tag | unquote | unquote-splicing); meta-info: <'^'> ( map | symbol | string | keyword ); - var-quote: <'#\\''> symbol; - quote: <'\\''> form; backtick: <'`'> form; @@ -62,6 +59,18 @@ deref: 
<'@'> form; + : function + | regex + | var-quote + | discard + | tag + | conditional + | conditional-splicing; + + function: <'#('> form* <')'>; + + var-quote: <'#\\''> symbol; + discard: <'#_'> form; tag: <#'#(?![_?])'> symbol form; @@ -70,10 +79,6 @@ conditional-splicing: <'#?@'> list; - symbol: !number symbol-body; - - : simple-keyword | macro-keyword ; - symbolic: #'##(Inf|-Inf|NaN)'") From 937a1af336e4f2a053534d4974fa68353913105f Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Fri, 11 Oct 2019 22:49:50 +0200 Subject: [PATCH 28/50] literal lookahead added for performance --- src/parcera/core.cljc | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/parcera/core.cljc b/src/parcera/core.cljc index 4dd8b76..7ce6fb1 100644 --- a/src/parcera/core.cljc +++ b/src/parcera/core.cljc @@ -20,11 +20,13 @@ map: <'{'> form* <'}'>; - auto-resolve: '::'; - - set: <'#{'> form* <'}'> ; - - : symbol | number | string | character | keyword; + (* a literal is basically anything that is not a collection, macro or whitespace *) + : &#'[^\\(\\[{#^\\'`~@\\s]' ( symbol + | number + | string + | character + | keyword + ); symbol: !number symbol-body; From faf60b9820adff54d4af9743eac3e16f51f51d6a Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Fri, 11 Oct 2019 22:52:10 +0200 Subject: [PATCH 29/50] fix: tag form cannot be a whitespace cosmetic changes --- src/parcera/core.cljc | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/parcera/core.cljc b/src/parcera/core.cljc index 7ce6fb1..120d622 100644 --- a/src/parcera/core.cljc +++ b/src/parcera/core.cljc @@ -12,6 +12,8 @@ whitespace = #'([,\\s]*;.*)?([,\\s]+|$)' (* we treat comments the same way as commas *); + (* for parsing purposes we dont consider a Set a collection since it starts + with # -> dispatch macro *) : &#'[\\(\\[{]' ( list | vector | map ); list: <'('> form* <')'> ; @@ -44,7 +46,11 @@ | symbolic ); - namespaced-map: <'#'> ( keyword | auto-resolve ) map + set: <'#{'> 
form* <'}'>; + + namespaced-map: <'#'> ( keyword | auto-resolve ) map; + + auto-resolve: '::'; metadata: (meta-info whitespace)+ (symbol | collection | tag | unquote | unquote-splicing); @@ -75,7 +81,7 @@ discard: <'#_'> form; - tag: <#'#(?![_?])'> symbol form; + tag: <#'#(?![_?])'> symbol whitespace? (literal | collection); conditional: <'#?'> list; From e06f94a23496d3c855383f672c7397c8ff90c3b3 Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Fri, 11 Oct 2019 23:35:35 +0200 Subject: [PATCH 30/50] cosmetic changes --- src/parcera/core.cljc | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/parcera/core.cljc b/src/parcera/core.cljc index 120d622..1e35a9b 100644 --- a/src/parcera/core.cljc +++ b/src/parcera/core.cljc @@ -52,10 +52,14 @@ auto-resolve: '::'; - metadata: (meta-info whitespace)+ - (symbol | collection | tag | unquote | unquote-splicing); + metadata: (metadata-entry whitespace)+ ( symbol + | collection + | tag + | unquote + | unquote-splicing + ); - meta-info: <'^'> ( map | symbol | string | keyword ); + metadata-entry: <'^'> ( map | symbol | string | keyword ); quote: <'\\''> form; @@ -160,7 +164,7 @@ (do (doseq [child (rest (butlast ast))] (code* child string-builder)) (code* (last ast) string-builder)) - :meta-info + :metadata-entry (doseq [child (rest ast)] (. 
string-builder (append "^")) (code* child string-builder)) From c584abf58a948bc9f0e697cead828c5460295bdc Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Sat, 12 Oct 2019 00:30:53 +0200 Subject: [PATCH 31/50] removes negative lookahead in symbol --- src/parcera/core.cljc | 10 ++++------ src/parcera/terminals.cljc | 2 +- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/parcera/core.cljc b/src/parcera/core.cljc index 1e35a9b..f6b0445 100644 --- a/src/parcera/core.cljc +++ b/src/parcera/core.cljc @@ -30,8 +30,6 @@ | keyword ); - symbol: !number symbol-body; - : simple-keyword | macro-keyword ; : &#'[#^\\'`~@]' ( dispatch @@ -97,7 +95,7 @@ (def grammar-terminals {:character (combi/regexp terminal/character-pattern) :string (combi/regexp terminal/string-pattern) - :symbol-body (combi/hide-tag (combi/regexp terminal/symbol-pattern)) + :symbol (combi/regexp terminal/symbol-pattern) :number (combi/regexp terminal/number-pattern) :macro-keyword (combi/regexp terminal/macro-keyword) :simple-keyword (combi/regexp terminal/simple-keyword) @@ -230,9 +228,9 @@ (. string-builder (toString)))) ; Successful parse. -; Profile: {:create-node 1651, :push-full-listener 2, :push-stack 1651, -; :push-listener 1689, :push-result 273, :push-message 275} -; "Elapsed time: 141.452323 msecs" +; Profile: {:create-node 594, :push-full-listener 2, :push-stack 594, +; :push-listener 592, :push-result 257, :push-message 257} +; "Elapsed time: 63.203691 msecs" #_(time (clojure (str '(ns parcera.core (:require [instaparse.core :as instaparse] [clojure.data :as data] diff --git a/src/parcera/terminals.cljc b/src/parcera/terminals.cljc index 005035d..de1b284 100644 --- a/src/parcera/terminals.cljc +++ b/src/parcera/terminals.cljc @@ -4,7 +4,7 @@ (def NAME "[^\\s\\(\\)\\[\\]{}\"@~\\^;`\\\\]+") ; todo: (?!\/) do i need that ? 
;; symbols cannot start with a number, :, # nor ' -(def symbol-pattern (str "(?![:#\\',])(" NAME "\\/)?(\\/|(" NAME "))")) +(def symbol-pattern (str "(?![:#\\',]|[+-]?\\d+)(" NAME "\\/)?(\\/|(" NAME "))")) (def double-suffix "(((\\.\\d*)?([eE][-+]?\\d+)?)M?)") From d2aa821acdae80913b7277421703f1e31a632b90 Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Sat, 12 Oct 2019 00:31:13 +0200 Subject: [PATCH 32/50] fix: missing reader conditional --- src/parcera/core.cljc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/parcera/core.cljc b/src/parcera/core.cljc index f6b0445..1bf58f3 100644 --- a/src/parcera/core.cljc +++ b/src/parcera/core.cljc @@ -223,7 +223,8 @@ In general (= input (parcera/code (parcera/clojure input)))" [ast] - (let [string-builder (new StringBuilder)] + (let [string-builder #?(:clj (new StringBuilder) + :cljs (new StringBuffer))] (code* ast string-builder) (. string-builder (toString)))) From d9eda1578da4cc3f4d85c9253f2b57a90fd01c08 Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Sat, 12 Oct 2019 00:43:08 +0200 Subject: [PATCH 33/50] cosmetic change --- src/parcera/core.cljc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/parcera/core.cljc b/src/parcera/core.cljc index 1bf58f3..f104d2d 100644 --- a/src/parcera/core.cljc +++ b/src/parcera/core.cljc @@ -10,7 +10,8 @@ : literal | collection | reader-macro | whitespace; - whitespace = #'([,\\s]*;.*)?([,\\s]+|$)' (* we treat comments the same way as commas *); + (* we treat comments the same way as commas *) + whitespace = #'([,\\s]*;.*)?([,\\s]+|$)'; (* for parsing purposes we dont consider a Set a collection since it starts with # -> dispatch macro *) From 2c4803791e50eb36553a6b80555c7003085ebffe Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Sat, 12 Oct 2019 00:54:19 +0200 Subject: [PATCH 34/50] replaced lookahead by ordered choice sorted elements according to expected occurrence fix: namespaced map moved to dispatch macro --- src/parcera/core.cljc | 56 
+++++++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/src/parcera/core.cljc b/src/parcera/core.cljc index f104d2d..ffe707c 100644 --- a/src/parcera/core.cljc +++ b/src/parcera/core.cljc @@ -8,14 +8,14 @@ (def grammar-rules "code: form*; - : literal | collection | reader-macro | whitespace; + : whitespace / collection / literal / reader-macro; (* we treat comments the same way as commas *) whitespace = #'([,\\s]*;.*)?([,\\s]+|$)'; (* for parsing purposes we dont consider a Set a collection since it starts with # -> dispatch macro *) - : &#'[\\(\\[{]' ( list | vector | map ); + : list / vector / map; list: <'('> form* <')'> ; @@ -24,26 +24,25 @@ map: <'{'> form* <'}'>; (* a literal is basically anything that is not a collection, macro or whitespace *) - : &#'[^\\(\\[{#^\\'`~@\\s]' ( symbol - | number - | string - | character - | keyword - ); - - : simple-keyword | macro-keyword ; - - : &#'[#^\\'`~@]' ( dispatch - | metadata - | deref - | quote - | backtick - | unquote - | unquote-splicing - | set - | namespaced-map - | symbolic - ); + : ( symbol + / keyword + / string + / number + / character + ); + + : simple-keyword / macro-keyword ; + + : ( set + / dispatch + / metadata + / deref + / quote + / backtick + / unquote + / unquote-splicing + / symbolic + ); set: <'#{'> form* <'}'>; @@ -71,12 +70,13 @@ deref: <'@'> form; : function - | regex - | var-quote - | discard - | tag - | conditional - | conditional-splicing; + / regex + / namespaced-map + / var-quote + / discard + / tag + / conditional + / conditional-splicing; function: <'#('> form* <')'>; From 90fdfa6472e90aa9d64b4cb971e65dfed1077876 Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Sat, 12 Oct 2019 00:59:32 +0200 Subject: [PATCH 35/50] updated trace results --- src/parcera/core.cljc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/parcera/core.cljc b/src/parcera/core.cljc index ffe707c..8d2782f 100644 --- a/src/parcera/core.cljc 
+++ b/src/parcera/core.cljc @@ -230,9 +230,9 @@ (. string-builder (toString)))) ; Successful parse. -; Profile: {:create-node 594, :push-full-listener 2, :push-stack 594, -; :push-listener 592, :push-result 257, :push-message 257} -; "Elapsed time: 63.203691 msecs" +; Profile: {:create-node 440, :push-full-listener 2, :push-stack 440, +; :push-listener 438, :push-result 234, :push-message 234} +; "Elapsed time: 52.561627 msecs" #_(time (clojure (str '(ns parcera.core (:require [instaparse.core :as instaparse] [clojure.data :as data] From 19f81ae50a754acd8a1ea291966a279c7e3a7ab1 Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Sat, 12 Oct 2019 13:22:13 +0200 Subject: [PATCH 36/50] reordered parser rules according to frequencies made all or operations ordered --- src/parcera/core.cljc | 47 ++++++++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 19 deletions(-) diff --git a/src/parcera/core.cljc b/src/parcera/core.cljc index 8d2782f..c34d183 100644 --- a/src/parcera/core.cljc +++ b/src/parcera/core.cljc @@ -5,10 +5,19 @@ [parcera.terminals :as terminal]) #?(:cljs (:import goog.string.StringBuffer))) +; NOTE: Through my experiments I found out that Instaparse will gladly take the +; first match as long as the grammar is not ambiguous. Therefore I switched the +; unordered OR (|) with an ordered one (/). This of course implies a heuristic +; of knowing which grammar rules are expected to match more often. I use +; Clojure's core as a reference with the following code snippet +#_(let [core-content (slurp "https://raw.githubusercontent.com/clojure/clojure/master/src/clj/clojure/core.clj")] + (time (sort-by second > (frequencies (filter keyword? (flatten (clojure core-content :optimize :memory))))))) +#_(let [core-content (slurp "https://raw.githubusercontent.com/clojure/clojurescript/master/src/main/clojure/cljs/core.cljc")] + (time (sort-by second > (frequencies (filter keyword? 
(flatten (clojure core-content :optimize :memory))))))) (def grammar-rules "code: form*; - : whitespace / collection / literal / reader-macro; + : whitespace / literal / collection / reader-macro; (* we treat comments the same way as commas *) whitespace = #'([,\\s]*;.*)?([,\\s]+|$)'; @@ -33,31 +42,30 @@ : simple-keyword / macro-keyword ; - : ( set - / dispatch + : ( unquote / metadata - / deref - / quote / backtick - / unquote + / quote + / dispatch / unquote-splicing + / deref / symbolic ); set: <'#{'> form* <'}'>; - namespaced-map: <'#'> ( keyword | auto-resolve ) map; + namespaced-map: <'#'> ( keyword / auto-resolve ) map; auto-resolve: '::'; metadata: (metadata-entry whitespace)+ ( symbol - | collection - | tag - | unquote - | unquote-splicing + / collection + / tag + / unquote + / unquote-splicing ); - metadata-entry: <'^'> ( map | symbol | string | keyword ); + metadata-entry: <'^'> ( map / symbol / string / keyword ); quote: <'\\''> form; @@ -71,12 +79,13 @@ : function / regex + / set + / conditional + / conditional-splicing / namespaced-map / var-quote / discard - / tag - / conditional - / conditional-splicing; + / tag; function: <'#('> form* <')'>; @@ -84,7 +93,7 @@ discard: <'#_'> form; - tag: <#'#(?![_?])'> symbol whitespace? (literal | collection); + tag: <#'#(?![_?])'> symbol whitespace? (literal / collection); conditional: <'#?'> list; @@ -230,9 +239,9 @@ (. string-builder (toString)))) ; Successful parse. 
-; Profile: {:create-node 440, :push-full-listener 2, :push-stack 440, -; :push-listener 438, :push-result 234, :push-message 234} -; "Elapsed time: 52.561627 msecs" +; Profile: {:create-node 384, :push-full-listener 2, :push-stack 384, +; :push-listener 382, :push-result 227, :push-message 227 } +; "Elapsed time: 47.25084 msecs" #_(time (clojure (str '(ns parcera.core (:require [instaparse.core :as instaparse] [clojure.data :as data] From 504a8c4b60115e2fea04e19c82d20bab727a8e4c Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Sat, 12 Oct 2019 13:22:28 +0200 Subject: [PATCH 37/50] benchmark added --- test/parcera/test/benchmark.clj | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/test/parcera/test/benchmark.clj b/test/parcera/test/benchmark.clj index fe6e42c..e144eb4 100644 --- a/test/parcera/test/benchmark.clj +++ b/test/parcera/test/benchmark.clj @@ -2,7 +2,8 @@ (:require [clojure.test :refer [deftest is testing]] [clojure.test.check :as tc] [criterium.core :as criterium] - [parcera.test.core :as pt])) + [parcera.test.core :as pt] + [parcera.core :as parcera])) (deftest ^:benchmark parsing (println "Benchmark: Time parsing Clojure values ⌛") @@ -15,3 +16,16 @@ (println "Benchmark: Round trip of Clojure values 🚀") (criterium/quick-bench (tc/quick-check 30 pt/symmetric) :os :runtime :verbose)) + + +;; execute last ... 
hopefully +(deftest ^:benchmark z-known-namespace + (newline) + (newline) + (println "Benchmark: Parsing parcera namespace with traces 👮") + (criterium/quick-bench (parcera/clojure (str '(ns parcera.core + (:require [instaparse.core :as instaparse] + [clojure.data :as data] + [clojure.string :as str]))) + :trace true) + :os :runtime :verbose)) From 8e60181d6d466c96c5b926db2a6b64d9f1e42a63 Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Sat, 12 Oct 2019 13:36:07 +0200 Subject: [PATCH 38/50] refactored conditional and splicing to have forms as children not a single list --- src/parcera/core.cljc | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/parcera/core.cljc b/src/parcera/core.cljc index c34d183..efadb22 100644 --- a/src/parcera/core.cljc +++ b/src/parcera/core.cljc @@ -95,9 +95,9 @@ tag: <#'#(?![_?])'> symbol whitespace? (literal / collection); - conditional: <'#?'> list; + conditional: <'#?('> form* <')'>; - conditional-splicing: <'#?@'> list; + conditional-splicing: <'#?@('> form* <')'>; symbolic: #'##(Inf|-Inf|NaN)'") @@ -206,12 +206,14 @@ (doseq [child (rest ast)] (code* child string-builder))) :conditional - (do (. string-builder (append "#?")) - (code* (second ast) string-builder)) + (do (. string-builder (append "#?(")) + (doseq [child (rest ast)] (code* child string-builder)) + (. string-builder (append ")"))) :conditional-splicing - (do (. string-builder (append "#?@")) - (code* (second ast) string-builder)) + (do (. string-builder (append "#?@(")) + (doseq [child (rest ast)] (code* child string-builder)) + (. string-builder (append ")"))) :deref (do (. 
string-builder (append "@")) From cba134dfdaa7fa51a0e7455bce8b791ed9582a2d Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Sat, 12 Oct 2019 13:39:41 +0200 Subject: [PATCH 39/50] removed already done todos --- src/parcera/terminals.cljc | 1 - test/parcera/test/core.cljc | 5 ++--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/parcera/terminals.cljc b/src/parcera/terminals.cljc index de1b284..2338757 100644 --- a/src/parcera/terminals.cljc +++ b/src/parcera/terminals.cljc @@ -8,7 +8,6 @@ (def double-suffix "(((\\.\\d*)?([eE][-+]?\\d+)?)M?)") -; todo: (0)|([1-9]\d*) is this needed ? (def long-suffix "((0[xX]([\\dA-Fa-f]+)|0([0-7]+)|([1-9]\\d?)[rR]([\\d\\w]+)|0\\d+)?N?)") (def ratio-suffix "(\\/(\\d+))") (def number-pattern (str "[+-]?\\d+(" long-suffix "|" double-suffix "|" ratio-suffix ")(?![\\.\\/])")) diff --git a/test/parcera/test/core.cljc b/test/parcera/test/core.cljc index c3dc10e..afc5c8a 100644 --- a/test/parcera/test/core.cljc +++ b/test/parcera/test/core.cljc @@ -117,9 +117,8 @@ (as-> "#_\"[a b 2]\"" input (is (and (valid? input) (roundtrip input) (clear input))))) (testing "comments" - ; todo: should I allow a file containing only with a comment ? - #_(as-> ";[a b 2]" input (is (and (valid? input) (roundtrip input) (clear input)))) - #_(as-> ";; \"[a b 2]\"" input (is (and (valid? input) (roundtrip input) (clear input)))) + (as-> ";[a b 2]" input (is (and (valid? input) (roundtrip input) (clear input)))) + (as-> ";; \"[a b 2]\"" input (is (and (valid? input) (roundtrip input) (clear input)))) (as-> "2 ;[a b 2]" input (is (and (valid? input) (roundtrip input) (clear input)))) (as-> " :hello ;; \"[a b 2]\"" input (is (and (valid? 
input) (roundtrip input) (clear input))))) From 6bf6c229b2f81d01e3509fc189071f6eaa10da16 Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Sat, 12 Oct 2019 13:41:10 +0200 Subject: [PATCH 40/50] dont trace parsing to avoid overly long prints --- test/parcera/test/benchmark.clj | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/parcera/test/benchmark.clj b/test/parcera/test/benchmark.clj index e144eb4..4bb385f 100644 --- a/test/parcera/test/benchmark.clj +++ b/test/parcera/test/benchmark.clj @@ -26,6 +26,5 @@ (criterium/quick-bench (parcera/clojure (str '(ns parcera.core (:require [instaparse.core :as instaparse] [clojure.data :as data] - [clojure.string :as str]))) - :trace true) + [clojure.string :as str])))) :os :runtime :verbose)) From b38cefece824c3d93a9ef63b9c45354746b568ca Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Sat, 12 Oct 2019 18:57:59 +0200 Subject: [PATCH 41/50] improve test reporting cosmetic changes --- test/parcera/test/core.cljc | 204 +++++++++++++++++++++++++++--------- 1 file changed, 153 insertions(+), 51 deletions(-) diff --git a/test/parcera/test/core.cljc b/test/parcera/test/core.cljc index afc5c8a..d7e165b 100644 --- a/test/parcera/test/core.cljc +++ b/test/parcera/test/core.cljc @@ -82,105 +82,207 @@ (deftest unit-tests (testing "names" - (as-> "foo" input (is (and (valid? input) (roundtrip input) (clear input)))) - (as-> "foo-bar" input (is (and (valid? input) (roundtrip input) (clear input)))) - (as-> "foo->bar" input (is (and (valid? input) (roundtrip input) (clear input)))) - (as-> "->" input (is (and (valid? input) (roundtrip input) (clear input)))) - (as-> "->as" input (is (and (valid? input) (roundtrip input) (clear input)))) - (as-> "föl" input (is (and (valid? input) (roundtrip input) (clear input)))) - (as-> "Öl" input (is (and (valid? input) (roundtrip input) (clear input)))) - (as-> "ϕ" input (is (and (valid? input) (roundtrip input) (clear input)))) - (as-> "❤️" input (is (and (valid? 
input) (roundtrip input) (clear input)))))) + (as-> "foo" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input)))) + (as-> "foo-bar" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input)))) + (as-> "foo->bar" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input)))) + (as-> "->" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input)))) + (as-> "->as" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input)))) + (as-> "föl" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input)))) + (as-> "Öl" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input)))) + (as-> "ϕ" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input)))) + (as-> "❤️" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input)))))) (deftest edge-cases (testing "comments" (as-> "{:hello ;2} - 2}" input (is (and (valid? input) (roundtrip input) (clear input)))))) + 2}" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input)))))) (deftest macros (testing "metadata" - (as-> "^String [a b 2]" input (is (and (valid? input) (roundtrip input) (clear input)))) - (as-> "^\"String\" [a b 2]" input (is (and (valid? input) (roundtrip input) (clear input)))) - (as-> "^:string [a b 2]" input (is (and (valid? input) (roundtrip input) (clear input)))) - (as-> "^{:a 1} [a b 2]" input (is (and (valid? input) (roundtrip input) (clear input)))) - (as-> "^:hello ^\"World\" ^{:a 1} [a b 2]" input (is (and (valid? input) (roundtrip input) (clear input))))) + (as-> "^String [a b 2]" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input)))) + (as-> "^\"String\" [a b 2]" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input)))) + (as-> "^:string [a b 2]" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input)))) + (as-> "^{:a 1} [a b 2]" input (and (is (valid? 
input)) + (is (roundtrip input)) + (is (clear input)))) + (as-> "^:hello ^\"World\" ^{:a 1} [a b 2]" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input))))) (testing "discard" - (as-> "#_[a b 2]" input (is (and (valid? input) (roundtrip input) (clear input)))) - (as-> "#_(a b 2)" input (is (and (valid? input) (roundtrip input) (clear input)))) - (as-> "#_{:a 1}" input (is (and (valid? input) (roundtrip input) (clear input)))) - (as-> "#_macros" input (is (and (valid? input) (roundtrip input) (clear input))))) + (as-> "#_[a b 2]" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input)))) + (as-> "#_(a b 2)" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input)))) + (as-> "#_{:a 1}" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input)))) + (as-> "#_macros" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input))))) (testing "regex" - (as-> "#_\"[a b 2]\"" input (is (and (valid? input) (roundtrip input) (clear input))))) + (as-> "#_\"[a b 2]\"" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input))))) (testing "comments" - (as-> ";[a b 2]" input (is (and (valid? input) (roundtrip input) (clear input)))) - (as-> ";; \"[a b 2]\"" input (is (and (valid? input) (roundtrip input) (clear input)))) - (as-> "2 ;[a b 2]" input (is (and (valid? input) (roundtrip input) (clear input)))) - (as-> " :hello ;; \"[a b 2]\"" input (is (and (valid? input) (roundtrip input) (clear input))))) + (as-> ";[a b 2]" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input)))) + (as-> ";; \"[a b 2]\"" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input)))) + (as-> "2 ;[a b 2]" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input)))) + (as-> " :hello ;; \"[a b 2]\"" input (and (is (valid? 
input)) + (is (roundtrip input)) + (is (clear input))))) (testing "var quote" - (as-> "#'hello/world" input (is (and (valid? input) (roundtrip input) (clear input)))) - (as-> "#'/" input (is (and (valid? input) (roundtrip input) (clear input))))) + (as-> "#'hello/world" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input)))) + (as-> "#'/" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input))))) (testing "tag" - (as-> "#hello/world [1 a \"3\"]" input (is (and (valid? input) (roundtrip input) (clear input)))) - (as-> "#hello/world {1 \"3\"}" input (is (and (valid? input) (roundtrip input) (clear input))))) + (as-> "#hello/world [1 a \"3\"]" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input)))) + (as-> "#hello/world {1 \"3\"}" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input))))) (testing "keyword" - (as-> "::hello/world [1 a \"3\"]" input (is (and (valid? input) (roundtrip input) (clear input)))) - (as-> "::hello" input (is (and (valid? input) (roundtrip input) (clear input))))) + (as-> "::hello/world [1 a \"3\"]" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input)))) + (as-> "::hello" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input))))) (testing "quote" - (as-> "'hello/world" input (is (and (valid? input) (roundtrip input) (clear input)))) - (as-> "'hello" input (is (and (valid? input) (roundtrip input) (clear input)))) - (as-> "'/" input (is (and (valid? input) (roundtrip input) (clear input))))) + (as-> "'hello/world" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input)))) + (as-> "'hello" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input)))) + (as-> "'/" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input))))) (testing "backtick" - (as-> "`hello/world" input (is (and (valid? 
input) (roundtrip input) (clear input)))) - (as-> "`hello" input (is (and (valid? input) (roundtrip input) (clear input)))) - (as-> "`/" input (is (and (valid? input) (roundtrip input) (clear input))))) + (as-> "`hello/world" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input)))) + (as-> "`hello" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input)))) + (as-> "`/" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input))))) (testing "unquote" - (as-> "~hello/world" input (is (and (valid? input) (roundtrip input) (clear input)))) - (as-> "~(hello 2 3)" input (is (and (valid? input) (roundtrip input) (clear input)))) - (as-> "~/" input (is (and (valid? input) (roundtrip input) (clear input))))) + (as-> "~hello/world" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input)))) + (as-> "~(hello 2 3)" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input)))) + (as-> "~/" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input))))) (testing "quote splicing" - (as-> "~@hello/world" input (is (and (valid? input) (roundtrip input) (clear input)))) - (as-> "~@(hello 2 b)" input (is (and (valid? input) (roundtrip input) (clear input))))) + (as-> "~@hello/world" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input)))) + (as-> "~@(hello 2 b)" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input))))) (testing "deref" - (as-> "@hello/world" input (is (and (valid? input) (roundtrip input) (clear input)))) - (as-> "@hello" input (is (and (valid? input) (roundtrip input) (clear input)))) - (as-> "@/" input (is (and (valid? input) (roundtrip input) (clear input))))) + (as-> "@hello/world" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input)))) + (as-> "@hello" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input)))) + (as-> "@/" input (and (is (valid? 
input)) + (is (roundtrip input)) + (is (clear input))))) (testing "anonymous function" - (as-> "#(= (str %1 %2 %&))" input (is (and (valid? input) (roundtrip input) (clear input))))) + (as-> "#(= (str %1 %2 %&))" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input))))) (testing "namespaced map" - (as-> "#::{:a 1 b 3}" input (is (and (valid? input) (roundtrip input) (clear input)))) - (as-> "#::hello{:a 1 b 3}" input (is (and (valid? input) (roundtrip input) (clear input))))) + (as-> "#::{:a 1 b 3}" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input)))) + (as-> "#::hello{:a 1 b 3}" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input))))) (testing "reader conditional" (as-> "#?(:clj Double/NaN :cljs js/NaN :default nil)" input - (is (and (valid? input) (roundtrip input) (clear input)))) + (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input)))) (as-> "[1 2 #?@(:clj [3 4] :cljs [5 6])]" input - (is (and (valid? input) (roundtrip input) (clear input)))))) + (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input)))))) (deftest bootstrap (testing "parcera should be able to parse itself" (let [input (slurp "./src/parcera/core.cljc")] - (is (and (valid? input) (roundtrip input) (clear input))))) + (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input))))) (testing "parcera should be able to parse its own test suite" (let [input (slurp "./test/parcera/test/core.cljc")] - (is (and (valid? input) (roundtrip input) (clear input)))))) + (and (is (valid? 
input)) + (is (roundtrip input)) + (is (clear input)))))) (deftest clojure$cript From d3b503dedd92f904ff59af3c3626c76dcaba0c92 Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Sun, 13 Oct 2019 14:25:27 +0200 Subject: [PATCH 42/50] added more tests to bootstrap --- test/parcera/test/core.cljc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test/parcera/test/core.cljc b/test/parcera/test/core.cljc index d7e165b..717a568 100644 --- a/test/parcera/test/core.cljc +++ b/test/parcera/test/core.cljc @@ -274,12 +274,20 @@ (testing "parcera should be able to parse itself" (let [input (slurp "./src/parcera/core.cljc")] + (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input)))) + (let [input (slurp "./src/parcera/slurp.cljc")] (and (is (valid? input)) (is (roundtrip input)) (is (clear input))))) (testing "parcera should be able to parse its own test suite" (let [input (slurp "./test/parcera/test/core.cljc")] + (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input)))) + (let [input (slurp "./test/parcera/test/benchmark.clj")] (and (is (valid? 
input)) (is (roundtrip input)) (is (clear input)))))) From 2ac2230416a7f4d171915ccfe21dc5174e745f35 Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Sun, 13 Oct 2019 14:26:44 +0200 Subject: [PATCH 43/50] check ambiguity of hiden rules as well --- test/parcera/test/core.cljc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/parcera/test/core.cljc b/test/parcera/test/core.cljc index 717a568..60e1c77 100644 --- a/test/parcera/test/core.cljc +++ b/test/parcera/test/core.cljc @@ -22,7 +22,7 @@ (defn- clear [input] - (= 1 (count (instaparse/parses parcera/clojure input)))) + (= 1 (count (instaparse/parses parcera/clojure input :unhide :all)))) (def validity From 7ed8c3df3d9f825e307a36444c84778997eacbdf Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Sun, 13 Oct 2019 14:28:27 +0200 Subject: [PATCH 44/50] re-use test functions --- test/parcera/test/core.cljc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/parcera/test/core.cljc b/test/parcera/test/core.cljc index 60e1c77..a54516b 100644 --- a/test/parcera/test/core.cljc +++ b/test/parcera/test/core.cljc @@ -29,14 +29,14 @@ "The grammar definition of parcera is valid for any clojure value. Meaning that for any clojure value, parcera can create an AST for it" (prop/for-all [input (gen/fmap pr-str gen/any)] - (= false (instaparse/failure? (parcera/clojure input))))) + (valid? input))) (def symmetric "The read <-> write process of parcera MUST be symmetrical. Meaning that the AST and the text representation are equivalent" (prop/for-all [input (gen/fmap pr-str gen/any)] - (= input (parcera/code (parcera/clojure input))))) + (roundtrip input))) (def unambiguous @@ -44,7 +44,7 @@ that any input should (but must not) only have 1 AST representation ... 
however I have found this is not always possible" (prop/for-all [input (gen/fmap pr-str gen/any)] - (= 1 (count (instaparse/parses parcera/clojure input))))) + (clear input))) (deftest simple From c4c71f67c7dfbf9afbfcfa04a9fcc2e31ed3ad08 Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Sun, 13 Oct 2019 14:30:08 +0200 Subject: [PATCH 45/50] more tests --- test/parcera/test/core.cljc | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/test/parcera/test/core.cljc b/test/parcera/test/core.cljc index a54516b..a1e296d 100644 --- a/test/parcera/test/core.cljc +++ b/test/parcera/test/core.cljc @@ -49,14 +49,30 @@ (deftest simple (testing "character literals" - (as-> "\\t" input (is (= input (parcera/code (parcera/clojure input))))) - (as-> "\\n" input (is (= input (parcera/code (parcera/clojure input))))) - (as-> "\\r" input (is (= input (parcera/code (parcera/clojure input))))) - (as-> "\\a" input (is (= input (parcera/code (parcera/clojure input))))) - (as-> "\\é" input (is (= input (parcera/code (parcera/clojure input))))) - (as-> "\\ö" input (is (= input (parcera/code (parcera/clojure input))))) - (as-> "\\ï" input (is (= input (parcera/code (parcera/clojure input))))) - (as-> "\\ϕ" input (is (= input (parcera/code (parcera/clojure input))))))) + (as-> "\\t" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input)))) + (as-> "\\n" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input)))) + (as-> "\\r" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input)))) + (as-> "\\a" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input)))) + (as-> "\\é" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input)))) + (as-> "\\ö" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input)))) + (as-> "\\ï" input (and (is (valid? input)) + (is (roundtrip input)) + (is (clear input)))) + (as-> "\\ϕ" input (and (is (valid? 
input)) + (is (roundtrip input)) + (is (clear input)))))) (deftest data-structures From f433ee093aea751a7f9255add0ebd8eeea17a788 Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Sun, 13 Oct 2019 16:57:29 +0200 Subject: [PATCH 46/50] TODOs added --- src/parcera/core.cljc | 1 + src/parcera/terminals.cljc | 11 +++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/parcera/core.cljc b/src/parcera/core.cljc index efadb22..3411dc0 100644 --- a/src/parcera/core.cljc +++ b/src/parcera/core.cljc @@ -14,6 +14,7 @@ (time (sort-by second > (frequencies (filter keyword? (flatten (clojure core-content :optimize :memory))))))) #_(let [core-content (slurp "https://raw.githubusercontent.com/clojure/clojurescript/master/src/main/clojure/cljs/core.cljc")] (time (sort-by second > (frequencies (filter keyword? (flatten (clojure core-content :optimize :memory))))))) +; todo: performance of [,\s]*;.*|[,\s]+ for whitespace (def grammar-rules "code: form*; diff --git a/src/parcera/terminals.cljc b/src/parcera/terminals.cljc index 2338757..c9788de 100644 --- a/src/parcera/terminals.cljc +++ b/src/parcera/terminals.cljc @@ -1,29 +1,36 @@ (ns parcera.terminals) +; todo: anchor ALL to the beginning of string +; todo: try to avoid lookahead + ;; Clojure's reader is quite permissive so we follow the motto "if it is not forbidden, it is allowed" +; todo: dont allow / (def NAME "[^\\s\\(\\)\\[\\]{}\"@~\\^;`\\\\]+") ; todo: (?!\/) do i need that ? 
;; symbols cannot start with a number, :, # nor ' +; todo: no need for negative lookahead of chars (def symbol-pattern (str "(?![:#\\',]|[+-]?\\d+)(" NAME "\\/)?(\\/|(" NAME "))")) - (def double-suffix "(((\\.\\d*)?([eE][-+]?\\d+)?)M?)") (def long-suffix "((0[xX]([\\dA-Fa-f]+)|0([0-7]+)|([1-9]\\d?)[rR]([\\d\\w]+)|0\\d+)?N?)") (def ratio-suffix "(\\/(\\d+))") -(def number-pattern (str "[+-]?\\d+(" long-suffix "|" double-suffix "|" ratio-suffix ")(?![\\.\\/])")) +(def number-pattern (str "[+-]?\\d+(" long-suffix "|" double-suffix "|" ratio-suffix ")(?![\\.\\/])")) ; todo: word boundary ? ; This is supposed to be the JavaScript friendly version of #'\P{M}\p{M}*+' ; mentioned here: https://www.regular-expressions.info/unicode.html ; It's cooked by this generator: http://kourge.net/projects/regexp-unicode-block ; ticking all 'Combining Diacritical Marks' boxes *)) +; todo: repeated pattern could be simplified (def unicode-char "([^\\u0300-\\u036F\\u1DC0-\\u1DFF\\u20D0-\\u20FF][\\u0300-\\u036F\\u1DC0-\\u1DFF\\u20D0-\\u20FF]*)") (def named-char "(newline|return|space|tab|formfeed|backspace)") (def unicode "(u[\\dD-Fd-f]{4})") +; todo: use word boundary to avoid lookahead (def character-pattern (str "\\\\(" unicode-char "|" named-char "|" unicode ")(?!\\w+)")) ; : is not allowed as first keyword character +; todo: no need for negative lookahead of symbol (def simple-keyword (str ":(?!:)" symbol-pattern)) (def macro-keyword (str "::(?!:)" NAME)) From 3e1b216c6279697ae1a806cc4fbe0d5bba2b23f1 Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Sun, 13 Oct 2019 17:31:43 +0200 Subject: [PATCH 47/50] more todos added --- src/parcera/core.cljc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/parcera/core.cljc b/src/parcera/core.cljc index 3411dc0..88cca67 100644 --- a/src/parcera/core.cljc +++ b/src/parcera/core.cljc @@ -5,6 +5,14 @@ [parcera.terminals :as terminal]) #?(:cljs (:import goog.string.StringBuffer))) +; todo: implement advices from +; 
http://blog.reverberate.org/2013/09/ll-and-lr-in-context-why-parsing-tools.html +; https://www.loggly.com/blog/regexes-the-bad-better-best/ +; https://www.loggly.com/blog/five-invaluable-techniques-to-improve-regex-performance/ + +; todo: use the advice in https://medium.appbase.io/analyzing-20k-github-repositories-af76de21c3fc +; to check if the heuristics are accurate + ; NOTE: Through my experiments I found out that Instaparse will gladly take the ; first match as long as the grammar is not ambiguous. Therefore I switched the ; unordered OR (|) with an ordered one (/). This of course implies an heuristic From 1bc26796cc1a6f213521f10c980d7ff61b1278a7 Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Mon, 14 Oct 2019 20:11:19 +0200 Subject: [PATCH 48/50] version bumped --- project.clj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/project.clj b/project.clj index e05b4d0..e1516a9 100644 --- a/project.clj +++ b/project.clj @@ -1,4 +1,4 @@ -(defproject carocad/parcera "0.2.1" +(defproject carocad/parcera "0.3.0" :description "Grammar-based Clojure(script) parser" :url "https://github.com/carocad/parcera" :license {:name "LGPLv3" From 4002a856b91af678e44e721c1468c760dee8441b Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Mon, 14 Oct 2019 20:23:40 +0200 Subject: [PATCH 49/50] README updated --- README.md | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index a95fbef..083ef8d 100644 --- a/README.md +++ b/README.md @@ -32,20 +32,29 @@ full explanation of the options available for a parser please visit Instaparse w [:symbol "parcera.core"] [:whitespace " "] [:list - [:simple-keyword "require"] + [:simple-keyword ":require"] [:whitespace " "] [:vector [:symbol "instaparse.core"] [:whitespace " "] - [:simple-keyword "as"] + [:simple-keyword ":as"] [:whitespace " "] [:symbol "instaparse"]] [:whitespace " "] - [:vector [:symbol "clojure.data"] [:whitespace " "] [:simple-keyword "as"] [:whitespace " "] 
[:symbol "data"]] + [:vector [:symbol "clojure.data"] [:whitespace " "] [:simple-keyword ":as"] [:whitespace " "] [:symbol "data"]] [:whitespace " "] - [:vector [:symbol "clojure.string"] [:whitespace " "] [:simple-keyword "as"] [:whitespace " "] [:symbol "str"]]]]] + [:vector [:symbol "clojure.string"] [:whitespace " "] [:simple-keyword ":as"] [:whitespace " "] [:symbol "str"]]]]] ;; convert an AST back into a string (parcera/code [:symbol "ns"]) ;; "ns" ``` + +### notes +There are some restrictions as to how much a parser can do. In my experience, these restrictions +are related to some [semantic context-sensitivity](http://blog.reverberate.org/2013/09/ll-and-lr-in-context-why-parsing-tools.html) +which the Clojure reader has embedded into itself. In general I have found the following ones: + - `parcera` doesn't check that a map contains an even number of elements. This is especially difficult + to do since Clojure supports the discard macro `#_ form` which is a valid element but "doesn't count as one" + - `parcera` doesn't check if a map has repeated keys + - `parcera` doesn't check if a set has repeated elements From c0066d3dc7c2a09c230fca20be31c39be54f7322 Mon Sep 17 00:00:00 2001 From: Camilo Roca Date: Mon, 14 Oct 2019 20:36:15 +0200 Subject: [PATCH 50/50] activate benchmark --- .travis.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 106b5a8..1d6d675 100644 --- a/.travis.yml +++ b/.travis.yml @@ -18,10 +18,9 @@ jobs: - lein trampoline test - nvm install 10.10 && nvm use 10.10 && lein trampoline cljsbuild test - # only run the benchmark is we are on master + # only run the benchmark if we are trying to merge to master # otherwise the build takes too long - stage: Benchmark - #if: head_branch = master if: branch = master script: - lein trampoline test :benchmark