diff --git a/.travis.yml b/.travis.yml index 04ed0b3..1d6d675 100644 --- a/.travis.yml +++ b/.travis.yml @@ -18,10 +18,10 @@ jobs: - lein trampoline test - nvm install 10.10 && nvm use 10.10 && lein trampoline cljsbuild test - # only run the benchmark is we are on master + # only run the benchmark if we are trying to merge to master # otherwise the build takes too long - stage: Benchmark - if: head_branch = master + if: branch = master script: - lein trampoline test :benchmark diff --git a/README.md b/README.md index a95fbef..083ef8d 100644 --- a/README.md +++ b/README.md @@ -32,20 +32,29 @@ full explanation of the options available for a parser please visit Instaparse w [:symbol "parcera.core"] [:whitespace " "] [:list - [:simple-keyword "require"] + [:simple-keyword ":require"] [:whitespace " "] [:vector [:symbol "instaparse.core"] [:whitespace " "] - [:simple-keyword "as"] + [:simple-keyword ":as"] [:whitespace " "] [:symbol "instaparse"]] [:whitespace " "] - [:vector [:symbol "clojure.data"] [:whitespace " "] [:simple-keyword "as"] [:whitespace " "] [:symbol "data"]] + [:vector [:symbol "clojure.data"] [:whitespace " "] [:simple-keyword ":as"] [:whitespace " "] [:symbol "data"]] [:whitespace " "] - [:vector [:symbol "clojure.string"] [:whitespace " "] [:simple-keyword "as"] [:whitespace " "] [:symbol "str"]]]]] + [:vector [:symbol "clojure.string"] [:whitespace " "] [:simple-keyword ":as"] [:whitespace " "] [:symbol "str"]]]]] ;; convert an AST back into a string (parcera/code [:symbol "ns"]) ;; "ns" ``` + +### notes +There are some restrictions as to how much a parser can do. In my experience, these restrictions +are related to some [semantic context-sensitivity](http://blog.reverberate.org/2013/09/ll-and-lr-in-context-why-parsing-tools.html) +which the Clojure reader has embedded into itself. In general, I have found the following ones: + - `parcera` doesn't check that a map contains an even number of elements. 
This is especially difficult + to do since Clojure supports the discard macro `#_ form`, which is a valid element but "doesn't count as one" + - `parcera` doesn't check if a map has repeated keys + - `parcera` doesn't check if a set has repeated elements diff --git a/project.clj b/project.clj index e05b4d0..e1516a9 100644 --- a/project.clj +++ b/project.clj @@ -1,4 +1,4 @@ -(defproject carocad/parcera "0.2.1" +(defproject carocad/parcera "0.3.0" :description "Grammar-based Clojure(script) parser" :url "https://github.com/carocad/parcera" :license {:name "LGPLv3" diff --git a/src/parcera/core.cljc b/src/parcera/core.cljc index adb51f3..88cca67 100644 --- a/src/parcera/core.cljc +++ b/src/parcera/core.cljc @@ -1,76 +1,80 @@ (ns parcera.core - (:require [instaparse.core :as instaparse]) + (:require [instaparse.core :as instaparse] + [instaparse.combinators-source :as combi] + [instaparse.cfg :as cfg] + [parcera.terminals :as terminal]) #?(:cljs (:import goog.string.StringBuffer))) -(def grammar +; todo: implement the advice from +; http://blog.reverberate.org/2013/09/ll-and-lr-in-context-why-parsing-tools.html +; https://www.loggly.com/blog/regexes-the-bad-better-best/ +; https://www.loggly.com/blog/five-invaluable-techniques-to-improve-regex-performance/ + +; todo: use the advice in https://medium.appbase.io/analyzing-20k-github-repositories-af76de21c3fc +; to check if the heuristics are accurate + +; NOTE: Through my experiments I found out that Instaparse will gladly take the +; first match as long as the grammar is not ambiguous. Therefore I replaced the +; unordered OR (|) with an ordered one (/). This of course implies a heuristic +; of knowing which grammar rules are expected to match more often. I use +; Clojure's core as a reference with the following code snippet +#_(let [core-content (slurp "https://raw.githubusercontent.com/clojure/clojure/master/src/clj/clojure/core.clj")] + (time (sort-by second > (frequencies (filter keyword? 
(flatten (clojure core-content :optimize :memory))))))) +#_(let [core-content (slurp "https://raw.githubusercontent.com/clojure/clojurescript/master/src/main/clojure/cljs/core.cljc")] + (time (sort-by second > (frequencies (filter keyword? (flatten (clojure core-content :optimize :memory))))))) +; todo: performance of [,\s]*;.*|[,\s]+ for whitespace +(def grammar-rules "code: form*; -