diff --git a/.gitignore b/.gitignore
index 5bae9d9..0655512 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,3 +11,12 @@ pom.xml.asc
/.idea/
/nashorn_code_cache
/.cljs_nashorn_repl
+/build/
+/yarn-error.log
+/node_modules/
+/out/
+/src/java/
+/src/javascript
+/figwheel_server.log
+package*.json
+/.eastwood
diff --git a/.travis.yml b/.travis.yml
index 1d6d675..489935b 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -14,16 +14,17 @@ jobs:
include:
- stage: Tests
script:
+ - curl -O https://www.antlr.org/download/antlr-4.7.1-complete.jar
+ # generate java
+ - java -jar antlr-4.7.1-complete.jar -Xexact-output-dir -o src/java/parcera/antlr -package parcera.antlr -Dlanguage=Java -no-listener -no-visitor src/Clojure.g4
+ # now we can actually proceed with clojure code
- lein do clean, compile, check, eastwood
- lein trampoline test
- - nvm install 10.10 && nvm use 10.10 && lein trampoline cljsbuild test
-
- # only run the benchmark if we are trying to merge to master
- # otherwise the build takes too long
- - stage: Benchmark
- if: branch = master
- script:
- lein trampoline test :benchmark
+ # todo - re-enable js
+ # generate javascript - todo
+ #- java -jar antlr-4.7.1-complete.jar -Xexact-output-dir -o src/javascript/parcera/antlr -package parcera.antlr -Dlanguage=JavaScript -no-listener -no-visitor src/Clojure.g4
+ #- nvm install 10.10 && nvm use 10.10 && lein trampoline cljsbuild test
- stage: Release
if: tag IS present
diff --git a/README.md b/README.md
index 60ab704..e45ac6c 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@
[![Build Status](https://travis-ci.com/carocad/parcera.svg?branch=master)](https://travis-ci.com/carocad/parcera)
[![Clojars Project](https://img.shields.io/clojars/v/carocad/parcera.svg)](https://clojars.org/carocad/parcera)
-Grammar-based Clojure(script) parser.
+Grammar-based Clojure parser.
Parcera can safely read any Clojure file without any code evaluation.
@@ -19,42 +19,33 @@ full explanation of the options available for a parser please visit Instaparse w
[instaparse.core :as instaparse]))
;;parse clojure code from a string
-(parcera/clojure (str '(ns parcera.core
- (:require [instaparse.core :as instaparse]
- [clojure.data :as data]
- [clojure.string :as str]))))
+(parcera/ast (str '(ns parcera.core
+ (:require [instaparse.core :as instaparse]
+ [clojure.data :as data]
+ [clojure.string :as str]))))
;; => returns a data structure with the result from the parser
-[:code
- [:list
- [:symbol "ns"]
- [:whitespace " "]
- [:symbol "parcera.core"]
- [:whitespace " "]
- [:list
- [:simple-keyword ":require"]
- [:whitespace " "]
- [:vector
- [:symbol "instaparse.core"]
- [:whitespace " "]
- [:simple-keyword ":as"]
- [:whitespace " "]
- [:symbol "instaparse"]]
- [:whitespace " "]
- [:vector [:symbol "clojure.data"] [:whitespace " "] [:simple-keyword ":as"] [:whitespace " "] [:symbol "data"]]
- [:whitespace " "]
- [:vector [:symbol "clojure.string"] [:whitespace " "] [:simple-keyword ":as"] [:whitespace " "] [:symbol "str"]]]]]
+(:code
+ (:list
+ (:symbol "ns")
+ (:whitespace " ")
+ (:symbol "parcera.core")
+ (:whitespace " ")
+ (:list
+ (:simple_keyword "require")
+ (:whitespace " ")
+ (:vector
+ (:symbol "instaparse.core")
+ (:whitespace " ")
+ (:simple_keyword "as")
+ (:whitespace " ")
+ (:symbol "instaparse"))
+ (:whitespace " ")
+ (:vector (:symbol "clojure.data") (:whitespace " ") (:simple_keyword "as") (:whitespace " ") (:symbol "data"))
+ (:whitespace " ")
+ (:vector (:symbol "clojure.string") (:whitespace " ") (:simple_keyword "as") (:whitespace " ") (:symbol "str")))))
;; convert an AST back into a string
(parcera/code [:symbol "ns"])
;; "ns"
```
-
-### notes
-There are some restrictions as to how much can a parser do. In my experience, these restrictions
-are related to some [semantic context-sensitivity](http://blog.reverberate.org/2013/09/ll-and-lr-in-context-why-parsing-tools.html).
-which the Clojure reader has embedded into itself. In general I have found the following ones:
-- `parcera` doesnt check that a map contains an even number of elements. This is specially difficult
- to do since Clojure supports the discard macro `#_ form` which is a valid element but "doesnt count as one"
-- `parcera` doesnt check if a map has repeated keys
-- `parcera` doesnt check if a set has repeated elements
diff --git a/pom.xml b/pom.xml
index e78ceff..e9ec8cf 100644
--- a/pom.xml
+++ b/pom.xml
@@ -3,9 +3,9 @@
carocad
parcera
jar
- 0.3.1
+ 0.4.0
parcera
- Grammar-based Clojure(script) parser
+ Grammar-based Clojure parser
https://github.com/carocad/parcera
@@ -17,10 +17,10 @@
https://github.com/carocad/parcera
scm:git:git://github.com/carocad/parcera.git
scm:git:ssh://git@github.com/carocad/parcera.git
- b4ca5c659e55f00781e37bee1dc6bb400460e307
+ 4ff04f242eddc4791cfdf2df572f91890c202e6c
- src
+ src/clojure
test
@@ -28,13 +28,37 @@
+
+ target
+
resources
target
target/classes
-
+
+
+ org.codehaus.mojo
+ build-helper-maven-plugin
+ 1.7
+
+
+ add-source
+ generate-sources
+
+ add-source
+
+
+
+
+
+
+
+
+
+
+
@@ -68,14 +92,9 @@
1.10.1
- instaparse
- instaparse
- 1.4.10
-
-
- org.clojure
- clojurescript
- 1.10.520
+ org.antlr
+ antlr4-runtime
+ 4.7.1
provided
diff --git a/project.clj b/project.clj
index b65b02b..cc778c7 100644
--- a/project.clj
+++ b/project.clj
@@ -1,26 +1,23 @@
-(defproject carocad/parcera "0.3.1"
- :description "Grammar-based Clojure(script) parser"
+(defproject carocad/parcera "0.4.0"
+ :description "Grammar-based Clojure parser"
:url "https://github.com/carocad/parcera"
:license {:name "LGPLv3"
:url "https://github.com/carocad/parcera/blob/master/LICENSE.md"}
- :dependencies [[org.clojure/clojure "1.10.1"]
- [instaparse/instaparse "1.4.10"]]
- :profiles {:dev {:dependencies [[criterium/criterium "0.4.5"] ;; benchmark
- [org.clojure/test.check "0.10.0"]]
- :plugins [[jonase/eastwood "0.3.5"]
- [lein-cljsbuild "1.1.7"]]
- :cljsbuild {:builds
- [{:id "dev"
- :source-paths ["src" "test"]
- :compiler {:main parcera.test-runner
- :output-to "target/out/tests.js"
- :target :nodejs
- :optimizations :none}}]
- :test-commands
- {"test" ["node" "target/out/tests.js"]}}}
- :provided {:dependencies [[org.clojure/clojurescript "1.10.520"]]}}
+ :source-paths ["src/clojure"]
+ :java-source-paths ["src/java"]
+ :dependencies [[org.clojure/clojure "1.10.1"]]
+ :profiles {:dev {:dependencies [[criterium/criterium "0.4.5"] ;; benchmark
+ [org.clojure/test.check "0.10.0"]] ;; generative testing
+ :plugins [[jonase/eastwood "0.3.5"]] ;; linter
+ :resource-paths ["target"]
+ :clean-targets ^{:protect false} ["target"]}
+ ;; java reloader
+ ;[lein-virgil "0.1.9"]]
+ :provided {:dependencies [[org.antlr/antlr4-runtime "4.7.1"]]}}
+
:test-selectors {:default (fn [m] (not (some #{:benchmark} (keys m))))
:benchmark :benchmark}
+
:deploy-repositories [["clojars" {:url "https://clojars.org/repo"
:username :env/clojars_username
:password :env/clojars_password
diff --git a/scripts/figwheel.clj b/scripts/figwheel.clj
new file mode 100644
index 0000000..41fd686
--- /dev/null
+++ b/scripts/figwheel.clj
@@ -0,0 +1,2 @@
+(require '[figwheel.main.api :as fig])
+(fig/start "dev")
diff --git a/src/Clojure.g4 b/src/Clojure.g4
new file mode 100644
index 0000000..a271a7f
--- /dev/null
+++ b/src/Clojure.g4
@@ -0,0 +1,152 @@
+
+grammar Clojure;
+
+/*
+ * NOTES to myself and to other developers:
+ *
+ * - You have to remember that the parser cannot check for semantics
+ * - You have to find the right balance of dividing enforcement between the
+ * grammar and your own code.
+ *
+ * The parser should only check the syntax. So the rule of thumb is that when
+ * in doubt you let the parser pass the content up to your program. Then, in
+ * your program, you check the semantics and make sure that the rule actually
+ * have a proper meaning
+ *
+ * https://tomassetti.me/antlr-mega-tutorial/#lexers-and-parser
+*/
+
+code: form*;
+
+form: whitespace | literal | collection | reader_macro;
+
+// sets and namespaced maps are not considered collections from the grammar perspective
+// since they start with # -> dispatch macro
+collection: list | vector | map;
+
+list: '(' form* ')';
+
+vector: '[' form* ']';
+
+map: '{' form* '}';
+
+literal: keyword | string | number | character | symbol;
+
+keyword: simple_keyword | macro_keyword;
+
+// making symbols, simple and macro keywords be based on NAME allows to
+// conform them all in the same way (see `conform` function)
+simple_keyword: ':' NAME;
+
+macro_keyword: '::' NAME;
+
+string: STRING;
+
+number: NUMBER;
+
+character: CHARACTER;
+
+symbol: NAME;
+
+reader_macro: ( unquote
+ | metadata
+ | backtick
+ | quote
+ | dispatch
+ | unquote_splicing
+ | deref
+ );
+
+unquote: '~' form;
+
+metadata: (metadata_entry whitespace?)+ ( symbol
+ | collection
+ | tag
+ | unquote
+ | unquote_splicing
+ );
+
+metadata_entry: '^' ( map | symbol | string | keyword );
+
+backtick: '`' form;
+
+quote: '\'' form;
+
+unquote_splicing: '~@' form;
+
+deref: '@' form;
+
+dispatch: function
+ | regex
+ | set
+ | conditional
+ | conditional_splicing
+ | namespaced_map
+ | var_quote
+ | discard
+ | tag
+ | symbolic;
+
+function: '#(' form* ')';
+
+regex: '#' STRING;
+
+set: '#{' form* '}';
+
+namespaced_map: '#' ( keyword | auto_resolve) map;
+
+auto_resolve: '::';
+
+var_quote: '#\'' symbol;
+
+discard: '#_' form;
+
+tag: '#' symbol whitespace? (literal | collection);
+
+conditional: '#?(' form* ')';
+
+conditional_splicing: '#?@(' form* ')';
+
+symbolic: '##' ('Inf' | '-Inf' | 'NaN');
+
+// whitespace or comment
+whitespace: WHITESPACE;
+
+NUMBER: [+-]? DIGIT+ (DOUBLE_SUFFIX | LONG_SUFFIX | RATIO_SUFFIX);
+
+STRING: '"' ~["\\]* ('\\' . ~["\\]*)* '"';
+
+WHITESPACE: (SPACE | COMMENT)+;
+
+COMMENT: ';' ~[\r\n]*;
+
+SPACE: [\r\n\t\f, ]+;
+
+CHARACTER: '\\' (UNICODE_CHAR | NAMED_CHAR | UNICODE);
+
+NAME: NAME_HEAD NAME_BODY*;
+
+fragment UNICODE_CHAR: ~[\u0300-\u036F\u1DC0-\u1DFF\u20D0-\u20FF];
+
+fragment NAMED_CHAR: 'newline' | 'return' | 'space' | 'tab' | 'formfeed' | 'backspace';
+
+fragment UNICODE: 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]; // exactly four hex digits, e.g. \u00A0
+
+// re-allow :#' as valid characters inside the name itself
+fragment NAME_BODY: NAME_HEAD | [:#'0-9];
+
+// this is the set of characters that are allowed by all symbols and keywords
+// however, this is more strict than necessary so that we can re-use it for both
+fragment NAME_HEAD: ~[\r\n\t\f ()[\]{}"@~^;`\\,:#'0-9];
+
+fragment DOUBLE_SUFFIX: ((('.' DIGIT*)? ([eE][-+]?DIGIT+)?) 'M'?);
+
+fragment LONG_SUFFIX: ('0'[xX]((DIGIT|[A-Fa-f])+) |
+ '0'([0-7]+) |
+ ([1-9]DIGIT?)[rR](DIGIT[a-zA-Z]+) |
+ '0'DIGIT+
+ )?'N'?;
+
+fragment RATIO_SUFFIX: '/' DIGIT+;
+
+fragment DIGIT: [0-9];
diff --git a/src/clojure/parcera/antlr/java.clj b/src/clojure/parcera/antlr/java.clj
new file mode 100644
index 0000000..525038d
--- /dev/null
+++ b/src/clojure/parcera/antlr/java.clj
@@ -0,0 +1,99 @@
+(ns parcera.antlr.java
+ (:require [parcera.antlr.protocols :as antlr])
+ (:import (parcera.antlr ClojureParser ClojureLexer)
+ (org.antlr.v4.runtime ParserRuleContext Token CommonTokenStream CharStreams ANTLRErrorListener Parser)
+ (org.antlr.v4.runtime.tree ErrorNodeImpl)))
+
+(set! *warn-on-reflection* true)
+
+
+;; A custom Error Listener to avoid Antlr printing the errors on the terminal
+;; by default. This is also useful to mimic Instaparse :total parse mechanism
+;; such that if we get an error, we can report it as the result instead
+(defrecord AntlrFailure [reports]
+  ANTLRErrorListener
+  ;; The three diagnostic callbacks below are not understood yet, so they only
+  ;; print their arguments. If you came here wondering why is this being printed,
+  ;; please open an issue so that we can all benefit from your findings ;)
+  (reportAmbiguity [this parser dfa start-index stop-index exact ambig-alts configs]
+    ;; TODO
+    (println "report ambiguity: " parser dfa start-index stop-index exact ambig-alts configs))
+  (reportAttemptingFullContext [this parser dfa start-index stop-index conflicting-alts configs]
+    ;; TODO
+    (println "report attempting full context: " parser dfa start-index stop-index conflicting-alts configs))
+  (reportContextSensitivity [this parser dfa start-index stop-index prediction configs]
+    ;; TODO
+    (println "report context sensitivity: " parser dfa start-index stop-index prediction configs))
+  (syntaxError [this recognizer offending-symbol line char message error]
+    ;; recognizer is either the parser or the lexer; only the parser has a rule stack
+    (let [parser? (instance? Parser recognizer)
+          basics  {:row     line
+                   :column  char
+                   :message message
+                   :type    (if parser? :parser :lexer)}
+          report  (cond-> basics
+                    parser?       (assoc :symbol (str offending-symbol)
+                                         :stack  (->> (.getRuleInvocationStack ^Parser recognizer)
+                                                      (reverse)
+                                                      (map keyword)))
+                    (some? error) (assoc :error error))]
+      (vswap! reports conj report))))
+
+
+;; start and stop are tokens, not positions: '(hello/world)' has '(' 'hello/world' ')' as tokens
+;; NOTE(review): ::end assumes that the last token does not contain line breaks - confirm
+(extend-type ParserRuleContext
+  antlr/ParserRule
+  (children [^ParserRuleContext this] (.-children this))
+  (rule-index [^ParserRuleContext this] (.getRuleIndex this))
+  antlr/LocationInfo
+  (span [^ParserRuleContext this]
+    (let [start (.getStart this)
+          stop  (.getStop this)]
+      (cond
+        ;; the parser rule is a single lexer rule; end column = start column + token length
+        (= start stop)
+        {::start {:row    (.getLine start)
+                  :column (.getCharPositionInLine start)}
+         ::end   {:row    (.getLine start)
+                  :column (+ (.getCharPositionInLine start) (count (.getText start)))}}
+
+        ;; no end found - happens on errors
+        (nil? stop)
+        {::start {:row    (.getLine start)
+                  :column (.getCharPositionInLine start)}}
+
+        :else
+        {::start {:row    (.getLine start)
+                  :column (.getCharPositionInLine start)}
+         ::end   {:row    (.getLine stop)
+                  :column (+ (.getCharPositionInLine stop) (count (.getText stop)))}}))))
+
+
+(extend-type ErrorNodeImpl
+  antlr/ErrorNode ;; parcera.core detects error nodes through `satisfies? antlr/ErrorNode`
+  (token [^ErrorNodeImpl this] (.-symbol this))
+  antlr/LocationInfo ;; an error node is a single token, so only its start is reported
+  (span [^ErrorNodeImpl this]
+    {::start {:row (.getLine (.-symbol this)) :column (.getCharPositionInLine (.-symbol this))}}))
+
+
+(extend-type ClojureParser
+  antlr/AntlrParser
+  (rules [^ClojureParser this] (mapv keyword (.getRuleNames this))) ; vector of rule keywords
+  (tree [^ClojureParser this] (.code this))) ; `code` is the top rule of the grammar
+
+
+(defn parser
+  "Builds the Antlr lexer and parser for `input`; both report to one shared AntlrFailure"
+  [input]
+  (let [failures (->AntlrFailure (volatile! ()))
+        lexer    (doto (ClojureLexer. (CharStreams/fromString input))
+                   (.removeErrorListeners)
+                   (.addErrorListener failures))
+        result   (doto (ClojureParser. (CommonTokenStream. lexer))
+                   (.setBuildParseTree true)
+                   (.removeErrorListeners)
+                   (.addErrorListener failures))]
+    {:parser result
+     :errors {:parser failures}}))
diff --git a/src/clojure/parcera/antlr/javascript.cljs b/src/clojure/parcera/antlr/javascript.cljs
new file mode 100644
index 0000000..abb0afe
--- /dev/null
+++ b/src/clojure/parcera/antlr/javascript.cljs
@@ -0,0 +1,50 @@
+(ns parcera.antlr.javascript
+ (:require [parcera.antlr.protocols :as antlr]
+ [antlr4 :refer [CharStreams CommonTokenStream]]
+ [parcera.antlr.ClojureLexer :refer [ClojureLexer]]
+ [parcera.antlr.ClojureParser :refer [ClojureParser]]))
+
+(set! *warn-on-infer* true)
+
+
+#_(extend-type ParserRuleContext
+ antlr/ParserRule
+ (children [^ParserRuleContext this] (.-children this))
+ (rule-index [^ParserRuleContext this] (.getRuleIndex this))
+ (start [^ParserRuleContext this] (.getStart this))
+ (end [^ParserRuleContext this] (.getStop this)))
+
+
+#_(extend-type ErrorNodeImpl
+ antlr/ErrorNode
+ (token [^ErrorNodeImpl this] (.-symbol this)))
+
+
+#_(extend-type Token
+ antlr/Token
+ (row [^Token this] (.getLine this))
+ (column [^Token this] (.getCharPositionInLine this)))
+
+
+#_(extend-type clojureParser
+ antlr/AntlrParser
+ (rules [^clojureParser this] (vec (.getRuleNames this)))
+ (tree [^clojureParser this] (. this (code))))
+
+
+(defn parser ; stub while javascript support is disabled: `input` is returned as is under :parser
+ [input]
+ {:parser input})
+
+#_(defn parser
+ [input listener]
+ (let [chars (CharStreams/fromString input)
+ lexer (doto (new clojureLexer chars)
+ (.removeErrorListeners))
+ ;; todo: how to handle lexer errors ?
+ ;(.addErrorListener listener))
+ tokens (new CommonTokenStream lexer)]
+ (doto (new clojureParser tokens)
+ (.setBuildParseTree true)
+ (.removeErrorListeners)
+ (.addErrorListener listener))))
diff --git a/src/clojure/parcera/antlr/protocols.cljc b/src/clojure/parcera/antlr/protocols.cljc
new file mode 100644
index 0000000..7acde2b
--- /dev/null
+++ b/src/clojure/parcera/antlr/protocols.cljc
@@ -0,0 +1,21 @@
+(ns parcera.antlr.protocols
+ "These protocols are a cheat: I use them to be able to dispatch
+ to both Java and JavaScript parser implementations without the
+ common code having to know about it")
+
+
+(defprotocol AntlrParser
+ (rules [this]) ; rule names; parcera.core looks them up by rule index
+ (tree [this])) ; the parse tree, starting at the grammar's `code` rule
+
+
+(defprotocol ParserRule
+ (children [this]) ; child nodes of this rule node
+ (rule-index [this])) ; index of this rule inside `rules`
+
+(defprotocol LocationInfo
+ (span [this])) ; map with the start (and, when known, end) row/column of a node
+
+
+(defprotocol ErrorNode
+ (token [this])) ; NOTE(review): presumably the token behind an error node - confirm
diff --git a/src/clojure/parcera/core.cljc b/src/clojure/parcera/core.cljc
new file mode 100644
index 0000000..7e250f6
--- /dev/null
+++ b/src/clojure/parcera/core.cljc
@@ -0,0 +1,268 @@
+(ns parcera.core
+ (:require [parcera.antlr.protocols :as antlr]
+ #?(:clj [parcera.antlr.java :as platform]))
+ ; todo: re-enable once we have javascript support
+ ;:cljs [parcera.antlr.javascript :as platform]))
+ #?(:cljs (:import goog.string.StringBuffer)))
+
+
+(def default-hidden {:tags #{:form :collection :literal :keyword :reader_macro :dispatch}
+ :literals #{"(" ")" "[" "]" "{" "}" "#{" "#" "^" "`" "'" "~"
+ "~@" "@" "#(" "#'" "#_" "#?(" "#?@(" "##" ":" "::"}})
+
+
+;; for some reason cljs doesn't accept escaping the / characters
+(def name-pattern #?(:clj #"^([^\s\/]+\/)?(\/|[^\s\/]+)$"
+ :cljs #"^([^\s/]+/)?(/|[^\s/]+)$"))
+
+
+(defn- failure
+  "Runs the extra validations which are too difficult to represent with
+  pure Antlr4 syntax. Yields a `::failure` node carrying a message whenever
+  `rule` breaks one of them, nil otherwise"
+  [rule children metadata]
+  (let [reject  (fn [message]
+                  (with-meta (list ::failure (cons rule children))
+                             (assoc-in metadata [::start :message] message)))
+        ;; whitespace and discarded forms are not elements of a collection
+        content (fn [] (remove (comp #{:whitespace :discard} first) children))]
+    (case rule
+      ;; symbols and keywords are validated against the same name pattern
+      (:symbol :simple_keyword :macro_keyword)
+      (when (nil? (re-find name-pattern (first children)))
+        (reject "name cannot contain more than one /"))
+
+      :map
+      (when (odd? (count (content)))
+        (reject "Map literal must contain an even number of forms"))
+
+      :set
+      (let [forms (content)
+            total (count forms)]
+        (when (not= total (count (distinct forms)))
+          (reject "Set literal contains duplicate forms")))
+
+      ;; every other rule is accepted as is
+      nil)))
+
+
+(defn- hiccup
+  "Turns an Antlr `tree` into a `hiccup-like` ast data structure.
+
+  The children of a node are a lazy sequence, to avoid expensive computation
+  whenever the user is not interested in the full content."
+  [tree rule-names hide-tags hide-literals]
+  (cond
+    (boolean (satisfies? antlr/ParserRule tree))
+    (let [rule     (get rule-names (antlr/rule-index tree))
+          convert  (fn [node] (hiccup node rule-names hide-tags hide-literals))
+          ;; hidden literals come back as nil and are dropped here
+          children (keep convert (antlr/children tree))
+          ;; attach meta data ... ala instaparse
+          location (antlr/span tree)
+          ;; extra validation rules
+          rejected (failure rule children location)]
+      (cond
+        ;; parcera hidden tags are always "or" statements, so just take the single children
+        (contains? hide-tags rule) (first children)
+        (some? rejected)           rejected
+        :else                      (with-meta (cons rule children) location)))
+
+    (boolean (satisfies? antlr/ErrorNode tree))
+    (with-meta (list ::failure (str tree))
+               (antlr/span tree))
+
+    :else
+    (let [text (str tree)]
+      (when-not (contains? hide-literals text) text))))
+
+
+(defn- unhide
+  [options]
+  (apply dissoc default-hidden (case (:unhide options) ; keys which stop being hidden
+                                 :all     [:literals :tags]
+                                 :content [:literals]
+                                 :tags    [:tags]
+                                 [])))                 ; no or unknown option: hide the defaults
+
+
+(defn ast
+  "Clojure (antlr4) parser. Use it as:
+   - `(parcera/ast input-string)`
+     -> yields a lazy AST representation of input-string
+
+   Accepted options:
+   - `:unhide` one of `#{:tags :content :all}`. Defaults to `nil`
+
+   NOTE: Antlr parses the complete input string up front; only the
+         conversion into Clojure's immutable data structures is lazy.
+         The parser error reports are attached as `::errors` metadata"
+  [input & {:as options}]
+  (let [{:keys [tags literals]} (unhide options)
+        {:keys [parser errors]} (platform/parser input)
+        rule-names              (antlr/rules parser)
+        tree                    (antlr/tree parser)
+        listener                (:parser errors)
+        result                  (hiccup tree rule-names tags literals)]
+    (vary-meta result assoc ::errors @(:reports listener))))
+
+
+(defn- code*
+ "Appends the source text of `ast` to `string-builder` (Clojure's str would be too slow).
+ There is no default clause: a node with an unknown tag makes `case` throw"
+ [ast #?(:clj ^StringBuilder string-builder
+ :cljs ^StringBuffer string-builder)]
+ (case (first ast)
+ :code
+ (doseq [child (rest ast)]
+ (code* child string-builder))
+
+ :list
+ (do (. string-builder (append "("))
+ (doseq [child (rest ast)] (code* child string-builder))
+ (. string-builder (append ")")))
+
+ :vector
+ (do (. string-builder (append "["))
+ (doseq [child (rest ast)] (code* child string-builder))
+ (. string-builder (append "]")))
+
+ :namespaced_map
+ (do (. string-builder (append "#"))
+ (doseq [child (rest ast)] (code* child string-builder)))
+
+ :map
+ (do (. string-builder (append "{"))
+ (doseq [child (rest ast)] (code* child string-builder))
+ (. string-builder (append "}")))
+
+ :set
+ (do (. string-builder (append "#{"))
+ (doseq [child (rest ast)] (code* child string-builder))
+ (. string-builder (append "}")))
+
+ (:number :whitespace :symbol :character :string)
+ (. string-builder (append (second ast)))
+
+ :symbolic
+ (do (. string-builder (append "##"))
+ (. string-builder (append (second ast))))
+
+ :regex
+ (do (. string-builder (append "#"))
+ (. string-builder (append (second ast))))
+
+ :auto_resolve
+ (. string-builder (append "::"))
+
+ :simple_keyword
+ (do (. string-builder (append ":"))
+ (. string-builder (append (second ast))))
+
+ :macro_keyword
+ (do (. string-builder (append "::"))
+ (. string-builder (append (second ast))))
+
+ :metadata
+ (do (doseq [child (rest (butlast ast))] (code* child string-builder))
+ (code* (last ast) string-builder))
+
+ :metadata_entry
+ (doseq [child (rest ast)]
+ (. string-builder (append "^"))
+ (code* child string-builder))
+
+ :quote
+ (do (. string-builder (append "'"))
+ (doseq [child (rest ast)] (code* child string-builder)))
+
+ :var_quote
+ (do (. string-builder (append "#'"))
+ (code* (second ast) string-builder))
+
+ :discard
+ (do (. string-builder (append "#_"))
+ (doseq [child (rest ast)] (code* child string-builder)))
+
+ :tag
+ (do (. string-builder (append "#"))
+ (doseq [child (rest ast)] (code* child string-builder)))
+
+ :backtick
+ (do (. string-builder (append "`"))
+ (doseq [child (rest ast)] (code* child string-builder)))
+
+ :unquote
+ (do (. string-builder (append "~"))
+ (doseq [child (rest ast)] (code* child string-builder)))
+
+ :unquote_splicing
+ (do (. string-builder (append "~@"))
+ (doseq [child (rest ast)] (code* child string-builder)))
+
+ :conditional
+ (do (. string-builder (append "#?("))
+ (doseq [child (rest ast)] (code* child string-builder))
+ (. string-builder (append ")")))
+
+ :conditional_splicing
+ (do (. string-builder (append "#?@("))
+ (doseq [child (rest ast)] (code* child string-builder))
+ (. string-builder (append ")")))
+
+ :deref
+ (do (. string-builder (append "@"))
+ (doseq [child (rest ast)] (code* child string-builder)))
+
+ :function
+ (do (. string-builder (append "#("))
+ (doseq [child (rest ast)] (code* child string-builder))
+ (. string-builder (append ")")))))
+
+
+(defn code
+ "Transforms your AST back into code
+
+ ast: The nested sequence of [:keyword & content] which MUST follow the
+ same structure as the result of `(parcera/ast input-string)`
+
+ Returns a string representation of the provided AST
+
+ In general (= input (parcera/code (parcera/ast input)))"
+ [ast]
+ (let [string-builder #?(:clj (new StringBuilder)
+ :cljs (new StringBuffer))]
+ (code* ast string-builder)
+ (. string-builder (toString))))
+
+
+(defn failure?
+  "Tells whether `ast` holds any `::failure` instance.
+
+  NOTE: This function is potentially slow since it might have to walk the
+        complete ast before it can answer that there are no failures.
+
+  Whenever possible, prefer to handle errors directly appearing in the ast"
+  [ast]
+  (or
+    ;; root node: the parser reports are attached as metadata
+    (boolean (seq (::errors (meta ast))))
+    ;; child node: the node itself is the failure
+    (and (seq? ast) (= ::failure (first ast)))
+    ;; root node which "doesnt know" about the failure -> conformed
+    (some #{::failure} (tree-seq seq? identity ast))))
+
+#_(time (ast (str '(ns parcera.core
+ (:require [instaparse.core :as instaparse]
+ [clojure.data :as data]
+ [clojure.string :as str])))))
+
+#_(time (ast "(ns parcera.core
+ (:require [instaparse.core :as #{:hello \"world\" :hello}]
+ [clojure.data :as data]
+ [clojure.string :as str])"))
+
+#_(filter :meta (map #(hash-map :item % :meta (meta %))
+ (tree-seq seq? seq (ast "
+ (ns
+ parcera.core))"))))
diff --git a/src/parcera/slurp.cljc b/src/clojure/parcera/slurp.cljc
similarity index 100%
rename from src/parcera/slurp.cljc
rename to src/clojure/parcera/slurp.cljc
diff --git a/src/parcera/core.cljc b/src/parcera/core.cljc
deleted file mode 100644
index 88cca67..0000000
--- a/src/parcera/core.cljc
+++ /dev/null
@@ -1,262 +0,0 @@
-(ns parcera.core
- (:require [instaparse.core :as instaparse]
- [instaparse.combinators-source :as combi]
- [instaparse.cfg :as cfg]
- [parcera.terminals :as terminal])
- #?(:cljs (:import goog.string.StringBuffer)))
-
-; todo: implement advices from
-; http://blog.reverberate.org/2013/09/ll-and-lr-in-context-why-parsing-tools.html
-; https://www.loggly.com/blog/regexes-the-bad-better-best/
-; https://www.loggly.com/blog/five-invaluable-techniques-to-improve-regex-performance/
-
-; todo: use advices in https://medium.appbase.io/analyzing-20k-github-repositories-af76de21c3fc
-; to check if the heuristics are accurate
-
-; NOTE: Through my experiments I found out that Instaparse will gladly take the
-; first match as long as the grammar is not ambiguous. Therefore I switched the
-; unordered OR (|) with an ordered one (/). This of course implies an heuristic
-; of knowing which grammar rules are expected to match more often. I use
-; Clojure's core as a reference with the following code snippet
-#_(let [core-content (slurp "https://raw.githubusercontent.com/clojure/clojure/master/src/clj/clojure/core.clj")]
- (time (sort-by second > (frequencies (filter keyword? (flatten (clojure core-content :optimize :memory)))))))
-#_(let [core-content (slurp "https://raw.githubusercontent.com/clojure/clojurescript/master/src/main/clojure/cljs/core.cljc")]
- (time (sort-by second > (frequencies (filter keyword? (flatten (clojure core-content :optimize :memory)))))))
-; todo: performance of [,\s]*;.*|[,\s]+ for whitespace
-(def grammar-rules
- "code: form*;
-
-