Skip to content

Commit

Permalink
feat!: Implement new core: new lexer/parser/visitor/plugin systems (#40)
Browse files Browse the repository at this point in the history
* feat(fs): introduce the fs plugin

* fix(fs): typo

* feat(fs): convert sandboxdir into a function

* fix(fs): fix return values

* feat(fs): finish up and add tests

* feat(git): introduce the git module

* feat!(core,http,ethereum): unify the parser

* feat(core)!: introduce new lexer/parser/visitor

This new core here eases the heavy work on the developer by simplifying the
API while keeping all the things we want.  At first, we used to use Chevrotain
with a method that didn't allow us to detect the
individual syntax errors of a custom statement we defined.  I worked around that
issue by casting black-magic spells over Chevrotain internals.  I hated that
solution, but at that time, it was the most practical one.

Now, here, I introduce the new core with its new lexer/parser/visitor that is
smart enough to cover most of our real-world needs.  The visitor is in perfect
shape.  The lexer is almost perfect, perhaps needs just a little bit of
polishing, but it is the one that is the simplest.  The parser is smart enough
to cover most of our real-world cases, but it also needs some treatment.  I'm
planning to have a ranking system over it.  Currently, it uses the number of
errors detected on a statement, and using that, it ranks its matches.  This
solution appears to be quite reliable in practice.

The way we use the API is quite simple ("Given I" parts can also be "Then I"):

	import {Plugin} from '@slangroom/core'

	const p = new Plugin();
	const cb = (ctx) => ctx.fail('example') // cb is called when statement is matched
	p.new('love asche', cb)
	// -> Given I love Asche

	p.new('open', 'read file contents', cb)
	// -> Given I open 'ident' and read file contents

	p.new('connect', 'send http request', cb)
	// -> Given I connect to 'ident' and send http request

	p.new(['base32'], 'convert to base64', cb)
	// -> Given I send base32 'ident' and convert to base64

	p.new('connect', ['object', 'proxy'], 'send http request', cb)
	// -> Given I connect to 'ident' and send object 'ident' and send proxy 'ident' and send http request

	export const myPlugin = p;

* feat(ethereum)!: port over to the new core and fix tests

* feat(http)!: port over to the new core and fix tests

* fix(core): relax phrase and params checks

* feat(wallet)!: port over to the new core

Currently, the tests fail for some reason.

* feat(core)!: forbid spaces in params

* fix(wallet)!: switch over to underscore params

* fix: wrong order of the bindings set /wor/ident/

* fix: the wallet params names

* lint: linting

* fix(core): fix parsing of params order

* test(ethereum): fix rule ignore

* fix(wallet): parameter names

---------

Co-authored-by: Puria Nafisi Azizi <[email protected]>
  • Loading branch information
denizenging and puria authored Nov 16, 2023
1 parent f1c354f commit 94ebaa1
Show file tree
Hide file tree
Showing 42 changed files with 3,034 additions and 1,523 deletions.
2 changes: 1 addition & 1 deletion pkg/core/src/index.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
export * from '@slangroom/core/lexicon';
export * from '@slangroom/core/lexer';
export * from '@slangroom/core/parser';
export * from '@slangroom/core/lexer';
export * from '@slangroom/core/visitor';
Expand Down
57 changes: 48 additions & 9 deletions pkg/core/src/lexer.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,49 @@
import { Lexicon } from '@slangroom/core';
import { Lexer } from '@slangroom/deps/chevrotain';

/**
* Lexes the given line.
*/
export const lex = (lexicon: Lexicon, line: string) => {
const lexer = new Lexer(lexicon.tokens);
return lexer.tokenize(line);
/**
 * A single lexeme of a statement line, along with its position in the line.
 *
 * The `start`/`end` indices are inclusive and refer to code points (the
 * lexer iterates `[...line]`), not UTF-16 code units.
 */
export class Token {
	/** Lowercased form of {@link raw}, for case-insensitive comparisons. */
	readonly name: string;

	/** Whether this token is a single-quoted identifier. */
	readonly isIdent: boolean;

	constructor(
		/** The lexeme exactly as it appears in the line. */
		readonly raw: string,
		/** Inclusive index of the token's first character. */
		readonly start: number,
		/** Inclusive index of the token's last character. */
		readonly end: number,
	) {
		this.name = raw.toLowerCase();
		this.isIdent = raw.startsWith("'");
	}
}

/**
 * Raised by the lexer when a single-quoted identifier is opened but never
 * closed before the end of the line.
 */
export class LexError extends Error {
	constructor(t: Token) {
		// Error(message) sets this.message for us.
		super(`unclosed single-quote at ${t.start},${t.end}: ${t.raw}`);
		this.name = 'LexError';
	}
}

/**
 * Splits a statement line into {@link Token}s.
 *
 * A token is either a single-quoted identifier (quotes kept in the raw
 * lexeme) or a maximal run of characters containing no blanks, tabs, or
 * single-quotes.  Iteration is over code points, so token positions are
 * code-point indices.
 *
 * @throws {LexError} when a single-quote is opened but never closed.
 */
export const lex = (line: string): Token[] => {
	const chars = [...line];
	const found: Token[] = [];
	let pos = 0;

	while (pos < chars.length) {
		// Skip the run of blanks and tabs between tokens.
		while (chars[pos] === ' ' || chars[pos] === '\t') ++pos;
		if (pos >= chars.length) break;

		const start = pos;
		if (chars[pos] === "'") {
			// Quoted identifier: consume up to and including the closing quote.
			let lexeme = "'";
			++pos;
			while (pos < chars.length && chars[pos] !== "'") lexeme += chars[pos++];
			// Ran off the end of the line without finding the closing quote.
			if (pos >= chars.length) throw new LexError(new Token(lexeme, start, chars.length - 1));
			lexeme += chars[pos++];
			found.push(new Token(lexeme, start, pos - 1));
		} else {
			// Bare word: consume until whitespace, a quote, or the end.
			let lexeme = '';
			while (
				pos < chars.length &&
				chars[pos] !== ' ' &&
				chars[pos] !== '\t' &&
				chars[pos] !== "'"
			)
				lexeme += chars[pos++];
			// The current character is a non-blank, non-quote char, so the
			// word is guaranteed non-empty here.
			found.push(new Token(lexeme, start, pos - 1));
		}
	}

	return found;
};
6 changes: 3 additions & 3 deletions pkg/core/src/lexicon.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ export class Lexicon {
name: 'Whitespace',
pattern: /\s+/,
group: Lexer.SKIPPED,
})
}),
);

this.#store.set(
Expand All @@ -25,15 +25,15 @@ export class Lexicon {
name: 'Comment',
pattern: /#[^\n\r]*/,
group: 'comments',
})
}),
);

this.#store.set(
'identifier',
createToken({
name: 'Identifier',
pattern: /'(?:[^\\']|\\(?:[bfnrtv'\\/]|u[0-9a-fA-F]{4}))*'/,
})
}),
);
}

Expand Down
241 changes: 138 additions & 103 deletions pkg/core/src/parser.ts
Original file line number Diff line number Diff line change
@@ -1,119 +1,154 @@
import { Lexicon } from '@slangroom/core';
import {
CstParser,
type IToken,
type CstNode,
type IOrAlt,
type ConsumeMethodOpts,
} from '@slangroom/deps/chevrotain';

export type StatementCst = CstNode & {
children: { [K in string]: [PhraseCst] };
};
import { PluginMap, Token, type PluginMapKey } from '@slangroom/core';

export class ParseError extends Error {
static wrong(have: Token, wantFirst: string, ...wantRest: string[]) {
const wants = [wantFirst, ...wantRest];
return new ParseError(
`"${have.raw}" between (${have.start}, ${have.end}) must be one of: ${wants.join(
', ',
)}`,
);
}

export type PhraseCst = CstNode & {
children: {
connect?: [IToken];
} & { open?: [IToken] } & { into?: [IToken] } & {
[K in string]: [IToken | PhraseCst];
};
};
static missing(wantFirst: string, ...wantRest: string[]) {
const wants = [wantFirst, ...wantRest];
return new ParseError(`missing token(s): ${wants.join(', ')}`);
}

export class Parser extends CstParser {
#phrases: IOrAlt<unknown>[];
#lexicon: Lexicon;

constructor(lexicon: Lexicon, parsers: ((this: Parser) => void)[]) {
super(lexicon.tokens, { maxLookahead: 1024 });
this.#lexicon = lexicon;
parsers = [...new Set(parsers)];
parsers.forEach((p) => p.apply(this));
this.#phrases = Object.entries(this).reduce((acc, [k, v]) => {
if (k.endsWith('Phrase') && typeof v === 'function')
acc.push({ ALT: () => this.SUBRULE(v) });
return acc;
}, [] as IOrAlt<unknown>[]);
this.performSelfAnalysis();
static extra(token: Token) {
return new ParseError(`extra token (${token.start}, ${token.end}): ${token.raw}`);
}

/**
* {@inheritDoc Lexicon.token}
* @internal
*/
#token(name: string) {
return this.#lexicon.token(name);
constructor(message: string) {
super(message);
this.name = 'ParseError';
}
}

tokenn(idx: number, name: string, opts?: ConsumeMethodOpts) {
this.consume(idx, this.#token(name), opts);
}
/**
 * The result of parsing a single statement line against the plugin map.
 */
export type Cst = {
	/** Which opening keyword the line used; absent when neither "given" nor "then" was found. */
	givenThen?: 'given' | 'then';
	/** Errors found in the statement head (the "Given/Then I" part), shared by all matches. */
	errors: ParseError[];
	/** The candidate plugin matches kept so far; matches with fewer per-match errors displace worse ones. */
	matches: Match[];
};

token(name: string, opts?: ConsumeMethodOpts) {
this.tokenn(0, name, opts);
}
/**
 * One candidate interpretation of a statement against a single plugin key.
 */
export type Match = {
	/** Parameter name → identifier bound by a "send <param> 'ident'" clause. */
	bindings: Map<string, string>;
	/** The plugin-map key this match was attempted against. */
	key: PluginMapKey;
	/** Errors collected while matching this particular key; used to rank matches. */
	err: ParseError[];
	/** Identifier of a trailing "and output into 'ident'" clause, if present. */
	into?: string;
} & (
	// "open" and "connect" are mutually exclusive; `never` on the other
	// member forbids a match from carrying both.
	| {
			/** Identifier of an "open 'ident'" clause. */
			open?: string;
			connect?: never;
	  }
	| {
			open?: never;
			/** Identifier of a "connect to 'ident'" clause. */
			connect?: string;
	  }
);

export const parse = (p: PluginMap, t: Token[]): Cst => {
const cst: Cst = {
matches: [],
errors: [],
};
let givenThen: 'given' | 'then' | undefined;

token1(name: string, opts?: ConsumeMethodOpts) {
this.tokenn(1, name, opts);
}
// Given or Then
if (t[0]?.name === 'given') givenThen = 'given';
else if (t[0]?.name === 'then') givenThen = 'then';
else if (t[0]) cst.errors.push(ParseError.wrong(t[0], 'given', 'then'));
else cst.errors.push(ParseError.missing('given', 'then'));

token2(name: string, opts?: ConsumeMethodOpts) {
this.tokenn(2, name, opts);
}
// TODO: should we allow "that" here ("Given that I")

token3(name: string, opts?: ConsumeMethodOpts) {
this.tokenn(3, name, opts);
// I
if (t[1]) {
if (t[1]?.raw !== 'I') cst.errors.push(ParseError.wrong(t[1], 'I'));
} else {
cst.errors.push(ParseError.missing('I'));
}

statement = this.RULE('statement', () => {
this.OR(this.#phrases);
p.forEach(([k]) => {
let i = 1;
const m: Match = { key: k, bindings: new Map(), err: [] };
const curErrLen = cst.matches[0]?.err.length;
const lemmeout = {};
const newErr = (have: undefined | Token, wantsFirst: string, ...wantsRest: string[]) => {
if (have) m.err.push(ParseError.wrong(have, wantsFirst, ...wantsRest));
else m.err.push(ParseError.missing(wantsFirst, ...wantsRest));
if (curErrLen !== undefined && m.err.length > curErrLen) throw lemmeout;
};
try {
// Open 'ident' and|Connect to 'ident' and
if (k.openconnect === 'open') {
if (t[++i]?.name !== 'open') newErr(t[i], 'open');
const ident = t[++i];
if (ident?.isIdent) m.open = ident.raw.slice(1, -1);
else newErr(ident, '<identifier>');
if (t[++i]?.name !== 'and') newErr(t[i], 'and');
} else if (k.openconnect === 'connect') {
if (t[++i]?.name !== 'connect') newErr(t[i], 'connect');
if (t[++i]?.name !== 'to') newErr(t[i], 'connect');
const ident = t[++i];
if (ident?.isIdent) m.connect = ident.raw.slice(1, -1);
else newErr(ident, '<identifier>');
if (t[++i]?.name !== 'and') newErr(t[i], 'and');
}

// Send $buzzword 'ident' And
// TODO: allow spaces in between params
const params = new Set(k.params);
k.params?.forEach(() => {
if (t[++i]?.name !== 'send') newErr(t[i], 'send');

const tokName = t[++i];
if (tokName && params.has(tokName.name)) {
params.delete(tokName.name);
} else {
const [first, ...rest] = [...params.values()] as [string, ...string[]];
newErr(t[i], first, ...rest);
}

const ident = t[++i];
if (ident?.isIdent) {
if (tokName) m.bindings.set(tokName.name, ident.raw.slice(1, -1));
} else {
newErr(ident, '<identifier>');
}
if (t[++i]?.name !== 'and') newErr(t[i], 'and');
});

// $buzzwords
k.phrase.split(' ').forEach((name) => t[++i]?.name !== name && newErr(t[i], name));

// Save Output Into 'ident'
const ident = t[t.length - 1];
if (t.length - i >= 5 && ident?.isIdent) {
for (++i; i < t.length - 4; ++i) m.err.push(ParseError.extra(t[i] as Token));
if (t[t.length - 4]?.name !== 'and') newErr(t[t.length - 4], 'and');
if (t[t.length - 3]?.name !== 'output') newErr(t[t.length - 3], 'output');
if (t[t.length - 2]?.name !== 'into') newErr(t[t.length - 2], 'into');
if (
t[t.length - 4]?.name === 'and' &&
t[t.length - 3]?.name === 'output' &&
t[t.length - 2]?.name === 'into'
)
m.into = ident.raw.slice(1, -1);
} else {
for (++i; i < t.length; ++i) m.err.push(ParseError.extra(t[i] as Token));
}

if (curErrLen !== undefined && m.err.length > curErrLen) throw lemmeout;
if (curErrLen !== undefined && m.err.length < curErrLen) cst.matches.length = 0;
cst.matches.push(m);
} catch (e) {
if (e !== lemmeout) throw e;
}
});

connect() {
this.tokenn(255, 'connect');
this.tokenn(255, 'to');
this.tokenn(255, 'identifier', { LABEL: 'connect' });
this.tokenn(255, 'and');
}

open() {
this.tokenn(255, 'open');
this.tokenn(255, 'identifier', { LABEL: 'open' });
this.tokenn(255, 'and');
}

into() {
this.tokenn(254, 'and');
this.tokenn(254, 'output');
this.tokenn(254, 'into');
this.tokenn(254, 'identifier', { LABEL: 'into' });
}

sendpassn(idx: number, parameter: string) {
this.or(idx, [
{ ALT: () => this.tokenn(idx, 'send', { LABEL: `sendpass${idx}` }) },
{ ALT: () => this.tokenn(idx, 'pass', { LABEL: `sendpass${idx}` }) },
]);
this.tokenn(idx, parameter, { LABEL: `sendpass${idx}.parameter` });
this.tokenn(idx, 'identifier', { LABEL: `sendpass${idx}.identifier` });
this.tokenn(idx, 'and', { LABEL: `sendpass${idx}.and` });
}

sendpass(parameter: string) {
this.sendpassn(0, parameter);
}

sendpass1(parameter: string) {
this.sendpassn(1, parameter);
}

sendpass2(parameter: string) {
this.sendpassn(2, parameter);
}
}

export const parse = (parser: Parser, tokens: IToken[]) => {
parser.input = tokens;
return {
cst: parser.statement(),
errors: parser.errors,
};
if (givenThen) cst.givenThen = givenThen;
return cst;
};
Loading

0 comments on commit 94ebaa1

Please sign in to comment.