diff --git a/src/compiler/deserialization/ucd-compiler.ts b/src/compiler/deserialization/ucd-compiler.ts
index 73709438..715967cc 100644
--- a/src/compiler/deserialization/ucd-compiler.ts
+++ b/src/compiler/deserialization/ucd-compiler.ts
@@ -350,6 +350,7 @@ export namespace UcdCompiler {
       | UccFeature
       | readonly UccFeature[]
       | undefined;
+    readonly embed?: EsSnippet | undefined;
     readonly exportDefaults?: boolean | undefined;
 
     createDeserializer?<T, TSchema extends UcSchema<T>>(
diff --git a/src/compiler/deserialization/ucd-function.ts b/src/compiler/deserialization/ucd-function.ts
index cabfb831..4b141b17 100644
--- a/src/compiler/deserialization/ucd-function.ts
+++ b/src/compiler/deserialization/ucd-function.ts
@@ -65,7 +65,7 @@ export class UcdFunction<out T = unknown, out TSchema extends UcSchema<T> = UcSc
   }
 
   exportFn(externalName: string, mode: UcDeserializer.Mode): EsFunction {
-    const { opaqueUcrx, defaultEntities, defaultFormats, onMeta } = this.lib;
+    const { opaqueUcrx, defaultEntities, defaultFormats, onMeta, embed } = this.lib;
     const stream = new EsSymbol('stream');
     const options = (code: EsCode): void => {
       code.multiLine(code => {
@@ -78,6 +78,7 @@ export class UcdFunction<out T = unknown, out TSchema extends UcSchema<T> = UcSc
             'formats,',
             'onMeta,',
             opaqueUcrx ? esline`opaqueRx: ${opaqueUcrx.instantiate()},` : EsCode.none,
+            embed ? esline`embed: ${embed},` : EsCode.none,
           )
           .write('}');
       });
diff --git a/src/compiler/deserialization/ucd-lib.ts b/src/compiler/deserialization/ucd-lib.ts
index 7a3a3659..159eeb3b 100644
--- a/src/compiler/deserialization/ucd-lib.ts
+++ b/src/compiler/deserialization/ucd-lib.ts
@@ -114,6 +114,10 @@ export class UcdLib extends UcrxLib {
     return this.#onMeta;
   }
 
+  get embed(): EsSnippet | undefined {
+    return this.#options.embed;
+  }
+
   deserializerFor<T, TSchema extends UcSchema<T> = UcSchema<T>>(
     schema: TSchema,
   ): UcdFunction<T, TSchema> {
@@ -148,6 +152,7 @@ export namespace UcdLib {
     formats(this: void, exportNs?: EsNamespace): EsSnippet;
     meta(this: void, exportNs?: EsNamespace): EsSnippet;
     onMeta?: EsSnippet | undefined;
+    readonly embed?: EsSnippet | undefined;
    readonly exportDefaults?: boolean | undefined;

    createDeserializer?<T, TSchema extends UcSchema<T>>(
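For orientation, this is how the new `embed` option is meant to be wired up, as exercised by the `UcPlainTextLexer` spec added later in this diff. A minimal sketch: `UC_MODULE_CHURI` is the repo-internal module helper the spec itself uses, and import paths are assumed to match this repo's layout.

```ts
import { esline } from 'esgen';
import { UcdCompiler } from './compiler/deserialization/ucd-compiler.js';
import { UC_MODULE_CHURI } from './compiler/impl/uc-modules.js';
import { ucString } from './schema/string/uc-string.js';
import { UC_TOKEN_EMBED } from './syntax/uc-token.js';

// The `embed` snippet is emitted into the generated module and has to
// evaluate to `cx => emit => UcInputLexer`. Here, every embedded chunk
// is re-lexed as plain text.
const compiler = new UcdCompiler({
  models: { readValue: ucString() },
  embed: code => {
    const UcPlainTextLexer = UC_MODULE_CHURI.import('UcPlainTextLexer');

    code.line(esline`() => emit => new ${UcPlainTextLexer}(emit)`);
  },
});

const { readValue } = await compiler.evaluate();

readValue([UC_TOKEN_EMBED, `'test'`, UC_TOKEN_EMBED]); // => `'test'`
```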
diff --git a/src/compiler/deserialization/unknown.ucrx.class.ts b/src/compiler/deserialization/unknown.ucrx.class.ts
index 53ed553c..25fc8fad 100644
--- a/src/compiler/deserialization/unknown.ucrx.class.ts
+++ b/src/compiler/deserialization/unknown.ucrx.class.ts
@@ -312,7 +312,12 @@ export class UnknownUcrxClass extends UcrxClass {
 
   #overrideRemainingMethods(): void {
     for (const { member, declared } of this.members()) {
-      if (!declared && member instanceof UcrxMethod && member !== UcrxCore.raw) {
+      if (
+        !declared
+        && member instanceof UcrxMethod
+        && member !== UcrxCore.emb
+        && member !== UcrxCore.raw
+      ) {
         this.#declareMethod(member);
       }
     }
diff --git a/src/compiler/rx/ucrx-core.ts b/src/compiler/rx/ucrx-core.ts
index 1a4c9873..bd2a1510 100644
--- a/src/compiler/rx/ucrx-core.ts
+++ b/src/compiler/rx/ucrx-core.ts
@@ -12,6 +12,7 @@ export type UcrxCore = {
   readonly att: UcrxMethod;
   readonly bol: UcrxSetter;
   readonly big: UcrxSetter;
+  readonly emb: UcrxMethod<{ emit: EsArg; cx: EsArg }>;
   readonly ent: UcrxEntitySetter;
   readonly fmt: UcrxFormattedSetter;
   readonly nls: UcrxMethod<{ cx: EsArg }>;
@@ -35,6 +36,10 @@ export const UcrxCore: UcrxCore = {
   att: /*#__PURE__*/ new UcrxAttrSetter('att'),
   bol: /*#__PURE__*/ new UcrxSetter('bol', { typeName: 'boolean', stub: UcrxCore$stub }),
   big: /*#__PURE__*/ new UcrxSetter('big', { typeName: 'bigint', stub: UcrxCore$stub }),
+  emb: /*#__PURE__*/ new UcrxMethod('emb', {
+    args: { emit: {}, cx: {} },
+    stub: UcrxCore$stub,
+  }),
   ent: /*#__PURE__*/ new UcrxEntitySetter('ent'),
   fmt: /*#__PURE__*/ new UcrxFormattedSetter('fmt'),
   nls: /*#__PURE__*/ new UcrxMethod<{ cx: EsArg }>('nls', {
diff --git a/src/deserializer/async-ucd-reader.ts b/src/deserializer/async-ucd-reader.ts
index ca4527db..01f95861 100644
--- a/src/deserializer/async-ucd-reader.ts
+++ b/src/deserializer/async-ucd-reader.ts
@@ -1,5 +1,5 @@
 import { Ucrx } from '../rx/ucrx.js';
-import { UcDeserializer } from '../schema/uc-deserializer.js';
+import { UcInputLexer } from '../syntax/uc-input-lexer.js';
 import { UcToken } from '../syntax/uc-token.js';
 import { ucdReadValue } from './impl/ucd-read-value.js';
 import { UcrxHandle } from './impl/ucrx-handle.js';
@@ -7,14 +7,16 @@ import { UcdReader } from './ucd-reader.js';
 
 export class AsyncUcdReader extends UcdReader {
 
-  readonly #reader: ReadableStreamDefaultReader<UcToken>;
+  #stream: ReadableStream<UcToken>;
+  #reader: ReadableStreamDefaultReader<UcToken>;
   #current: UcToken | undefined;
   readonly #prev: UcToken[] = [];
   #hasNext = true;
 
-  constructor(stream: ReadableStream<UcToken>, options?: UcDeserializer.Options) {
+  constructor(stream: ReadableStream<UcToken>, options?: UcdReader.Options) {
     super(options);
+    this.#stream = stream;
     this.#reader = stream.getReader();
   }
@@ -35,7 +37,21 @@ export class AsyncUcdReader extends UcdReader {
   }
 
   override async read(rx: Ucrx): Promise<void> {
-    await ucdReadValue(this, new UcrxHandle(this, rx, [{}]), rx => rx.end());
+    await ucdReadValue(this, new UcrxHandle(this, rx, [{}]), false);
+  }
+
+  override async readEmbeds(
+    rx: Ucrx,
+    createLexer: (emit: (token: UcToken) => void) => UcInputLexer,
+    single: boolean,
+  ): Promise<void> {
+    this.skip();
+
+    this.#reader.releaseLock();
+    this.#stream = this.#stream.pipeThrough(new UcEmbedsStream(createLexer));
+    this.#reader = this.#stream.getReader();
+
+    await ucdReadValue(this, new UcrxHandle(this, rx, [{}]), single);
   }
 
   override async next(): Promise<UcToken | undefined> {
@@ -133,3 +149,27 @@ export class AsyncUcdReader extends UcdReader {
   }
 
 }
+
+export class UcEmbedsStream extends TransformStream<UcToken, UcToken> {
+
+  constructor(createLexer: (emit: (token: UcToken) => void) => UcInputLexer) {
+    let lexer: UcInputLexer;
+    let pass = (token: UcToken, _controller: TransformStreamDefaultController<UcToken>): void => {
+      if (typeof token === 'number') {
+        lexer.flush();
+        pass = (token, controller) => controller.enqueue(token);
+      } else {
+        lexer.scan(token);
+      }
+    };
+
+    super({
+      start: controller => {
+        lexer = createLexer(token => controller.enqueue(token));
+      },
+      transform: (token, controller) => pass(token, controller),
+      flush: () => lexer.flush(),
+    });
+  }
+
+}
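The `UcEmbedsStream` transform feeds string chunks to the lexer until the first numeric (control) token, drops that closing bound, and passes everything after it through untouched. A rough sketch of that behavior in isolation — `readEmbeds()` pipes the stream only after the opening bound was consumed, so the embedded chunks come first here; `UcPlainTextLexer` is introduced later in this diff:

```ts
import { UcEmbedsStream } from './deserializer/async-ucd-reader.js';
import { UcPlainTextLexer } from './syntax/uc-plain-text-lexer.js';
import { UC_TOKEN_COMMA, UC_TOKEN_EMBED, UcToken } from './syntax/uc-token.js';

const source = new ReadableStream<UcToken>({
  start(controller) {
    controller.enqueue(`'te`); // Embedded chunk: goes to the lexer.
    controller.enqueue(`st'`); // Embedded chunk: goes to the lexer.
    controller.enqueue(UC_TOKEN_EMBED); // Closing bound: dropped.
    controller.enqueue(UC_TOKEN_COMMA); // Passed through unchanged.
    controller.close();
  },
});

const tokens: UcToken[] = [];

await source
  .pipeThrough(new UcEmbedsStream(emit => new UcPlainTextLexer(emit)))
  .pipeTo(new WritableStream<UcToken>({ write: token => void tokens.push(token) }));

// tokens: [`'te`, `st'`, UC_TOKEN_COMMA]
```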
diff --git a/src/deserializer/impl/ucd-read-value.sync.ts b/src/deserializer/impl/ucd-read-value.sync.ts
index d078c54d..7701e216 100644
--- a/src/deserializer/impl/ucd-read-value.sync.ts
+++ b/src/deserializer/impl/ucd-read-value.sync.ts
@@ -22,6 +22,7 @@ import {
   UC_TOKEN_CLOSING_PARENTHESIS,
   UC_TOKEN_COMMA,
   UC_TOKEN_DOLLAR_SIGN,
+  UC_TOKEN_EMBED,
   UC_TOKEN_EXCLAMATION_MARK,
   UC_TOKEN_OPENING_PARENTHESIS,
   UcToken,
@@ -33,10 +34,8 @@ import { UcrxHandle } from './ucrx-handle.js';
 export function ucdReadValueSync(
   reader: SyncUcdReader,
   rx: UcrxHandle,
-  end?: (rx: UcrxHandle) => void, // Never set for the first item of the list, unless it is non-empty.
+  single: boolean,
 ): void {
-  const single = !end;
-
   ucdSkipWhitespaceSync(reader);
 
   const firstToken = reader.current();
@@ -89,6 +88,15 @@ export function ucdReadValueSync(
       return;
     }
 
+    hasValue = true;
+  } else if (firstToken === UC_TOKEN_EMBED) {
+    reader.readEmbeds(rx.rx, emit => rx.emb(emit), single);
+    ucdSkipWhitespaceSync(reader);
+
+    if (single) {
+      return;
+    }
+
     hasValue = true;
   }
 
@@ -125,8 +133,12 @@ export function ucdReadValueSync(
   const bound = reader.current();
 
   if (!bound) {
+    if (!single) {
+      rx.end();
+    }
+
     // End of input.
-    return end?.(rx);
+    return;
   }
   if (bound === UC_TOKEN_CLOSING_PARENTHESIS) {
     // Unbalanced closing parenthesis.
@@ -134,8 +146,11 @@ export function ucdReadValueSync(
     if (!hasValue) {
       rx.decode(printUcTokens(trimUcTokensTail(reader.consumePrev())));
     }
+    if (!single) {
+      rx.end();
+    }
 
-    return end?.(rx);
+    return;
   }
 
   if (bound === UC_TOKEN_COMMA) {
@@ -245,12 +260,12 @@ function ucdReadMetaAndValueSync(reader: SyncUcdReader, rx: UcrxHandle): void {
 
   reader.skip(); // Skip opening parenthesis.
 
-  ucdReadValueSync(reader, rx.att(attributeName), rx => rx.end());
+  ucdReadValueSync(reader, rx.att(attributeName), false);
 
   reader.skip(); // Skip closing parenthesis.
 
   // Read single value following the attribute.
-  ucdReadValueSync(reader, rx);
+  ucdReadValueSync(reader, rx, true);
 }
 
 function ucdReadTokensSync(
   reader: SyncUcdReader,
@@ -355,7 +370,7 @@ function ucdReadItemsSync(
   } else {
     rx.nextItem();
   }
-  ucdReadValueSync(reader, rx);
+  ucdReadValueSync(reader, rx, true);
 
   if (reader.current() === UC_TOKEN_COMMA) {
     // Skip comma and whitespace following it.
@@ -372,7 +387,7 @@ function ucdReadMapSync(reader: SyncUcdReader, rx: UcrxHandle, firstKey: string)
 
   const entryRx = rx.firstEntry(firstKey);
 
-  ucdReadValueSync(reader, entryRx, rx => rx.end());
+  ucdReadValueSync(reader, entryRx, false);
 
   const bound = reader.current();
 
@@ -421,7 +436,7 @@ function ucdReadEntriesSync(reader: SyncUcdReader, rx: UcrxHandle): void {
 
     const entryRx = rx.nextEntry(key);
 
-    ucdReadValueSync(reader, entryRx, rx => rx.end());
+    ucdReadValueSync(reader, entryRx, false);
 
     if (!reader.current()) {
       // End of input.
diff --git a/src/deserializer/impl/ucd-read-value.ts b/src/deserializer/impl/ucd-read-value.ts
index 964a190d..6361608b 100644
--- a/src/deserializer/impl/ucd-read-value.ts
+++ b/src/deserializer/impl/ucd-read-value.ts
@@ -14,6 +14,7 @@ import {
   UC_TOKEN_CLOSING_PARENTHESIS,
   UC_TOKEN_COMMA,
   UC_TOKEN_DOLLAR_SIGN,
+  UC_TOKEN_EMBED,
   UC_TOKEN_EXCLAMATION_MARK,
   UC_TOKEN_OPENING_PARENTHESIS,
   UcToken,
@@ -25,10 +26,8 @@ import { UcrxHandle } from './ucrx-handle.js';
 export async function ucdReadValue(
   reader: AsyncUcdReader,
   rx: UcrxHandle,
-  end?: (rx: UcrxHandle) => void, // Never set for the first item of the list, unless it is non-empty.
+  single: boolean,
 ): Promise<void> {
-  const single = !end;
-
   await ucdSkipWhitespace(reader);
 
   const firstToken = reader.current();
@@ -81,6 +80,15 @@ export async function ucdReadValue(
       return;
     }
 
+    hasValue = true;
+  } else if (firstToken === UC_TOKEN_EMBED) {
+    await reader.readEmbeds(rx.rx, emit => rx.emb(emit), single);
+    await ucdSkipWhitespace(reader);
+
+    if (single) {
+      return;
+    }
+
     hasValue = true;
   }
 
@@ -117,8 +125,12 @@ export async function ucdReadValue(
   const bound = reader.current();
 
   if (!bound) {
+    if (!single) {
+      rx.end();
+    }
+
     // End of input.
-    return end?.(rx);
+    return;
   }
   if (bound === UC_TOKEN_CLOSING_PARENTHESIS) {
     // Unbalanced closing parenthesis.
@@ -126,8 +138,11 @@ export async function ucdReadValue(
     if (!hasValue) {
       rx.decode(printUcTokens(trimUcTokensTail(reader.consumePrev())));
     }
+    if (!single) {
+      rx.end();
+    }
 
-    return end?.(rx);
+    return;
   }
 
   if (bound === UC_TOKEN_COMMA) {
@@ -237,12 +252,12 @@ async function ucdReadMetaAndValue(reader: AsyncUcdReader, rx: UcrxHandle): Prom
 
   reader.skip(); // Skip opening parenthesis.
 
-  await ucdReadValue(reader, rx.att(attributeName), rx => rx.end());
+  await ucdReadValue(reader, rx.att(attributeName), false);
 
   reader.skip(); // Skip closing parenthesis.
 
   // Read single value following the attribute.
-  await ucdReadValue(reader, rx);
+  await ucdReadValue(reader, rx, true);
 }
 
 async function ucdReadTokens(
   reader: AsyncUcdReader,
@@ -347,7 +362,7 @@ async function ucdReadItems(
   } else {
     rx.nextItem();
   }
-  await ucdReadValue(reader, rx);
+  await ucdReadValue(reader, rx, true);
 
   if (reader.current() === UC_TOKEN_COMMA) {
     // Skip comma and whitespace following it.
@@ -364,7 +379,7 @@ async function ucdReadMap(reader: AsyncUcdReader, rx: UcrxHandle, firstKey: stri
 
   const entryRx = rx.firstEntry(firstKey);
 
-  await ucdReadValue(reader, entryRx, rx => rx.end());
+  await ucdReadValue(reader, entryRx, false);
 
   const bound = reader.current();
 
@@ -413,7 +428,7 @@ async function ucdReadEntries(reader: AsyncUcdReader, rx: UcrxHandle): Promise<
 
     const entryRx = rx.nextEntry(key);
 
-    await ucdReadValue(reader, entryRx, rx => rx.end());
+    await ucdReadValue(reader, entryRx, false);
 
     if (!reader.current()) {
       // End of input.
diff --git a/src/deserializer/impl/ucrx-handle.ts b/src/deserializer/impl/ucrx-handle.ts
index 0b552c8b..59344920 100644
--- a/src/deserializer/impl/ucrx-handle.ts
+++ b/src/deserializer/impl/ucrx-handle.ts
@@ -4,6 +4,7 @@ import { Ucrx } from '../../rx/ucrx.js';
 import { UcMeta } from '../../schema/meta/uc-meta.js';
 import { UcRejection } from '../../schema/uc-error.js';
 import type { URIChargePath } from '../../schema/uri-charge/uri-charge-path.js';
+import { UcInputLexer, ucOpaqueLexer } from '../../syntax/uc-input-lexer.js';
 import { UcToken } from '../../syntax/uc-token.js';
 import { UcdReader } from '../ucd-reader.js';
 
@@ -25,6 +26,10 @@ export class UcrxHandle implements UcrxContext {
     this.#path = path;
   }
 
+  get rx(): Ucrx {
+    return this.#rx;
+  }
+
   get data(): Record<string, unknown> {
     return this.#reader.data;
   }
@@ -83,6 +88,21 @@ export class UcrxHandle implements UcrxContext {
     this.#rx.bol(value, this);
   }
 
+  emb(emit: (token: UcToken) => void): UcInputLexer {
+    const lexer = this.#rx.emb(emit, this) ?? this.#reader.embed(this)?.(emit);
+
+    if (lexer) {
+      return lexer;
+    }
+
+    this.#reject({
+      code: 'unrecognizedInput',
+      message: 'Unrecognized embedded input',
+    });
+
+    return ucOpaqueLexer;
+  }
+
   ent(entity: string): void {
     if (!this.onEntity(entity)) {
       // Process entity.
diff --git a/src/deserializer/sync-ucd-reader.ts b/src/deserializer/sync-ucd-reader.ts
index ee5b379d..493fc6e1 100644
--- a/src/deserializer/sync-ucd-reader.ts
+++ b/src/deserializer/sync-ucd-reader.ts
@@ -1,5 +1,5 @@
 import { Ucrx } from '../rx/ucrx.js';
-import { UcDeserializer } from '../schema/uc-deserializer.js';
+import { UcInputLexer } from '../syntax/uc-input-lexer.js';
 import { UcLexer } from '../syntax/uc-lexer.js';
 import { UcToken } from '../syntax/uc-token.js';
 import { ucdReadValueSync } from './impl/ucd-read-value.sync.js';
@@ -8,12 +8,12 @@ import { UcdReader } from './ucd-reader.js';
 
 export class SyncUcdReader extends UcdReader {
 
-  readonly #tokens: readonly UcToken[];
+  #tokens: readonly UcToken[];
   #current = -1;
   #next = 0;
   #consumed = 0;
 
-  constructor(tokens: readonly UcToken[], options?: UcDeserializer.Options) {
+  constructor(tokens: readonly UcToken[], options?: UcdReader.Options) {
     super(options);
     this.#tokens = tokens;
   }
@@ -35,7 +35,44 @@ export class SyncUcdReader extends UcdReader {
   }
 
   override read(rx: Ucrx): void {
-    ucdReadValueSync(this, new UcrxHandle(this, rx, [{}]), rx => rx.end());
+    ucdReadValueSync(this, new UcrxHandle(this, rx, [{}]), false);
+  }
+
+  override readEmbeds(
+    rx: Ucrx,
+    createLexer: (emit: (token: UcToken) => void) => UcInputLexer,
+    single: boolean,
+  ): Promise<void> | void {
+    this.#unwrapEmbeds(createLexer);
+    ucdReadValueSync(this, new UcrxHandle(this, rx, [{}]), single);
+  }
+
+  #unwrapEmbeds(createLexer: (emit: (token: UcToken) => void) => UcInputLexer): void {
+    const tokens = this.#tokens;
+    const newTokens = [];
+    const lexer = createLexer(token => newTokens.push(token));
+
+    const length = tokens.length;
+    let end = this.#current + 1;
+
+    while (end < length) {
+      const chunk = tokens[end++];
+
+      if (typeof chunk === 'number') {
+        // Embedded input is expected to end with an input bound.
+        break;
+      }
+
+      lexer.scan(chunk);
+    }
+    lexer.flush();
+
+    newTokens.push(...this.#tokens.slice(end));
+
+    this.#tokens = newTokens;
+    this.#current = -1;
+    this.#next = 0;
+    this.#consumed = 0;
   }
 
   override next(): UcToken | undefined {
@@ -113,17 +150,17 @@ export class SyncUcdReader extends UcdReader {
 
 export function createSyncUcdReader(
   input: string | readonly UcToken[],
-  options?: UcDeserializer.Options,
+  options?: UcdReader.Options,
 ): SyncUcdReader;
 
 export function createSyncUcdReader(
   input: string | readonly UcToken[] | unknown,
-  options?: UcDeserializer.Options,
+  options?: UcdReader.Options,
 ): SyncUcdReader | undefined;
 
 export function createSyncUcdReader(
   input: string | readonly UcToken[] | unknown,
-  options?: UcDeserializer.Options,
+  options?: UcdReader.Options,
 ): SyncUcdReader | undefined {
   if (typeof input === 'string') {
     return new SyncUcdReader(UcLexer.scan(input), options);
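`#unwrapEmbeds()` rebuilds the token array in place: chunks up to the first numeric token are re-lexed, and the rest of the input is appended verbatim. The following standalone helper is a hypothetical restatement of that loop, shown only for illustration — it is not part of the diff:

```ts
import { UcInputLexer } from './syntax/uc-input-lexer.js';
import { UcToken } from './syntax/uc-token.js';

function unwrapEmbeds(
  tokens: readonly UcToken[],
  createLexer: (emit: (token: UcToken) => void) => UcInputLexer,
): UcToken[] {
  const newTokens: UcToken[] = [];
  const lexer = createLexer(token => newTokens.push(token));
  let end = 0;

  // Re-lex string chunks up to the closing (numeric) bound...
  while (end < tokens.length) {
    const chunk = tokens[end++];

    if (typeof chunk === 'number') {
      break;
    }
    lexer.scan(chunk);
  }
  lexer.flush();

  // ...then append the rest of the input verbatim.
  return [...newTokens, ...tokens.slice(end)];
}

// With UcPlainTextLexer:
// unwrapEmbeds([`'te`, `st'`, UC_TOKEN_EMBED, 'tail'], emit => new UcPlainTextLexer(emit))
// // => [`'te`, `st'`, 'tail']
```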
diff --git a/src/deserializer/ucd-reader.ts b/src/deserializer/ucd-reader.ts
index 663ef399..e831a457 100644
--- a/src/deserializer/ucd-reader.ts
+++ b/src/deserializer/ucd-reader.ts
@@ -4,6 +4,7 @@ import { UcrxContext } from '../rx/ucrx-context.js';
 import { Ucrx } from '../rx/ucrx.js';
 import { UcDeserializer } from '../schema/uc-deserializer.js';
 import { UcError, UcErrorInfo } from '../schema/uc-error.js';
+import { UcInputLexer, UcInputLexerFactory } from '../syntax/uc-input-lexer.js';
 import { UcToken } from '../syntax/uc-token.js';
 
 export abstract class UcdReader {
@@ -14,6 +15,7 @@ export abstract class UcdReader {
   readonly #entities: Exclude<UcDeserializer.Options['entities'], undefined>;
   readonly #formats: Exclude<UcDeserializer.Options['formats'], undefined>;
   readonly #onMeta: MetaUcrx;
+  readonly #embed: ((cx: UcrxContext) => UcInputLexerFactory | undefined) | undefined;
 
   constructor(options?: UcDeserializer.Options);
 
@@ -24,9 +26,11 @@ export abstract class UcdReader {
     formats = {},
     onMeta = UcdReader$noMeta,
     opaqueRx = OPAQUE_UCRX,
+    embed,
   }: UcdReader.Options = {}) {
     this.#data = data;
     this.#opaqueRx = opaqueRx;
+    this.#embed = embed;
     this.#onError = onError;
     this.#entities = entities;
     this.#formats = formats;
@@ -41,6 +45,10 @@ export abstract class UcdReader {
     return this.#opaqueRx;
   }
 
+  embed(cx: UcrxContext): UcInputLexerFactory | undefined {
+    return this.#embed?.(cx);
+  }
+
   abstract hasNext(): boolean;
 
   abstract current(): UcToken | undefined;
@@ -55,6 +63,12 @@ export abstract class UcdReader {
 
   abstract read(rx: Ucrx): Promise<void> | void;
 
+  abstract readEmbeds(
+    rx: Ucrx,
+    createLexer: (emit: (token: UcToken) => void) => UcInputLexer,
+    single: boolean,
+  ): Promise<void> | void;
+
   get entities(): Exclude<UcDeserializer.Options['entities'], undefined> {
     return this.#entities;
   }
@@ -88,6 +102,19 @@ export abstract class UcdReader {
 
 export namespace UcdReader {
   export interface Options extends UcDeserializer.Options {
     readonly opaqueRx?: Ucrx | undefined;
+    /**
+     * Creates a lexer for _embedded input_, i.e. for input chunks enclosed into {@link churi!UC_TOKEN_EMBED
+     * embedded input bounds}.
+     *
+     * Once embedded input is encountered, the deserializer first tries to use the lexer returned by the
+     * {@link churi!Ucrx#emb charge receiver}, and falls back to the one created by this method only when the
+     * former returns `undefined`. If that fails as well, an `unrecognizedInput` error is reported.
+     *
+     * @param cx - Charge processing context.
+     *
+     * @returns Either input lexer factory, or `undefined` if embedded input is not expected.
+     */
+    readonly embed?: ((cx: UcrxContext) => UcInputLexerFactory | undefined) | undefined;
   }
 }
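At runtime the option can also be passed to a reader directly, bypassing code generation. A sketch under the new types (import paths assumed to match this repo's layout):

```ts
import { SyncUcdReader } from './deserializer/sync-ucd-reader.js';
import { UcPlainTextLexer } from './syntax/uc-plain-text-lexer.js';
import { UC_TOKEN_EMBED } from './syntax/uc-token.js';

// Reader-level fallback, consulted when the charge receiver's `emb()`
// returns `undefined`.
const reader = new SyncUcdReader([UC_TOKEN_EMBED, 'plain text', UC_TOKEN_EMBED], {
  embed: _cx => emit => new UcPlainTextLexer(emit),
});
```

A subsequent `reader.read(rx)` then re-lexes the embedded chunk through this factory before charging the receiver.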
diff --git a/src/rx/all.ucrx.ts b/src/rx/all.ucrx.ts
index 79564b28..b3beba61 100644
--- a/src/rx/all.ucrx.ts
+++ b/src/rx/all.ucrx.ts
@@ -1,3 +1,4 @@
+import { UcInputLexer } from '../syntax/uc-input-lexer.js';
 import { UcToken } from '../syntax/uc-token.js';
 import { Ucrx } from './ucrx.js';
 
@@ -8,6 +9,7 @@ export interface AllUcrx extends Ucrx {
   att(attr: string): AllUcrx | undefined;
   bol(value: boolean): 1;
   big(value: bigint): 1;
+  emb(emit: (token: UcToken) => void): UcInputLexer;
   ent(name: string): 1;
   fmt(format: string, data: readonly UcToken[]): 1;
   nls(): AllUcrx;
diff --git a/src/rx/opaque.ucrx.spec.ts b/src/rx/opaque.ucrx.spec.ts
index 508316c4..0e616846 100644
--- a/src/rx/opaque.ucrx.spec.ts
+++ b/src/rx/opaque.ucrx.spec.ts
@@ -1,4 +1,6 @@
 import { describe, expect, it } from '@jest/globals';
+import { noop } from '@proc7ts/primitives';
+import { ucOpaqueLexer } from '../syntax/uc-input-lexer.js';
 import { OpaqueUcrx } from './opaque.ucrx.js';
 
 describe('OpaqueUcrx', () => {
@@ -24,6 +26,12 @@ describe('OpaqueUcrx', () => {
     });
   });
 
+  describe('emb', () => {
+    it('returns ucOpaqueLexer', () => {
+      expect(new OpaqueUcrx().emb(noop)).toBe(ucOpaqueLexer);
+    });
+  });
+
   describe('raw', () => {
     it('returns 1', () => {
       expect(new OpaqueUcrx().raw('test')).toBe(1);
diff --git a/src/rx/opaque.ucrx.ts b/src/rx/opaque.ucrx.ts
index 8009283b..0d2b0cdb 100644
--- a/src/rx/opaque.ucrx.ts
+++ b/src/rx/opaque.ucrx.ts
@@ -1,3 +1,4 @@
+import { UcInputLexer, ucOpaqueLexer } from '../syntax/uc-input-lexer.js';
 import { UcToken } from '../syntax/uc-token.js';
 import { AllUcrx } from './all.ucrx.js';
 import { VoidUcrx } from './void.ucrx.js';
@@ -17,6 +18,11 @@ export class OpaqueUcrx extends VoidUcrx implements AllUcrx {
     // Ignore metadata.
   }
 
+  override emb(emit: (token: UcToken) => void): UcInputLexer;
+  override emb(_emit: (token: UcToken) => void): UcInputLexer {
+    return ucOpaqueLexer;
+  }
+
   override raw(value: string): 1;
   override raw(_value: string): 1 {
     return 1;
diff --git a/src/rx/token.ucrx.spec.ts b/src/rx/token.ucrx.spec.ts
index 4b10e372..d53ef101 100644
--- a/src/rx/token.ucrx.spec.ts
+++ b/src/rx/token.ucrx.spec.ts
@@ -1,5 +1,6 @@
 import { beforeEach, describe, expect, it } from '@jest/globals';
 import { noop } from '@proc7ts/primitives';
+import { ucOpaqueLexer } from '../syntax/uc-input-lexer.js';
 import {
   UC_TOKEN_CLOSING_PARENTHESIS,
   UC_TOKEN_OPENING_PARENTHESIS,
@@ -21,6 +22,12 @@ describe('TokenUcrx', () => {
     });
   });
 
+  describe('emb', () => {
+    it('returns ucOpaqueLexer', () => {
+      expect(new TokenUcrx(noop).emb(noop)).toBe(ucOpaqueLexer);
+    });
+  });
+
   describe('raw', () => {
     let tokens: UcToken[];
 
diff --git a/src/rx/token.ucrx.ts b/src/rx/token.ucrx.ts
index 1fe089d6..ff3ac796 100644
--- a/src/rx/token.ucrx.ts
+++ b/src/rx/token.ucrx.ts
@@ -1,6 +1,7 @@
 import { encodeURIPart } from 'httongue';
 import { UC_KEY_ESCAPED, isEscapedUcString } from '../impl/uc-string-escapes.js';
 import { printUcToken } from '../syntax/print-uc-token.js';
+import { UcInputLexer, ucOpaqueLexer } from '../syntax/uc-input-lexer.js';
 import {
   UC_TOKEN_APOSTROPHE,
   UC_TOKEN_CLOSING_PARENTHESIS,
@@ -81,6 +82,11 @@ export class TokenUcrx implements AllUcrx {
     return 1;
   }
 
+  emb(emit: (token: UcToken) => void): UcInputLexer;
+  emb(_emit: (token: UcToken) => void): UcInputLexer {
+    return ucOpaqueLexer;
+  }
+
   ent(name: string): 1 {
     this.#addItem();
     this.#add(UC_TOKEN_EXCLAMATION_MARK);
diff --git a/src/rx/ucrx.ts b/src/rx/ucrx.ts
index a70ed337..85939c7d 100644
--- a/src/rx/ucrx.ts
+++ b/src/rx/ucrx.ts
@@ -1,3 +1,4 @@
+import { UcInputLexer } from '../syntax/uc-input-lexer.js';
 import { UcToken } from '../syntax/uc-token.js';
 import { UcrxContext } from './ucrx-context.js';
 
@@ -53,6 +54,16 @@ export interface Ucrx {
    */
   big(value: bigint, cx: UcrxContext): 0 | 1;
 
+  /**
+   * Called to start embedded input tokenization.
+   *
+   * @param emit - Emitter function called each time a token is found.
+   * @param cx - Charge processing context.
+   *
+   * @returns Either input lexer, or `undefined` if embedded input is not expected.
+   */
+  emb(emit: (token: UcToken) => void, cx: UcrxContext): UcInputLexer | undefined;
+
   /**
    * Charges opaque (unrecognized) entity.
    *
diff --git a/src/rx/void.ucrx.ts b/src/rx/void.ucrx.ts
index bd756052..ed79bf5f 100644
--- a/src/rx/void.ucrx.ts
+++ b/src/rx/void.ucrx.ts
@@ -6,6 +6,7 @@ import {
 } from '../impl/ucrx-decode-raw.js';
 import { UcEntity } from '../schema/entity/uc-entity.js';
 import { UcFormatted } from '../schema/entity/uc-formatted.js';
+import { UcInputLexer } from '../syntax/uc-input-lexer.js';
 import { UcToken } from '../syntax/uc-token.js';
 import { UcrxContext } from './ucrx-context.js';
 import {
@@ -36,6 +37,11 @@ export class VoidUcrx implements Ucrx {
     return this.any(value) || cx.reject(ucrxRejectType('bigint', this));
   }
 
+  emb(emit: (token: UcToken) => void, cx: UcrxContext): UcInputLexer | undefined;
+  emb(_emit: (token: UcToken) => void, _cx: UcrxContext): undefined {
+    // Embedded input is not expected by default.
+  }
+
   ent(name: string, cx: UcrxContext): 0 | 1 {
     return this.any(new UcEntity(name)) || cx.reject(ucrxRejectEntity(name));
   }
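A charge receiver may also supply the lexer itself, which takes precedence over the reader-level `embed` option. A hypothetical receiver built on the new `VoidUcrx.emb()` stub (names and import paths are illustrative):

```ts
import { UcrxContext } from './rx/ucrx-context.js';
import { VoidUcrx } from './rx/void.ucrx.js';
import { UcInputLexer } from './syntax/uc-input-lexer.js';
import { UcPlainTextLexer } from './syntax/uc-plain-text-lexer.js';
import { UcToken } from './syntax/uc-token.js';

// Accepts embedded input as plain text instead of rejecting it.
class PlainTextUcrx extends VoidUcrx {

  override emb(emit: (token: UcToken) => void, _cx: UcrxContext): UcInputLexer | undefined {
    return new UcPlainTextLexer(emit);
  }

}
```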
diff --git a/src/syntax/mod.ts b/src/syntax/mod.ts
index 94b2e353..562035ea 100644
--- a/src/syntax/mod.ts
+++ b/src/syntax/mod.ts
@@ -1,6 +1,8 @@
 export * from './print-uc-token.js';
 export * from './trim-uc-tokens-tail.js';
+export * from './uc-input-lexer.js';
 export * from './uc-lexer-stream.js';
 export * from './uc-lexer.js';
+export * from './uc-plain-text-lexer.js';
 export * from './uc-token-kind.js';
 export * from './uc-token.js';
diff --git a/src/syntax/print-uc-token.spec.ts b/src/syntax/print-uc-token.spec.ts
index 0f2ebb08..e06cc880 100644
--- a/src/syntax/print-uc-token.spec.ts
+++ b/src/syntax/print-uc-token.spec.ts
@@ -1,9 +1,12 @@
 import { describe, expect, it } from '@jest/globals';
-import { printUcToken } from './print-uc-token.js';
+import { printUcToken, printUcTokens } from './print-uc-token.js';
 import {
+  UC_TOKEN_CLOSING_PARENTHESIS,
   UC_TOKEN_CR,
   UC_TOKEN_CRLF,
+  UC_TOKEN_EMBED,
   UC_TOKEN_LF,
+  UC_TOKEN_OPENING_PARENTHESIS,
   UC_TOKEN_PREFIX_SPACE,
   UC_TOKEN_PREFIX_TAB,
 } from './uc-token.js';
@@ -26,4 +29,15 @@ describe('printUcToken', () => {
     expect(printUcToken('')).toBe('');
     expect(printUcToken('a b c')).toBe('a b c');
   });
+  it('skips embeds', () => {
+    expect(
+      printUcTokens([
+        UC_TOKEN_OPENING_PARENTHESIS,
+        UC_TOKEN_EMBED,
+        'test',
+        UC_TOKEN_EMBED,
+        UC_TOKEN_CLOSING_PARENTHESIS,
+      ]),
+    ).toBe('(test)');
+  });
 });
diff --git a/src/syntax/print-uc-token.ts b/src/syntax/print-uc-token.ts
index 3df969cf..37608f98 100644
--- a/src/syntax/print-uc-token.ts
+++ b/src/syntax/print-uc-token.ts
@@ -2,6 +2,7 @@ import { asis } from '@proc7ts/primitives';
 import {
   UC_TOKEN_CR,
   UC_TOKEN_CRLF,
+  UC_TOKEN_EMBED,
   UC_TOKEN_PREFIX_SPACE,
   UC_TOKEN_PREFIX_TAB,
   UcToken,
 } from './uc-token.js';
@@ -26,6 +27,8 @@ export function printUcToken(token: UcToken, encodeString?: (token: string) => s
       return '\t'.repeat((token >>> 8) + 1);
     case UC_TOKEN_CR:
       return token === UC_TOKEN_CRLF ? '\r\n' : '\r';
+    case UC_TOKEN_EMBED:
+      return '';
   }
 
   return String.fromCharCode(token);
diff --git a/src/syntax/uc-input-lexer.ts b/src/syntax/uc-input-lexer.ts
new file mode 100644
index 00000000..edb73523
--- /dev/null
+++ b/src/syntax/uc-input-lexer.ts
@@ -0,0 +1,43 @@
+import { UcToken } from './uc-token.js';
+
+/**
+ * Charge lexer that splits the input into tokens.
+ *
+ * The input chunks are {@link UcInputLexer#scan scanned} by the lexer one at a time. Each token found is emitted by
+ * calling the provided emitter. On completion, the input has to be {@link UcInputLexer#flush flushed} in order to
+ * process the remaining input.
+ */
+export interface UcInputLexer {
+  /**
+   * Scans the input `chunk` for tokens.
+   *
+   * @param chunk - Chunk of input to scan.
+   */
+  scan(chunk: string): void;
+
+  /**
+   * Flushes the input emitting all pending tokens.
+   */
+  flush(): void;
+}
+
+/**
+ * Charge input lexer that ignores the input.
+ */
+export const ucOpaqueLexer: UcInputLexer = {
+  scan(_chunk) {
+    // Ignore input.
+  },
+  flush() {
+    // Nothing to flush.
+  },
+};
+
+/**
+ * Factory for {@link UcInputLexer input lexer}.
+ *
+ * @param emit - Emitter function called each time a token is found.
+ *
+ * @returns New lexer instance.
+ */
+export type UcInputLexerFactory = (emit: (token: UcToken) => void) => UcInputLexer;
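Implementing `UcInputLexer` only takes `scan()` and `flush()`. A hypothetical lexer (not part of this diff) that splits embedded input into comma-separated items, buffering partial items across chunk boundaries:

```ts
import { UcInputLexer } from './syntax/uc-input-lexer.js';
import { UC_TOKEN_COMMA, UcToken } from './syntax/uc-token.js';

class CsvItemLexer implements UcInputLexer {

  readonly #emit: (token: UcToken) => void;
  #pending = '';

  constructor(emit: (token: UcToken) => void) {
    this.#emit = emit;
  }

  scan(chunk: string): void {
    const items = (this.#pending + chunk).split(',');

    // The last fragment may be incomplete; keep it until the next chunk.
    this.#pending = items.pop()!;
    for (const item of items) {
      this.#emit(item);
      this.#emit(UC_TOKEN_COMMA);
    }
  }

  flush(): void {
    if (this.#pending) {
      this.#emit(this.#pending);
      this.#pending = '';
    }
  }

}
```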
diff --git a/src/syntax/uc-lexer-stream.ts b/src/syntax/uc-lexer-stream.ts
index 40cda2d6..f62c9657 100644
--- a/src/syntax/uc-lexer-stream.ts
+++ b/src/syntax/uc-lexer-stream.ts
@@ -1,3 +1,4 @@
+import { UcInputLexer } from './uc-input-lexer.js';
 import { UcLexer } from './uc-lexer.js';
 import { UcToken } from './uc-token.js';
 
@@ -8,19 +9,29 @@ import { UcToken } from './uc-token.js';
  */
 export class UcLexerStream extends TransformStream<string, UcToken> {
 
+  /**
+   * Constructs lexer stream.
+   *
+   * @param createLexer - Creates an input lexer to use. By default, creates a {@link UcLexer} instance.
+   * @param writableStrategy - An object that optionally defines a queuing strategy for the input (chunks) stream.
+   * @param readableStrategy - An object that optionally defines a queuing strategy for the output (tokens) stream.
+   */
   constructor(
+    createLexer: (
+      emit: (token: UcToken) => void,
+    ) => UcInputLexer = UcLexerStream$createDefaultLexer,
     writableStrategy?: QueuingStrategy<string>,
     readableStrategy?: QueuingStrategy<UcToken>,
   ) {
-    let tokenizer: UcLexer;
+    let lexer: UcInputLexer;
 
     super(
       {
         start: controller => {
-          tokenizer = new UcLexer(token => controller.enqueue(token));
+          lexer = createLexer(token => controller.enqueue(token));
         },
-        transform: chunk => tokenizer.scan(chunk),
-        flush: () => tokenizer.flush(),
+        transform: chunk => lexer.scan(chunk),
+        flush: () => lexer.flush(),
       },
       writableStrategy,
       readableStrategy,
@@ -28,3 +39,7 @@ export class UcLexerStream extends TransformStream<string, UcToken> {
 
 }
+
+function UcLexerStream$createDefaultLexer(emit: (token: UcToken) => void): UcInputLexer {
+  return new UcLexer(emit);
+}
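With the new constructor parameter, `UcLexerStream` keeps its default behavior while allowing any `UcInputLexer` to drive tokenization:

```ts
import { UcLexerStream } from './syntax/uc-lexer-stream.js';
import { UcPlainTextLexer } from './syntax/uc-plain-text-lexer.js';

// Default: URI charge tokenization via UcLexer.
const chargeTokens = new UcLexerStream();

// Custom: pass chunks through as plain text tokens.
const textTokens = new UcLexerStream(emit => new UcPlainTextLexer(emit));
```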
diff --git a/src/syntax/uc-lexer.ts b/src/syntax/uc-lexer.ts
index d753b67f..b2123b83 100644
--- a/src/syntax/uc-lexer.ts
+++ b/src/syntax/uc-lexer.ts
@@ -1,3 +1,4 @@
+import { UcInputLexer } from './uc-input-lexer.js';
 import {
   UC_TOKEN_AMPERSAND,
   UC_TOKEN_APOSTROPHE,
@@ -30,7 +31,7 @@ import {
  * the given emitter function. On completion, the input has to be {@link UcLexer#flush flushed} in order to process
  * the remaining input.
  */
-export class UcLexer {
+export class UcLexer implements UcInputLexer {
 
   /**
    * Scans the `input` string for URI charge {@link UcToken tokens}.
@@ -84,11 +85,6 @@ export class UcLexer {
     this.#emit = emit;
   }
 
-  /**
-   * Scans the input `chunk` for tokens.
-   *
-   * @param chunk - Chunk of input to scan.
-   */
   scan(chunk: string): void {
     for (const token of chunk.split(UC_TOKEN_PATTERN)) {
       this.#add(token);
@@ -209,9 +205,6 @@ export class UcLexer {
     this.#emit(pad | (count << 8));
   }
 
-  /**
-   * Flushes the input emitting all pending tokens.
-   */
   flush(): void {
     this.#emitPrev();
   }
diff --git a/src/syntax/uc-plain-text-lexer.spec.ts b/src/syntax/uc-plain-text-lexer.spec.ts
new file mode 100644
index 00000000..8eb61135
--- /dev/null
+++ b/src/syntax/uc-plain-text-lexer.spec.ts
@@ -0,0 +1,215 @@
+import { beforeAll, beforeEach, describe, expect, it } from '@jest/globals';
+import { esline } from 'esgen';
+import { Readable } from 'node:stream';
+import { UcdCompiler } from '../compiler/deserialization/ucd-compiler.js';
+import { UC_MODULE_CHURI } from '../compiler/impl/uc-modules.js';
+import { ucList } from '../schema/list/uc-list.js';
+import { ucMap } from '../schema/map/uc-map.js';
+import { UcNumber, ucNumber } from '../schema/numeric/uc-number.js';
+import { UcString, ucString } from '../schema/string/uc-string.js';
+import { UcDeserializer } from '../schema/uc-deserializer.js';
+import { UcErrorInfo } from '../schema/uc-error.js';
+import {
+  UC_TOKEN_APOSTROPHE,
+  UC_TOKEN_CLOSING_PARENTHESIS,
+  UC_TOKEN_COMMA,
+  UC_TOKEN_EMBED,
+  UC_TOKEN_OPENING_PARENTHESIS,
+  UC_TOKEN_PREFIX_SPACE,
+  UcToken,
+} from './uc-token.js';
+
+describe('UcPlainTextLexer', () => {
+  let errors: UcErrorInfo[];
+
+  beforeEach(() => {
+    errors = [];
+  });
+
+  function onError(error: UcErrorInfo): void {
+    errors.push(error);
+  }
+
+  describe('at top level', () => {
+    let readValue: UcDeserializer<UcString>;
+
+    beforeAll(async () => {
+      const compiler = new UcdCompiler({
+        models: {
+          readValue: ucString(),
+        },
+        embed: code => {
+          const UcPlainTextLexer = UC_MODULE_CHURI.import('UcPlainTextLexer');
+
+          code.line(esline`() => emit => new ${UcPlainTextLexer}(emit)`);
+        },
+      });
+
+      ({ readValue } = await compiler.evaluate());
+    });
+
+    it('generates string synchronously', () => {
+      expect(readValue([UC_TOKEN_EMBED, `'test'`, UC_TOKEN_EMBED])).toBe(`'test'`);
+    });
+
+    it('generates string asynchronously', async () => {
+      await expect(readValue(readTokens(UC_TOKEN_EMBED, `'test'`, UC_TOKEN_EMBED))).resolves.toBe(
+        `'test'`,
+      );
+    });
+  });
+
+  describe('as list item', () => {
+    let readList: UcDeserializer<UcString[]>;
+
+    beforeAll(async () => {
+      const compiler = new UcdCompiler({
+        models: {
+          readList: ucList(ucString()),
+        },
+        embed: code => {
+          const UcPlainTextLexer = UC_MODULE_CHURI.import('UcPlainTextLexer');
+
+          code.line(esline`() => emit => new ${UcPlainTextLexer}(emit)`);
+        },
+      });
+
+      ({ readList } = await compiler.evaluate());
+    });
+
+    it('generates string item synchronously', () => {
+      expect(
+        readList([
+          'start',
+          UC_TOKEN_COMMA,
+          UC_TOKEN_EMBED,
+          `'te`,
+          `st'`,
+          UC_TOKEN_EMBED,
+          UC_TOKEN_COMMA,
+          UC_TOKEN_APOSTROPHE,
+          'end',
+        ]),
+      ).toEqual(['start', `'test'`, 'end']);
+    });
+
+    it('generates string item asynchronously', async () => {
+      await expect(
+        readList(
+          readTokens(
+            'start',
+            UC_TOKEN_COMMA,
+            UC_TOKEN_EMBED,
+            `'te`,
+            `st'`,
+            UC_TOKEN_EMBED,
+            UC_TOKEN_COMMA,
+            UC_TOKEN_APOSTROPHE,
+            'end',
+          ),
+        ),
+      ).resolves.toEqual(['start', `'test'`, 'end']);
+    });
+  });
+
+  describe('as map entry', () => {
+    let readMap: UcDeserializer<{ foo: UcNumber; bar: UcString; baz: UcString }>;
+
+    beforeAll(async () => {
+      const compiler = new UcdCompiler({
+        models: {
+          readMap: ucMap({ foo: ucNumber(), bar: ucString(), baz: ucString() }),
+        },
+        embed: code => {
+          const UcPlainTextLexer = UC_MODULE_CHURI.import('UcPlainTextLexer');
+
+          code.line(esline`() => emit => new ${UcPlainTextLexer}(emit)`);
+        },
+      });
+
+      ({ readMap } = await compiler.evaluate());
+    });
+
+    it('generates string item synchronously', () => {
+      expect(
+        readMap([
+          'foo',
+          UC_TOKEN_OPENING_PARENTHESIS,
+          '13',
+          UC_TOKEN_CLOSING_PARENTHESIS,
+          'bar',
+          UC_TOKEN_OPENING_PARENTHESIS,
+          UC_TOKEN_PREFIX_SPACE | (2 << 8),
+          UC_TOKEN_EMBED,
+          `'te`,
+          `st'`,
+          UC_TOKEN_EMBED,
+          UC_TOKEN_PREFIX_SPACE | (2 << 8),
+          UC_TOKEN_CLOSING_PARENTHESIS,
+          'baz',
+          UC_TOKEN_OPENING_PARENTHESIS,
+          UC_TOKEN_APOSTROPHE,
+          'end',
+          UC_TOKEN_CLOSING_PARENTHESIS,
+        ]),
+      ).toEqual({ foo: 13, bar: `'test'`, baz: 'end' });
+    });
+
+    it('generates string item asynchronously', async () => {
+      await expect(
+        readMap(
+          readTokens(
+            'foo',
+            UC_TOKEN_OPENING_PARENTHESIS,
+            '13',
+            UC_TOKEN_CLOSING_PARENTHESIS,
+            'bar',
+            UC_TOKEN_OPENING_PARENTHESIS,
+            UC_TOKEN_PREFIX_SPACE | (2 << 8),
+            UC_TOKEN_EMBED,
+            `'te`,
+            `st'`,
+            UC_TOKEN_EMBED,
+            UC_TOKEN_PREFIX_SPACE | (2 << 8),
+            UC_TOKEN_CLOSING_PARENTHESIS,
+            'baz',
+            UC_TOKEN_OPENING_PARENTHESIS,
+            UC_TOKEN_APOSTROPHE,
+            'end',
+            UC_TOKEN_CLOSING_PARENTHESIS,
+          ),
+        ),
+      ).resolves.toEqual({ foo: 13, bar: `'test'`, baz: 'end' });
+    });
+  });
+
+  describe('when turned off', () => {
+    let readValue: UcDeserializer<UcString>;
+
+    beforeAll(async () => {
+      const compiler = new UcdCompiler({
+        models: {
+          readValue: ucString(),
+        },
+      });
+
+      ({ readValue } = await compiler.evaluate());
+    });
+
+    it('signals error on embedded input', () => {
+      expect(readValue([UC_TOKEN_EMBED, `'test'`, UC_TOKEN_EMBED], { onError })).toBe(``);
+
+      expect(errors).toEqual([
+        {
+          code: 'unrecognizedInput',
+          path: [{}],
+          message: 'Unrecognized embedded input',
+        },
+      ]);
+    });
+  });
+
+  function readTokens(...tokens: UcToken[]): ReadableStream<UcToken> {
+    return Readable.toWeb(Readable.from(tokens)) as ReadableStream<UcToken>;
+  }
+});
diff --git a/src/syntax/uc-plain-text-lexer.ts b/src/syntax/uc-plain-text-lexer.ts
new file mode 100644
index 00000000..2c9664b1
--- /dev/null
+++ b/src/syntax/uc-plain-text-lexer.ts
@@ -0,0 +1,23 @@
+import { UcInputLexer } from './uc-input-lexer.js';
+import { UcToken } from './uc-token.js';
+
+/**
+ * Plain text lexer.
+ *
+ * Input chunks are converted to string tokens directly, without any change.
+ */
+export class UcPlainTextLexer implements UcInputLexer {
+
+  readonly #emit: (token: UcToken) => void;
+
+  constructor(emit: (token: UcToken) => void) {
+    this.#emit = emit;
+  }
+
+  scan(chunk: string): void {
+    this.#emit(chunk);
+  }
+
+  flush(): void {}
+
+}
diff --git a/src/syntax/uc-token-kind.spec.ts b/src/syntax/uc-token-kind.spec.ts
new file mode 100644
index 00000000..480cd90b
--- /dev/null
+++ b/src/syntax/uc-token-kind.spec.ts
@@ -0,0 +1,9 @@
+import { describe, expect, it } from '@jest/globals';
+import { UC_TOKEN_KIND_CONTROL, ucTokenKind } from './uc-token-kind.js';
+import { UC_TOKEN_EMBED } from './uc-token.js';
+
+describe('ucTokenKind', () => {
+  it('is control for embed token', () => {
+    expect(ucTokenKind(UC_TOKEN_EMBED)).toBe(UC_TOKEN_KIND_CONTROL);
+  });
+});
diff --git a/src/syntax/uc-token-kind.ts b/src/syntax/uc-token-kind.ts
index b54d5913..5f15e63e 100644
--- a/src/syntax/uc-token-kind.ts
+++ b/src/syntax/uc-token-kind.ts
@@ -1,18 +1,20 @@
 import {
-  UcToken,
   UC_TOKEN_CLOSING_PARENTHESIS,
   UC_TOKEN_COMMA,
   UC_TOKEN_CR,
+  UC_TOKEN_EMBED,
   UC_TOKEN_LF,
   UC_TOKEN_OPENING_PARENTHESIS,
   UC_TOKEN_PREFIX_SPACE,
   UC_TOKEN_PREFIX_TAB,
+  UcToken,
 } from './uc-token.js';
 
-export const UC_TOKEN_KIND_STRING = 0 as const;
+export const UC_TOKEN_KIND_CONTROL = 0 as const;
+export const UC_TOKEN_KIND_STRING = 1 as const;
 
-export const UC_TOKEN_KIND_PADDING = 1 as const;
-export const UC_TOKEN_KIND_NL = 2 as const;
+export const UC_TOKEN_KIND_PADDING = 2 as const;
+export const UC_TOKEN_KIND_NL = 4 as const;
 export const UC_TOKEN_KIND_IS_WHITESPACE = UC_TOKEN_KIND_PADDING | UC_TOKEN_KIND_NL;
 
 export const UC_TOKEN_KIND_BOUND = 0x10 as const;
@@ -20,6 +22,7 @@ export const UC_TOKEN_KIND_DELIMITER = 0x20 as const;
 export const UC_TOKEN_KIND_IS_RESERVED = UC_TOKEN_KIND_BOUND | UC_TOKEN_KIND_DELIMITER;
 
 export type UcTokenKind =
+  | typeof UC_TOKEN_KIND_CONTROL
   | typeof UC_TOKEN_KIND_STRING
   | typeof UC_TOKEN_KIND_PADDING
   | typeof UC_TOKEN_KIND_NL
@@ -44,6 +47,8 @@ export function ucTokenKind(token: UcToken): UcTokenKind {
     case UC_TOKEN_OPENING_PARENTHESIS:
     case UC_TOKEN_CLOSING_PARENTHESIS:
       return UC_TOKEN_KIND_BOUND;
+    case UC_TOKEN_EMBED:
+      return UC_TOKEN_KIND_CONTROL;
     default:
       return UC_TOKEN_KIND_DELIMITER;
   }
diff --git a/src/syntax/uc-token.ts b/src/syntax/uc-token.ts
index 56042d55..87314065 100644
--- a/src/syntax/uc-token.ts
+++ b/src/syntax/uc-token.ts
@@ -1,3 +1,12 @@
+/**
+ * Embedded input bounds enclose tokens that are considered _embedded input_ chunks rather than normal tokens.
+ * Embedded input is supposed to be processed by an appropriate {@link UcInputLexer input lexer}.
+ * The bounds themselves are control tokens to be ignored.
+ *
+ * Embedded input is expected at value position and nowhere else. E.g. it can not appear in the middle of a string.
+ */
+export const UC_TOKEN_EMBED = 0x1f as const;
+
 // Line terminators.
 export const UC_TOKEN_LF = 0x0a as const;
 export const UC_TOKEN_CR = 0x0d as const;
@@ -53,6 +62,8 @@ export const UC_TOKEN_CLOSING_BRACKET = 0x5d as const;
  * Such padding is always emitted for spaces and tabs around [reserved characters], line terminators, after input
  * beginning, and before input end. Spaces and tabs e.g. between words may be emitted as part of string tokens.
  *
+ * - Number corresponding to {@link UC_TOKEN_EMBED embedded input bound}. The tokens between such bounds are
+ *   considered _embedded input_ chunks to be processed by an appropriate {@link UcInputLexer input lexer}.
  *
  * [percent-decoded]: https://www.rfc-editor.org/rfc/rfc3986#section-2.1
  * [reserved characters]: https://www.rfc-editor.org/rfc/rfc3986#section-2.2
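Putting the token-level contract together, an embedded value is framed like this (a sketch; the imports assume the package root re-exports the `syntax/` module, as the `mod.ts` change above suggests):

```ts
import {
  UC_TOKEN_CLOSING_PARENTHESIS,
  UC_TOKEN_EMBED,
  UC_TOKEN_OPENING_PARENTHESIS,
  UcToken,
} from 'churi';

// bar(<embedded input>) — the chunks between the bounds go to the input
// lexer; the bounds themselves never reach the charge receiver.
const tokens: UcToken[] = [
  'bar',
  UC_TOKEN_OPENING_PARENTHESIS,
  UC_TOKEN_EMBED,
  `'te`,
  `st'`,
  UC_TOKEN_EMBED,
  UC_TOKEN_CLOSING_PARENTHESIS,
];
```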