From b610cb380d1c029eac726f1f13a4b8befb7eb4fb Mon Sep 17 00:00:00 2001
From: liuyi
Date: Wed, 31 Jul 2024 00:49:51 +0800
Subject: [PATCH] feat: add splitInputAfterSyntaxError

---
 src/parser/common/basicSQL.ts  | 175 ++++++++++++++++++++++++++-------
 src/parser/flink/index.ts      |   2 +
 src/parser/hive/index.ts       |   2 +
 src/parser/impala/index.ts     |   2 +
 src/parser/mysql/index.ts      |   2 +
 src/parser/postgresql/index.ts |   2 +
 src/parser/spark/index.ts      |   2 +
 src/parser/trino/index.ts      |   2 +
 8 files changed, 155 insertions(+), 34 deletions(-)

diff --git a/src/parser/common/basicSQL.ts b/src/parser/common/basicSQL.ts
index b835979a..86be4d65 100644
--- a/src/parser/common/basicSQL.ts
+++ b/src/parser/common/basicSQL.ts
@@ -48,6 +48,11 @@ export abstract class BasicSQL<
      */
     protected abstract preferredRules: Set<number>;
 
+    /**
+     * Keywords which can start a single statement.
+     */
+    protected abstract statementStartKeywords: string[];
+
     /**
      * Create a antlr4 Lexer instance.
      * @param input source string
@@ -251,6 +256,63 @@ export abstract class BasicSQL<
         return res;
     }
 
+    /**
+     * Try to get as small a range as possible after a syntax error.
+     * @param allTokens all tokens from input
+     * @param caretTokenIndex tokenIndex of caretPosition
+     * @returns { startToken: Token; stopToken: Token }
+     */
+    private splitInputAfterSyntaxError(
+        allTokens: Token[],
+        caretTokenIndex: number
+    ): { startToken: Token; stopToken: Token } {
+        let startToken: Token | null = null;
+        for (let tokenIndex = caretTokenIndex; tokenIndex >= 0; tokenIndex--) {
+            const token = allTokens[tokenIndex];
+            // A semicolon ends the previous statement.
+            if (token?.text === ';') {
+                startToken = allTokens[tokenIndex + 1];
+                break;
+            }
+            // A keyword which can start a single statement.
+            if (
+                this.statementStartKeywords.some((item) => item === token?.text) &&
+                tokenIndex !== 0
+            ) {
+                startToken = allTokens[tokenIndex - 1];
+                break;
+            }
+        }
+        // If there is no semicolon, start from the first token.
+        if (startToken === null) {
+            startToken = allTokens[0];
+        }
+
+        let stopToken: Token | null = null;
+        for (let tokenIndex = caretTokenIndex; tokenIndex < allTokens.length; tokenIndex++) {
+            const token = allTokens[tokenIndex];
+            // A semicolon ends the current statement.
+            if (token?.text === ';') {
+                stopToken = token;
+                break;
+            }
+            // A keyword which can start the next statement.
+            if (
+                this.statementStartKeywords.some((item) => item === token?.text) &&
+                tokenIndex !== 0
+            ) {
+                stopToken = allTokens[tokenIndex - 1];
+                break;
+            }
+        }
+        // If there is no semicolon, stop at the last token.
+        if (stopToken === null) {
+            stopToken = allTokens[allTokens.length - 1];
+        }
+
+        return { startToken, stopToken };
+    }
+
     /**
      * Get suggestions of syntax and token at caretPosition
      * @param input source string
@@ -282,53 +344,98 @@ export abstract class BasicSQL<
         const statementCount = splitListener.statementsContext?.length;
         const statementsContext = splitListener.statementsContext;
 
-        // If there are multiple statements.
-        if (statementCount > 1) {
-            /**
-             * Find a minimum valid range, reparse the fragment, and provide a new parse tree to C3.
-             * The boundaries of this range must be statements with no syntax errors.
-             * This can ensure the stable performance of the C3.
-             */
-            let startStatement: ParserRuleContext | null = null;
-            let stopStatement: ParserRuleContext | null = null;
+        const { startToken, stopToken } = this.splitInputAfterSyntaxError(
+            allTokens,
+            caretTokenIndex
+        );
 
-            for (let index = 0; index < statementCount; index++) {
-                const ctx = statementsContext[index];
-                const isCurrentCtxValid = !ctx.exception;
-                if (!isCurrentCtxValid) continue;
+        let startIndex: number = 0;
+        let stopIndex: number = 0;
+        /**
+         * If there is no semicolon and no keyword which can start
+         * a single statement, the caret range covers the whole input;
+         * in that case, fall back to splitting by statement boundaries.
+         */
+        if (startToken.tokenIndex === 0 && stopToken.tokenIndex === allTokens.length - 1) {
+            if (statementCount > 1) {
                 /**
-                 * Ensure that the statementContext before the left boundary
-                 * and the last statementContext on the right boundary are qualified SQL statements.
+                 * Find a minimum valid range, reparse the fragment, and provide a new parse tree to C3.
+                 * The boundaries of this range must be statements with no syntax errors.
+                 * This can ensure the stable performance of the C3.
                  */
-                const isPrevCtxValid = index === 0 || !statementsContext[index - 1]?.exception;
-                const isNextCtxValid =
-                    index === statementCount - 1 || !statementsContext[index + 1]?.exception;
+                let startStatement: ParserRuleContext | null = null;
+                let stopStatement: ParserRuleContext | null = null;
+
+                for (let index = 0; index < statementCount; index++) {
+                    const ctx = statementsContext[index];
+                    const isCurrentCtxValid = !ctx.exception;
+                    if (!isCurrentCtxValid) continue;
+
+                    /**
+                     * Ensure that the statementContext before the left boundary
+                     * and the last statementContext on the right boundary are qualified SQL statements.
+                     */
+                    const isPrevCtxValid = index === 0 || !statementsContext[index - 1]?.exception;
+                    const isNextCtxValid =
+                        index === statementCount - 1 || !statementsContext[index + 1]?.exception;
+
+                    if (ctx.stop && ctx.stop.tokenIndex < caretTokenIndex && isPrevCtxValid) {
+                        startStatement = ctx;
+                    }
+
-                if (ctx.stop && ctx.stop.tokenIndex < caretTokenIndex && isPrevCtxValid) {
-                    startStatement = ctx;
+                    if (
+                        ctx.start &&
+                        !stopStatement &&
+                        ctx.start.tokenIndex > caretTokenIndex &&
+                        isNextCtxValid
+                    ) {
+                        stopStatement = ctx;
+                        break;
+                    }
                 }
-
-                if (
-                    ctx.start &&
-                    !stopStatement &&
-                    ctx.start.tokenIndex > caretTokenIndex &&
-                    isNextCtxValid
-                ) {
-                    stopStatement = ctx;
-                    break;
-                }
-            }
 
+                // A boundary consisting of the index of the input.
+                startIndex = startStatement?.start?.start ?? 0;
+                stopIndex = stopStatement?.stop?.stop ?? input.length - 1;
+                /**
+                 * Save offset of the tokenIndex in the range of input
+                 * compared to the tokenIndex in the whole input
+                 */
+                tokenIndexOffset = startStatement?.start?.tokenIndex ?? 0;
+                caretTokenIndex = caretTokenIndex - tokenIndexOffset;
+
+                /**
+                 * Reparse the input fragment,
+                 * and c3 will collect candidates in the newly generated parseTree.
+                 */
+                const inputSlice = input.slice(startIndex, stopIndex);
+
+                const lexer = this.createLexer(inputSlice);
+                lexer.removeErrorListeners();
+                const tokenStream = new CommonTokenStream(lexer);
+                tokenStream.fill();
+
+                const parser = this.createParserFromTokenStream(tokenStream);
+                parser.interpreter.predictionMode = PredictionMode.SLL;
+                parser.removeErrorListeners();
+                parser.buildParseTrees = true;
+                parser.errorHandler = new ErrorStrategy();
+
+                sqlParserIns = parser;
+                c3Context = parser.program();
+            }
+        } else {
             // A boundary consisting of the index of the input.
-            const startIndex = startStatement?.start?.start ?? 0;
-            const stopIndex = stopStatement?.stop?.stop ?? input.length - 1;
+            startIndex = startToken?.start ?? 0;
+            stopIndex = (stopToken?.stop ?? input.length - 1) + 1;
 
             /**
              * Save offset of the tokenIndex in the range of input
              * compared to the tokenIndex in the whole input
              */
-            tokenIndexOffset = startStatement?.start?.tokenIndex ?? 0;
+            tokenIndexOffset = startToken?.tokenIndex ?? 0;
             caretTokenIndex = caretTokenIndex - tokenIndexOffset;
 
             /**
diff --git a/src/parser/flink/index.ts b/src/parser/flink/index.ts
index 61d19d22..d4675644 100644
--- a/src/parser/flink/index.ts
+++ b/src/parser/flink/index.ts
@@ -35,6 +35,8 @@ export class FlinkSQL extends BasicSQL {
diff --git a/src/parser/mysql/index.ts b/src/parser/mysql/index.ts
         MySqlParser.RULE_columnNameCreate,
     ]);
 
+    protected statementStartKeywords: string[] = ['SELECT', 'INSERT'];
+
     protected get splitListener() {
         return new MysqlSplitListener();
     }
diff --git a/src/parser/postgresql/index.ts b/src/parser/postgresql/index.ts
index 63820000..cdfd2562 100644
--- a/src/parser/postgresql/index.ts
+++ b/src/parser/postgresql/index.ts
@@ -39,6 +39,8 @@ export class PostgreSQL extends BasicSQL
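
Reviewer note (illustrative, not part of the patch): the narrowing logic added in splitInputAfterSyntaxError can be exercised standalone. The TypeScript sketch below mirrors the algorithm over a plain token type; SimpleToken, splitAfterSyntaxError, and the hard-coded keyword list are hypothetical stand-ins for antlr4's Token, the private method above, and a dialect's statementStartKeywords (here the MySQL list), showing how a caret inside a broken trailing statement is cut off from the valid statement before it.

// Illustrative sketch only; not part of the patch.
interface SimpleToken {
    tokenIndex: number;
    text: string;
}

// Stand-in for a dialect's statementStartKeywords.
const STATEMENT_START_KEYWORDS = ['SELECT', 'INSERT'];

function splitAfterSyntaxError(
    allTokens: SimpleToken[],
    caretTokenIndex: number
): { startToken: SimpleToken; stopToken: SimpleToken } {
    // Scan backward from the caret: a semicolon ends the previous statement,
    // and a statement-start keyword begins the current one.
    let startToken: SimpleToken | null = null;
    for (let tokenIndex = caretTokenIndex; tokenIndex >= 0; tokenIndex--) {
        const token = allTokens[tokenIndex];
        if (token?.text === ';') {
            startToken = allTokens[tokenIndex + 1];
            break;
        }
        if (
            STATEMENT_START_KEYWORDS.some((item) => item === token?.text) &&
            tokenIndex !== 0
        ) {
            // The patch takes the token just before the keyword; with a real
            // lexer this is typically a hidden-channel whitespace token.
            startToken = allTokens[tokenIndex - 1];
            break;
        }
    }
    if (startToken === null) startToken = allTokens[0];

    // Scan forward from the caret: a semicolon or the next statement-start
    // keyword ends the current statement.
    let stopToken: SimpleToken | null = null;
    for (let tokenIndex = caretTokenIndex; tokenIndex < allTokens.length; tokenIndex++) {
        const token = allTokens[tokenIndex];
        if (token?.text === ';') {
            stopToken = token;
            break;
        }
        if (
            STATEMENT_START_KEYWORDS.some((item) => item === token?.text) &&
            tokenIndex !== 0
        ) {
            stopToken = allTokens[tokenIndex - 1];
            break;
        }
    }
    if (stopToken === null) stopToken = allTokens[allTokens.length - 1];

    return { startToken, stopToken };
}

// 'SELECT id FROM a ; SELECT FROM b' tokenized with explicit whitespace tokens,
// caret on the second FROM (tokenIndex 11), where the syntax error sits:
const tokens: SimpleToken[] = [
    'SELECT', ' ', 'id', ' ', 'FROM', ' ', 'a', ';', ' ', 'SELECT', ' ', 'FROM', ' ', 'b',
].map((text, tokenIndex) => ({ tokenIndex, text }));

const { startToken, stopToken } = splitAfterSyntaxError(tokens, 11);
// Walking back from the caret hits the second SELECT, so the range starts at
// the whitespace right after the first statement's semicolon (tokenIndex 8);
// walking forward finds no boundary, so it stops at the last token (13).
// Only the broken second statement gets reparsed for code completion.
console.log(startToken.tokenIndex, stopToken.tokenIndex); // 8 13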