feat: add splitInputAfterSyntaxError
liuxy0551 committed Aug 26, 2024
1 parent c83c502 commit b610cb3
Showing 8 changed files with 155 additions and 34 deletions.
175 changes: 141 additions & 34 deletions src/parser/common/basicSQL.ts
@@ -48,6 +48,11 @@ export abstract class BasicSQL<
     */
    protected abstract preferredRules: Set<number>;

    /**
     * Keywords that can start a single statement.
     */
    protected abstract statementStartKeywords: string[];

    /**
     * Create an antlr4 Lexer instance.
     * @param input source string
@@ -251,6 +256,63 @@
        return res;
    }

    /**
     * Try to get as small a range as possible after a syntax error.
     * @param allTokens all tokens from input
     * @param caretTokenIndex tokenIndex of caretPosition
     * @returns { startToken: Token; stopToken: Token }
     */
    private splitInputAfterSyntaxError(
        allTokens: Token[],
        caretTokenIndex: number
    ): { startToken: Token; stopToken: Token } {
        let startToken: Token | null = null;
        for (let tokenIndex = caretTokenIndex; tokenIndex >= 0; tokenIndex--) {
            const token = allTokens[tokenIndex];
            // the previous statement ends with a semicolon
            if (token?.text === ';') {
                startToken = allTokens[tokenIndex + 1];
                break;
            }
            // a keyword that can start a single statement
            if (
                this.statementStartKeywords.some((item) => item === token?.text) &&
                tokenIndex !== 0
            ) {
                startToken = allTokens[tokenIndex - 1];
                break;
            }
        }
        // If neither a semicolon nor a keyword was found, start from the first token
        if (startToken === null) {
            startToken = allTokens[0];
        }

        let stopToken: Token | null = null;
        for (let tokenIndex = caretTokenIndex; tokenIndex < allTokens.length; tokenIndex++) {
            const token = allTokens[tokenIndex];
            // the current statement ends with a semicolon
            if (token?.text === ';') {
                stopToken = token;
                break;
            }
            // a keyword that can start a single statement
            if (
                this.statementStartKeywords.some((item) => item === token?.text) &&
                tokenIndex !== 0
            ) {
                stopToken = allTokens[tokenIndex - 1];
                break;
            }
        }
        // If neither a semicolon nor a keyword was found, stop at the last token
        if (stopToken === null) {
            stopToken = allTokens[allTokens.length - 1];
        }

        return { startToken, stopToken };
    }
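
For intuition, here is a self-contained sketch of the same two-pass scan. SimpleToken, STATEMENT_START_KEYWORDS, and findStatementRange are illustrative stand-ins for antlr4's Token, the dialect's statementStartKeywords, and the private method above, so read it as a model of the technique rather than the library's code:

interface SimpleToken {
    tokenIndex: number;
    text: string;
}

// Hard-coded for the sketch; the real method reads the dialect's
// statementStartKeywords from the subclass.
const STATEMENT_START_KEYWORDS = ['SELECT', 'INSERT'];

function findStatementRange(
    tokens: SimpleToken[],
    caretTokenIndex: number
): { startToken: SimpleToken; stopToken: SimpleToken } {
    let startToken: SimpleToken | null = null;
    // Scan backwards: the statement under the caret begins right after the
    // previous ';', or just before a statement-start keyword.
    for (let i = caretTokenIndex; i >= 0; i--) {
        const text = tokens[i]?.text;
        if (text === ';') {
            startToken = tokens[i + 1];
            break;
        }
        if (text && STATEMENT_START_KEYWORDS.includes(text) && i !== 0) {
            startToken = tokens[i - 1];
            break;
        }
    }
    startToken ??= tokens[0];

    let stopToken: SimpleToken | null = null;
    // Scan forwards: the statement ends at the next ';', or just before the
    // keyword that opens the following statement.
    for (let i = caretTokenIndex; i < tokens.length; i++) {
        const text = tokens[i]?.text;
        if (text === ';') {
            stopToken = tokens[i];
            break;
        }
        if (text && STATEMENT_START_KEYWORDS.includes(text) && i !== 0) {
            stopToken = tokens[i - 1];
            break;
        }
    }
    stopToken ??= tokens[tokens.length - 1];

    return { startToken, stopToken };
}

// "SELECT a FROM t1; SELECT b FRM t2;" tokenized word by word:
const tokens = ['SELECT', 'a', 'FROM', 't1', ';', 'SELECT', 'b', 'FRM', 't2', ';'].map(
    (text, tokenIndex) => ({ tokenIndex, text })
);
// Caret on the misspelled "FRM" (tokenIndex 7): the range snaps to the tokens
// around the second statement instead of the whole input.
const { startToken, stopToken } = findStatementRange(tokens, 7);
console.log(startToken.tokenIndex, stopToken.tokenIndex); // 4 9

Both scans mirror the method above, including its choice of tokenIndex - 1 when a statement-start keyword is hit, so the keyword itself stays inside the recovered range.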

    /**
     * Get suggestions of syntax and token at caretPosition
     * @param input source string
@@ -282,53 +344,98 @@
        const statementCount = splitListener.statementsContext?.length;
        const statementsContext = splitListener.statementsContext;

-        // If there are multiple statements.
-        if (statementCount > 1) {
-            /**
-             * Find a minimum valid range, reparse the fragment, and provide a new parse tree to C3.
-             * The boundaries of this range must be statements with no syntax errors.
-             * This can ensure the stable performance of the C3.
-             */
-            let startStatement: ParserRuleContext | null = null;
-            let stopStatement: ParserRuleContext | null = null;
-
-            for (let index = 0; index < statementCount; index++) {
-                const ctx = statementsContext[index];
-                const isCurrentCtxValid = !ctx.exception;
-                if (!isCurrentCtxValid) continue;
-
-                /**
-                 * Ensure that the statementContext before the left boundary
-                 * and the last statementContext on the right boundary are qualified SQL statements.
-                 */
-                const isPrevCtxValid = index === 0 || !statementsContext[index - 1]?.exception;
-                const isNextCtxValid =
-                    index === statementCount - 1 || !statementsContext[index + 1]?.exception;
-
-                if (ctx.stop && ctx.stop.tokenIndex < caretTokenIndex && isPrevCtxValid) {
-                    startStatement = ctx;
-                }
-
-                if (
-                    ctx.start &&
-                    !stopStatement &&
-                    ctx.start.tokenIndex > caretTokenIndex &&
-                    isNextCtxValid
-                ) {
-                    stopStatement = ctx;
-                    break;
-                }
-            }
-
-            // A boundary consisting of the index of the input.
-            const startIndex = startStatement?.start?.start ?? 0;
-            const stopIndex = stopStatement?.stop?.stop ?? input.length - 1;
+        const { startToken, stopToken } = this.splitInputAfterSyntaxError(
+            allTokens,
+            caretTokenIndex
+        );
+
+        let startIndex: number = 0;
+        let stopIndex: number = 0;
+
+        /**
+         * If there is no semicolon and no keyword that can start a single statement,
+         * the range covers the whole input; if it holds multiple statements,
+         * narrow it down via the statement contexts.
+         */
+        if (startToken.tokenIndex === 1 && stopToken.tokenIndex === allTokens.length - 1) {
+            if (statementCount > 1) {
+                /**
+                 * Find a minimum valid range, reparse the fragment, and provide a new parse tree to C3.
+                 * The boundaries of this range must be statements with no syntax errors.
+                 * This can ensure the stable performance of the C3.
+                 */
+                let startStatement: ParserRuleContext | null = null;
+                let stopStatement: ParserRuleContext | null = null;
+
+                for (let index = 0; index < statementCount; index++) {
+                    const ctx = statementsContext[index];
+                    const isCurrentCtxValid = !ctx.exception;
+                    if (!isCurrentCtxValid) continue;
+
+                    /**
+                     * Ensure that the statementContext before the left boundary
+                     * and the last statementContext on the right boundary are qualified SQL statements.
+                     */
+                    const isPrevCtxValid = index === 0 || !statementsContext[index - 1]?.exception;
+                    const isNextCtxValid =
+                        index === statementCount - 1 || !statementsContext[index + 1]?.exception;
+
+                    if (ctx.stop && ctx.stop.tokenIndex < caretTokenIndex && isPrevCtxValid) {
+                        startStatement = ctx;
+                    }
+
+                    if (
+                        ctx.start &&
+                        !stopStatement &&
+                        ctx.start.tokenIndex > caretTokenIndex &&
+                        isNextCtxValid
+                    ) {
+                        stopStatement = ctx;
+                        break;
+                    }
+                }
+
+                // A boundary consisting of the index of the input.
+                startIndex = startStatement?.start?.start ?? 0;
+                stopIndex = stopStatement?.stop?.stop ?? input.length - 1;
+
+                /**
+                 * Save offset of the tokenIndex in the range of input
+                 * compared to the tokenIndex in the whole input
+                 */
+                tokenIndexOffset = startStatement?.start?.tokenIndex ?? 0;
+                caretTokenIndex = caretTokenIndex - tokenIndexOffset;
+
+                /**
+                 * Reparse the input fragment,
+                 * and c3 will collect candidates in the newly generated parseTree.
+                 */
+                const inputSlice = input.slice(startIndex, stopIndex);
+
+                const lexer = this.createLexer(inputSlice);
+                lexer.removeErrorListeners();
+                const tokenStream = new CommonTokenStream(lexer);
+                tokenStream.fill();
+
+                const parser = this.createParserFromTokenStream(tokenStream);
+                parser.interpreter.predictionMode = PredictionMode.SLL;
+                parser.removeErrorListeners();
+                parser.buildParseTrees = true;
+                parser.errorHandler = new ErrorStrategy();
+
+                sqlParserIns = parser;
+                c3Context = parser.program();
+            }
+        } else {
+            startIndex = startToken?.start ?? 0;
+            stopIndex = (stopToken?.stop ?? input.length - 1) + 1;

            /**
             * Save offset of the tokenIndex in the range of input
             * compared to the tokenIndex in the whole input
             */
-            tokenIndexOffset = startStatement?.start?.tokenIndex ?? 0;
+            tokenIndexOffset = startToken?.tokenIndex ?? 0;
            caretTokenIndex = caretTokenIndex - tokenIndexOffset;
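
Worth spelling out: after the else branch slices the input, the reparsed token stream counts from zero, so the caret's token index must be rebased before it is handed to c3. A minimal illustration, with a hypothetical helper name and made-up token numbers:

// A hypothetical helper illustrating the rebasing; not part of the library.
function rebaseCaret(caretTokenIndex: number, sliceFirstTokenIndex: number): number {
    // c3 collects candidates on the reparsed slice, so the caret's token index
    // must be expressed relative to the first token of that slice.
    return caretTokenIndex - sliceFirstTokenIndex;
}

// Full input: "SELECT a FROM t1; SELECT b FROM " with the caret at the end.
// If the narrowed range starts at the second SELECT (say tokenIndex 6) and the
// caret sits at tokenIndex 11, the reparsed slice sees the caret at index 5.
rebaseCaret(11, 6); // 5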
2 changes: 2 additions & 0 deletions src/parser/flink/index.ts
@@ -35,6 +35,8 @@ export class FlinkSQL extends BasicSQL<FlinkSqlLexer, ProgramContext, FlinkSqlPa
        FlinkSqlParser.RULE_columnNameCreate,
    ]);

    protected statementStartKeywords: string[] = ['SELECT', 'INSERT'];

    protected get splitListener() {
        return new FlinkSqlSplitListener();
    }
2 changes: 2 additions & 0 deletions src/parser/hive/index.ts
@@ -36,6 +36,8 @@ export class HiveSQL extends BasicSQL<HiveSqlLexer, ProgramContext, HiveSqlParse
        HiveSqlParser.RULE_columnNameCreate,
    ]);

    protected statementStartKeywords: string[] = ['SELECT', 'INSERT'];

    protected get splitListener() {
        return new HiveSqlSplitListener();
    }
2 changes: 2 additions & 0 deletions src/parser/impala/index.ts
@@ -34,6 +34,8 @@ export class ImpalaSQL extends BasicSQL<ImpalaSqlLexer, ProgramContext, ImpalaSq
        ImpalaSqlParser.RULE_columnNamePath,
    ]);

    protected statementStartKeywords: string[] = ['SELECT', 'INSERT'];

    protected get splitListener() {
        return new ImpalaSqlSplitListener();
    }
2 changes: 2 additions & 0 deletions src/parser/mysql/index.ts
@@ -34,6 +34,8 @@ export class MySQL extends BasicSQL<MySqlLexer, ProgramContext, MySqlParser> {
        MySqlParser.RULE_columnNameCreate,
    ]);

    protected statementStartKeywords: string[] = ['SELECT', 'INSERT'];

    protected get splitListener() {
        return new MysqlSplitListener();
    }
2 changes: 2 additions & 0 deletions src/parser/postgresql/index.ts
@@ -39,6 +39,8 @@ export class PostgreSQL extends BasicSQL<PostgreSqlLexer, ProgramContext, Postgr
        PostgreSqlParser.RULE_column_name, // column name
    ]);

    protected statementStartKeywords: string[] = [];

    protected get splitListener() {
        return new PostgreSqlSplitListener();
    }
2 changes: 2 additions & 0 deletions src/parser/spark/index.ts
@@ -34,6 +34,8 @@ export class SparkSQL extends BasicSQL<SparkSqlLexer, ProgramContext, SparkSqlPa
        SparkSqlParser.RULE_columnNameCreate,
    ]);

    protected statementStartKeywords: string[] = ['SELECT', 'INSERT'];

    protected get splitListener() {
        return new SparkSqlSplitListener();
    }
2 changes: 2 additions & 0 deletions src/parser/trino/index.ts
@@ -21,6 +21,8 @@ export class TrinoSQL extends BasicSQL<TrinoSqlLexer, ProgramContext, TrinoSqlPa
        return new TrinoSqlParser(tokenStream);
    }

    protected statementStartKeywords: string[] = ['SELECT', 'INSERT'];

    protected get splitListener() {
        return new TrinoSqlSplitListener();
    }
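
Taken together, the effect of this commit is that completion can recover when a neighboring statement in the buffer is broken. A sketch of how a consumer might exercise it, assuming the package's getSuggestionAtCaretPosition entry point and a { lineNumber, column } caret position; the exact result shape may differ:

import { FlinkSQL } from 'dt-sql-parser';

const flink = new FlinkSQL();
// The first statement is broken ('FORM'), but the caret sits in the second
// one, so candidates are still collected from the narrowed token range.
const sql = 'SELECT id FORM tb1; SELECT name FROM tb2 WHERE ';
const suggestions = flink.getSuggestionAtCaretPosition(sql, {
    lineNumber: 1,
    column: sql.length + 1,
});
console.log(suggestions?.keywords); // keyword candidates after WHERE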