diff --git a/antlr/antlr4/ANTLRv4Lexer.g4 b/antlr/antlr4/ANTLRv4Lexer.g4 index bf389d0915..b16b5118c3 100644 --- a/antlr/antlr4/ANTLRv4Lexer.g4 +++ b/antlr/antlr4/ANTLRv4Lexer.g4 @@ -47,9 +47,13 @@ lexer grammar ANTLRv4Lexer; options { superClass = LexerAdaptor; + + // Using a predefined list of tokens here to ensure the same order of the tokens as they were defined + // in the old ANTLR3 tree parsers (to avoid having to change the tree parsers code). + // The actual values of the tokens doesn't matter, but the order does. + tokenVocab = predefined; } -import LexBasic; // Standard set of fragments tokens { @@ -65,23 +69,25 @@ channels { // ------------------------- // Comments + DOC_COMMENT - : DocComment -> channel (COMMENT) + : '/**' .*? ('*/' | EOF) -> channel (COMMENT) ; BLOCK_COMMENT - : BlockComment -> channel (COMMENT) + : '/*' .*? ('*/' | EOF) -> channel (COMMENT) ; LINE_COMMENT - : LineComment -> channel (COMMENT) + : '//' ~ [\r\n]* -> channel (COMMENT) ; // ------------------------- // Integer INT - : DecimalNumeral + : '0' + | [1-9] [0-9]* ; // ------------------------- @@ -92,11 +98,11 @@ INT // may contain unicode escape sequences of the form \uxxxx, where x // is a valid hexadecimal number (per Unicode standard). STRING_LITERAL - : SQuoteLiteral + : '\'' (ESC_SEQUENCE | ~ ['\r\n\\])* '\'' ; UNTERMINATED_STRING_LITERAL - : USQuoteLiteral + : '\'' (ESC_SEQUENCE | ~ ['\r\n\\])* ; // ------------------------- @@ -106,13 +112,13 @@ UNTERMINATED_STRING_LITERAL // to a rule invocation, or input parameters to a rule specification // are contained within square brackets. 
BEGIN_ARGUMENT - : LBrack { this.handleBeginArgument(); } + : '[' { this.handleBeginArgument(); } ; // ------------------------- // Target Language Actions BEGIN_ACTION - : LBrace -> pushMode (TargetLanguageAction) + : '{' -> pushMode (TargetLanguageAction) ; // ------------------------- @@ -123,23 +129,15 @@ BEGIN_ACTION // Otherwise, the symbols are tokenized as RULE_REF and allowed as // an identifier in a labeledElement. OPTIONS - : 'options' WSNLCHARS* '{' + : 'options' WS* '{' ; TOKENS - : 'tokens' WSNLCHARS* '{' + : 'tokens' WS* '{' ; CHANNELS - : 'channels' WSNLCHARS* '{' - ; - -fragment WSNLCHARS - : ' ' - | '\t' - | '\f' - | '\n' - | '\r' + : 'channels' WS* '{' ; IMPORT @@ -202,125 +200,105 @@ MODE // Punctuation COLON - : Colon + : ':' ; COLONCOLON - : DColon + : '::' ; COMMA - : Comma + : ',' ; SEMI - : Semi + : ';' ; LPAREN - : LParen + : '(' ; RPAREN - : RParen - ; - -LBRACE - : LBrace + : ')' ; RBRACE - : RBrace + : '}' ; RARROW - : RArrow + : '->' ; LT - : Lt + : '<' ; GT - : Gt + : '>' ; ASSIGN - : Equal + : '=' ; QUESTION - : Question + : '?' ; STAR - : Star + : '*' ; PLUS_ASSIGN - : PlusAssign + : '+=' ; PLUS - : Plus + : '+' ; OR - : Pipe + : '|' ; DOLLAR - : Dollar + : '$' ; RANGE - : Range + : '..' ; DOT - : Dot + : '.' ; AT - : At + : '@' ; POUND - : Pound + : '#' ; NOT - : Tilde + : '~' ; // ------------------------- // Identifiers - allows unicode rule/token names ID - : Id + : NameStartChar NameChar* ; // ------------------------- // Whitespace WS - : Ws+ -> channel (OFF_CHANNEL) - ; - -// ------------------------- -// Illegal Characters -// -// This is an illegal character trap which is always the last rule in the -// lexer specification. It matches a single character of any value and being -// the last rule in the file will match when no other rule knows what to do -// about the character. It is reported as an error but is not passed on to the -// parser. 
This means that the parser to deal with the gramamr file anyway -// but we will not try to analyse or code generate from a file with lexical -// errors. - -// Comment this rule out to allow the error to be propagated to the parser -ERRCHAR - : . -> channel (HIDDEN) + : [ \t\r\n\f]+ -> channel (OFF_CHANNEL) ; // ====================================================== @@ -331,11 +309,11 @@ mode Argument; // E.g., [int x, List a[]] NESTED_ARGUMENT - : LBrack -> type (ARGUMENT_CONTENT), pushMode (Argument) + : '[' -> type (ARGUMENT_CONTENT), pushMode (Argument) ; ARGUMENT_ESCAPE - : EscAny -> type (ARGUMENT_CONTENT) + : '\\' . -> type (ARGUMENT_CONTENT) ; ARGUMENT_STRING_LITERAL @@ -343,11 +321,11 @@ ARGUMENT_STRING_LITERAL ; ARGUMENT_CHAR_LITERAL - : SQuoteLiteral -> type (ARGUMENT_CONTENT) + : STRING_LITERAL -> type (ARGUMENT_CONTENT) ; END_ARGUMENT - : RBrack { this.handleEndArgument(); } + : ']' { this.handleEndArgument(); } ; // added this to return non-EOF token type here. EOF does something weird @@ -359,11 +337,6 @@ ARGUMENT_CONTENT : . ; -// TODO: This grammar and the one used in the Intellij Antlr4 plugin differ -// for "actions". This needs to be resolved at some point. -// The Intellij Antlr4 grammar is here: -// https://github.com/antlr/intellij-plugin-v4/blob/1f36fde17f7fa63cb18d7eeb9cb213815ac658fb/src/main/antlr/org/antlr/intellij/plugin/parser/ANTLRv4Lexer.g4#L587 - // ------------------------- // Target Language Actions // @@ -372,15 +345,15 @@ ARGUMENT_CONTENT // braces. Additionally, we must make some assumptions about // literal string representation in the target language. We assume // that they are delimited by ' or " and so consume these -// in their own alts so as not to inadvertantly match {}. +// in their own alts so as not to inadvertently match {}. 
mode TargetLanguageAction; NESTED_ACTION - : LBrace -> type (ACTION_CONTENT), pushMode (TargetLanguageAction) + : '{' -> type (ACTION_CONTENT), pushMode (TargetLanguageAction) ; ACTION_ESCAPE - : EscAny -> type (ACTION_CONTENT) + : '\\' . -> type (ACTION_CONTENT) ; ACTION_STRING_LITERAL @@ -388,23 +361,23 @@ ACTION_STRING_LITERAL ; ACTION_CHAR_LITERAL - : SQuoteLiteral -> type (ACTION_CONTENT) + : STRING_LITERAL -> type (ACTION_CONTENT) ; ACTION_DOC_COMMENT - : DocComment -> type (ACTION_CONTENT) + : DOC_COMMENT -> type (ACTION_CONTENT) ; ACTION_BLOCK_COMMENT - : BlockComment -> type (ACTION_CONTENT) + : BLOCK_COMMENT -> type (ACTION_CONTENT) ; ACTION_LINE_COMMENT - : LineComment -> type (ACTION_CONTENT) + : LINE_COMMENT -> type (ACTION_CONTENT) ; END_ACTION - : RBrace { this.handleEndAction(); } + : '}' { this.handleEndAction(); } ; UNTERMINATED_ACTION @@ -419,11 +392,11 @@ ACTION_CONTENT mode LexerCharSet; LEXER_CHAR_SET_BODY - : (~ [\]\\] | EscAny)+ -> more + : (~ [\]\\] | '\\' .)+ -> more ; LEXER_CHAR_SET - : RBrack -> popMode + : ']' -> popMode ; UNTERMINATED_CHAR_SET @@ -432,6 +405,48 @@ UNTERMINATED_CHAR_SET // ------------------------------------------------------------------------------ // Grammar specific Keywords, Punctuation, etc. -fragment Id - : NameStartChar NameChar* - ; \ No newline at end of file + +fragment ESC_SEQUENCE + : '\\' ([btnfr"'\\] | UnicodeESC | . | EOF) + ; + +fragment HexDigit + : [0-9a-fA-F] + ; + +fragment UnicodeESC + : 'u' (HexDigit (HexDigit (HexDigit HexDigit?)?)?)? + ; + +fragment DQuoteLiteral + : '"' (ESC_SEQUENCE | ~ ["\r\n\\])* '"' + ; + +// ----------------------------------- +// Character ranges + +fragment NameChar + : NameStartChar + | '0' .. '9' + | '_' + | '\u00B7' + | '\u0300' .. '\u036F' + | '\u203F' .. '\u2040' + ; + +fragment NameStartChar + : 'A' .. 'Z' + | 'a' .. 'z' + | '\u00C0' .. '\u00D6' + | '\u00D8' .. '\u00F6' + | '\u00F8' .. '\u02FF' + | '\u0370' .. '\u037D' + | '\u037F' .. '\u1FFF' + | '\u200C' .. 
'\u200D' + | '\u2070' .. '\u218F' + | '\u2C00' .. '\u2FEF' + | '\u3001' .. '\uD7FF' + | '\uF900' .. '\uFDCF' + | '\uFDF0' .. '\uFFFD' + // ignores | ['\u10000-'\uEFFFF] + ; diff --git a/antlr/antlr4/ANTLRv4Parser.g4 b/antlr/antlr4/ANTLRv4Parser.g4 index a4ff765988..9c18076b44 100644 --- a/antlr/antlr4/ANTLRv4Parser.g4 +++ b/antlr/antlr4/ANTLRv4Parser.g4 @@ -135,11 +135,11 @@ actionScopeName ; actionBlock - : BEGIN_ACTION ACTION_CONTENT* END_ACTION + : BEGIN_ACTION ACTION_CONTENT*? END_ACTION ; argActionBlock - : BEGIN_ARGUMENT ARGUMENT_CONTENT* END_ARGUMENT + : BEGIN_ARGUMENT ARGUMENT_CONTENT*? END_ARGUMENT ; modeSpec @@ -184,7 +184,7 @@ ruleReturns // -------------- // Exception spec throwsSpec - : THROWS identifier (COMMA identifier)* + : THROWS qualifiedIdentifier (COMMA qualifiedIdentifier)* ; localsSpec @@ -302,7 +302,7 @@ element : labeledElement (ebnfSuffix |) | atom (ebnfSuffix |) | ebnf - | actionBlock (QUESTION predicateOptions?)? + | actionBlock QUESTION? predicateOptions? ; predicateOptions @@ -311,7 +311,7 @@ predicateOptions predicateOption : elementOption - | identifier ASSIGN actionBlock + | identifier ASSIGN (actionBlock | INT | STRING_LITERAL) ; labeledElement @@ -340,14 +340,18 @@ lexerAtom | terminalDef | notSet | LEXER_CHAR_SET - | DOT elementOptions? + | wildcard ; atom : terminalDef | ruleref | notSet - | DOT elementOptions? + | wildcard + ; + +wildcard + : DOT elementOptions? 
; // -------------------- @@ -398,11 +402,15 @@ elementOptions ; elementOption - : identifier - | identifier ASSIGN (identifier | STRING_LITERAL) + : qualifiedIdentifier + | identifier ASSIGN (qualifiedIdentifier | STRING_LITERAL | INT) ; identifier : RULE_REF | TOKEN_REF - ; \ No newline at end of file + ; + +qualifiedIdentifier + : identifier (DOT identifier)* + ; diff --git a/antlr/antlr4/Cpp/ANTLRv4Lexer.g4 b/antlr/antlr4/Cpp/ANTLRv4Lexer.g4 index 493d554744..84e84ee2d3 100644 --- a/antlr/antlr4/Cpp/ANTLRv4Lexer.g4 +++ b/antlr/antlr4/Cpp/ANTLRv4Lexer.g4 @@ -35,20 +35,24 @@ * -- update for compatibility with Antlr v4.5 */ +// $antlr-format alignTrailingComments on, columnLimit 130, minEmptyLines 1, maxEmptyLinesToKeep 1, reflowComments off +// $antlr-format useTab off, allowShortRulesOnASingleLine off, allowShortBlocksOnASingleLine on, alignSemicolons hanging +// $antlr-format alignColons hanging + // ====================================================== // Lexer specification // ====================================================== -// $antlr-format alignTrailingComments true, columnLimit 150, maxEmptyLinesToKeep 1, reflowComments false, useTab false -// $antlr-format allowShortRulesOnASingleLine true, allowShortBlocksOnASingleLine true, minEmptyLines 0, alignSemicolons ownLine -// $antlr-format alignColons trailing, singleLineOverrulesHangingColon true, alignLexerCommands true, alignLabels true, alignTrailers true - lexer grammar ANTLRv4Lexer; options { superClass = LexerAdaptor; + + // Using a predefined list of tokens here to ensure the same order of the tokens as they were defined + // in the old ANTLR3 tree parsers (to avoid having to change the tree parsers code). + // The actual values of the tokens doesn't matter, but the order does. 
+ tokenVocab = predefined; } -import LexBasic; @header { #include "LexerAdaptor.h" @@ -60,6 +64,7 @@ tokens { RULE_REF, LEXER_CHAR_SET } + channels { OFF_CHANNEL, COMMENT @@ -67,16 +72,26 @@ channels { // ------------------------- // Comments -DOC_COMMENT: DocComment -> channel (COMMENT); -BLOCK_COMMENT: BlockComment -> channel (COMMENT); +DOC_COMMENT + : '/**' .*? ('*/' | EOF) -> channel (COMMENT) + ; -LINE_COMMENT: LineComment -> channel (COMMENT); +BLOCK_COMMENT + : '/*' .*? ('*/' | EOF) -> channel (COMMENT) + ; + +LINE_COMMENT + : '//' ~ [\r\n]* -> channel (COMMENT) + ; // ------------------------- // Integer -INT: DecimalNumeral; +INT + : '0' + | [1-9] [0-9]* + ; // ------------------------- // Literal string @@ -85,9 +100,13 @@ INT: DecimalNumeral; // multi-character string. All literals are single quote delimited and // may contain unicode escape sequences of the form \uxxxx, where x // is a valid hexadecimal number (per Unicode standard). -STRING_LITERAL: SQuoteLiteral; +STRING_LITERAL + : '\'' (ESC_SEQUENCE | ~ ['\r\n\\])* '\'' + ; -UNTERMINATED_STRING_LITERAL: USQuoteLiteral; +UNTERMINATED_STRING_LITERAL + : '\'' (ESC_SEQUENCE | ~ ['\r\n\\])* + ; // ------------------------- // Arguments @@ -95,11 +114,15 @@ UNTERMINATED_STRING_LITERAL: USQuoteLiteral; // Certain argument lists, such as those specifying call parameters // to a rule invocation, or input parameters to a rule specification // are contained within square brackets. -BEGIN_ARGUMENT: LBrack { this->handleBeginArgument(); }; +BEGIN_ARGUMENT + : '[' { this->handleBeginArgument(); } + ; // ------------------------- // Target Language Actions -BEGIN_ACTION: LBrace -> pushMode (TargetLanguageAction); +BEGIN_ACTION + : '{' -> pushMode (TargetLanguageAction) + ; // ------------------------- // Keywords @@ -108,130 +131,214 @@ BEGIN_ACTION: LBrace -> pushMode (TargetLanguageAction); // but only when followed by '{', and considered as a single token. 
// Otherwise, the symbols are tokenized as RULE_REF and allowed as // an identifier in a labeledElement. -OPTIONS : 'options' WSNLCHARS* '{'; -TOKENS : 'tokens' WSNLCHARS* '{'; -CHANNELS : 'channels' WSNLCHARS* '{'; +OPTIONS + : 'options' WS* '{' + ; + +TOKENS + : 'tokens' WS* '{' + ; -fragment WSNLCHARS: ' ' | '\t' | '\f' | '\n' | '\r'; +CHANNELS + : 'channels' WS* '{' + ; -IMPORT: 'import'; +IMPORT + : 'import' + ; -FRAGMENT: 'fragment'; +FRAGMENT + : 'fragment' + ; -LEXER: 'lexer'; +LEXER + : 'lexer' + ; -PARSER: 'parser'; +PARSER + : 'parser' + ; -GRAMMAR: 'grammar'; +GRAMMAR + : 'grammar' + ; -PROTECTED: 'protected'; +PROTECTED + : 'protected' + ; -PUBLIC: 'public'; +PUBLIC + : 'public' + ; -PRIVATE: 'private'; +PRIVATE + : 'private' + ; -RETURNS: 'returns'; +RETURNS + : 'returns' + ; -LOCALS: 'locals'; +LOCALS + : 'locals' + ; -THROWS: 'throws'; +THROWS + : 'throws' + ; -CATCH: 'catch'; +CATCH + : 'catch' + ; -FINALLY: 'finally'; +FINALLY + : 'finally' + ; + +MODE + : 'mode' + ; -MODE: 'mode'; // ------------------------- // Punctuation -COLON: Colon; +COLON + : ':' + ; -COLONCOLON: DColon; +COLONCOLON + : '::' + ; -COMMA: Comma; +COMMA + : ',' + ; -SEMI: Semi; +SEMI + : ';' + ; -LPAREN: LParen; +LPAREN + : '(' + ; -RPAREN: RParen; +RPAREN + : ')' + ; -LBRACE: LBrace; +RBRACE + : '}' + ; -RBRACE: RBrace; +RARROW + : '->' + ; -RARROW: RArrow; +LT + : '<' + ; -LT: Lt; +GT + : '>' + ; -GT: Gt; +ASSIGN + : '=' + ; -ASSIGN: Equal; +QUESTION + : '?' + ; -QUESTION: Question; +STAR + : '*' + ; -STAR: Star; +PLUS_ASSIGN + : '+=' + ; -PLUS_ASSIGN: PlusAssign; +PLUS + : '+' + ; -PLUS: Plus; +OR + : '|' + ; -OR: Pipe; +DOLLAR + : '$' + ; -DOLLAR: Dollar; +RANGE + : '..' + ; -RANGE: Range; +DOT + : '.' 
+ ; -DOT: Dot; +AT + : '@' + ; -AT: At; +POUND + : '#' + ; -POUND: Pound; +NOT + : '~' + ; -NOT: Tilde; // ------------------------- // Identifiers - allows unicode rule/token names -ID: Id; -// ------------------------- -// Whitespace - -WS: Ws+ -> channel (OFF_CHANNEL); +ID + : NameStartChar NameChar* + ; // ------------------------- -// Illegal Characters -// -// This is an illegal character trap which is always the last rule in the -// lexer specification. It matches a single character of any value and being -// the last rule in the file will match when no other rule knows what to do -// about the character. It is reported as an error but is not passed on to the -// parser. This means that the parser to deal with the gramamr file anyway -// but we will not try to analyse or code generate from a file with lexical -// errors. +// Whitespace -// Comment this rule out to allow the error to be propagated to the parser -ERRCHAR: . -> channel (HIDDEN); +WS + : [ \t\r\n\f]+ -> channel (OFF_CHANNEL) + ; // ====================================================== // Lexer modes // ------------------------- // Arguments mode Argument; + // E.g., [int x, List a[]] -NESTED_ARGUMENT: LBrack -> type (ARGUMENT_CONTENT), pushMode (Argument); +NESTED_ARGUMENT + : '[' -> type (ARGUMENT_CONTENT), pushMode (Argument) + ; -ARGUMENT_ESCAPE: EscAny -> type (ARGUMENT_CONTENT); +ARGUMENT_ESCAPE + : '\\' . -> type (ARGUMENT_CONTENT) + ; -ARGUMENT_STRING_LITERAL: DQuoteLiteral -> type (ARGUMENT_CONTENT); +ARGUMENT_STRING_LITERAL + : DQuoteLiteral -> type (ARGUMENT_CONTENT) + ; -ARGUMENT_CHAR_LITERAL: SQuoteLiteral -> type (ARGUMENT_CONTENT); +ARGUMENT_CHAR_LITERAL + : STRING_LITERAL -> type (ARGUMENT_CONTENT) + ; -END_ARGUMENT: RBrack { this->handleEndArgument(); }; +END_ARGUMENT + : ']' { this->handleEndArgument(); } + ; // added this to return non-EOF token type here. 
EOF does something weird -UNTERMINATED_ARGUMENT: EOF -> popMode; +UNTERMINATED_ARGUMENT + : EOF -> popMode + ; -ARGUMENT_CONTENT: .; +ARGUMENT_CONTENT + : . + ; // ------------------------- // Target Language Actions @@ -241,36 +348,108 @@ ARGUMENT_CONTENT: .; // braces. Additionally, we must make some assumptions about // literal string representation in the target language. We assume // that they are delimited by ' or " and so consume these -// in their own alts so as not to inadvertantly match {}. +// in their own alts so as not to inadvertently match {}. mode TargetLanguageAction; -NESTED_ACTION: LBrace -> type (ACTION_CONTENT), pushMode (TargetLanguageAction); -ACTION_ESCAPE: EscAny -> type (ACTION_CONTENT); +NESTED_ACTION + : '{' -> type (ACTION_CONTENT), pushMode (TargetLanguageAction) + ; + +ACTION_ESCAPE + : '\\' . -> type (ACTION_CONTENT) + ; -ACTION_STRING_LITERAL: DQuoteLiteral -> type (ACTION_CONTENT); +ACTION_STRING_LITERAL + : DQuoteLiteral -> type (ACTION_CONTENT) + ; -ACTION_CHAR_LITERAL: SQuoteLiteral -> type (ACTION_CONTENT); +ACTION_CHAR_LITERAL + : STRING_LITERAL -> type (ACTION_CONTENT) + ; -ACTION_DOC_COMMENT: DocComment -> type (ACTION_CONTENT); +ACTION_DOC_COMMENT + : DOC_COMMENT -> type (ACTION_CONTENT) + ; -ACTION_BLOCK_COMMENT: BlockComment -> type (ACTION_CONTENT); +ACTION_BLOCK_COMMENT + : BLOCK_COMMENT -> type (ACTION_CONTENT) + ; -ACTION_LINE_COMMENT: LineComment -> type (ACTION_CONTENT); +ACTION_LINE_COMMENT + : LINE_COMMENT -> type (ACTION_CONTENT) + ; -END_ACTION: RBrace { this->handleEndAction(); }; +END_ACTION + : '}' { this->handleEndAction(); } + ; -UNTERMINATED_ACTION: EOF -> popMode; +UNTERMINATED_ACTION + : EOF -> popMode + ; -ACTION_CONTENT: .; +ACTION_CONTENT + : . 
+ ; // ------------------------- mode LexerCharSet; -LEXER_CHAR_SET_BODY: (~ [\]\\] | EscAny)+ -> more; -LEXER_CHAR_SET: RBrack -> popMode; +LEXER_CHAR_SET_BODY + : (~ [\]\\] | '\\' .)+ -> more + ; -UNTERMINATED_CHAR_SET: EOF -> popMode; +LEXER_CHAR_SET + : ']' -> popMode + ; + +UNTERMINATED_CHAR_SET + : EOF -> popMode + ; // ------------------------------------------------------------------------------ // Grammar specific Keywords, Punctuation, etc. -fragment Id: NameStartChar NameChar*; \ No newline at end of file + +fragment ESC_SEQUENCE + : '\\' ([btnfr"'\\] | UnicodeESC | . | EOF) + ; + +fragment HexDigit + : [0-9a-fA-F] + ; + +fragment UnicodeESC + : 'u' (HexDigit (HexDigit (HexDigit HexDigit?)?)?)? + ; + +fragment DQuoteLiteral + : '"' (ESC_SEQUENCE | ~ ["\r\n\\])* '"' + ; + +// ----------------------------------- +// Character ranges + +fragment NameChar + : NameStartChar + | '0' .. '9' + | '_' + | '\u00B7' + | '\u0300' .. '\u036F' + | '\u203F' .. '\u2040' + ; + +fragment NameStartChar + : 'A' .. 'Z' + | 'a' .. 'z' + | '\u00C0' .. '\u00D6' + | '\u00D8' .. '\u00F6' + | '\u00F8' .. '\u02FF' + | '\u0370' .. '\u037D' + | '\u037F' .. '\u1FFF' + | '\u200C' .. '\u200D' + | '\u2070' .. '\u218F' + | '\u2C00' .. '\u2FEF' + | '\u3001' .. '\uD7FF' + | '\uF900' .. '\uFDCF' + | '\uFDF0' .. '\uFFFD' + // ignores | ['\u10000-'\uEFFFF] + ; diff --git a/antlr/antlr4/LexBasic.g4 b/antlr/antlr4/LexBasic.g4 deleted file mode 100644 index 761e3db2e4..0000000000 --- a/antlr/antlr4/LexBasic.g4 +++ /dev/null @@ -1,286 +0,0 @@ -/* - * [The "BSD license"] - * Copyright (c) 2014-2015 Gerald Rosenberg - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ -/** - * A generally reusable set of fragments for import in to Lexer grammars. - * - * Modified 2015.06.16 gbr - - * -- generalized for inclusion into the ANTLRv4 grammar distribution - * - */ - -// $antlr-format alignTrailingComments on, columnLimit 130, minEmptyLines 1, maxEmptyLinesToKeep 1, reflowComments off -// $antlr-format useTab off, allowShortRulesOnASingleLine off, allowShortBlocksOnASingleLine on, alignSemicolons hanging -// $antlr-format alignColons hanging - -lexer grammar LexBasic; - -// ====================================================== -// Lexer fragments -// -// ----------------------------------- -// Whitespace & Comments - -fragment Ws - : Hws - | Vws - ; - -fragment Hws - : [ \t] - ; - -fragment Vws - : [\r\n\f] - ; - -fragment BlockComment - : '/*' .*? ('*/' | EOF) - ; - -fragment DocComment - : '/**' .*? 
('*/' | EOF) - ; - -fragment LineComment - : '//' ~ [\r\n]* - ; - -// ----------------------------------- -// Escapes -// Any kind of escaped character that we can embed within ANTLR literal strings. - -fragment EscSeq - : Esc ([btnfr"'\\] | UnicodeEsc | . | EOF) - ; - -fragment EscAny - : Esc . - ; - -fragment UnicodeEsc - : 'u' (HexDigit (HexDigit (HexDigit HexDigit?)?)?)? - ; - -// ----------------------------------- -// Numerals - -fragment DecimalNumeral - : '0' - | [1-9] DecDigit* - ; - -// ----------------------------------- -// Digits - -fragment HexDigit - : [0-9a-fA-F] - ; - -fragment DecDigit - : [0-9] - ; - -// ----------------------------------- -// Literals - -fragment BoolLiteral - : 'true' - | 'false' - ; - -fragment CharLiteral - : SQuote (EscSeq | ~ ['\r\n\\]) SQuote - ; - -fragment SQuoteLiteral - : SQuote (EscSeq | ~ ['\r\n\\])* SQuote - ; - -fragment DQuoteLiteral - : DQuote (EscSeq | ~ ["\r\n\\])* DQuote - ; - -fragment USQuoteLiteral - : SQuote (EscSeq | ~ ['\r\n\\])* - ; - -// ----------------------------------- -// Character ranges - -fragment NameChar - : NameStartChar - | '0' .. '9' - | Underscore - | '\u00B7' - | '\u0300' .. '\u036F' - | '\u203F' .. '\u2040' - ; - -fragment NameStartChar - : 'A' .. 'Z' - | 'a' .. 'z' - | '\u00C0' .. '\u00D6' - | '\u00D8' .. '\u00F6' - | '\u00F8' .. '\u02FF' - | '\u0370' .. '\u037D' - | '\u037F' .. '\u1FFF' - | '\u200C' .. '\u200D' - | '\u2070' .. '\u218F' - | '\u2C00' .. '\u2FEF' - | '\u3001' .. '\uD7FF' - | '\uF900' .. '\uFDCF' - | '\uFDF0' .. 
'\uFFFD' - // ignores | ['\u10000-'\uEFFFF] - ; - -// ----------------------------------- -// Types - -fragment Int - : 'int' - ; - -// ----------------------------------- -// Symbols - -fragment Esc - : '\\' - ; - -fragment Colon - : ':' - ; - -fragment DColon - : '::' - ; - -fragment SQuote - : '\'' - ; - -fragment DQuote - : '"' - ; - -fragment LParen - : '(' - ; - -fragment RParen - : ')' - ; - -fragment LBrace - : '{' - ; - -fragment RBrace - : '}' - ; - -fragment LBrack - : '[' - ; - -fragment RBrack - : ']' - ; - -fragment RArrow - : '->' - ; - -fragment Lt - : '<' - ; - -fragment Gt - : '>' - ; - -fragment Equal - : '=' - ; - -fragment Question - : '?' - ; - -fragment Star - : '*' - ; - -fragment Plus - : '+' - ; - -fragment PlusAssign - : '+=' - ; - -fragment Underscore - : '_' - ; - -fragment Pipe - : '|' - ; - -fragment Dollar - : '$' - ; - -fragment Comma - : ',' - ; - -fragment Semi - : ';' - ; - -fragment Dot - : '.' - ; - -fragment Range - : '..' - ; - -fragment At - : '@' - ; - -fragment Pound - : '#' - ; - -fragment Tilde - : '~' - ; \ No newline at end of file diff --git a/antlr/antlr4/Python3/ANTLRv4Lexer.g4 b/antlr/antlr4/Python3/ANTLRv4Lexer.g4 index 5af4832505..90cbba8018 100644 --- a/antlr/antlr4/Python3/ANTLRv4Lexer.g4 +++ b/antlr/antlr4/Python3/ANTLRv4Lexer.g4 @@ -35,20 +35,24 @@ * -- update for compatibility with Antlr v4.5 */ +// $antlr-format alignTrailingComments on, columnLimit 130, minEmptyLines 1, maxEmptyLinesToKeep 1, reflowComments off +// $antlr-format useTab off, allowShortRulesOnASingleLine off, allowShortBlocksOnASingleLine on, alignSemicolons hanging +// $antlr-format alignColons hanging + // ====================================================== // Lexer specification // ====================================================== -// $antlr-format alignTrailingComments true, columnLimit 150, maxEmptyLinesToKeep 1, reflowComments false, useTab false -// $antlr-format allowShortRulesOnASingleLine true, allowShortBlocksOnASingleLine 
true, minEmptyLines 0, alignSemicolons ownLine -// $antlr-format alignColons trailing, singleLineOverrulesHangingColon true, alignLexerCommands true, alignLabels true, alignTrailers true - lexer grammar ANTLRv4Lexer; options { superClass = LexerAdaptor; + + // Using a predefined list of tokens here to ensure the same order of the tokens as they were defined + // in the old ANTLR3 tree parsers (to avoid having to change the tree parsers code). + // The actual values of the tokens doesn't matter, but the order does. + tokenVocab = predefined; } -import LexBasic; // Standard set of fragments tokens { @@ -56,6 +60,7 @@ tokens { RULE_REF, LEXER_CHAR_SET } + channels { OFF_CHANNEL, COMMENT @@ -63,16 +68,26 @@ channels { // ------------------------- // Comments -DOC_COMMENT: DocComment -> channel (COMMENT); -BLOCK_COMMENT: BlockComment -> channel (COMMENT); +DOC_COMMENT + : '/**' .*? ('*/' | EOF) -> channel (COMMENT) + ; -LINE_COMMENT: LineComment -> channel (COMMENT); +BLOCK_COMMENT + : '/*' .*? ('*/' | EOF) -> channel (COMMENT) + ; + +LINE_COMMENT + : '//' ~ [\r\n]* -> channel (COMMENT) + ; // ------------------------- // Integer -INT: DecimalNumeral; +INT + : '0' + | [1-9] [0-9]* + ; // ------------------------- // Literal string @@ -81,9 +96,13 @@ INT: DecimalNumeral; // multi-character string. All literals are single quote delimited and // may contain unicode escape sequences of the form \uxxxx, where x // is a valid hexadecimal number (per Unicode standard). -STRING_LITERAL: SQuoteLiteral; +STRING_LITERAL + : '\'' (ESC_SEQUENCE | ~ ['\r\n\\])* '\'' + ; -UNTERMINATED_STRING_LITERAL: USQuoteLiteral; +UNTERMINATED_STRING_LITERAL + : '\'' (ESC_SEQUENCE | ~ ['\r\n\\])* + ; // ------------------------- // Arguments @@ -91,11 +110,15 @@ UNTERMINATED_STRING_LITERAL: USQuoteLiteral; // Certain argument lists, such as those specifying call parameters // to a rule invocation, or input parameters to a rule specification // are contained within square brackets. 
-BEGIN_ARGUMENT: LBrack { self.handleBeginArgument() }; +BEGIN_ARGUMENT + : '[' { self.handleBeginArgument() } + ; // ------------------------- // Target Language Actions -BEGIN_ACTION: LBrace -> pushMode (TargetLanguageAction); +BEGIN_ACTION + : '{' -> pushMode (TargetLanguageAction) + ; // ------------------------- // Keywords @@ -104,130 +127,214 @@ BEGIN_ACTION: LBrace -> pushMode (TargetLanguageAction); // but only when followed by '{', and considered as a single token. // Otherwise, the symbols are tokenized as RULE_REF and allowed as // an identifier in a labeledElement. -OPTIONS : 'options' WSNLCHARS* '{'; -TOKENS : 'tokens' WSNLCHARS* '{'; -CHANNELS : 'channels' WSNLCHARS* '{'; +OPTIONS + : 'options' WS* '{' + ; + +TOKENS + : 'tokens' WS* '{' + ; -fragment WSNLCHARS: ' ' | '\t' | '\f' | '\n' | '\r'; +CHANNELS + : 'channels' WS* '{' + ; -IMPORT: 'import'; +IMPORT + : 'import' + ; -FRAGMENT: 'fragment'; +FRAGMENT + : 'fragment' + ; -LEXER: 'lexer'; +LEXER + : 'lexer' + ; -PARSER: 'parser'; +PARSER + : 'parser' + ; -GRAMMAR: 'grammar'; +GRAMMAR + : 'grammar' + ; -PROTECTED: 'protected'; +PROTECTED + : 'protected' + ; -PUBLIC: 'public'; +PUBLIC + : 'public' + ; -PRIVATE: 'private'; +PRIVATE + : 'private' + ; -RETURNS: 'returns'; +RETURNS + : 'returns' + ; -LOCALS: 'locals'; +LOCALS + : 'locals' + ; -THROWS: 'throws'; +THROWS + : 'throws' + ; -CATCH: 'catch'; +CATCH + : 'catch' + ; -FINALLY: 'finally'; +FINALLY + : 'finally' + ; + +MODE + : 'mode' + ; -MODE: 'mode'; // ------------------------- // Punctuation -COLON: Colon; +COLON + : ':' + ; -COLONCOLON: DColon; +COLONCOLON + : '::' + ; -COMMA: Comma; +COMMA + : ',' + ; -SEMI: Semi; +SEMI + : ';' + ; -LPAREN: LParen; +LPAREN + : '(' + ; -RPAREN: RParen; +RPAREN + : ')' + ; -LBRACE: LBrace; +RBRACE + : '}' + ; -RBRACE: RBrace; +RARROW + : '->' + ; -RARROW: RArrow; +LT + : '<' + ; -LT: Lt; +GT + : '>' + ; -GT: Gt; +ASSIGN + : '=' + ; -ASSIGN: Equal; +QUESTION + : '?' 
+ ; -QUESTION: Question; +STAR + : '*' + ; -STAR: Star; +PLUS_ASSIGN + : '+=' + ; -PLUS_ASSIGN: PlusAssign; +PLUS + : '+' + ; -PLUS: Plus; +OR + : '|' + ; -OR: Pipe; +DOLLAR + : '$' + ; -DOLLAR: Dollar; +RANGE + : '..' + ; -RANGE: Range; +DOT + : '.' + ; -DOT: Dot; +AT + : '@' + ; -AT: At; +POUND + : '#' + ; -POUND: Pound; +NOT + : '~' + ; -NOT: Tilde; // ------------------------- // Identifiers - allows unicode rule/token names -ID: Id; -// ------------------------- -// Whitespace - -WS: Ws+ -> channel (OFF_CHANNEL); +ID + : NameStartChar NameChar* + ; // ------------------------- -// Illegal Characters -// -// This is an illegal character trap which is always the last rule in the -// lexer specification. It matches a single character of any value and being -// the last rule in the file will match when no other rule knows what to do -// about the character. It is reported as an error but is not passed on to the -// parser. This means that the parser to deal with the gramamr file anyway -// but we will not try to analyse or code generate from a file with lexical -// errors. +// Whitespace -// Comment this rule out to allow the error to be propagated to the parser -ERRCHAR: . -> channel (HIDDEN); +WS + : [ \t\r\n\f]+ -> channel (OFF_CHANNEL) + ; // ====================================================== // Lexer modes // ------------------------- // Arguments mode Argument; + // E.g., [int x, List a[]] -NESTED_ARGUMENT: LBrack -> type (ARGUMENT_CONTENT), pushMode (Argument); +NESTED_ARGUMENT + : '[' -> type (ARGUMENT_CONTENT), pushMode (Argument) + ; -ARGUMENT_ESCAPE: EscAny -> type (ARGUMENT_CONTENT); +ARGUMENT_ESCAPE + : '\\' . 
-> type (ARGUMENT_CONTENT) + ; -ARGUMENT_STRING_LITERAL: DQuoteLiteral -> type (ARGUMENT_CONTENT); +ARGUMENT_STRING_LITERAL + : DQuoteLiteral -> type (ARGUMENT_CONTENT) + ; -ARGUMENT_CHAR_LITERAL: SQuoteLiteral -> type (ARGUMENT_CONTENT); +ARGUMENT_CHAR_LITERAL + : STRING_LITERAL -> type (ARGUMENT_CONTENT) + ; -END_ARGUMENT: RBrack { self.handleEndArgument() }; +END_ARGUMENT + : ']' { self.handleEndArgument() } + ; // added this to return non-EOF token type here. EOF does something weird -UNTERMINATED_ARGUMENT: EOF -> popMode; +UNTERMINATED_ARGUMENT + : EOF -> popMode + ; -ARGUMENT_CONTENT: .; +ARGUMENT_CONTENT + : . + ; // ------------------------- // Target Language Actions @@ -237,36 +344,108 @@ ARGUMENT_CONTENT: .; // braces. Additionally, we must make some assumptions about // literal string representation in the target language. We assume // that they are delimited by ' or " and so consume these -// in their own alts so as not to inadvertantly match {}. +// in their own alts so as not to inadvertently match {}. mode TargetLanguageAction; -NESTED_ACTION: LBrace -> type (ACTION_CONTENT), pushMode (TargetLanguageAction); -ACTION_ESCAPE: EscAny -> type (ACTION_CONTENT); +NESTED_ACTION + : '{' -> type (ACTION_CONTENT), pushMode (TargetLanguageAction) + ; + +ACTION_ESCAPE + : '\\' . 
-> type (ACTION_CONTENT) + ; -ACTION_STRING_LITERAL: DQuoteLiteral -> type (ACTION_CONTENT); +ACTION_STRING_LITERAL + : DQuoteLiteral -> type (ACTION_CONTENT) + ; -ACTION_CHAR_LITERAL: SQuoteLiteral -> type (ACTION_CONTENT); +ACTION_CHAR_LITERAL + : STRING_LITERAL -> type (ACTION_CONTENT) + ; -ACTION_DOC_COMMENT: DocComment -> type (ACTION_CONTENT); +ACTION_DOC_COMMENT + : DOC_COMMENT -> type (ACTION_CONTENT) + ; -ACTION_BLOCK_COMMENT: BlockComment -> type (ACTION_CONTENT); +ACTION_BLOCK_COMMENT + : BLOCK_COMMENT -> type (ACTION_CONTENT) + ; -ACTION_LINE_COMMENT: LineComment -> type (ACTION_CONTENT); +ACTION_LINE_COMMENT + : LINE_COMMENT -> type (ACTION_CONTENT) + ; -END_ACTION: RBrace { self.handleEndAction() }; +END_ACTION + : '}' { this.handleEndAction(); } + ; -UNTERMINATED_ACTION: EOF -> popMode; +UNTERMINATED_ACTION + : EOF -> popMode + ; -ACTION_CONTENT: .; +ACTION_CONTENT + : . + ; // ------------------------- mode LexerCharSet; -LEXER_CHAR_SET_BODY: (~ [\]\\] | EscAny)+ -> more; -LEXER_CHAR_SET: RBrack -> popMode; +LEXER_CHAR_SET_BODY + : (~ [\]\\] | '\\' .)+ -> more + ; -UNTERMINATED_CHAR_SET: EOF -> popMode; +LEXER_CHAR_SET + : ']' -> popMode + ; + +UNTERMINATED_CHAR_SET + : EOF -> popMode + ; // ------------------------------------------------------------------------------ // Grammar specific Keywords, Punctuation, etc. -fragment Id: NameStartChar NameChar*; \ No newline at end of file + +fragment ESC_SEQUENCE + : '\\' ([btnfr"'\\] | UnicodeESC | . | EOF) + ; + +fragment HexDigit + : [0-9a-fA-F] + ; + +fragment UnicodeESC + : 'u' (HexDigit (HexDigit (HexDigit HexDigit?)?)?)? + ; + +fragment DQuoteLiteral + : '"' (ESC_SEQUENCE | ~ ["\r\n\\])* '"' + ; + +// ----------------------------------- +// Character ranges + +fragment NameChar + : NameStartChar + | '0' .. '9' + | '_' + | '\u00B7' + | '\u0300' .. '\u036F' + | '\u203F' .. '\u2040' + ; + +fragment NameStartChar + : 'A' .. 'Z' + | 'a' .. 'z' + | '\u00C0' .. '\u00D6' + | '\u00D8' .. 
'\u00F6' + | '\u00F8' .. '\u02FF' + | '\u0370' .. '\u037D' + | '\u037F' .. '\u1FFF' + | '\u200C' .. '\u200D' + | '\u2070' .. '\u218F' + | '\u2C00' .. '\u2FEF' + | '\u3001' .. '\uD7FF' + | '\uF900' .. '\uFDCF' + | '\uFDF0' .. '\uFFFD' + // ignores | ['\u10000-'\uEFFFF] + ; diff --git a/antlr/antlr4/predefined.tokens b/antlr/antlr4/predefined.tokens new file mode 100644 index 0000000000..fba57fde43 --- /dev/null +++ b/antlr/antlr4/predefined.tokens @@ -0,0 +1,98 @@ +ACTION = 4 +ACTION_CHAR_LITERAL = 5 +ACTION_ESC = 6 +ACTION_STRING_LITERAL = 7 +ARG_ACTION = 8 +ARG_OR_CHARSET = 9 +ASSIGN = 10 +AT = 11 +CATCH = 12 +CHANNELS = 13 +COLON = 14 +COLONCOLON = 15 +COMMA = 16 +UNUSED = 17 +DOC_COMMENT = 18 +DOLLAR = 19 +DOT = 20 +ERRCHAR = 21 +ESC_SEQ = 22 +FINALLY = 23 +FRAGMENT = 24 +GRAMMAR = 25 +GT = 26 +HEX_DIGIT = 27 +ID = 28 +IMPORT = 29 +INT = 30 +LEXER = 31 +LEXER_CHAR_SET = 32 +LOCALS = 33 +LPAREN = 34 +LT = 35 +MODE = 36 +NESTED_ACTION = 37 +NLCHARS = 38 +NOT = 39 +NameChar = 40 +NameStartChar = 41 +OPTIONS = 42 +OR = 43 +PARSER = 44 +PLUS = 45 +PLUS_ASSIGN = 46 +POUND = 47 +QUESTION = 48 +RANGE = 49 +RARROW = 50 +RBRACE = 51 +RETURNS = 52 +RPAREN = 53 +RULE_REF = 54 +SEMI = 55 +SEMPRED = 56 +SRC = 57 +STAR = 58 +STRING_LITERAL = 59 +THROWS = 60 +UNUSED2 = 61 +TOKEN_REF = 62 +UNICODE_ESC = 63 +UNICODE_EXTENDED_ESC = 64 +UnicodeBOM = 65 +WS = 66 +WSCHARS = 67 +WSNLCHARS = 68 +ALT = 69 +BLOCK = 70 +CLOSURE = 71 +COMBINED = 72 +ELEMENT_OPTIONS = 73 +EPSILON = 74 +LEXER_ACTION_CALL = 75 +LEXER_ALT_ACTION = 76 +OPTIONAL = 77 +POSITIVE_CLOSURE = 78 +RULE = 79 +RULEMODIFIERS = 80 +RULES = 81 +SET = 82 +WILDCARD = 83 +BLOCK_COMMENT = 84 +LINE_COMMENT = 85 +UNTERMINATED_STRING_LITERAL = 86 +BEGIN_ARGUMENT = 87 +BEGIN_ACTION = 88 +TOKENS = 89 +LBRACE = 90 +END_ARGUMENT = 91 +UNTERMINATED_ARGUMENT = 92 +ARGUMENT_CONTENT = 93 +END_ACTION = 94 +UNTERMINATED_ACTION = 95 +ACTION_CONTENT = 96 +UNTERMINATED_CHAR_SET = 97 +PRIVATE = 98 +PROTECTED = 99 +PUBLIC = 100 
+PREDICATE_OPTIONS = 101 diff --git a/antlr/antlr4/readme.md b/antlr/antlr4/readme.md index 3afbc169fc..4c19b1a3fe 100644 --- a/antlr/antlr4/readme.md +++ b/antlr/antlr4/readme.md @@ -6,4 +6,4 @@ As of 24 Feb 2021: * Java, C#, JavaScript work on all grammars in examples/. Other targets have been provided but not tested. * All files have been organized in directories named after the target. * The split grammar ANTLRv4Lexer.g4 and ANTLRv4Parser.g4 contain "minimum" target-specific code, done in a "target-independent" way. Target-specific code for Python3 is contained in the Python3/ directory. -* The code is no generated with the -package option, and the grammars do not contain target-specific code that declares a project. +* The code is not generated with the -package option, and the grammars do not contain target-specific code that declares a project.