diff --git a/include/prism/enc/pm_encoding.h b/include/prism/enc/pm_encoding.h index 5236a0b3c4e..232bc97dd4c 100644 --- a/include/prism/enc/pm_encoding.h +++ b/include/prism/enc/pm_encoding.h @@ -57,6 +57,7 @@ bool pm_encoding_ascii_isupper_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptr // the parser so they need to be internally visible. size_t pm_encoding_utf_8_alpha_char(const uint8_t *b, ptrdiff_t n); size_t pm_encoding_utf_8_alnum_char(const uint8_t *b, ptrdiff_t n); +bool pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n); // This lookup table is referenced in both the UTF-8 encoding file and the // parser directly in order to speed up the default encoding processing. diff --git a/src/enc/pm_unicode.c b/src/enc/pm_unicode.c index ab10044424b..ee776fa2add 100644 --- a/src/enc/pm_unicode.c +++ b/src/enc/pm_unicode.c @@ -2285,7 +2285,7 @@ pm_encoding_utf_8_alnum_char(const uint8_t *b, ptrdiff_t n) { } } -static bool +bool pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n) { if (*b < 0x80) { return (pm_encoding_unicode_table[*b] & PRISM_ENCODING_UPPERCASE_BIT) ? true : false; diff --git a/src/prism.c b/src/prism.c index 83e75b4b699..1809587eb05 100644 --- a/src/prism.c +++ b/src/prism.c @@ -6044,16 +6044,21 @@ static pm_token_type_t lex_identifier(pm_parser_t *parser, bool previous_command_start) { // Lex as far as we can into the current identifier. size_t width; - while (parser->current.end < parser->end && (width = char_is_identifier(parser, parser->current.end)) > 0) { - parser->current.end += width; + const uint8_t *end = parser->end; + const uint8_t *current_start = parser->current.start; + const uint8_t *current_end = parser->current.end; + + while (current_end < end && (width = char_is_identifier(parser, current_end)) > 0) { + current_end += width; } + parser->current.end = current_end; // Now cache the length of the identifier so that we can quickly compare it // against known keywords. - width = (size_t) (parser->current.end - parser->current.start); + width = (size_t) (current_end - current_start); - if (parser->current.end < parser->end) { - if (((parser->current.end + 1 >= parser->end) || (parser->current.end[1] != '=')) && (match(parser, '!') || match(parser, '?'))) { + if (current_end < end) { + if (((current_end + 1 >= end) || (current_end[1] != '=')) && (match(parser, '!') || match(parser, '?'))) { // First we'll attempt to extend the identifier by a ! or ?. Then we'll // check if we're returning the defined? keyword or just an identifier. width++; @@ -6163,7 +6168,10 @@ lex_identifier(pm_parser_t *parser, bool previous_command_start) { } } - return parser->encoding.isupper_char(parser->current.start, parser->end - parser->current.start) ? PM_TOKEN_CONSTANT : PM_TOKEN_IDENTIFIER; + if (parser->encoding_changed) { + return parser->encoding.isupper_char(current_start, end - current_start) ? PM_TOKEN_CONSTANT : PM_TOKEN_IDENTIFIER; + } + return pm_encoding_utf_8_isupper_char(current_start, end - current_start) ? PM_TOKEN_CONSTANT : PM_TOKEN_IDENTIFIER; } // Returns true if the current token that the parser is considering is at the