Skip to content

Commit

Permalink
Improve error offsets for character classes (#548)
Browse files Browse the repository at this point in the history
* Advance the error offset by one character for the "[\d-z]"
  invalid-range error

  The code does a 1-char lookahead for a hyphen, but then doesn't
  advance the pointer to consume the hyphen when returning the error.

  Perl's error message (with "use warnings") does advance to just
  after the hyphen, so PCRE2 should report the same offset.

  Fixes #545.

* Also improve error offsets for [[:bad:]], [[=...=]] and [z-\p{...}]
  cases
  • Loading branch information
NWilson authored Nov 6, 2024
1 parent c192b8c commit 6185344
Show file tree
Hide file tree
Showing 5 changed files with 112 additions and 75 deletions.
71 changes: 34 additions & 37 deletions src/pcre2_compile.c
Original file line number Diff line number Diff line change
Expand Up @@ -3702,6 +3702,7 @@ while (ptr < ptrend)

if (class_range_state == RANGE_STARTED)
{
ptr = tempptr + 2;
errorcode = ERR50;
goto FAILED;
}
Expand All @@ -3723,8 +3724,9 @@ while (ptr < ptrend)

if (*ptr != CHAR_COLON)
{
ptr = tempptr + 2;
errorcode = ERR13;
goto FAILED_BACK;
goto FAILED;
}

if (*(++ptr) == CHAR_CIRCUMFLEX_ACCENT)
Expand All @@ -3734,19 +3736,18 @@ while (ptr < ptrend)
}

posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
ptr = tempptr + 2;
if (posix_class < 0)
{
errorcode = ERR30;
goto FAILED;
}
ptr = tempptr + 2;

/* Set "a hyphen is forbidden to be the start of a range". For the '-]'
case, the hyphen is treated as a literal, but for '-1' it is disallowed
(because it would be interpreted as a range). */

class_range_state = RANGE_FORBID_NO;
class_range_forbid_ptr = ptr;
class_op_state = CLASS_OP_OPERAND;

/* When PCRE2_UCP is set, unless PCRE2_EXTRA_ASCII_POSIX is set, some
Expand Down Expand Up @@ -3989,6 +3990,7 @@ while (ptr < ptrend)
{
*parsed_pattern++ = CHAR_MINUS;
class_range_state = RANGE_FORBID_STARTED;
class_range_forbid_ptr = ptr;
}

/* Handle a literal character */
Expand Down Expand Up @@ -4073,40 +4075,8 @@ while (ptr < ptrend)
errorcode = ERR7;
ptr--;
goto FAILED;
}

/* The second part of a range can be a single-character escape
sequence (detected above), but not any of the other escapes. Perl
treats a hyphen as a literal in such circumstances. However, in Perl's
warning mode, a warning is given, so PCRE now faults it, as it is
almost certainly a mistake on the user's part. */

if (class_range_state == RANGE_STARTED)
{
errorcode = ERR50;
goto FAILED;
}

/* Perl gives a warning unless the hyphen following a multi-character
escape is the last character in the class. PCRE throws an error. */

if (class_range_state == RANGE_FORBID_STARTED)
{
ptr = class_range_forbid_ptr;
errorcode = ERR50;
goto FAILED;
}

/* Of the remaining escapes, only those that define characters are
allowed in a class. None may start a range. */

class_range_state = RANGE_FORBID_NO;
class_range_forbid_ptr = ptr;
class_op_state = CLASS_OP_OPERAND;

switch(escape)
{
case ESC_N:
case ESC_N: /* Not permitted by Perl either */
errorcode = ERR71;
goto FAILED;

Expand Down Expand Up @@ -4143,7 +4113,6 @@ while (ptr < ptrend)
if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
*parsed_pattern++ = META_ESCAPE + escape;
*parsed_pattern++ = (ptype << 16) | pdata;
class_range_forbid_ptr = ptr;
}
#else
errorcode = ERR45;
Expand All @@ -4156,6 +4125,34 @@ while (ptr < ptrend)
ptr--;
goto FAILED;
}

/* All the switch-cases above which end in "break" describe a set
of characters. None may start a range. */

/* The second part of a range can be a single-character escape
sequence (detected above), but not any of the other escapes. Perl
treats a hyphen as a literal in such circumstances. However, in Perl's
warning mode, a warning is given, so PCRE now faults it, as it is
almost certainly a mistake on the user's part. */

if (class_range_state == RANGE_STARTED)
{
errorcode = ERR50;
goto FAILED;
}

/* Perl gives a warning unless the hyphen following a multi-character
escape is the last character in the class. PCRE throws an error. */

if (class_range_state == RANGE_FORBID_STARTED)
{
ptr = class_range_forbid_ptr;
errorcode = ERR50;
goto FAILED;
}

class_range_state = RANGE_FORBID_NO;
class_op_state = CLASS_OP_OPERAND;
}

/* Proceed to next thing in the class. */
Expand Down
8 changes: 8 additions & 0 deletions testdata/testinput2
Original file line number Diff line number Diff line change
Expand Up @@ -7008,4 +7008,12 @@ a)"xI

/[[:digit:]\Q\E-H]+/

/[z-[:space:]]/

/[z-\d]/

/[[:space:]-z]/

/[\d-z]/

# End of testinput2
8 changes: 8 additions & 0 deletions testdata/testinput5
Original file line number Diff line number Diff line change
Expand Up @@ -3189,4 +3189,12 @@
/^([\h\x{9000}\x{9002}\x{9004}][\v\x{9000}\x{9002}\x{9004}\x{9006}\x{9008}][\h\v\x{9000}],){4}$/B,utf
\x09\x0a\x0d,\x{1680}\x{2028}\x{1680},\x{180e}\x{2029}\x{180e},\x{9000}\x{9000}\x{9000},

/[z-\p{Lu}]/

/[z-\pL]/

/[\p{Lu}-z]/

/[\pL-z]/

# End of testinput5
Expand Down
78 changes: 45 additions & 33 deletions testdata/testoutput2
Original file line number Diff line number Diff line change
Expand Up @@ -2170,13 +2170,13 @@ Starting code units: % 0 1 A B C D E F G H I J K L M N O P Q R S T U V W
Subject length lower bound = 1

/[[.ch.]]/I
Failed: error 113 at offset 1: POSIX collating elements are not supported
Failed: error 113 at offset 7: POSIX collating elements are not supported

/[[=ch=]]/I
Failed: error 113 at offset 1: POSIX collating elements are not supported
Failed: error 113 at offset 7: POSIX collating elements are not supported

/[[:rhubarb:]]/I
Failed: error 130 at offset 3: unknown POSIX class name
Failed: error 130 at offset 12: unknown POSIX class name

/[[:upper:]]/Ii
Capture group count = 0
Expand Down Expand Up @@ -8775,31 +8775,31 @@ Failed: error 162 at offset 4: subpattern name expected
Failed: error 162 at offset 4: subpattern name expected

/[[:foo:]]/
Failed: error 130 at offset 3: unknown POSIX class name
Failed: error 130 at offset 8: unknown POSIX class name

/[[:1234:]]/
Failed: error 130 at offset 3: unknown POSIX class name
Failed: error 130 at offset 9: unknown POSIX class name

/[[:f\oo:]]/
Failed: error 130 at offset 3: unknown POSIX class name
Failed: error 130 at offset 9: unknown POSIX class name

/[[: :]]/
Failed: error 130 at offset 3: unknown POSIX class name
Failed: error 130 at offset 6: unknown POSIX class name

/[[:...:]]/
Failed: error 130 at offset 3: unknown POSIX class name
Failed: error 130 at offset 8: unknown POSIX class name

/[[:l\ower:]]/
Failed: error 130 at offset 3: unknown POSIX class name
Failed: error 130 at offset 11: unknown POSIX class name

/[[:abc\:]]/
Failed: error 130 at offset 3: unknown POSIX class name
Failed: error 130 at offset 9: unknown POSIX class name

/[abc[:x\]pqr:]]/
Failed: error 130 at offset 6: unknown POSIX class name
Failed: error 130 at offset 14: unknown POSIX class name

/[[:a\dz:]]/
Failed: error 130 at offset 3: unknown POSIX class name
Failed: error 130 at offset 9: unknown POSIX class name

/(^(a|b\g<-1'c))/
Failed: error 157 at offset 8: \g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number
Expand Down Expand Up @@ -11524,7 +11524,7 @@ Failed: error 171 at offset 4: \N is not supported in a class
aNc

/a[B-\Nc]/
Failed: error 150 at offset 6: invalid range in character class
Failed: error 171 at offset 6: \N is not supported in a class

/a[B\Nc]/
Failed: error 171 at offset 5: \N is not supported in a class
Expand Down Expand Up @@ -13347,16 +13347,16 @@ Failed: error 178 at offset 5: digits missing after \x or in \x{} or \o{} or \N{
------------------------------------------------------------------

/[a-[:digit:]]+/
Failed: error 150 at offset 4: invalid range in character class
Failed: error 150 at offset 12: invalid range in character class

/[A-[:digit:]]+/
Failed: error 150 at offset 4: invalid range in character class
Failed: error 150 at offset 12: invalid range in character class

/[a-[.xxx.]]+/
Failed: error 150 at offset 4: invalid range in character class
Failed: error 150 at offset 10: invalid range in character class

/[a-[=xxx=]]+/
Failed: error 150 at offset 4: invalid range in character class
Failed: error 150 at offset 10: invalid range in character class

/[a-[!xxx!]]+/
Failed: error 108 at offset 3: range out of order in character class
Expand Down Expand Up @@ -13477,7 +13477,7 @@ No match
No match

/[a[:<:]] should give error/
Failed: error 130 at offset 4: unknown POSIX class name
Failed: error 130 at offset 7: unknown POSIX class name

/(?=ab\K)/aftertext,allow_lookaround_bsk
abcd\=startchar
Expand Down Expand Up @@ -15879,11 +15879,11 @@ Failed: error 125 at offset 13: length of lookbehind assertion is not limited
# Perl accepts these, but gives a warning. We can't warn, so give an error.

/[a-[:digit:]]+/
Failed: error 150 at offset 4: invalid range in character class
Failed: error 150 at offset 12: invalid range in character class
a-a9-a

/[A-[:digit:]]+/
Failed: error 150 at offset 4: invalid range in character class
Failed: error 150 at offset 12: invalid range in character class
A-A9-A

/[a-\d]+/
Expand Down Expand Up @@ -16020,7 +16020,7 @@ Failed: error 128 at offset 63: atomic assertion expected after (?( or (?(?C)
.+(?(?C'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'))?!XXXX.=X

/[:[:alnum:]-[[a:lnum:]+/
Failed: error 150 at offset 11: invalid range in character class
Failed: error 150 at offset 12: invalid range in character class

/((?(?C'')\QX\E(?!((?(?C'')(?!X=X));=)r*X=X));=)/
Failed: error 128 at offset 11: atomic assertion expected after (?( or (?(?C)
Expand Down Expand Up @@ -16654,10 +16654,10 @@ Subject length lower bound = 3
------------------------------------------------------------------

/[Q-\N]/B,bad_escape_is_literal
Failed: error 150 at offset 5: invalid range in character class
Failed: error 171 at offset 5: \N is not supported in a class

/[\s-_]/bad_escape_is_literal
Failed: error 150 at offset 3: invalid range in character class
Failed: error 150 at offset 4: invalid range in character class

/[_-\s]/bad_escape_is_literal
Failed: error 150 at offset 5: invalid range in character class
Expand Down Expand Up @@ -16857,19 +16857,19 @@ No match
No match

/[[:digit:]-a]/
Failed: error 150 at offset 10: invalid range in character class
Failed: error 150 at offset 11: invalid range in character class

/[[:digit:]-[:print:]]/
Failed: error 150 at offset 10: invalid range in character class
Failed: error 150 at offset 11: invalid range in character class

/[\d-a]/
Failed: error 150 at offset 3: invalid range in character class
Failed: error 150 at offset 4: invalid range in character class

/[\H-z]/
Failed: error 150 at offset 3: invalid range in character class
Failed: error 150 at offset 4: invalid range in character class

/[\d-[:print:]]/
Failed: error 150 at offset 3: invalid range in character class
Failed: error 150 at offset 4: invalid range in character class

# Perl gets the second of these wrong, giving no match.

Expand Down Expand Up @@ -20619,7 +20619,7 @@ Failed: error 211 at offset 7: brackets needed to clarify operator precedence in
No match

/[\d-z]/B,alt_extended_class
Failed: error 150 at offset 3: invalid range in character class
Failed: error 150 at offset 4: invalid range in character class

/[z-\d]/B,alt_extended_class
Failed: error 150 at offset 5: invalid range in character class
Expand Down Expand Up @@ -20654,16 +20654,28 @@ Failed: error 207 at offset 118: character classes are too deeply nested
# --------------

/[[:digit:] -Z]/xx
Failed: error 150 at offset 10: invalid range in character class
Failed: error 150 at offset 14: invalid range in character class

/[\d -Z]/xx
Failed: error 150 at offset 3: invalid range in character class
Failed: error 150 at offset 7: invalid range in character class

/[[:digit:]\E-H]/
Failed: error 150 at offset 10: invalid range in character class
Failed: error 150 at offset 13: invalid range in character class

/[[:digit:]\Q\E-H]+/
Failed: error 150 at offset 10: invalid range in character class
Failed: error 150 at offset 15: invalid range in character class

/[z-[:space:]]/
Failed: error 150 at offset 12: invalid range in character class

/[z-\d]/
Failed: error 150 at offset 5: invalid range in character class

/[[:space:]-z]/
Failed: error 150 at offset 11: invalid range in character class

/[\d-z]/
Failed: error 150 at offset 4: invalid range in character class

# End of testinput2
Error -70: PCRE2_ERROR_BADDATA (unknown error number)
Expand Down
Loading

0 comments on commit 6185344

Please sign in to comment.