Implement ALT_EXTENDED_CLASS flag (#523)

* Move some existing character class code into pcre2_compile_class.c
* Add a new flag PCRE2_ALT_EXTENDED_CLASS to change the behaviour of parsing [...] character classes, to emit new META codes, and new OP_ECLASS codes for nested character classes with operators
* Document the behaviour relative to the UTS#18 standard
* No JIT support; it falls back to the interpreter. DFA is supported.
PCRE2Project · Oct 30, 2024 · fc38d9e · fc38d9e
1 parent 96f0653
commit fc38d9e
Show file tree

Hide file tree

Showing 38 changed files with 5,370 additions and 1,338 deletions.
diff --git a/HACKING b/HACKING
@@ -199,6 +199,9 @@ META_RANGE_ESCAPED    hyphen in class range with at least one escape
 META_RANGE_LITERAL    hyphen in class range defined literally
 META_SKIP             (*SKIP) - no argument (see below for with argument)
 META_THEN             (*THEN) - no argument (see below for with argument)
+META_ECLASS_OR        || in an extended character class
+META_ECLASS_AND       && in an extended character class
+META_ECLASS_SUB       -- in an extended character class
 
 The two RANGE values occur only in character classes. They are positioned
 between two literals that define the start and end of the range. In an EBCDIC

diff --git a/doc/html/pcre2_compile.html b/doc/html/pcre2_compile.html
@@ -57,6 +57,7 @@ <h1>pcre2_compile man page</h1>
   PCRE2_ALLOW_EMPTY_CLASS  Allow empty classes
   PCRE2_ALT_BSUX           Alternative handling of \u, \U, and \x
   PCRE2_ALT_CIRCUMFLEX     Alternative handling of ^ in multiline mode
+  PCRE2_ALT_EXTENDED_CLASS Alternative extended character class syntax
   PCRE2_ALT_VERBNAMES      Process backslashes in verb names
   PCRE2_AUTO_CALLOUT       Compile automatic callouts
   PCRE2_CASELESS           Do caseless matching

diff --git a/doc/html/pcre2api.html b/doc/html/pcre2api.html
@@ -1559,7 +1559,7 @@ <h1>pcre2api man page</h1>
 error has occurred.
 </P>
 <P>
-There are nearly 100 positive error codes that <b>pcre2_compile()</b> may return
+There are over 100 positive error codes that <b>pcre2_compile()</b> may return
 if it finds an error in the pattern. There are also some negative error codes
 that are used for invalid UTF strings when validity checking is in force. These
 are the same as given by <b>pcre2_match()</b> and <b>pcre2_dfa_match()</b>, and
@@ -1667,6 +1667,16 @@ <h1>pcre2api man page</h1>
 end of the subject, for compatibility with Perl. If you want a multiline
 circumflex also to match after a terminating newline, you must set
 PCRE2_ALT_CIRCUMFLEX.
+<pre>
+  PCRE2_ALT_EXTENDED_CLASS
+</pre>
+Alters the parsing of character classes to follow the extended syntax
+described by Unicode UTS#18. The PCRE2_ALT_EXTENDED_CLASS option has no impact
+on the behaviour of the Perl-specific "(?[...])" syntax for extended classes,
+but instead enables the alternative syntax of extended class behaviour inside
+ordinary "[...]" character classes. See the
+<a href="pcre2pattern.html"><b>pcre2pattern</b></a>
+documentation for details of the character classes supported.
 <pre>
   PCRE2_ALT_VERBNAMES
 </pre>

diff --git a/doc/html/pcre2pattern.html b/doc/html/pcre2pattern.html
diff --git a/doc/html/pcre2syntax.html b/doc/html/pcre2syntax.html
@@ -280,7 +280,7 @@ <h1>pcre2syntax man page</h1>
   RLI         right-to-left isolate
   RLO         right-to-left override
   S           segment separator
-  WS          which space
+  WS          white space
 </PRE>
 </P>
 <br><a name="SEC11" href="#TOC1">CHARACTER CLASSES</a><br>
@@ -421,7 +421,7 @@ <h1>pcre2syntax man page</h1>
   (?^)            unset imnrsx options
 </pre>
 (?aP) implies (?aT) as well, though this has no additional effect. However, it
-means that (?-aP) is really (?-PT) which disables all ASCII restrictions for
+means that (?-aP) also implies (?-aT) and disables all ASCII restrictions for
 POSIX classes.
 </P>
 <P>
@@ -436,10 +436,10 @@ <h1>pcre2syntax man page</h1>
 of the newline or \R sequences or options with similar syntax. More than one
 of them may appear. For the first three, d is a decimal number.
 <pre>
-  (*CASELESS_RESTRICT) set PCRE2_EXTRA_CASELESS_RESTRICT when matching
   (*LIMIT_DEPTH=d)     set the backtracking limit to d
   (*LIMIT_HEAP=d)      set the heap size limit to d * 1024 bytes
   (*LIMIT_MATCH=d)     set the match limit to d
+  (*CASELESS_RESTRICT) set PCRE2_EXTRA_CASELESS_RESTRICT when matching
   (*NOTEMPTY)          set PCRE2_NOTEMPTY when matching
   (*NOTEMPTY_ATSTART)  set PCRE2_NOTEMPTY_ATSTART when matching
   (*NO_AUTO_POSSESS)   no auto-possessification (PCRE2_NO_AUTO_POSSESS)
@@ -703,7 +703,7 @@ <h1>pcre2syntax man page</h1>
 </P>
 <br><a name="SEC34" href="#TOC1">REVISION</a><br>
 <P>
-Last updated: 24 September 2024
+Last updated: 20 October 2024
 <br>
 Copyright &copy; 1997-2024 University of Cambridge.
 <br>

diff --git a/doc/html/pcre2test.html b/doc/html/pcre2test.html
@@ -105,8 +105,8 @@ <h1>pcre2test man page</h1>
 <P>
 When testing the 16-bit or 32-bit libraries, there is a need to be able to
 generate character code points greater than 255 in the strings that are passed
-to the library. For subject lines, backslash escapes can be used. In addition,
-when the <b>utf</b> modifier (see
+to the library. For subject lines and some patterns, backslash escapes can be
+used. In addition, when the <b>utf</b> modifier (see
 <a href="#optionmodifiers">"Setting compilation options"</a>
 below) is set, the pattern and any following subject lines are interpreted as
 UTF-8 strings and translated to UTF-16 or UTF-32 as appropriate.
@@ -125,9 +125,8 @@ <h1>pcre2test man page</h1>
 than 0x7fffffff, but such values can be handled by the 32-bit library. When
 testing this library in non-UTF mode with <b>utf8_input</b> set, if any
 character is preceded by the byte 0xff (which is an invalid byte in UTF-8)
-0x80000000 is added to the character's value. This is the only way of passing
-such code points in a pattern string. For subject strings, using an escape
-sequence is preferable.
+0x80000000 is added to the character's value. For subject strings, using an
+escape sequence is preferable.
 </P>
 <br><a name="SEC4" href="#TOC1">COMMAND LINE OPTIONS</a><br>
 <P>
@@ -537,39 +536,48 @@ <h1>pcre2test man page</h1>
 <b>subject_literal</b> modifier was set for the pattern. The following provide a
 means of encoding non-printing characters in a visible way:
 <pre>
-  \a         alarm (BEL, \x07)
-  \b         backspace (\x08)
-  \e         escape (\x27)
-  \f         form feed (\x0c)
-  \n         newline (\x0a)
-  \r         carriage return (\x0d)
-  \t         tab (\x09)
-  \v         vertical tab (\x0b)
-  \nnn       octal character (up to 3 octal digits); always
-               a byte unless &#62; 255 in UTF-8 or 16-bit or 32-bit mode
-  \o{dd...}  octal character (any number of octal digits}
-  \xhh       hexadecimal byte (up to 2 hex digits)
-  \x{hh...}  hexadecimal character (any number of hex digits)
-</pre>
-The use of \x{hh...} is not dependent on the use of the <b>utf</b> modifier on
-the pattern. It is recognized always. There may be any number of hexadecimal
-digits inside the braces; invalid values provoke error messages.
-</P>
-<P>
-Note that \xhh specifies one byte rather than one character in UTF-8 mode;
-this makes it possible to construct invalid UTF-8 sequences for testing
-purposes. On the other hand, \x{hh} is interpreted as a UTF-8 character in
-UTF-8 mode, generating more than one byte if the value is greater than 127.
-When testing the 8-bit library not in UTF-8 mode, \x{hh} generates one byte
-for values that could fit on it, and causes an error for greater values.
-</P>
-<P>
-In UTF-16 mode, all 4-digit \x{hhhh} values are accepted. This makes it
-possible to construct invalid UTF-16 sequences for testing purposes.
-</P>
-<P>
-In UTF-32 mode, all 4- to 8-digit \x{...} values are accepted. This makes it
-possible to construct invalid UTF-32 sequences for testing purposes.
+  \a          alarm (BEL, \x07)
+  \b          backspace (\x08)
+  \e          escape (\x1b)
+  \f          form feed (\x0c)
+  \n          newline (\x0a)
+  \N{U+hh...} Unicode character (any number of hex digits)
+  \r          carriage return (\x0d)
+  \t          tab (\x09)
+  \v          vertical tab (\x0b)
+  \ddd        octal number (up to 3 octal digits); represents a single
+                code point unless larger than 255 with the 8-bit library
+  \o{dd...}   octal number (any number of octal digits) representing a
+                character in UTF mode or a code point
+  \xhh        hexadecimal byte (up to 2 hex digits)
+  \x{hh...}   hexadecimal number (up to 8 hex digits) representing a
+                character in UTF mode or a code point
+</pre>
+Invoking \N{U+hh...} or \x{hh...} doesn't require the use of the <b>utf</b>
+modifier on the pattern. It is always recognized. There may be any number of
+hexadecimal digits inside the braces; invalid values provoke error messages,
+except that some invalid Unicode characters given with \N{U+hh...} are
+accepted with a warning instead.
+</P>
+<P>
+Note that even in UTF-8 mode, \xhh (and, depending on how large it is, \ddd)
+describes one byte rather than one character; this makes it possible to
+construct invalid UTF-8 sequences for testing purposes. On the other hand,
+\x{hh...} is interpreted as a UTF-8 character in UTF-8 mode, only generating
+more than one byte if the value is greater than 127. To avoid the ambiguity
+it is preferred to use \N{U+hh...} when describing characters. When testing
+the 8-bit library not in UTF-8 mode, \x{hh} generates one byte for values
+that could fit in it, and causes an error for greater values.
+</P>
+<P>
+When testing the 16-bit library, not in UTF-16 mode, all 4-digit \x{hhhh}
+values are accepted. This makes it possible to construct invalid UTF-16
+sequences for testing purposes.
+</P>
+<P>
+When testing the 32-bit library, not in UTF-32 mode, all 4- to 8-digit \x{...}
+values are accepted. This makes it possible to construct invalid UTF-32
+sequences for testing purposes.
 </P>
 <P>
 There is a special backslash sequence that specifies replication of one or more
@@ -635,6 +643,7 @@ <h1>pcre2test man page</h1>
       allow_surrogate_escapes   set PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES
       alt_bsux                  set PCRE2_ALT_BSUX
       alt_circumflex            set PCRE2_ALT_CIRCUMFLEX
+      alt_extended_class        set PCRE2_ALT_EXTENDED_CLASS
       alt_verbnames             set PCRE2_ALT_VERBNAMES
       anchored                  set PCRE2_ANCHORED
   /a  ascii_all                 set all ASCII options
@@ -2261,7 +2270,7 @@ <h1>pcre2test man page</h1>
 </P>
 <br><a name="SEC21" href="#TOC1">REVISION</a><br>
 <P>
-Last updated: 04 October 2024
+Last updated: 16 October 2024
 <br>
 Copyright &copy; 1997-2024 University of Cambridge.
 <br>

diff --git a/doc/pcre2_compile.3 b/doc/pcre2_compile.3
@@ -45,6 +45,7 @@ The primary option bits are:
   PCRE2_ALLOW_EMPTY_CLASS  Allow empty classes
   PCRE2_ALT_BSUX           Alternative handling of \eu, \eU, and \ex
   PCRE2_ALT_CIRCUMFLEX     Alternative handling of ^ in multiline mode
+  PCRE2_ALT_EXTENDED_CLASS Alternative extended character class syntax
   PCRE2_ALT_VERBNAMES      Process backslashes in verb names
   PCRE2_AUTO_CALLOUT       Compile automatic callouts
   PCRE2_CASELESS           Do caseless matching

diff --git a/doc/pcre2api.3 b/doc/pcre2api.3
@@ -1493,7 +1493,7 @@ error code and an offset (number of code units) within the pattern,
 respectively, when \fBpcre2_compile()\fP returns NULL because a compilation
 error has occurred.
 .P
-There are nearly 100 positive error codes that \fBpcre2_compile()\fP may return
+There are over 100 positive error codes that \fBpcre2_compile()\fP may return
 if it finds an error in the pattern. There are also some negative error codes
 that are used for invalid UTF strings when validity checking is in force. These
 are the same as given by \fBpcre2_match()\fP and \fBpcre2_dfa_match()\fP, and
@@ -1601,6 +1601,18 @@ after any internal newline. However, it does not match after a newline at the
 end of the subject, for compatibility with Perl. If you want a multiline
 circumflex also to match after a terminating newline, you must set
 PCRE2_ALT_CIRCUMFLEX.
+.sp
+  PCRE2_ALT_EXTENDED_CLASS
+.sp
+Alters the parsing of character classes to follow the extended syntax
+described by Unicode UTS#18. The PCRE2_ALT_EXTENDED_CLASS option has no impact
+on the behaviour of the Perl-specific "(?[...])" syntax for extended classes,
+but instead enables the alternative syntax of extended class behaviour inside
+ordinary "[...]" character classes. See the
+.\" HREF
+\fBpcre2pattern\fP
+.\"
+documentation for details of the character classes supported.
 .sp
   PCRE2_ALT_VERBNAMES
 .sp

diff --git a/doc/pcre2pattern.3 b/doc/pcre2pattern.3
@@ -1547,6 +1547,52 @@ the next two sections), and the terminating closing square bracket. However,
 escaping other non-alphanumeric characters does no harm.
 .
 .
+.SH "UTS#18 EXTENDED CHARACTER CLASSES"
+.rs
+The PCRE2_ALT_EXTENDED_CLASS option enables an alternative to Perl's "(?[...])"
+syntax, allowing instead extended class behaviour inside ordinary "[...]"
+character classes. This altered syntax for "[...]" classes is loosely described
+by the Unicode standard UTS#18.
+.P
+Firstly, in Perl syntax, an expression such as "[a[]" is a character class
+with two literal characters "a" and "[", but in UTS#18 extended classes the "["
+character becomes an additional metacharacter within classes, denoting the start
+of a nested class, so a literal "[" must be escaped as "\e[".
+.P
+Secondly, within the UTS#18 extended syntax, there are additional operators
+"||", "&&" and "--" which denote character class union, intersection, and
+subtraction respectively. In standard Perl syntax, these would simply be
+needlessly-repeated literals (except for "-" which can denote a range). These
+operators can be used in constructs such as "[\ep{L}--[QW]]" for "Unicode
+letters, other than Q and W". A literal "-" at the start or end of a range
+must be escaped (so while "[--1]" in Perl syntax is the range from hyphen to
+"1", it must be escaped as "[\e--1]" in UTS#18 extended classes).
+.P
+The specific rules in PCRE2 are that classes can be nested:
+"[...[B]...[^C]...]". The individual class items (literal characters, literal
+ranges, properties such as \ed or \ep{...}, and nested classes) can be
+combined by juxtaposition or by an operator "||", "&&", or "--".
+Juxtaposition is the implicit union operator, and binds more tightly than any
+explicit operator. Precedence between the explicit operators is not defined,
+so mixing operators is a syntax error (thus "[A&&B--C]" is an error, but
+"[A&&[B--C]]" is accepted).
+.P
+This is an emerging syntax which is being adopted gradually across the regex
+ecosystem: for example JavaScript adopted the "/v" flag in ECMAScript 2024;
+Python's "re" module reserves the syntax for future use with a FutureWarning
+for unescaped use of "[" as a literal within character classes. Due to UTS#18
+providing insufficient guidance, engines interpret the syntax differently.
+Rust's "regex" crate and Python's "regex" PyPI module both implement UTS#18
+extended classes, but with slight incompatibilities ("[A||B&&C]" is parsed as
+"[A||[B&&C]]" in Python's "regex" but as "[[A||B]&&C]" in Rust's "regex").
+.P
+PCRE2's syntax adds restrictions similar to ECMAScript's /v flag, so
+that all the extended classes accepted as valid by PCRE2 have the property
+that they are interpreted either with the same behaviour, or as invalid, by
+all other major engines. Please file an issue if you are aware of cross-engine
+differences in behaviour between PCRE2 and another major engine.
+.
+.
 .SH "POSIX CHARACTER CLASSES"
 .rs
 .sp

diff --git a/doc/pcre2test.1 b/doc/pcre2test.1
@@ -598,6 +598,7 @@ for a description of the effects of these options.
       allow_surrogate_escapes   set PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES
       alt_bsux                  set PCRE2_ALT_BSUX
       alt_circumflex            set PCRE2_ALT_CIRCUMFLEX
+      alt_extended_class        set PCRE2_ALT_EXTENDED_CLASS
       alt_verbnames             set PCRE2_ALT_VERBNAMES
       anchored                  set PCRE2_ANCHORED
   /a  ascii_all                 set all ASCII options

diff --git a/src/pcre2.h.generic b/src/pcre2.h.generic
@@ -143,6 +143,7 @@ D   is inspected during pcre2_dfa_match() execution
 #define PCRE2_EXTENDED_MORE       0x01000000u  /* C       */
 #define PCRE2_LITERAL             0x02000000u  /* C       */
 #define PCRE2_MATCH_INVALID_UTF   0x04000000u  /*   J M D */
+#define PCRE2_ALT_EXTENDED_CLASS  0x08000000u  /* C       */
 
 /* An additional compile options word is available in the compile context. */
 
@@ -332,7 +333,12 @@ pcre2_pattern_convert(). */
 #define PCRE2_ERROR_EXTRA_CASING_REQUIRES_UNICODE  204
 #define PCRE2_ERROR_TURKISH_CASING_REQUIRES_UTF    205
 #define PCRE2_ERROR_EXTRA_CASING_INCOMPATIBLE      206
-
+#define PCRE2_ERROR_ECLASS_NEST_TOO_DEEP           207
+#define PCRE2_ERROR_ECLASS_INVALID_OPERATOR        208
+#define PCRE2_ERROR_ECLASS_UNEXPECTED_OPERATOR     209
+#define PCRE2_ERROR_ECLASS_EXPECTED_OPERAND        210
+#define PCRE2_ERROR_ECLASS_MIXED_OPERATORS         211
+#define PCRE2_ERROR_ECLASS_HINT_SQUARE_BRACKET     212
 
 /* "Expected" matching error codes: no match and partial match. */
 

diff --git a/src/pcre2.h.in b/src/pcre2.h.in
@@ -143,6 +143,7 @@ D   is inspected during pcre2_dfa_match() execution
 #define PCRE2_EXTENDED_MORE       0x01000000u  /* C       */
 #define PCRE2_LITERAL             0x02000000u  /* C       */
 #define PCRE2_MATCH_INVALID_UTF   0x04000000u  /*   J M D */
+#define PCRE2_ALT_EXTENDED_CLASS  0x08000000u  /* C       */
 
 /* An additional compile options word is available in the compile context. */
 
@@ -332,7 +333,12 @@ pcre2_pattern_convert(). */
 #define PCRE2_ERROR_EXTRA_CASING_REQUIRES_UNICODE  204
 #define PCRE2_ERROR_TURKISH_CASING_REQUIRES_UTF    205
 #define PCRE2_ERROR_EXTRA_CASING_INCOMPATIBLE      206
-
+#define PCRE2_ERROR_ECLASS_NEST_TOO_DEEP           207
+#define PCRE2_ERROR_ECLASS_INVALID_OPERATOR        208
+#define PCRE2_ERROR_ECLASS_UNEXPECTED_OPERATOR     209
+#define PCRE2_ERROR_ECLASS_EXPECTED_OPERAND        210
+#define PCRE2_ERROR_ECLASS_MIXED_OPERATORS         211
+#define PCRE2_ERROR_ECLASS_HINT_SQUARE_BRACKET     212
 
 /* "Expected" matching error codes: no match and partial match. */