Update for regex documentation and improved matching detection of regex names.
MaxSagebaum · Aug 2, 2024 · fb3a1ee · fb3a1ee
1 parent 64637d8
commit fb3a1ee
Show file tree

Hide file tree

Showing 5 changed files with 54 additions and 55 deletions.
diff --git a/docs/cpp2/metafunctions.md b/docs/cpp2/metafunctions.md
@@ -374,7 +374,7 @@ A `cpp1_rule_of_zero` type is one that has no user-written copy/move/destructor
 
 #### `regex`
 
-Replaces fields in the class with regular expression objects. Each field starting with `regex` is replaced with a regular expression of the same type.
+Replaces fields in the class with regular expression objects. All fields named `regex` or starting with `regex_` are replaced with a regular expression of the same type.
 
 ``` cpp title="Regular expression example"
 name_matcher: @regex type
@@ -401,7 +401,6 @@ main: (args) = {
 
     std::cout << "Case insensitive match: " << m.regex_no_case.search("blubabABblah").group(0) << std::endl;
 }
-
 ```
 
 The regex syntax used by cppfront is the [Perl syntax](https://perldoc.perl.org/perlre). Most of the syntax is available. Currently we do not support Unicode characters and the syntax tokens associated with them. All of the available regex syntax is listed in [supported features](../other/regex_status.md).

diff --git a/docs/other/regex_status.md b/docs/other/regex_status.md
@@ -10,7 +10,7 @@ The listings are taken from [perl regex docs](https://perldoc.perl.org/perlre).
  - [x] m                Treat the string being matched against as multiple lines. That is, change "^" and "$" from matching the start of the string's first line and the end of its last line to matching the start and end of each line within the string.
  - [x] s                Treat the string as single line. That is, change "." to match any character whatsoever, even a newline, which normally it would not match.
  - [x] x and xx         Extend your pattern's legibility by permitting whitespace and comments. Details in "/x and /xx"
- - [x] n                Prevent the grouping metacharacters () from capturing. This modifier, new in 5.22, will stop $1, $2, etc... from being filled in.
+ - [x] n                Prevent the grouping metacharacters () from capturing. This modifier will stop $1, $2, etc... from being filled in.
  - [ ] c                keep the current position during repeated matching
 ```
 
@@ -54,37 +54,37 @@ The listings are taken from [perl regex docs](https://perldoc.perl.org/perlre).
 
 ### Character Classes and other Special Escapes __(Complete)__
 ```
- - [x] [...]     [1]  Match a character according to the rules of the
+ - [x] [...]        Match a character according to the rules of the
                     bracketed character class defined by the "...".
                     Example: [a-z] matches "a" or "b" or "c" ... or "z"
- - [x] [[:...:]] [2]  Match a character according to the rules of the POSIX
+ - [x] [[:...:]]    Match a character according to the rules of the POSIX
                     character class "..." within the outer bracketed
                     character class.  Example: [[:upper:]] matches any
                     uppercase character.
- - [x] \g1       [5]  Backreference to a specific or previous group,
- - [x] \g{-1}    [5]  The number may be negative indicating a relative
-                  previous group and may optionally be wrapped in
-                  curly brackets for safer parsing.
- - [x] \g{name}  [5]  Named backreference
- - [x] \k<name>  [5]  Named backreference
- - [x] \k'name'  [5]  Named backreference
- - [x] \k{name}  [5]  Named backreference
- - [x] \w        [3]  Match a "word" character (alphanumeric plus "_", plus
+ - [x] \g1          Backreference to a specific or previous group,
+ - [x] \g{-1}       The number may be negative indicating a relative
+                    previous group and may optionally be wrapped in
+                    curly brackets for safer parsing.
+ - [x] \g{name}     Named backreference
+ - [x] \k<name>     Named backreference
+ - [x] \k'name'     Named backreference
+ - [x] \k{name}     Named backreference
+ - [x] \w           Match a "word" character (alphanumeric plus "_", plus
                     other connector punctuation chars plus Unicode
                     marks)
- - [x] \W        [3]  Match a non-"word" character
- - [x] \s        [3]  Match a whitespace character
- - [x] \S        [3]  Match a non-whitespace character
- - [x] \d        [3]  Match a decimal digit character
- - [x] \D        [3]  Match a non-digit character
- - [x] \v        [3]  Vertical whitespace
- - [x] \V        [3]  Not vertical whitespace
- - [x] \h        [3]  Horizontal whitespace
- - [x] \H        [3]  Not horizontal whitespace
- - [x] \1        [5]  Backreference to a specific capture group or buffer.
+ - [x] \W           Match a non-"word" character
+ - [x] \s           Match a whitespace character
+ - [x] \S           Match a non-whitespace character
+ - [x] \d           Match a decimal digit character
+ - [x] \D           Match a non-digit character
+ - [x] \v           Vertical whitespace
+ - [x] \V           Not vertical whitespace
+ - [x] \h           Horizontal whitespace
+ - [x] \H           Not horizontal whitespace
+ - [x] \1           Backreference to a specific capture group or buffer.
                     '1' may actually be any positive integer.
- - [x] \N        [7]  Any character but \n.  Not affected by /s modifier
- - [x] \K        [6]  Keep the stuff left of the \K, don't include it in $&
+ - [x] \N           Any character but \n.  Not affected by /s modifier
+ - [x] \K           Keep the stuff left of the \K, don't include it in $&
 ```
 
 ### Assertions
@@ -95,7 +95,7 @@ The listings are taken from [perl regex docs](https://perldoc.perl.org/perlre).
  - [x] \Z     Match only at end of string, or before newline at the end
  - [x] \z     Match only at end of string
  - [ ] \G     Match only at pos() (e.g. at the end-of-match position
-          of prior m//g)
+              of prior m//g)
 ```
 
 ### Capture groups __(Complete)__
@@ -157,7 +157,7 @@ The listings are taken from [perl regex docs](https://perldoc.perl.org/perlre).
 ### Modifiers
 ```
  - [ ] p                Preserve the string matched such that ${^PREMATCH}, ${^MATCH}, and ${^POSTMATCH} are available for use after matching.
- - [ ] a, d, l, and u   These modifiers, all new in 5.14, affect which character-set rules (Unicode, etc.) are used, as described below in "Character set modifiers".
+ - [ ] a, d, l, and u   These modifiers affect which character-set rules (Unicode, etc.) are used, as described below in "Character set modifiers".
  - [ ] g                globally match the pattern repeatedly in the string
  - [ ] e                evaluate the right-hand side as an expression
  - [ ] ee               evaluate the right side as a string then eval the result
@@ -180,11 +180,11 @@ The listings are taken from [perl regex docs](https://perldoc.perl.org/perlre).
 
 ### Character Classes and other Special Escapes
 ```
- - [ ]  (?[...])  [8]  Extended bracketed character class
- - [ ] \pP       [3]  Match P, named property.  Use \p{Prop} for longer names
- - [ ] \PP       [3]  Match non-P
- - [ ] \X        [4]  Match Unicode "eXtended grapheme cluster"
- - [ ] \R        [4]  Linebreak
+ - [ ]  (?[...])  Extended bracketed character class
+ - [ ] \pP        Match P, named property.  Use \p{Prop} for longer names
+ - [ ] \PP        Match non-P
+ - [ ] \X         Match Unicode "eXtended grapheme cluster"
+ - [ ] \R         Linebreak
 ```
 
 ### Assertions
@@ -208,4 +208,4 @@ The listings are taken from [perl regex docs](https://perldoc.perl.org/perlre).
  - [ ] (*sr:pattern)                 All chars in pattern need to be of the same script.
  - [ ] (*atomic_script_run:pattern)  Without backtracking.
  - [ ] (*asr:pattern)                Without backtracking.
-```
+```
diff --git a/regression-tests/test-results/msvc-2022-c++latest/pure2-regex_10_escapes.cpp.execution b/regression-tests/test-results/msvc-2022-c++latest/pure2-regex_10_escapes.cpp.execution
@@ -9,26 +9,26 @@ Running tests_10_escapes:
 08_y: OK regex: foo(\h)bar parsed_regex: foo(\h)bar str: foo	bar result_expr: $1 expected_results 	
 09_y: OK regex: (\H)(\h) parsed_regex: (\H)(\h) str: foo	bar result_expr: $1-$2 expected_results o-	
 10_y: OK regex: (\h)(\H) parsed_regex: (\h)(\H) str: foo	bar result_expr: $1-$2 expected_results 	-b
-11_y: OK regex: foo(\v+)bar parsed_regex: foo(\v+)bar str: foo
-
+11_y: OK regex: foo(\v+)bar parsed_regex: foo(\v+)bar str: foo
 
-bar result_expr: $1 expected_results 
-
 
+bar result_expr: $1 expected_results
 
-12_y: OK regex: (\V+)(\v) parsed_regex: (\V+)(\v) str: foo
-
 
-bar result_expr: $1-$2 expected_results foo-
-13_y: OK regex: (\v+)(\V) parsed_regex: (\v+)(\V) str: foo
-
 
-bar result_expr: $1-$2 expected_results 
-
+12_y: OK regex: (\V+)(\v) parsed_regex: (\V+)(\v) str: foo
+
+
+bar result_expr: $1-$2 expected_results foo-
+13_y: OK regex: (\v+)(\V) parsed_regex: (\v+)(\V) str: foo
+
+
+bar result_expr: $1-$2 expected_results
+
 
 -b
-14_y: OK regex: foo(\v)bar parsed_regex: foo(\v)bar str: foobar result_expr: $1 expected_results 
-15_y: OK regex: (\V)(\v) parsed_regex: (\V)(\v) str: foobar result_expr: $1-$2 expected_results o-
+14_y: OK regex: foo(\v)bar parsed_regex: foo(\v)bar str: foobar result_expr: $1 expected_results
+15_y: OK regex: (\V)(\v) parsed_regex: (\V)(\v) str: foobar result_expr: $1-$2 expected_results o-
 16_y: OK regex: (\v)(\V) parsed_regex: (\v)(\V) str: foobar result_expr: $1-$2 expected_results -b
 17_y: OK regex: foo\t\n\r\f\a\ebar parsed_regex: foo\t\n\r\f\a\ebar str: foo	
 bar result_expr: $& expected_results foo	

diff --git a/source/reflect.h b/source/reflect.h
@@ -2097,22 +2097,22 @@ auto print(cpp2::impl::in<meta::type_declaration> t) -> void
 auto regex_gen(meta::type_declaration& t) -> void
 {
     auto has_default {false}; 
-    auto prefix {"regex"}; 
-    std::string postfix {"_mod"};           // TODO: remove mod syntax when 'm.initializer()' can be '("pat", "mod")'
+    auto exact_name {"regex"}; 
+    auto prefix {"regex_"}; 
     std::map<std::string,std::string> expressions {}; 
 
     for ( auto& m : CPP2_UFCS(get_member_objects)(t) ) 
     {
         std::string name {CPP2_UFCS(name)(m)}; 
 
-        if (CPP2_UFCS(starts_with)(name, prefix)) 
+        if (CPP2_UFCS(starts_with)(name, prefix) || name == exact_name) 
         {
             if (!(CPP2_UFCS(has_initializer)(m))) {
                 CPP2_UFCS(error)(t, "Regular expression must have an initializer.");
             }
             CPP2_UFCS(mark_for_removal_from_enclosing_type)(m);
 
-            if (name == prefix) {
+            if (name == exact_name) {
                 if (has_default) {
                     CPP2_UFCS(error)(t, "Type can only contain one default named regular expression.");
                 }

diff --git a/source/reflect.h2 b/source/reflect.h2
@@ -1460,22 +1460,22 @@ print: (t: meta::type_declaration) =
 regex_gen: (inout t: meta::type_declaration) =
 {
     has_default := false;
-    prefix      := "regex";
-    postfix     : std::string = "_mod";     // TODO: remove mod syntax when 'm.initializer()' can be '("pat", "mod")'
+    exact_name  := "regex";
+    prefix      := "regex_";
     expressions : std::map<std::string, std::string> = ();
 
     for t.get_member_objects() do (inout m)
     {
         name: std::string = m.name();
 
-        if name.starts_with(prefix)
+        if name.starts_with(prefix) || name == exact_name
         {
             if !m.has_initializer() {
                 t.error("Regular expression must have an initializer.");
             }
             m.mark_for_removal_from_enclosing_type();
 
-            if name == prefix {
+            if name == exact_name {
                 if has_default {
                     t.error("Type can only contain one default named regular expression.");
                 }