Skip to content

Commit

Permalink
feat: update to Go 1.22
Browse files Browse the repository at this point in the history
Go 1.22 includes improved printing of regular expression flags in
regexp/syntax.

See golang/go#57950
See https://go-review.googlesource.com/c/go/+/507015
  • Loading branch information
theseion committed Feb 17, 2024
1 parent bbdbed1 commit 89b8610
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 33 deletions.
56 changes: 36 additions & 20 deletions regex/operators/assembler.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"bytes"
"errors"
"fmt"
"regexp"
"sort"
"strings"

Expand All @@ -18,12 +19,6 @@ import (
"github.com/coreruleset/crs-toolchain/regex/processors"
)

var metaGroupReplacements = map[string]string{
"(?-s:.)": ".",
"(?m:^)": "^",
"(?m:$)": "$",
}

// Create the processor stack
var processorStack ProcessorStack
var processor processors.IProcessor
Expand Down Expand Up @@ -136,7 +131,7 @@ func (a *Operator) complete(assembleParser *parser.Parser) string {
result = a.runSimplificationAssembly(result)
logger.Trace().Msgf("After simplification assembly: %s\n", result)
result = a.useHexEscapes(result)
logger.Trace().Msgf("After simplification assembly: %s\n", result)
logger.Trace().Msgf("After use hex escapdes: %s\n", result)
result = a.escapeDoublequotes(result)
logger.Trace().Msgf("After escaping double quotes: %s\n", result)
result = a.useHexBackslashes(result)
Expand Down Expand Up @@ -217,12 +212,7 @@ func (a *Operator) useHexBackslashes(input string) string {
// compatible engines.
func (a *Operator) includeVerticalTabInSpaceClass(input string) string {
	logger.Trace().Msg("Fixing up regex to include \\v in white space class matches")
	// The white space class members are printed differently depending on the
	// Go version: older versions print two-element ranges (`\t-\n\f-\r `),
	// Go 1.22 prints them as plain members (`\t\n\f\r `). Handle both so the
	// result does not depend on the toolchain.
	result := input
	for _, members := range []string{`\t-\n\f-\r `, `\t\n\f\r `} {
		// Note: replacement order is important.
		// There's a range attached to the trailing space (e.g. ` -~`), can't
		// just replace: the space must stay so that `\v` does not become the
		// lower bound of that range.
		result = strings.ReplaceAll(result, members+`-`, `\s\v -`)
		// Plain and negated classes (`[...]`, `[^...]`) and any remaining
		// occurrences.
		result = strings.ReplaceAll(result, members, `\s\v`)
	}
	return result
}

// rassemble-go doesn't provide an option to specify literals.
Expand Down Expand Up @@ -256,19 +246,45 @@ func (a *Operator) useHexEscapes(input string) string {
return sb.String()
}

// The Go regexp/syntax library will convert:
// - a dot (`.`) into `(?-s:.)`
// - a caret (`^`) into `(?m:^)`
// - a dollar (`$`) into (?m:$)`
// We want to retain the original dot.
// The Go regexp/syntax library will convert insert flags when it encounters
// meta characters that could be ambiguous, such as `^`, `$`, `.`.
// Remove both flags for the current context, e.g., `...(?m)...`, and flag groups
// applied to subexpressions, e.g., `...(?m:...)...`
func (a *Operator) dontUseFlagsForMetaCharacters(input string) string {
result := input
for needle, replacement := range metaGroupReplacements {
result = strings.ReplaceAll(result, needle, replacement)
flagsStartRegexp := regexp.MustCompile(`\(\?[-misU]+\)`)
result = flagsStartRegexp.ReplaceAllString(result, "")

flagGroupStartRegexp := regexp.MustCompile(`\(\?[-misU]+:`)
for {
location := flagGroupStartRegexp.FindStringIndex(result)
if len(location) > 0 {
result = replaceFlagGroup(result, location)
} else {
break
}
}
return result
}

// Remove flag groups like `...(?-s:...)...`, keeping the body of the group.
//
// location is the [start, end) index pair of the group opening (e.g., `(?-s:`)
// within input, as returned by regexp.FindStringIndex. The function looks for
// the parenthesis that closes the group and returns input without the opening
// and without that closing parenthesis.
//
// Escaped characters (`\(`, `\)`) and parentheses inside character classes
// (`[()]`) are literals and do not take part in the nesting count. If the
// group is never closed, only the opening is removed; the input always
// shrinks, so callers that loop until no group is left terminate. An invalid
// location leaves input unchanged.
func replaceFlagGroup(input string, location []int) string {
	if len(location) < 2 {
		return input
	}
	groupStart, bodyStart := location[0], location[1]
	if groupStart < 0 || bodyStart < groupStart || bodyStart > len(input) {
		return input
	}

	depth := 1
	inClass := false
	for index := bodyStart; index < len(input); index++ {
		switch input[index] {
		case '\\':
			// Skip the escaped character, it can't open or close anything.
			index++
		case '[':
			if !inClass {
				inClass = true
				// A `]` directly after `[` or `[^` is a member of the class,
				// it does not close it.
				next := index + 1
				if next < len(input) && input[next] == '^' {
					next++
				}
				if next < len(input) && input[next] == ']' {
					index = next
				}
			}
		case ']':
			inClass = false
		case '(':
			if !inClass {
				depth++
			}
		case ')':
			if inClass {
				continue
			}
			depth--
			if depth == 0 {
				return input[:groupStart] + input[bodyStart:index] + input[index+1:]
			}
		}
	}

	// No closing parenthesis found: drop the opening only.
	return input[:groupStart] + input[bodyStart:]
}

func (a *Operator) startPreprocessor(processorName string, args []string) error {
logger.Trace().Msgf("Found processor %s start\n", processorName)
switch processorName {
Expand Down
24 changes: 12 additions & 12 deletions regex/operators/assembler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,7 @@ b`
assembler := NewAssembler(s.ctx)
output, err := assembler.Run(contents)
s.Require().NoError(err)
s.Equal("a prefix[a-b]", output)
s.Equal("a prefix[ab]", output)
}

func (s *specialCommentsTestSuite) TestHandlesSuffixComment() {
Expand All @@ -281,7 +281,7 @@ b`
assembler := NewAssembler(s.ctx)
output, err := assembler.Run(contents)
s.Require().NoError(err)
s.Equal("[a-b]a suffix", output)
s.Equal("[ab]a suffix", output)
}

func (s *specialCasesTestSuite) TestIgnoresEmptyLines() {
Expand Down Expand Up @@ -327,7 +327,7 @@ b\x5c\x48
assembler := NewAssembler(s.ctx)
output, err := assembler.Run(contents)
s.Require().NoError(err)
s.Equal(`[a-b]\x5cH`, output)
s.Equal(`[ab]\x5cH`, output)
}

func (s *specialCasesTestSuite) TestSpecialComments_HandlesEscapedAlternationsCorrectly() {
Expand Down Expand Up @@ -568,7 +568,7 @@ d

output, err := assembler.Run(contents)
s.Require().NoError(err)
s.Equal(`[^0-9A-Z_a-z]*\(two(?:a+b|[c-d])`, output)
s.Equal(`[^0-9A-Z_a-z]*\(two(?:a+b|[cd])`, output)

}

Expand All @@ -582,7 +582,7 @@ d

output, err := assembler.Run(contents)
s.Require().NoError(err)
s.Equal(`(?:a+b|[c-d])[^0-9A-Z_a-z]*\(two`, output)
s.Equal(`(?:a+b|[cd])[^0-9A-Z_a-z]*\(two`, output)

}
func (s *assemblerTestSuite) TestAssemble_Assembling_3() {
Expand Down Expand Up @@ -717,7 +717,7 @@ func (s *assemblerTestSuite) TestAssemble_ConcatenatingWithStoredInput() {
output, err := assembler.Run(contents)
s.Require().NoError(err)

s.Equal(`(?:\x5c|%(?:2f|5c))\.(?:%0[0-1])?(?:\x5c|%(?:2f|5c))`, output)
s.Equal(`(?:\x5c|%(?:2f|5c))\.(?:%0[01])?(?:\x5c|%(?:2f|5c))`, output)

}

Expand Down Expand Up @@ -807,7 +807,7 @@ d
output, err := assembler.Run(contents)
s.Require().NoError(err)

s.Equal(`[a-b][c-d]`, output)
s.Equal(`[ab][cd]`, output)

}
func (s *assemblerTestSuite) TestAssemble_ConcatenationWithPrefixAndSuffix() {
Expand All @@ -825,7 +825,7 @@ b
output, err := assembler.Run(contents)
s.Require().NoError(err)

s.Equal(`prefix[a-b]suffix`, output)
s.Equal(`prefix[ab]suffix`, output)

}
func (s *assemblerTestSuite) TestAssemble_AssembleWrappedInGroupWithTailConcatenation() {
Expand All @@ -845,7 +845,7 @@ more
output, err := assembler.Run(contents)
s.Require().NoError(err)

s.Equal(`[a-b][c-d]more`, output)
s.Equal(`[ab][cd]more`, output)

}
func (s *assemblerTestSuite) TestAssemble_AssembleWrappedInGroupWithTailAlternation() {
Expand All @@ -863,7 +863,7 @@ more
output, err := assembler.Run(contents)
s.Require().NoError(err)

s.Equal(`[a-b][c-d]|more`, output)
s.Equal(`[ab][cd]|more`, output)

}
func (s *assemblerTestSuite) TestAssemble_NestedGroups() {
Expand All @@ -885,7 +885,7 @@ func (s *assemblerTestSuite) TestAssemble_RemoveExtraGroups() {
output, err := assembler.Run(contents)
s.Require().NoError(err)

s.Equal(`a[b-c]d`, output)
s.Equal(`a[bc]d`, output)
}

// The Go regexp/syntax library will convert a dot (`.`) into `(?-s:.)`.
Expand Down Expand Up @@ -913,7 +913,7 @@ func (s *assemblerTestSuite) TestAssemble_DotRemainsDotWithSflag() {
}

// The Go regexp/syntax library will convert a caret (`^`) into `(?m:^)`.
// We want to retain the original dot.
// We want to retain the original without the flag.
func (s *assemblerTestSuite) TestAssemble_CaretRemainsCaret() {
contents := "^a|b"
assembler := NewAssembler(s.ctx)
Expand Down
2 changes: 1 addition & 1 deletion regex/processors/assemble_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ func (s *assembleTestSuite) TestAssemble_RegularExpressions() {

s.Require().NoError(err)
s.Len(output, 1)
s.Equal("(?:(?:home[,r]|(?-s:.)imps[a-c]{2}n))", output[0])
s.Equal("(?:(?:(?-s:home[,r]|.imps[a-c]{2}n)))", output[0])
}

func (s *assembleTestSuite) TestAssemble_InvalidRegularExpressionFails() {
Expand Down

0 comments on commit 89b8610

Please sign in to comment.