From 89b861021b52c9f14b85e0c87ab7627af511cccc Mon Sep 17 00:00:00 2001 From: Max Leske Date: Sat, 17 Feb 2024 18:59:47 +0100 Subject: [PATCH] feat: update to Go 1.22 Go 1.22 includes improved printing of regular expression flags in regexp/syntax. See https://github.com/golang/go/issues/57950 See https://go-review.googlesource.com/c/go/+/507015 --- regex/operators/assembler.go | 56 ++++++++++++++++++++----------- regex/operators/assembler_test.go | 24 ++++++------- regex/processors/assemble_test.go | 2 +- 3 files changed, 49 insertions(+), 33 deletions(-) diff --git a/regex/operators/assembler.go b/regex/operators/assembler.go index a19863b..49f9283 100644 --- a/regex/operators/assembler.go +++ b/regex/operators/assembler.go @@ -8,6 +8,7 @@ import ( "bytes" "errors" "fmt" + "regexp" "sort" "strings" @@ -18,12 +19,6 @@ import ( "github.com/coreruleset/crs-toolchain/regex/processors" ) -var metaGroupReplacements = map[string]string{ - "(?-s:.)": ".", - "(?m:^)": "^", - "(?m:$)": "$", -} - // Create the processor stack var processorStack ProcessorStack var processor processors.IProcessor @@ -136,7 +131,7 @@ func (a *Operator) complete(assembleParser *parser.Parser) string { result = a.runSimplificationAssembly(result) logger.Trace().Msgf("After simplification assembly: %s\n", result) result = a.useHexEscapes(result) - logger.Trace().Msgf("After simplification assembly: %s\n", result) + logger.Trace().Msgf("After use hex escapdes: %s\n", result) result = a.escapeDoublequotes(result) logger.Trace().Msgf("After escaping double quotes: %s\n", result) result = a.useHexBackslashes(result) @@ -217,12 +212,7 @@ func (a *Operator) useHexBackslashes(input string) string { // compatible engines. func (a *Operator) includeVerticalTabInSpaceClass(input string) string { logger.Trace().Msg("Fixing up regex to include \\v in white space class matches") - // Note: replacement order is important. Don't use a map. 
- result := strings.ReplaceAll(input, `[\t-\n\f-\r ]`, `[\s\v]`) - result = strings.ReplaceAll(result, `[^\t-\n\f-\r ]`, `[^\s\v]`) - // There's a range attached, can't just replace - result = strings.ReplaceAll(result, `\t-\n\f-\r -`, `\s\v -`) - return strings.ReplaceAll(result, `\t-\n\f-\r `, `\s\v`) + return strings.ReplaceAll(input, `\t\n\f\r `, `\s\v`) } // rassemble-go doesn't provide an option to specify literals. @@ -256,19 +246,45 @@ func (a *Operator) useHexEscapes(input string) string { return sb.String() } -// The Go regexp/syntax library will convert: -// - a dot (`.`) into `(?-s:.)` -// - a caret (`^`) into `(?m:^)` -// - a dollar (`$`) into (?m:$)` -// We want to retain the original dot. +// The Go regexp/syntax library will insert flags when it encounters +// meta characters that could be ambiguous, such as `^`, `$`, `.`. +// Remove both flags for the current context, e.g., `...(?m)...`, and flag groups +// applied to subexpressions, e.g., `...(?m:...)...` func (a *Operator) dontUseFlagsForMetaCharacters(input string) string { result := input - for needle, replacement := range metaGroupReplacements { - result = strings.ReplaceAll(result, needle, replacement) + flagsStartRegexp := regexp.MustCompile(`\(\?[-misU]+\)`) + result = flagsStartRegexp.ReplaceAllString(result, "") + + flagGroupStartRegexp := regexp.MustCompile(`\(\?[-misU]+:`) + for { + location := flagGroupStartRegexp.FindStringIndex(result) + if len(location) > 0 { + result = replaceFlagGroup(result, location) + } else { + break + } } return result } +// Remove flag groups like `...(?-s:...)...` +func replaceFlagGroup(input string, location []int) string { + parensCounter := 1 + groupStart := location[0] + bodyStart := location[1] + index := bodyStart + for ; parensCounter > 0; index++ { + char := input[index] + switch char { + case '(': + parensCounter++ + case ')': + parensCounter-- + } + } + return input[:groupStart] + input[bodyStart:index-1] + input[index:] +} + func (a 
*Operator) startPreprocessor(processorName string, args []string) error { logger.Trace().Msgf("Found processor %s start\n", processorName) switch processorName { diff --git a/regex/operators/assembler_test.go b/regex/operators/assembler_test.go index c0446e8..bf7ffa5 100644 --- a/regex/operators/assembler_test.go +++ b/regex/operators/assembler_test.go @@ -271,7 +271,7 @@ b` assembler := NewAssembler(s.ctx) output, err := assembler.Run(contents) s.Require().NoError(err) - s.Equal("a prefix[a-b]", output) + s.Equal("a prefix[ab]", output) } func (s *specialCommentsTestSuite) TestHandlesSuffixComment() { @@ -281,7 +281,7 @@ b` assembler := NewAssembler(s.ctx) output, err := assembler.Run(contents) s.Require().NoError(err) - s.Equal("[a-b]a suffix", output) + s.Equal("[ab]a suffix", output) } func (s *specialCasesTestSuite) TestIgnoresEmptyLines() { @@ -327,7 +327,7 @@ b\x5c\x48 assembler := NewAssembler(s.ctx) output, err := assembler.Run(contents) s.Require().NoError(err) - s.Equal(`[a-b]\x5cH`, output) + s.Equal(`[ab]\x5cH`, output) } func (s *specialCasesTestSuite) TestSpecialComments_HandlesEscapedAlternationsCorrectly() { @@ -568,7 +568,7 @@ d output, err := assembler.Run(contents) s.Require().NoError(err) - s.Equal(`[^0-9A-Z_a-z]*\(two(?:a+b|[c-d])`, output) + s.Equal(`[^0-9A-Z_a-z]*\(two(?:a+b|[cd])`, output) } @@ -582,7 +582,7 @@ d output, err := assembler.Run(contents) s.Require().NoError(err) - s.Equal(`(?:a+b|[c-d])[^0-9A-Z_a-z]*\(two`, output) + s.Equal(`(?:a+b|[cd])[^0-9A-Z_a-z]*\(two`, output) } func (s *assemblerTestSuite) TestAssemble_Assembling_3() { @@ -717,7 +717,7 @@ func (s *assemblerTestSuite) TestAssemble_ConcatenatingWithStoredInput() { output, err := assembler.Run(contents) s.Require().NoError(err) - s.Equal(`(?:\x5c|%(?:2f|5c))\.(?:%0[0-1])?(?:\x5c|%(?:2f|5c))`, output) + s.Equal(`(?:\x5c|%(?:2f|5c))\.(?:%0[01])?(?:\x5c|%(?:2f|5c))`, output) } @@ -807,7 +807,7 @@ d output, err := assembler.Run(contents) s.Require().NoError(err) - 
s.Equal(`[a-b][c-d]`, output) + s.Equal(`[ab][cd]`, output) } func (s *assemblerTestSuite) TestAssemble_ConcatenationWithPrefixAndSuffix() { @@ -825,7 +825,7 @@ b output, err := assembler.Run(contents) s.Require().NoError(err) - s.Equal(`prefix[a-b]suffix`, output) + s.Equal(`prefix[ab]suffix`, output) } func (s *assemblerTestSuite) TestAssemble_AssembleWrappedInGroupWithTailConcatenation() { @@ -845,7 +845,7 @@ more output, err := assembler.Run(contents) s.Require().NoError(err) - s.Equal(`[a-b][c-d]more`, output) + s.Equal(`[ab][cd]more`, output) } func (s *assemblerTestSuite) TestAssemble_AssembleWrappedInGroupWithTailAlternation() { @@ -863,7 +863,7 @@ more output, err := assembler.Run(contents) s.Require().NoError(err) - s.Equal(`[a-b][c-d]|more`, output) + s.Equal(`[ab][cd]|more`, output) } func (s *assemblerTestSuite) TestAssemble_NestedGroups() { @@ -885,7 +885,7 @@ func (s *assemblerTestSuite) TestAssemble_RemoveExtraGroups() { output, err := assembler.Run(contents) s.Require().NoError(err) - s.Equal(`a[b-c]d`, output) + s.Equal(`a[bc]d`, output) } // The Go regexp/syntax library will convert a dot (`.`) into `(?-s:.)`. @@ -913,7 +913,7 @@ func (s *assemblerTestSuite) TestAssemble_DotRemainsDotWithSflag() { } // The Go regexp/syntax library will convert a caret (`^`) into `(?m:^)`. -// We want to retain the original dot. +// We want to retain the original without the flag. 
func (s *assemblerTestSuite) TestAssemble_CaretRemainsCaret() { contents := "^a|b" assembler := NewAssembler(s.ctx) diff --git a/regex/processors/assemble_test.go b/regex/processors/assemble_test.go index 45c0404..eca3ffe 100644 --- a/regex/processors/assemble_test.go +++ b/regex/processors/assemble_test.go @@ -82,7 +82,7 @@ func (s *assembleTestSuite) TestAssemble_RegularExpressions() { s.Require().NoError(err) s.Len(output, 1) - s.Equal("(?:(?:home[,r]|(?-s:.)imps[a-c]{2}n))", output[0]) + s.Equal("(?:(?:(?-s:home[,r]|.imps[a-c]{2}n)))", output[0]) } func (s *assembleTestSuite) TestAssemble_InvalidRegularExpressionFails() {