diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
index 2b1a9014f4f0bf..527ef1b2e357b9 100644
--- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
+++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
@@ -1822,42 +1822,47 @@ void EmitAlternation(RegexNode node)
                 // the whole alternation can be treated as a simple switch, so we special-case that. However,
                 // we can't goto _into_ switch cases, which means we can't use this approach if there's any
                 // possibility of backtracking into the alternation.
-                bool useSwitchedBranches = false;
-                if ((node.Options & RegexOptions.RightToLeft) == 0)
+                if ((node.Options & RegexOptions.RightToLeft) == 0 &&
+                    TryEmitAlternationAsSwitch())
                 {
-                    useSwitchedBranches = isAtomic;
-                    if (!useSwitchedBranches)
+                    return;
+                }
+
+                EmitAllBranches();
+                return;
+
+                // Tries to emit an alternation as a switch on the first character of each branch.
+                // Returns true if the optimization was applied, false otherwise.
+                bool TryEmitAlternationAsSwitch()
+                {
+                    // We can't use switched branches if there's any possibility of backtracking into the alternation.
+                    if (!isAtomic)
                     {
-                        useSwitchedBranches = true;
                         for (int i = 0; i < childCount; i++)
                         {
                             if (rm.Analysis.MayBacktrack(node.Child(i)))
                             {
-                                useSwitchedBranches = false;
-                                break;
+                                return false;
                             }
                         }
                     }
-                }
 
-                // Detect whether every branch begins with one or more unique characters.
-                const int SetCharsSize = 64; // arbitrary limit; we want it to be large enough to handle ignore-case of common sets, like hex, the latin alphabet, etc.
-                Span<char> setChars = stackalloc char[SetCharsSize];
-                if (useSwitchedBranches)
-                {
+                    // Detect whether every branch begins with one or more unique characters.
+                    const int SetCharsSize = 64; // arbitrary limit; we want it to be large enough to handle ignore-case of common sets, like hex, the latin alphabet, etc.
+                    Span<char> setChars = stackalloc char[SetCharsSize];
+
                     // Iterate through every branch, seeing if we can easily find a starting One, Multi, or small Set.
                     // If we can, extract its starting char (or multiple in the case of a set), validate that all such
                     // starting characters are unique relative to all the branches.
                     var seenChars = new HashSet<char>();
-                    for (int i = 0; i < childCount && useSwitchedBranches; i++)
+                    for (int i = 0; i < childCount; i++)
                     {
                         // Look for the guaranteed starting node that's a one, multi, set,
                         // or loop of one of those with at least one minimum iteration. We need to exclude notones.
                         if (node.Child(i).FindStartingLiteralNode(allowZeroWidth: false) is not RegexNode startingLiteralNode ||
                             startingLiteralNode.IsNotoneFamily)
                         {
-                            useSwitchedBranches = false;
-                            break;
+                            return false;
                         }
 
                         // If it's a One or a Multi, get the first character and add it to the set.
@@ -1866,8 +1871,7 @@ void EmitAlternation(RegexNode node)
                         {
                             if (!seenChars.Add(startingLiteralNode.FirstCharOfOneOrMulti()))
                             {
-                                useSwitchedBranches = false;
-                                break;
+                                return false;
                             }
                         }
                         else
@@ -1879,8 +1883,7 @@ void EmitAlternation(RegexNode node)
                             if (RegexCharClass.IsNegated(startingLiteralNode.Str!) ||
                                 (numChars = RegexCharClass.GetSetChars(startingLiteralNode.Str!, setChars)) == 0)
                             {
-                                useSwitchedBranches = false;
-                                break;
+                                return false;
                             }
 
                             // Check to make sure each of the chars is unique relative to all other branches examined.
@@ -1888,28 +1891,15 @@ void EmitAlternation(RegexNode node)
                             {
                                 if (!seenChars.Add(c))
                                 {
-                                    useSwitchedBranches = false;
-                                    break;
+                                    return false;
                                 }
                             }
                         }
                     }
-                }
 
-                if (useSwitchedBranches)
-                {
-                    // Note: This optimization does not exist with RegexOptions.Compiled. Here we rely on the
-                    // C# compiler to lower the C# switch statement with appropriate optimizations. In some
-                    // cases there are enough branches that the compiler will emit a jump table. In others
-                    // it'll optimize the order of checks in order to minimize the total number in the worst
-                    // case. In any case, we get easier to read and reason about C#.
                     EmitSwitchedBranches();
+                    return true;
                 }
-                else
-                {
-                    EmitAllBranches();
-                }
-                return;
 
                 // Emits the code for a switch-based alternation of non-overlapping branches.
                 void EmitSwitchedBranches()
@@ -1921,7 +1911,8 @@ void EmitSwitchedBranches()
                     // Emit a switch statement on the first char of each branch.
                     using (EmitBlock(writer, $"switch ({sliceSpan}[{sliceStaticPos}])"))
                     {
-                        Span<char> setChars = stackalloc char[SetCharsSize]; // needs to be same size as detection check in caller
+                        const int SetCharsSize = 64; // arbitrary limit; we want it to be large enough to handle ignore-case of common sets, like hex, the latin alphabet, etc.
+                        Span<char> setChars = stackalloc char[SetCharsSize]; // needs to be same size as detection check in TryEmitAlternationAsSwitch
                         int startingSliceStaticPos = sliceStaticPos;
 
                         // Emit a case for each branch.
@@ -2001,9 +1992,11 @@ void EmitSwitchedBranches()
                                     break;
                             }
 
-                            // This is only ever used for atomic alternations, so we can simply reset the doneLabel
-                            // after emitting the child, as nothing will backtrack here (and we need to reset it
-                            // so that all branches see the original).
+                            // This is only ever used for alternations where no branch may backtrack
+                            // (whether due to being atomic or simply because nothing in the branch
+                            // can backtrack), so we can simply reset the doneLabel after emitting the
+                            // child, as nothing will backtrack here (and we need to reset it so that
+                            // all branches see the original).
                             doneLabel = originalDoneLabel;
 
                             // If we get here in the generated code, the branch completed successfully.
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
index 9a18bd270cf031..3a5bf0dcd39ce5 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
@@ -1678,6 +1678,19 @@ void EmitAlternation(RegexNode node)
                 Debug.Assert(node.Parent is not null);
                 bool isAtomic = analysis.IsAtomicByAncestor(node);
 
+                // If no child branch overlaps with another child branch, we can emit more streamlined code
+                // that avoids checking unnecessary branches, e.g. with abc|def|ghi if the next character in
+                // the input is 'a', we needn't try the def or ghi branches. A simple, relatively common case
+                // of this is if every branch begins with a specific, unique character, in which case
+                // the whole alternation can be treated as a simple switch, so we special-case that. However,
+                // we can't goto _into_ switch cases, which means we can't use this approach if there's any
+                // possibility of backtracking into the alternation.
+                if ((node.Options & RegexOptions.RightToLeft) == 0 &&
+                    TryEmitAlternationAsSwitch(node, childCount, isAtomic))
+                {
+                    return;
+                }
+
                 // Label to jump to when any branch completes successfully.
                 Label matchLabel = DefineLabel();
 
@@ -1866,6 +1879,252 @@ void EmitAlternation(RegexNode node)
                 Debug.Assert(sliceStaticPos == 0);
             }
 
+            // Tries to emit an alternation as a switch on the first character of each branch.
+            // Returns true if the optimization was applied, false otherwise.
+            bool TryEmitAlternationAsSwitch(RegexNode node, int childCount, bool isAtomic)
+            {
+                // We can't use switched branches if there's any possibility of backtracking into the alternation.
+                if (!isAtomic)
+                {
+                    for (int i = 0; i < childCount; i++)
+                    {
+                        if (analysis.MayBacktrack(node.Child(i)))
+                        {
+                            return false;
+                        }
+                    }
+                }
+
+                // Detect whether every branch begins with one or more unique characters.
+                const int SetCharsSize = 64; // arbitrary limit; we want it to be large enough to handle ignore-case of common sets, like hex, the latin alphabet, etc.
+                Span<char> setChars = stackalloc char[SetCharsSize];
+                var seenChars = new HashSet<char>();
+
+                // Iterate through every branch, seeing if we can easily find a starting One, Multi, or small Set.
+                // If we can, extract its starting char (or multiple in the case of a set), validate that all such
+                // starting characters are unique relative to all the branches.
+                for (int i = 0; i < childCount; i++)
+                {
+                    // Look for the guaranteed starting node that's a one, multi, set,
+                    // or loop of one of those with at least one minimum iteration. We need to exclude notones.
+                    if (node.Child(i).FindStartingLiteralNode(allowZeroWidth: false) is not RegexNode startingLiteralNode ||
+                        startingLiteralNode.IsNotoneFamily)
+                    {
+                        return false;
+                    }
+
+                    // If it's a One or a Multi, get the first character and add it to the set.
+                    // If it was already in the set, we can't apply this optimization.
+                    if (startingLiteralNode.IsOneFamily || startingLiteralNode.Kind is RegexNodeKind.Multi)
+                    {
+                        if (!seenChars.Add(startingLiteralNode.FirstCharOfOneOrMulti()))
+                        {
+                            return false;
+                        }
+                    }
+                    else
+                    {
+                        // The branch begins with a set. Make sure it's a set of only a few characters
+                        // and get them. If we can't, we can't apply this optimization.
+                        Debug.Assert(startingLiteralNode.IsSetFamily);
+                        int numChars;
+                        if (RegexCharClass.IsNegated(startingLiteralNode.Str!) ||
+                            (numChars = RegexCharClass.GetSetChars(startingLiteralNode.Str!, setChars)) == 0)
+                        {
+                            return false;
+                        }
+
+                        // Check to make sure each of the chars is unique relative to all other branches examined.
+                        foreach (char c in setChars.Slice(0, numChars))
+                        {
+                            if (!seenChars.Add(c))
+                            {
+                                return false;
+                            }
+                        }
+                    }
+                }
+
+                // Apply the Roslyn switch heuristic: emit an IL switch only if
+                // count_of_values >= 3 AND count_of_values / (max_value - min_value + 1) >= 0.5
+                int count = seenChars.Count;
+                if (count < 3)
+                {
+                    return false;
+                }
+
+                int minValue = int.MaxValue;
+                int maxValue = int.MinValue;
+                foreach (char c in seenChars)
+                {
+                    if (c < minValue) minValue = c;
+                    if (c > maxValue) maxValue = c;
+                }
+
+                int range = maxValue - minValue + 1;
+                if ((double)count / range < 0.5)
+                {
+                    return false;
+                }
+
+                // Emit switched branches using an IL switch instruction.
+                EmitSwitchedBranches(node, childCount, minValue, range);
+                return true;
+            }
+
+            // Emits the code for a switch-based alternation of non-overlapping branches.
+            void EmitSwitchedBranches(RegexNode node, int childCount, int minValue, int range)
+            {
+                const int SetCharsSize = 64; // needs to be same size as detection check in TryEmitAlternationAsSwitch
+                Span<char> setChars = stackalloc char[SetCharsSize];
+
+                Label originalDoneLabel = doneLabel;
+                Label matchLabel = DefineLabel();
+                int startingTextSpanPos = sliceStaticPos;
+
+                // We need at least 1 remaining character in the span, for the char to switch on.
+                EmitSpanLengthCheck(1);
+
+                // Build a map from character value to branch index
+                var charToBranchIndex = new Dictionary<char, int>();
+                for (int i = 0; i < childCount; i++)
+                {
+                    RegexNode child = node.Child(i);
+                    RegexNode? startingLiteralNode = child.FindStartingLiteralNode(allowZeroWidth: false);
+                    Debug.Assert(startingLiteralNode is not null, "Unexpectedly couldn't find the branch starting node.");
+
+                    if (startingLiteralNode.IsSetFamily)
+                    {
+                        int numChars = RegexCharClass.GetSetChars(startingLiteralNode.Str!, setChars);
+                        Debug.Assert(numChars != 0);
+                        foreach (char c in setChars.Slice(0, numChars))
+                        {
+                            charToBranchIndex[c] = i;
+                        }
+                    }
+                    else
+                    {
+                        charToBranchIndex[startingLiteralNode.FirstCharOfOneOrMulti()] = i;
+                    }
+                }
+
+                // Create labels for each branch
+                var branchLabels = new Label[childCount];
+                for (int i = 0; i < childCount; i++)
+                {
+                    branchLabels[i] = DefineLabel();
+                }
+
+                // Build the switch table: an array of labels indexed by (charValue - minValue)
+                var switchTable = new Label[range];
+                for (int i = 0; i < range; i++)
+                {
+                    char c = (char)(minValue + i);
+                    switchTable[i] = charToBranchIndex.TryGetValue(c, out int branchIndex) ? branchLabels[branchIndex] : originalDoneLabel;
+                }
+
+                // Load the first character of the slice, subtract minValue, and switch
+                // slice[sliceStaticPos]
+                Ldloca(slice);
+                Ldc(sliceStaticPos);
+                Call(SpanGetItemMethod);
+                LdindU2();
+
+                // Subtract minValue to get 0-based index
+                if (minValue != 0)
+                {
+                    Ldc(minValue);
+                    Sub();
+                }
+
+                // Emit the switch
+                Switch(switchTable);
+
+                // Default case: the character didn't match any branch
+                BrFar(originalDoneLabel);
+
+                // Emit the code for each branch
+                for (int i = 0; i < childCount; i++)
+                {
+                    MarkLabel(branchLabels[i]);
+                    sliceStaticPos = startingTextSpanPos;
+
+                    RegexNode child = node.Child(i);
+                    RegexNode? startingLiteralNode = child.FindStartingLiteralNode(allowZeroWidth: false);
+                    Debug.Assert(startingLiteralNode is not null, "Unexpectedly couldn't find the branch starting node.");
+
+                    // Emit the code for the branch, without the first character that was already matched in the switch.
+                    switch (child.Kind)
+                    {
+                        case RegexNodeKind.One:
+                        case RegexNodeKind.Set:
+                            // The character was handled entirely by the switch. No additional matching is needed.
+                            sliceStaticPos++;
+                            break;
+
+                        case RegexNodeKind.Multi:
+                            // First character was handled by the switch. Emit matching code for the remainder of the multi string.
+                            sliceStaticPos++;
+                            EmitNode(CreateSlicedMulti(child));
+                            break;
+
+                        case RegexNodeKind.Concatenate when child.Child(0) == startingLiteralNode && (startingLiteralNode.Kind is RegexNodeKind.One or RegexNodeKind.Set or RegexNodeKind.Multi):
+                            // This is a concatenation where its first node is the starting literal we found and that starting literal
+                            // is one of the nodes above that we know how to handle completely. This is a common
+                            // enough case that we want to special-case it to avoid duplicating the processing for that character
+                            // unnecessarily. First slice off the first character that was already handled. If that child is a multi, temporarily
+                            // replace it with a node that doesn't have the already-matched first character; otherwise, replace it with an empty node
+                            // that'll be ignored when rendered. Then emit the new tree, and subsequently restore the original child.
+                            sliceStaticPos++;
+                            RegexNode originalFirst = child.Child(0);
+                            child.ReplaceChild(0,
+                                child.Child(0).Kind is RegexNodeKind.Multi ?
+                                    CreateSlicedMulti(child.Child(0)) :
+                                    new RegexNode(RegexNodeKind.Empty, child.Options));
+                            try
+                            {
+                                EmitNode(child);
+                            }
+                            finally
+                            {
+                                child.ReplaceChild(0, originalFirst);
+                            }
+                            break;
+
+                        default:
+                            EmitNode(child);
+                            break;
+                    }
+
+                    // This is only ever used for alternations where no branch may backtrack
+                    // (whether due to being atomic or simply because nothing in the branch
+                    // can backtrack), so we can simply reset the doneLabel after emitting the
+                    // child, as nothing will backtrack here (and we need to reset it so that
+                    // all branches see the original).
+                    doneLabel = originalDoneLabel;
+
+                    // If we get here in the generated code, the branch completed successfully.
+                    // Before jumping to the end, we need to zero out sliceStaticPos, so that no
+                    // matter what the value is after the branch, whatever follows the alternate
+                    // will see the same sliceStaticPos.
+                    TransferSliceStaticPosToPos();
+                    BrFar(matchLabel);
+                }
+
+                // Successfully completed the alternate.
+                MarkLabel(matchLabel);
+                Debug.Assert(sliceStaticPos == 0);
+
+                // Creates a new Multi node with the first character sliced off
+                static RegexNode CreateSlicedMulti(RegexNode multi)
+                {
+                    Debug.Assert(multi.Kind is RegexNodeKind.Multi, $"Expected a Multi node, got {multi.Kind}");
+                    return multi.Str!.Length == 2 ?
+                        new(RegexNodeKind.One, multi.Options, multi.Str[1]) :
+                        new(RegexNodeKind.Multi, multi.Options, multi.Str.Substring(1));
+                }
+            }
+
             // Emits the code to handle a backreference.
             void EmitBackreference(RegexNode node)
             {
diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
index 33ffbc6a1bf7a1..b9722046901bf8 100644
--- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
+++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
@@ -778,6 +778,21 @@ public static IEnumerable Match_MemberData()
             }
             yield return ("[^a-z0-9]etag|[^a-z0-9]digest", "this string has .digest as a substring", RegexOptions.None, 16, 7, true, ".digest");
             yield return (@"(\w+|\d+)a+[ab]+", "123123aa", RegexOptions.None, 0, 8, true, "123123aa");
+
+            // Alternations with many branches starting with unique characters (switch optimization)
+            if (!RegexHelpers.IsNonBacktracking(engine))
+            {
+                yield return (@"(?>abc|bcd|cde|def|efg|fgh|ghi|hij)", "hij", RegexOptions.None, 0, 3, true, "hij");
+                yield return (@"(?>abc|bcd|cde|def|efg|fgh|ghi|hij)", "efg", RegexOptions.None, 0, 3, true, "efg");
+                yield return (@"(?>abc|bcd|cde|def|efg|fgh|ghi|hij)", "xyz", RegexOptions.None, 0, 3, false, "");
+                yield return (@"(?>abc|bcd|cde|def|efg|fgh|ghi|hij)", "ab", RegexOptions.None, 0, 2, false, "");
+                yield return (@"(?>abc|bcd|cde|def|efg|fgh|ghi|hij)", "abcdef", RegexOptions.None, 0, 6, true, "abc");
+                yield return (@"(?>a1|b2|c3|d4|e5|f6|g7|h8)", "e5", RegexOptions.None, 0, 2, true, "e5");
+                yield return (@"(?>a1|b2|c3|d4|e5|f6|g7|h8)", "h8", RegexOptions.None, 0, 2, true, "h8");
+                yield return (@"(?>a1|b2|c3|d4|e5|f6|g7|h8)", "a1", RegexOptions.None, 0, 2, true, "a1");
+                yield return (@"(?>a1|b2|c3|d4|e5|f6|g7|h8)", "z9", RegexOptions.None, 0, 2, false, "");
+            }
+
             foreach (string aOptional in new[] { "(a|)", "(|a)", "(a?)", "(a??)" })
             {
                 yield return (@$"^{aOptional}{{0,2}}?b", "aab", RegexOptions.None, 0, 3, true, "aab");