Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -1822,42 +1822,47 @@ void EmitAlternation(RegexNode node)
// the whole alternation can be treated as a simple switch, so we special-case that. However,
// we can't goto _into_ switch cases, which means we can't use this approach if there's any
// possibility of backtracking into the alternation.
bool useSwitchedBranches = false;
if ((node.Options & RegexOptions.RightToLeft) == 0)
if ((node.Options & RegexOptions.RightToLeft) == 0 &&
TryEmitAlternationAsSwitch())
{
useSwitchedBranches = isAtomic;
if (!useSwitchedBranches)
return;
}

EmitAllBranches();
return;

// Tries to emit an alternation as a switch on the first character of each branch.
// Returns true if the optimization was applied, false otherwise.
bool TryEmitAlternationAsSwitch()
{
// We can't use switched branches if there's any possibility of backtracking into the alternation.
if (!isAtomic)
{
useSwitchedBranches = true;
for (int i = 0; i < childCount; i++)
{
if (rm.Analysis.MayBacktrack(node.Child(i)))
{
useSwitchedBranches = false;
break;
return false;
}
}
}
}

// Detect whether every branch begins with one or more unique characters.
const int SetCharsSize = 64; // arbitrary limit; we want it to be large enough to handle ignore-case of common sets, like hex, the latin alphabet, etc.
Span<char> setChars = stackalloc char[SetCharsSize];
if (useSwitchedBranches)
{
// Detect whether every branch begins with one or more unique characters.
const int SetCharsSize = 64; // arbitrary limit; we want it to be large enough to handle ignore-case of common sets, like hex, the latin alphabet, etc.
Span<char> setChars = stackalloc char[SetCharsSize];

// Iterate through every branch, seeing if we can easily find a starting One, Multi, or small Set.
// If we can, extract its starting char (or multiple in the case of a set), validate that all such
// starting characters are unique relative to all the branches.
var seenChars = new HashSet<char>();
for (int i = 0; i < childCount && useSwitchedBranches; i++)
for (int i = 0; i < childCount; i++)
{
// Look for the guaranteed starting node that's a one, multi, set,
// or loop of one of those with at least one minimum iteration. We need to exclude notones.
if (node.Child(i).FindStartingLiteralNode(allowZeroWidth: false) is not RegexNode startingLiteralNode ||
startingLiteralNode.IsNotoneFamily)
{
useSwitchedBranches = false;
break;
return false;
}

// If it's a One or a Multi, get the first character and add it to the set.
Expand All @@ -1866,8 +1871,7 @@ void EmitAlternation(RegexNode node)
{
if (!seenChars.Add(startingLiteralNode.FirstCharOfOneOrMulti()))
{
useSwitchedBranches = false;
break;
return false;
}
}
else
Expand All @@ -1879,37 +1883,23 @@ void EmitAlternation(RegexNode node)
if (RegexCharClass.IsNegated(startingLiteralNode.Str!) ||
(numChars = RegexCharClass.GetSetChars(startingLiteralNode.Str!, setChars)) == 0)
{
useSwitchedBranches = false;
break;
return false;
}

// Check to make sure each of the chars is unique relative to all other branches examined.
foreach (char c in setChars.Slice(0, numChars))
{
if (!seenChars.Add(c))
{
useSwitchedBranches = false;
break;
return false;
}
}
}
}
}

if (useSwitchedBranches)
{
// Note: This optimization does not exist with RegexOptions.Compiled. Here we rely on the
// C# compiler to lower the C# switch statement with appropriate optimizations. In some
// cases there are enough branches that the compiler will emit a jump table. In others
// it'll optimize the order of checks in order to minimize the total number in the worst
// case. In any case, we get easier to read and reason about C#.
EmitSwitchedBranches();
return true;
}
else
{
EmitAllBranches();
}
return;

// Emits the code for a switch-based alternation of non-overlapping branches.
void EmitSwitchedBranches()
Expand All @@ -1921,7 +1911,8 @@ void EmitSwitchedBranches()
// Emit a switch statement on the first char of each branch.
using (EmitBlock(writer, $"switch ({sliceSpan}[{sliceStaticPos}])"))
{
Span<char> setChars = stackalloc char[SetCharsSize]; // needs to be same size as detection check in caller
const int SetCharsSize = 64; // arbitrary limit; we want it to be large enough to handle ignore-case of common sets, like hex, the latin alphabet, etc.
Span<char> setChars = stackalloc char[SetCharsSize];
int startingSliceStaticPos = sliceStaticPos;

// Emit a case for each branch.
Expand Down Expand Up @@ -2001,9 +1992,11 @@ void EmitSwitchedBranches()
break;
}

// This is only ever used for atomic alternations, so we can simply reset the doneLabel
// after emitting the child, as nothing will backtrack here (and we need to reset it
// so that all branches see the original).
// This is only ever used for alternations where no branch may backtrack
// (whether due to being atomic or simply because nothing in the branch
// can backtrack), so we can simply reset the doneLabel after emitting the
// child, as nothing will backtrack here (and we need to reset it so that
// all branches see the original).
doneLabel = originalDoneLabel;

// If we get here in the generated code, the branch completed successfully.
Expand Down
Loading
Loading