in src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs [685:1417]
private static void EmitTryFindNextPossibleStartingPosition(IndentedTextWriter writer, RegexMethod rm, Dictionary<string, string[]> requiredHelpers, bool checkOverflow)
{
RegexOptions options = rm.Options;
RegexTree regexTree = rm.Tree;
bool rtl = (options & RegexOptions.RightToLeft) != 0;
// In some cases, we need to emit declarations at the beginning of the method, but we only discover we need them later.
// To handle that, we build up a collection of all the declarations to include, track where they should be inserted,
// and then insert them at that position once everything else has been output.
var additionalDeclarations = new HashSet<string>();
// Emit locals initialization
writer.WriteLine("int pos = base.runtextpos;");
writer.Flush();
int additionalDeclarationsPosition = ((StringWriter)writer.InnerWriter).GetStringBuilder().Length;
int additionalDeclarationsIndent = writer.Indent;
writer.WriteLine();
const string NoMatchFound = "NoMatchFound";
bool findEndsInAlwaysReturningTrue = false;
bool noMatchFoundLabelNeeded = false;
// Generate length check. If the input isn't long enough to possibly match, fail quickly.
// It's rare for min required length to be 0, so we don't bother special-casing the check,
// especially since we want the "return false" code regardless.
int minRequiredLength = rm.Tree.FindOptimizations.MinRequiredLength;
Debug.Assert(minRequiredLength >= 0);
FinishEmitBlock clause = default;
if (minRequiredLength > 0)
{
writer.WriteLine(minRequiredLength == 1 ?
"// Empty matches aren't possible." :
$"// Any possible match is at least {minRequiredLength} characters.");
clause = EmitBlock(writer, (minRequiredLength, rtl) switch
{
(1, false) => "if ((uint)pos < (uint)inputSpan.Length)",
(_, false) => $"if (pos <= inputSpan.Length - {minRequiredLength})",
(1, true) => "if (pos > 0)",
(_, true) => $"if (pos >= {minRequiredLength})",
});
}
using (clause)
{
// Emit any anchors.
if (!EmitAnchors())
{
// Either anchors weren't specified, or they don't completely root all matches to a specific location.
// Emit the code for whatever find mode has been determined.
switch (regexTree.FindOptimizations.FindMode)
{
case FindNextStartingPositionMode.LeadingString_LeftToRight:
case FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight:
case FindNextStartingPositionMode.FixedDistanceString_LeftToRight:
EmitIndexOfString_LeftToRight();
break;
case FindNextStartingPositionMode.LeadingString_RightToLeft:
EmitIndexOfString_RightToLeft();
break;
case FindNextStartingPositionMode.LeadingStrings_LeftToRight:
case FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight:
EmitIndexOfStrings_LeftToRight();
break;
case FindNextStartingPositionMode.LeadingSet_LeftToRight:
case FindNextStartingPositionMode.FixedDistanceSets_LeftToRight:
EmitFixedSet_LeftToRight();
break;
case FindNextStartingPositionMode.LeadingSet_RightToLeft:
EmitFixedSet_RightToLeft();
break;
case FindNextStartingPositionMode.LiteralAfterLoop_LeftToRight:
EmitLiteralAfterAtomicLoop();
break;
default:
Debug.Fail($"Unexpected mode: {regexTree.FindOptimizations.FindMode}");
goto case FindNextStartingPositionMode.NoSearch;
case FindNextStartingPositionMode.NoSearch:
writer.WriteLine("return true;");
findEndsInAlwaysReturningTrue = true;
break;
}
}
}
// If the main path is guaranteed to end in a "return true;" and nothing is going to
// jump past it, we don't need a "return false;" path.
if (minRequiredLength > 0 || !findEndsInAlwaysReturningTrue || noMatchFoundLabelNeeded)
{
writer.WriteLine();
writer.WriteLine("// No match found.");
if (noMatchFoundLabelNeeded)
{
writer.WriteLine($"{NoMatchFound}:");
}
writer.WriteLine($"base.runtextpos = {(!rtl ? "inputSpan.Length" : "0")};");
writer.WriteLine("return false;");
}
// We're done. Patch up any additional declarations.
InsertAdditionalDeclarations(writer, additionalDeclarations, additionalDeclarationsPosition, additionalDeclarationsIndent);
return;
// Emit a goto for the specified label.
void Goto(string label) => writer.WriteLine($"goto {label};");
// Emits any anchors. Returns true if the anchor roots any match to a specific location and thus no further
// searching is required; otherwise, false.
bool EmitAnchors()
{
// Anchors that fully implement TryFindNextPossibleStartingPosition, with a check that leads to immediate success or failure determination.
switch (regexTree.FindOptimizations.FindMode)
{
case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Beginning:
writer.WriteLine("// The pattern leads with a beginning (\\A) anchor.");
using (EmitBlock(writer, "if (pos == 0)"))
{
// If we're at the beginning, we're at a possible match location. Otherwise,
// we'll never be, so fail immediately.
writer.WriteLine("return true;");
}
return true;
case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Start:
case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Start:
writer.Write($"// The pattern leads with a start (\\G) anchor");
if (regexTree.FindOptimizations.FindMode == FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Start)
{
writer.Write(" when processed right to left.");
}
writer.WriteLine(".");
using (EmitBlock(writer, "if (pos == base.runtextstart)"))
{
// For both left-to-right and right-to-left, if we're currently at the start,
// we're at a possible match location. Otherwise, because we've already moved
// beyond it, we'll never be, so fail immediately.
writer.WriteLine("return true;");
}
return true;
case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_EndZ:
writer.WriteLine("// The pattern leads with an end (\\Z) anchor.");
using (EmitBlock(writer, "if (pos < inputSpan.Length - 1)"))
{
// If we're not currently at the end (or a newline just before it), skip ahead
// since nothing until then can possibly match.
writer.WriteLine("base.runtextpos = inputSpan.Length - 1;");
}
writer.WriteLine("return true;");
findEndsInAlwaysReturningTrue = true;
return true;
case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_End:
writer.WriteLine("// The pattern leads with an end (\\z) anchor.");
using (EmitBlock(writer, "if (pos < inputSpan.Length)"))
{
// If we're not currently at the end (or a newline just before it), skip ahead
// since nothing until then can possibly match.
writer.WriteLine("base.runtextpos = inputSpan.Length;");
}
writer.WriteLine("return true;");
findEndsInAlwaysReturningTrue = true;
return true;
case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Beginning:
writer.WriteLine("// The pattern leads with a beginning (\\A) anchor when processed right to left.");
using (EmitBlock(writer, "if (pos != 0)"))
{
// If we're not currently at the beginning, skip ahead (or, rather, backwards)
// since nothing until then can possibly match. (We're iterating from the end
// to the beginning in RightToLeft mode.)
writer.WriteLine("base.runtextpos = 0;");
}
writer.WriteLine("return true;");
findEndsInAlwaysReturningTrue = true;
return true;
case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_EndZ:
writer.WriteLine("// The pattern leads with an end (\\Z) anchor when processed right to left.");
using (EmitBlock(writer, "if (pos >= inputSpan.Length - 1 && ((uint)pos >= (uint)inputSpan.Length || inputSpan[pos] == '\\n'))"))
{
// If we're currently at the end, we're at a valid position to try. Otherwise,
// we'll never be (we're iterating from end to beginning), so fail immediately.
writer.WriteLine("return true;");
}
return true;
case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_End:
writer.WriteLine("// The pattern leads with an end (\\z) anchor when processed right to left.");
using (EmitBlock(writer, "if (pos >= inputSpan.Length)"))
{
// If we're currently at the end, we're at a valid position to try. Otherwise,
// we'll never be (we're iterating from end to beginning), so fail immediately.
writer.WriteLine("return true;");
}
return true;
case FindNextStartingPositionMode.TrailingAnchor_FixedLength_LeftToRight_EndZ:
// Jump to the end, minus the min required length, which in this case is actually the fixed length, minus 1 (for a possible ending \n).
writer.WriteLine($"// The pattern has a trailing end (\\Z) anchor, and any possible match is exactly {regexTree.FindOptimizations.MinRequiredLength} characters.");
using (EmitBlock(writer, $"if (pos < inputSpan.Length - {regexTree.FindOptimizations.MinRequiredLength + 1})"))
{
writer.WriteLine($"base.runtextpos = inputSpan.Length - {regexTree.FindOptimizations.MinRequiredLength + 1};");
}
writer.WriteLine("return true;");
findEndsInAlwaysReturningTrue = true;
return true;
case FindNextStartingPositionMode.TrailingAnchor_FixedLength_LeftToRight_End:
// Jump to the end, minus the min required length, which in this case is actually the fixed length.
writer.WriteLine($"// The pattern has a trailing end (\\z) anchor, and any possible match is exactly {regexTree.FindOptimizations.MinRequiredLength} characters.");
using (EmitBlock(writer, $"if (pos < inputSpan.Length - {regexTree.FindOptimizations.MinRequiredLength})"))
{
writer.WriteLine($"base.runtextpos = inputSpan.Length - {regexTree.FindOptimizations.MinRequiredLength};");
}
writer.WriteLine("return true;");
findEndsInAlwaysReturningTrue = true;
return true;
}
// Now handle anchors that boost the position but may not determine immediate success or failure.
switch (regexTree.FindOptimizations.LeadingAnchor)
{
case RegexNodeKind.Bol:
// Optimize the handling of a Beginning-Of-Line (BOL) anchor. BOL is special, in that unlike
// other anchors like Beginning, there are potentially multiple places a BOL can match. So unlike
// the other anchors, which all skip all subsequent processing if found, with BOL we just use it
// to boost our position to the next line, and then continue normally with any searches.
writer.WriteLine($"// The pattern has a leading beginning-of-line anchor.");
using (EmitBlock(writer, "if (pos > 0 && inputSpan[pos - 1] != '\\n')"))
{
writer.WriteLine("int newlinePos = inputSpan.Slice(pos).IndexOf('\\n');");
using (EmitBlock(writer, "if ((uint)newlinePos > inputSpan.Length - pos - 1)"))
{
noMatchFoundLabelNeeded = true;
Goto(NoMatchFound);
}
writer.WriteLine("pos += newlinePos + 1;");
writer.WriteLine();
// We've updated the position. Make sure there's still enough room in the input for a possible match.
using (EmitBlock(writer, minRequiredLength switch
{
0 => "if (pos > inputSpan.Length)",
1 => "if (pos >= inputSpan.Length)",
_ => $"if (pos > inputSpan.Length - {minRequiredLength})"
}))
{
noMatchFoundLabelNeeded = true;
Goto(NoMatchFound);
}
}
writer.WriteLine();
break;
}
switch (regexTree.FindOptimizations.TrailingAnchor)
{
case RegexNodeKind.End when regexTree.FindOptimizations.MaxPossibleLength is int maxLength:
writer.WriteLine($"// The pattern has a trailing end (\\z) anchor, and any possible match is no more than {maxLength} characters.");
using (EmitBlock(writer, $"if (pos < inputSpan.Length - {maxLength})"))
{
writer.WriteLine($"pos = inputSpan.Length - {maxLength};");
}
writer.WriteLine();
break;
case RegexNodeKind.EndZ when regexTree.FindOptimizations.MaxPossibleLength is int maxLength:
writer.WriteLine($"// The pattern has a trailing end (\\Z) anchor, and any possible match is no more than {maxLength} characters.");
using (EmitBlock(writer, $"if (pos < inputSpan.Length - {maxLength + 1})"))
{
writer.WriteLine($"pos = inputSpan.Length - {maxLength + 1};");
}
writer.WriteLine();
break;
}
return false;
}
// Emits a case-sensitive left-to-right search for a substring.
void EmitIndexOfString_LeftToRight()
{
RegexFindOptimizations opts = regexTree.FindOptimizations;
string substring = "", stringComparison = "Ordinal", offset = "", offsetDescription = "";
switch (opts.FindMode)
{
case FindNextStartingPositionMode.LeadingString_LeftToRight:
substring = regexTree.FindOptimizations.LeadingPrefix;
offsetDescription = "at the beginning of the pattern";
Debug.Assert(!string.IsNullOrEmpty(substring));
break;
case FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight:
substring = regexTree.FindOptimizations.LeadingPrefix;
stringComparison = "OrdinalIgnoreCase";
offsetDescription = "ordinal case-insensitive at the beginning of the pattern";
Debug.Assert(!string.IsNullOrEmpty(substring));
break;
case FindNextStartingPositionMode.FixedDistanceString_LeftToRight:
Debug.Assert(!string.IsNullOrEmpty(regexTree.FindOptimizations.FixedDistanceLiteral.String));
substring = regexTree.FindOptimizations.FixedDistanceLiteral.String;
if (regexTree.FindOptimizations.FixedDistanceLiteral is { Distance: > 0 } literal)
{
offset = $" + {literal.Distance}";
offsetDescription = $" at index {literal.Distance} in the pattern";
}
break;
default:
Debug.Fail($"Unexpected mode: {opts.FindMode}");
break;
}
string substringAndComparison = $"{substring}_{stringComparison}";
string fieldName = "s_indexOfString_";
fieldName = IsValidInFieldName(substring) ?
fieldName + substringAndComparison :
GetSHA256FieldName(fieldName, substringAndComparison);
if (!requiredHelpers.ContainsKey(fieldName))
{
requiredHelpers.Add(fieldName,
[
$"/// <summary>Supports searching for the string {EscapeXmlComment(Literal(substring))}.</summary>",
$"internal static readonly SearchValues<string> {fieldName} = SearchValues.Create([{Literal(substring)}], StringComparison.{stringComparison});",
]);
}
writer.WriteLine($"// The pattern has the literal {Literal(substring)} {offsetDescription}. Find the next occurrence.");
writer.WriteLine($"// If it can't be found, there's no match.");
writer.WriteLine($"int i = inputSpan.Slice(pos{offset}).IndexOfAny({HelpersTypeName}.{fieldName});");
using (EmitBlock(writer, "if (i >= 0)"))
{
writer.WriteLine("base.runtextpos = pos + i;");
writer.WriteLine("return true;");
}
// Determines whether its ok to embed the string in the field name.
// This is the same algorithm used by Roslyn.
static bool IsValidInFieldName(string s)
{
foreach (char c in s)
{
if (char.GetUnicodeCategory(c) is not
(UnicodeCategory.UppercaseLetter or
UnicodeCategory.LowercaseLetter or
UnicodeCategory.TitlecaseLetter or
UnicodeCategory.ModifierLetter or
UnicodeCategory.LetterNumber or
UnicodeCategory.OtherLetter or
UnicodeCategory.DecimalDigitNumber or
UnicodeCategory.ConnectorPunctuation or
UnicodeCategory.SpacingCombiningMark or
UnicodeCategory.NonSpacingMark or
UnicodeCategory.Format))
{
return false;
}
}
return true;
}
}
// Emits a case-sensitive left-to-right search for any one of multiple leading prefixes.
void EmitIndexOfStrings_LeftToRight()
{
RegexFindOptimizations opts = regexTree.FindOptimizations;
Debug.Assert(opts.FindMode is FindNextStartingPositionMode.LeadingStrings_LeftToRight or FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight);
string prefixes = string.Join(", ", opts.LeadingPrefixes.Select(prefix => Literal(prefix)));
StringComparison stringComparison = opts.FindMode is FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight ?
StringComparison.OrdinalIgnoreCase :
StringComparison.Ordinal;
string fieldName = GetSHA256FieldName($"s_indexOfAnyStrings_{stringComparison}_", prefixes);
if (!requiredHelpers.ContainsKey(fieldName))
{
requiredHelpers.Add(fieldName,
[
$"/// <summary>Supports searching for the specified strings.</summary>",
$"internal static readonly SearchValues<string> {fieldName} = SearchValues.Create([{prefixes}], StringComparison.{stringComparison});", // explicitly using an array in case prefixes is large
]);
}
writer.WriteLine($"// The pattern has multiple strings that could begin the match. Search for any of them.");
writer.WriteLine($"// If none can be found, there's no match.");
writer.WriteLine($"int i = inputSpan.Slice(pos).IndexOfAny({HelpersTypeName}.{fieldName});");
using (EmitBlock(writer, "if (i >= 0)"))
{
writer.WriteLine("base.runtextpos = pos + i;");
writer.WriteLine("return true;");
}
}
// Emits a case-sensitive right-to-left search for a substring.
void EmitIndexOfString_RightToLeft()
{
string prefix = regexTree.FindOptimizations.LeadingPrefix;
writer.WriteLine($"// The pattern begins with a literal {Literal(prefix)}. Find the next occurrence right-to-left.");
writer.WriteLine($"// If it can't be found, there's no match.");
writer.WriteLine($"pos = inputSpan.Slice(0, pos).LastIndexOf({Literal(prefix)});");
using (EmitBlock(writer, "if (pos >= 0)"))
{
writer.WriteLine($"base.runtextpos = pos + {prefix.Length};");
writer.WriteLine($"return true;");
}
}
// Emits a search for a set at a fixed position from the start of the pattern,
// and potentially other sets at other fixed positions in the pattern.
void EmitFixedSet_LeftToRight()
{
Debug.Assert(regexTree.FindOptimizations.FixedDistanceSets is { Count: > 0 });
List<RegexFindOptimizations.FixedDistanceSet>? sets = regexTree.FindOptimizations.FixedDistanceSets;
RegexFindOptimizations.FixedDistanceSet primarySet = sets![0];
const int MaxSets = 4;
int setsToUse = Math.Min(sets.Count, MaxSets);
writer.WriteLine(primarySet.Distance == 0 ?
$"// The pattern begins with {DescribeSet(primarySet.Set)}." :
$"// The pattern matches {DescribeSet(primarySet.Set)} at index {primarySet.Distance}.");
writer.WriteLine($"// Find the next occurrence. If it can't be found, there's no match.");
// Use IndexOf{Any} to accelerate the skip loop via vectorization to match the first prefix.
// But we avoid using it for the relatively common case of the starting set being '.', aka anything other than
// a newline, as it's very rare to have long, uninterrupted sequences of newlines. And we avoid using it
// for the case of the starting set being anything (e.g. '.' with SingleLine), as in that case it'll always match
// the first char.
int setIndex = 0;
bool canUseIndexOf =
primarySet.Set != RegexCharClass.NotNewLineClass &&
primarySet.Set != RegexCharClass.AnyClass;
bool needLoop = !canUseIndexOf || setsToUse > 1;
FinishEmitBlock loopBlock = default;
if (needLoop)
{
writer.WriteLine("ReadOnlySpan<char> span = inputSpan.Slice(pos);");
string upperBound = "span.Length" + (setsToUse > 1 || primarySet.Distance != 0 ? $" - {minRequiredLength - 1}" : "");
loopBlock = EmitBlock(writer, $"for (int i = 0; i < {upperBound}; i++)");
}
if (canUseIndexOf)
{
string span = needLoop ?
"span" :
"inputSpan.Slice(pos)";
span = (needLoop, primarySet.Distance) switch
{
(false, 0) => span,
(true, 0) => $"{span}.Slice(i)",
(false, _) => $"{span}.Slice({primarySet.Distance})",
(true, _) => $"{span}.Slice(i + {primarySet.Distance})",
};
// Get the IndexOf* expression to use to perform the search.
string indexOf;
if (primarySet.Chars is not null)
{
Debug.Assert(primarySet.Chars.Length > 0);
// We have a chars array, so we can use IndexOf{Any}{Except} to search for it. Choose the best overload.
string indexOfName = "IndexOf", indexOfAnyName = "IndexOfAny";
if (primarySet.Negated)
{
indexOfName = indexOfAnyName = "IndexOfAnyExcept";
}
indexOf = primarySet.Chars.Length switch
{
// 1, 2, 3 have dedicated optimized IndexOfAny overloads
1 => $"{span}.{indexOfName}({Literal(primarySet.Chars[0])})",
2 => $"{span}.{indexOfAnyName}({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])})",
3 => $"{span}.{indexOfAnyName}({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])}, {Literal(primarySet.Chars[2])})",
// 4, 5 have dedicated optimized IndexOfAny overloads accessible via the ReadOnlySpan<char> overload,
// but can also be handled via SearchValues
4 or 5 => $"{span}.{indexOfAnyName}({EmitSearchValuesOrLiteral(primarySet.Chars, requiredHelpers)})",
// > 5 can only be handled efficiently via SearchValues
_ => $"{span}.{indexOfAnyName}({EmitSearchValues(primarySet.Chars, requiredHelpers)})",
};
}
else if (primarySet.Range is not null)
{
// We have a range, so we can use IndexOfAny{Except}InRange to search for it. In the corner case,
// where we end up with a set of a single char, we can use IndexOf instead.
indexOf = (primarySet.Range.Value.LowInclusive == primarySet.Range.Value.HighInclusive, primarySet.Negated) switch
{
(false, false) => $"{span}.IndexOfAnyInRange({Literal(primarySet.Range.Value.LowInclusive)}, {Literal(primarySet.Range.Value.HighInclusive)})",
(true, false) => $"{span}.IndexOf({Literal(primarySet.Range.Value.LowInclusive)})",
(false, true) => $"{span}.IndexOfAnyExceptInRange({Literal(primarySet.Range.Value.LowInclusive)}, {Literal(primarySet.Range.Value.HighInclusive)})",
(true, true) => $"{span}.IndexOfAnyExcept({Literal(primarySet.Range.Value.LowInclusive)})",
};
}
else if (RegexCharClass.IsUnicodeCategoryOfSmallCharCount(primarySet.Set, out char[]? setChars, out bool negated, out string? description))
{
// We have a known set of characters, and we can use the supplied IndexOfAny{Except}(...).
indexOf = $"{span}.{(negated ? "IndexOfAnyExcept" : "IndexOfAny")}({EmitSearchValues(setChars, requiredHelpers, $"s_{description}")})";
}
else
{
// We have an arbitrary set of characters that's really large or otherwise not enumerable.
// We use a custom IndexOfAny helper that will perform the search as efficiently as possible.
indexOf = $"{span}.{EmitIndexOfAnyCustomHelper(primarySet.Set, requiredHelpers, checkOverflow)}()";
}
if (needLoop)
{
writer.WriteLine($"int indexOfPos = {indexOf};");
using (EmitBlock(writer, "if (indexOfPos < 0)"))
{
noMatchFoundLabelNeeded = true;
Goto(NoMatchFound);
}
writer.WriteLine("i += indexOfPos;");
writer.WriteLine();
if (setsToUse > 1)
{
// Of the remaining sets we're going to check, find the maximum distance of any of them.
// If it's further than the primary set we checked, we need a bounds check.
int maxDistance = sets[1].Distance;
for (int i = 2; i < setsToUse; i++)
{
maxDistance = Math.Max(maxDistance, sets[i].Distance);
}
if (maxDistance > primarySet.Distance)
{
int numRemainingSets = setsToUse - 1;
writer.WriteLine($"// The primary set being searched for was found. {numRemainingSets} more set{(numRemainingSets > 1 ? "s" : "")} will be checked so as");
writer.WriteLine($"// to minimize the number of places TryMatchAtCurrentPosition is run unnecessarily.");
writer.WriteLine($"// Make sure {(numRemainingSets > 1 ? "they fit" : "it fits")} in the remainder of the input.");
using (EmitBlock(writer, $"if ((uint)(i + {maxDistance}) >= (uint)span.Length)"))
{
noMatchFoundLabelNeeded = true;
Goto(NoMatchFound);
}
writer.WriteLine();
}
}
}
else
{
writer.WriteLine($"int i = {indexOf};");
using (EmitBlock(writer, "if (i >= 0)"))
{
writer.WriteLine("base.runtextpos = pos + i;");
writer.WriteLine("return true;");
}
}
setIndex = 1;
}
if (needLoop)
{
Debug.Assert(setIndex is 0 or 1);
bool hasCharClassConditions = false;
if (setIndex < setsToUse)
{
// if (CharInClass(textSpan[i + charClassIndex], prefix[0], "...") &&
// ...)
Debug.Assert(needLoop);
int start = setIndex;
for (; setIndex < setsToUse; setIndex++)
{
string spanIndex = $"span[i{(sets[setIndex].Distance > 0 ? $" + {sets[setIndex].Distance}" : "")}]";
string charInClassExpr = MatchCharacterClass(spanIndex, sets[setIndex].Set, negate: false, additionalDeclarations, requiredHelpers);
if (setIndex == start)
{
writer.Write($"if ({charInClassExpr}");
}
else
{
writer.WriteLine(" &&");
writer.Write($" {charInClassExpr}");
}
}
writer.WriteLine(")");
hasCharClassConditions = true;
}
using (hasCharClassConditions ? EmitBlock(writer, null) : default)
{
writer.WriteLine("base.runtextpos = pos + i;");
writer.WriteLine("return true;");
}
}
loopBlock.Dispose();
}
// Emits a right-to-left search for a set at a fixed position from the start of the pattern.
// (Currently that position will always be a distance of 0, meaning the start of the pattern itself.)
void EmitFixedSet_RightToLeft()
{
Debug.Assert(regexTree.FindOptimizations.FixedDistanceSets is { Count: > 0 });
RegexFindOptimizations.FixedDistanceSet set = regexTree.FindOptimizations.FixedDistanceSets![0];
Debug.Assert(set.Distance == 0);
writer.WriteLine($"// The pattern begins with {DescribeSet(set.Set)}.");
writer.WriteLine($"// Find the next occurrence. If it can't be found, there's no match.");
if (set.Chars is { Length: 1 })
{
Debug.Assert(!set.Negated);
writer.WriteLine($"pos = inputSpan.Slice(0, pos).LastIndexOf({Literal(set.Chars[0])});");
using (EmitBlock(writer, "if (pos >= 0)"))
{
writer.WriteLine("base.runtextpos = pos + 1;");
writer.WriteLine("return true;");
}
}
else
{
using (EmitBlock(writer, "while ((uint)--pos < (uint)inputSpan.Length)"))
{
using (EmitBlock(writer, $"if ({MatchCharacterClass("inputSpan[pos]", set.Set, negate: false, additionalDeclarations, requiredHelpers)})"))
{
writer.WriteLine("base.runtextpos = pos + 1;");
writer.WriteLine("return true;");
}
}
}
}
// Emits a search for a literal following a leading atomic single-character loop.
void EmitLiteralAfterAtomicLoop()
{
Debug.Assert(regexTree.FindOptimizations.LiteralAfterLoop is not null);
(RegexNode LoopNode, (char Char, string? String, StringComparison StringComparison, char[]? Chars) Literal) target = regexTree.FindOptimizations.LiteralAfterLoop.Value;
Debug.Assert(target.LoopNode.Kind is RegexNodeKind.Setloop or RegexNodeKind.Setlazy or RegexNodeKind.Setloopatomic);
Debug.Assert(target.LoopNode.N == int.MaxValue);
string stringComparisonComment = target.Literal.StringComparison == StringComparison.OrdinalIgnoreCase ? "ordinal case-insensitive " : "";
string stringComparisonArgument = target.Literal.StringComparison == StringComparison.OrdinalIgnoreCase ? ", StringComparison.OrdinalIgnoreCase" : "";
writer.Write($"// The pattern begins with an atomic loop for {DescribeSet(target.LoopNode.Str!)}, followed by ");
writer.WriteLine(
target.Literal.String is not null ? $"the {stringComparisonComment}string {Literal(target.Literal.String)}." :
target.Literal.Chars is not null ? $"one of the characters {Literal(new string(target.Literal.Chars))}" :
$"the character {Literal(target.Literal.Char)}.");
writer.WriteLine($"// Search for the literal, and then walk backwards to the beginning of the loop.");
FinishEmitBlock block = default;
if (target.LoopNode.M > 0)
{
// If there's no lower bound on the loop, then once we find the literal, we know we have a valid starting position to try.
// If there is a lower bound, then we need a loop, as we could find the literal but it might not be prefixed with enough
// appropriate characters to satisfy the minimum bound.
block = EmitBlock(writer, "while (true)");
}
using (block)
{
writer.WriteLine($"ReadOnlySpan<char> slice = inputSpan.Slice(pos);");
writer.WriteLine();
// Find the literal. If we can't find it, we're done searching.
writer.Write("int i = slice.");
writer.WriteLine(
target.Literal.String is string literalString ? $"IndexOf({Literal(literalString)}{stringComparisonArgument});" :
target.Literal.Chars is not char[] literalChars ? $"IndexOf({Literal(target.Literal.Char)});" :
literalChars.Length switch
{
2 => $"IndexOfAny({Literal(literalChars[0])}, {Literal(literalChars[1])});",
3 => $"IndexOfAny({Literal(literalChars[0])}, {Literal(literalChars[1])}, {Literal(literalChars[2])});",
_ => $"IndexOfAny({EmitSearchValuesOrLiteral(literalChars, requiredHelpers)});",
});
FinishEmitBlock indexOfFoundBlock = default;
if (target.LoopNode.M > 0)
{
using (EmitBlock(writer, $"if (i < 0)"))
{
writer.WriteLine("break;");
}
writer.WriteLine();
}
else
{
indexOfFoundBlock = EmitBlock(writer, $"if (i >= 0)");
}
// We found the literal. Walk backwards from it finding as many matches as we can against the loop.
writer.WriteLine("int prev = i - 1;");
using (EmitBlock(writer, $"while ((uint)prev < (uint)slice.Length && {MatchCharacterClass("slice[prev]", target.LoopNode.Str!, negate: false, additionalDeclarations, requiredHelpers)})"))
{
writer.WriteLine("prev--;");
}
writer.WriteLine();
if (target.LoopNode.M > 0)
{
// If we found fewer than needed, loop around to try again. The loop doesn't overlap with the literal,
// so we can start from after the last place the literal matched.
using (EmitBlock(writer, $"if ((i - prev - 1) < {target.LoopNode.M})"))
{
writer.WriteLine("pos += i + 1;");
writer.WriteLine("continue;");
}
writer.WriteLine();
}
// We have a winner. The starting position is just after the last position that failed to match the loop.
// We also store the position after the loop into runtrackpos (an extra, unused field on RegexRunner) in order
// to communicate this position to the match algorithm such that it can skip the loop.
writer.WriteLine("base.runtextpos = pos + prev + 1;");
writer.WriteLine("base.runtrackpos = pos + i;");
writer.WriteLine("return true;");
indexOfFoundBlock.Dispose();
}
}
}