in lib/Regex/Executor.cpp [962:1502]
auto Context<Traits>::match(State<Traits> *s, bool onlyAtStart)
-> ExecutorResult<const CodeUnit *> {
using State = State<Traits>;
BacktrackStack backtrackStack;
// We'll refer to the cursor often.
Cursor<Traits> &c = s->cursor_;
// Pull out the instruction portion of the bytecode, following the header.
const uint8_t *const bytecode = &bytecodeStream_[sizeof(RegexBytecodeHeader)];
// Save the incoming IP in case we have to loop.
const auto startIp = s->ip_;
const CodeUnit *const startLoc = c.currentPointer();
// Use offsetFromRight() instead of remaining() here so that the length passed
// to advanceStringIndex is accurate even when the cursor is going backwards.
const size_t charsToRight = c.offsetFromRight();
// Decide how many locations we'll need to check.
// Note that we do want to check the empty range at the end, so add one to
// charsToRight.
const size_t locsToCheckCount = onlyAtStart ? 1 : 1 + charsToRight;
// If we are tracking backwards, we should only ever have one potential match
// location. This is because advanceStringIndex only ever tracks forwards.
assert(
(c.forwards() || locsToCheckCount == 1) &&
"Can only check one location when cursor is backwards");
// Macro used when a state fails to match.
#define BACKTRACK() \
do { \
auto btRes = backtrack(backtrackStack, s); \
if (LLVM_UNLIKELY(!btRes)) \
return btRes.getStatus(); \
if (*btRes) \
goto backtrackingSucceeded; \
goto backtrackingExhausted; \
} while (0)
for (size_t locIndex = 0; locIndex < locsToCheckCount;
locIndex = advanceStringIndex(startLoc, locIndex, charsToRight)) {
const CodeUnit *potentialMatchLocation = startLoc + locIndex;
c.setCurrentPointer(potentialMatchLocation);
s->ip_ = startIp;
backtrackingSucceeded:
for (;;) {
const Insn *base = reinterpret_cast<const Insn *>(&bytecode[s->ip_]);
switch (base->opcode) {
case Opcode::Goal:
return potentialMatchLocation;
case Opcode::LeftAnchor:
if (!matchesLeftAnchor(*this, *s))
BACKTRACK();
s->ip_ += sizeof(LeftAnchorInsn);
break;
case Opcode::RightAnchor:
if (!matchesRightAnchor(*this, *s))
BACKTRACK();
s->ip_ += sizeof(RightAnchorInsn);
break;
case Opcode::MatchAny:
if (c.atEnd() ||
!matchWidth1<Width1Opcode::MatchAny>(base, c.consume()))
BACKTRACK();
s->ip_ += sizeof(MatchAnyInsn);
break;
case Opcode::U16MatchAny:
if (c.atEnd())
BACKTRACK();
c.consumeUTF16();
s->ip_ += sizeof(U16MatchAnyInsn);
break;
case Opcode::MatchAnyButNewline:
if (c.atEnd() ||
!matchWidth1<Width1Opcode::MatchAnyButNewline>(base, c.consume()))
BACKTRACK();
s->ip_ += sizeof(MatchAnyButNewlineInsn);
break;
case Opcode::U16MatchAnyButNewline:
if (c.atEnd() || isLineTerminator(c.consumeUTF16()))
BACKTRACK();
s->ip_ += sizeof(U16MatchAnyButNewlineInsn);
break;
case Opcode::MatchChar8: {
if (c.atEnd() ||
!matchWidth1<Width1Opcode::MatchChar8>(base, c.consume()))
BACKTRACK();
s->ip_ += sizeof(MatchChar8Insn);
break;
}
case Opcode::MatchChar16: {
if (c.atEnd() ||
!matchWidth1<Width1Opcode::MatchChar16>(base, c.consume()))
BACKTRACK();
s->ip_ += sizeof(MatchChar16Insn);
break;
}
case Opcode::U16MatchChar32: {
const auto *insn = llvh::cast<U16MatchChar32Insn>(base);
if (c.atEnd() || c.consumeUTF16() != (CodePoint)insn->c)
BACKTRACK();
s->ip_ += sizeof(U16MatchChar32Insn);
break;
}
case Opcode::MatchCharICase8: {
if (c.atEnd() ||
!matchWidth1<Width1Opcode::MatchCharICase8>(base, c.consume()))
BACKTRACK();
s->ip_ += sizeof(MatchCharICase8Insn);
break;
}
case Opcode::MatchCharICase16: {
if (c.atEnd() ||
!matchWidth1<Width1Opcode::MatchCharICase16>(base, c.consume()))
BACKTRACK();
s->ip_ += sizeof(MatchCharICase16Insn);
break;
}
case Opcode::U16MatchCharICase32: {
const auto *insn = llvh::cast<U16MatchCharICase32Insn>(base);
bool matched = false;
if (!c.atEnd()) {
CodePoint cp = c.consumeUTF16();
matched =
(cp == (CodePoint)insn->c ||
traits_.canonicalize(cp, true) == (CodePoint)insn->c);
}
if (!matched)
BACKTRACK();
s->ip_ += sizeof(U16MatchCharICase32Insn);
break;
}
case Opcode::MatchNChar8: {
const auto *insn = llvh::cast<MatchNChar8Insn>(base);
if (c.remaining() < insn->charCount || !matchesNChar8(insn, *s))
BACKTRACK();
s->ip_ += insn->totalWidth();
break;
}
case Opcode::MatchNCharICase8: {
const auto *insn = llvh::cast<MatchNCharICase8Insn>(base);
if (c.remaining() < insn->charCount || !matchesNCharICase8(insn, *s))
BACKTRACK();
s->ip_ += insn->totalWidth();
break;
}
case Opcode::Alternation: {
// We have an alternation. Determine which of our first and second
// branches are viable. If both are, we have to split our state.
const AlternationInsn *alt = llvh::cast<AlternationInsn>(base);
bool primaryViable =
c.satisfiesConstraints(flags_, alt->primaryConstraints);
bool secondaryViable =
c.satisfiesConstraints(flags_, alt->secondaryConstraints);
if (primaryViable && secondaryViable) {
// We need to explore both branches. Explore the primary branch
// first, backtrack to the secondary one.
s->ip_ += sizeof(AlternationInsn);
auto res = pushBacktrack(
backtrackStack,
BacktrackInsn::makeSetPosition(
alt->secondaryBranch, c.currentPointer()));
if (res != ExecutionStatus::RETURNED) {
return res;
}
} else if (primaryViable) {
s->ip_ += sizeof(AlternationInsn);
} else if (secondaryViable) {
s->ip_ = alt->secondaryBranch;
} else {
BACKTRACK();
}
break;
}
case Opcode::Jump32:
s->ip_ = llvh::cast<Jump32Insn>(base)->target;
break;
case Opcode::Bracket: {
if (c.atEnd() ||
!matchWidth1<Width1Opcode::Bracket>(base, c.consume()))
BACKTRACK();
s->ip_ += llvh::cast<BracketInsn>(base)->totalWidth();
break;
}
case Opcode::U16Bracket: {
const U16BracketInsn *insn = llvh::cast<U16BracketInsn>(base);
// U16BracketInsn is followed by a list of BracketRange32s.
const BracketRange32 *ranges =
reinterpret_cast<const BracketRange32 *>(insn + 1);
if (c.atEnd() ||
!bracketMatchesChar<Traits>(
*this, insn, ranges, c.consumeUTF16()))
BACKTRACK();
s->ip_ += insn->totalWidth();
break;
}
case Opcode::WordBoundary: {
const WordBoundaryInsn *insn = llvh::cast<WordBoundaryInsn>(base);
const auto *charPointer = c.currentPointer();
bool prevIsWordchar = false;
if (!c.atLeft())
prevIsWordchar = traits_.characterHasType(
charPointer[-1], CharacterClass::Words);
bool currentIsWordchar = false;
if (!c.atRight())
currentIsWordchar =
traits_.characterHasType(charPointer[0], CharacterClass::Words);
bool isWordBoundary = (prevIsWordchar != currentIsWordchar);
if (isWordBoundary ^ insn->invert)
s->ip_ += sizeof(WordBoundaryInsn);
else
BACKTRACK();
break;
}
case Opcode::BeginMarkedSubexpression: {
const auto *insn = llvh::cast<BeginMarkedSubexpressionInsn>(base);
auto res = pushBacktrack(
backtrackStack,
BacktrackInsn::makeSetCaptureGroup(
insn->mexp, {kNotMatched, kNotMatched}));
if (res != ExecutionStatus::RETURNED) {
return res;
}
// When tracking backwards (in a lookbehind assertion) we traverse our
// input backwards, so set the end before the start.
auto &range = s->getCapturedRange(insn->mexp);
if (c.forwards()) {
range.start = c.offsetFromLeft();
} else {
range.end = c.offsetFromLeft();
}
s->ip_ += sizeof(BeginMarkedSubexpressionInsn);
break;
}
case Opcode::EndMarkedSubexpression: {
const auto *insn = llvh::cast<EndMarkedSubexpressionInsn>(base);
auto &range = s->getCapturedRange(insn->mexp);
if (c.forwards()) {
assert(
range.start != kNotMatched && "Capture group was not entered");
range.end = c.offsetFromLeft();
} else {
assert(range.end != kNotMatched && "Capture group was not entered");
range.start = c.offsetFromLeft();
}
assert(range.start <= range.end && "Captured range end before start");
s->ip_ += sizeof(EndMarkedSubexpressionInsn);
break;
}
// ES10 21.2.2.9.1
case Opcode::BackRef: {
const auto insn = llvh::cast<BackRefInsn>(base);
// a. Let cap be x's captures List.
// b. Let s be cap[n].
CapturedRange cr = s->getCapturedRange(insn->mexp);
// c. If s is undefined, return c(x).
// Note we have to check both cr.start and cr.end here. If we are
// currently in the middle of matching a capture group (going either
// forwards or backwards) we should just return success.
if (cr.start == kNotMatched || cr.end == kNotMatched) {
// Backreferences to a capture group that did not match always
// succeed (ES10 21.2.2.9)
s->ip_ += sizeof(BackRefInsn);
break;
}
// TODO: this can be optimized by hoisting the branches out of the
// loop.
bool icase = syntaxFlags_.ignoreCase;
bool unicode = syntaxFlags_.unicode;
auto capturedStart = first_ + cr.start;
auto capturedEnd = first_ + cr.end;
Cursor<Traits> cursor2(
capturedStart,
c.forwards() ? capturedStart : capturedEnd,
capturedEnd,
c.forwards());
Cursor<Traits> cursor1 = c;
bool matched = true;
while (matched && !cursor2.atEnd()) {
if (cursor1.atEnd()) {
matched = false;
} else if (!icase) {
// Direct comparison. Here we don't need to decode surrogate
// pairs.
matched = (cursor1.consume() == cursor2.consume());
} else if (!unicode) {
// Case-insensitive non-Unicode comparison, no decoding of
// surrogate pairs.
auto c1 = cursor1.consume();
auto c2 = cursor2.consume();
matched =
(c1 == c2 ||
traits_.canonicalize(c1, unicode) ==
traits_.canonicalize(c2, unicode));
} else {
// Unicode: we do need to decode surrogate pairs.
auto cp1 = cursor1.consumeUTF16();
auto cp2 = cursor2.consumeUTF16();
matched =
(cp1 == cp2 ||
traits_.canonicalize(cp1, unicode) ==
traits_.canonicalize(cp2, unicode));
}
}
if (!matched) {
BACKTRACK();
}
s->ip_ += sizeof(BackRefInsn);
c.setCurrentPointer(cursor1.currentPointer());
break;
}
case Opcode::Lookaround: {
const LookaroundInsn *insn = llvh::cast<LookaroundInsn>(base);
bool matched = false;
if (c.satisfiesConstraints(flags_, insn->constraints)) {
// Copy the state. This is because if the match fails (or if we are
// inverted) we need to restore its capture groups.
State savedState{*s};
// Set the direction of the cursor.
c.setForwards(insn->forwards);
// Invoke match() recursively with our expression.
// Save and restore the position because lookaheads do not consume
// anything.
s->ip_ += sizeof(LookaroundInsn);
auto match = this->match(s, true /* onlyAtStart */);
// There were no errors and we matched something (so non-null
// return)
matched = match && match.getValue();
c.setCurrentPointer(savedState.cursor_.currentPointer());
c.setForwards(savedState.cursor_.forwards());
// Restore capture groups unless we are a positive lookaround that
// successfully matched. If we are a successfully matching positive
// lookaround, set up backtracking to reset the capture groups. Note
// we never backtrack INTO a successfully matched lookahead:
// once a lookahead finds a match it forgets all other ways it could
// have matched. (ES 5.1 15.10.2.8 Note 2).
if (matched && !insn->invert) {
// Backtrack capture groups in the lookahead expression.
for (uint32_t i = insn->mexpBegin, e = insn->mexpEnd; i < e;
i++) {
CapturedRange cr = savedState.getCapturedRange(i);
auto res = pushBacktrack(
backtrackStack, BacktrackInsn::makeSetCaptureGroup(i, cr));
if (res != ExecutionStatus::RETURNED)
return res;
}
} else {
// Restore the saved state.
*s = std::move(savedState);
}
}
// 'matched' tells us whether the enclosed assertion expression
// matched the input. This instruction matched the input if it is a
// positive assertion (invert == false) and the expression matched,
// or a negative assertion (invert == true) and the expression did
// not match. Hence xor with invert.
if (matched ^ insn->invert)
s->ip_ = insn->continuation;
else
BACKTRACK();
break;
}
case Opcode::BeginLoop: {
// Here we are entering a loop from outside, not jumping back into
// it.
const BeginLoopInsn *loop = llvh::cast<BeginLoopInsn>(base);
s->getLoop(loop->loopId).iterations = 0;
// Check to see if the loop body is viable. If not, and the loop has
// a nonzero minimum iteration, then we know we won't match and we
// can reject the state. If it does have a minimum iteration, we can
// just skip to the not-taken target. Note that this is a static
// property of the loop so we don't need to check it on every
// iteration, only the first one.
if (!c.satisfiesConstraints(flags_, loop->loopeeConstraints)) {
if (loop->min > 0) {
BACKTRACK();
} else {
s->ip_ = loop->notTakenTarget;
break;
}
}
goto runLoop;
}
case Opcode::EndLoop:
// This is reached after the body of a loop finishes executing.
// Move the IP to the loop and run it again immediately.
s->ip_ = llvh::cast<EndLoopInsn>(base)->target;
base = reinterpret_cast<const Insn *>(&bytecode[s->ip_]);
// Note fall through.
runLoop : {
const BeginLoopInsn *loop = llvh::cast<BeginLoopInsn>(base);
auto &loopData = s->getLoop(loop->loopId);
uint32_t iteration = loopData.iterations;
const uint32_t loopTakenIp = s->ip_ + sizeof(BeginLoopInsn);
assert(loop->min <= loop->max && "Inconsistent loop bounds");
// Check to see if we have looped more than the minimum number of
// iterations, and if so, whether the subexpression we looped over
// matched an empty string. ES6 21.2.2.5.1 Note 4: "once the
// minimum number of repetitions has been satisfied, any more
// expansions of Atom that match the empty character sequence are
// not considered for further repetitions."
if (iteration > loop->min &&
loopData.entryPosition == c.offsetFromLeft())
BACKTRACK();
if (iteration < loop->min) {
auto res = prepareToEnterLoopBody(s, loop, backtrackStack);
if (res != ExecutionStatus::RETURNED)
return res;
s->ip_ = loopTakenIp;
} else if (iteration == loop->max) {
s->ip_ = loop->notTakenTarget;
} else {
// We are within the target iteration range, figure out whether we
// should continue or exit.
assert(iteration >= loop->min && iteration < loop->max);
if (!loop->greedy) {
// Backtrack by entering this non-greedy loop.
loopData.entryPosition = c.offsetFromLeft();
auto res = pushBacktrack(
backtrackStack,
BacktrackInsn::makeEnterNonGreedyLoop(
loop, loopTakenIp, loopData));
if (res != ExecutionStatus::RETURNED) {
return res;
}
s->ip_ = loop->notTakenTarget;
} else {
// Backtrack by exiting this greedy loop.
auto pushRes = pushBacktrack(
backtrackStack,
BacktrackInsn::makeSetPosition(
loop->notTakenTarget, c.currentPointer()));
if (pushRes != ExecutionStatus::RETURNED)
return pushRes;
auto prepRes = prepareToEnterLoopBody(s, loop, backtrackStack);
if (prepRes != ExecutionStatus::RETURNED)
return prepRes;
s->ip_ = loopTakenIp;
}
}
break;
}
case Opcode::BeginSimpleLoop: {
// Here we are entering a simple loop from outside,
// not jumping back into it.
const BeginSimpleLoopInsn *loop =
llvh::cast<BeginSimpleLoopInsn>(base);
if (!c.satisfiesConstraints(flags_, loop->loopeeConstraints)) {
s->ip_ = loop->notTakenTarget;
break;
}
goto runSimpleLoop;
}
case Opcode::EndSimpleLoop:
s->ip_ = llvh::cast<EndSimpleLoopInsn>(base)->target;
base = reinterpret_cast<const Insn *>(&bytecode[s->ip_]);
// Note: fall-through.
runSimpleLoop : {
const BeginSimpleLoopInsn *loop =
llvh::cast<BeginSimpleLoopInsn>(base);
// Since this is a simple loop, we'll always need to explore both
// exiting the loop at this point and continuing to loop.
// Note simple loops are always greedy.
auto res = pushBacktrack(
backtrackStack,
BacktrackInsn::makeSetPosition(
loop->notTakenTarget, c.currentPointer()));
if (res != ExecutionStatus::RETURNED) {
return res;
}
s->ip_ += sizeof(BeginSimpleLoopInsn);
break;
}
case Opcode::Width1Loop: {
const Width1LoopInsn *loop = llvh::cast<Width1LoopInsn>(base);
auto matchRes = matchWidth1Loop(loop, s, backtrackStack);
if (LLVM_UNLIKELY(!matchRes))
return matchRes.getStatus();
if (!*matchRes)
BACKTRACK();
break;
}
}
}
// The search failed at this location.
backtrackingExhausted:
continue;
}
#undef BACKTRACK
// The match failed.
return nullptr;
}