auto Context::match()

in lib/Regex/Executor.cpp [962:1502]


auto Context<Traits>::match(State<Traits> *s, bool onlyAtStart)
    -> ExecutorResult<const CodeUnit *> {
  using State = State<Traits>;
  BacktrackStack backtrackStack;

  // We'll refer to the cursor often.
  Cursor<Traits> &c = s->cursor_;

  // Pull out the instruction portion of the bytecode, following the header.
  const uint8_t *const bytecode = &bytecodeStream_[sizeof(RegexBytecodeHeader)];

  // Save the incoming IP in case we have to loop.
  const auto startIp = s->ip_;

  const CodeUnit *const startLoc = c.currentPointer();

  // Use offsetFromRight() instead of remaining() here so that the length passed
  // to advanceStringIndex is accurate even when the cursor is going backwards.
  const size_t charsToRight = c.offsetFromRight();

  // Decide how many locations we'll need to check.
  // Note that we do want to check the empty range at the end, so add one to
  // charsToRight.
  const size_t locsToCheckCount = onlyAtStart ? 1 : 1 + charsToRight;

  // If we are tracking backwards, we should only ever have one potential match
  // location. This is because advanceStringIndex only ever tracks forwards.
  assert(
      (c.forwards() || locsToCheckCount == 1) &&
      "Can only check one location when cursor is backwards");

  // Macro used when a state fails to match.
#define BACKTRACK()                            \
  do {                                         \
    auto btRes = backtrack(backtrackStack, s); \
    if (LLVM_UNLIKELY(!btRes))                 \
      return btRes.getStatus();                \
    if (*btRes)                                \
      goto backtrackingSucceeded;              \
    goto backtrackingExhausted;                \
  } while (0)

  for (size_t locIndex = 0; locIndex < locsToCheckCount;
       locIndex = advanceStringIndex(startLoc, locIndex, charsToRight)) {
    const CodeUnit *potentialMatchLocation = startLoc + locIndex;
    c.setCurrentPointer(potentialMatchLocation);
    s->ip_ = startIp;
  backtrackingSucceeded:
    for (;;) {
      const Insn *base = reinterpret_cast<const Insn *>(&bytecode[s->ip_]);
      switch (base->opcode) {
        case Opcode::Goal:
          return potentialMatchLocation;

        case Opcode::LeftAnchor:
          if (!matchesLeftAnchor(*this, *s))
            BACKTRACK();
          s->ip_ += sizeof(LeftAnchorInsn);
          break;

        case Opcode::RightAnchor:
          if (!matchesRightAnchor(*this, *s))
            BACKTRACK();
          s->ip_ += sizeof(RightAnchorInsn);
          break;

        case Opcode::MatchAny:
          if (c.atEnd() ||
              !matchWidth1<Width1Opcode::MatchAny>(base, c.consume()))
            BACKTRACK();
          s->ip_ += sizeof(MatchAnyInsn);
          break;

        case Opcode::U16MatchAny:
          if (c.atEnd())
            BACKTRACK();
          c.consumeUTF16();
          s->ip_ += sizeof(U16MatchAnyInsn);
          break;

        case Opcode::MatchAnyButNewline:
          if (c.atEnd() ||
              !matchWidth1<Width1Opcode::MatchAnyButNewline>(base, c.consume()))
            BACKTRACK();
          s->ip_ += sizeof(MatchAnyButNewlineInsn);
          break;

        case Opcode::U16MatchAnyButNewline:
          if (c.atEnd() || isLineTerminator(c.consumeUTF16()))
            BACKTRACK();
          s->ip_ += sizeof(U16MatchAnyButNewlineInsn);
          break;

        case Opcode::MatchChar8: {
          if (c.atEnd() ||
              !matchWidth1<Width1Opcode::MatchChar8>(base, c.consume()))
            BACKTRACK();
          s->ip_ += sizeof(MatchChar8Insn);
          break;
        }

        case Opcode::MatchChar16: {
          if (c.atEnd() ||
              !matchWidth1<Width1Opcode::MatchChar16>(base, c.consume()))
            BACKTRACK();
          s->ip_ += sizeof(MatchChar16Insn);
          break;
        }

        case Opcode::U16MatchChar32: {
          const auto *insn = llvh::cast<U16MatchChar32Insn>(base);
          if (c.atEnd() || c.consumeUTF16() != (CodePoint)insn->c)
            BACKTRACK();
          s->ip_ += sizeof(U16MatchChar32Insn);
          break;
        }

        case Opcode::MatchCharICase8: {
          if (c.atEnd() ||
              !matchWidth1<Width1Opcode::MatchCharICase8>(base, c.consume()))
            BACKTRACK();
          s->ip_ += sizeof(MatchCharICase8Insn);
          break;
        }

        case Opcode::MatchCharICase16: {
          if (c.atEnd() ||
              !matchWidth1<Width1Opcode::MatchCharICase16>(base, c.consume()))
            BACKTRACK();
          s->ip_ += sizeof(MatchCharICase16Insn);
          break;
        }

        case Opcode::U16MatchCharICase32: {
          const auto *insn = llvh::cast<U16MatchCharICase32Insn>(base);
          bool matched = false;
          if (!c.atEnd()) {
            CodePoint cp = c.consumeUTF16();
            matched =
                (cp == (CodePoint)insn->c ||
                 traits_.canonicalize(cp, true) == (CodePoint)insn->c);
          }
          if (!matched)
            BACKTRACK();
          s->ip_ += sizeof(U16MatchCharICase32Insn);
          break;
        }

        case Opcode::MatchNChar8: {
          const auto *insn = llvh::cast<MatchNChar8Insn>(base);
          if (c.remaining() < insn->charCount || !matchesNChar8(insn, *s))
            BACKTRACK();
          s->ip_ += insn->totalWidth();
          break;
        }

        case Opcode::MatchNCharICase8: {
          const auto *insn = llvh::cast<MatchNCharICase8Insn>(base);
          if (c.remaining() < insn->charCount || !matchesNCharICase8(insn, *s))
            BACKTRACK();
          s->ip_ += insn->totalWidth();
          break;
        }

        case Opcode::Alternation: {
          // We have an alternation. Determine which of our first and second
          // branches are viable. If both are, we have to split our state.
          const AlternationInsn *alt = llvh::cast<AlternationInsn>(base);
          bool primaryViable =
              c.satisfiesConstraints(flags_, alt->primaryConstraints);
          bool secondaryViable =
              c.satisfiesConstraints(flags_, alt->secondaryConstraints);
          if (primaryViable && secondaryViable) {
            // We need to explore both branches. Explore the primary branch
            // first, backtrack to the secondary one.
            s->ip_ += sizeof(AlternationInsn);
            auto res = pushBacktrack(
                backtrackStack,
                BacktrackInsn::makeSetPosition(
                    alt->secondaryBranch, c.currentPointer()));
            if (res != ExecutionStatus::RETURNED) {
              return res;
            }
          } else if (primaryViable) {
            s->ip_ += sizeof(AlternationInsn);
          } else if (secondaryViable) {
            s->ip_ = alt->secondaryBranch;
          } else {
            BACKTRACK();
          }
          break;
        }

        case Opcode::Jump32:
          s->ip_ = llvh::cast<Jump32Insn>(base)->target;
          break;

        case Opcode::Bracket: {
          if (c.atEnd() ||
              !matchWidth1<Width1Opcode::Bracket>(base, c.consume()))
            BACKTRACK();
          s->ip_ += llvh::cast<BracketInsn>(base)->totalWidth();
          break;
        }

        case Opcode::U16Bracket: {
          const U16BracketInsn *insn = llvh::cast<U16BracketInsn>(base);
          // U16BracketInsn is followed by a list of BracketRange32s.
          const BracketRange32 *ranges =
              reinterpret_cast<const BracketRange32 *>(insn + 1);
          if (c.atEnd() ||
              !bracketMatchesChar<Traits>(
                  *this, insn, ranges, c.consumeUTF16()))
            BACKTRACK();
          s->ip_ += insn->totalWidth();
          break;
        }

        case Opcode::WordBoundary: {
          const WordBoundaryInsn *insn = llvh::cast<WordBoundaryInsn>(base);
          const auto *charPointer = c.currentPointer();

          bool prevIsWordchar = false;
          if (!c.atLeft())
            prevIsWordchar = traits_.characterHasType(
                charPointer[-1], CharacterClass::Words);

          bool currentIsWordchar = false;
          if (!c.atRight())
            currentIsWordchar =
                traits_.characterHasType(charPointer[0], CharacterClass::Words);

          bool isWordBoundary = (prevIsWordchar != currentIsWordchar);
          if (isWordBoundary ^ insn->invert)
            s->ip_ += sizeof(WordBoundaryInsn);
          else
            BACKTRACK();
          break;
        }

        case Opcode::BeginMarkedSubexpression: {
          const auto *insn = llvh::cast<BeginMarkedSubexpressionInsn>(base);
          auto res = pushBacktrack(
              backtrackStack,
              BacktrackInsn::makeSetCaptureGroup(
                  insn->mexp, {kNotMatched, kNotMatched}));
          if (res != ExecutionStatus::RETURNED) {
            return res;
          }
          // When tracking backwards (in a lookbehind assertion) we traverse our
          // input backwards, so set the end before the start.
          auto &range = s->getCapturedRange(insn->mexp);
          if (c.forwards()) {
            range.start = c.offsetFromLeft();
          } else {
            range.end = c.offsetFromLeft();
          }
          s->ip_ += sizeof(BeginMarkedSubexpressionInsn);
          break;
        }

        case Opcode::EndMarkedSubexpression: {
          const auto *insn = llvh::cast<EndMarkedSubexpressionInsn>(base);
          auto &range = s->getCapturedRange(insn->mexp);
          if (c.forwards()) {
            assert(
                range.start != kNotMatched && "Capture group was not entered");
            range.end = c.offsetFromLeft();
          } else {
            assert(range.end != kNotMatched && "Capture group was not entered");
            range.start = c.offsetFromLeft();
          }
          assert(range.start <= range.end && "Captured range end before start");
          s->ip_ += sizeof(EndMarkedSubexpressionInsn);
          break;
        }

        // ES10 21.2.2.9.1
        case Opcode::BackRef: {
          const auto insn = llvh::cast<BackRefInsn>(base);
          // a. Let cap be x's captures List.
          // b. Let s be cap[n].
          CapturedRange cr = s->getCapturedRange(insn->mexp);

          // c. If s is undefined, return c(x).
          // Note we have to check both cr.start and cr.end here. If we are
          // currently in the middle of matching a capture group (going either
          // forwards or backwards) we should just return success.
          if (cr.start == kNotMatched || cr.end == kNotMatched) {
            // Backreferences to a capture group that did not match always
            // succeed (ES10 21.2.2.9)
            s->ip_ += sizeof(BackRefInsn);
            break;
          }

          // TODO: this can be optimized by hoisting the branches out of the
          // loop.
          bool icase = syntaxFlags_.ignoreCase;
          bool unicode = syntaxFlags_.unicode;
          auto capturedStart = first_ + cr.start;
          auto capturedEnd = first_ + cr.end;
          Cursor<Traits> cursor2(
              capturedStart,
              c.forwards() ? capturedStart : capturedEnd,
              capturedEnd,
              c.forwards());
          Cursor<Traits> cursor1 = c;
          bool matched = true;
          while (matched && !cursor2.atEnd()) {
            if (cursor1.atEnd()) {
              matched = false;
            } else if (!icase) {
              // Direct comparison. Here we don't need to decode surrogate
              // pairs.
              matched = (cursor1.consume() == cursor2.consume());
            } else if (!unicode) {
              // Case-insensitive non-Unicode comparison, no decoding of
              // surrogate pairs.
              auto c1 = cursor1.consume();
              auto c2 = cursor2.consume();
              matched =
                  (c1 == c2 ||
                   traits_.canonicalize(c1, unicode) ==
                       traits_.canonicalize(c2, unicode));
            } else {
              // Unicode: we do need to decode surrogate pairs.
              auto cp1 = cursor1.consumeUTF16();
              auto cp2 = cursor2.consumeUTF16();
              matched =
                  (cp1 == cp2 ||
                   traits_.canonicalize(cp1, unicode) ==
                       traits_.canonicalize(cp2, unicode));
            }
          }
          if (!matched) {
            BACKTRACK();
          }
          s->ip_ += sizeof(BackRefInsn);
          c.setCurrentPointer(cursor1.currentPointer());
          break;
        }

        case Opcode::Lookaround: {
          const LookaroundInsn *insn = llvh::cast<LookaroundInsn>(base);
          bool matched = false;
          if (c.satisfiesConstraints(flags_, insn->constraints)) {
            // Copy the state. This is because if the match fails (or if we are
            // inverted) we need to restore its capture groups.
            State savedState{*s};

            // Set the direction of the cursor.
            c.setForwards(insn->forwards);

            // Invoke match() recursively with our expression.
            // Save and restore the position because lookaheads do not consume
            // anything.
            s->ip_ += sizeof(LookaroundInsn);
            auto match = this->match(s, true /* onlyAtStart */);
            // There were no errors and we matched something (so non-null
            // return)
            matched = match && match.getValue();
            c.setCurrentPointer(savedState.cursor_.currentPointer());
            c.setForwards(savedState.cursor_.forwards());

            // Restore capture groups unless we are a positive lookaround that
            // successfully matched. If we are a successfully matching positive
            // lookaround, set up backtracking to reset the capture groups. Note
            // we never backtrack INTO a successfully matched lookahead:
            // once a lookahead finds a match it forgets all other ways it could
            // have matched. (ES 5.1 15.10.2.8 Note 2).
            if (matched && !insn->invert) {
              // Backtrack capture groups in the lookahead expression.
              for (uint32_t i = insn->mexpBegin, e = insn->mexpEnd; i < e;
                   i++) {
                CapturedRange cr = savedState.getCapturedRange(i);
                auto res = pushBacktrack(
                    backtrackStack, BacktrackInsn::makeSetCaptureGroup(i, cr));
                if (res != ExecutionStatus::RETURNED)
                  return res;
              }
            } else {
              // Restore the saved state.
              *s = std::move(savedState);
            }
          }

          // 'matched' tells us whether the enclosed assertion expression
          // matched the input. This instruction matched the input if it is a
          // positive assertion (invert == false) and the expression matched,
          // or a negative assertion (invert == true) and the expression did
          // not match. Hence xor with invert.
          if (matched ^ insn->invert)
            s->ip_ = insn->continuation;
          else
            BACKTRACK();
          break;
        }

        case Opcode::BeginLoop: {
          // Here we are entering a loop from outside, not jumping back into
          // it.
          const BeginLoopInsn *loop = llvh::cast<BeginLoopInsn>(base);
          s->getLoop(loop->loopId).iterations = 0;
          // Check to see if the loop body is viable. If not, and the loop has
          // a nonzero minimum iteration, then we know we won't match and we
          // can reject the state. If it does have a minimum iteration, we can
          // just skip to the not-taken target. Note that this is a static
          // property of the loop so we don't need to check it on every
          // iteration, only the first one.
          if (!c.satisfiesConstraints(flags_, loop->loopeeConstraints)) {
            if (loop->min > 0) {
              BACKTRACK();
            } else {
              s->ip_ = loop->notTakenTarget;
              break;
            }
          }
          goto runLoop;
        }

        case Opcode::EndLoop:
          // This is reached after the body of a loop finishes executing.
          // Move the IP to the loop and run it again immediately.
          s->ip_ = llvh::cast<EndLoopInsn>(base)->target;
          base = reinterpret_cast<const Insn *>(&bytecode[s->ip_]);
          // Note fall through.

        runLoop : {
          const BeginLoopInsn *loop = llvh::cast<BeginLoopInsn>(base);
          auto &loopData = s->getLoop(loop->loopId);
          uint32_t iteration = loopData.iterations;

          const uint32_t loopTakenIp = s->ip_ + sizeof(BeginLoopInsn);

          assert(loop->min <= loop->max && "Inconsistent loop bounds");

          // Check to see if we have looped more than the minimum number of
          // iterations, and if so, whether the subexpression we looped over
          // matched an empty string. ES6 21.2.2.5.1 Note 4: "once the
          // minimum number of repetitions has been satisfied, any more
          // expansions of Atom that match the empty character sequence are
          // not considered for further repetitions."
          if (iteration > loop->min &&
              loopData.entryPosition == c.offsetFromLeft())
            BACKTRACK();

          if (iteration < loop->min) {
            auto res = prepareToEnterLoopBody(s, loop, backtrackStack);
            if (res != ExecutionStatus::RETURNED)
              return res;
            s->ip_ = loopTakenIp;
          } else if (iteration == loop->max) {
            s->ip_ = loop->notTakenTarget;
          } else {
            // We are within the target iteration range, figure out whether we
            // should continue or exit.
            assert(iteration >= loop->min && iteration < loop->max);
            if (!loop->greedy) {
              // Backtrack by entering this non-greedy loop.
              loopData.entryPosition = c.offsetFromLeft();
              auto res = pushBacktrack(
                  backtrackStack,
                  BacktrackInsn::makeEnterNonGreedyLoop(
                      loop, loopTakenIp, loopData));
              if (res != ExecutionStatus::RETURNED) {
                return res;
              }
              s->ip_ = loop->notTakenTarget;
            } else {
              // Backtrack by exiting this greedy loop.
              auto pushRes = pushBacktrack(
                  backtrackStack,
                  BacktrackInsn::makeSetPosition(
                      loop->notTakenTarget, c.currentPointer()));
              if (pushRes != ExecutionStatus::RETURNED)
                return pushRes;

              auto prepRes = prepareToEnterLoopBody(s, loop, backtrackStack);
              if (prepRes != ExecutionStatus::RETURNED)
                return prepRes;
              s->ip_ = loopTakenIp;
            }
          }
          break;
        }

        case Opcode::BeginSimpleLoop: {
          // Here we are entering a simple loop from outside,
          // not jumping back into it.
          const BeginSimpleLoopInsn *loop =
              llvh::cast<BeginSimpleLoopInsn>(base);

          if (!c.satisfiesConstraints(flags_, loop->loopeeConstraints)) {
            s->ip_ = loop->notTakenTarget;
            break;
          }

          goto runSimpleLoop;
        }

        case Opcode::EndSimpleLoop:
          s->ip_ = llvh::cast<EndSimpleLoopInsn>(base)->target;
          base = reinterpret_cast<const Insn *>(&bytecode[s->ip_]);
          // Note: fall-through.

        runSimpleLoop : {
          const BeginSimpleLoopInsn *loop =
              llvh::cast<BeginSimpleLoopInsn>(base);
          // Since this is a simple loop, we'll always need to explore both
          // exiting the loop at this point and continuing to loop.
          // Note simple loops are always greedy.
          auto res = pushBacktrack(
              backtrackStack,
              BacktrackInsn::makeSetPosition(
                  loop->notTakenTarget, c.currentPointer()));
          if (res != ExecutionStatus::RETURNED) {
            return res;
          }
          s->ip_ += sizeof(BeginSimpleLoopInsn);
          break;
        }

        case Opcode::Width1Loop: {
          const Width1LoopInsn *loop = llvh::cast<Width1LoopInsn>(base);
          auto matchRes = matchWidth1Loop(loop, s, backtrackStack);
          if (LLVM_UNLIKELY(!matchRes))
            return matchRes.getStatus();
          if (!*matchRes)
            BACKTRACK();
          break;
        }
      }
    }
  // The search failed at this location.
  backtrackingExhausted:
    continue;
  }
#undef BACKTRACK
  // The match failed.
  return nullptr;
}