bool GlobMatcher::tryMatchAt()

in eden/fs/model/git/GlobMatcher.cpp [473:688]


bool GlobMatcher::tryMatchAt(
    StringPiece text,
    size_t textIdx,
    size_t patternIdx) const {
  // Loop through all opcodes in the pattern buffer.
  // It's kind of unfortunate how big and complicated this while loop is.
  //
  // It would improve readability to break this down into one function per
  // opcode, but then it would require additional conditional checks after each
  // function to see if we should break out or keep going.  Having everything
  // inlined in this single while loop makes it very easy to break out early
  // without additional checks.
  //
  // I have tried breaking this out into separate functions (and also using an
  // array lookup to find the correct opcode handler, rather than just serial
  // if checks).  Unfortunately this did result in a performance hit.
  while (patternIdx < pattern_.size()) {
    if (pattern_[patternIdx] == GLOB_LITERAL) {
      // A literal string section
      uint8_t length = pattern_[patternIdx + 1];
      const uint8_t* literal = pattern_.data() + patternIdx + 2;
      patternIdx += 2 + length;
      if (patternIdx >= pattern_.size()) {
        // This is the last section of the pattern.
        // We can exit out early if the lengths don't match.
        if (text.size() - textIdx != length) {
          return false;
        }
        return 0 == memcmp(text.data() + textIdx, literal, length);
      }
      // Not the final piece of the pattern.  We have to do the string compare
      // (unless the text remaining is too short).
      if (text.size() - textIdx < length) {
        return false;
      }
      if (0 != memcmp(text.data() + textIdx, literal, length)) {
        return false;
      }
      // Matched so far, keep going.
      textIdx += length;
    } else if (pattern_[patternIdx] == GLOB_STAR) {
      // '*' matches 0 or more characters, excluding '/'
      ++patternIdx;
      auto matchCanStartWithDot = pattern_[patternIdx] == GLOB_TRUE;
      ++patternIdx;

      // If the glob cannot match text starting with a dot, but the text
      // has a dot here, then it cannot match.
      if (!matchCanStartWithDot && textIdx < text.size() &&
          text[textIdx] == '.') {
        return false;
      }

      if (patternIdx >= pattern_.size()) {
        // This '*' is at the end of the pattern.
        // We match as long as there are no more '/' characters
        return memchr(text.data() + textIdx, '/', text.size() - textIdx) ==
            nullptr;
      } else if (pattern_[patternIdx] == GLOB_LITERAL) {
        // This '*' is followed by a string literal.
        // Jump ahead to the next place where we find this literal.  Make sure
        // we don't cross a '/'
        auto literalLength = pattern_[patternIdx + 1];
        StringPiece literalPattern{
            ByteRange(pattern_.data() + patternIdx + 2, literalLength)};
        patternIdx += 2 + literalLength;
        auto nextSlash = text.find('/', textIdx);
        while (true) {
          auto literalIdx = text.find(literalPattern, textIdx);
          if (literalIdx == StringPiece::npos) {
            // No match.
            return false;
          }
          if (nextSlash < literalIdx) {
            return false;
          }
          if (tryMatchAt(text, literalIdx + literalLength, patternIdx)) {
            return true;
          }
          // No match here.  Move forwards and try again.
          textIdx = literalIdx + 1;
        }
      } else {
        // '*' followed by another glob special, such as ? or a character
        // class.  We inefficiently try matching forwards one character at a
        // time.
        //
        // In practice this type of pattern is rare.
        while (textIdx < text.size()) {
          if (tryMatchAt(text, textIdx, patternIdx)) {
            return true;
          }
          if (text[textIdx] == '/') {
            return false;
          }
          ++textIdx;
        }
        return false;
      }
    } else if (pattern_[patternIdx] == GLOB_ENDS_WITH) {
      // Advance patternIdx to read the bool from the original GLOB_STAR.
      ++patternIdx;
      auto matchCanStartWithDot = pattern_[patternIdx] == GLOB_TRUE;

      // If the glob match is not allowed to start with a dot then we also
      // reject cases where it matches the empty string followed by a dot.
      // We intentionally do not allow `*.cpp` to match `.cpp`
      // This matches the behavior of the POSIX fnmatch() function.
      // Because any match of '*' will start from the current textIdx, we
      // can return right away if we know any match would start with an
      // illegal dot.
      if (!matchCanStartWithDot && textIdx < text.size() &&
          text[textIdx] == '.') {
        return false;
      }

      // An "ends-with" section
      uint8_t length = pattern_[patternIdx + 1];
      const uint8_t* literal = pattern_.data() + patternIdx + 2;
      if (text.size() - textIdx < length) {
        return false;
      }
      if (0 != memcmp(text.end() - length, literal, length)) {
        return false;
      }
      // The end of the text matched the desired literal.
      // Now we just have to verify that there were no '/' characters in the
      // preceding portion (that matches "*").
      return memchr(
                 text.data() + textIdx,
                 '/',
                 text.size() - (textIdx + length)) == nullptr;
    } else if (pattern_[patternIdx] == GLOB_STAR_STAR_END) {
      // This is '**' at the end of a pattern.  It matches everything else in
      // the text. However, if this matcher was created with
      // GlobOptions::IGNORE_DOTFILES, then we must ensure that none of the path
      // components in the remaining text start with a '.'.
      ++patternIdx;
      auto pathComponentInMatchCanStartWithDot =
          pattern_[patternIdx] == GLOB_TRUE;
      if (pathComponentInMatchCanStartWithDot) {
        return true;
      }

      // By construction, we know that GLOB_STAR_STAR_END is preceded by a
      // slash, so we can start from the previous character and scan the
      // remaining text for "/." If we find one, then this is not a match.
      auto searchIndex = textIdx == 0 ? 0 : textIdx - 1;
      return text.find("/.", searchIndex) == StringPiece::npos;
    } else if (pattern_[patternIdx] == GLOB_STAR_STAR_SLASH) {
      ++patternIdx;
      auto pathComponentInMatchCannotStartWithDot =
          pattern_[patternIdx] == GLOB_FALSE;

      // This is "**/"
      // It may match nothing at all, or it may match some arbitrary number of
      // characters followed by a slash.
      ++patternIdx;
      while (true) {
        if (tryMatchAt(text, textIdx, patternIdx)) {
          return true;
        }

        auto prevTextIdx = textIdx;
        textIdx = text.find('/', prevTextIdx + 1);
        if (textIdx == StringPiece::npos) {
          // No match.
          return false;
        } else if (
            pathComponentInMatchCannotStartWithDot &&
            text[prevTextIdx] == '.') {
          // Verify the path component does not start with an illegal dot
          // before proceeding.
          return false;
        }

        ++textIdx;
      }
    } else {
      // The other glob special patterns all match exactly one character.
      // Get this character now.
      if (textIdx >= text.size()) {
        return false;
      }
      uint8_t ch = text[textIdx];
      ++textIdx;

      // Git does not allow '/' to match any of these cases.
      if (ch == '/') {
        return false;
      }

      if (pattern_[patternIdx] == GLOB_CHAR_CLASS) {
        // An inclusive character class
        if (!charClassMatch(ch, &patternIdx)) {
          return false;
        }
      } else if (pattern_[patternIdx] == GLOB_CHAR_CLASS_NEGATED) {
        // An exclusive character class
        if (charClassMatch(ch, &patternIdx)) {
          return false;
        }
      } else if (pattern_[patternIdx] == GLOB_QMARK) {
        // '?' matches any character except '/'
        // (which we already excluded above)
        ++patternIdx;
      } else {
        // Unknown opcode.  This should never happen.
        XLOG(FATAL) << "bug processing glob pattern buffer at index "
                    << patternIdx;
      }
    }
  }

  return textIdx == text.size();
}