in eden/fs/model/git/GlobMatcher.cpp [473:688]
bool GlobMatcher::tryMatchAt(
StringPiece text,
size_t textIdx,
size_t patternIdx) const {
// Loop through all opcodes in the pattern buffer.
// It's kind of unfortunate how big and complicated this while loop is.
//
// It would improve readability to break this down into one function per
// opcode, but then it would require additional conditional checks after each
// function to see if we should break out or keep going. Having everything
// inlined in this single while loop makes it very easy to break out early
// without additional checks.
//
// I have tried breaking this out into separate functions (and also using an
// array lookup to find the correct opcode handler, rather than just serial
// if checks). Unfortunately this did result in a performance hit.
while (patternIdx < pattern_.size()) {
if (pattern_[patternIdx] == GLOB_LITERAL) {
// A literal string section
uint8_t length = pattern_[patternIdx + 1];
const uint8_t* literal = pattern_.data() + patternIdx + 2;
patternIdx += 2 + length;
if (patternIdx >= pattern_.size()) {
// This is the last section of the pattern.
// We can exit out early if the lengths don't match.
if (text.size() - textIdx != length) {
return false;
}
return 0 == memcmp(text.data() + textIdx, literal, length);
}
// Not the final piece of the pattern. We have to do the string compare
// (unless the text remaining is too short).
if (text.size() - textIdx < length) {
return false;
}
if (0 != memcmp(text.data() + textIdx, literal, length)) {
return false;
}
// Matched so far, keep going.
textIdx += length;
} else if (pattern_[patternIdx] == GLOB_STAR) {
// '*' matches 0 or more characters, excluding '/'
++patternIdx;
auto matchCanStartWithDot = pattern_[patternIdx] == GLOB_TRUE;
++patternIdx;
// If the glob cannot match text starting with a dot, but the text
// has a dot here, then it cannot match.
if (!matchCanStartWithDot && textIdx < text.size() &&
text[textIdx] == '.') {
return false;
}
if (patternIdx >= pattern_.size()) {
// This '*' is at the end of the pattern.
// We match as long as there are no more '/' characters
return memchr(text.data() + textIdx, '/', text.size() - textIdx) ==
nullptr;
} else if (pattern_[patternIdx] == GLOB_LITERAL) {
// This '*' is followed by a string literal.
// Jump ahead to the next place where we find this literal. Make sure
// we don't cross a '/'
auto literalLength = pattern_[patternIdx + 1];
StringPiece literalPattern{
ByteRange(pattern_.data() + patternIdx + 2, literalLength)};
patternIdx += 2 + literalLength;
auto nextSlash = text.find('/', textIdx);
while (true) {
auto literalIdx = text.find(literalPattern, textIdx);
if (literalIdx == StringPiece::npos) {
// No match.
return false;
}
if (nextSlash < literalIdx) {
return false;
}
if (tryMatchAt(text, literalIdx + literalLength, patternIdx)) {
return true;
}
// No match here. Move forwards and try again.
textIdx = literalIdx + 1;
}
} else {
// '*' followed by another glob special, such as ? or a character
// class. We inefficiently try matching forwards one character at a
// time.
//
// In practice this type of pattern is rare.
while (textIdx < text.size()) {
if (tryMatchAt(text, textIdx, patternIdx)) {
return true;
}
if (text[textIdx] == '/') {
return false;
}
++textIdx;
}
return false;
}
} else if (pattern_[patternIdx] == GLOB_ENDS_WITH) {
// Advance patternIdx to read the bool from the original GLOB_STAR.
++patternIdx;
auto matchCanStartWithDot = pattern_[patternIdx] == GLOB_TRUE;
// If the glob match is not allowed to start with a dot then we also
// reject cases where it matches the empty string followed by a dot.
// We intentionally do not allow `*.cpp` to match `.cpp`
// This matches the behavior of the POSIX fnmatch() function.
// Because any match of '*' will start from the current textIdx, we
// can return right away if we know any match would start with an
// illegal dot.
if (!matchCanStartWithDot && textIdx < text.size() &&
text[textIdx] == '.') {
return false;
}
// An "ends-with" section
uint8_t length = pattern_[patternIdx + 1];
const uint8_t* literal = pattern_.data() + patternIdx + 2;
if (text.size() - textIdx < length) {
return false;
}
if (0 != memcmp(text.end() - length, literal, length)) {
return false;
}
// The end of the text matched the desired literal.
// Now we just have to verify that there were no '/' characters in the
// preceding portion (that matches "*").
return memchr(
text.data() + textIdx,
'/',
text.size() - (textIdx + length)) == nullptr;
} else if (pattern_[patternIdx] == GLOB_STAR_STAR_END) {
// This is '**' at the end of a pattern. It matches everything else in
// the text. However, if this matcher was created with
// GlobOptions::IGNORE_DOTFILES, then we must ensure that none of the path
// components in the remaining text start with a '.'.
++patternIdx;
auto pathComponentInMatchCanStartWithDot =
pattern_[patternIdx] == GLOB_TRUE;
if (pathComponentInMatchCanStartWithDot) {
return true;
}
// By construction, we know that GLOB_STAR_STAR_END is preceded by a
// slash, so we can start from the previous character and scan the
// remaining text for "/." If we find one, then this is not a match.
auto searchIndex = textIdx == 0 ? 0 : textIdx - 1;
return text.find("/.", searchIndex) == StringPiece::npos;
} else if (pattern_[patternIdx] == GLOB_STAR_STAR_SLASH) {
++patternIdx;
auto pathComponentInMatchCannotStartWithDot =
pattern_[patternIdx] == GLOB_FALSE;
// This is "**/"
// It may match nothing at all, or it may match some arbitrary number of
// characters followed by a slash.
++patternIdx;
while (true) {
if (tryMatchAt(text, textIdx, patternIdx)) {
return true;
}
auto prevTextIdx = textIdx;
textIdx = text.find('/', prevTextIdx + 1);
if (textIdx == StringPiece::npos) {
// No match.
return false;
} else if (
pathComponentInMatchCannotStartWithDot &&
text[prevTextIdx] == '.') {
// Verify the path component does not start with an illegal dot
// before proceeding.
return false;
}
++textIdx;
}
} else {
// The other glob special patterns all match exactly one character.
// Get this character now.
if (textIdx >= text.size()) {
return false;
}
uint8_t ch = text[textIdx];
++textIdx;
// Git does not allow '/' to match any of these cases.
if (ch == '/') {
return false;
}
if (pattern_[patternIdx] == GLOB_CHAR_CLASS) {
// An inclusive character class
if (!charClassMatch(ch, &patternIdx)) {
return false;
}
} else if (pattern_[patternIdx] == GLOB_CHAR_CLASS_NEGATED) {
// An exclusive character class
if (charClassMatch(ch, &patternIdx)) {
return false;
}
} else if (pattern_[patternIdx] == GLOB_QMARK) {
// '?' matches any character except '/'
// (which we already excluded above)
++patternIdx;
} else {
// Unknown opcode. This should never happen.
XLOG(FATAL) << "bug processing glob pattern buffer at index "
<< patternIdx;
}
}
}
return textIdx == text.size();
}