bool DirectorySourceQueue::explore()

in util/DirectorySourceQueue.cpp [257:411]


/**
 * Walks the directory tree under rootDir_ and passes every regular file that
 * survives the include/exclude filters to createIntoQueue().
 *
 * Directories are processed in FIFO order through a work list of paths
 * relative to rootDir_ (each with a trailing '/'; the root itself is "").
 * Directories whose relative path fully matches pruneDirPattern_ are not
 * descended into. When followSymlinks_ is set, symlinks are resolved with
 * resolvePath() and a directory path seen twice is reported as an error
 * instead of being explored again.
 *
 * Individual failures (opendir, readdir, lstat, stat, symlink resolution) are
 * logged and remembered, and the walk carries on with the next entry.
 * Directories that could not be opened are appended to failedDirectories_.
 * An abort request from the thread's abort checker stops the walk at once and
 * is always reported as a failure, even if it arrives while the last pending
 * directory is being read.
 *
 * @return true if the whole tree was explored without any error, false if
 *         at least one error occurred or the exploration was aborted
 */
bool DirectorySourceQueue::explore() {
  WLOG(INFO) << "Exploring root dir " << rootDir_
             << " include_pattern : " << includePattern_
             << " exclude_pattern : " << excludePattern_
             << " prune_dir_pattern : " << pruneDirPattern_;
  WDT_CHECK(!rootDir_.empty());
  bool hasError = false;
  // Set when the abort checker fires, in either loop, so that an abort can
  // never be mistaken for a completed (successful) exploration
  bool aborted = false;
  // Directory paths already queued; only maintained when following symlinks
  std::set<string> visited;
  std::regex includeRegex(includePattern_);
  std::regex excludeRegex(excludePattern_);
  std::regex pruneDirRegex(pruneDirPattern_);
  std::deque<string> todoList;
  todoList.push_back("");
  while (!todoList.empty()) {
    if (threadCtx_->getAbortChecker()->shouldAbort()) {
      aborted = true;
      break;
    }
    // the front element is popped right away, so steal its buffer instead of
    // copying it
    const string relativePath = std::move(todoList.front());
    todoList.pop_front();
    const string fullPath = rootDir_ + relativePath;
    WVLOG(1) << "Processing directory " << fullPath;
    DIR *dirPtr = opendir(fullPath.c_str());
    if (!dirPtr) {
      WPLOG(ERROR) << "Error opening dir " << fullPath;
      failedDirectories_.emplace_back(fullPath);
      hasError = true;
      continue;
    }
    // http://elliotth.blogspot.com/2012/10/how-not-to-use-readdirr3.html
    // tl;dr readdir is actually better than readdir_r ! (because of the
    // nastiness of calculating correctly buffer size and race conditions there)
    struct dirent *dirEntryRes = nullptr;
    while (true) {
      if (threadCtx_->getAbortChecker()->shouldAbort()) {
        // remember the abort: the work list may be empty by now, in which
        // case the outer loop would otherwise end as if everything went fine
        aborted = true;
        break;
      }
      // readdir() returns nullptr both at end of directory and on error;
      // errno is the only way to tell the two apart, so clear it first
      errno = 0;
      dirEntryRes = readdir(dirPtr);
      if (!dirEntryRes) {
        if (errno) {
          WPLOG(ERROR) << "Error reading dir " << fullPath;
          // closedir always called
          hasError = true;
        } else {
          WVLOG(2) << "Done with " << fullPath;
          // finished reading dir
        }
        break;
      }
      const auto dType = dirEntryRes->d_type;
      WVLOG(2) << "Found entry " << dirEntryRes->d_name << " type "
               << static_cast<int>(dType);
      // skip the "." and ".." entries (other dot files are kept)
      if (dirEntryRes->d_name[0] == '.') {
        if (dirEntryRes->d_name[1] == '\0' ||
            (dirEntryRes->d_name[1] == '.' && dirEntryRes->d_name[2] == '\0')) {
          WVLOG(3) << "Skipping entry : " << dirEntryRes->d_name;
          continue;
        }
      }
      // Following code is a bit ugly trying to save stat() call for directories
      // yet still work for xfs which returns DT_UNKNOWN for everything
      // would be simpler to always stat()

      // if we reach DT_DIR and DT_REG directly:
      bool isDir = (dType == DT_DIR);
      bool isLink = (dType == DT_LNK);
      bool keepEntry = (isDir || dType == DT_REG || dType == DT_UNKNOWN);
      if (followSymlinks_) {
        keepEntry |= isLink;
      }
      if (!keepEntry) {
        WVLOG(3) << "Ignoring entry type " << static_cast<int>(dType);
        continue;
      }
      string newRelativePath = relativePath + dirEntryRes->d_name;
      string newFullPath = rootDir_ + newRelativePath;
      if (!isDir) {
        // DT_REG, DT_LNK or DT_UNKNOWN cases
        struct stat fileStat;
        // On XFS we don't know yet if this is a symlink, so check
        // if following symlinks is ok we will do stat() too
        if (lstat(newFullPath.c_str(), &fileStat) != 0) {
          WPLOG(ERROR) << "lstat() failed on path " << newFullPath;
          hasError = true;
          continue;
        }
        isLink = S_ISLNK(fileStat.st_mode);
        WVLOG(2) << "lstat for " << newFullPath << " is link ? " << isLink;
        if (followSymlinks_ && isLink) {
          // Use stat to see if the pointed file is of the right type
          // (overrides previous stat call result)
          if (stat(newFullPath.c_str(), &fileStat) != 0) {
            WPLOG(ERROR) << "stat() failed on path " << newFullPath;
            hasError = true;
            continue;
          }
          newFullPath = resolvePath(newFullPath);
          if (newFullPath.empty()) {
            // already logged error
            hasError = true;
            continue;
          }
          WVLOG(2) << "Resolved symlink " << dirEntryRes->d_name << " to "
                   << newFullPath;
        }

        // could dcheck that if DT_REG we better be !isDir
        isDir = S_ISDIR(fileStat.st_mode);
        // if we were DT_UNKNOWN this could still be a symlink, block device
        // etc... (xfs)
        if (S_ISREG(fileStat.st_mode)) {
          WVLOG(2) << "Found file " << newFullPath << " of size "
                   << fileStat.st_size;
          // both filters must match the whole relative path (regex_match);
          // an empty pattern disables the corresponding filter
          if (!excludePattern_.empty() &&
              std::regex_match(newRelativePath, excludeRegex)) {
            continue;
          }
          if (!includePattern_.empty() &&
              !std::regex_match(newRelativePath, includeRegex)) {
            continue;
          }
          WdtFileInfo fileInfo(newRelativePath, fileStat.st_size, directReads_);
          createIntoQueue(newFullPath, fileInfo);
          continue;
        }
      }
      if (isDir) {
        if (followSymlinks_) {
          // with symlinks the same directory can be reached more than once
          // (possibly forever in case of a cycle): refuse the second visit
          if (visited.find(newFullPath) != visited.end()) {
            WLOG(ERROR) << "Attempted to visit directory twice: "
                        << newFullPath;
            hasError = true;
            continue;
          }
          // TODO: consider custom hashing ignoring common prefix
          visited.insert(newFullPath);
        }
        // queued directories (and the prune pattern) use a trailing '/'
        newRelativePath.push_back('/');
        if (pruneDirPattern_.empty() ||
            !std::regex_match(newRelativePath, pruneDirRegex)) {
          WVLOG(2) << "Adding " << newRelativePath;
          todoList.push_back(std::move(newRelativePath));
        }
      }
    }
    closedir(dirPtr);
    if (aborted) {
      break;
    }
  }
  if (aborted) {
    WLOG(ERROR) << "Directory transfer thread aborted";
    hasError = true;
  }
  WLOG(INFO) << "Number of files explored: " << numEntries_ << " opened "
             << numFilesOpened_ << " with direct " << numFilesOpenedWithDirect_
             << " errors " << std::boolalpha << hasError;
  return !hasError;
}