in util/DirectorySourceQueue.cpp [257:411]
/**
 * Walks the tree under rootDir_ breadth-first (FIFO todo list of relative
 * directory paths) and hands every regular file that passes the
 * include/exclude filters to createIntoQueue(). Directories whose relative
 * path (with a trailing '/') fully matches the prune pattern are not
 * descended into. Symlinks are only considered when followSymlinks_ is set;
 * in that case already-seen directories are rejected to avoid loops.
 *
 * @return true when the whole tree was explored without any error; false if
 *         a directory could not be opened/read, a stat call failed, a
 *         directory was reached twice, or the exploration was aborted
 *         (including an abort in the middle of the last directory).
 */
bool DirectorySourceQueue::explore() {
  WLOG(INFO) << "Exploring root dir " << rootDir_
             << " include_pattern : " << includePattern_
             << " exclude_pattern : " << excludePattern_
             << " prune_dir_pattern : " << pruneDirPattern_;
  WDT_CHECK(!rootDir_.empty());
  bool hasError = false;
  // Full paths of directories already queued; only used when following
  // symlinks.
  std::set<string> visited;
  std::regex includeRegex(includePattern_);
  std::regex excludeRegex(excludePattern_);
  std::regex pruneDirRegex(pruneDirPattern_);
  // Relative paths (each either empty or ending with '/') still to be listed.
  std::deque<string> todoList;
  todoList.push_back("");
  while (!todoList.empty()) {
    if (threadCtx_->getAbortChecker()->shouldAbort()) {
      WLOG(ERROR) << "Directory transfer thread aborted";
      hasError = true;
      break;
    }
    // Move the front element out instead of copying it before the pop.
    const string relativePath = std::move(todoList.front());
    todoList.pop_front();
    // NOTE(review): plain concatenation - presumably rootDir_ already ends
    // with a path separator; confirm against where rootDir_ is set.
    const string fullPath = rootDir_ + relativePath;
    WVLOG(1) << "Processing directory " << fullPath;
    DIR *dirPtr = opendir(fullPath.c_str());
    if (!dirPtr) {
      WPLOG(ERROR) << "Error opening dir " << fullPath;
      failedDirectories_.emplace_back(fullPath);
      hasError = true;
      continue;
    }
    // http://elliotth.blogspot.com/2012/10/how-not-to-use-readdirr3.html
    // tl;dr readdir is actually better than readdir_r ! (because of the
    // nastiness of calculating correctly buffer size and race conditions there)
    struct dirent *dirEntryRes = nullptr;
    // Set when the abort checker fires while this directory is being listed.
    bool aborted = false;
    // This loop is only left through 'break', so closedir() below always runs.
    while (true) {
      if (threadCtx_->getAbortChecker()->shouldAbort()) {
        aborted = true;
        break;
      }
      // readdir() returns null both at end of directory and on error;
      // clearing errno first is the only way to tell the two apart.
      errno = 0;
      dirEntryRes = readdir(dirPtr);
      if (!dirEntryRes) {
        if (errno) {
          WPLOG(ERROR) << "Error reading dir " << fullPath;
          hasError = true;
        } else {
          // finished reading dir
          WVLOG(2) << "Done with " << fullPath;
        }
        break;
      }
      const auto dType = dirEntryRes->d_type;
      WVLOG(2) << "Found entry " << dirEntryRes->d_name << " type "
               << static_cast<int>(dType);
      // Skip the "." and ".." entries (but not other dot-files).
      if (dirEntryRes->d_name[0] == '.') {
        if (dirEntryRes->d_name[1] == '\0' ||
            (dirEntryRes->d_name[1] == '.' && dirEntryRes->d_name[2] == '\0')) {
          WVLOG(3) << "Skipping entry : " << dirEntryRes->d_name;
          continue;
        }
      }
      // Following code is a bit ugly trying to save stat() call for directories
      // yet still work for xfs which returns DT_UNKNOWN for everything
      // would be simpler to always stat()
      // if we reach DT_DIR and DT_REG directly:
      bool isDir = (dType == DT_DIR);
      bool isLink = (dType == DT_LNK);
      bool keepEntry = (isDir || dType == DT_REG || dType == DT_UNKNOWN);
      if (followSymlinks_) {
        keepEntry |= isLink;
      }
      if (!keepEntry) {
        WVLOG(3) << "Ignoring entry type " << static_cast<int>(dType);
        continue;
      }
      string newRelativePath = relativePath + string(dirEntryRes->d_name);
      string newFullPath = rootDir_ + newRelativePath;
      if (!isDir) {
        // DT_REG, DT_LNK or DT_UNKNOWN cases
        struct stat fileStat;
        // On XFS we don't know yet if this is a symlink, so check
        // if following symlinks is ok we will do stat() too
        if (lstat(newFullPath.c_str(), &fileStat) != 0) {
          WPLOG(ERROR) << "lstat() failed on path " << newFullPath;
          hasError = true;
          continue;
        }
        isLink = S_ISLNK(fileStat.st_mode);
        WVLOG(2) << "lstat for " << newFullPath << " is link ? " << isLink;
        if (followSymlinks_ && isLink) {
          // Use stat to see if the pointed file is of the right type
          // (overrides previous stat call result)
          if (stat(newFullPath.c_str(), &fileStat) != 0) {
            WPLOG(ERROR) << "stat() failed on path " << newFullPath;
            hasError = true;
            continue;
          }
          // From here on newFullPath is the link target, while
          // newRelativePath keeps the name as seen under rootDir_.
          newFullPath = resolvePath(newFullPath);
          if (newFullPath.empty()) {
            // already logged error
            hasError = true;
            continue;
          }
          WVLOG(2) << "Resolved symlink " << dirEntryRes->d_name << " to "
                   << newFullPath;
        }
        // could dcheck that if DT_REG we better be !isDir
        isDir = S_ISDIR(fileStat.st_mode);
        // if we were DT_UNKNOWN this could still be a symlink, block device
        // etc... (xfs)
        if (S_ISREG(fileStat.st_mode)) {
          WVLOG(2) << "Found file " << newFullPath << " of size "
                   << fileStat.st_size;
          // Both filters use regex_match, i.e. the whole relative path must
          // match; exclusion is evaluated before inclusion.
          if (!excludePattern_.empty() &&
              std::regex_match(newRelativePath, excludeRegex)) {
            continue;
          }
          if (!includePattern_.empty() &&
              !std::regex_match(newRelativePath, includeRegex)) {
            continue;
          }
          WdtFileInfo fileInfo(newRelativePath, fileStat.st_size, directReads_);
          createIntoQueue(newFullPath, fileInfo);
          continue;
        }
      }
      if (isDir) {
        if (followSymlinks_) {
          // Reaching the same directory twice is reported as an error rather
          // than silently skipped.
          if (visited.find(newFullPath) != visited.end()) {
            WLOG(ERROR) << "Attempted to visit directory twice: "
                        << newFullPath;
            hasError = true;
            continue;
          }
          // TODO: consider custom hashing ignoring common prefix
          visited.insert(newFullPath);
        }
        // The prune pattern is matched against the path with trailing '/'.
        newRelativePath.push_back('/');
        if (pruneDirPattern_.empty() ||
            !std::regex_match(newRelativePath, pruneDirRegex)) {
          WVLOG(2) << "Adding " << newRelativePath;
          todoList.push_back(std::move(newRelativePath));
        }
      }
    }
    closedir(dirPtr);
    if (aborted) {
      // Report the abort here instead of relying on the next outer iteration:
      // if this was the last queued directory the outer loop would simply
      // end and a partial exploration would be reported as a success.
      WLOG(ERROR) << "Directory transfer thread aborted";
      hasError = true;
      break;
    }
  }
  WLOG(INFO) << "Number of files explored: " << numEntries_ << " opened "
             << numFilesOpened_ << " with direct " << numFilesOpenedWithDirect_
             << " errors " << std::boolalpha << hasError;
  return !hasError;
}