in bistro/runners/RemoteWorkerRunner.cpp [438:505]
// Collects the addresses of all workers that are neither UNHEALTHY nor
// MUST_DIE and forwards the log query to getJobLogsThreadAndEventBaseSafe(),
// together with a human-readable note naming the workers that were skipped.
//
// Throws BistroException when there is no worker left to query -- either
// because the worker pool is empty, or because every known worker was
// skipped.
LogLines RemoteWorkerRunner::getJobLogs(
    const string& logtype,
    const vector<string>& jobs,
    const vector<string>& nodes,
    int64_t line_id,
    bool is_ascending,
    const string& regex_filter) const {
  // Every eligible worker gets queried.  That is wasteful, but it makes
  // locating a task's logs far simpler, since:
  //  1) Logs for different iterations of one task may live on different
  //     workers.
  //  2) The logs API supports multi-queries, which, in some cases, force us
  //     to hit all workers anyway.
  std::vector<cpp2::ServiceAddress> addrs_to_query;
  std::vector<std::string> unhealthy_shards;
  std::vector<std::string> lost_shards;
  SYNCHRONIZED_CONST(workers_) {
    for (const auto& pool_entry : workers_.workerPool()) {
      const auto& remote_worker = pool_entry.second;
      const auto& bistro_worker = remote_worker->getBistroWorker();
      // Fetching logs from an unhealthy worker can be slow and hurts the
      // user experience, so such workers are skipped and reported as a
      // "transient" error right away instead.
      const auto worker_state = remote_worker->getState();
      if (worker_state == RemoteWorkerState::State::UNHEALTHY) {
        unhealthy_shards.push_back(*bistro_worker.shard_ref());
        continue;
      }
      if (worker_state == RemoteWorkerState::State::MUST_DIE) {
        lost_shards.push_back(*bistro_worker.shard_ref());
        continue;
      }
      addrs_to_query.push_back(*bistro_worker.addr_ref());
    }
  }

  // Describe the workers we are not going to query, so the user can be told
  // about them.  Format: "unhealthy: a, b; lost: c" (either part may be
  // absent; the result is empty when nothing was skipped).
  std::vector<std::string> skipped_parts;
  if (!unhealthy_shards.empty()) {
    skipped_parts.push_back(
      "unhealthy: " + folly::join(", ", unhealthy_shards)
    );
  }
  if (!lost_shards.empty()) {
    skipped_parts.push_back("lost: " + folly::join(", ", lost_shards));
  }
  std::string skipped_note = folly::join("; ", skipped_parts);

  // With nobody to ask, fail immediately with the most specific message.
  if (addrs_to_query.empty()) {
    if (!skipped_note.empty()) {
      throw BistroException(
        "All workers are unhealthy; cannot query logs. Known workers: ",
        skipped_note
      );
    }
    throw BistroException("No workers connected; cannot query logs.");
  }

  return getJobLogsThreadAndEventBaseSafe(
    skipped_note,
    addrs_to_query,
    logtype,
    jobs,
    nodes,
    line_id,
    is_ascending,
    regex_filter,
    workerClientFn_
  );
}