/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <csignal>
#include <cstdlib>
#include <iostream>
#include <string>
#include <filesystem>
#include <clang/Frontend/CompilerInstance.h>
#include <clang/Tooling/CommonOptionsParser.h>
#include <clang/Tooling/Tooling.h>
#include <llvm/Support/CommandLine.h>
#include <llvm/Support/ErrorHandling.h>
#include <boost/algorithm/string/predicate.hpp>
#include <folly/Conv.h>
#include <folly/executors/GlobalExecutor.h>
#include <folly/FileUtil.h>
#include <folly/json.h>
#include <folly/Range.h>
#if FACEBOOK
#include "common/init/Init.h"
#else
#include <folly/init/Init.h>
#endif
#include "thrift/lib/cpp/transport/TTransportException.h"
#include "glean/cpp/glean.h"
#include "glean/cpp/sender.h"
#include "glean/interprocess/cpp/counters.h"
#include "glean/interprocess/cpp/worklist.h"
#include "glean/lang/clang/action.h"
#include "glean/lang/clang/ast.h"
#include "glean/lang/clang/preprocessor.h"
#include "glean/rts/binary.h"
#include "glean/rts/inventory.h"
DEFINE_string(service, "", "TIER or HOST:PORT of Glean server");
DEFINE_string(dump, "", "dump the produced batch to the file at PATH");
DEFINE_string(work_file, "", "PATH to work file");
DEFINE_string(task, "", "task id (for logging)");
DEFINE_string(request, "", "request id (for logging)");
DEFINE_string(origin, "", "origin (for logging)");
DEFINE_string(inventory, "", "PATH to inventory file");
DEFINE_string(root, "", "root repository PATH");
DEFINE_string(blank_cell_name, "", "buck cell name to be output as nothing");
DEFINE_string(cwd_subdir, "", "current working subdirectory under --root");
DEFINE_string(target_subdir, "", "clang target subdirectory under --root");
DEFINE_string(path_prefix, "",
"Path fragment to prefix files with (e.g. 'fbsource')");
DEFINE_string(repo_name, "", "repository name (e.g., fbsource)");
DEFINE_string(repo_hash, "", "repository hash as produced by the VCS");
DEFINE_int32(max_comm_errors, 30,
"maximum number of consecutive communication errors");
DEFINE_int32(stop_after, 0, "stop after N files");
DEFINE_int32(max_rss, 6291456, "stop after RSS reaches this size (kB)");
DEFINE_bool(dry_run, false, "don't send data");
DEFINE_bool(fact_stats, true, "log fact statistics");
DEFINE_uint64(fact_cache, 805306368, "set maximum fact cache size");
DEFINE_uint64(fact_buffer, 201326592, "set maximum fact buffer size");
DEFINE_uint32(log_every, 1, "log every N translation units");
DEFINE_uint32(worker_index, 0, "index of this worker");
DEFINE_uint32(worker_count, 1, "total number of workers");
DEFINE_string(counter_file, "", "PATH to stats counter file");
DEFINE_string(counters, "", "comma-separated list of NAME@N");
DEFINE_bool(suppress_diagnostics, false, "suppress all Clang diagnostics");
DEFINE_bool(fail_on_error, false, "immediately fail on compilation errors");
DEFINE_bool(index_on_error, false, "index files that have compilation errors");
DEFINE_string(clang_arguments, "", "arguments to pass to Clang");
DEFINE_bool(clang_no_pch, false, "disable PCH");
DEFINE_bool(clang_no_modules, false, "disable modules");
DEFINE_string(clang_resource_dir, "", "PATH to Clang resource dir");
// Index a single compilation database (cdb)
DEFINE_string(cdb_target, "", "Target name");
DEFINE_string(cdb_dir, "", "Directory with compile_commands.json in it");
static llvm::cl::OptionCategory indexerCategory("glean");
// This file implements some plumbing and the main function for the
// Clang indexer
namespace {
using namespace facebook::glean;
using namespace facebook::glean::clangx;
#define LOG_CFG(level,config) LOG(level) << (config).log_pfx
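// Optional shared-memory progress counters. The --counters flag names which
// counters map to slots in the --counter_file; any counter not mapped there
// falls back to a process-local atomic so callers can always store to it.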
struct Counters {
using counter_t = interprocess::Counters::counter_t;
Counters() {
std::vector<std::pair<std::string, size_t>> names;
if (!FLAGS_counter_file.empty()) {
size_t size = 0;
const auto spec = FLAGS_counters;
const auto end = spec.end();
auto pos = spec.begin();
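// --counters is a comma-separated list of NAME@INDEX entries; INDEX is the
// slot in the shared counter file assigned to that counter.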
while (pos != end) {
auto a = std::find(pos,end,'@');
if (a == end) {
throw std::runtime_error("invalid --counter");
}
auto b = a+1;
auto e = std::find(b,end,',');
auto index = folly::to<size_t>(std::string(b,e));
size = std::max(size, index+1);
names.push_back({std::string(pos,a), index});
pos = e;
if (pos != end) {
++pos;
}
}
counters = interprocess::counters(FLAGS_counter_file, size);
}
const std::unordered_map<std::string, counter_t**> fields{
{"fact_buffer_size", &fact_buffer_size},
{"fact_cache_size", &fact_cache_size},
{"fact_cache_hits", &fact_cache_hits},
{"fact_cache_misses", &fact_cache_misses},
};
for (const auto& x : fields) {
*(x.second) = nullptr;
}
for (const auto& x : names) {
auto p = fields.find(x.first);
if (p != fields.end()) {
*(p->second) = counters->counter(x.second);
} else {
LOG(ERROR) << "unknown counter '" << x.first << "'";
}
}
for (const auto& x : fields) {
if (*(x.second) == nullptr) {
locals.emplace_back(0);
*(x.second) = &locals.back();
}
}
}
counter_t *fact_buffer_size;
counter_t *fact_cache_size;
counter_t *fact_cache_hits;
counter_t *fact_cache_misses;
std::unique_ptr<interprocess::Counters> counters;
std::deque<std::atomic<uint64_t>> locals;
};
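// Indexer configuration assembled from command-line flags and from the JSON
// target files passed as positional arguments.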
struct Config {
std::filesystem::path root;
// subdir of root for setting current working directory
folly::Optional<std::string> cwd_subdir;
// subdir of root for interpreting relative clang paths
folly::Optional<std::string> target_subdir;
folly::Optional<std::string> path_prefix;
folly::Optional<std::string> platform;
std::string log_pfx;
std::unique_ptr<Sender> sender;
bool should_log;
std::unique_ptr<DbSchema<SCHEMA>> schema;
std::unique_ptr<clang::DiagnosticConsumer> diagnostics;
std::vector<SourceFile> sources;
Counters counters;
Config(int argc, char **argv) {
assert (argc > 0);
root = std::filesystem::canonical(FLAGS_root);
if (!FLAGS_cwd_subdir.empty()) {
cwd_subdir = FLAGS_cwd_subdir;
}
if (!FLAGS_target_subdir.empty()) {
target_subdir = FLAGS_target_subdir;
}
if (!FLAGS_path_prefix.empty()) {
path_prefix = FLAGS_path_prefix;
}
log_pfx = folly::to<std::string>(FLAGS_worker_index) + ": ";
#if FACEBOOK
if (!FLAGS_service.empty()) {
// Full logging if we are talking to a remote service
should_log = true;
if (FLAGS_repo_name.empty()) {
fail("missing repo_name");
}
if (FLAGS_repo_hash.empty()) {
fail("missing repo_hash");
}
sender = thriftSender(
FLAGS_service,
FLAGS_repo_name,
FLAGS_repo_hash,
10, // hardcode min_retry_delay for now
static_cast<size_t>(FLAGS_max_comm_errors)
);
} else
#endif
if (!FLAGS_dump.empty()) {
// No logging when dumping to a file
should_log = false;
sender = fileWriter(FLAGS_dump);
} else {
fail("missing --service or --dump");
}
if (FLAGS_inventory.empty()) {
fail("missing --inventory");
} else {
std::string contents;
if (!folly::readFile(FLAGS_inventory.c_str(), contents)) {
fail("couldn't read " + FLAGS_inventory);
}
schema = std::make_unique<DbSchema<SCHEMA>>(
rts::Inventory::deserialize(binary::byteRange(contents)));
}
diagnostics = diagnosticConsumer();
// Add targets from json
for (int i = 1; i < argc; ++i) {
std::string contents;
if (!folly::readFile(argv[i], contents)) {
fail(std::string("couldn't read ") + argv[i]);
}
for (const auto& item : folly::parseJson(contents)) {
folly::Optional<std::string> platform;
if (auto *p = item.get_ptr("platform")) {
platform = p->getString();
}
sources.push_back(SourceFile{
item["target"].getString(),
std::move(platform),
item["dir"].getString(),
item["file"].getString()});
}
}
// Add sources from single cdb options
if (!FLAGS_cdb_target.empty() || !FLAGS_cdb_dir.empty()) {
if (FLAGS_cdb_target.empty()) {
fail("missing --cdb-target");
}
if (FLAGS_cdb_dir.empty()) {
fail("missing --cdb-dir");
}
std::string err;
std::string dir = FLAGS_cdb_dir.c_str();
auto cdb = clang::tooling::CompilationDatabase::loadFromDirectory(dir, err);
if (!cdb) {
throw std::runtime_error("couldn't load " + dir + ": " + err);
}
for (const auto& file : cdb->getAllFiles()) {
sources.push_back(SourceFile{
FLAGS_cdb_target.c_str(),
folly::Optional<std::string>(),
FLAGS_cdb_dir.c_str(),
file,
});
}
}
}
ActionLogger logger(const std::string &name) {
return ActionLogger(name,
FLAGS_task,
FLAGS_request,
FLAGS_repo_name,
FLAGS_repo_hash,
FLAGS_worker_index,
FLAGS_origin,
FLAGS_cwd_subdir,
should_log);
}
[[noreturn]] void fail(const std::string& msg) const {
LOG_CFG(FATAL, *this) << msg;
}
static std::unique_ptr<clang::DiagnosticConsumer> diagnosticConsumer() {
if (FLAGS_suppress_diagnostics) {
// It's important to use the default DiagnosticConsumer rather
// than IgnoringDiagConsumer, because IgnoringDiagConsumer
// doesn't even count the errors. ClangTool::run() will always
// return true when using IgnoringDiagConsumer, because it looks
// at getNumErrors().
return std::make_unique<clang::DiagnosticConsumer>();
} else {
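// Returning nullptr keeps Clang's default diagnostic client, so
// diagnostics are printed as usual.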
return {};
}
}
};
// A wrapper for a Clang compilation database
class CDB {
public:
// Load the compilation database for a particular source file if it is
// different from the one loaded before. The pointer is valid until the
// next call to load as long as the CDB object is alive.
const clang::tooling::CompilationDatabase *load(const SourceFile& source) {
if (!cdb || dir != source.dir) {
std::string err;
dir = source.dir;
cdb = clang::tooling::CompilationDatabase::loadFromDirectory(dir, err);
if (!cdb) {
throw std::runtime_error("couldn't load " + dir + ": " + err);
}
}
return cdb.get();
}
private:
std::unique_ptr<clang::tooling::CompilationDatabase> cdb;
std::string dir;
};
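// Indexes individual source files: looks up each file's compilation database,
// configures a ClangTool with adjusted arguments and runs the indexing
// FrontendAction, writing facts into the shared batch.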
struct SourceIndexer {
const Config& config;
Batch<SCHEMA> batch;
CDB cdb;
explicit SourceIndexer(Config& cfg)
: config(cfg)
, batch(cfg.schema.get(), FLAGS_fact_cache)
{
blank_cell_name = (!FLAGS_blank_cell_name.empty())
? folly::Optional<std::string>(FLAGS_blank_cell_name)
: folly::none;
}
bool index(const SourceFile& source) {
auto pcdb = cdb.load(source);
ClangCfg cfg{
ClangDB::Env{
locatorOf(source),
platformOf(source),
config.root,
config.target_subdir,
config.path_prefix,
batch,
},
config.diagnostics.get()
};
FrontendActionFactory factory(&cfg);
clang::tooling::ClangTool tool(*pcdb, source.file);
if (!FLAGS_clang_arguments.empty()) {
clang::tooling::CommandLineArguments args;
folly::split(" ", FLAGS_clang_arguments, args, true);
if (!args.empty()) {
tool.appendArgumentsAdjuster(
clang::tooling::getInsertArgumentAdjuster(
args, clang::tooling::ArgumentInsertPosition::END));
}
}
tool.appendArgumentsAdjuster([](const auto& args, auto) {
clang::tooling::CommandLineArguments stripped;
stripped.reserve(args.size());
for (size_t i = 0; i < args.size(); ++i) {
if (FLAGS_clang_no_pch && args[i] == "-include-pch") {
++i; // skip next argument
} else if (FLAGS_clang_no_pch
&& args[i] == "-include"
&& i+1 < args.size()
&& boost::ends_with(args[i+1],".pch")) {
// headers included from the command line cause problems
// because they won't be visible to tools exploring the
// include graph via the cxx.Trace predicate. When used to
// include a .pch file these are typically a compile-time
// optimisation only, so we strip them out when
// --clang_no_pch is on.
++i;
} else if (!FLAGS_clang_resource_dir.empty()
&& args[i] == "-resource-dir") {
++i; // replace -resource-dir flag
stripped.push_back("-resource-dir");
stripped.push_back(FLAGS_clang_resource_dir);
} else if (boost::starts_with(args[i], "--cc=")) {
// skip this flag - llvm complains about it
} else if (FLAGS_clang_no_modules &&
(args[i] == "-fmodules"
|| args[i] == "-fcxx-modules"
|| boost::starts_with(args[i], "-fmodule-name="))) {
// skip these
} else {
stripped.push_back(args[i]);
}
}
return stripped;
});
return tool.run(&factory) == 0;
}
private:
folly::Optional<std::string> blank_cell_name;
Fact<Buck::Locator> locatorOf(const SourceFile& source) {
// Parsing source.target as cell//path:name
const auto slashes = source.target.find("//");
const size_t cell_len = (slashes == std::string::npos) ? 0 : slashes;
const size_t path_start = (slashes == std::string::npos) ? 0 : slashes+2;
const auto colon = source.target.find(':', path_start);
const size_t path_len = (colon == std::string::npos)
? source.target.size() - path_start
: colon - path_start;
const size_t name_start = (colon == std::string::npos)
? source.target.size() // substr will return empty string for name
: colon+1;
folly::Optional<std::string> cell = (0 < cell_len)
? folly::Optional<std::string>(source.target.substr(0, cell_len))
: folly::none;
// Enforce policy that buck.Locator{subdir=nothing} means blank_cell_name
if (cell == blank_cell_name) {
cell = folly::none;
}
return batch.fact<Buck::Locator>(
maybe(cell),
source.target.substr(path_start, path_len),
source.target.substr(name_start)
);
}
folly::Optional<Fact<Buck::Platform>> platformOf(
const SourceFile& file) {
if (file.platform) {
return batch.fact<Buck::Platform>(file.platform.value());
} else {
return folly::none;
}
}
struct ClangCfg {
ClangDB::Env env;
clang::DiagnosticConsumer *diagnostics;
};
// FrontendAction uses the ClangIndexer to plumb PPCallbacks and ASTConsumer
struct FrontendAction : public clang::ASTFrontendAction {
using Base = clang::ASTFrontendAction;
explicit FrontendAction(const ClangCfg *cfg) : config(cfg) {}
bool BeginSourceFileAction(clang::CompilerInstance& ci) override {
if (config->diagnostics) {
config->diagnostics->clear();
ci.getDiagnostics().setClient(config->diagnostics, false);
}
db = std::make_unique<ClangDB>(config->env, ci);
ci.getPreprocessor().addPPCallbacks(
facebook::glean::clangx::newPPCallbacks(db.get()));
return Base::BeginSourceFileAction(ci);
}
std::unique_ptr<clang::ASTConsumer> CreateASTConsumer(
clang::CompilerInstance&, clang::StringRef) override {
CHECK(db);
return facebook::glean::clangx::newASTConsumer(db.get());
}
const ClangCfg *config;
std::unique_ptr<ClangDB> db;
};
struct FrontendActionFactory : public clang::tooling::FrontendActionFactory {
explicit FrontendActionFactory(const ClangCfg *cfg) : config(cfg) {}
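// From LLVM 11 onwards create() returns a std::unique_ptr instead of a raw
// pointer, hence the version guard below.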
#if LLVM_VERSION_MAJOR >= 11
std::unique_ptr<clang::FrontendAction> create() override {
return std::unique_ptr<clang::FrontendAction>(new FrontendAction(config));
}
#else
clang::FrontendAction *create() override {
return new FrontendAction(config);
}
#endif
const ClangCfg *config;
};
};
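// Helpers for rendering fact counts and byte sizes in human-readable units
// for the periodic log messages below.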
using stats_vec = std::vector<std::pair<size_t, const char *>>;
const stats_vec counts = {{1000*1000, "m"}, {1000, "k"}};
const stats_vec mems = {{1024*1024, "MB"}, {1024, "KB"}, {0, "B"}};
folly::fbstring showStat(const stats_vec& vec, size_t n) {
for (const auto& p : vec) {
if (n >= p.first) {
if (p.first != 0) {
n /= p.first;
}
return folly::to<folly::fbstring>(n) + p.second;
}
}
return folly::to<folly::fbstring>(n);
}
folly::fbstring showStats(const FactStats& stats) {
return
showStat(mems, stats.memory) + " (" + showStat(counts, stats.count) + ")";
}
struct FatalLLVMError : std::runtime_error {
using std::runtime_error::runtime_error;
};
void handleLLVMError(void *, const std::string& reason, bool) {
throw FatalLLVMError(reason);
}
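// Current resident set size in kB, parsed from /proc/self/smaps_rollup;
// returns 0 if the file can't be read or no Rss line is found.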
int getSelfRSS() {
std::string contents;
if (!folly::readFile("/proc/self/smaps_rollup", contents)) {
LOG(ERROR) << "Couldn't read /proc/self/smaps_rollup";
return 0;
}
std::vector<std::string> lines;
folly::split("\n", contents, lines);
for (const auto& line : lines) {
folly::StringPiece piece(line);
if (piece.subpiece(0, 4) == "Rss:") {
auto rss = piece.subpiece(4);
return folly::to<int>(&rss);
}
}
return 0;
}
}
int main(int argc, char **argv) {
#if FACEBOOK
facebook::initFacebook(&argc, &argv);
#else
folly::init(&argc, &argv);
#endif
std::signal(SIGTERM, [](int) {
#if FACEBOOK
LOG(CRITICAL)
#else
LOG(ERROR)
#endif
<< "worker " << FLAGS_worker_index << " received SIGTERM, exiting";
_exit(1);
});
Config config(argc, argv);
std::filesystem::current_path(
config.root / std::filesystem::path(config.cwd_subdir.value_or("")));
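// With no --work_file a single serial counter walks all sources; otherwise a
// shared work-stealing counter splits the sources across worker processes.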
const auto work_counter = FLAGS_work_file.empty()
? worklist::serialCounter(0, config.sources.size())
: worklist::stealingCounter(
FLAGS_work_file, FLAGS_worker_index, FLAGS_worker_count);
SourceIndexer indexer(config);
llvm::install_fatal_error_handler(&handleLLVMError, nullptr);
const size_t n = FLAGS_stop_after != 0
? std::min(size_t(FLAGS_stop_after), config.sources.size())
: config.sources.size();
FactStats prev_stats = {0,0};
FactStats lifetime_stats = {0,0};
uint32_t lifetime_files = 0;
bool error_exit = false;
bool memory_exit = false;
int rss = 0;
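// Main loop: index one translation unit per iteration, update stats and
// shared counters, send the accumulated batch unless --dry_run, and stop
// early on errors, --stop_after or memory pressure.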
for (auto next = work_counter->next();
next.has_value();
next = work_counter->next()) {
const auto i = next.value().start;
auto errorGuard = folly::makeGuard([&] {
LOG_CFG(ERROR,config) << "error guard at "
<< i+1 << "/" << next.value().end << " [" << n << "] "
<< config.sources[i].file;
});
if (FLAGS_log_every != 0 && (lifetime_files % FLAGS_log_every) == 0) {
LOG_CFG(INFO,config)
<< i+1 << "/" << next.value().end << " [" << n << "] "
<< config.sources[i].file;
if (FLAGS_fact_stats) {
LOG_CFG(INFO,config)
<< "fact buffer: " << showStats(indexer.batch.bufferStats())
<< " cache: " << showStats(indexer.batch.cacheStats().facts)
<< " lifetime: " << showStats(lifetime_stats);
}
}
const auto& source = config.sources[i];
const auto buf_stats = indexer.batch.bufferStats();
const auto cache_stats = indexer.batch.cacheStats();
try {
bool ok = config
.logger("clang/index")
.log_index(source, buf_stats, cache_stats, [&]() {
return indexer.index(source);
});
if (!ok && FLAGS_fail_on_error) {
LOG(ERROR) << "compilation failed for " << source.file;
error_exit = true;
}
} catch(const FatalLLVMError& e) {
// TODO: log this to Scuba if it turns out to happen a lot
LOG(ERROR) << "fatal LLVM error in " << source.file << ": " << e.what();
error_exit = true;
} catch(const std::exception& e) {
LOG(ERROR) << "while indexing " << source.file << ": " << e.what();
error_exit = FLAGS_fail_on_error;
}
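// Lifetime fact totals are accumulated as the growth of the (periodically
// rebased) fact buffer between iterations.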
lifetime_stats.memory += buf_stats.memory - prev_stats.memory;
lifetime_stats.count += buf_stats.count - prev_stats.count;
config.counters.fact_buffer_size->store(buf_stats.memory);
config.counters.fact_cache_size->store(cache_stats.facts.memory);
config.counters.fact_cache_hits->store(cache_stats.hits);
config.counters.fact_cache_misses->store(cache_stats.misses);
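// Ship the batch. Once the local fact buffer exceeds --fact_buffer, pass
// wait=true to rebaseAndSend so the buffer can drain before continuing.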
if (!FLAGS_dry_run) {
const bool wait =
FLAGS_fact_buffer != 0 && buf_stats.memory >= FLAGS_fact_buffer;
if (wait) {
LOG_CFG(INFO,config)
<< "fact buffer size " << buf_stats.memory << ", waiting";
}
config.logger(wait ? "clang/wait" : "clang/send").log([&]() {
config.sender->rebaseAndSend(indexer.batch.base(), wait);
});
}
prev_stats = indexer.batch.bufferStats();
++lifetime_files;
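// Check resident memory against --max_rss after every file; exceeding it
// makes the worker flush and exit with code 147 (see below).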
memory_exit = FLAGS_max_rss != 0 && (rss = getSelfRSS()) > FLAGS_max_rss;
errorGuard.dismiss();
if ((FLAGS_stop_after != 0 && lifetime_files >= FLAGS_stop_after)
|| memory_exit
|| error_exit) {
LOG_CFG(WARNING,config)
<< "Exiting after "
<< i+1 << "/" << next.value().end << " [" << n << "] "
<< config.sources[i].file;
// we do not want the for loop to call work_counter->next()
// because that will skip the next target for no good reason
break;
}
}
if (!FLAGS_dry_run) {
LOG_CFG(INFO,config) << "flushing";
config.logger("clang/flush").log([&]() {
config.sender->flush(indexer.batch.base());
});
}
config.counters.fact_buffer_size->store(0);
config.counters.fact_cache_size->store(0);
if (memory_exit) {
LOG_CFG(ERROR, config)
<< "Exiting due to memory pressure, RSS was " << rss
<< " kB, RSS after flushing is " << getSelfRSS()
<< " kB, --max-rss is " << FLAGS_max_rss << " kB";
}
LOG_CFG(INFO,config)
<< (error_exit || memory_exit ? "aborting" : "finished")
<< ", lifetime files: " << lifetime_files
<< " facts: " << showStats(lifetime_stats);
if (memory_exit) {
return 147;
}
if (error_exit) {
return 1;
}
return 0;
}