source/Positions.cpp (311 lines of code) (raw):
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <cstdio>
#include <fstream>
#include <boost/algorithm/string.hpp>
#include <boost/filesystem.hpp>
#include <fmt/format.h>
#include <re2/re2.h>
#include <ConcurrentContainers.h>
#include <DexClass.h>
#include <IRCode.h>
#include <Show.h>
#include <SpartaWorkQueue.h>
#include <Walkers.h>
#include <mariana-trench/Assert.h>
#include <mariana-trench/JsonValidation.h>
#include <mariana-trench/Log.h>
#include <mariana-trench/Positions.h>
#include <mariana-trench/Timer.h>
namespace marianatrench {
namespace {
constexpr int k_unknown_line = -1;
// Performance optimization to avoid calling more expensive regex matches on
// every line.
bool maybe_class(const std::string& line) {
return line.find("class") != std::string::npos ||
line.find("interface") != std::string::npos ||
line.find("object") != std::string::npos ||
line.find("enum") != std::string::npos;
}
} // namespace
Positions::Positions() {}
Positions::Positions(const Options& options, const DexStoresVector& stores) {
if (options.skip_source_indexing()) {
// Create a dummy path for all methods.
for (auto& scope : DexStoreClassesIterator(stores)) {
walk::parallel::methods(scope, [&](DexMethod* method) {
auto class_name = method->get_class()->str();
class_name = class_name.substr(0, class_name.find(";"));
class_name = class_name.substr(0, class_name.find("$")) + ";";
if (class_name.size() > 2) {
class_name = class_name.substr(1, class_name.size() - 2);
}
std::string path = class_name + ".java";
method_to_path_.emplace(method, paths_.insert(path).first);
});
}
} else {
// Find all Java/Kotlin files in the source root directory.
Timer paths_timer;
LOG(2,
"Finding files to index in `{}`...",
options.source_root_directory());
auto current_path = boost::filesystem::current_path();
boost::filesystem::current_path(options.source_root_directory());
std::string hg_command =
"hg files --include=**.java --include=**.kt --include=**.mustache --exclude=.ovrsource-rest";
std::string find_command =
"find . -type f \\( -iname \\*.java -o -iname \\*.kt -iname \\*.mustache \\) -not -path ./.ovrsource-rest/\\*";
int return_code = -1;
std::string output;
output = execute_and_catch_output(hg_command, return_code);
if (return_code != EXIT_SUCCESS) {
WARNING(
1,
"Source directory is not a mercurial repository. Trying `find` to discover files.");
output = execute_and_catch_output(find_command, return_code);
if (return_code != EXIT_SUCCESS) {
ERROR(1, "`find` failed, no source file will be indexed.");
}
}
std::vector<std::string> paths;
boost::split(paths, output, boost::is_any_of("\n"));
LOG(2,
"Found {} files in {:.2f}s.",
paths.size(),
paths_timer.duration_in_seconds());
// Find top-level classes in files.
Timer index_timer;
LOG(2, "Indexing classes...");
std::atomic<std::size_t> iteration(0);
ConcurrentMap<std::string, std::string> class_to_path;
re2::RE2 package_regex("^package\\s+([^;]+)(?:;|$)");
re2::RE2 class_regex(
"^\\s*(?:/\\*.*\\*/)?\\s*(?:public|internal|private)?\\s*(?:abstract|data|final|open)?\\s*(?:class|enum|interface|object)\\s+([A-z0-9]+)");
std::unordered_set<std::string> skipped_package_prefixes = {
"android/",
};
auto exclude_directories = options.source_exclude_directories();
for (auto& exclude_directory : exclude_directories) {
if (!boost::ends_with(exclude_directory, "/")) {
exclude_directory.push_back('/');
}
}
auto queue = sparta::work_queue<std::string*>(
[&](std::string* path) {
iteration++;
if (iteration % 10000 == 0) {
LOG(2, "Indexed {} of {} files.", iteration.load(), paths.size());
}
if (boost::starts_with(*path, "./")) {
// Remove the `./` prefix added by `find`.
path->erase(0, 2);
}
if (path->empty()) {
return;
}
for (const auto& exclude_directory : exclude_directories) {
if (boost::starts_with(*path, exclude_directory)) {
return;
}
}
std::optional<std::string> package = std::nullopt;
std::ifstream stream(*path);
std::string line;
while (std::getline(stream, line)) {
re2::StringPiece package_match;
// Using capturing groups with `re2` is very slow, so we only
// capture if we know the regex matches. This gives a huge
// performance boost.
if (!package && re2::RE2::PartialMatch(line, package_regex) &&
re2::RE2::PartialMatch(line, package_regex, &package_match)) {
package = package_match.as_string();
boost::replace_all(*package, ".", "/");
if (std::any_of(
skipped_package_prefixes.begin(),
skipped_package_prefixes.end(),
[&](const auto& skipped_prefix) {
return boost::starts_with(*package, skipped_prefix);
})) {
LOG(3, "Skipping module `{}` at `{}`...", *package, *path);
return;
}
if (boost::ends_with(*path, ".kt")) {
auto pos = path->find_last_of("/");
if (pos != std::string::npos) {
auto filename = path->substr(pos + 1, path->size() - pos - 4);
auto classname = fmt::format("L{}/{}Kt;", *package, filename);
class_to_path.update(
classname,
[&path](
const std::string& /* classname */,
std::string& value,
bool exists) {
if (exists && value < *path) {
return;
}
value = *path;
});
}
}
}
re2::StringPiece class_match;
if (package && maybe_class(line) &&
re2::RE2::PartialMatch(line, class_regex) &&
re2::RE2::PartialMatch(line, class_regex, &class_match)) {
auto classname = fmt::format("L{}/{};", *package, class_match);
class_to_path.update(
classname,
[&path](
const std::string& /* classname */,
std::string& value,
bool exists) {
if (exists && value < *path) {
return;
}
value = *path;
});
}
}
},
sparta::parallel::default_num_threads());
for (auto& path : paths) {
queue.add_item(&path);
}
queue.run_all();
boost::filesystem::current_path(current_path);
LOG(2,
"Indexed {} top-level classes in {:.2f}s.",
class_to_path.size(),
index_timer.duration_in_seconds());
Timer method_paths_timer;
LOG(2, "Indexing method paths...");
for (auto& scope : DexStoreClassesIterator(stores)) {
walk::parallel::methods(scope, [&](DexMethod* method) {
auto class_name = method->get_class()->str();
class_name = class_name.substr(0, class_name.find(";"));
class_name = class_name.substr(0, class_name.find("$")) + ";";
std::string path = class_to_path.get(class_name, /* default */ "");
if (!path.empty()) {
method_to_path_.emplace(method, paths_.insert(path).first);
}
});
}
LOG(2,
"Indexed {} method paths in {:.2f}s.",
method_to_path_.size(),
method_paths_timer.duration_in_seconds());
}
Timer method_lines_timer;
LOG(2, "Indexing method lines...");
// Index first lines of methods because building the control flow graph will
// destroy them.
for (auto& scope : DexStoreClassesIterator(stores)) {
walk::parallel::methods(scope, [&](DexMethod* method) {
const auto* code = method->get_code();
if (!code) {
return;
}
for (const auto& instruction : *code) {
if (instruction.type == MFLOW_POSITION) {
const auto* instruction_position = instruction.pos.get();
if (instruction_position) {
// Assume the method signature is on the previous line.
int line =
std::max(static_cast<int>(instruction_position->line) - 1, 0);
method_to_line_.emplace(method, line);
}
break;
}
}
});
}
LOG(2,
"Indexed {} method lines in {:.2f}s.",
method_to_line_.size(),
method_lines_timer.duration_in_seconds());
}
std::string Positions::execute_and_catch_output(
const std::string& command,
int& return_code) {
auto pclose_wrapper = [&return_code](FILE* command) {
return_code = pclose(command);
};
std::string output;
std::unique_ptr<FILE, decltype(pclose_wrapper)> pipe(
popen(command.c_str(), "r"), pclose_wrapper);
if (!pipe) {
throw std::runtime_error(
fmt::format("Unable to open pipe to command `{}`.", command));
}
std::array<char, 128> buffer;
while (fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr) {
output += buffer.data();
}
// Function scope ensures the unique pointer goes out of scope and
// thus ensures invoking the close wrapper
return output;
}
const Position* Positions::get(
const DexMethod* method,
const DexPosition* position,
std::optional<Root> port,
const IRInstruction* instruction) const {
mt_assert(method != nullptr);
const std::string* path = method_to_path_.get(method, /* default */ nullptr);
int line = k_unknown_line;
if (position) {
line = position->line;
} else {
line = method_to_line_.get(method, /* default */ k_unknown_line);
}
return positions_.insert(Position(path, line, port, instruction)).first;
}
const Position* Positions::get(
const Method* method,
const DexPosition* position,
std::optional<Root> port,
const IRInstruction* instruction) const {
return get(method->dex_method(), position, port, instruction);
}
const Position* Positions::get(
const DexMethod* method,
int line,
std::optional<Root> port,
const IRInstruction* instruction,
int start,
int end) const {
mt_assert(method != nullptr);
const std::string* path = method_to_path_.get(method, /* default */ nullptr);
return positions_.insert(Position(path, line, port, instruction, start, end))
.first;
}
const Position* Positions::get(
const std::optional<std::string>& path,
int line,
std::optional<Root> port,
const IRInstruction* instruction,
int start,
int end) const {
auto position = Position(
/* path */ path ? paths_.insert(*path).first : nullptr,
/* line */ line,
/* port */ port,
/* instruction */ instruction,
/* start */ start,
/* end */ end);
return positions_.insert(position).first;
}
const Position* Positions::get(
const Position* position,
std::optional<Root> port,
const IRInstruction* instruction) const {
auto new_position = Position(
/* path */ position->path() ? paths_.insert(*position->path()).first
: nullptr,
/* line */ position->line(),
/* port */ port,
/* instruction */ instruction);
return positions_.insert(new_position).first;
}
const Position*
Positions::get(const Position* position, int line, int start, int end) const {
auto new_position = Position(
/* path */ position->path() ? paths_.insert(*position->path()).first
: nullptr,
/* line */ line,
/* port */ position->port(),
/* instruction */ position->instruction(),
/* start */ start,
/* end */ end);
return positions_.insert(new_position).first;
};
const Position* Positions::unknown() const {
return positions_.insert(Position(nullptr, k_unknown_line)).first;
}
} // namespace marianatrench