libredex/ProguardLexer.cpp (686 lines of code) (raw):
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <boost/multi_index/hashed_index.hpp>
#include <boost/multi_index/member.hpp>
#include <boost/multi_index_container.hpp>
#include <cctype>
#include <istream>
#include <unordered_map>
#include <utility>
#include <vector>
#include "Debug.h"
#include "Macros.h"
#include "ProguardLexer.h"
namespace keep_rules {
namespace proguard_parser {
namespace {
// Character that separates the entries of a multi-path argument: ';' when
// IS_WINDOWS is set, ':' otherwise.
#if IS_WINDOWS
constexpr char kPathDelim = ';';
#else
constexpr char kPathDelim = ':';
#endif
// Returns true if `ch` ends a word: any whitespace character, one of the
// punctuation characters `{ } ( ) , ; :`, the comment starter `#`, or a char
// whose value compares equal to EOF.
bool is_deliminator(char ch) {
  // The <cctype> classifiers are only defined for values representable as
  // unsigned char (or EOF). Plain char may be signed, so bytes of non-ASCII
  // text would otherwise be passed as negative values.
  return isspace(static_cast<unsigned char>(ch)) || ch == '{' || ch == '}' ||
         ch == '(' || ch == ')' || ch == ',' || ch == ';' || ch == ':' ||
         ch == EOF || ch == '#';
}
// Returns true for the punctuation characters that are rejected inside an
// identifier: = + | @ # ^ & " ' ` ~ -
bool is_not_idenfitier_character(char ch) {
  switch (ch) {
  case '=':
  case '+':
  case '|':
  case '@':
  case '#':
  case '^':
  case '&':
  case '"':
  case '\'':
  case '`':
  case '~':
  case '-':
    return true;
  default:
    return false;
  }
}
// Decides whether `ident` may name a class, a field or a package.
// https://docs.oracle.com/javase/specs/jls/se16/html/jls-3.html#jls-JavaLetter
//
// Java identifiers may contain letters from many scripts, so an exact
// membership test would be complex. Instead, every character is checked
// against the characters that can never appear in an identifier (delimiters
// and a fixed set of punctuation); anything else is accepted, at the cost of
// some false positives. An empty `ident` is accepted.
bool is_identifier(const std::string_view& ident) {
  for (size_t pos = 0; pos < ident.size(); ++pos) {
    const char current = ident[pos];
    const bool acceptable =
        !is_deliminator(current) && !is_not_idenfitier_character(current);
    if (!acceptable) {
      return false;
    }
  }
  return true;
}
// Drops leading whitespace from `data` and adds one to `*line` for every
// newline that was dropped. `data` is left empty if it held only whitespace.
void skip_whitespace(std::string_view& data, unsigned int* line) {
  size_t consumed = 0;
  while (consumed != data.size()) {
    const char ch = data[consumed];
    // Cast before classifying: <cctype> functions are undefined for negative
    // values, which plain char yields for non-ASCII bytes on signed-char
    // targets.
    if (!isspace(static_cast<unsigned char>(ch))) {
      break;
    }
    if (ch == '\n') {
      ++(*line);
    }
    ++consumed;
  }
  data.remove_prefix(consumed);
}
// Reads one file path from the front of `data` and advances `data` past it.
//
// Leading whitespace is skipped first (updating `*line`). An empty view is
// returned, so that optional path arguments can be omitted, when the input
// is exhausted or the next character is '-' (the start of another command).
//
// A path may be wrapped in double quotes; the quotes are not part of the
// returned view. An unquoted path ends at whitespace or at kPathDelim. A
// quoted path ends at the closing quote, but also at kPathDelim.
std::string_view read_path(std::string_view& data, unsigned int* line) {
  skip_whitespace(data, line);
  // Handle the case for optional filepath arguments by
  // returning an empty filepath.
  if (data.empty() || data[0] == '-') {
    return std::string_view();
  }

  const bool has_quotes = data[0] == '"';
  const size_t start = has_quotes ? 1 : 0;
  size_t end = start;
  for (; end != data.size(); ++end) {
    const char c = data[end];
    // Cast before classifying: <cctype> functions are undefined for the
    // negative values that non-ASCII path bytes take on signed-char targets.
    if (c == kPathDelim ||
        (!has_quotes && isspace(static_cast<unsigned char>(c)))) {
      break;
    }
    if (c == '"' && has_quotes) {
      // Step over the closing quote so that it is consumed with the path.
      ++end;
      break;
    }
  }

  if (start == end) {
    data = data.substr(start);
    return std::string_view(); // Should maybe be an error.
  }

  // Exclude the closing quote, if one was consumed, from the returned view.
  size_t adjusted_end = end;
  if (has_quotes && data[adjusted_end - 1] == '"') {
    --adjusted_end;
  }

  const auto ret = data.substr(start, adjusted_end - start);
  data = data.substr(end);
  return ret;
}
// Reads a kPathDelim-separated list of paths from the front of `data`.
// Each entry pairs the path with the value of `*line` right after that path
// was read. At least one entry is always produced; it may be an empty path.
std::vector<std::pair<std::string_view, unsigned int>> read_paths(
    std::string_view& data, unsigned int* line) {
  std::vector<std::pair<std::string_view, unsigned int>> result;
  for (;;) {
    const std::string_view path = read_path(data, line);
    result.emplace_back(path, *line);
    skip_whitespace(data, line);
    const bool more_paths = !data.empty() && data.front() == kPathDelim;
    if (!more_paths) {
      break;
    }
    // Step over the separator before reading the next path.
    data.remove_prefix(1);
  }
  return result;
}
// Splits `data` at the first character for which `fn` returns true.
// Returns everything before that character and leaves `data` starting at it.
// If no character matches, all of `data` is returned and `data` becomes
// empty. When kSkipWs is set, leading whitespace is skipped first (updating
// `*line`).
template <bool kSkipWs, typename FilterFn>
std::string_view parse_part_fn(std::string_view& data,
                               unsigned int* line,
                               FilterFn fn) {
  if constexpr (kSkipWs) {
    skip_whitespace(data, line);
  }

  const auto stop = std::find_if(data.begin(), data.end(), fn);
  if (stop == data.end()) {
    const std::string_view everything = data;
    data = std::string_view();
    return everything;
  }

  const auto length = static_cast<size_t>(stop - data.begin());
  const std::string_view head = data.substr(0, length);
  data = data.substr(length);
  return head;
}
// Skips leading whitespace, then consumes and returns the longest run of
// digits and '.' characters at the front of `data`. The result is empty when
// the next character is neither.
std::string_view read_target_version(std::string_view& data,
                                     unsigned int* line) {
  // Cast before calling isdigit: <cctype> functions are undefined for the
  // negative values that non-ASCII bytes take on signed-char targets.
  auto is_version_character = [](char ch) {
    return ch == '.' || isdigit(static_cast<unsigned char>(ch));
  };
  return parse_part_fn</*kSkipWs=*/true>(
      data, line, [&](char c) { return !is_version_character(c); });
}
// Skips leading whitespace, then consumes and returns the longest run of
// package-name characters at the front of `data`: alphanumerics, '.', '\'',
// '_' and '$'. The result is empty when the next character is none of these.
std::string_view parse_package_name(std::string_view& data,
                                    unsigned int* line) {
  // Cast before calling isalnum: <cctype> functions are undefined for the
  // negative values that non-ASCII bytes take on signed-char targets.
  auto pkg_name_char = [](char ch) {
    return isalnum(static_cast<unsigned char>(ch)) || ch == '.' ||
           ch == '\'' || ch == '_' || ch == '$';
  };
  return parse_part_fn</*kSkipWs=*/true>(
      data, line, [&](char c) { return !pkg_name_char(c); });
}
// Lexes a single filter pattern from the front of `data`.
//
// Leading whitespace is skipped (updating `*line`). Returns false, leaving
// `*filter` untouched, when the argument is missing: either the input is
// exhausted or the next character is '-', which starts another command.
// Otherwise stores everything up to the next ',' or whitespace character in
// `*filter`, leaves `data` at that character and returns true.
bool lex_filter(std::string_view& data,
                std::string_view* filter,
                unsigned int* line) {
  skip_whitespace(data, line);
  // Make sure we are not at the end of the file or the start of another
  // command when the argument is missing.
  if (data.empty() || data[0] == '-') {
    return false;
  }
  // Cast before calling isspace: <cctype> functions are undefined for the
  // negative values that non-ASCII bytes take on signed-char targets.
  *filter = parse_part_fn</*kSkipWs=*/false>(data, line, [](char c) {
    return c == ',' || isspace(static_cast<unsigned char>(c));
  });
  return true;
}
std::vector<std::string_view> lex_filter_list(std::string_view& data,
unsigned int* line) {
std::vector<std::string_view> filter_list;
std::string_view filter;
bool ok = lex_filter(data, &filter, line);
if (!ok) {
return filter_list;
}
filter_list.push_back(filter);
skip_whitespace(data, line);
while (ok && !data.empty() && data[0] == ',') {
// Swallow up the comma.
data = data.substr(1);
ok = lex_filter(data, &filter, line);
if (ok) {
filter_list.push_back(filter);
skip_whitespace(data, line);
}
}
return filter_list;
}
// std::unordered_map does not work with string views, so a Boost.MultiIndex
// container is used instead. This is its element type: a minimal key/value
// pair. `second` is mutable so that the value can be changed through a const
// reference to the element.
template <class T, class Q>
struct MyPair {
  T first; // Key.
  mutable Q second; // Mapped value.
};
// Equality predicate that accepts any mix of std::string and
// std::string_view operands and compares their character contents.
struct StringViewEquals {
  // Reference comparison; the mixed overloads below funnel into this one.
  bool operator()(const std::string_view& v1,
                  const std::string_view& v2) const {
    return v1 == v2;
  }
  bool operator()(const std::string& s1, const std::string& s2) const {
    return (*this)(std::string_view(s1), std::string_view(s2));
  }
  bool operator()(const std::string& s1, const std::string_view& v2) const {
    return (*this)(std::string_view(s1), v2);
  }
  bool operator()(const std::string_view& v1, const std::string& s2) const {
    return (*this)(v1, std::string_view(s2));
  }
};
using namespace boost::multi_index;
template <typename Q>
using UnorderedStringViewIndexableMap = multi_index_container<
MyPair<std::string_view, Q>,
indexed_by<hashed_unique<member<MyPair<std::string_view, Q>,
std::string_view,
&MyPair<std::string_view, Q>::first>,
boost::hash<std::string_view>,
StringViewEquals>>>;
} // namespace
std::string Token::show() const {
switch (type) {
case TokenType::openCurlyBracket:
return "{";
case TokenType::closeCurlyBracket:
return "}";
case TokenType::openBracket:
return "(";
case TokenType::closeBracket:
return ")";
case TokenType::semiColon:
return ";";
case TokenType::colon:
return ":";
case TokenType::notToken:
return "!";
case TokenType::comma:
return ",";
case TokenType::slash:
return "/";
case TokenType::classToken:
return "class";
case TokenType::publicToken:
return "public";
case TokenType::final:
return "final";
case TokenType::abstract:
return "abstract";
case TokenType::interface:
return "interface";
case TokenType::enumToken:
return "enum";
case TokenType::extends:
return "extends";
case TokenType::implements:
return "implements";
case TokenType::privateToken:
return "private";
case TokenType::protectedToken:
return "protected";
case TokenType::staticToken:
return "static";
case TokenType::volatileToken:
return "volatile";
case TokenType::transient:
return "transient";
case TokenType::annotation:
return "@interface";
case TokenType::annotation_application:
return "@";
case TokenType::synchronized:
return "synchronized";
case TokenType::native:
return "native";
case TokenType::strictfp:
return "strictfp";
case TokenType::synthetic:
return "synthetic";
case TokenType::bridge:
return "bridge";
case TokenType::varargs:
return "varargs";
case TokenType::command:
return "-" + std::string(data);
case TokenType::identifier:
return "identifier: " + std::string(data);
case TokenType::arrayType:
return "[]";
case TokenType::filepath:
return "filepath " + std::string(data);
case TokenType::target_version_token:
return std::string(data);
case TokenType::filter_pattern:
return "filter: " + std::string(data);
case TokenType::eof_token:
return "<EOF>";
case TokenType::comment:
return "#" + std::string(data);
// Input/Output Options
case TokenType::include:
return "-include";
case TokenType::basedirectory:
return "-basedirectory";
case TokenType::dump:
return "-dump";
case TokenType::injars:
return "-injars ";
case TokenType::outjars:
return "-outjars ";
case TokenType::libraryjars:
return "-libraryjars ";
case TokenType::keepdirectories:
return "-keepdirectories";
case TokenType::target:
return "-target ";
case TokenType::dontskipnonpubliclibraryclasses:
return "-dontskipnonpubliclibraryclasses";
// Keep Options
case TokenType::keep:
return "-keep";
case TokenType::keepclassmembers:
return "-keepclassmembers";
case TokenType::keepclasseswithmembers:
return "-keepclasseswithmembers";
case TokenType::keepnames:
return "-keepnames";
case TokenType::keepclassmembernames:
return "-keepclassmembernames";
case TokenType::keepclasseswithmembernames:
return "-keepclasseswithmembernames";
case TokenType::printseeds:
return "-printseeds ";
// Keep Option Modifiers
case TokenType::includedescriptorclasses_token:
return "includedescriptorclasses";
case TokenType::allowshrinking_token:
return "allowshrinking";
case TokenType::allowoptimization_token:
return "allowoptimization";
case TokenType::allowobfuscation_token:
return "allowobfuscation";
// Shrinking Options
case TokenType::dontshrink:
return "-dontshrink";
case TokenType::printusage:
return "-printusage";
case TokenType::whyareyoukeeping:
return "-whyareyoukeeping";
// Optimization Options
case TokenType::dontoptimize:
return "-dontoptimize";
case TokenType::optimizations:
return "-optimizations";
case TokenType::optimizationpasses:
return "-optimizationpasses";
case TokenType::assumenosideeffects:
return "-assumenosideeffects";
case TokenType::mergeinterfacesaggressively:
return "-mergeinterfacesaggressively";
case TokenType::allowaccessmodification_token:
return "-allowaccessmodification";
case TokenType::returns:
return "return";
// Obfuscation Options
case TokenType::dontobfuscate:
return "-dontobfuscate ";
case TokenType::printmapping:
return "-printmapping ";
case TokenType::repackageclasses:
return "-repackageclasses";
case TokenType::keepattributes:
return "-keepattributes";
case TokenType::dontusemixedcaseclassnames_token:
return "-dontusemixedcaseclassnames";
case TokenType::keeppackagenames:
return "-keeppackagenames";
// Preverification Options
case TokenType::dontpreverify_token:
return "-dontpreverify";
// General Options
case TokenType::printconfiguration:
return "-printconfiguration ";
case TokenType::dontwarn:
return "-dontwarn";
case TokenType::verbose_token:
return "-verbose";
case TokenType::unknownToken:
return "unknown token at line " + std::to_string(line) + " : " +
std::string(data);
}
not_reached();
}
// Reports whether this token is a command (an option introduced by '-').
// The switch lists every token type explicitly, grouped by result.
// NOTE(review): `returns` is reported as a command although show() spells it
// as the bare word "return"; kept as is, confirm it is intended.
bool Token::is_command() const {
  switch (type) {
  // Punctuation, keywords, payload tokens and keep-option modifiers.
  case TokenType::openCurlyBracket:
  case TokenType::closeCurlyBracket:
  case TokenType::openBracket:
  case TokenType::closeBracket:
  case TokenType::semiColon:
  case TokenType::colon:
  case TokenType::notToken:
  case TokenType::comma:
  case TokenType::slash:
  case TokenType::classToken:
  case TokenType::publicToken:
  case TokenType::final:
  case TokenType::abstract:
  case TokenType::interface:
  case TokenType::enumToken:
  case TokenType::extends:
  case TokenType::implements:
  case TokenType::privateToken:
  case TokenType::protectedToken:
  case TokenType::staticToken:
  case TokenType::volatileToken:
  case TokenType::transient:
  case TokenType::annotation:
  case TokenType::annotation_application:
  case TokenType::synchronized:
  case TokenType::native:
  case TokenType::strictfp:
  case TokenType::synthetic:
  case TokenType::bridge:
  case TokenType::varargs:
  case TokenType::identifier:
  case TokenType::arrayType:
  case TokenType::filepath:
  case TokenType::target_version_token:
  case TokenType::filter_pattern:
  case TokenType::eof_token:
  case TokenType::comment:
  case TokenType::includedescriptorclasses_token:
  case TokenType::allowshrinking_token:
  case TokenType::allowoptimization_token:
  case TokenType::allowobfuscation_token:
  case TokenType::unknownToken:
    return false;

  // Unrecognized "-xyz" commands.
  case TokenType::command:
  // Input/Output Options
  case TokenType::include:
  case TokenType::basedirectory:
  case TokenType::dump:
  case TokenType::injars:
  case TokenType::outjars:
  case TokenType::libraryjars:
  case TokenType::keepdirectories:
  case TokenType::target:
  case TokenType::dontskipnonpubliclibraryclasses:
  // Keep Options
  case TokenType::keep:
  case TokenType::keepclassmembers:
  case TokenType::keepclasseswithmembers:
  case TokenType::keepnames:
  case TokenType::keepclassmembernames:
  case TokenType::keepclasseswithmembernames:
  case TokenType::printseeds:
  // Shrinking Options
  case TokenType::dontshrink:
  case TokenType::printusage:
  case TokenType::whyareyoukeeping:
  // Optimization Options
  case TokenType::dontoptimize:
  case TokenType::optimizations:
  case TokenType::optimizationpasses:
  case TokenType::assumenosideeffects:
  case TokenType::mergeinterfacesaggressively:
  case TokenType::allowaccessmodification_token:
  case TokenType::returns:
  // Obfuscation Options
  case TokenType::dontobfuscate:
  case TokenType::printmapping:
  case TokenType::repackageclasses:
  case TokenType::keepattributes:
  case TokenType::dontusemixedcaseclassnames_token:
  case TokenType::keeppackagenames:
  // Preverification Options
  case TokenType::dontpreverify_token:
  // General Options
  case TokenType::printconfiguration:
  case TokenType::dontwarn:
  case TokenType::verbose_token:
    return true;
  }
  not_reached();
}
std::vector<Token> lex(const std::string_view& in) {
std::unordered_map<char, TokenType> simple_tokens{
{'{', TokenType::openCurlyBracket},
{'}', TokenType::closeCurlyBracket},
{'(', TokenType::openBracket},
{')', TokenType::closeBracket},
{';', TokenType::semiColon},
{':', TokenType::colon},
{',', TokenType::comma},
{'!', TokenType::notToken},
{'/', TokenType::slash},
{'@', TokenType::annotation_application},
};
using TokenMap = UnorderedStringViewIndexableMap<TokenType>;
TokenMap word_tokens{
{"includedescriptorclasses", TokenType::includedescriptorclasses_token},
{"allowshrinking", TokenType::allowshrinking_token},
{"allowoptimization", TokenType::allowoptimization_token},
{"allowobfuscation", TokenType::allowobfuscation_token},
{"class", TokenType::classToken},
{"public", TokenType::publicToken},
{"final", TokenType::final},
{"abstract", TokenType::abstract},
{"enum", TokenType::enumToken},
{"private", TokenType::privateToken},
{"protected", TokenType::protectedToken},
{"static", TokenType::staticToken},
{"volatile", TokenType::volatileToken},
{"transient", TokenType::transient},
{"synchronized", TokenType::synchronized},
{"native", TokenType::native},
{"strictfp", TokenType::strictfp},
{"synthetic", TokenType::synthetic},
{"bridge", TokenType::bridge},
{"varargs", TokenType::varargs},
{"extends", TokenType::extends},
{"implements", TokenType::implements},
{"return", TokenType::returns},
};
TokenMap simple_commands{
// Keep Options
{"keep", TokenType::keep},
{"keepclassmembers", TokenType::keepclassmembers},
{"keepclasseswithmembers", TokenType::keepclasseswithmembers},
{"keepnames", TokenType::keepnames},
{"keepclassmembernames", TokenType::keepclassmembernames},
{"keepclasseswithmembernames", TokenType::keepclasseswithmembernames},
// Shrinking Options
{"dontshrink", TokenType::dontshrink},
{"whyareyoukeeping", TokenType::whyareyoukeeping},
// Optimization Options
{"assumenosideeffects", TokenType::assumenosideeffects},
{"allowaccessmodification", TokenType::allowaccessmodification_token},
{"dontoptimize", TokenType::dontoptimize},
{"optimizationpasses", TokenType::optimizationpasses},
{"mergeinterfacesaggressively", TokenType::mergeinterfacesaggressively},
// Obfuscation Options
{"dontobfuscate", TokenType::dontobfuscate},
{"dontusemixedcaseclassnames",
TokenType::dontusemixedcaseclassnames_token},
{"dontskipnonpubliclibraryclasses",
TokenType::dontskipnonpubliclibraryclasses},
// Preverification Options.
{"dontpreverify", TokenType::dontpreverify_token},
// General Options
{"verbose", TokenType::verbose_token},
};
TokenMap single_filepath_commands{
// Input/Output Options
{"include", TokenType::include},
{"basedirectory", TokenType::basedirectory},
{"dump", TokenType::dump},
{"printmapping", TokenType::printmapping},
{"printconfiguration", TokenType::printconfiguration},
{"printseeds", TokenType::printseeds},
// Shrinking Options
{"printusage", TokenType::printusage},
};
TokenMap multi_filepaths_commands{
// Input/Output Options
{"injars", TokenType::injars},
{"outjars", TokenType::outjars},
{"libraryjars", TokenType::libraryjars},
// Keep Options
{"keepdirectories", TokenType::keepdirectories},
};
TokenMap filter_list_commands{
// Optimization Options
{"optimizations", TokenType::optimizations},
// Obfuscation Options
{"keepattributes", TokenType::keepattributes},
// General Options
{"dontwarn", TokenType::dontwarn},
{"keeppackagenames", TokenType::keeppackagenames},
};
std::vector<Token> tokens;
tokens.reserve(std::max((size_t)1, in.size() / 20)); // 5% ratio.
unsigned int line = 1;
auto add_token = [&](TokenType type) { tokens.emplace_back(type, line); };
auto add_token_data = [&](TokenType type, const std::string_view& data) {
tokens.emplace_back(type, line, data);
};
auto add_token_line_data =
[&](TokenType type, size_t t_line, const std::string_view& data) {
tokens.emplace_back(type, t_line, data);
};
std::string_view data = in;
while (!data.empty()) {
char ch = data[0];
// Skip comments.
if (ch == '#') {
auto eol_pos = data.find('\n');
std::string_view comment_data;
if (eol_pos != std::string_view::npos) {
comment_data = data.substr(1, eol_pos - 1);
data = data.substr(eol_pos + 1);
} else {
comment_data = data.substr(1);
data = std::string_view();
}
tokens.emplace_back(TokenType::comment, line, comment_data);
++line;
continue;
}
auto consume_ws = [&line, &data]() {
size_t index = 0;
for (; index != data.size(); ++index) {
char c = data[index];
if (c == '\n') {
line++;
continue;
}
if (!isspace(c)) {
break;
}
}
data = data.substr(index);
};
// Skip whitespaces.
if (isspace(ch)) {
consume_ws();
continue;
}
{
auto it = simple_tokens.find(ch);
if (it != simple_tokens.end()) {
add_token(it->second);
data = data.substr(1);
continue;
}
}
if (ch == '[') {
auto old_view = data;
data = data.substr(1);
consume_ws(); // Consume any whitespace
// Check for closing brace.
if (data.empty()) {
add_token_data(TokenType::unknownToken, old_view);
continue;
}
if (data[0] == ']') {
add_token(TokenType::arrayType);
data = data.substr(1);
continue;
}
// Any token other than a ']' next is a bad token.
}
// Check for commands.
if (ch == '-') {
data = data.substr(1);
auto command =
parse_part_fn</*kSkipWs=*/false>(data, &line, is_deliminator);
{
auto it = simple_commands.find(command);
if (it != simple_commands.end()) {
add_token(it->second);
continue;
}
}
{
auto it = single_filepath_commands.find(command);
if (it != single_filepath_commands.end()) {
add_token(it->second);
auto path = read_path(data, &line);
if (!path.empty()) {
add_token_data(TokenType::filepath, path);
}
continue;
}
}
{
auto it = multi_filepaths_commands.find(command);
if (it != multi_filepaths_commands.end()) {
add_token(it->second);
auto paths = read_paths(data, &line);
for (auto& path : paths) {
add_token_line_data(TokenType::filepath, path.second, path.first);
}
continue;
}
}
{
auto it = filter_list_commands.find(command);
if (it != filter_list_commands.end()) {
add_token(it->second);
for (auto& filter : lex_filter_list(data, &line)) {
add_token_data(TokenType::filter_pattern, filter);
}
continue;
}
}
// Input/Output Options
if (command == "target") {
add_token(TokenType::target);
auto version = read_target_version(data, &line);
if (!version.empty()) {
add_token_data(TokenType::target_version_token, version);
}
continue;
}
// Obfuscation Options
if (command == "repackageclasses") {
add_token(TokenType::repackageclasses);
auto package_name = parse_package_name(data, &line);
if (!package_name.empty()) {
add_token_data(TokenType::identifier, package_name);
}
continue;
}
// Some other command.
add_token_data(TokenType::command, command);
continue;
}
auto word = parse_part_fn</*kSkipWs=*/false>(data, &line, is_deliminator);
{
auto it = word_tokens.find(word);
if (it != word_tokens.end()) {
add_token(it->second);
continue;
}
}
if (word == "interface") {
// If the previous symbol was a @ then this is really an annotation.
if (!tokens.empty() &&
tokens.back().type == TokenType::annotation_application) {
tokens.pop_back();
add_token(TokenType::annotation);
} else {
add_token(TokenType::interface);
}
continue;
}
if (is_identifier(word)) {
add_token_data(TokenType::identifier, word);
continue;
}
// This is an unrecognized token.
add_token_data(TokenType::unknownToken, word);
}
add_token(TokenType::eof_token);
return tokens;
}
} // namespace proguard_parser
} // namespace keep_rules