std::vector<Token> lex(const std::string_view& in)

From libredex/ProguardLexer.cpp, lines 527-821.


// Tokenize a ProGuard configuration file into a flat stream of Tokens.
//
// Walks `in` exactly once, emitting one Token per lexical element and
// maintaining a 1-based line counter so later diagnostics can point at the
// offending line. Always appends a trailing eof_token.
//
// @param in  the whole configuration text; the returned tokens hold
//            string_views into it, so `in` must outlive the result.
// @return    the token stream, terminated by TokenType::eof_token.
std::vector<Token> lex(const std::string_view& in) {
  // Single-character tokens that need no lookahead.
  std::unordered_map<char, TokenType> simple_tokens{
      {'{', TokenType::openCurlyBracket},
      {'}', TokenType::closeCurlyBracket},
      {'(', TokenType::openBracket},
      {')', TokenType::closeBracket},
      {';', TokenType::semiColon},
      {':', TokenType::colon},
      {',', TokenType::comma},
      {'!', TokenType::notToken},
      {'/', TokenType::slash},
      {'@', TokenType::annotation_application},
  };

  using TokenMap = UnorderedStringViewIndexableMap<TokenType>;

  // Bare keywords (modifiers, class kinds, etc.) that map 1:1 to a token.
  TokenMap word_tokens{
      {"includedescriptorclasses", TokenType::includedescriptorclasses_token},
      {"allowshrinking", TokenType::allowshrinking_token},
      {"allowoptimization", TokenType::allowoptimization_token},
      {"allowobfuscation", TokenType::allowobfuscation_token},
      {"class", TokenType::classToken},
      {"public", TokenType::publicToken},
      {"final", TokenType::final},
      {"abstract", TokenType::abstract},
      {"enum", TokenType::enumToken},
      {"private", TokenType::privateToken},
      {"protected", TokenType::protectedToken},
      {"static", TokenType::staticToken},
      {"volatile", TokenType::volatileToken},
      {"transient", TokenType::transient},
      {"synchronized", TokenType::synchronized},
      {"native", TokenType::native},
      {"strictfp", TokenType::strictfp},
      {"synthetic", TokenType::synthetic},
      {"bridge", TokenType::bridge},
      {"varargs", TokenType::varargs},
      {"extends", TokenType::extends},
      {"implements", TokenType::implements},
      {"return", TokenType::returns},
  };

  // "-foo" commands that take no arguments.
  TokenMap simple_commands{
      // Keep Options
      {"keep", TokenType::keep},
      {"keepclassmembers", TokenType::keepclassmembers},
      {"keepclasseswithmembers", TokenType::keepclasseswithmembers},
      {"keepnames", TokenType::keepnames},
      {"keepclassmembernames", TokenType::keepclassmembernames},
      {"keepclasseswithmembernames", TokenType::keepclasseswithmembernames},

      // Shrinking Options
      {"dontshrink", TokenType::dontshrink},

      {"whyareyoukeeping", TokenType::whyareyoukeeping},

      // Optimization Options
      {"assumenosideeffects", TokenType::assumenosideeffects},
      {"allowaccessmodification", TokenType::allowaccessmodification_token},
      {"dontoptimize", TokenType::dontoptimize},
      {"optimizationpasses", TokenType::optimizationpasses},
      {"mergeinterfacesaggressively", TokenType::mergeinterfacesaggressively},

      // Obfuscation Options
      {"dontobfuscate", TokenType::dontobfuscate},
      {"dontusemixedcaseclassnames",
       TokenType::dontusemixedcaseclassnames_token},
      {"dontskipnonpubliclibraryclasses",
       TokenType::dontskipnonpubliclibraryclasses},

      // Preverification Options.
      {"dontpreverify", TokenType::dontpreverify_token},

      // General Options
      {"verbose", TokenType::verbose_token},
  };

  // "-foo <path>" commands: exactly one filepath argument follows.
  TokenMap single_filepath_commands{
      // Input/Output Options
      {"include", TokenType::include},
      {"basedirectory", TokenType::basedirectory},
      {"dump", TokenType::dump},
      {"printmapping", TokenType::printmapping},
      {"printconfiguration", TokenType::printconfiguration},
      {"printseeds", TokenType::printseeds},
      // Shrinking Options
      {"printusage", TokenType::printusage},
  };
  // "-foo <path>(:<path>)*" commands: a list of filepaths follows.
  TokenMap multi_filepaths_commands{
      // Input/Output Options
      {"injars", TokenType::injars},
      {"outjars", TokenType::outjars},
      {"libraryjars", TokenType::libraryjars},
      // Keep Options
      {"keepdirectories", TokenType::keepdirectories},
  };

  // "-foo <filter>(,<filter>)*" commands: a filter list follows.
  TokenMap filter_list_commands{
      // Optimization Options
      {"optimizations", TokenType::optimizations},
      // Obfuscation Options
      {"keepattributes", TokenType::keepattributes},
      // General Options
      {"dontwarn", TokenType::dontwarn},
      {"keeppackagenames", TokenType::keeppackagenames},
  };

  std::vector<Token> tokens;
  tokens.reserve(std::max((size_t)1, in.size() / 20)); // 5% ratio.

  unsigned int line = 1;

  auto add_token = [&](TokenType type) { tokens.emplace_back(type, line); };
  auto add_token_data = [&](TokenType type, const std::string_view& data) {
    tokens.emplace_back(type, line, data);
  };
  auto add_token_line_data =
      [&](TokenType type, size_t t_line, const std::string_view& data) {
        tokens.emplace_back(type, t_line, data);
      };

  std::string_view data = in;

  // Advance `data` past leading whitespace, bumping `line` for each newline.
  // Hoisted out of the main loop so the closure is constructed only once.
  auto consume_ws = [&line, &data]() {
    size_t index = 0;
    for (; index != data.size(); ++index) {
      char c = data[index];
      if (c == '\n') {
        line++;
        continue;
      }
      // Cast to unsigned char: passing a negative char to isspace is UB.
      if (!isspace(static_cast<unsigned char>(c))) {
        break;
      }
    }
    data = data.substr(index);
  };

  while (!data.empty()) {
    char ch = data[0];

    // Comments run from '#' to end of line and are preserved as tokens.
    if (ch == '#') {
      auto eol_pos = data.find('\n');
      if (eol_pos != std::string_view::npos) {
        tokens.emplace_back(TokenType::comment, line,
                            data.substr(1, eol_pos - 1));
        data = data.substr(eol_pos + 1);
        ++line; // The terminating newline was consumed.
      } else {
        // Comment at EOF without a trailing newline: no newline was
        // consumed, so the line counter stays put.
        tokens.emplace_back(TokenType::comment, line, data.substr(1));
        data = std::string_view();
      }
      continue;
    }

    // Skip whitespaces.
    if (isspace(static_cast<unsigned char>(ch))) {
      consume_ws();
      continue;
    }

    {
      auto it = simple_tokens.find(ch);
      if (it != simple_tokens.end()) {
        add_token(it->second);
        data = data.substr(1);
        continue;
      }
    }

    // '[' is only meaningful when followed (after optional whitespace) by
    // ']', together forming an array-type token.
    if (ch == '[') {
      auto old_view = data;
      data = data.substr(1);
      consume_ws(); // Consume any whitespace
      if (data.empty()) {
        // Dangling '[' at EOF: surface the remainder as an unknown token.
        add_token_data(TokenType::unknownToken, old_view);
        continue;
      }
      if (data[0] == ']') {
        add_token(TokenType::arrayType);
        data = data.substr(1);
        continue;
      }
      // Any token other than a ']' next is a bad token; fall through and
      // lex whatever follows (the '[' itself produces no token).
    }

    // Check for commands ("-name ...").
    if (ch == '-') {
      data = data.substr(1);
      auto command =
          parse_part_fn</*kSkipWs=*/false>(data, &line, is_deliminator);

      {
        auto it = simple_commands.find(command);
        if (it != simple_commands.end()) {
          add_token(it->second);
          continue;
        }
      }

      {
        auto it = single_filepath_commands.find(command);
        if (it != single_filepath_commands.end()) {
          add_token(it->second);
          auto path = read_path(data, &line);
          if (!path.empty()) {
            add_token_data(TokenType::filepath, path);
          }
          continue;
        }
      }

      {
        auto it = multi_filepaths_commands.find(command);
        if (it != multi_filepaths_commands.end()) {
          add_token(it->second);
          auto paths = read_paths(data, &line);
          for (auto& path : paths) {
            // Each path carries the line it was read on, which may differ
            // from the current `line` after multi-line continuations.
            add_token_line_data(TokenType::filepath, path.second, path.first);
          }
          continue;
        }
      }

      {
        auto it = filter_list_commands.find(command);
        if (it != filter_list_commands.end()) {
          add_token(it->second);
          for (auto& filter : lex_filter_list(data, &line)) {
            add_token_data(TokenType::filter_pattern, filter);
          }
          continue;
        }
      }

      // Input/Output Options
      if (command == "target") {
        add_token(TokenType::target);
        auto version = read_target_version(data, &line);
        if (!version.empty()) {
          add_token_data(TokenType::target_version_token, version);
        }
        continue;
      }

      // Obfuscation Options
      if (command == "repackageclasses") {
        add_token(TokenType::repackageclasses);
        auto package_name = parse_package_name(data, &line);
        if (!package_name.empty()) {
          add_token_data(TokenType::identifier, package_name);
        }
        continue;
      }

      // Some other command; keep its name so the parser can diagnose it.
      add_token_data(TokenType::command, command);
      continue;
    }

    auto word = parse_part_fn</*kSkipWs=*/false>(data, &line, is_deliminator);

    {
      auto it = word_tokens.find(word);
      if (it != word_tokens.end()) {
        add_token(it->second);
        continue;
      }
    }

    if (word == "interface") {
      // If the previous symbol was a @ then this is really an annotation.
      if (!tokens.empty() &&
          tokens.back().type == TokenType::annotation_application) {
        tokens.pop_back();
        add_token(TokenType::annotation);
      } else {
        add_token(TokenType::interface);
      }
      continue;
    }

    if (is_identifier(word)) {
      add_token_data(TokenType::identifier, word);
      continue;
    }

    // This is an unrecognized token.
    add_token_data(TokenType::unknownToken, word);
  }
  add_token(TokenType::eof_token);
  return tokens;
}