void sortIntegralValues()

in fbpcs/data_processing/id_combiner/SortIntegralValues.cpp [44:133]


void sortIntegralValues(
    std::istream& inStream,
    std::ostream& outStream,
    const std::string& sortBy,
    const std::vector<std::string>& listColumns) {
  if (std::find(listColumns.begin(), listColumns.end(), sortBy) ==
      listColumns.end()) {
    XLOG(FATAL) << "SortBy column must be contained in the listColumns";
  }

  std::string line;
  getline(inStream, line);
  auto header = splitWithBrackets(line);

  auto headerSize = header.size();

  // Output the header as before
  outStream << vectorToString(header) << '\n';

  while (getline(inStream, line)) {
    auto row = splitWithBrackets(line);
    auto rowSize = row.size();
    if (rowSize != headerSize) {
      XLOG(FATAL) << "Mismatch between header and row\n"
                  << "Header has size " << headerSize << " while row has size "
                  << rowSize << '\n'
                  << "Header: " << vectorToString(header) << '\n'
                  << "Row   : " << vectorToString(row) << '\n';
    }

    // First parse the listy columns
    std::vector<std::vector<std::string>> listsInRow;
    std::size_t sortByIdxInParsedLists;
    std::size_t pushBackIdx = 0;
    for (const auto& listCol : listColumns) {
      if (listCol == sortBy) {
        sortByIdxInParsedLists = pushBackIdx;
      }
      auto idx = headerIndex(header, listCol);
      listsInRow.push_back(splitList(row.at(idx)));
      ++pushBackIdx;
    }

    // We go ahead and parse the sortBy column once to avoid duplicating work
    std::vector<int64_t> vals;
    for (const auto& s : listsInRow.at(sortByIdxInParsedLists)) {
      int64_t parsed;
      std::istringstream parser{s};
      parser >> parsed;
      if (parser.fail()) {
        XLOG(FATAL) << "Failed to parse " << s << " as int64_t";
      }
      vals.push_back(parsed);
    }

    // Then sort them all based on the sortBy column
    auto permutation =
        getSortPermutation(vals, [](int64_t a, int64_t b) { return a < b; });
    XLOG(INFO) << "The permutation of " << vectorToString(vals) << " is... "
               << vectorToString(permutation);

    // Apply the permutation to every list column
    for (auto& lst : listsInRow) {
      applyPermutation(lst, permutation);
    }

    // Finally, emit a new line
    bool first = true;
    for (std::size_t i = 0; i < row.size(); ++i) {
      if (!first) {
        outStream << ',';
      }
      first = false;

      // If this is a list column, output from the sorted listsInRow instead
      auto listFind =
          std::find(listColumns.begin(), listColumns.end(), header.at(i));
      if (listFind != listColumns.end()) {
        outStream << '['
                  << vectorToString(
                         listsInRow.at(listFind - listColumns.begin()))
                  << ']';
      } else {
        // Otherwise we have the "easy" case -- just output
        outStream << row.at(i);
      }
    }
    outStream << '\n';
  }
}