in fbpcs/data_processing/id_combiner/SortIntegralValues.cpp [44:133]
/**
 * Copies delimited rows from inStream to outStream, reordering the elements of
 * every list column so that the sortBy column ends up in ascending integer
 * order.
 *
 * The first input line is treated as the header and is written back out via
 * vectorToString. For each following line, every column named in listColumns
 * is split with splitList, the sortBy column's elements are parsed as int64_t,
 * and the permutation that sorts those integers ascending is applied to all
 * list columns of that row. List columns are emitted wrapped in '[' and ']';
 * all other cells are written back unchanged, with ',' between cells.
 *
 * Terminates via XLOG(FATAL) when sortBy is not in listColumns, when a row's
 * cell count differs from the header's, or when an element of the sortBy
 * column cannot be parsed as int64_t.
 *
 * @param inStream    source of the header line and data rows
 * @param outStream   destination for the header and the rewritten rows
 * @param sortBy      name of the list column whose values define the order
 * @param listColumns names of all list columns to permute (includes sortBy)
 */
void sortIntegralValues(
    std::istream& inStream,
    std::ostream& outStream,
    const std::string& sortBy,
    const std::vector<std::string>& listColumns) {
  const auto sortByIt =
      std::find(listColumns.begin(), listColumns.end(), sortBy);
  if (sortByIt == listColumns.end()) {
    XLOG(FATAL) << "SortBy column must be contained in the listColumns";
  }
  // Position of the sortBy column inside listColumns (and therefore inside the
  // per-row vector of parsed lists). Fixed for the whole run.
  const auto sortByIdxInParsedLists =
      static_cast<std::size_t>(sortByIt - listColumns.begin());

  std::string line;
  getline(inStream, line);
  auto header = splitWithBrackets(line);
  auto headerSize = header.size();
  // Output the header as before
  outStream << vectorToString(header) << '\n';

  // For each header column, the index of its name inside listColumns, or
  // listColumns.size() when the column is not a list column. The header never
  // changes, so this is computed once instead of once per output cell.
  const std::size_t notAListColumn = listColumns.size();
  std::vector<std::size_t> listSlotForColumn;
  listSlotForColumn.reserve(headerSize);
  for (const auto& columnName : header) {
    listSlotForColumn.push_back(static_cast<std::size_t>(
        std::find(listColumns.begin(), listColumns.end(), columnName) -
        listColumns.begin()));
  }

  // Header position of every list column. Resolved on the first data row
  // rather than up front so that input without data rows never triggers a
  // header lookup, exactly as before.
  std::vector<std::size_t> listColumnIndices;

  while (getline(inStream, line)) {
    auto row = splitWithBrackets(line);
    auto rowSize = row.size();
    if (rowSize != headerSize) {
      XLOG(FATAL) << "Mismatch between header and row\n"
                  << "Header has size " << headerSize << " while row has size "
                  << rowSize << '\n'
                  << "Header: " << vectorToString(header) << '\n'
                  << "Row : " << vectorToString(row) << '\n';
    }

    // listColumns is non-empty here (it contains sortBy), so an empty index
    // vector means "not resolved yet".
    if (listColumnIndices.empty()) {
      listColumnIndices.reserve(listColumns.size());
      for (const auto& listCol : listColumns) {
        listColumnIndices.push_back(headerIndex(header, listCol));
      }
    }

    // First parse the listy columns
    std::vector<std::vector<std::string>> listsInRow;
    listsInRow.reserve(listColumnIndices.size());
    for (const auto idx : listColumnIndices) {
      listsInRow.push_back(splitList(row.at(idx)));
    }

    // We go ahead and parse the sortBy column once to avoid duplicating work
    const auto& sortByList = listsInRow.at(sortByIdxInParsedLists);
    std::vector<int64_t> vals;
    vals.reserve(sortByList.size());
    for (const auto& s : sortByList) {
      int64_t parsed;
      std::istringstream parser{s};
      parser >> parsed;
      if (parser.fail()) {
        XLOG(FATAL) << "Failed to parse " << s << " as int64_t";
      }
      vals.push_back(parsed);
    }

    // Then sort them all based on the sortBy column
    auto permutation =
        getSortPermutation(vals, [](int64_t a, int64_t b) { return a < b; });
    XLOG(INFO) << "The permutation of " << vectorToString(vals) << " is... "
               << vectorToString(permutation);

    // Apply the permutation to every list column.
    // NOTE(review): this presumably requires every list column in the row to
    // have as many elements as the sortBy column; nothing here checks that --
    // confirm how applyPermutation handles a length mismatch.
    for (auto& lst : listsInRow) {
      applyPermutation(lst, permutation);
    }

    // Finally, emit a new line
    for (std::size_t i = 0; i < row.size(); ++i) {
      if (i != 0) {
        outStream << ',';
      }
      const auto slot = listSlotForColumn.at(i);
      if (slot != notAListColumn) {
        // List column: output from the sorted listsInRow instead
        outStream << '[' << vectorToString(listsInRow.at(slot)) << ']';
      } else {
        // Otherwise we have the "easy" case -- just output
        outStream << row.at(i);
      }
    }
    outStream << '\n';
  }
}