void idSwap()

in fbpcs/data_processing/id_combiner/IdSwap.cpp [28:118]


void idSwap(
    std::istream& dataFile,
    std::istream& spineIdFile,
    std::ostream& outFile) {
  const std::string kCommaSplitRegex = ",";
  const std::string kIdColumnName = "id_";

  XLOG(INFO) << "Starting.";

  std::string line;

  getline(dataFile, line);
  std::vector<std::string> header;
  folly::split(kCommaSplitRegex, line, header);

  auto idColumnIdx = headerIndex(header, kIdColumnName);

  auto headerSize = header.size();

  // Output the header swapping out id_ for private_id_
  outFile << vectorToString(header) << "\n";

  // Build a map for <id_ to private_id> from the spineId File
  std::unordered_map<std::string, std::string> idToPrivateIDMap;
  std::string spineRow;
  while (getline(spineIdFile, spineRow)) {
    std::vector<std::string> cols;
    folly::split(kCommaSplitRegex, spineRow, cols);
    // expect col 1 in spineIdFile to contain the id_
    auto priv_id = cols.at(0);
    auto row_id = cols.at(1);
    if (row_id != "") {
      idToPrivateIDMap[row_id] = priv_id;
    }
  }
  spineIdFile.clear();
  spineIdFile.seekg(0);

  // Build a map for <id_ to data> from data file
  std::unordered_map<std::string, std::vector<std::vector<std::string>>>
      idToDataMap;

  while (getline(dataFile, line)) {
    std::vector<std::string> rowVec;
    folly::split(kCommaSplitRegex, line, rowVec);

    auto rowSize = rowVec.size();
    if (rowSize != headerSize) {
      XLOG(INFO) << "Mismatch between header and row '\n'"
                 << "Header has size " << headerSize << " while row has size "
                 << rowSize << '\n'
                 << "row: " << vectorToString(rowVec) << "\n"
                 << "header: " << vectorToString(header) << "\n";
      std::exit(1);
    }
    // Verifying that every id in the dataFile has a corresponding
    // private_id mapped in the spineFile else throwing
    auto rowId = rowVec.at(idColumnIdx);
    auto idSearch = idToPrivateIDMap.find(rowId);
    if (idSearch == idToPrivateIDMap.end()) {
      XLOG(FATAL) << "ID is missing in the spineID file '\n'" << rowId
                  << " does not have a corresponding private_id"
                  << "\n";
    }

    idToDataMap[rowId].push_back(rowVec);
  }

  // Output each row from dataFile to outFile, swapping out id_ for private_id_
  // if id_ doesn't exist in mapping/spineId file, throw an error
  std::string row;
  while (getline(spineIdFile, row)) {
    std::vector<std::string> cols;
    folly::split(kCommaSplitRegex, row, cols);

    // for each row in spine id,
    // look for the corresponding rows in dataFile and
    // output the private_id, along with the data from dataFile
    auto priv_id = cols.at(0);
    auto row_id = cols.at(1); // expect col 1 in spineIdFile to contain the id_
    if (idToDataMap.count(row_id) > 0) {
      auto& dataRows = idToDataMap.at(row_id);
      for (auto& dRow : dataRows) {
        outFile << vectorToStringWithReplacement(dRow, idColumnIdx, priv_id)
                << '\n';
      }
    }
  }

  XLOG(INFO) << "Finished.";
}