in fbpcs/emp_games/lift/calculator/InputData.cpp [129:284]
// Ingests one CSV row into this object's per-column containers.
//
// `header` holds the column names and `parts` the cell text of the row being
// added; the two are matched by index.
//
// On the first call, every header name that starts with kFeaturePrefix is
// recorded in featureHeader_, and the run is aborted if the header contains
// both cohort_id and feature columns.
//
// Every column except id_, the array columns (opportunity_timestamps,
// event_timestamps, values) and feature columns is parsed as int64_t; a cell
// that fails to parse is fatal. Unrecognised columns only log a warning.
//
// A row with fewer cells than the header has columns is fatal. Cells beyond
// header.size() are ignored.
void InputData::addFromCSV(
    const std::vector<std::string>& header,
    const std::vector<std::string>& parts) {
  // The loop below reads parts[i] for every header index, so a short row must
  // be rejected here instead of indexing past the end of `parts`.
  if (parts.size() < header.size()) {
    LOG(FATAL) << "CSV row has " << parts.size()
               << " value(s) but the header declares " << header.size()
               << " column(s)";
  }

  if (!firstLineParsedAlready_) {
    for (const auto& col : header) {
      // rfind(prefix, 0) can only match at index 0, i.e. "starts with".
      if (col.rfind(kFeaturePrefix, 0) != std::string::npos) {
        featureHeader_.push_back(col);
      }
    }
    if (std::find(header.begin(), header.end(), "cohort_id") != header.end() &&
        anyFeatureColumns(header)) {
      LOG(FATAL)
          << "Supplying both cohort_id and feature columns is not supported";
    }
    firstLineParsedAlready_ = true;
  }

  // These bools + int64_t allow us to create separate vectors for testPop and
  // controlPop without enforcing an ordering between oppFlag and testFlag:
  // whichever flag is seen first is stashed, and the populations are pushed
  // when the second one arrives.
  bool sawOppFlag = false;
  bool sawTestFlag = false;
  int64_t storedOpportunityFlag = 0;
  int64_t storedTestFlag = 0;

  // Feature cells of this row, in header order; used as the group key below.
  std::vector<std::string> featureValues;

  for (std::size_t i = 0; i < header.size(); ++i) {
    const auto& column = header[i];
    // Copied on purpose: some branches rewrite the cell (wrapping it in
    // brackets) before handing it to the array parsers.
    auto value = parts[i];

    const bool isFeatureColumn =
        column.rfind(kFeaturePrefix, 0) != std::string::npos;
    // Array columns, features and the id are not required to be integers.
    const bool skipIntParse = column == "opportunity_timestamps" ||
        column == "event_timestamps" || column == "values" ||
        column == "id_" || // ID doesn't have to be parse-able to int64_t
        isFeatureColumn;

    int64_t parsed = 0;
    if (!skipIntParse) {
      std::istringstream iss{value};
      iss >> parsed;
      if (iss.fail()) {
        LOG(FATAL) << "Failed to parse '" << iss.str() << "' to int64_t";
      }
    }

    if (column == "opportunity") {
      sawOppFlag = true;
      if (sawTestFlag) {
        // NOTE(review): the flags are combined with bitwise AND, which
        // presumably assumes 0/1 inputs -- confirm against the data spec.
        testPopulation_.push_back((parsed & storedTestFlag) ? 1 : 0);
        controlPopulation_.push_back(
            (parsed & ((!storedTestFlag) ? 1 : 0)) ? 1 : 0);
      } else {
        storedOpportunityFlag = parsed;
      }
    } else if (column == "test_flag") {
      sawTestFlag = true;
      if (sawOppFlag) {
        testPopulation_.push_back((parsed & storedOpportunityFlag) ? 1 : 0);
        controlPopulation_.push_back(
            ((!parsed) & storedOpportunityFlag) ? 1 : 0);
      } else {
        storedTestFlag = parsed;
      }
    } else if (column == "opportunity_timestamp") {
      // secret-share-lift can have negative input timestamps
      if (liftMpcType_ == LiftMPCType::Standard && parsed < epoch_ &&
          parsed != 0) {
        LOG(FATAL) << "Timestamp " << parsed << " is before epoch " << epoch_
                   << ", which is unexpected.";
      }
      // Stored relative to epoch_.
      opportunityTimestamps_.push_back(parsed - epoch_);
    } else if (column == "num_impressions") {
      numImpressions_.push_back(parsed);
    } else if (column == "num_clicks") {
      numClicks_.push_back(parsed);
    } else if (column == "total_spend") {
      totalSpend_.push_back(parsed);
    } else if (column == "cohort_id" || column == "breakdown_id") {
      // Work-in-progress: we currently support cohort_id *or* feature columns
      groupIds_.push_back(parsed);
      // We use parsed + 1 because cohorts are zero-indexed
      numGroups_ = std::max(numGroups_, parsed + 1);
    } else if (column == "event_timestamp") {
      // When event_timestamp column presents (in standard Converter Lift
      // input), parse it as arrays of size 1.
      if (liftMpcType_ == LiftMPCType::Standard) {
        value = "[" + value + "]";
        setTimestamps(value, purchaseTimestampArrays_);
      } else {
        purchaseTimestamps_.push_back(parsed - epoch_);
      }
    } else if (column == "event_timestamps") {
      setTimestamps(value, purchaseTimestampArrays_);
    } else if (column == "value") {
      totalValue_ += parsed;
      purchaseValues_.push_back(parsed);
      // If this is secret_share lift, we can't pre-compute squared values
      if (liftMpcType_ == LiftMPCType::Standard) {
        totalValueSquared_ += parsed * parsed;
        purchaseValuesSquared_.push_back(parsed * parsed);
      }
    } else if (column == "values") {
      setValuesFields(value);
    } else if (column == "value_squared") {
      // This column is only valid in secret_share lift
      // otherwise, we just use simple multiplication in the above condition
      if (liftMpcType_ == LiftMPCType::SecretShare) {
        totalValueSquared_ += parsed;
        purchaseValuesSquared_.push_back(parsed);
      }
    } else if (column == "opportunity_timestamps") {
      // This column is only valid in secret_share lift
      // otherwise, we just use single opportunity_timestamp
      if (liftMpcType_ == LiftMPCType::SecretShare) {
        setTimestamps(value, opportunityTimestampArrays_);
      }
    } else if (column == "purchase_flag") {
      // When purchase_flag column presents (in standard Converter Lift
      // input), parse it as arrays of size 1.
      if (liftMpcType_ == LiftMPCType::Standard) {
        value = "[" + value + "]";
        setValuesFields(value);
      } else {
        totalValue_ += parsed;
        purchaseValues_.push_back(parsed);
      }
    } else if (isFeatureColumn) {
      // This is a feature column
      featureValues.push_back(value);
    } else if (column != "id_") {
      // Do nothing with the id_ column as Lift games assume the ids are
      // already matched.
      // We shouldn't fail if there are extra columns in the input
      LOG(WARNING) << "Warning: Unknown column in csv: " << column;
    }
  }

  // Once we've gone through every column, we need to check if we've added the
  // test/control values yet. From the input dataset, opp_flag is *optional*
  // so its absence is interpreted as "this is a valid opportunity".
  if (!sawOppFlag) {
    testPopulation_.push_back(storedTestFlag);
    controlPopulation_.push_back(1 - storedTestFlag);
  }

  // Finally, check which feature groupId this row belongs to. If we haven't
  // seen this combination of feature values before, it becomes a new groupId.
  if (!featureHeader_.empty()) {
    auto groupIt = featuresToGroupId_.find(featureValues);
    if (groupIt == featuresToGroupId_.end()) {
      groupIt = featuresToGroupId_.emplace(featureValues, numGroups_).first;
      groupIdToFeatures_[numGroups_] = featureValues;
      ++numGroups_;
    }
    // Make a note of which group this row belongs to
    groupIds_.push_back(groupIt->second);
  }
}