in tensorflow_decision_forests/tensorflow/ops/inference/kernel.cc [501:618]
// Fills the columns of "cache->dataset_" with the feature values of the batch
// held in "inputs".
//
// For each feature semantic, "feature_index" provides a list of dataset column
// indices; the i-th entry is the column that receives the i-th input feature
// of that semantic.
//
// Returns an INTERNAL error if a destination column cannot be cast to the
// expected column type, and forwards any error returned by
// "ExtractCategoricalSetInt". Returns OK otherwise.
tf::Status SetVerticalDataset(const InputTensors& inputs,
                              const FeatureIndex& feature_index,
                              Cache* cache) const {
  using NumericalColumn = dataset::VerticalDataset::NumericalColumn;
  using BooleanColumn = dataset::VerticalDataset::BooleanColumn;
  using CategoricalColumn = dataset::VerticalDataset::CategoricalColumn;
  using CategoricalSetColumn = dataset::VerticalDataset::CategoricalSetColumn;

  // Status returned when a column is not of the type implied by its semantic.
  const auto unexpected_column_type = [] {
    return tf::Status(tf::error::INTERNAL, "Unexpected column type.");
  };

  const auto num_examples = inputs.batch_size;
  cache->dataset_.set_nrow(num_examples);

  // Numerical features: values are copied as-is.
  const auto& numerical_columns = feature_index.numerical_features();
  for (int slot = 0; slot < numerical_columns.size(); ++slot) {
    auto* column =
        cache->dataset_.MutableColumnWithCastOrNull<NumericalColumn>(
            numerical_columns[slot]);
    if (column == nullptr) {
      return unexpected_column_type();
    }
    column->Resize(num_examples);
    auto& values = *column->mutable_values();
    for (int row = 0; row < num_examples; ++row) {
      // Missing represented as NaN.
      values[row] = inputs.numerical_features(row, slot);
    }
  }

  // Boolean features: the float input is mapped to NA (NaN), true (>= 0.5) or
  // false (everything else).
  const auto& boolean_columns = feature_index.boolean_features();
  for (int slot = 0; slot < boolean_columns.size(); ++slot) {
    auto* column = cache->dataset_.MutableColumnWithCastOrNull<BooleanColumn>(
        boolean_columns[slot]);
    if (column == nullptr) {
      return unexpected_column_type();
    }
    column->Resize(num_examples);
    auto& values = *column->mutable_values();
    for (int row = 0; row < num_examples; ++row) {
      const float raw = inputs.boolean_features(row, slot);
      char encoded = BooleanColumn::kFalseValue;
      if (std::isnan(raw)) {
        encoded = BooleanColumn::kNaValue;
      } else if (raw >= 0.5) {
        encoded = BooleanColumn::kTrueValue;
      }
      values[row] = encoded;
    }
  }

  // Categorical int features: values below -1 or not below the number of
  // unique values declared in the dataspec are replaced by 0. The value -1 is
  // passed through unchanged (presumably the missing-value marker; confirm
  // against the dataset conventions).
  const auto& categorical_columns = feature_index.categorical_int_features();
  for (int slot = 0; slot < categorical_columns.size(); ++slot) {
    const int dataset_col = categorical_columns[slot];
    auto* column =
        cache->dataset_.MutableColumnWithCastOrNull<CategoricalColumn>(
            dataset_col);
    if (column == nullptr) {
      return unexpected_column_type();
    }
    column->Resize(num_examples);
    const auto& column_spec = cache->dataset_.data_spec().columns(dataset_col);
    const int num_unique_values =
        column_spec.categorical().number_of_unique_values();
    auto& values = *column->mutable_values();
    for (int row = 0; row < num_examples; ++row) {
      const auto raw = inputs.categorical_int_features(row, slot);
      const bool out_of_range = raw < -1 || raw >= num_unique_values;
      values[row] = out_of_range ? 0 : raw;
    }
  }

  // Categorical set int features.
  //
  // Note: The categorical-set values are stored in a "two levels" ragged
  // tensor i.e. a ragged tensor inside of another one, shaped
  // "[batch_size, num_features, set_size]", where "set_size" is the only
  // ragged dimension.
  // In other words, "value[i,j,k]" is the "k-th" item, of the "j-th" feature,
  // of the "i-th" example.
  const auto& categorical_set_columns =
      feature_index.categorical_set_int_features();
  // Scratch buffer shared by all the "ExtractCategoricalSetInt" calls.
  std::vector<int> extracted_items;
  for (int slot = 0; slot < categorical_set_columns.size(); ++slot) {
    const int dataset_col = categorical_set_columns[slot];
    auto* column =
        cache->dataset_.MutableColumnWithCastOrNull<CategoricalSetColumn>(
            dataset_col);
    if (column == nullptr) {
      return unexpected_column_type();
    }
    column->Resize(num_examples);
    const auto& column_spec = cache->dataset_.data_spec().columns(dataset_col);
    const int num_unique_values =
        column_spec.categorical().number_of_unique_values();
    for (int row = 0; row < num_examples; ++row) {
      const auto extract_status =
          ExtractCategoricalSetInt(inputs, feature_index, slot,
                                   num_unique_values, row, &extracted_items);
      if (!extract_status.ok()) {
        return extract_status;
      }
      // A non-empty set whose first item is negative is stored as NA.
      const bool is_missing =
          !extracted_items.empty() && extracted_items.front() < 0;
      if (is_missing) {
        column->SetNA(row);
        continue;
      }
      column->SetIter(row, extracted_items.begin(), extracted_items.end());
    }
  }

  return tf::Status::OK();
}