tf::Status SetVerticalDataset()

in tensorflow_decision_forests/tensorflow/ops/inference/kernel.cc [501:618]


  // Fills the vertical dataset held by "cache" with the batch of examples
  // contained in "inputs".
  //
  // The dataset is sized to "inputs.batch_size" rows, then the columns listed in
  // "feature_index" are populated one feature family at a time: numerical,
  // boolean, categorical-int and categorical-set-int.
  //
  // Returns an INTERNAL error if one of the dataset columns does not have the
  // type expected for its feature family, and forwards any non-OK status
  // returned by "ExtractCategoricalSetInt". Returns OK otherwise.
  tf::Status SetVerticalDataset(const InputTensors& inputs,
                                const FeatureIndex& feature_index,
                                Cache* cache) const {
    const auto num_examples = inputs.batch_size;
    auto& vertical_dataset = cache->dataset_;
    vertical_dataset.set_nrow(num_examples);

    // --- Numerical features ---
    // Values are copied verbatim; a missing value is represented as NaN.
    const auto& numerical_features = feature_index.numerical_features();
    for (int col_idx = 0; col_idx < numerical_features.size(); ++col_idx) {
      const int feature_idx = numerical_features[col_idx];
      auto* column = vertical_dataset.MutableColumnWithCastOrNull<
          dataset::VerticalDataset::NumericalColumn>(feature_idx);
      if (column == nullptr) {
        return tf::Status(tf::error::INTERNAL, "Unexpected column type.");
      }
      column->Resize(num_examples);
      auto& column_values = *column->mutable_values();
      for (int example_idx = 0; example_idx < num_examples; ++example_idx) {
        column_values[example_idx] =
            inputs.numerical_features(example_idx, col_idx);
      }
    }

    // --- Boolean features ---
    // The input is a float: NaN maps to the NA value, anything >= 0.5 maps to
    // true, and everything else maps to false.
    const auto& boolean_features = feature_index.boolean_features();
    for (int col_idx = 0; col_idx < boolean_features.size(); ++col_idx) {
      const int feature_idx = boolean_features[col_idx];
      auto* column = vertical_dataset.MutableColumnWithCastOrNull<
          dataset::VerticalDataset::BooleanColumn>(feature_idx);
      if (column == nullptr) {
        return tf::Status(tf::error::INTERNAL, "Unexpected column type.");
      }
      column->Resize(num_examples);
      auto& column_values = *column->mutable_values();
      for (int example_idx = 0; example_idx < num_examples; ++example_idx) {
        const float raw = inputs.boolean_features(example_idx, col_idx);
        // Start from "false" and override for the two other outcomes.
        char encoded = dataset::VerticalDataset::BooleanColumn::kFalseValue;
        if (std::isnan(raw)) {
          encoded = dataset::VerticalDataset::BooleanColumn::kNaValue;
        } else if (raw >= 0.5) {
          encoded = dataset::VerticalDataset::BooleanColumn::kTrueValue;
        }
        column_values[example_idx] = encoded;
      }
    }

    // --- Categorical int features ---
    // Values in [-1, number_of_unique_values) are stored unchanged; any other
    // value is replaced by 0.
    // NOTE(review): -1 is presumably the "missing" marker and 0 the
    // out-of-vocabulary item -- confirm against the dataspec conventions.
    const auto& categorical_int_features =
        feature_index.categorical_int_features();
    for (int col_idx = 0; col_idx < categorical_int_features.size();
         ++col_idx) {
      const int feature_idx = categorical_int_features[col_idx];
      auto* column = vertical_dataset.MutableColumnWithCastOrNull<
          dataset::VerticalDataset::CategoricalColumn>(feature_idx);
      if (column == nullptr) {
        return tf::Status(tf::error::INTERNAL, "Unexpected column type.");
      }
      column->Resize(num_examples);
      const int max_value = vertical_dataset.data_spec()
                                .columns(feature_idx)
                                .categorical()
                                .number_of_unique_values();
      auto& column_values = *column->mutable_values();
      for (int example_idx = 0; example_idx < num_examples; ++example_idx) {
        const auto raw = inputs.categorical_int_features(example_idx, col_idx);
        const bool out_of_range = raw < -1 || raw >= max_value;
        if (out_of_range) {
          column_values[example_idx] = 0;
        } else {
          column_values[example_idx] = raw;
        }
      }
    }

    // --- Categorical set int features ---
    //
    // Note: The categorical-set values are stored in a "two levels" ragged
    // tensor i.e. a ragged tensor inside of another one, shaped
    // "[batch_size, num_features, set_size]", where "set_size" is the only
    // ragged dimension.
    // In other words, "value[i,j,k]" is the "k-th" item, of the "j-th" feature,
    // of the "i-th" example.
    //
    // "items" is a scratch buffer shared by all the examples and columns; it is
    // handed to "ExtractCategoricalSetInt" for each (example, column) pair.
    std::vector<int> items;
    const auto& categorical_set_int_features =
        feature_index.categorical_set_int_features();
    for (int col_idx = 0; col_idx < categorical_set_int_features.size();
         ++col_idx) {
      const int feature_idx = categorical_set_int_features[col_idx];
      auto* column = vertical_dataset.MutableColumnWithCastOrNull<
          dataset::VerticalDataset::CategoricalSetColumn>(feature_idx);
      if (column == nullptr) {
        return tf::Status(tf::error::INTERNAL, "Unexpected column type.");
      }
      column->Resize(num_examples);

      const int max_value = vertical_dataset.data_spec()
                                .columns(feature_idx)
                                .categorical()
                                .number_of_unique_values();

      for (int example_idx = 0; example_idx < num_examples; ++example_idx) {
        const auto extract_status = ExtractCategoricalSetInt(
            inputs, feature_index, col_idx, max_value, example_idx, &items);
        if (!extract_status.ok()) {
          return extract_status;
        }

        // A non-empty set whose first item is negative marks the whole value
        // as missing.
        const bool is_missing = !items.empty() && items.front() < 0;
        if (is_missing) {
          column->SetNA(example_idx);
        } else {
          column->SetIter(example_idx, items.begin(), items.end());
        }
      }
    }

    return tf::Status::OK();
  }