def collect_data_step()

in tensorflow_decision_forests/keras/core.py [0:0]


  def collect_data_step(self, data, is_training_example):
    """Collects examples, e.g. training or validation examples."""

    if isinstance(data, dict):
      raise ValueError("No label received for training. If you used "
                       "`pd_dataframe_to_tf_dataset`, make sure to "
                       f"specify the `label` argument. data={data}")

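    # A hedged sketch of a correctly labeled dataset built from a pandas
    # DataFrame; `dataframe` and the "species" label column are hypothetical:
    #   import tensorflow_decision_forests as tfdf
    #   dataset = tfdf.keras.pd_dataframe_to_tf_dataset(
    #       dataframe, label="species")
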
    if len(data) == 2:
      train_x, train_y = data
      train_weights = None
    elif len(data) == 3:
      train_x, train_y, train_weights = data
    else:
      raise ValueError(
          f"Unexpected structure of the input data: {data}. Expected "
          "(features, labels) or (features, labels, weights).")
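
    # Both accepted structures can be produced with tf.data, e.g. with
    # hypothetical `features`, `labels`, and `weights` tensors:
    #   ds = tf.data.Dataset.from_tensor_slices((features, labels)).batch(64)
    #   ds = tf.data.Dataset.from_tensor_slices(
    #       (features, labels, weights)).batch(64)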

    if self._verbose >= 2:
      tf_logging.info(
          "%s tensor examples:\nFeatures: %s\nLabel: %s\nWeights: %s",
          "Training" if is_training_example else "Validation", train_x, train_y,
          train_weights)

    if isinstance(train_x, dict):
      _check_feature_names(
          train_x.keys(),
          self._advanced_arguments.fail_on_non_keras_compatible_feature_name)

    if self._preprocessing is not None:
      train_x = self._preprocessing(train_x)
      if self._verbose >= 2:
        tf_logging.info("Tensor example after pre-processing:\n%s", train_x)
      if isinstance(train_x, list) and self._features:
        tf_logging.warning(
            "Using \"features\" with a pre-processing stage returning a list "
            "is not recommended. Use a pre-processing stage that returns a "
            "dictionary instead.")

    if isinstance(train_x, dict):
      # Native format
      pass
    elif isinstance(train_x, tf.Tensor):
      train_x = {train_x.name: train_x}
    elif isinstance(train_x, (list, tuple)):
      # Note: The name of a tensor (value.name) can change between training
      # and inference.
      train_x = {str(idx): value for idx, value in enumerate(train_x)}
    else:
      raise ValueError(
          "The training input tensor is expected to be a tensor, a list of "
          f"tensors, or a dictionary of tensors. Got {train_x} instead.")

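    # The three accepted input forms, illustrated with hypothetical tensors
    # t1 and t2:
    #   {"age": t1, "country": t2}  # dict: feature names preserved
    #   t1                          # single tensor: keyed by its tensor name
    #   (t1, t2)                    # list/tuple: keyed by position ("0", "1")
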
    # Check the labels
    if not isinstance(train_y, tf.Tensor):
      raise ValueError(
          f"The training label tensor is expected to be a tensor. Got {train_y}"
          " instead.")

    if len(train_y.shape) != 1:
      if self._verbose >= 2:
        tf_logging.info(
            "Squeezing the label shape from [batch_size, 1] to [batch_size]")
      train_y = tf.squeeze(train_y, axis=1)

    if len(train_y.shape) != 1:
      raise ValueError(
          "Labels can either be passed in as [batch_size, 1] or [batch_size]. "
          "Invalid shape %s." % train_y.shape)

    # Check the training weights.
    self._weighted_training = train_weights is not None
    if self._weighted_training:
      if not isinstance(train_weights, tf.Tensor):
        raise ValueError(
            "The training weights tensor is expected to be a tensor. "
            f"Got {train_weights} instead.")

      if len(train_weights.shape) != 1:
        if self._verbose >= 2:
          tf_logging.info(
              "Squeezing the weight shape from [batch_size, 1] to "
              "[batch_size]")
        train_weights = tf.squeeze(train_weights, axis=1)

      if len(train_weights.shape) != 1:
        raise ValueError(
            "Weights can either be passed in as [batch_size, 1] or [batch_size]. "
            "Invalid shape %s." % train_weights.shape)

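    # A hedged note: Keras delivers per-example weights here when they are
    # passed as `model.fit(x, y, sample_weight=w)` (with a hypothetical
    # weight vector `w`) or as a third dataset component.
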
    # List the input features and their semantics.
    semantics = tf_core.infer_semantic(
        train_x, {feature.name: feature.semantic for feature in self._features},
        self._exclude_non_specified)
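
    # Semantics can also be forced at model construction instead of being
    # inferred; a hedged sketch where "age" is a hypothetical feature:
    #   model = tfdf.keras.GradientBoostedTreesModel(
    #       features=[tfdf.keras.FeatureUsage(
    #           name="age", semantic=tfdf.keras.FeatureSemantic.NUMERICAL)],
    #       exclude_non_specified_features=False)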

    # The ranking group and treatment are not part of the features unless
    # specified explicitly.
    if (self._ranking_group is not None and
        self._ranking_group not in self._features and
        self._ranking_group in semantics):
      del semantics[self._ranking_group]

    if (self._uplift_treatment is not None and
        self._uplift_treatment not in self._features and
        self._uplift_treatment in semantics):
      del semantics[self._uplift_treatment]

    if is_training_example:
      if self._semantics is not None:
        raise ValueError("The model is already trained")
      self._semantics = semantics
    else:
      self._has_validation_dataset = True
      if self._semantics is None:
        raise ValueError("The validation dataset should be collected after "
                         "the training dataset.")

      if semantics != self._semantics:
        raise ValueError(
            "The validation dataset does not have the same "
            "semantics as the training dataset.\nTraining:\n{}\nValidation:\n{}"
            .format(self._semantics, semantics))

    semantic_inputs = tf_core.combine_tensors_and_semantics(train_x, semantics)

    normalized_semantic_inputs = tf_core.normalize_inputs(semantic_inputs)

    if self._verbose >= 2:
      tf_logging.info("Normalized tensor features:\n %s",
                      normalized_semantic_inputs)

    if is_training_example:
      self._normalized_input_keys = sorted(
          list(normalized_semantic_inputs.keys()))

    # Add the weights
    if self._weighted_training:
      normalized_semantic_inputs[_WEIGHTS] = tf_core.SemanticTensor(
          tensor=tf.cast(train_weights, tf_core.NormalizedNumericalType),
          semantic=tf_core.Semantic.NUMERICAL)

    # Add the semantic of the label.
    if self._task == Task.CLASSIFICATION:
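      # Integer class labels are shifted by CATEGORICAL_INTEGER_OFFSET;
      # presumably the zero value is reserved (e.g. for missing or
      # out-of-vocabulary items) in the underlying Yggdrasil format.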
      normalized_semantic_inputs[_LABEL] = tf_core.SemanticTensor(
          tensor=tf.cast(train_y, tf_core.NormalizedCategoricalIntType) +
          tf_core.CATEGORICAL_INTEGER_OFFSET,
          semantic=tf_core.Semantic.CATEGORICAL)

    elif self._task == Task.REGRESSION:
      normalized_semantic_inputs[_LABEL] = tf_core.SemanticTensor(
          tensor=tf.cast(train_y, tf_core.NormalizedNumericalType),
          semantic=tf_core.Semantic.NUMERICAL)

    elif self._task == Task.RANKING:
      normalized_semantic_inputs[_LABEL] = tf_core.SemanticTensor(
          tensor=tf.cast(train_y, tf_core.NormalizedNumericalType),
          semantic=tf_core.Semantic.NUMERICAL)

      assert self._ranking_group is not None
      if self._ranking_group not in train_x:
        raise Exception(
            "The ranking key feature \"{}\" is not available as an input "
            "feature.".format(self._ranking_group))
      normalized_semantic_inputs[_RANK_GROUP] = tf_core.SemanticTensor(
          tensor=tf.cast(train_x[self._ranking_group],
                         tf_core.NormalizedHashType),
          semantic=tf_core.Semantic.HASH)
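
      # Ranking is configured at model construction; a hedged sketch where
      # "query_id" is a hypothetical grouping column of the input features:
      #   model = tfdf.keras.GradientBoostedTreesModel(
      #       task=tfdf.keras.Task.RANKING, ranking_group="query_id")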

    elif self._task == Task.CATEGORICAL_UPLIFT:
      normalized_semantic_inputs[_LABEL] = tf_core.SemanticTensor(
          tensor=tf.cast(train_y, tf_core.NormalizedCategoricalIntType) +
          tf_core.CATEGORICAL_INTEGER_OFFSET,
          semantic=tf_core.Semantic.CATEGORICAL)

      assert self._uplift_treatment is not None
      if self._uplift_treatment not in train_x:
        raise Exception(
            "The uplift treatment key feature \"{}\" is not available as an input "
            "feature.".format(self._uplift_treatment))
      normalized_semantic_inputs[_UPLIFT_TREATMENT] = tf_core.SemanticTensor(
          tensor=tf.cast(train_x[self._uplift_treatment],
                         tf_core.NormalizedCategoricalIntType) +
          tf_core.CATEGORICAL_INTEGER_OFFSET,
          semantic=tf_core.Semantic.CATEGORICAL)
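
      # Uplift is configured similarly; a hedged sketch where "treatment" is
      # a hypothetical binary treatment column of the input features:
      #   model = tfdf.keras.RandomForestModel(
      #       task=tfdf.keras.Task.CATEGORICAL_UPLIFT,
      #       uplift_treatment="treatment")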

    else:
      raise Exception("Unsupported task {}".format(self._task))

    if not self._is_trained:
      # Collects the training examples.

      distribution_config = tf_core.get_distribution_configuration(
          self.distribute_strategy)
      if distribution_config is None:
        # No distribution strategy. Collecting examples in memory.
        tf_core.collect_training_examples(
            normalized_semantic_inputs,
            self._training_model_id,
            collect_training_data=is_training_example)

      else:

        if not is_training_example:
          tf_logging.warning(
              "The validation dataset given to `fit` is not used to help "
              "training (e.g. early stopping) in the case of distributed "
              "training. If you want to use a validation dataset use "
              "non-distributed training or use `fit_from_file` instead.")

        # Each worker collects a part of the dataset.
        if not self.capabilities().support_partial_cache_dataset_format:
          raise ValueError(
              f"The model {type(self)} does not support training with a TF "
              "Distribution strategy (i.e. model.capabilities()."
              "support_partial_cache_dataset_format == False). If the dataset "
              "is small, simply remove the distribution strategy scope (i.e. "
              "`with strategy.scope():` around the model construction). If "
              "the dataset is large, use a distributed version of the model. "
              "For example, use DistributedGradientBoostedTreesModel instead "
              "of GradientBoostedTreesModel.")

        tf_core.collect_distributed_training_examples(
            inputs=normalized_semantic_inputs,
            model_id=self._training_model_id,
            dataset_path=self._distributed_partial_dataset_cache_path())
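
        # A hedged sketch of distributed training, assuming an already
        # configured cluster resolver for a ParameterServerStrategy:
        #   strategy = tf.distribute.experimental.ParameterServerStrategy(
        #       cluster_resolver)
        #   with strategy.scope():
        #     model = tfdf.keras.DistributedGradientBoostedTreesModel()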

    # No metrics are returned during the collection of training examples.
    return {}