def _preprocess()

in inference/xgboost_predictor/predictor.py


  def _preprocess(self, data):
    """Preprocesses raw input data for prediction.

    Args:
      data: Raw input in 2d array.

    Returns:
      Preprocessed data in 2d array.

    Raises:
      ValueError: An error occurred when features in a data row are different
      from the features in the model.
    """
    self._extract_model_metadata()
    preprocessed_data = []
    sorted_model_feature_names = sorted(self._feature_names)
    for row_index, row in enumerate(data):
      sorted_data_feature_names = sorted(row.keys())
      if sorted_data_feature_names != sorted_model_feature_names:
        raise ValueError(
            'Row %d has different features %s from the model features %s' %
            (row_index, ','.join(sorted_data_feature_names),
             ','.join(sorted_model_feature_names)))
      encoded_row = []
      for feature_name in self._feature_names:
        col = row[feature_name]
        feature_index = self._feature_name_to_index_map[feature_name]
        if feature_index in self._categorical_one_hot_vocab:
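          # Illustration with a hypothetical vocab ['blue', 'green', 'red']:
          # col = 'green' yields [None, 1.0, None]; an unseen category leaves
          # the whole slice as None (missing values for xgboost).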
          vocab = self._categorical_one_hot_vocab[feature_index]
          one_hot_list = [None] * len(vocab)
          col_value = str(col)
          if col_value in vocab:
            one_hot_list[vocab.index(col_value)] = 1.0
          encoded_row.extend(one_hot_list)
        elif feature_index in self._categorical_target_vocab:
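          # Illustration with a hypothetical vocab {'US': [0.2, 0.8],
          # 'CA': [0.6, 0.4]}: col = 'US' yields [0.2, 0.8]; an unseen
          # category yields [None, None].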
          vocab = self._categorical_target_vocab[feature_index]
          col_value = str(col)
          # None values are handled automatically by the xgboost library.
          target_list = vocab.get(col_value,
                                  [None] * len(list(vocab.values())[0]))
          # Treat zero values as missing (None) for multi-class models.
          if len(target_list) > 1:
            target_list = [None if x == 0.0 else x for x in target_list]
          encoded_row.extend(target_list)
        elif feature_index in self._categorical_label_vocab:
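          # Illustration with a hypothetical vocab ['high', 'low', 'medium']:
          # col = 'low' yields 1.0 (its vocab index); an unseen category
          # yields None.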
          vocab = self._categorical_label_vocab[feature_index]
          col_value = str(col)
          if col_value in vocab:
            encoded_row.append(float(vocab.index(col_value)))
          else:
            # Unseen category: encode as missing (None).
            encoded_row.append(None)
        elif feature_index in self._array_one_hot_vocab:
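          # Illustration with a hypothetical vocab ['a', 'b', 'c']:
          # col = ['a', 'c'] yields the multi-hot slice [1.0, None, 1.0].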
          vocab = self._array_one_hot_vocab[feature_index]
          one_hot_list = [None] * len(vocab)
          try:
            for item in col:
              item_value = str(item)
              if item_value in vocab:
                one_hot_list[vocab.index(item_value)] = 1.0
            encoded_row.extend(one_hot_list)
          except TypeError:
            # A non-iterable value means the feature is not an array.
            raise ValueError('The feature %s in row %d is not an array' %
                             (feature_name, row_index))
        elif feature_index in self._array_target_vocab:
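          # Illustration with a hypothetical vocab {'a': [0.2, 0.8],
          # 'b': [0.4, 0.6]}: col = ['a', 'b'] yields the per-item average
          # [0.3, 0.7]; for multi-class models, zero entries are then mapped
          # to None below.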
          vocab = self._array_target_vocab[feature_index]
          target_dimension = len(list(vocab.values())[0])
          target_list = [0.0] * target_dimension
          try:
            for item in col:
              item_value = str(item)
              item_target_list = vocab.get(item_value,
                                           [0.0] * target_dimension)
              item_target_list = [x / float(len(col)) for x in item_target_list]
              target_list = [sum(x) for x in zip(target_list, item_target_list)]

            # Treat zero values as missing (None) for multi-class models.
            if len(target_list) > 1:
              target_list = [None if x == 0.0 else x for x in target_list]
            encoded_row.extend(target_list)
          except TypeError:
            # A non-iterable value means the feature is not an array.
            raise ValueError('The feature %s in row %d is not an array' %
                             (feature_name, row_index))
        elif (self._array_struct_dimension_dict and
              feature_index in self._array_struct_dimension_dict):
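          # Each item is a (key, value) pair; e.g. with dimension = 4,
          # col = [(0, 0.5), (3, 2.0)] yields the dense slice
          # [0.5, None, None, 2.0].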
          dimension = self._array_struct_dimension_dict[feature_index]
          array_struct_dense_vector = [None] * dimension

          for item in col:
            key = item[0]
            if key < 0:
              raise ValueError('The key of the sparse feature %s in row %d is '
                               'smaller than 0.' % (feature_name, row_index))
            if key >= dimension:
              raise ValueError('The key of the sparse feature %s in row %d is '
                               'not smaller than the sparse feature dimension '
                               '%d.' % (feature_name, row_index, dimension))
            array_struct_dense_vector[key] = item[1]
          encoded_row.extend(array_struct_dense_vector)
        elif (
            self._array_numerical_length_dict
            and feature_index in self._array_numerical_length_dict
        ):
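          # Illustration: with length = 3, col = [1, 2, 3] yields
          # [1.0, 2.0, 3.0]; a length mismatch raises ValueError below.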
          length = self._array_numerical_length_dict[feature_index]
          if len(col) != length:
            raise ValueError(
                'The length of the array numerical feature %s '
                'in row %d does not match '
                'array_numerical_length.txt.' % (feature_name, row_index)
            )
          encoded_row.extend(np.array(col).astype(np.float64))
        else:
          # Numerical feature.
          # Treat the empty string as 0 because XAI uses the empty string as
          # the baseline.
          if col == '':
            encoded_row.append(0.0)
          else:
            try:
              encoded_row.append(float(col))
            except ValueError:
              raise ValueError(
                  'The feature %s in row %d cannot be converted to float' %
                  (feature_name, row_index))

      preprocessed_data.append(encoded_row)
    return preprocessed_data
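
For reference, the standalone sketch below mimics only the one-hot and scalar
numerical paths of the method above for a single row. It is a simplified
illustration, not the predictor's actual API: the vocab is keyed by feature
name instead of feature index, and the feature names, vocabulary, and sample
row are hypothetical.

def encode_row(row, feature_names, one_hot_vocab):
  """Encodes one dict row into a flat list, mirroring _preprocess above."""
  encoded_row = []
  for feature_name in feature_names:
    col = row[feature_name]
    if feature_name in one_hot_vocab:
      # One-hot path: None marks a missing value for xgboost.
      vocab = one_hot_vocab[feature_name]
      one_hot_list = [None] * len(vocab)
      if str(col) in vocab:
        one_hot_list[vocab.index(str(col))] = 1.0
      encoded_row.extend(one_hot_list)
    else:
      # Numerical path: the empty string is treated as 0.
      encoded_row.append(0.0 if col == '' else float(col))
  return encoded_row


print(encode_row({'color': 'green', 'height': '7.5'},
                 ['color', 'height'],
                 {'color': ['blue', 'green', 'red']}))
# Prints: [None, 1.0, None, 7.5]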