in inference/xgboost_predictor/predictor.py [0:0]
def _preprocess(self, data):
  """Preprocesses raw input data for prediction.

  Args:
    data: Raw input as a 2d array.

  Returns:
    Preprocessed data as a 2d array.

  Raises:
    ValueError: If the features in a data row differ from the features in
      the model.
  """
  self._extract_model_metadata()
  preprocessed_data = []
  for row_index in range(len(data)):
    row = data[row_index]
    sorted_data_feature_names = sorted(row.keys())
    sorted_model_feature_names = sorted(self._feature_names)
    if sorted_data_feature_names != sorted_model_feature_names:
      raise ValueError(
          'Row %d has different features %s than the model features %s' %
          (row_index, ','.join(sorted_data_feature_names),
           ','.join(sorted_model_feature_names)))
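    # Encode each feature in the model's feature order; feature indices come
    # from the model metadata extracted above.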
    encoded_row = []
    for feature_name in self._feature_names:
      col = row[feature_name]
      feature_index = self._feature_name_to_index_map[feature_name]
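      # Categorical feature with a one-hot vocabulary: one column per
      # vocabulary entry, 1.0 for the matching entry and None (missing)
      # elsewhere, including when the category is unseen.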
      if feature_index in self._categorical_one_hot_vocab:
        vocab = self._categorical_one_hot_vocab[feature_index]
        one_hot_list = [None] * len(vocab)
        col_value = str(col)
        if col_value in vocab:
          one_hot_list[vocab.index(col_value)] = 1.0
        encoded_row.extend(one_hot_list)
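      # Categorical feature with a target-encoding vocabulary: look up the
      # per-category target statistics, falling back to all-None (missing)
      # for unseen categories.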
      elif feature_index in self._categorical_target_vocab:
        vocab = self._categorical_target_vocab[feature_index]
        col_value = str(col)
        # None will be handled automatically by the xgboost lib as a missing
        # value.
        target_list = vocab.get(col_value,
                                [None] * len(list(vocab.values())[0]))
        # Treat zero values as missing values for multi-class models.
        if len(target_list) > 1:
          target_list = [None if x == 0.0 else x for x in target_list]
        encoded_row.extend(target_list)
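      # Categorical feature with a label vocabulary: encode the category as
      # its index in the vocabulary, or None when it is unseen.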
      elif feature_index in self._categorical_label_vocab:
        vocab = self._categorical_label_vocab[feature_index]
        col_value = str(col)
        if col_value in vocab:
          encoded_row.append(float(vocab.index(col_value)))
        else:
          # Unseen category.
          encoded_row.append(None)
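      # Array feature with a one-hot vocabulary: multi-hot encode the array,
      # setting 1.0 for every vocabulary entry that appears in it.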
      elif feature_index in self._array_one_hot_vocab:
        vocab = self._array_one_hot_vocab[feature_index]
        one_hot_list = [None] * len(vocab)
        try:
          for item in col:
            item_value = str(item)
            if item_value in vocab:
              one_hot_list[vocab.index(item_value)] = 1.0
          encoded_row.extend(one_hot_list)
        except (TypeError, ValueError):
          # Iterating a non-array value raises TypeError.
          raise ValueError('The feature %s in row %d is not an array' %
                           (feature_name, row_index))
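      # Array feature with a target-encoding vocabulary: average the target
      # statistics of the array's items (each item weighted by 1 / len(col)).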
      elif feature_index in self._array_target_vocab:
        vocab = self._array_target_vocab[feature_index]
        target_list = [0.0] * len(list(vocab.values())[0])
        try:
          for item in col:
            item_value = str(item)
            item_target_list = vocab.get(item_value,
                                         [0.0] * len(list(vocab.values())[0]))
            item_target_list = [x / float(len(col)) for x in item_target_list]
            target_list = [sum(x) for x in zip(target_list, item_target_list)]
          # Treat zero values as missing values for multi-class models.
          if len(target_list) > 1:
            target_list = [None if x == 0.0 else x for x in target_list]
          encoded_row.extend(target_list)
        except (TypeError, ValueError):
          # Iterating or taking len() of a non-array value raises TypeError.
          raise ValueError('The feature %s in row %d is not an array' %
                           (feature_name, row_index))
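      # Sparse array-of-structs feature: each item is a (key, value) pair;
      # scatter the values into a dense vector of the configured dimension,
      # leaving absent keys as None (missing).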
      elif (self._array_struct_dimension_dict and
            feature_index in self._array_struct_dimension_dict):
        dimension = self._array_struct_dimension_dict[feature_index]
        array_struct_dense_vector = [None] * dimension
        for item in col:
          key = item[0]
          if key < 0:
            raise ValueError('The key of the sparse feature %s in row %d is '
                             'smaller than 0.' % (feature_name, row_index))
          if key >= dimension:
            raise ValueError('The key of the sparse feature %s in row %d is '
                             'not smaller than the sparse feature dimension '
                             '%d.' % (feature_name, row_index, dimension))
          array_struct_dense_vector[key] = item[1]
        encoded_row.extend(array_struct_dense_vector)
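      # Dense numerical array feature: the array length must match the length
      # recorded in array_numerical_length.txt, and elements are cast to
      # float64.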
      elif (self._array_numerical_length_dict and
            feature_index in self._array_numerical_length_dict):
        length = self._array_numerical_length_dict[feature_index]
        if len(col) != length:
          raise ValueError(
              'The length of the array numerical feature %s in row %d does '
              'not match array_numerical_length.txt.' %
              (feature_name, row_index))
        encoded_row.extend(np.array(col).astype(np.float64))
      else:
        # Numerical feature.
        # Treat an empty string as 0 because XAI uses the empty string as the
        # baseline.
        if col == '':
          encoded_row.append(0.0)
        else:
          try:
            encoded_row.append(float(col))
          except ValueError:
            raise ValueError(
                'The feature %s in row %d cannot be converted to float' %
                (feature_name, row_index))
    preprocessed_data.append(encoded_row)
  return preprocessed_data
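
# A minimal usage sketch of an assumed caller (`predictor`, `booster` and
# `instances` are hypothetical names, not part of this module). None entries
# become NaN when cast to float, which xgboost treats as missing by default:
#
#   import numpy as np
#   import xgboost as xgb
#
#   rows = predictor._preprocess(instances)
#   matrix = xgb.DMatrix(np.array(rows, dtype=float))
#   predictions = booster.predict(matrix)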