in easy_rec/python/input/input.py [0:0]
def _parse_raw_feature(self, fc, parsed_dict, field_dict):
    """Parse one raw (numeric) feature and store the result in `parsed_dict`.

    String inputs are converted to float32 with `tf.string_to_number`:

    * if `fc.raw_input_dim > 1`, every value is first split by
      `fc.separator`;
    * if both `seq_multi_sep` and `combiner` are set on `fc`, every input
      string is first split by `fc.seq_multi_sep` and the pieces coming from
      the same input row are reduced with `tf.segment_max` / `segment_sum` /
      `segment_min` / `segment_mean` according to `fc.combiner`.

    Inputs of dtype int32 / int64 / double / float32 are cast with
    `tf.to_float`.

    After parsing, the value is min-max scaled when
    `fc.max_val > fc.min_val`, and then passed through
    `self._normalizer_fn[feature_name]` when `fc.normalizer_fn` is set.
    Finally, when the feature has no boundaries, `num_buckets <= 1`,
    `embedding_dim > 0` and the input is not the sample-weight field, two
    sparse tensors `<feature_name>_raw_proj_id` and
    `<feature_name>_raw_proj_val` are added to `parsed_dict`.

    Args:
      fc: feature config; read through `HasField` and the attributes
        `input_names`, `feature_name`, `seq_multi_sep`, `combiner`,
        `raw_input_dim`, `separator`, `min_val`, `max_val`, `normalizer_fn`,
        `boundaries`, `num_buckets` and `embedding_dim`.
      parsed_dict: dict that receives the parsed tensors; updated in place.
      field_dict: mapping from input name to the raw input tensor.

    Raises:
      AssertionError: if `fc.combiner` is not one of max/sum/min/mean, or if
        the input tensor dtype is not supported.
    """
    input_0 = fc.input_names[0]
    feature_name = fc.feature_name if fc.HasField('feature_name') else input_0
    raw_input = field_dict[input_0]
    # Pure config lookup (creates no ops); hoisted because several branches
    # below depend on it.
    use_seq_combiner = (
        fc.HasField('seq_multi_sep') and fc.HasField('combiner'))

    def _check_deps(check_fn, check_args):
        """Return the py_func validation ops; empty unless in check mode."""
        if not self._check_mode:
            return []
        return [tf.py_func(check_fn, check_args, Tout=tf.bool)]

    def _combine(values, segment_ids):
        """Reduce `values` per segment with the op named by `fc.combiner`."""
        # NOTE(review): tf.segment_* derive the output size from the largest
        # segment id, so trailing input rows that split into no elements
        # would presumably be missing from the result -- confirm that the
        # upstream data never contains empty values here.
        combiner = fc.combiner
        if combiner == 'max':
            return tf.segment_max(values, segment_ids)
        if combiner == 'sum':
            return tf.segment_sum(values, segment_ids)
        if combiner == 'min':
            return tf.segment_min(values, segment_ids)
        if combiner == 'mean':
            return tf.segment_mean(values, segment_ids)
        # Raised explicitly (instead of `assert False`) so the check is not
        # stripped under `python -O`; the exception type is unchanged.
        raise AssertionError('unsupported combine operator: ' + combiner)

    if raw_input.dtype == tf.string:
        if use_seq_combiner:
            seq_fea = tf.string_split(raw_input, fc.seq_multi_sep)
            # First index column = the input row each piece came from.
            segment_ids = seq_fea.indices[:, 0]
            vals = seq_fea.values
        else:
            vals = raw_input
            # Not read on the non-combiner paths below; still built so that
            # the generated graph stays exactly as before.
            segment_ids = tf.range(0, tf.shape(vals)[0])

        if fc.raw_input_dim > 1:
            split_checks = _check_deps(
                check_split,
                [vals, fc.separator, fc.raw_input_dim, input_0])
            with tf.control_dependencies(split_checks):
                tmp_fea = tf.string_split(vals, fc.separator)
            number_checks = _check_deps(
                check_string_to_number, [tmp_fea.values, input_0])
            with tf.control_dependencies(number_checks):
                tmp_vals = tf.string_to_number(
                    tmp_fea.values,
                    tf.float32,
                    name='multi_raw_fea_to_flt_%s' % input_0)
            if use_seq_combiner:
                emb = tf.reshape(tmp_vals, [-1, fc.raw_input_dim])
                parsed_dict[feature_name] = _combine(emb, segment_ids)
            else:
                parsed_dict[feature_name] = tf.sparse_to_dense(
                    tmp_fea.indices,
                    [tf.shape(raw_input)[0], fc.raw_input_dim],
                    tmp_vals,
                    default_value=0)
        elif use_seq_combiner:
            number_checks = _check_deps(
                check_string_to_number, [vals, input_0])
            with tf.control_dependencies(number_checks):
                emb = tf.string_to_number(
                    vals, tf.float32, name='raw_fea_to_flt_%s' % input_0)
            parsed_dict[feature_name] = _combine(emb, segment_ids)
        else:
            number_checks = _check_deps(
                check_string_to_number, [raw_input, input_0])
            with tf.control_dependencies(number_checks):
                parsed_dict[feature_name] = tf.string_to_number(
                    raw_input, tf.float32)
    elif raw_input.dtype in [tf.int32, tf.int64, tf.double, tf.float32]:
        parsed_dict[feature_name] = tf.to_float(raw_input)
    else:
        # Explicit raise so the check survives `python -O`; without it the
        # lookups of parsed_dict[feature_name] below would fail instead.
        raise AssertionError(
            'invalid dtype[%s] for raw feature' % str(raw_input.dtype))

    # Min-max scaling; skipped unless max_val is strictly above min_val.
    if fc.max_val > fc.min_val:
        parsed_dict[feature_name] = (
            parsed_dict[feature_name] - fc.min_val) / (
                fc.max_val - fc.min_val)

    if fc.HasField('normalizer_fn'):
        logging.info('apply normalizer_fn %s to `%s`' %
                     (fc.normalizer_fn, feature_name))
        parsed_dict[feature_name] = self._normalizer_fn[feature_name](
            parsed_dict[feature_name])

    if not fc.boundaries and fc.num_buckets <= 1 and \
            fc.embedding_dim > 0 and \
            self._data_config.sample_weight != input_0:
        # may be needed by the wide model and the deep model to project
        # raw values to a vector; it may be better implemented
        # by a ProjectionColumn later
        sample_num = tf.to_int64(tf.shape(parsed_dict[feature_name])[0])
        row_ids = tf.range(sample_num, dtype=tf.int64)
        col_ids = tf.range(fc.raw_input_dim, dtype=tf.int64)
        row_ids = row_ids[:, None]
        col_ids = col_ids[None, :]
        # Tile both to [sample_num, raw_input_dim], then flatten to one
        # column each, so that every (row, col) position gets an index pair.
        row_ids = tf.tile(row_ids, [1, fc.raw_input_dim])
        col_ids = tf.tile(col_ids, [sample_num, 1])
        row_ids = tf.reshape(row_ids, [-1, 1])
        col_ids = tf.reshape(col_ids, [-1, 1])
        indices = tf.concat([row_ids, col_ids], axis=1)
        tmp_parsed = parsed_dict[feature_name]
        # ids are the column positions; vals are the flattened raw values.
        parsed_dict[feature_name + '_raw_proj_id'] = tf.SparseTensor(
            indices=indices,
            values=col_ids[:, 0],
            dense_shape=[sample_num, fc.raw_input_dim])
        parsed_dict[feature_name + '_raw_proj_val'] = tf.SparseTensor(
            indices=indices,
            values=tf.reshape(tmp_parsed, [-1]),
            dense_shape=[sample_num, fc.raw_input_dim])