# easy_rec/python/utils/convert_rtp_fg.py
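# For orientation, a sketch of the kind of parsed RTP FG feature entry that
# process_features() below consumes. The field names are the ones the function
# reads via feature.get(...); the concrete values are illustrative assumptions,
# not taken from a real FG json:
#
#   {
#     "feature_type": "id_feature",
#     "feature_name": "user_id",
#     "embedding_dimension": 16,
#     "hash_bucket_size": 100000,
#     "is_multi": false,
#     "combiner": "sum"
#   }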
def process_features(feature_type,
                     feature_name,
                     feature,
                     pipeline_config,
                     embedding_dim,
                     incol_separator,
                     is_sequence=False):
  """Convert one parsed RTP FG feature entry into EasyRec configs.

  Builds a FeatureConfig and a DatasetConfig.Field for `feature_name` from the
  RTP FG `feature` dict, appends both to `pipeline_config`, and returns the
  updated pipeline_config. `embedding_dim` is the global default used when the
  entry does not set its own; `incol_separator` is the in-column separator for
  multi-value inputs.
  """
  feature_config = FeatureConfig()
  feature_config.input_names.append(feature_name)
  feature_config.separator = incol_separator
  input_field = DatasetConfig.Field()
  input_field.input_name = feature_name
  curr_embed_dim = feature.get('embedding_dimension',
                               feature.get('embedding_dim', embedding_dim))
  curr_combiner = feature.get('combiner', 'sum')
  if feature.get('is_cache', False):
    logging.info('will cache %s' % feature_name)
    feature_config.is_cache = True
  is_multi = feature.get('is_multi', False)
  # is_seq = feature.get('is_seq', False)
  if is_sequence:
    # Sequence features: raw-valued sequences keep their numeric values
    # (optionally bucketized via boundaries); everything else is hashed
    # like an id feature.
    feature_config.feature_type = feature_config.SequenceFeature
    feature_config.embedding_dim = curr_embed_dim
    if feature_type == 'raw_feature':
      feature_config.sub_feature_type = feature_config.RawFeature
      input_field.default_val = feature.get('default_value', '0.0')
      raw_input_dim = feature.get('value_dimension', 1)
      if 'boundaries' in feature:
        feature_config.boundaries.extend(feature['boundaries'])
      if raw_input_dim > 1:
        feature_config.raw_input_dim = raw_input_dim
    else:
      feature_config.sub_feature_type = feature_config.IdFeature
      _set_hash_bucket(feature, feature_config, input_field)
    feature_config.combiner = curr_combiner
  elif feature_type == 'id_feature':
    if is_multi:
      # Multi-valued id features become TagFeature, optionally weighted
      # via a key-value separator.
      feature_config.feature_type = feature_config.TagFeature
      kv_separator = feature.get('kv_separator', None)
      if kv_separator:
        feature_config.kv_separator = kv_separator
    # elif is_seq:
    #   feature_config.feature_type = feature_config.SequenceFeature
    else:
      feature_config.feature_type = feature_config.IdFeature
    feature_config.embedding_dim = curr_embed_dim
    _set_hash_bucket(feature, feature_config, input_field)
    feature_config.combiner = curr_combiner
  elif feature_type == 'lookup_feature':
    need_discrete = feature.get('needDiscrete', True)
    if not need_discrete:
      # Continuous lookup values are handled as raw features.
      _gen_raw_config(feature, input_field, feature_config, is_multi,
                      curr_embed_dim)
    else:
      feature_config.feature_type = feature_config.TagFeature
      if feature.get('needWeighting', False):
        feature_config.kv_separator = ''
      feature_config.embedding_dim = curr_embed_dim
      _set_hash_bucket(feature, feature_config, input_field)
      feature_config.combiner = curr_combiner
  elif feature_type == 'raw_feature':
    _gen_raw_config(feature, input_field, feature_config, is_multi,
                    curr_embed_dim)
  elif feature_type == 'match_feature':
    need_discrete = feature.get('needDiscrete', True)
    if feature.get('matchType', '') == 'multihit':
      is_multi = True
    if need_discrete:
      feature_config.feature_type = feature_config.TagFeature
      if feature.get('needWeighting', False):
        feature_config.kv_separator = ''
      feature_config.embedding_dim = curr_embed_dim
      _set_hash_bucket(feature, feature_config, input_field)
      feature_config.combiner = curr_combiner
    else:
      # Continuous match features must not carry bucketize_boundaries.
      assert 'bucketize_boundaries' not in feature
      _gen_raw_config(feature, input_field, feature_config, is_multi,
                      curr_embed_dim)
  elif feature_type == 'combo_feature':
    feature_config.feature_type = feature_config.TagFeature
    _set_hash_bucket(feature, feature_config, input_field)
    feature_config.embedding_dim = curr_embed_dim
    feature_config.combiner = curr_combiner
  elif feature_type == 'overlap_feature':
    if feature['method'] in ['common_word_divided', 'diff_word_divided']:
      feature_config.feature_type = feature_config.TagFeature
    else:
      feature_config.feature_type = feature_config.IdFeature
    _set_hash_bucket(feature, feature_config, input_field)
    feature_config.embedding_dim = curr_embed_dim
    feature_config.combiner = curr_combiner
  else:
    # `assert '<non-empty string>'` always passes, so unknown feature types
    # slipped through silently; fail explicitly instead.
    assert False, \
        'unknown feature type %s, currently not supported' % feature_type
  if 'shared_name' in feature:
    feature_config.embedding_name = feature['shared_name']
  # Support both config layouts: the legacy top-level feature_configs list
  # and the newer feature_config.features.
  if pipeline_config.feature_configs:
    pipeline_config.feature_configs.append(feature_config)
  else:
    pipeline_config.feature_config.features.append(feature_config)
  pipeline_config.data_config.input_fields.append(input_field)
  if 'extra_combo_info' in feature:
    extra_combo_info = feature['extra_combo_info']
    feature_names = extra_combo_info.get('feature_names', [])
    assert len(feature_names) >= 1, \
        'extra_combo_info.feature_names must list at least one extra input ' \
        'so that the ComboFeature has at least two inputs in total.'
    combo_feature_config = FeatureConfig()
    combo_feature_config.input_names.append(feature_name)
    for fea_name in feature_names:
      combo_feature_config.input_names.append(fea_name)
    final_feature_name = 'combo__' + '_'.join(
        combo_feature_config.input_names)
    final_feature_name = extra_combo_info.get('final_feature_name',
                                              final_feature_name)
    combo_feature_config.feature_name = final_feature_name
    combo_feature_config.feature_type = combo_feature_config.ComboFeature
    curr_embed_dim = extra_combo_info.get(
        'embedding_dimension',
        extra_combo_info.get('embedding_dim', embedding_dim))
    curr_combiner = extra_combo_info.get('combiner', 'mean')
    combo_feature_config.embedding_dim = curr_embed_dim
    combo_feature_config.combiner = curr_combiner
    assert 'hash_bucket_size' in extra_combo_info, \
        'hash_bucket_size must be set for ComboFeature.'
    _set_hash_bucket(extra_combo_info, combo_feature_config, None)
    if pipeline_config.feature_configs:
      pipeline_config.feature_configs.append(combo_feature_config)
    else:
      pipeline_config.feature_config.features.append(combo_feature_config)
  return pipeline_config
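

# A minimal usage sketch, assuming `pipeline_config` is an EasyRec pipeline
# proto whose data_config and feature_config sub-messages already exist, and
# assuming '\x1d' (chr(29)) as the in-column multi-value separator; the
# feature dict is illustrative, not taken from a real FG json:
#
#   feature = {
#       'embedding_dimension': 16,
#       'hash_bucket_size': 100000,
#   }
#   pipeline_config = process_features(
#       feature_type='id_feature',
#       feature_name='user_id',
#       feature=feature,
#       pipeline_config=pipeline_config,
#       embedding_dim=8,
#       incol_separator='\x1d',
#       is_sequence=False)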