# process_features — from easy_rec/python/utils/convert_rtp_fg.py


def process_features(feature_type,
                     feature_name,
                     feature,
                     pipeline_config,
                     embedding_dim,
                     incol_separator,
                     is_sequence=False):
  """Convert one RTP-FG feature definition into EasyRec configs.

  Builds a FeatureConfig and a DatasetConfig.Field for the given feature
  and appends both to `pipeline_config` in place. If the feature carries
  an 'extra_combo_info' entry, an additional ComboFeature config crossing
  this feature with the listed features is appended as well.

  Args:
    feature_type: RTP feature type string, one of 'id_feature',
      'lookup_feature', 'raw_feature', 'match_feature', 'combo_feature',
      'overlap_feature' (any of these when is_sequence is True).
    feature_name: name of the feature, also used as the input column name.
    feature: dict parsed from the RTP-FG feature definition.
    pipeline_config: EasyRecConfig proto, updated in place.
    embedding_dim: default embedding dimension, used when the feature does
      not set 'embedding_dimension' / 'embedding_dim'.
    incol_separator: separator for multi-value input columns.
    is_sequence: True if the feature belongs to a sequence feature group.

  Returns:
    The updated pipeline_config (the same object that was passed in).

  Raises:
    ValueError: if feature_type is not one of the supported types.
  """
  feature_config = FeatureConfig()
  feature_config.input_names.append(feature_name)
  feature_config.separator = incol_separator
  input_field = DatasetConfig.Field()
  input_field.input_name = feature_name
  # 'embedding_dimension' (RTP naming) takes precedence over 'embedding_dim'.
  curr_embed_dim = feature.get('embedding_dimension',
                               feature.get('embedding_dim', embedding_dim))
  curr_combiner = feature.get('combiner', 'sum')
  if feature.get('is_cache', False):
    logging.info('will cache %s' % feature_name)
    feature_config.is_cache = True
  is_multi = feature.get('is_multi', False)
  if is_sequence:
    feature_config.feature_type = feature_config.SequenceFeature
    feature_config.embedding_dim = curr_embed_dim
    if feature_type == 'raw_feature':
      feature_config.sub_feature_type = feature_config.RawFeature
      input_field.default_val = feature.get('default_value', '0.0')
      raw_input_dim = feature.get('value_dimension', 1)
      if 'boundaries' in feature:
        feature_config.boundaries.extend(feature['boundaries'])
      if raw_input_dim > 1:
        feature_config.raw_input_dim = raw_input_dim
    else:
      # All non-raw sequence sub-features are treated as id features.
      feature_config.sub_feature_type = feature_config.IdFeature
      _set_hash_bucket(feature, feature_config, input_field)
      feature_config.combiner = curr_combiner
  elif feature_type == 'id_feature':
    if is_multi:
      feature_config.feature_type = feature_config.TagFeature
      kv_separator = feature.get('kv_separator', None)
      if kv_separator:
        feature_config.kv_separator = kv_separator
    else:
      feature_config.feature_type = feature_config.IdFeature
    feature_config.embedding_dim = curr_embed_dim
    _set_hash_bucket(feature, feature_config, input_field)
    feature_config.combiner = curr_combiner
  elif feature_type == 'lookup_feature':
    need_discrete = feature.get('needDiscrete', True)
    if not need_discrete:
      _gen_raw_config(feature, input_field, feature_config, is_multi,
                      curr_embed_dim)
    else:
      feature_config.feature_type = feature_config.TagFeature
      # needWeighting: values arrive as key:value pairs; the empty
      # kv_separator signals weighted-tag parsing downstream.
      if feature.get('needWeighting', False):
        feature_config.kv_separator = ''
      feature_config.embedding_dim = curr_embed_dim
      _set_hash_bucket(feature, feature_config, input_field)
      feature_config.combiner = curr_combiner
  elif feature_type == 'raw_feature':
    _gen_raw_config(feature, input_field, feature_config, is_multi,
                    curr_embed_dim)
  elif feature_type == 'match_feature':
    need_discrete = feature.get('needDiscrete', True)
    # 'multihit' match results contain multiple values per sample.
    if feature.get('matchType', '') == 'multihit':
      is_multi = True
    if need_discrete:
      feature_config.feature_type = feature_config.TagFeature
      if feature.get('needWeighting', False):
        feature_config.kv_separator = ''
      feature_config.embedding_dim = curr_embed_dim
      _set_hash_bucket(feature, feature_config, input_field)
      feature_config.combiner = curr_combiner
    else:
      assert 'bucketize_boundaries' not in feature
      _gen_raw_config(feature, input_field, feature_config, is_multi,
                      curr_embed_dim)
  elif feature_type == 'combo_feature':
    feature_config.feature_type = feature_config.TagFeature
    _set_hash_bucket(feature, feature_config, input_field)
    feature_config.embedding_dim = curr_embed_dim
    feature_config.combiner = curr_combiner
  elif feature_type == 'overlap_feature':
    if feature['method'] in ['common_word_divided', 'diff_word_divided']:
      feature_config.feature_type = feature_config.TagFeature
    else:
      feature_config.feature_type = feature_config.IdFeature
    _set_hash_bucket(feature, feature_config, input_field)
    feature_config.embedding_dim = curr_embed_dim
    feature_config.combiner = curr_combiner
  else:
    # BUGFIX: the original `assert '...%s...' % feature_type` asserted a
    # non-empty string (always truthy), so unknown types slipped through
    # silently with an incomplete feature_config. Fail loudly instead.
    raise ValueError('unknown feature type %s, currently not supported' %
                     feature_type)
  if 'shared_name' in feature:
    feature_config.embedding_name = feature['shared_name']
  # Old-style configs store features in feature_configs; new-style configs
  # nest them under feature_config.features.
  if pipeline_config.feature_configs:
    pipeline_config.feature_configs.append(feature_config)
  else:
    pipeline_config.feature_config.features.append(feature_config)
  pipeline_config.data_config.input_fields.append(input_field)

  if 'extra_combo_info' in feature:
    extra_combo_info = feature['extra_combo_info']
    feature_names = extra_combo_info.get('feature_names', [])
    # feature_name itself plus at least one extra name yields the minimum
    # of 2 inputs required by a ComboFeature. (Original message claimed
    # "greater than 2", contradicting the >= 1 check.)
    assert len(
        feature_names
    ) >= 1, 'The feature number for ComboFeature must be at least 2.'
    combo_feature_config = FeatureConfig()
    combo_feature_config.input_names.append(feature_name)

    for fea_name in feature_names:
      combo_feature_config.input_names.append(fea_name)

    # Default generated name, e.g. combo__user_id_item_id, unless the
    # config provides an explicit final_feature_name.
    final_feature_name = 'combo__' + '_'.join(combo_feature_config.input_names)
    final_feature_name = extra_combo_info.get('final_feature_name',
                                              final_feature_name)
    combo_feature_config.feature_name = final_feature_name
    combo_feature_config.feature_type = combo_feature_config.ComboFeature
    curr_embed_dim = extra_combo_info.get(
        'embedding_dimension',
        extra_combo_info.get('embedding_dim', embedding_dim))
    curr_combiner = extra_combo_info.get('combiner', 'mean')
    combo_feature_config.embedding_dim = curr_embed_dim
    combo_feature_config.combiner = curr_combiner
    assert 'hash_bucket_size' in extra_combo_info, 'hash_bucket_size must be set in ComboFeature.'
    # No input_field here: combo inputs reference already-registered fields.
    _set_hash_bucket(extra_combo_info, combo_feature_config, None)

    if pipeline_config.feature_configs:
      pipeline_config.feature_configs.append(combo_feature_config)
    else:
      pipeline_config.feature_config.features.append(combo_feature_config)
  return pipeline_config