def _parse_raw_feature()

in easy_rec/python/input/input.py


  def _parse_raw_feature(self, fc, parsed_dict, field_dict):
    input_0 = fc.input_names[0]
    feature_name = fc.feature_name if fc.HasField('feature_name') else input_0
    if field_dict[input_0].dtype == tf.string:
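      # when both seq_multi_sep and combiner are set, each sample is a
      # sequence of raw values: split the sequence and keep the sample (row)
      # index of every value so it can be segment-combined back later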
      if fc.HasField('seq_multi_sep') and fc.HasField('combiner'):
        fea = tf.string_split(field_dict[input_0], fc.seq_multi_sep)
        segment_ids = fea.indices[:, 0]
        vals = fea.values
      else:
        vals = field_dict[input_0]
        segment_ids = tf.range(0, tf.shape(vals)[0])
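      # multi-dimensional raw feature: each value is raw_input_dim numbers
      # joined by fc.separator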
      if fc.raw_input_dim > 1:
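        # optional runtime checks (enabled by self._check_mode) validate the
        # split width and the string-to-number conversion via tf.py_func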
        check_list = [
            tf.py_func(
                check_split, [vals, fc.separator, fc.raw_input_dim, input_0],
                Tout=tf.bool)
        ] if self._check_mode else []
        with tf.control_dependencies(check_list):
          tmp_fea = tf.string_split(vals, fc.separator)
        check_list = [
            tf.py_func(
                check_string_to_number, [tmp_fea.values, input_0], Tout=tf.bool)
        ] if self._check_mode else []
        with tf.control_dependencies(check_list):
          tmp_vals = tf.string_to_number(
              tmp_fea.values,
              tf.float32,
              name='multi_raw_fea_to_flt_%s' % input_0)
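        # sequence input: reshape to [num_values, raw_input_dim] and combine
        # each sample's rows into one row with the configured combiner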
        if fc.HasField('seq_multi_sep') and fc.HasField('combiner'):
          emb = tf.reshape(tmp_vals, [-1, fc.raw_input_dim])
          if fc.combiner == 'max':
            emb = tf.segment_max(emb, segment_ids)
          elif fc.combiner == 'sum':
            emb = tf.segment_sum(emb, segment_ids)
          elif fc.combiner == 'min':
            emb = tf.segment_min(emb, segment_ids)
          elif fc.combiner == 'mean':
            emb = tf.segment_mean(emb, segment_ids)
          else:
            assert False, 'unsupported combine operator: ' + fc.combiner
          parsed_dict[feature_name] = emb
        else:
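          # non-sequence input: densify to [batch_size, raw_input_dim],
          # filling missing positions with 0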
          parsed_dict[feature_name] = tf.sparse_to_dense(
              tmp_fea.indices,
              [tf.shape(field_dict[input_0])[0], fc.raw_input_dim],
              tmp_vals,
              default_value=0)
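      # one-dimensional sequence: convert each element to float and
      # segment-combine the sequence back to one value per sample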
      elif fc.HasField('seq_multi_sep') and fc.HasField('combiner'):
        check_list = [
            tf.py_func(check_string_to_number, [vals, input_0], Tout=tf.bool)
        ] if self._check_mode else []
        with tf.control_dependencies(check_list):
          emb = tf.string_to_number(
              vals, tf.float32, name='raw_fea_to_flt_%s' % input_0)
        if fc.combiner == 'max':
          emb = tf.segment_max(emb, segment_ids)
        elif fc.combiner == 'sum':
          emb = tf.segment_sum(emb, segment_ids)
        elif fc.combiner == 'min':
          emb = tf.segment_min(emb, segment_ids)
        elif fc.combiner == 'mean':
          emb = tf.segment_mean(emb, segment_ids)
        else:
          assert False, 'unsupported combine operator: ' + fc.combiner
        parsed_dict[feature_name] = emb
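      # plain single-value raw feature: convert the string to float directly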
      else:
        check_list = [
            tf.py_func(
                check_string_to_number, [field_dict[input_0], input_0],
                Tout=tf.bool)
        ] if self._check_mode else []
        with tf.control_dependencies(check_list):
          parsed_dict[feature_name] = tf.string_to_number(
              field_dict[input_0], tf.float32)
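    # numeric inputs need no parsing, only a cast to float32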
    elif field_dict[input_0].dtype in [
        tf.int32, tf.int64, tf.double, tf.float32
    ]:
      parsed_dict[feature_name] = tf.to_float(field_dict[input_0])
    else:
      assert False, 'invalid dtype[%s] for raw feature' % str(
          field_dict[input_0].dtype)
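    # scale by the configured value range: (x - min_val) / (max_val - min_val)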
    if fc.max_val > fc.min_val:
      parsed_dict[feature_name] = (parsed_dict[feature_name] - fc.min_val) / (
          fc.max_val - fc.min_val)

    if fc.HasField('normalizer_fn'):
      logging.info('apply normalizer_fn %s to `%s`' %
                   (fc.normalizer_fn, feature_name))
      parsed_dict[feature_name] = self._normalizer_fn[feature_name](
          parsed_dict[feature_name])

    if not fc.boundaries and fc.num_buckets <= 1 and \
        fc.embedding_dim > 0 and \
        self._data_config.sample_weight != input_0:
      # may be needed by wide and deep models to project raw values to a
      # vector; this may be better implemented by a ProjectionColumn later
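      # build dense (row, col) index pairs covering every entry of the
      # [batch_size, raw_input_dim] matrix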
      sample_num = tf.to_int64(tf.shape(parsed_dict[feature_name])[0])
      indices_0 = tf.range(sample_num, dtype=tf.int64)
      indices_1 = tf.range(fc.raw_input_dim, dtype=tf.int64)
      indices_0 = indices_0[:, None]
      indices_1 = indices_1[None, :]
      indices_0 = tf.tile(indices_0, [1, fc.raw_input_dim])
      indices_1 = tf.tile(indices_1, [sample_num, 1])
      indices_0 = tf.reshape(indices_0, [-1, 1])
      indices_1 = tf.reshape(indices_1, [-1, 1])
      indices = tf.concat([indices_0, indices_1], axis=1)

      tmp_parsed = parsed_dict[feature_name]
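      # two parallel SparseTensors over the same indices: '_raw_proj_id'
      # carries the column ids, '_raw_proj_val' carries the raw values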
      parsed_dict[feature_name + '_raw_proj_id'] = tf.SparseTensor(
          indices=indices,
          values=indices_1[:, 0],
          dense_shape=[sample_num, fc.raw_input_dim])
      parsed_dict[feature_name + '_raw_proj_val'] = tf.SparseTensor(
          indices=indices,
          values=tf.reshape(tmp_parsed, [-1]),
          dense_shape=[sample_num, fc.raw_input_dim])
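
The seq_multi_sep + combiner branch is the least obvious part of this method, so below is a minimal standalone sketch (not part of input.py) of that path on a toy batch, assuming TensorFlow 1.x and hypothetical settings seq_multi_sep=';', separator='|', raw_input_dim=2, combiner='mean':

# toy illustration only: separators, dims and values are made up
import tensorflow as tf

raw_input_dim = 2
# two samples, each a ';'-separated sequence of '|'-joined 2-dim values
batch = tf.constant(['0.1|0.2;0.3|0.4', '1.0|2.0'])

# split the sequence and remember which sample each element came from
fea = tf.string_split(batch, ';')
segment_ids = fea.indices[:, 0]   # [0, 0, 1]
vals = fea.values                 # ['0.1|0.2', '0.3|0.4', '1.0|2.0']

# split each element into its raw_input_dim components and convert to float
tmp_fea = tf.string_split(vals, '|')
tmp_vals = tf.string_to_number(tmp_fea.values, tf.float32)
emb = tf.reshape(tmp_vals, [-1, raw_input_dim])   # shape [3, 2]

# combiner == 'mean': average each sample's rows back into a single row
emb = tf.segment_mean(emb, segment_ids)

with tf.Session() as sess:
  print(sess.run(emb))            # [[0.2, 0.3], [1.0, 2.0]]

The first sample's two 2-dim values are averaged element-wise, while the second sample's single value passes through unchanged, giving one fixed-size row per sample.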