in easy_rec/python/input/hive_rtp_input.py [0:0]
def _parse_csv(self, line):
    """Parse RTP-formatted text into a dict of named tensors.

    The input is first split on ``self._rtp_separator`` into the columns
    named by ``self._input_table_col_names``. Columns listed in
    ``self._selected_cols`` are kept; the last kept column is split again
    on ``self._data_config.separator`` into ``feature_num`` values, each of
    which is passed through ``string_to_number``.

    Args:
        line: value handed unchanged to ``tf.decode_csv`` (presumably a
            string tensor holding one or more raw lines -- confirm against
            the caller).

    Returns:
        A dict keyed by every name in ``self._effective_fields`` and every
        name in ``self._label_fields``, mapping to the parsed values.

    Raises:
        ValueError: if ``self._selected_cols`` is empty or unset. Without
            it no column is kept and there is nothing to split into
            features.
    """
    if not self._selected_cols:
        # Fail early with a clear message instead of an opaque
        # TypeError/IndexError further down.
        raise ValueError('_parse_csv requires non-empty selected_cols')

    # Every selected column except the last is a non-feature column.
    # Computed once instead of re-slicing for every table column.
    # NOTE(review): the "last" column is taken in selected_cols order here
    # but in table-column order below; this assumes both orders agree.
    non_feature_cols = self._selected_cols[:-1]

    record_defaults = []
    for field_name in self._input_table_col_names:
        if field_name in non_feature_cols:
            idx = self._input_fields.index(field_name)
            record_defaults.append(
                self.get_type_defaults(self._input_field_types[idx],
                                       self._input_field_defaults[idx]))
        else:
            # Every other column (including the last selected one) is
            # decoded with a string default.
            record_defaults.append('')
    # NOTE(review): this and the two prints below look like leftover debug
    # output; kept as-is so the emitted text does not change.
    print('record_defaults: ', record_defaults)

    table_columns = tf.decode_csv(
        line,
        field_delim=self._rtp_separator,
        record_defaults=record_defaults,
        name='decode_csv')
    print('tmp_fields: ', table_columns)

    # Keep only the selected columns, in table-column order.
    selected = [
        table_columns[idx]
        for idx, field_name in enumerate(self._input_table_col_names)
        if field_name in self._selected_cols
    ]
    print('fields: ', selected)

    labels = selected[:-1]
    packed_features = selected[-1]

    # only for features, labels and sample_weight excluded
    record_types = [
        t for x, t in zip(self._input_fields, self._input_field_types)
        if x not in non_feature_cols
    ]
    feature_num = len(record_types)

    # In check mode, run check_split before the split op is evaluated.
    check_list = [
        tf.py_func(
            check_split,
            [packed_features, self._data_config.separator, feature_num],
            Tout=tf.bool)
    ] if self._check_mode else []
    with tf.control_dependencies(check_list):
        split_features = tf.string_split(
            packed_features, self._data_config.separator, skip_empty=False)
    # One row per input line, one column per feature.
    feature_matrix = tf.reshape(split_features.values, [-1, feature_num])

    rtp_record_defaults = [
        str(self.get_type_defaults(t, v))
        for x, t, v in zip(self._input_fields, self._input_field_types,
                           self._input_field_defaults)
        if x not in non_feature_cols
    ]

    # Non-label selected columns come first, followed by the features.
    parsed = labels[len(self._label_fields):]
    for i in range(feature_num):
        parsed.append(
            string_to_number(feature_matrix[:, i], record_types[i],
                             rtp_record_defaults[i], i))

    field_keys = [x for x in self._input_fields if x not in self._label_fields]
    inputs = {
        name: parsed[field_keys.index(name)]
        for name in self._effective_fields
    }
    # Indexing (not zip) keeps the IndexError if labels is too short.
    for i, label_name in enumerate(self._label_fields):
        inputs[label_name] = labels[i]
    return inputs