in tensorflow_transform/coders/csv_coder.py [0:0]
def __init__(self,
             column_names,
             schema,
             delimiter=',',
             secondary_delimiter=None,
             multivalent_columns=None):
    """Sets up the coder and builds one handler per CSV column it needs.

    The schema is converted to a feature spec; every FixedLenFeature and
    VarLenFeature gets a handler for the column with the same name, and every
    SparseFeature gets one handler per index column plus one for its value
    column.

    Args:
      column_names: Tuple of strings. Order must match the order in the file.
      schema: A `Schema` proto.
      delimiter: A one-character string used to separate fields.
      secondary_delimiter: A one-character string used to separate values
        within the same field.
      multivalent_columns: A list of names whose fields must be split on the
        secondary delimiter. The lookup is done by feature name, so the index
        and value columns of a sparse feature are selected through the name
        of that sparse feature.

    Raises:
      ValueError: If `multivalent_columns` is non-empty while
        `secondary_delimiter` is unset, if a column required by the schema is
        absent from `column_names`, or if a feature spec is not one of
        FixedLenFeature, VarLenFeature or SparseFeature.
    """
    self._column_names = column_names
    self._schema = schema
    self._delimiter = delimiter
    self._secondary_delimiter = secondary_delimiter
    self._encoder = self._WriterWrapper(delimiter)

    multivalent = [] if multivalent_columns is None else multivalent_columns
    self._multivalent_columns = multivalent

    # Every multivalent name shares a single writer for the secondary
    # delimiter; names that are not multivalent simply have no entry.
    secondary_encoder_by_name = {}
    if secondary_delimiter:
        shared_writer = self._WriterWrapper(secondary_delimiter)
        secondary_encoder_by_name = dict.fromkeys(multivalent, shared_writer)
    elif multivalent:
        raise ValueError(
            'secondary_delimiter unspecified for multivalent columns "{}"'
            .format(multivalent))

    column_positions = {
        column: position for position, column in enumerate(column_names)
    }

    def position_of(column):
        # Translate a column name into its position within a CSV row.
        if column not in column_positions:
            raise ValueError('Column not found: "{}"'.format(column))
        return column_positions[column]

    handlers = self._feature_handlers = []
    feature_specs = schema_utils.schema_as_feature_spec(schema).feature_spec
    for name, spec in feature_specs.items():
        # Looked up by feature name, including for sparse features whose
        # actual columns are named by their index/value keys.
        secondary = secondary_encoder_by_name.get(name)

        if isinstance(spec, tf.io.FixedLenFeature):
            handlers.append(
                _FixedLenFeatureHandler(
                    name, spec, position_of(name), secondary))
        elif isinstance(spec, tf.io.VarLenFeature):
            handlers.append(
                _VarLenFeatureHandler(
                    name, spec.dtype, position_of(name), secondary))
        elif isinstance(spec, tf.io.SparseFeature):
            index_keys = spec.index_key
            if not isinstance(index_keys, list):
                index_keys = [index_keys]
            # Index columns come first (always int64), then the value column.
            sparse_columns = [(key, tf.int64) for key in index_keys]
            sparse_columns.append((spec.value_key, spec.dtype))
            for column, dtype in sparse_columns:
                handlers.append(
                    _VarLenFeatureHandler(
                        column, dtype, position_of(column), secondary))
        else:
            raise ValueError(
                'feature_spec should be one of tf.FixedLenFeature, '
                'tf.VarLenFeature or tf.SparseFeature: {!r} was {!r}'.format(
                    name, type(spec)))