in petastorm/tf_utils.py [0:0]
def _sanitize_field_tf_types(sample):
    """Takes a named tuple and casts/promotes types unknown to TF to the types that are known.

    Casts that are currently implemented:

    - Decimal to string (normalized, so trailing zeros are dropped)
    - uint16 arrays to int32
    - uint32 arrays to int64
    - np.datetime64 arrays to int64, as nanoseconds since unix epoch
    - non-empty bytes/unicode string arrays to (possibly nested) python lists
    - non-empty object arrays whose first element is a ``datetime.date`` are passed through
      ``_date_to_nsec_from_epoch_vectorized``

    All other values (including empty string/object arrays) are left untouched.

    :param sample: named tuple (the value must expose ``_asdict()`` and its class must accept the fields as
      keyword arguments)
    :return: same type as the input with values casted to types supported by Tensorflow
    :raises RuntimeError: if any of the fields is None
    """
    next_sample_dict = sample._asdict()

    # Only values of already existing keys are replaced below, so the dict is never resized while iterating.
    for k, v in next_sample_dict.items():
        if v is None:
            raise RuntimeError('Encountered "{}"=None. Tensorflow does not support None values as a tensor.'
                               'Consider filtering out these rows using a predicate.'.format(k))

        # Assuming conversion to the same numpy type is trivial and dirty cheap
        if isinstance(v, Decimal):
            # Normalizing decimals only to get rid of the trailing zeros (makes testing easier, assuming has
            # no other effect)
            next_sample_dict[k] = str(v.normalize())
            continue

        # Every remaining cast applies to numpy arrays only.
        if not isinstance(v, np.ndarray):
            continue

        if np.issubdtype(v.dtype, np.datetime64):
            # Convert to nanoseconds from POSIX epoch
            next_sample_dict[k] = (v - np.datetime64('1970-01-01T00:00:00.0')) \
                .astype('timedelta64[ns]').astype(np.int64)
        elif v.dtype == np.uint16:
            next_sample_dict[k] = v.astype(np.int32)
        elif v.dtype == np.uint32:
            next_sample_dict[k] = v.astype(np.int64)
        elif v.dtype.kind in ('S', 'U'):
            # 'S'/'U' are the bytes/unicode string kinds. Checking the kind instead of comparing against
            # np.bytes_/np.unicode_ keeps this working on NumPy >= 2.0, where the np.unicode_ alias is gone.
            if v.size != 0:
                next_sample_dict[k] = v.tolist()
        elif v.dtype.kind == 'O' and v.ndim > 0 and v.size > 0 and isinstance(v[0], datetime.date):
            # Pyarrow 0.12.1 started returning python datetime.date when parquet column is a DateType() column.
            # Convert values in such column into nsec from epoch int64.
            # The ndim/size guards keep empty and 0-d object arrays from failing on the v[0] probe.
            next_sample_dict[k] = _date_to_nsec_from_epoch_vectorized(v)

    # Construct object of the same type as the input
    return sample.__class__(**next_sample_dict)