in petastorm/codecs.py [0:0]
def encode(self, unischema_field, value):
    """Convert a scalar ``value`` to the Python type matching this codec's Spark type.

    :param unischema_field: Field descriptor; its ``name`` is used in error messages and its
      ``shape`` must be falsy (e.g. an empty tuple ``()``) since this codec handles scalars only.
    :param value: A scalar value. A numpy array with ``shape == ()`` is also accepted and is
      treated as a scalar. Strings are accepted even though they define ``__len__``.
    :return: ``int(value)`` for byte/short/integer/long Spark types, ``float(value)`` for
      float/double, ``bool(value)`` for boolean, ``str(value)`` for string; for any other Spark
      type ``value`` is returned unchanged.
    :raises TypeError: If ``value`` has a ``__len__`` and is neither a ``str`` nor a numpy array
      with an empty shape.
    :raises ValueError: If ``unischema_field.shape`` is non-empty, or if the Spark type is a string
      type and ``value`` is not a ``str``.
    """
    # We treat ndarrays with shape=() as scalars
    unsized_numpy_array = isinstance(value, np.ndarray) and value.shape == ()
    # Validate the input to be a scalar (or an unsized numpy array)
    if not unsized_numpy_array and hasattr(value, '__len__') and (not isinstance(value, str)):
        raise TypeError('Expected a scalar as a value for field \'{}\'. '
                        'Got a non-numpy type\'{}\''.format(unischema_field.name, type(value)))

    if unischema_field.shape:
        # Exceptions do not interpolate %-style arguments the way logging calls do, so the message
        # has to be formatted explicitly before being handed to ValueError.
        raise ValueError('The shape field of unischema_field \'{}\' must be an empty tuple (i.e. \'()\') '
                         'to indicate a scalar. However, the actual shape is {}'
                         .format(unischema_field.name, unischema_field.shape))

    # Lazy loading pyspark to avoid creating pyspark dependency on data reading code path
    # (currently works only with make_batch_reader). We should move all pyspark related code into a separate module.
    # The import is placed after the input validation so that invalid input is reported without requiring pyspark.
    import pyspark.sql.types as sql_types

    spark_type = self._spark_type

    if isinstance(spark_type, (sql_types.ByteType, sql_types.ShortType, sql_types.IntegerType,
                               sql_types.LongType)):
        return int(value)
    if isinstance(spark_type, (sql_types.FloatType, sql_types.DoubleType)):
        return float(value)
    if isinstance(spark_type, sql_types.BooleanType):
        return bool(value)
    if isinstance(spark_type, sql_types.StringType):
        if not isinstance(value, str):
            raise ValueError(
                'Expected a string value for field {}. Got type {}'.format(unischema_field.name, type(value)))
        # str() normalizes str subclasses to a plain built-in str.
        return str(value)

    # Any other Spark type: pass the value through untouched.
    return value