in src/datasets/arrow_writer.py [0:0]
def __arrow_array__(self, type: Optional[pa.DataType] = None):
"""This function is called when calling pa.array(typed_sequence)"""
if type is not None:
raise ValueError("TypedSequence is supposed to be used with pa.array(typed_sequence, type=None)")
del type # make sure we don't use it
data = self.data
# automatic type inference for custom objects
if self.type is None and self.try_type is None:
data, self._inferred_type = self._infer_custom_type_and_encode(data)
if self._inferred_type is None:
type = self.try_type if self.trying_type else self.type
else:
type = self._inferred_type
pa_type = get_nested_type(type) if type is not None else None
optimized_int_pa_type = (
get_nested_type(self.optimized_int_type) if self.optimized_int_type is not None else None
)
trying_cast_to_python_objects = False
try:
# custom pyarrow types
if isinstance(pa_type, _ArrayXDExtensionType):
storage = to_pyarrow_listarray(data, pa_type)
return pa.ExtensionArray.from_storage(pa_type, storage)
# efficient np array to pyarrow array
if isinstance(data, np.ndarray):
out = numpy_to_pyarrow_listarray(data)
elif isinstance(data, list) and data and isinstance(first_non_null_non_empty_value(data)[1], np.ndarray):
out = list_of_np_array_to_pyarrow_listarray(data)
else:
trying_cast_to_python_objects = True
out = pa.array(cast_to_python_objects(data, only_1d_for_numpy=True))
# use smaller integer precisions if possible
if self.trying_int_optimization:
if pa.types.is_int64(out.type):
out = out.cast(optimized_int_pa_type)
elif pa.types.is_list(out.type):
if pa.types.is_int64(out.type.value_type):
out = array_cast(out, pa.list_(optimized_int_pa_type))
elif pa.types.is_list(out.type.value_type) and pa.types.is_int64(out.type.value_type.value_type):
out = array_cast(out, pa.list_(pa.list_(optimized_int_pa_type)))
# otherwise we can finally use the user's type
elif type is not None:
# We use cast_array_to_feature to support casting to custom types like Audio and Image
# Also, when trying type "string", we don't want to convert integers or floats to "string".
# We only do it if trying_type is False - since this is what the user asks for.
out = cast_array_to_feature(
out, type, allow_primitive_to_str=not self.trying_type, allow_decimal_to_str=not self.trying_type
)
return out
except (
TypeError,
pa.lib.ArrowInvalid,
pa.lib.ArrowNotImplementedError,
) as e: # handle type errors and overflows
# Ignore ArrowNotImplementedError caused by trying type, otherwise re-raise
if not self.trying_type and isinstance(e, pa.lib.ArrowNotImplementedError):
raise
if self.trying_type:
try: # second chance
if isinstance(data, np.ndarray):
return numpy_to_pyarrow_listarray(data)
elif isinstance(data, list) and data and any(isinstance(value, np.ndarray) for value in data):
return list_of_np_array_to_pyarrow_listarray(data)
else:
trying_cast_to_python_objects = True
return pa.array(cast_to_python_objects(data, only_1d_for_numpy=True))
except pa.lib.ArrowInvalid as e:
if "overflow" in str(e):
raise OverflowError(
f"There was an overflow with type {type_(data)}. Try to reduce writer_batch_size to have batches smaller than 2GB.\n({e})"
) from None
elif self.trying_int_optimization and "not in range" in str(e):
optimized_int_pa_type_str = np.dtype(optimized_int_pa_type.to_pandas_dtype()).name
logger.info(
f"Failed to cast a sequence to {optimized_int_pa_type_str}. Falling back to int64."
)
return out
elif trying_cast_to_python_objects and "Could not convert" in str(e):
out = pa.array(
cast_to_python_objects(data, only_1d_for_numpy=True, optimize_list_casting=False)
)
if type is not None:
out = cast_array_to_feature(
out, type, allow_primitive_to_str=True, allow_decimal_to_str=True
)
return out
else:
raise
elif "overflow" in str(e):
raise OverflowError(
f"There was an overflow with type {type_(data)}. Try to reduce writer_batch_size to have batches smaller than 2GB.\n({e})"
) from None
elif self.trying_int_optimization and "not in range" in str(e):
optimized_int_pa_type_str = np.dtype(optimized_int_pa_type.to_pandas_dtype()).name
logger.info(f"Failed to cast a sequence to {optimized_int_pa_type_str}. Falling back to int64.")
return out
elif trying_cast_to_python_objects and "Could not convert" in str(e):
out = pa.array(cast_to_python_objects(data, only_1d_for_numpy=True, optimize_list_casting=False))
if type is not None:
out = cast_array_to_feature(out, type, allow_primitive_to_str=True, allow_decimal_to_str=True)
return out
else:
raise