in src/datasets/features/features.py [0:0]
def _arrow_to_datasets_dtype(arrow_type: pa.DataType) -> str:
    """
    Convert a `pyarrow.DataType` into the matching `datasets` string dtype.

    For every supported type except dictionaries, the result is meant to satisfy
    `dt == string_to_arrow(_arrow_to_datasets_dtype(dt))`.
    Dictionary types are mapped to the dtype of their *value* type, so the dictionary
    encoding itself is not represented in the returned string.

    Args:
        arrow_type (`pa.DataType`): The Arrow type to convert.

    Returns:
        `str`: The dtype string, e.g. `"int32"`, `"timestamp[us, tz=UTC]"` or
        `"decimal128(10, 2)"`.

    Raises:
        ValueError: If the timestamp has a timezone that is neither `None` nor truthy,
            or if `arrow_type` has no string dtype equivalent handled here.
    """
    types = pyarrow.types

    # Types whose string name carries no parameters.
    if types.is_null(arrow_type):
        return "null"
    if types.is_boolean(arrow_type):
        return "bool"
    if types.is_int8(arrow_type):
        return "int8"
    if types.is_int16(arrow_type):
        return "int16"
    if types.is_int32(arrow_type):
        return "int32"
    if types.is_int64(arrow_type):
        return "int64"
    if types.is_uint8(arrow_type):
        return "uint8"
    if types.is_uint16(arrow_type):
        return "uint16"
    if types.is_uint32(arrow_type):
        return "uint32"
    if types.is_uint64(arrow_type):
        return "uint64"
    if types.is_float16(arrow_type):
        return "float16"  # pyarrow dtype is "halffloat"
    if types.is_float32(arrow_type):
        return "float32"  # pyarrow dtype is "float"
    if types.is_float64(arrow_type):
        return "float64"  # pyarrow dtype is "double"

    # Temporal types: read the unit straight from the type object, as the timestamp and
    # duration branches do, instead of re-parsing the type's string form.
    if types.is_time32(arrow_type):
        return f"time32[{arrow_type.unit}]"
    if types.is_time64(arrow_type):
        return f"time64[{arrow_type.unit}]"
    if types.is_timestamp(arrow_type):
        tz = arrow_type.tz
        if tz is None:
            return f"timestamp[{arrow_type.unit}]"
        if not tz:
            # A timezone that is set but falsy (e.g. an empty string) is rejected.
            raise ValueError(f"Unexpected timestamp object {arrow_type}.")
        return f"timestamp[{arrow_type.unit}, tz={tz}]"
    if types.is_date32(arrow_type):
        return "date32"  # pyarrow dtype is "date32[day]"
    if types.is_date64(arrow_type):
        return "date64"  # pyarrow dtype is "date64[ms]"
    if types.is_duration(arrow_type):
        return f"duration[{arrow_type.unit}]"

    # Decimal types keep their precision and scale in the dtype string.
    if types.is_decimal128(arrow_type):
        return f"decimal128({arrow_type.precision}, {arrow_type.scale})"
    if types.is_decimal256(arrow_type):
        return f"decimal256({arrow_type.precision}, {arrow_type.scale})"

    if types.is_binary(arrow_type):
        return "binary"
    if types.is_large_binary(arrow_type):
        return "large_binary"
    if types.is_string(arrow_type):
        return "string"
    if types.is_large_string(arrow_type):
        return "large_string"

    # Dictionary-encoded columns are described by the dtype of the values they decode to.
    if types.is_dictionary(arrow_type):
        return _arrow_to_datasets_dtype(arrow_type.value_type)

    raise ValueError(f"Arrow type {arrow_type} does not have a datasets dtype equivalent.")