in smallpond/io/arrow.py [0:0]
def _convert_field_to_large(field: arrow.Field) -> arrow.Field:
    """
    Rebuild `field` with its type converted by `convert_type_to_large`,
    keeping the field's name and nullability.
    """
    return arrow.field(
        field.name,
        convert_type_to_large(field.type),
        nullable=field.nullable,
    )


def convert_type_to_large(type_: arrow.DataType) -> arrow.DataType:
    """
    Convert all string and binary types to large types recursively.

    `string` becomes `large_string` and `binary` becomes `large_binary`.
    Nested types (list, large_list, fixed_size_list, struct, map) are rebuilt
    with their children converted; the container kind itself is not changed.
    For list-like and struct types the child field names and nullability are
    preserved, and for map types the `keys_sorted` flag is preserved.
    Any other type is returned unchanged.

    Args:
        type_: The Arrow data type to convert.

    Returns:
        The equivalent Arrow data type with string/binary replaced by their
        large variants at every nesting level handled above.
    """
    # Since arrow uses 32-bit signed offsets for string and binary types, convert all string and binary columns
    # to large_string and large_binary to avoid offset overflow, see https://issues.apache.org/jira/browse/ARROW-17828.
    if arrow.types.is_string(type_):
        return arrow.large_string()
    if arrow.types.is_binary(type_):
        return arrow.large_binary()

    # List-like containers: pass the converted value *field* (not just the
    # value type) so the element name and nullability survive the rebuild.
    if isinstance(type_, arrow.ListType):
        return arrow.list_(_convert_field_to_large(type_.value_field))
    if isinstance(type_, arrow.LargeListType):
        return arrow.large_list(_convert_field_to_large(type_.value_field))
    if isinstance(type_, arrow.FixedSizeListType):
        return arrow.list_(
            _convert_field_to_large(type_.value_field),
            list_size=type_.list_size,
        )

    if isinstance(type_, arrow.StructType):
        return arrow.struct([_convert_field_to_large(field) for field in type_])

    if isinstance(type_, arrow.MapType):
        return arrow.map_(
            convert_type_to_large(type_.key_type),
            convert_type_to_large(type_.item_type),
            keys_sorted=type_.keys_sorted,
        )

    return type_