in python/pyarrow/pandas_compat.py [0:0]
def _get_extension_dtypes(table, columns_metadata, types_mapper, options, categories):
    """
    Work out which columns should be converted to a pandas extension dtype.

    Candidate dtypes are gathered from several sources. A later source only
    contributes for column names that no earlier source has claimed:

    1. ``types_mapper``, called with the arrow type of every schema field.
    2. Arrow extension types in the schema whose ``to_pandas_dtype()`` is
       implemented.
    3. The stored pandas column metadata. Its 'numpy_type' entry holds the
       string representation of the original pandas dtype (despite the
       name, it is this entry and not 'pandas_type' that is used). A dtype
       is built from that string and accepted when it is an extension
       dtype that defines ``__from_arrow__``.
    4. For pandas 3.0+, pandas' new default string dtype for string,
       large_string and string_view columns.

    Parameters
    ----------
    table : object exposing ``schema``
        The schema is iterated field by field (``name`` / ``type``) and
        queried by column name. Presumably a pyarrow Table.
    columns_metadata : iterable of dict
        Per-column pandas metadata. Each entry is read for 'field_name'
        (falling back to 'name') and for 'numpy_type'.
    types_mapper : callable or None
        When truthy, called with an arrow type; a return value other than
        None is used as the dtype for that column.
    options : mapping
        Only the "strings_to_categorical" entry is read here.
    categories : collection of str or None
        Column names requested as categorical. A falsy value is treated as
        an empty list.

    Returns
    -------
    dict
        Mapping of column name to the pandas dtype to convert to. Empty
        when ``_pandas_api.extension_dtype`` is None.
    """
    to_categorical = options["strings_to_categorical"]
    categories = categories or []

    ext_columns = {}

    # older pandas version that does not yet support extension dtypes
    if _pandas_api.extension_dtype is None:
        return ext_columns

    schema = table.schema

    # 1) user-supplied mapping of built-in arrow types to pandas dtypes
    if types_mapper:
        for field in schema:
            mapped = types_mapper(field.type)
            if mapped is not None:
                ext_columns[field.name] = mapped

    # 2) arrow extension types present in the schema
    for field in schema:
        arrow_type = field.type
        if field.name in ext_columns:
            continue
        if not isinstance(arrow_type, pa.BaseExtensionType):
            continue
        try:
            from_ext_type = arrow_type.to_pandas_dtype()
        except NotImplementedError:
            # this extension type offers no pandas dtype; leave the column
            # to the remaining inference steps
            pass
        else:
            ext_columns[field.name] = from_ext_type

    # 3) dtypes recorded in the pandas metadata
    for meta in columns_metadata:
        try:
            name = meta['field_name']
        except KeyError:
            name = meta['name']
        dtype_repr = meta['numpy_type']

        # building a pandas dtype is expensive, so skip columns that are
        # already decided and dtype strings that are certainly numpy dtypes
        if name in ext_columns or dtype_repr in _pandas_supported_numpy_types:
            continue

        candidate = _pandas_api.pandas_dtype(dtype_repr)
        if not isinstance(candidate, _pandas_api.extension_dtype):
            continue

        if isinstance(candidate, _pandas_api.pd.StringDtype):
            # the metadata asks for the string dtype; do not honour that
            # when:
            # - strings in general, or this column in particular, are to
            #   be converted to categorical
            # - the column itself is dictionary encoded and would
            #   otherwise be converted to categorical
            if to_categorical or name in categories:
                continue
            try:
                if pa.types.is_dictionary(schema.field(name).type):
                    continue
            except KeyError:
                # the schema lookup by this name failed; nothing to check
                pass

        if hasattr(candidate, "__from_arrow__"):
            ext_columns[name] = candidate

    # 4) for pandas 3.0+, use pandas' new default string dtype
    if _pandas_api.uses_string_dtype() and not to_categorical:
        for field in schema:
            if field.name in ext_columns:
                continue
            arrow_type = field.type
            is_stringlike = (
                pa.types.is_string(arrow_type)
                or pa.types.is_large_string(arrow_type)
                or pa.types.is_string_view(arrow_type)
            )
            if is_stringlike and field.name not in categories:
                ext_columns[field.name] = _pandas_api.pd.StringDtype(na_value=np.nan)

    return ext_columns