def _get_extension_dtypes()

in python/pyarrow/pandas_compat.py [0:0]


def _get_extension_dtypes(table, columns_metadata, types_mapper, options, categories):
    """
    Determine which columns should be converted to a pandas extension
    dtype, based on the stored column pandas metadata and the extension
    types present in the arrow schema.

    The 'numpy_type' field in the column metadata stores the string
    representation of the original pandas dtype (and, despite its name,
    not the 'pandas_type' field).
    Based on this string representation, a pandas/numpy dtype is constructed
    and then we can check if this dtype supports conversion from arrow.

    Returns a dict mapping column name -> pandas dtype.
    """
    strings_to_categorical = options["strings_to_categorical"]
    categories = categories or []

    ext_columns = {}

    # older pandas version that does not yet support extension dtypes
    if _pandas_api.extension_dtype is None:
        return ext_columns

    # 1) the user-specified mapping of built-in arrow types to pandas dtypes
    #    takes highest precedence
    if types_mapper:
        for field in table.schema:
            mapped = types_mapper(field.type)
            if mapped is not None:
                ext_columns[field.name] = mapped

    # 2) extension types declared in the arrow schema itself
    for field in table.schema:
        if field.name in ext_columns:
            continue
        if not isinstance(field.type, pa.BaseExtensionType):
            continue
        try:
            ext_columns[field.name] = field.type.to_pandas_dtype()
        except NotImplementedError:
            # extension type without a pandas equivalent; leave unmapped
            pass

    # 3) extension dtypes recorded in the stored pandas metadata
    for meta in columns_metadata:
        if 'field_name' in meta:
            name = meta['field_name']
        else:
            name = meta['name']
        stored_dtype = meta['numpy_type']

        if name in ext_columns or stored_dtype in _pandas_supported_numpy_types:
            # pandas_dtype is expensive, so avoid doing this for types
            # that are certainly numpy dtypes
            continue
        dtype = _pandas_api.pandas_dtype(stored_dtype)
        if not isinstance(dtype, _pandas_api.extension_dtype):
            continue
        if isinstance(dtype, _pandas_api.pd.StringDtype):
            # when the metadata indicate to use the string dtype,
            # ignore this in case:
            # - it is specified to convert strings / this column to categorical
            # - the column itself is dictionary encoded and would otherwise be
            #   converted to categorical
            if strings_to_categorical or name in categories:
                continue
            try:
                if pa.types.is_dictionary(table.schema.field(name).type):
                    continue
            except KeyError:
                # metadata references a column not present in the schema
                pass
        if hasattr(dtype, "__from_arrow__"):
            ext_columns[name] = dtype

    # 4) for pandas 3.0+, use pandas' new default string dtype for any
    #    remaining string-like columns
    if _pandas_api.uses_string_dtype() and not strings_to_categorical:
        string_checks = (
            pa.types.is_string,
            pa.types.is_large_string,
            pa.types.is_string_view,
        )
        for field in table.schema:
            if field.name in ext_columns or field.name in categories:
                continue
            if any(check(field.type) for check in string_checks):
                ext_columns[field.name] = _pandas_api.pd.StringDtype(na_value=np.nan)

    return ext_columns