def series_datatype()

in src/smclarify/bias/metrics/common.py [0:0]


def series_datatype(series: pd.Series, values: Optional[List[Any]] = None) -> DataType:
    """
    Deduce whether a data series is categorical or continuous using a set of heuristic rules.

    WARNING: The deduced data type can differ from the real data type of the data series. Please
    use the function `ensure_series_data_type` instead if you'd like to ensure the series data type.

    Rules, applied in order:

    * ``category`` dtype, or more than one entry in ``values``: categorical.
    * String/object dtype: continuous only if the column can be cast to ``int64`` and its
      uniqueness fraction reaches ``UNIQUENESS_THRESHOLD``; categorical otherwise.
    * Floating point dtype: continuous.
    * Integer dtype: continuous if its uniqueness fraction reaches ``UNIQUENESS_THRESHOLD``,
      categorical otherwise.
    * Anything else (e.g. boolean): categorical.

    The decision is logged at INFO level together with the uniqueness fraction.

    :param series: data for facet/label/predicted_label columns
    :param values: list of facet or label values provided by user
    :return: Enum {CATEGORICAL|CONTINUOUS}
    """
    # Categorical is the default; the rules below only ever upgrade it to continuous.
    data_type = DataType.CATEGORICAL
    # Ratio of distinct non-null values to non-null values.
    data_uniqueness_fraction = divide(series.nunique(), series.count())
    dtype = series.dtype

    # Assumption: user will give single value for threshold currently
    # Todo: fix me if multiple thresholds for facet or label are supported
    forced_categorical = dtype.name == "category" or (isinstance(values, list) and len(values) > 1)

    # The pandas dtype predicates are used instead of np.issubdtype because they also accept
    # pandas extension dtypes (nullable Int64/Float64/boolean, string), which NumPy cannot interpret.
    if not forced_categorical:
        if dtype.name in ("str", "string", "object"):
            # Try casting to int; with errors="ignore" a failed cast hands back the original
            # series, so a non-integer result dtype means the data stays categorical.
            # Only the resulting dtype is inspected, so no explicit copy is requested.
            casted_data = series.astype("int64", errors="ignore")
            if (
                pd.api.types.is_integer_dtype(casted_data.dtype)
                and data_uniqueness_fraction >= UNIQUENESS_THRESHOLD
            ):
                data_type = DataType.CONTINUOUS  # type: ignore
        elif pd.api.types.is_float_dtype(dtype):
            data_type = DataType.CONTINUOUS
        elif pd.api.types.is_integer_dtype(dtype):
            # Current rule: integer data whose uniqueness fraction reaches UNIQUENESS_THRESHOLD
            # is treated as continuous.
            # Todo: Needs to be enhanced, This rule doesn't always determine the datatype correctly
            if data_uniqueness_fraction >= UNIQUENESS_THRESHOLD:
                data_type = DataType.CONTINUOUS

    logger.info(
        f"Column {series.name} with data uniqueness fraction {data_uniqueness_fraction} is classifed as a "
        f"{data_type.name} column"
    )
    return data_type