in src/smclarify/bias/metrics/common.py [0:0]
def series_datatype(series: pd.Series, values: Optional[List[Any]] = None) -> DataType:
    """
    Deduce whether a data series is categorical or continuous using a set of heuristic rules.

    WARNING: The deduced data type can be different from the real data type of the data series.
    Please use the function `ensure_series_data_type` instead if you'd like to ensure the series
    data type.

    Rules, applied in order:
      * ``category`` dtype, or more than one user-provided value -> CATEGORICAL
      * str/string/object dtype -> CONTINUOUS only when the uniqueness fraction reaches
        UNIQUENESS_THRESHOLD and the whole series can be cast to int64, otherwise CATEGORICAL
      * floating dtype -> CONTINUOUS
      * integer dtype -> CONTINUOUS when the uniqueness fraction reaches UNIQUENESS_THRESHOLD,
        otherwise CATEGORICAL
      * anything else (e.g. boolean) -> CATEGORICAL

    :param series: data for facet/label/predicted_label columns
    :param values: list of facet or label values provided by user
    :return: Enum {CATEGORICAL|CONTINUOUS}
    """
    # Ratio of distinct values to counted values; drives the integer-like rules below.
    data_uniqueness_fraction = divide(series.nunique(), series.count())
    data_type = _deduce_data_type(series, values, data_uniqueness_fraction)
    logger.info(
        f"Column {series.name} with data uniqueness fraction {data_uniqueness_fraction} is classifed as a "
        f"{data_type.name} column"
    )
    return data_type


def _deduce_data_type(series: pd.Series, values: Optional[List[Any]], uniqueness_fraction: Any) -> DataType:
    """Apply the classification rules documented on `series_datatype` and return the deduced type."""
    dtype = series.dtype

    # Assumption: user will give single value for threshold currently
    # Todo: fix me if multiple thresholds for facet or label are supported
    if dtype.name == "category" or (isinstance(values, list) and len(values) > 1):
        return DataType.CATEGORICAL

    if dtype.name in ("str", "string", "object"):
        # Check the cheap threshold first so the full-column cast is only attempted when its
        # outcome can actually change the result.
        if uniqueness_fraction >= UNIQUENESS_THRESHOLD and _is_castable_to_int64(series):
            return DataType.CONTINUOUS
        return DataType.CATEGORICAL

    if isinstance(dtype, np.dtype):
        is_floating = np.issubdtype(dtype, np.floating)
        is_integer = np.issubdtype(dtype, np.integer)
    else:
        # pandas extension dtypes (nullable Int64/Float64/boolean, Arrow-backed, ...) cannot be
        # interpreted by np.issubdtype, which raises TypeError; use the pandas checks instead.
        is_floating = pd.api.types.is_float_dtype(dtype)
        is_integer = pd.api.types.is_integer_dtype(dtype)

    if is_floating:
        return DataType.CONTINUOUS
    # Current rule: integer data is continuous when its uniqueness fraction reaches
    # UNIQUENESS_THRESHOLD (described as 5% of unique values by the original author).
    # Todo: Needs to be enhanced, This rule doesn't always determine the datatype correctly
    if is_integer and uniqueness_fraction >= UNIQUENESS_THRESHOLD:
        return DataType.CONTINUOUS
    return DataType.CATEGORICAL


def _is_castable_to_int64(series: pd.Series) -> bool:
    """Return True when the whole series can be cast to int64, False when the cast is rejected."""
    try:
        series.astype("int64")
    except (ValueError, TypeError):
        # Same exceptions that ``astype(..., errors="ignore")`` used to suppress.
        return False
    return True