in src/smclarify/bias/metrics/pretraining.py [0:0]
def CI(feature: pd.Series, sensitive_facet_index: pd.Series) -> float:
    r"""
    Class Imbalance (CI)

    Measure how under- or over-represented the sensitive group is in the dataset.
    Bias is often generated from an under-representation of the sensitive class,
    especially if the desired "golden truth" is equality across classes, and such
    an imbalance carries over into model predictions.

    .. math::
        CI = \frac{n_a - n_d}{n_a + n_d}

    where :math:`n_d` is the number of instances in the sensitive group (rows where
    ``sensitive_facet_index`` is True) and :math:`n_a` is the number of instances in
    the not sensitive group (rows where ``sensitive_facet_index`` is False).

    A positive value means the sensitive group has fewer instances than the rest of
    the data (under-representation); a negative value means it has more
    (over-representation); zero means both groups have the same size.

    :param feature: input feature
    :param sensitive_facet_index: boolean column indicating sensitive group; its dtype
        is validated through ``require``
    :return: a float in the interval [-1, +1] indicating an under-representation or
        over-representation of the sensitive class. Because both groups must be
        non-empty, the end points themselves are never attained.
    :raises ValueError: if either the sensitive group or its complement is empty
    """
    require(sensitive_facet_index.dtype == bool, "sensitive_facet_index must be of type bool")
    pos = len(feature[sensitive_facet_index])
    neg = len(feature[~sensitive_facet_index])
    if neg == 0:
        raise ValueError("CI: negated facet set is empty. Check that x[~facet] has non-zero length.")
    if pos == 0:
        raise ValueError("CI: facet set is empty. Check that x[facet] has non-zero length.")
    # Both counts are strictly positive past the guards above, so the denominator
    # cannot be zero and no further check is needed.
    return float(neg - pos) / (pos + neg)