in tensorflow_data_validation/utils/mutual_information_util.py [0:0]
def _mi_for_arrays(c_arrs0: List[np.ndarray],
                   c_arrs1: List[np.ndarray],
                   d_arrs0: List[np.ndarray],
                   d_arrs1: List[np.ndarray],
                   weights: Optional[np.ndarray] = None,
                   k: int = 3,
                   estimate_method: str = 'larger_data',
                   seed: Optional[int] = None) -> Tuple[float, np.ndarray]:
  """Estimates mutual information between two sides of features.

  Side 0 is described by the continuous feature arrays `c_arrs0` and the
  discrete feature arrays `d_arrs0`; side 1 by `c_arrs1` and `d_arrs1`.
  The estimate is assembled from pairwise high-dimensional MI terms:

      MI = MI(c01; d01) + MI(c0; c1) + MI(d0; d1) - MI(c0; d0) - MI(c1; d1)

  clipped below at zero. Terms whose operands are absent (no continuous or
  no discrete features on the relevant side) contribute zero.

  Args:
    c_arrs0: Continuous (ordinal) feature arrays of side 0.
    c_arrs1: Continuous (ordinal) feature arrays of side 1.
    d_arrs0: Discrete (categorical) feature arrays of side 0.
    d_arrs1: Discrete (categorical) feature arrays of side 1.
    weights: Optional per-example weights; defaults to all ones with the
      shape of the first feature array.
    k: Number of nearest neighbors forwarded to the kNN-based estimators.
    estimate_method: Estimator variant forwarded to the `_mi_high_dim_*`
      helpers.
    seed: If set, seeds numpy's global RNG (the noise added by
      `_to_noisy_numpy_array` draws from it).

  Returns:
    A pair of (total MI clipped at zero, ndarray of elementwise MI
    contributions as produced by the `_mi_high_dim_*` helpers).
  """
  assert (bool(c_arrs0 + d_arrs0) and
          bool(c_arrs1 + d_arrs1)), 'Both sides are expected to be nonempty.'
  # All feature arrays must describe the same number of examples. Note this
  # list keeps references to the *original* (unprocessed) arrays; the first
  # entry is reused below to shape the default weights.
  all_feats = list(itertools.chain(c_arrs0, c_arrs1, d_arrs0, d_arrs1))
  expected_len = len(all_feats[0])
  for feat in all_feats[1:]:
    assert expected_len == len(feat)
  np.random.seed(seed)
  # Preprocess: unit-variance scale the continuous features and fill in
  # their missing values; densify the discrete features.
  c_arrs0 = [
      _replace_none_categorical(_unit_variance_scale(f)) for f in c_arrs0
  ]
  c_arrs1 = [
      _replace_none_categorical(_unit_variance_scale(f)) for f in c_arrs1
  ]
  d_arrs0 = [_to_dense_discrete_array(f) for f in d_arrs0]
  d_arrs1 = [_to_dense_discrete_array(f) for f in d_arrs1]
  # Collapse each side into one noisy continuous matrix and one merged
  # discrete frame; either may be None when that side has no such features.
  arr0 = _to_noisy_numpy_array(c_arrs0)
  arr1 = _to_noisy_numpy_array(c_arrs1)
  df0 = _merge_categorical(d_arrs0)
  df1 = _merge_categorical(d_arrs1)
  if weights is None:
    weights = np.ones_like(all_feats[0], dtype=float)
  # MI(c01; d01): all continuous features vs. all discrete features.
  have_continuous = arr0 is not None or arr1 is not None
  have_discrete = df0 is not None or df1 is not None
  if have_continuous and have_discrete:
    joint_arr = np.hstack([a for a in (arr0, arr1) if a is not None])
    joint_df = _merge_categorical([d for d in (df0, df1) if d is not None])
    mi_c01_d01, each_c01_d01 = _mi_high_dim_cd(joint_arr, joint_df, k,
                                               estimate_method, weights)
  else:
    mi_c01_d01, each_c01_d01 = 0., 0.
  # MI(c0; c1): continuous side 0 vs. continuous side 1.
  if arr0 is not None and arr1 is not None:
    mi_c0_c1, each_c0_c1 = _mi_high_dim_cc(arr0, arr1, k, estimate_method,
                                           weights)
  else:
    mi_c0_c1, each_c0_c1 = 0., 0.
  # MI(d0; d1): discrete side 0 vs. discrete side 1.
  if df0 is not None and df1 is not None:
    mi_d0_d1, each_d0_d1 = _mi_high_dim_dd(df0, df1, weights)
  else:
    mi_d0_d1, each_d0_d1 = 0., 0.
  # MI(c0; d0): within-side-0 cross term, subtracted below.
  if arr0 is not None and df0 is not None:
    mi_c0_d0, each_c0_d0 = _mi_high_dim_cd(arr0, df0, k, estimate_method,
                                           weights)
  else:
    mi_c0_d0, each_c0_d0 = 0., 0.
  # MI(c1; d1): within-side-1 cross term, subtracted below.
  if arr1 is not None and df1 is not None:
    mi_c1_d1, each_c1_d1 = _mi_high_dim_cd(arr1, df1, k, estimate_method,
                                           weights)
  else:
    mi_c1_d1, each_c1_d1 = 0., 0.
  total_mi = max(0., mi_c01_d01 + mi_c0_c1 + mi_d0_d1 - mi_c0_d0 - mi_c1_d1)
  each_total = each_c01_d01 + each_c0_c1 + each_d0_d1 - each_c0_d0 - each_c1_d1
  # Given the nonempty-sides precondition, at least one estimator ran, so the
  # elementwise sum must be an ndarray rather than the scalar placeholder.
  assert isinstance(each_total, np.ndarray)
  return total_mi, each_total