def compute_noisy_counts()

in mozetl/taar/taar_locale.py


# Module-level dependencies this excerpt relies on. The import forms are
# inferred from the bare names used below; EPSILON's value here is a
# placeholder for the module's actual default privacy budget.
from numpy.random import laplace as rlaplace
from pandas import DataFrame, IndexSlice

EPSILON = 1.0  # placeholder default privacy budget


def compute_noisy_counts(locale_addon_counts, addon_limits, whitelist, eps=EPSILON):
    """Apply DP protections to the raw per-locale add-on frequency counts.

    Laplace noise is added to each of the counts. Additionally, each per-locale
    set of frequency counts is expanded to include every add-on in the
    whitelist, even if some were not observed in the raw data.

    This computation is done in local memory, rather than in Spark, to simplify
    working with random number generation. It relies on the assumption that the
    number of unique locales and the number of whitelisted add-ons each remain
    small (on the order of 100 to 1,000).

    :param locale_addon_counts: a Pandas DF of per-locale add-on frequency
                                counts, with columns `locale`, `addon`, `count`
    :param addon_limits: a dict mapping locale strings to ints representing the
                         max number of add-ons retained per client in that locale.
                         Any locale not present in the dict is excluded from the
                         final dataset.
    :param whitelist: a list of add-on IDs belonging to the AMO whitelist
    :param eps: the DP epsilon parameter, representing the privacy budget
    :return: a DF with the same structure as `locale_addon_counts`. Counts are
             now real-valued rather than integer, and may be negative.
    """
    # First expand the frequency count table to include all whitelisted add-ons
    # in each locale.
    locale_wl_addons = DataFrame.from_records(
        [(loc, a) for loc in addon_limits.keys() for a in whitelist],
        columns=["locale", "addon"],
    )
    raw_counts = locale_addon_counts.set_index(["locale", "addon"])
    locale_wl = locale_wl_addons.set_index(["locale", "addon"])
    # Left-joining the raw counts onto this grid keeps exactly the locales in
    # the limits dict and every whitelisted add-on within each of them; pairs
    # unseen in the raw data get NaN counts, which fillna converts to 0.
    expanded_counts = locale_wl.join(raw_counts, how="left").fillna(0)
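    # For illustration (hypothetical values): with locales {"en-US", "de"} in
    # addon_limits and whitelist ["addon-a", "addon-b"], the grid has four
    # rows, and any (locale, addon) pair absent from the raw counts carries a
    # count of 0.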

    # Add the Laplace noise.
    #
    # For each add-on in the whitelist, in each locale, we take the observed
    # installation frequency count and add independent random noise.
    # Observed frequencies may be 0 if no profile had those add-ons installed.
    #
    # The random noise is Laplace-distributed with scale parameter $m/\epsilon$,
    # where $\epsilon$ is the DP privacy budget and $m$ is the max number of
    # add-ons reported per client in the current locale.
    #
    # Since the Laplace parametrization depends only on locale, we iterate over
    # locales and add a numpy array of independent simulated Laplace random
    # values to the series of add-on frequency counts.
    #
    # Since the Laplace noise is continuous and real-valued, counts will no
    # longer be integer, and may become negative.
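    #
    # As a worked example (hypothetical numbers): if clients in a locale report
    # at most m = 3 add-ons and eps = 0.4, each count receives Laplace noise
    # with scale 3 / 0.4 = 7.5.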
    for locale in expanded_counts.index.unique("locale"):
        # The scale parameter depends on the max number of add-ons per client,
        # which varies by locale.
        locale_laplace_param = float(addon_limits[locale]) / eps
        # Select counts for all add-ons in the current locale.
        locale_idx = IndexSlice[locale, :]
        locale_counts = expanded_counts.loc[locale_idx, "count"]
        locale_counts += rlaplace(scale=locale_laplace_param, size=len(locale_counts))
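        # Assigning back through .loc writes the noisy values into
        # expanded_counts itself rather than into a temporary copy.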
        expanded_counts.loc[locale_idx, "count"] = locale_counts

    return expanded_counts.reset_index()
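
A minimal usage sketch. The frame, the limits, and the whitelist below are
hypothetical values chosen for illustration, and the snippet assumes the
module-level imports shown above the function.

raw = DataFrame(
    {
        "locale": ["en-US", "en-US", "de"],
        "addon": ["addon-a", "addon-b", "addon-a"],
        "count": [120, 45, 30],
    }
)
limits = {"en-US": 3, "de": 2}  # max add-ons reported per client, by locale
wl = ["addon-a", "addon-b", "addon-c"]

noisy = compute_noisy_counts(raw, limits, wl, eps=0.4)
# Six rows: each whitelisted add-on appears once per retained locale, with
# real-valued (possibly negative) noisy counts.
print(noisy)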