in mozetl/taar/taar_ensemble.py [0:0]
def get_addons_per_client(users_df, minimum_addons_count):
    """Build a per-client listing of valid add-on IDs.

    Selects ``client_id`` from ``users_df`` together with an ``addon_ids``
    array column derived from the ``active_addons`` column, then keeps only
    the rows whose ``addon_ids`` array holds strictly more than
    ``minimum_addons_count`` entries.

    Within each row the add-on IDs are ordered by ascending ``install_day``.

    :param users_df: DataFrame exposing ``client_id`` and ``active_addons``
        (presumably a Spark DataFrame whose ``active_addons`` entries carry
        the attributes read below -- verify against the caller).
    :param minimum_addons_count: exclusive lower bound on the number of
        valid add-ons a client must have to be kept.
    :return: the filtered selection with columns ``client_id`` and
        ``addon_ids``.
    """

    def is_valid_addon(addon):
        # Reject system add-ons and add-ons disabled by the application.
        if addon.is_system or addon.app_disabled:
            return False
        # Only plain extensions are of interest.
        if addon.type != "extension":
            return False
        # Reject add-ons the user turned off or that were foreign-installed.
        if addon.user_disabled or addon.foreign_install:
            return False
        # An install day is required because it is used as the sort key.
        return addon.install_day is not None

    # may need additional whitelisting to remove shield addons

    def get_valid_addon_ids(addons):
        kept = [entry for entry in addons if is_valid_addon(entry)]
        # list.sort is stable, so add-ons sharing an install day keep
        # their incoming relative order.
        kept.sort(key=lambda entry: entry.install_day)
        return [entry.addon_id for entry in kept]

    valid_ids_udf = udf(get_valid_addon_ids, ArrayType(StringType()))

    # Turn each user's add-on collection into a flat list of add-on IDs,
    # dropping the undesired add-ons along the way.
    addon_ids_column = valid_ids_udf("active_addons").alias("addon_ids")
    per_client_df = users_df.select("client_id", addon_ids_column)

    # Strict comparison: exactly ``minimum_addons_count`` add-ons is not enough.
    return per_client_df.filter(size("addon_ids") > minimum_addons_count)