in mozetl/taar/taar_similarity.py [0:0]
def get_addons_per_client(users_df, addon_whitelist, minimum_addons_count):
    """Build a DataFrame pairing each client with its valid add-on GUIDs.

    Every row of ``users_df`` is reduced to ``(client_id, [guid, ...])``,
    where the list holds the ``addon_id`` of each ``active_addons`` entry
    that survives the validity filter. Clients are kept only when that
    list has strictly more than ``minimum_addons_count`` entries.

    Args:
        users_df: DataFrame whose rows expose ``client_id`` and
            ``active_addons`` by key. Each ``active_addons`` entry exposes
            ``addon_id`` by key and the ``is_system``, ``app_disabled``,
            ``type``, ``user_disabled`` and ``foreign_install`` fields by
            attribute.
        addon_whitelist: Container of accepted add-on GUIDs; only used
            for membership tests.
        minimum_addons_count: Exclusive lower bound on the number of
            valid add-ons a client must have to be kept.

    Returns:
        A DataFrame with the columns ``client_id`` and ``addon_ids``.
    """

    def passes_filters(guid, addon):
        # An add-on is kept only if it is a whitelisted, user-installed,
        # enabled, non-system extension. The checks run in the same
        # order as the fields are listed here.
        return (
            not addon.is_system
            and not addon.app_disabled
            and addon.type == "extension"
            and not addon.user_disabled
            and not addon.foreign_install
            and guid in addon_whitelist
        )

    def to_client_guids(row):
        # Un-nest the client's add-on records into a flat list of GUIDs,
        # dropping every record rejected by the filter above.
        valid_guids = [
            entry["addon_id"]
            for entry in row["active_addons"]
            if passes_filters(entry["addon_id"], entry)
        ]
        return (row["client_id"], valid_guids)

    def has_enough_addons(pair):
        # pair is (client_id, guid_list); the comparison is strict.
        return len(pair[1]) > minimum_addons_count

    client_guid_pairs = users_df.rdd.map(to_client_guids)
    retained_pairs = client_guid_pairs.filter(has_enough_addons)
    return retained_pairs.toDF(["client_id", "addon_ids"])