in mozetl/addon_aggregates/addon_aggregates.py [0:0]
def aggregate_addons(df):
    """
    Roll add-on indicator rows up to one row per
    (client_id, normalized_channel, app_version, locale).

    Exact duplicate rows are dropped before grouping. Each output row
    carries the grouping columns plus:

        n_self_installed_addons     sum of is_self_install
        n_shield_addons             sum of is_shield_addon
        n_foreign_installed_addons  sum of is_foreign_install
        n_system_addons             sum of is_system
        n_web_extensions            sum of is_web_extension
        first_addon_install_date    earliest install_day among rows with
                                    is_self_install == 1, as a yyyyMMdd
                                    string (null when there is no such row)
        profile_creation_date       smallest profile_creation_date in the
                                    group, as a yyyyMMdd string

    install_day and profile_creation_date are treated as a number of days
    since the Unix epoch.

    :param df: an exploded instance of main_summary by active_addons
               with various additional indicator columns
    :return SparkDF: an aggregated dataset with each of the above columns
    """

    def _days_to_yyyymmdd(days):
        # days-since-epoch -> seconds-since-epoch -> formatted date string.
        # NOTE(review): from_unixtime renders in the Spark session time
        # zone -- confirm the job runs with a UTC session.
        return fun.date_format(
            fun.from_unixtime(days * 60 * 60 * 24), "yyyyMMdd"
        )

    # (indicator column, name of the resulting count column)
    indicator_counts = [
        ("is_self_install", "n_self_installed_addons"),
        ("is_shield_addon", "n_shield_addons"),
        ("is_foreign_install", "n_foreign_installed_addons"),
        ("is_system", "n_system_addons"),
        ("is_web_extension", "n_web_extensions"),
    ]
    aggregations = [
        fun.sum(indicator).alias(count_name)
        for indicator, count_name in indicator_counts
    ]

    # Only self-installed add-ons contribute an install date; other rows
    # yield null, which min() skips. The yyyyMMdd strings are compared as
    # strings by min().
    self_install_date = fun.when(
        df["is_self_install"] == 1,
        _days_to_yyyymmdd(fun.col("install_day")),
    ).otherwise(None)
    aggregations.append(
        fun.min(self_install_date).alias("first_addon_install_date")
    )

    # Here the minimum is taken on the raw day count and formatted after.
    aggregations.append(
        _days_to_yyyymmdd(fun.min("profile_creation_date")).alias(
            "profile_creation_date"
        )
    )

    deduplicated = df.distinct()
    per_client = deduplicated.groupBy(
        "client_id", "normalized_channel", "app_version", "locale"
    )
    return per_client.agg(*aggregations)