in mozetl/taar/taar_similarity.py [0:0]
def get_samples(spark, date_from):
    """
    Build the per-client sample DataFrame that later processing starts from.

    Rows are read from the `clients_daily` table and kept only when all of
    the following hold:

      * `client_id` and `active_addons` are not null,
      * `active_addons` holds more than 2 and fewer than 100 entries,
      * `channel` is 'release' and `app_name` is 'Firefox',
      * `submission_date_s3` is greater than or equal to `date_from`.

    The remaining rows are ranked per `client_id` by `submission_date_s3`
    in descending order, and only the first-ranked row of each client is
    returned. The helper ranking column is dropped before returning.

    Reference documentation:
    Firefox Clients Daily telemetry table
    https://docs.telemetry.mozilla.org/datasets/batch_view/clients_daily/reference.html
    BUG 1485152: PR adding active_addons to the clients_daily table:
    https://github.com/mozilla/telemetry-batch-view/pull/490

    :param spark: object whose `sql()` method runs the query (presumably a
        SparkSession -- confirm against the callers).
    :param date_from: lower bound for `submission_date_s3`; it is inserted
        into the filter expression as-is, without quoting.
    :return: the filtered and projected DataFrame, one row per client.
    """
    # Predicates that do not depend on the arguments, in application order.
    static_filters = (
        "client_id IS NOT null",
        "active_addons IS NOT null",
        "size(active_addons) > 2",
        "size(active_addons) < 100",
        "channel = 'release'",
        "app_name = 'Firefox'",
    )

    # Output columns, plus the per-client rank `rn` used to keep a single row.
    projection = (
        "client_id as client_id",
        "active_addons as active_addons",
        "city as city",
        "cast(subsession_hours_sum as double)",
        "locale as locale",
        "os as os",
        "places_bookmarks_count_mean AS bookmark_count",
        "scalar_parent_browser_engagement_tab_open_event_count_sum AS tab_open_count",
        "scalar_parent_browser_engagement_total_uri_count_sum AS total_uri",
        "scalar_parent_browser_engagement_unique_domains_count_mean AS unique_tlds",
        "row_number() OVER (PARTITION BY client_id ORDER BY submission_date_s3 desc) as rn",
    )

    samples = spark.sql("SELECT * FROM clients_daily")
    for predicate in static_filters:
        samples = samples.where(predicate)

    # The date bound is applied last among the row filters.
    samples = samples.where("submission_date_s3 >= {}".format(date_from))

    ranked = samples.selectExpr(*projection)
    # rn == 1 is the first-ranked row of each client; rn itself is not output.
    return ranked.where("rn = 1").drop("rn")