def get_samples()

in mozetl/taar/taar_similarity.py [0:0]


def get_samples(spark, date_from):
    """
    Build the DataFrame of client samples that later processing relies on.

    Rows are read from the `clients_daily` table and kept only when they:

    - have a non-null `client_id` and a non-null `active_addons`,
    - hold more than 2 and fewer than 100 entries in `active_addons`,
    - belong to the 'release' channel of the 'Firefox' application,
    - have a `submission_date_s3` greater than or equal to `date_from`.

    Among the remaining rows, only the most recent one per `client_id`
    (ordered by `submission_date_s3`, descending) is returned.

    Reference documentation is found here:

    Firefox Clients Daily telemetry table
    https://docs.telemetry.mozilla.org/datasets/batch_view/clients_daily/reference.html

    BUG 1485152: PR include active_addons to clients_daily table:
    https://github.com/mozilla/telemetry-batch-view/pull/490

    :param spark: object exposing `sql()`, used to query `clients_daily`.
    :param date_from: lower bound for `submission_date_s3`; it is
        interpolated as-is (unquoted) into the filter expression.
    :return: DataFrame holding the projected columns, one row per client.
    """
    # Row-level predicates, listed in the order they are applied.
    row_filters = (
        "client_id IS NOT null",
        "active_addons IS NOT null",
        "size(active_addons) > 2",
        "size(active_addons) < 100",
        "channel = 'release'",
        "app_name = 'Firefox'",
        "submission_date_s3 >= {}".format(date_from),
    )

    # Output columns plus a temporary per-client rank (`rn`), where rank 1
    # is the row with the greatest `submission_date_s3` for that client.
    projected_columns = [
        "client_id as client_id",
        "active_addons as active_addons",
        "city as city",
        "cast(subsession_hours_sum as double)",
        "locale as locale",
        "os as os",
        "places_bookmarks_count_mean AS bookmark_count",
        "scalar_parent_browser_engagement_tab_open_event_count_sum AS tab_open_count",
        "scalar_parent_browser_engagement_total_uri_count_sum AS total_uri",
        "scalar_parent_browser_engagement_unique_domains_count_mean AS unique_tlds",
        "row_number() OVER (PARTITION BY client_id ORDER BY submission_date_s3 desc) as rn",
    ]

    samples = spark.sql("SELECT * FROM clients_daily")
    for predicate in row_filters:
        samples = samples.where(predicate)

    ranked = samples.selectExpr(*projected_columns)

    # Keep the top-ranked row per client, then discard the helper column.
    latest_only = ranked.where("rn = 1")
    return latest_only.drop("rn")