mozetl/taar/taar_update_whitelist.py (60 lines of code) (raw):
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import click
import json
import requests
from .taar_utils import store_json_to_s3
class LoadError(Exception):
    """Raised when the AMO editorial endpoint answers with a non-200 status."""
class ShortWhitelistError(Exception):
    """Raised when too few addon GUIDs were obtained to build a whitelist."""
# Base name under which the whitelist JSON is stored by load_etl().
WHITELIST_FILENAME = "only_guids_top_200"

# AMO search endpoint template; the single "{}" placeholder receives the
# addon GUID to look up.
ADDON_META_URI = (
    "https://addons.mozilla.org/api/v3/addons/search/"
    "?app=firefox&sort=created&type=extension&guid={}"
)

# Default AMO editorial endpoint used to fetch the candidate addons.
EDITORIAL_URI = "https://addons.mozilla.org/api/v4/discovery/editorial/"
class GUIDError(Exception):
    """Raised when an addon GUID cannot be validated.

    Derives from ``Exception`` (not ``BaseException``) so that ordinary
    ``except Exception`` handlers treat it like any other application
    error instead of letting it escape alongside ``SystemExit`` and
    ``KeyboardInterrupt``.
    """
def load_amo_editorial(url, only_recommended=True, timeout=60):
    """Fetch and decode the JSON document served at the editorial *url*.

    Args:
        url: Endpoint to issue the GET request against.
        only_recommended: When truthy, the query parameter
            ``recommended=true`` is added to the request.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without a timeout, requests waits forever on a stalled
            connection, which would hang the whole job.

    Returns:
        The decoded JSON body of the response.

    Raises:
        LoadError: If the response status code is not 200.
    """
    query = {"recommended": "true"} if only_recommended else {}

    response = requests.get(url, params=query, timeout=timeout)
    if response.status_code != 200:
        raise LoadError(
            "HTTP {} status loading JSON from AMO editorial endpoint.".format(
                response.status_code
            )
        )

    return json.loads(response.text)
def validate_row(row):
    """Return True when *row* carries a usable addon GUID.

    A row is usable when ``row["addon"]["guid"]`` exists and is none of
    ``None``, the literal string ``"null"`` or the empty string.

    Rows whose ``"addon"`` entry is missing, ``None`` or otherwise not a
    dict are reported as invalid rather than raising ``AttributeError``.
    """
    addon = row.get("addon")
    if not isinstance(addon, dict):
        # Covers both a missing key and an explicit JSON null.
        return False
    return addon.get("guid") not in (None, "null", "")
def check_guid(guid, timeout=60):
    """Report whether the AMO search endpoint answers HTTP 200 for *guid*.

    Args:
        guid: Addon GUID substituted into ``ADDON_META_URI``.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``,
            so that a stalled connection cannot block the caller
            indefinitely.

    Returns:
        True if the response status code is 200, False otherwise.
    """
    response = requests.get(ADDON_META_URI.format(guid), timeout=timeout)
    return response.status_code == 200
def parse_json(json_data, allow_short_guidlist, validate_guids=False, min_guids=100):
    """Extract the sorted, de-duplicated addon GUIDs from *json_data*.

    Args:
        json_data: Mapping whose ``"results"`` entry is an iterable of
            rows; only rows accepted by ``validate_row`` contribute
            their ``row["addon"]["guid"]``.
        allow_short_guidlist: When falsy, a result with fewer than
            *min_guids* entries is rejected.
        validate_guids: When truthy, every GUID is checked with
            ``check_guid`` before the list is returned.
        min_guids: Minimum number of GUIDs required when
            *allow_short_guidlist* is falsy.

    Returns:
        A sorted list of unique GUIDs.

    Raises:
        GUIDError: If *validate_guids* is truthy and ``check_guid``
            rejects a GUID.
        ShortWhitelistError: If the list is shorter than *min_guids* and
            short lists are not allowed.
    """
    unique_guids = {
        row["addon"]["guid"] for row in json_data["results"] if validate_row(row)
    }
    result = sorted(unique_guids)

    if validate_guids:
        # Walk the sorted list (not the set) so that the GUID reported on
        # failure is deterministic from run to run.
        for guid in result:
            if not check_guid(guid):
                raise GUIDError("Can't validate GUID: {}".format(guid))

    if not allow_short_guidlist and len(result) < min_guids:
        raise ShortWhitelistError(
            "Only obtained {} editorial reviewed addons.".format(len(result))
        )
    return result
def load_etl(transformed_data, date, prefix, bucket):
    """Serialize *transformed_data* as JSON and hand it to store_json_to_s3.

    The data is dumped with a two-space indent and passed on under
    ``WHITELIST_FILENAME`` together with *date*, *prefix* and *bucket*.
    """
    serialized = json.dumps(transformed_data, indent=2)
    store_json_to_s3(serialized, WHITELIST_FILENAME, date, prefix, bucket)
@click.command()
@click.option("--date", required=True)
@click.option("--url", default=EDITORIAL_URI)
@click.option("--only-recommended", default=True)
@click.option("--bucket", default="telemetry-parquet")
@click.option("--prefix", default="telemetry-ml/addon_recommender/")
@click.option("--validate-guid", default=False)
@click.option("--allow-shortlist", default=True)
def main(date, url, only_recommended, bucket, prefix, validate_guid, allow_shortlist):
    # Command-line entry point: fetch the editorial data, reduce it to a
    # GUID list, then store that list.
    # NOTE: documented with comments rather than a docstring because click
    # would surface a docstring as the command's --help text.

    # Extract: download the editorial JSON from the configured URL.
    editorial_payload = load_amo_editorial(url, only_recommended)

    # Transform: keep only the sorted, unique addon GUIDs.
    guid_whitelist = parse_json(
        editorial_payload, allow_shortlist, validate_guids=validate_guid
    )

    # Load: store the whitelist for the given date, prefix and bucket.
    load_etl(guid_whitelist, date, prefix, bucket)