probe_scraper/scrapers/buildhub.py (161 lines of code) (raw):
import pprint
import re
from datetime import datetime
import requests
class NoDataFoundException(Exception):
pass
class Buildhub(object):
search_url = "https://buildhub.moz.tools/api/search"
default_window = 1000
date_formats = ("%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S.%f")
def _paginate_revision_dates(
self,
iteration,
channel,
min_version,
product,
locale,
platform,
max_version,
verbose,
window,
):
query_str = [
{"term": {"source.product": product}},
{"term": {"target.channel": channel}},
{"term": {"target.locale": locale}},
{"term": {"target.platform": platform}},
]
# See: "99" > "65" == True, "100" > "65" == False
# FIXME: This breaks if we get to v200
# If we only need versions above 99 we restrict it to versions below 200,
# then we're good for a bunch of versions.
if min_version >= 100:
query_str.append({"range": {"target.version": {"gte": str(min_version)}}})
if max_version is None:
# This works because the minimum we ever ask for is v30.
query_str.append({"range": {"target.version": {"lt": "200"}}})
else:
# If the user didn't set a max version we need to explicitly include v100..v200 here.
if max_version is None:
query_str.append(
{
"bool": {
"should": [
{
"range": {
"target.version": {"gte": str(min_version)}
}
},
{
"bool": {
"must": [
{
"range": {
"target.version": {"gte": "100"}
}
},
{
"range": {
"target.version": {"lt": "200"}
}
},
]
}
},
]
}
}
)
else:
# Otherwise we only check the min version,
# the max version check will be appended
query_str.append(
{"range": {"target.version": {"gte": str(min_version)}}}
)
if max_version is not None:
query_str.append(
{
"bool": {
"should": [
{"range": {"target.version": {"lte": str(max_version)}}},
{"prefix": {"target.version": str(max_version)}},
]
}
}
)
body = {"query": {"bool": {"filter": query_str}}, "size": window}
if iteration != 0:
body["from"] = iteration * window
if verbose:
print("------QUERY STRING------\n")
pprint.pprint(body)
response = requests.post(url=Buildhub.search_url, json=body)
data = response.json()
if verbose:
print("------QUERY RESULTS------\n")
pprint.pprint(data)
return data
def _distinct_and_clean(self, records):
"""
For more information on the schema of the records,
see the Buildhub API documentation:
https://buildhub.readthedocs.io/en/latest/api.html#more-about-the-data-schema
"""
cleaned_records = {}
for record in records:
# %:z not supported, see https://bugs.python.org/msg169952
# Drop the tz portion entirely
d = record["_source"]["download"]["date"]
if re.search(r"\+\d{2}:\d{2}$", d):
d = d[:-6]
date = None
try:
date = datetime.strptime(d, self.date_formats[0])
except ValueError:
pass
if date is None:
date = datetime.strptime(d, self.date_formats[1])
entry = {
"date": date,
"revision": record["_source"]["source"]["revision"],
"version": record["_source"]["target"]["version"],
"tree": record["_source"]["source"]["tree"],
}
revision = entry["revision"]
min_entry = entry
if revision in cleaned_records:
if cleaned_records[revision] != entry:
min_entry = min(
(entry, cleaned_records[revision]), key=lambda x: x["date"]
)
cleaned_records[revision] = min_entry
return sorted(cleaned_records.values(), key=lambda x: x["date"])
def get_revision_dates(
self,
channel,
min_version,
product="firefox",
locale="en-US",
platform="win64",
max_version=None,
verbose=False,
window=500,
):
"""
Retrieve the revisions and publish-dates for a given filter set.
The combination of channel, product, local, and platform almost
gives a set of unique (revision, publication-dates). For example,
`win64` includes x86 and arm-64 builds. As such we de-duplicate
the result set and include the build with the earliest publication
date.
Tree is the source tree, usually one of:
- mozilla-central
- mozilla-beta
- mozilla-release
:param channel: The release channel
:param min_version: The minimum version to include
:param product: Defaults to firefox
:param locale: Defaults to en-US
:param platform: Defaults to win64
:param max_version: Optional maximum version to include
:param verbose: Verbose output of query string and results
:param window: Number of records to retrieve at a time
returns a list of records of type
{
"date": <date>
"revision": <revision>,
"version": <version>,
"tree": <tree>
}
"""
# Because "100" > "99" == False we special-case v100 to v199.
# v200 is far out, so we just ignore that for now.
assert min_version < 200, "Only versions below 200 are supported"
total_hits = 0
results = []
for i in range(2**20):
data = self._paginate_revision_dates(
i,
channel,
min_version,
product,
locale,
platform,
max_version,
verbose,
window,
)
# hits/total gives total number of records, including
# those outside the window. We need to know the number
# inside the window.
hits = len(data["hits"]["hits"])
if hits:
total_hits += hits
results.append(data)
# optimization, removes the last no-result window
if hits < window:
break
if total_hits == 0:
raise NoDataFoundException(
"No data found for channel {} and minimum \
version {}".format(
channel, min_version
)
)
all_records = [
record for result in results for record in result["hits"]["hits"]
]
return self._distinct_and_clean(all_records)