in probe_scraper/scrapers/buildhub.py [0:0]
def _distinct_and_clean(self, records):
    """
    Reduce raw Buildhub records to one entry per revision, sorted by date.

    Each record is expected to carry ``_source.download.date``,
    ``_source.source.revision``, ``_source.source.tree`` and
    ``_source.target.version``. When several records share a revision,
    the one with the earliest download date is kept; if the dates are
    equal, the record seen later wins.

    For more information on the schema of the records,
    see the Buildhub API documentation:
    https://buildhub.readthedocs.io/en/latest/api.html#more-about-the-data-schema

    :param records: iterable of Buildhub search hits (dicts).
    :return: list of dicts with the keys ``date`` (naive ``datetime``),
        ``revision``, ``version`` and ``tree``, sorted by ascending date.
    :raises ValueError: if a download date matches none of
        ``self.date_formats``.
    :raises KeyError: if a record lacks one of the expected fields.
    """
    # %:z not supported, see https://bugs.python.org/msg169952
    # Drop the tz portion entirely. Both "+HH:MM" and "-HH:MM" offsets are
    # stripped; the resulting datetimes are naive, as before.
    tz_suffix = re.compile(r"[+-]\d{2}:\d{2}$")

    earliest_by_revision = {}
    for record in records:
        source = record["_source"]
        raw_date = tz_suffix.sub("", source["download"]["date"])

        # Try every configured format in order; the first that parses wins.
        for date_format in self.date_formats:
            try:
                date = datetime.strptime(raw_date, date_format)
            except ValueError:
                continue
            break
        else:
            raise ValueError(
                "download date {!r} does not match any of the formats {!r}".format(
                    raw_date, list(self.date_formats)
                )
            )

        entry = {
            "date": date,
            "revision": source["source"]["revision"],
            "version": source["target"]["version"],
            "tree": source["source"]["tree"],
        }

        revision = entry["revision"]
        known = earliest_by_revision.get(revision)
        # "<=" keeps the historical tie-break: on equal dates the newer
        # record replaces the stored one.
        if known is None or entry["date"] <= known["date"]:
            earliest_by_revision[revision] = entry

    return sorted(earliest_by_revision.values(), key=lambda x: x["date"])