def _distinct_and_clean()

in probe_scraper/scrapers/buildhub.py [0:0]


    def _distinct_and_clean(self, records):
        """
        Reduce Buildhub records to one entry per revision, sorted by date.

        Every record is trimmed down to its download date, source revision,
        target version and source tree. When several records share a
        revision, the one with the earliest date is kept; on equal dates the
        record seen last wins.

        For more information on the schema of the records,
        see the Buildhub API documentation:
        https://buildhub.readthedocs.io/en/latest/api.html#more-about-the-data-schema

        :param records: iterable of record dicts; each must provide
            ``["_source"]["download"]["date"]``,
            ``["_source"]["source"]["revision"]``,
            ``["_source"]["source"]["tree"]`` and
            ``["_source"]["target"]["version"]``.
        :returns: list of dicts with the keys ``date`` (a ``datetime``),
            ``revision``, ``version`` and ``tree``, in ascending ``date``
            order.
        :raises ValueError: if a record's date matches none of the formats
            in ``self.date_formats``.
        """
        cleaned_records = {}

        for record in records:
            source = record["_source"]

            # %:z not supported, see https://bugs.python.org/msg169952
            # Drop the tz portion entirely
            d = source["download"]["date"]
            if re.search(r"\+\d{2}:\d{2}$", d):
                d = d[:-6]

            # Try every configured format in order rather than only the
            # first two, so extra formats are honoured and a single-format
            # configuration does not fail with an IndexError.
            date = None
            for date_format in self.date_formats:
                try:
                    date = datetime.strptime(d, date_format)
                except ValueError:
                    continue
                break

            if date is None:
                raise ValueError(
                    "time data {!r} does not match any of the formats {!r}".format(
                        d, list(self.date_formats)
                    )
                )

            entry = {
                "date": date,
                "revision": source["source"]["revision"],
                "version": source["target"]["version"],
                "tree": source["source"]["tree"],
            }

            revision = entry["revision"]
            existing = cleaned_records.get(revision)

            # Keep the earliest build per revision; `<=` lets the newer
            # record win a tie on the date.
            if existing is None or entry["date"] <= existing["date"]:
                cleaned_records[revision] = entry

        return sorted(cleaned_records.values(), key=lambda x: x["date"])