def get_labels()

in bugbug/models/regressor.py [0:0]


    def get_labels(self):
        """Build the commit -> label mapping used to train the model.

        A commit is labelled 1 when its bug appears in some bug's
        ``regressed_by`` list, or when ``self.use_finder`` is set and the
        commit is listed as bug-introducing in ``BUG_INTRODUCING_COMMITS_DB``.
        Otherwise it is labelled 0, provided it was pushed between two years
        and three months ago and is not excluded through
        ``self.exclude_finder``.

        Backed-out commits, wptsync commits and commits pushed within the
        last ``EVALUATION_MONTHS`` months are never labelled.

        Returns:
            A tuple ``(classes, [0, 1])`` where ``classes`` maps a commit
            node to its label and ``[0, 1]`` is the list of possible labels.
        """
        classes = {}

        # Stays empty unless the finder data is requested; it is only
        # consulted when `use_finder` or `exclude_finder` is set.
        regressors = set()

        if self.use_finder or self.exclude_finder:
            # `None` means "do not filter on the kind of fix".
            regression_fixes = None
            if self.finder_regressions_only:
                # Keep only fixes whose type is "r" (presumably "regression";
                # confirm against the producer of BUG_FIXING_COMMITS_DB).
                regression_fixes = {
                    bug_fixing_commit["rev"]
                    for bug_fixing_commit in db.read(BUG_FIXING_COMMITS_DB)
                    if bug_fixing_commit["type"] == "r"
                }

            regressors = {
                r["bug_introducing_rev"]
                for r in db.read(BUG_INTRODUCING_COMMITS_DB)
                if r["bug_introducing_rev"]
                and (
                    regression_fixes is None
                    or r["bug_fixing_rev"] in regression_fixes
                )
            }

        # Flatten every "regressed_by" list straight into a set. Using
        # `sum(lists, [])` would copy the accumulated list on every addition.
        regressor_bugs = {
            bug_id for bug in bugzilla.get_bugs() for bug_id in bug["regressed_by"]
        }

        # The cutoffs do not depend on the commit, so compute them once; this
        # also guarantees every commit is compared against the same instant.
        # NOTE(review): a naive UTC "now" is kept on purpose, assuming the
        # parsed push dates are naive as well - confirm before switching to
        # timezone-aware datetimes.
        now = datetime.utcnow()
        evaluation_cutoff = now - relativedelta(months=EVALUATION_MONTHS)
        oldest_clean_cutoff = now - relativedelta(years=2)
        newest_clean_cutoff = now - relativedelta(months=3)

        for commit_data in repository.get_commits():
            if commit_data["backedoutby"]:
                continue

            if repository.is_wptsync(commit_data):
                continue

            push_date = dateutil.parser.parse(commit_data["pushdate"])

            # Skip commits used for the evaluation phase.
            if push_date > evaluation_cutoff:
                continue

            node = commit_data["node"]
            if commit_data["bug_id"] in regressor_bugs or (
                self.use_finder and node in regressors
            ):
                classes[node] = 1
            elif not self.exclude_finder or node not in regressors:
                # The labels we have are only from two years ago (see https://groups.google.com/g/mozilla.dev.platform/c/SjjW6_O-FqM/m/G-CrIVT2BAAJ).
                # While we can go further back with the regressor finder script, it isn't remotely
                # as precise as the "Regressed By" data.
                # In the future, we might want to re-evaluate this limit (e.g. extend it), but we
                # have to be careful (using too old patches might cause worse results as patch
                # characteristics evolve over time).
                if push_date < oldest_clean_cutoff:
                    continue

                # We remove the last 3 months, as there could be regressions which haven't been
                # filed yet. While it is true that some regressions might not be found for a long
                # time, more than 3 months seems overly conservative.
                # There will be some patches we currently add to the clean set and will later move
                # to the regressor set, but they are a very small subset.
                if push_date > newest_clean_cutoff:
                    continue

                classes[node] = 0

        logger.info(
            "%d commits caused regressions",
            sum(label == 1 for label in classes.values()),
        )

        logger.info(
            "%d commits did not cause regressions",
            sum(label == 0 for label in classes.values()),
        )

        return classes, [0, 1]