def classify(rules, id)

in torchci/log_classifier/classify_log.py


def classify(rules, id):
    logger.info(f"classifying {id}")
    logger.info("fetching from s3")
    log_obj = s3.Object(BUCKET_NAME, f"log/{id}")
    log_obj.load()
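    # Object.load() issues a HEAD request: it populates the object's metadata
    # without downloading the log body.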

    if log_obj.metadata.get("conclusion") != "failure":
        # only classify failed jobs
        logger.info("skipping non-failing job")
        return

    log = log_obj.get()

    # logs are stored gzip-compressed
    logger.info("decompressing")
    log = gzip.decompress(log["Body"].read())
    lines = log.split(b"\n")

    # GHA adds a timestamp to the front of every log. Strip it before matching.
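    # e.g. b"2021-06-03T01:23:45.1234567Z actual log text" -> b"actual log text"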
    logger.info("stripping timestamps")
    lines = [line.partition(b" ")[2] for line in lines]

    # Color, etc. in terminal output should be removed
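    # ESCAPE_CODE_REGEX is defined elsewhere in this module; a typical ANSI
    # escape pattern would be re.compile(rb"\x1b\[[0-9;]*m") (an assumption;
    # the real pattern may cover more sequences).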
    logger.info("stripping escape codes")
    lines = [ESCAPE_CODE_REGEX.sub(b"", line) for line in lines]

    logger.info("stripping ignore rules")
    ignore_start, ignore_stop = ignore
    is_ignoring = False
    for idx, line in enumerate(lines):
        match = ignore_start.search(line)
        if match:
            is_ignoring = True
        match = ignore_stop.search(line)
        if match:
            is_ignoring = False

        if is_ignoring:
            lines[idx] = b""

    if is_ignoring:
        logger.warning(
            "still ignoring at the end of the log; the stop condition probably never matched"
        )

    logger.info("running engine")
    engine = RuleEngine(rules)
    engine.run(lines)
    match = engine.best_match()
    if not match:
        logger.info("no match found")
        return "no match found"

    json = match_to_json(id, match, lines)
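    # (match_to_json is assumed to serialize the matched rule and surrounding
    # log lines into a JSON string.)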
    if WRITE_TO_S3:
        logger.info("writing to s3")
        s3.Object(BUCKET_NAME, f"classification/{id}").put(
            Body=json, ContentType="application/json"
        )
    else:
        logger.info("writing to stdout")
        print(json)
    logger.info("done")
    return json
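
For reference, a minimal sketch of driving classify() locally, assuming this
module's imports (re, etc.) and that Rule pairs a name with a compiled
pattern. The Rule signature and the log id format here are assumptions, not
taken from this file:

if __name__ == "__main__":
    # Hypothetical rule; the real Rule constructor may differ.
    rules = [Rule(name="cuda-oom", pattern=re.compile(rb"CUDA out of memory"))]
    # Log ids are the S3 key suffix under log/ (format assumed).
    print(classify(rules, "1234567890"))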