in torchci/log_classifier/classify_log.py [0:0]
def classify(rules, id):
    """Classify a single failed job log stored in S3 against the given rules.

    Downloads ``log/{id}`` from S3, decompresses it, strips the GHA
    timestamps and ANSI escape codes, blanks out lines inside the configured
    ignore region, and runs the rule engine over the cleaned lines.

    Args:
        rules: rule set handed to ``RuleEngine``.
        id: job id; also the S3 key suffix for both the log object
            (``log/{id}``) and the classification object
            (``classification/{id}``).

    Returns:
        ``None`` when the job did not fail (nothing to classify),
        the string ``"no match found"`` when no rule matched,
        otherwise the serialized classification JSON (which is also written
        to S3 or printed to stdout depending on ``WRITE_TO_S3``).
    """
    logger.info(f"classifying {id}")
    logger.info("fetching from s3")
    log_obj = s3.Object(BUCKET_NAME, f"log/{id}")
    log_obj.load()
    if log_obj.metadata.get("conclusion") != "failure":
        # only classify failed jobs
        logger.info("skipping non-failing job")
        return

    log = log_obj.get()
    # logs are stored gzip-compressed
    logger.info("decompressing")
    log = gzip.decompress(log["Body"].read())

    lines = log.split(b"\n")
    # GHA adds a timestamp to the front of every log. Strip it before matching.
    logger.info("stripping timestamps")
    lines = [line.partition(b" ")[2] for line in lines]

    # Color, etc. in terminal output should be removed
    logger.info("stripping escape codes")
    lines = [ESCAPE_CODE_REGEX.sub(b"", line) for line in lines]

    logger.info("stripping ignore rules")
    # `ignore` is a module-level (start, stop) pair of compiled patterns.
    # Every line from a start match up to (but not including) a stop match
    # is blanked so the engine never sees it.
    ignore_start, ignore_stop = ignore
    is_ignoring = False
    for idx, line in enumerate(lines):
        if ignore_start.search(line):
            is_ignoring = True
        if ignore_stop.search(line):
            is_ignoring = False
        if is_ignoring:
            lines[idx] = b""

    if is_ignoring:
        # fix: `logger.warn` is a deprecated alias — use `logger.warning`
        logger.warning(
            "still ignoring at the end of the log, probably you got the stop condition wrong"
        )

    logger.info("running engine")
    engine = RuleEngine(rules)
    engine.run(lines)
    match = engine.best_match()
    if not match:
        logger.info("no match found")
        return "no match found"

    # renamed from `json` to avoid shadowing the stdlib json module
    classification_json = match_to_json(id, match, lines)
    if WRITE_TO_S3:
        logger.info("writing to s3")
        s3.Object(BUCKET_NAME, f"classification/{id}").put(
            Body=classification_json, ContentType="application/json"
        )
    else:
        logger.info("writing to stdout")
        print(classification_json)
    logger.info("done")
    return classification_json