aiops/ContrastiveLearningLogClustering/utils/preprocess.py (32 lines of code) (raw):

import re
import time
from collections import OrderedDict

import pandas as pd

# Placeholder token that stands in for a variable (non-template) part of a log.
_VAR_TOKEN = "[var]"

# Fixed patterns are compiled once at import time so the per-line / per-word
# calls below do not have to go through the ``re`` module cache each time.
_WHITESPACE_RUN = re.compile(r"\s+")
_REPEATED_VAR = re.compile(r"(\[var\])+")
_CAMEL_BOUNDARY = re.compile(r"(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])")
_MULTI_SPACE = re.compile(r"\s{2,}")
_NON_ALNUM = re.compile(r"[^A-Za-z0-9]")


def add_blank_token(log):
    """Replace every run of whitespace in *log* with the token ``[blank]``.

    Args:
        log: The log line to transform.

    Returns:
        A copy of *log* in which each maximal whitespace run (including
        leading or trailing ones) has become a single ``[blank]``.
    """
    return _WHITESPACE_RUN.sub("[blank]", log)


def add_var_token(rex, line):
    """Mask the parts of *line* matched by the given patterns with ``[var]``.

    Args:
        rex: Iterable of regular expressions (anything ``re.sub`` accepts as
            a pattern). They are applied in order, each one to the output of
            the previous substitution.
        line: The log line to transform.

    Returns:
        The line with every match replaced by ``[var]`` and directly adjacent
        ``[var]`` tokens merged into a single one.
    """
    for current_rex in rex:
        line = re.sub(current_rex, _VAR_TOKEN, line)
    # NOTE(review): the collapsing step is applied once, after all patterns.
    # Confirm against the upstream file that it was not meant to run after
    # each individual pattern; the two only differ when a later pattern's
    # match depends on whether "[var][var]" has already been merged.
    return _REPEATED_VAR.sub(_VAR_TOKEN, line)


def clean_log(log):
    """Normalise a log line into space-separated alphanumeric words.

    The line is first passed through :func:`replace_special_characters`,
    then a space is inserted at camel-case boundaries (``getUser`` ->
    ``get User``, ``HTTPServer`` -> ``HTTP Server``), and finally runs of two
    or more whitespace characters are squeezed into one space.

    Args:
        log: The log line to clean.

    Returns:
        The cleaned line. A single leading or trailing space may remain when
        the input starts or ends with a non-alphanumeric character.
    """
    log = replace_special_characters(log)
    # Both alternatives are zero-width, so this only inserts spaces.
    log = _CAMEL_BOUNDARY.sub(" ", log)
    return _MULTI_SPACE.sub(" ", log)


def replace_special_characters(s):
    """Replace non-alphanumeric characters with spaces, keeping ``[var]``.

    The input is split on whitespace. Within each word every character
    outside ``[A-Za-z0-9]`` becomes a space, except that occurrences of the
    ``[var]`` token are preserved and surrounded by single spaces.

    Args:
        s: The string to transform.

    Returns:
        The transformed words joined by single spaces. Consecutive spaces
        produced by the replacements are not collapsed here.
    """
    # Splitting a word that lacks the token yields ``[word]``, so a single
    # expression covers words both with and without ``[var]``.
    return " ".join(
        " [var] ".join(
            _NON_ALNUM.sub(" ", part) for part in word.split(_VAR_TOKEN)
        )
        for word in s.split()
    )