in aiops/ContrastiveLearningLogClustering/utils/datasets.py [0:0]
def _record_tokenized_logs(model, logs, event_key, log_to_event):
    """Tokenize *logs* and map every unpadded token-id sequence to *event_key*.

    ``model.tokenize`` is expected to return a mapping with ``'input_ids'``
    and ``'attention_mask'`` entries whose rows expose ``.cpu().numpy()``
    (presumably a SentenceTransformer-style tokenizer output -- confirm
    against the caller).

    Args:
        model: Object providing ``tokenize(list_of_str)``.
        logs: Log messages belonging to one event.
        event_key: Key stored as the value for each token sequence.
        log_to_event: Dict updated in place; an identical token sequence seen
            again overwrites the earlier entry.
    """
    encoded = model.tokenize(logs)
    for ids, mask in zip(encoded['input_ids'], encoded['attention_mask']):
        ids = ids.cpu().numpy()
        mask = mask.cpu().numpy()
        # Drop positions whose attention mask is 0 so the key contains only
        # the tokens the mask marks as real.
        log_to_event[tuple(ids[mask != 0].tolist())] = event_key


def load_event_log(test_log_type=None, benchmark_settings=None, model=None):
    """Collect the logs of every event outside the dataset under test.

    For each log type in *benchmark_settings* other than *test_log_type*
    (and other than ``'industrial1'`` / ``'industrial2'``), the structured
    log and template CSVs under ``./logs/<log_type>/`` are read, the logs of
    each template are passed through ``add_var_token`` with the dataset's
    ``'regex'`` setting, and the result is stored under the key
    ``log_type + EventId``.

    When *test_log_type* is one of the two industrial datasets, the labelled
    rows (``label_id != -1``) of the *other* industrial dataset's
    ``./<name>_test.csv`` are loaded as well, keyed by their ``label_id``.

    Args:
        test_log_type: Name of the dataset to exclude from loading.
        benchmark_settings: Mapping from log type to a settings dict holding
            a ``'regex'`` entry. ``None`` is treated as "no benchmark
            datasets" instead of raising ``TypeError``.
        model: Object providing ``tokenize``; see ``_record_tokenized_logs``.

    Returns:
        A ``(all_event_log, log_to_event)`` tuple: ``all_event_log`` maps an
        event key to its list of log messages, and ``log_to_event`` maps a
        tuple of unpadded token ids to the event key it was produced from.
    """
    all_event_log = {}
    log_to_event = {}

    # The parameter defaults to None; iterating it directly would crash even
    # for industrial runs, which never read the benchmark settings.
    if benchmark_settings is None:
        benchmark_settings = {}

    for log_type in benchmark_settings:
        if log_type == test_log_type or log_type in ('industrial1', 'industrial2'):
            continue

        df_log_structured = pd.read_csv("./logs/"+log_type+"/"+log_type+"_2k.log_structured.csv")
        df_log_template = pd.read_csv("./logs/"+log_type+"/"+log_type+"_2k.log_templates.csv")
        df_log_template = df_log_template.drop_duplicates(subset=['EventId'])
        # The regex setting is the same for every template of this dataset.
        log_rex = benchmark_settings[log_type]['regex']

        for temp_id in df_log_template['EventId']:
            matching = df_log_structured[df_log_structured['EventId'] == temp_id]
            temp_log = [add_var_token(log_rex, s) for s in matching['Content'].to_list()]
            event_id = log_type + temp_id
            all_event_log[event_id] = temp_log
            _record_tokenized_logs(model, temp_log, event_id, log_to_event)

    if test_log_type in ('industrial1', 'industrial2'):
        # Load the industrial dataset that is NOT being tested.
        log_type = 'industrial1' if test_log_type == 'industrial2' else 'industrial2'
        print("Loading",log_type,"event...")
        df_log = pd.read_csv('./'+log_type.lower()+'_test.csv')
        # Rows with label_id == -1 are excluded from the labelled set.
        df_labeled = df_log[df_log['label_id'] != -1]
        for temp_id in df_labeled['label_id'].unique():
            temp_log = df_labeled[df_labeled['label_id'] == temp_id]['Content'].to_list()
            all_event_log[temp_id] = temp_log
            _record_tokenized_logs(model, temp_log, temp_id, log_to_event)

    return all_event_log, log_to_event