in aiops/ContrastiveLearningLogClustering/utils/datasets.py [0:0]
def _distinct_pairs(messages):
    """Return every unordered pair of distinct messages as a list of 2-tuples.

    Duplicates are dropped first (keeping first-seen order), so each pair is
    produced exactly once and never together with its reversed form. Pairing
    the de-duplicated list keeps the cost quadratic in the number of
    *distinct* messages rather than in the number of raw log lines.
    """
    unique_messages = list(OrderedDict.fromkeys(messages))
    return list(combinations(unique_messages, 2))


def generate_positive_samples(test_log_type=None, benchmark_settings=None):
    """Collect positive sample pairs: pairs of distinct messages sharing an event.

    Benchmark datasets: for every key of ``benchmark_settings`` except
    ``test_log_type``, ``'industrial1'`` and ``'industrial2'``, the files
    ``./logs/<name>/<name>_2k.log_structured.csv`` and
    ``./logs/<name>/<name>_2k.log_templates.csv`` are read. For each unique
    ``EventId`` in the templates file, the matching ``Content`` values are
    passed through ``add_var_token`` (with the dataset's ``'regex'`` setting)
    and paired.

    Industrial datasets: when ``test_log_type`` is ``'industrial1'`` or
    ``'industrial2'``, the *other* industrial dataset is read from
    ``./<name>_test.csv``; rows with ``label_id == -1`` are ignored and the
    remaining ``Content`` values are paired per ``label_id``.

    Args:
        test_log_type: Name of the dataset to leave out.
        benchmark_settings: Mapping of dataset name to a settings mapping
            that has a ``'regex'`` entry. ``None`` is treated as empty.

    Returns:
        A tuple ``(positive_corpus, all_event, positive_samples)``:
        ``positive_samples`` maps dataset name -> event id -> list of pairs
        (events without at least one pair of distinct messages are omitted);
        ``all_event`` is an ``OrderedDict`` mapping ``(dataset, event id)``
        to the number of pairs; ``positive_corpus`` is the flat list of all
        pairs in the same order.
    """
    positive_samples = {}

    # A missing settings mapping simply means "no benchmark datasets".
    for log_type in benchmark_settings or {}:
        if log_type == test_log_type or log_type in ('industrial1', 'industrial2'):
            continue

        df_log_structured = pd.read_csv(f"./logs/{log_type}/{log_type}_2k.log_structured.csv")
        df_log_template = pd.read_csv(f"./logs/{log_type}/{log_type}_2k.log_templates.csv")
        df_log_template = df_log_template.drop_duplicates(subset=['EventId'])
        # The regex setting is the same for every template of this dataset.
        log_rex = benchmark_settings[log_type]['regex']

        samples = {}
        for temp_id in df_log_template['EventId']:
            is_event = df_log_structured['EventId'] == temp_id
            raw_logs = df_log_structured.loc[is_event, 'Content'].to_list()
            temp_log = [add_var_token(log_rex, s) for s in raw_logs]
            pairs = _distinct_pairs(temp_log)
            if pairs:
                samples[temp_id] = pairs
        positive_samples[log_type] = samples

    if test_log_type in ('industrial1', 'industrial2'):
        # Use the industrial dataset that is NOT the one under test.
        log_type = 'industrial1' if test_log_type == 'industrial2' else 'industrial2'
        df_log = pd.read_csv('./' + log_type.lower() + '_test.csv')
        df_labeled = df_log[df_log['label_id'] != -1]

        samples = {}
        # value_counts() orders the labels from most to least frequent.
        for temp_id in df_labeled['label_id'].value_counts().index:
            df_temp = df_labeled[df_labeled['label_id'] == temp_id]
            # Fixed-seed shuffle: pair order/orientation is reproducible.
            df_temp = df_temp.sample(frac=1.0, random_state=42)
            pairs = _distinct_pairs(df_temp['Content'].to_list())
            if pairs:
                samples[temp_id] = pairs
        positive_samples[log_type] = samples

    positive_corpus = []
    all_event = OrderedDict()
    for dataset, events in positive_samples.items():
        for event_id, pairs in events.items():
            all_event[(dataset, event_id)] = len(pairs)
            positive_corpus.extend(pairs)

    return positive_corpus, all_event, positive_samples