in aiops/ContrastiveLearningLogClustering/utils/datasets.py [0:0]
def generate_neutral_samples(test_log_type=None, positive_corpus=None, benchmark_settings=None):
    """Build "neutral" log pairs for contrastive learning.

    A neutral pair is two log lines drawn from *different* event templates of
    the same dataset — neither a positive (same template) nor a cross-dataset
    pair. The overall budget of 150,000 pairs is split evenly across the
    participating datasets, then across each dataset's template pairs.

    Parameters
    ----------
    test_log_type : str or None
        Held-out dataset name; it is skipped when sampling. When None all
        16 datasets contribute, otherwise the remaining 15 do.
    positive_corpus : list, optional
        Unused (kept for backward compatibility with existing callers).
    benchmark_settings : dict, optional
        Maps dataset name -> settings dict; only the ``'regex'`` entry is
        used (variable-masking patterns passed to ``add_var_token``).

    Returns
    -------
    list of tuple
        Unique ``(log_a, log_b)`` pairs. Order is arbitrary: pairs are
        accumulated in a set and the per-template corpora are shuffled.
    """
    # Avoid the mutable-default-argument pitfall; behavior is unchanged.
    if benchmark_settings is None:
        benchmark_settings = {}
    neutral_corpus = set()
    neutral_nums = 150000
    # Per-dataset quota: budget split over the datasets that participate.
    sub_nums = neutral_nums // 16 if test_log_type is None else neutral_nums // 15
    for log_type in benchmark_settings:
        if log_type == test_log_type:
            continue
        df_log_structured = pd.read_csv(
            "./logs/" + log_type + "/" + log_type + "_2k.log_structured.csv")
        df_log_template = pd.read_csv(
            "./logs/" + log_type + "/" + log_type + "_2k.log_templates.csv")
        df_log_template = df_log_template.drop_duplicates(subset=['EventId'])
        dataset_event = df_log_template['EventId'].tolist()
        n_template_pairs = math.comb(len(dataset_event), 2)
        if n_template_pairs == 0:
            # Fewer than two templates: no cross-template pair exists.
            # (The original crashed with ZeroDivisionError on this case.)
            continue
        # Per-template-pair quota so every pair contributes roughly equally.
        subsub_nums = sub_nums // n_template_pairs
        # Invariant for the whole dataset — hoisted out of the pair loop.
        log_rex = benchmark_settings[log_type]['regex']
        for event_a, event_b in combinations(dataset_event, 2):
            # Mask variables in every log line belonging to each template.
            pair1_corpus = [
                add_var_token(log_rex, content)
                for content in df_log_structured.loc[
                    df_log_structured['EventId'] == event_a, 'Content']
            ]
            pair2_corpus = [
                add_var_token(log_rex, content)
                for content in df_log_structured.loc[
                    df_log_structured['EventId'] == event_b, 'Content']
            ]
            random.shuffle(pair1_corpus)
            random.shuffle(pair2_corpus)
            count = 0
            for log_a in pair1_corpus:
                for log_b in pair2_corpus:
                    pair = (log_a, log_b)
                    # Dedupe regardless of order; never pair identical texts.
                    if (log_a != log_b
                            and pair not in neutral_corpus
                            and (log_b, log_a) not in neutral_corpus):
                        neutral_corpus.add(pair)
                        count += 1
                    if count >= subsub_nums:
                        break
                if count >= subsub_nums:
                    break
    return list(neutral_corpus)