# aiops/ContrastiveLearningLogClustering/utils/datasets.py
def generate_negetive_samples(test_log_type=None, positive_corpus=None, neutral_corpus=None, benchmark_settings=None):
    """Build negative (cross-dataset) log-message pairs for contrastive learning.

    For every unordered pair of datasets in ``benchmark_settings`` (excluding
    ``test_log_type``), draw roughly ``160000 / C(n, 2)`` pairs of log messages,
    one message from each dataset, deduplicated in both orders.

    Args:
        test_log_type: Dataset name to hold out; it contributes no pairs.
        positive_corpus: Unused here; kept for interface compatibility.
            NOTE(review): presumably consumed by a caller or an earlier
            version — confirm before removing.
        neutral_corpus: Unused here; kept for interface compatibility.
        benchmark_settings: Mapping of dataset name -> settings dict; each
            entry must provide a ``'regex'`` list for variable masking.

    Returns:
        list[tuple[str, str]]: Distinct negative pairs (order within the set
        is unspecified). Empty if fewer than two datasets are available.
    """
    # None-defaults instead of mutable [] / {} defaults, which are shared
    # across calls and a classic Python pitfall.
    if benchmark_settings is None:
        benchmark_settings = {}

    all_dataset = [log_type for log_type in benchmark_settings if log_type != test_log_type]

    # With fewer than two datasets no cross-dataset pair exists, and
    # math.comb(n, 2) == 0 would raise ZeroDivisionError below.
    if len(all_dataset) < 2:
        return []

    random.seed(42)  # deterministic sampling across runs
    negetive_corpus = set()
    negetive_nums = 160000
    # Spread the total budget evenly over all dataset pairs.
    sub_nums = int(negetive_nums / math.comb(len(all_dataset), 2))
    index = 0  # shared cursor, deliberately carried across dataset pairs
    for dataset_pairs in combinations(all_dataset, 2):
        pair1_corpus = _load_masked_corpus(dataset_pairs[0], benchmark_settings)
        pair2_corpus = _load_masked_corpus(dataset_pairs[1], benchmark_settings)
        random.shuffle(pair1_corpus)
        random.shuffle(pair2_corpus)
        count = 0
        while count < sub_nums:
            pairs = (pair1_corpus[index], pair2_corpus[index])
            reverse_pairs = pairs[::-1]
            # Keep only genuinely different messages, deduplicated in
            # both (a, b) and (b, a) orders.
            if pairs[0] != pairs[1] and pairs not in negetive_corpus and reverse_pairs not in negetive_corpus:
                negetive_corpus.add(pairs)
                count += 1
            index += 1
            if index >= len(pair1_corpus) or index >= len(pair2_corpus):
                # Exhausted one corpus: rewind and reshuffle to generate
                # new index alignments.
                index = 0
                random.shuffle(pair1_corpus)
                random.shuffle(pair2_corpus)
    return list(negetive_corpus)


def _load_masked_corpus(dataset, benchmark_settings):
    """Load the 2k structured log sample for *dataset* and mask variables.

    Reads ``./logs/<dataset>/<dataset>_2k.log_structured.csv`` and applies
    ``add_var_token`` with the dataset's configured regexes to every
    ``Content`` entry.
    """
    df_log_structured = pd.read_csv("./logs/" + dataset + "/" + dataset + "_2k.log_structured.csv")
    log_rex = benchmark_settings[dataset]['regex']
    # Column extraction via .tolist() replaces the original per-row
    # iterrows() loop — same order, far cheaper.
    return [add_var_token(log_rex, s) for s in df_log_structured['Content'].tolist()]