in aiops/ContrastiveLearningLogClustering/utils/datasets.py [0:0]
def industry_positive_samples(log_path, batch_size):
    """Build positive (same-label) log pairs from a labeled log CSV.

    The CSV is read with ``pandas.read_csv`` and must contain a ``label_id``
    column and a ``Content`` column. Rows whose ``label_id`` equals ``-1``
    are ignored. For every remaining label the rows are shuffled with a
    fixed seed, the first ``log_select_num`` (4) ``Content`` values are kept,
    and every 2-combination of those values whose members differ becomes a
    positive pair. Identical pairs are recorded only once per label.

    Args:
        log_path: Passed unchanged to ``pandas.read_csv``.
        batch_size: Minimum number of labels that must yield at least one
            pair.

    Returns:
        A 3-tuple ``(positive_corpus, all_event, positive_samples)``:

        * ``positive_corpus`` -- flat list of all ``(content_a, content_b)``
          tuples, grouped label by label.
        * ``all_event`` -- dict mapping ``('industry', label_id)`` to the
          number of pairs generated for that label.
        * ``positive_samples`` -- ``{'industry': {label_id: [pairs, ...]}}``.

    Raises:
        RuntimeError: If fewer than ``batch_size`` labels produced a pair.
    """
    df_log = pd.read_csv(log_path)
    df_labeled = df_log[df_log['label_id'] != -1]
    log_select_num = 4

    samples = {}
    # value_counts() yields the labels most-frequent first by default.
    for temp_id in df_labeled['label_id'].value_counts().index:
        df_temp = df_labeled[df_labeled['label_id'] == temp_id]
        # Fixed seed keeps the selected logs reproducible across runs.
        df_temp = df_temp.sample(frac=1.0, random_state=42)
        temp_log = df_temp['Content'].iloc[:log_select_num].to_list()

        # dict.fromkeys de-duplicates identical tuples while preserving
        # insertion order, so the output order does not depend on string
        # hash randomisation (unlike list(set(...))).  A reversed pair
        # (b, a) is still treated as distinct from (a, b).
        unique_pairs = dict.fromkeys(
            pair for pair in combinations(temp_log, 2) if pair[0] != pair[1]
        )
        if unique_pairs:
            samples[temp_id] = list(unique_pairs)

    if len(samples) < batch_size:
        # An explicit exception replaces a bare `raise`, which outside an
        # `except` block only produced "No active exception to reraise".
        raise RuntimeError(
            "Cannot generate enough positive samples! "
            f"Positive samples len: {len(samples)}, "
            f"required batch_size: {batch_size}"
        )

    positive_samples = {'industry': samples}
    positive_corpus = []
    all_event = {}
    for dataset, events in positive_samples.items():
        for event_id, event_pairs in events.items():
            all_event[(dataset, event_id)] = len(event_pairs)
            positive_corpus.extend(event_pairs)
    return positive_corpus, all_event, positive_samples