def generate_neutral_samples()

in aiops/ContrastiveLearningLogClustering/utils/datasets.py [0:0]


def generate_neutral_samples(test_log_type=None, positive_corpus=None, benchmark_settings=None):
    """Build pairs of log messages drawn from two different event templates.

    For every log type in ``benchmark_settings`` other than ``test_log_type``,
    the files ``./logs/<type>/<type>_2k.log_structured.csv`` and
    ``./logs/<type>/<type>_2k.log_templates.csv`` are read. For each unordered
    pair of distinct ``EventId`` values listed in the template file, the
    ``Content`` of the matching structured rows is passed through
    ``add_var_token``, shuffled with ``random.shuffle`` and combined into
    ``(message_a, message_b)`` tuples until the per-event-pair quota is met.

    Args:
        test_log_type: Log type to leave out. When ``None`` the per-type
            budget is ``150000 / 16``, otherwise ``150000 / 15``.
        positive_corpus: Not used by this function; kept so existing callers
            that pass it keep working.
        benchmark_settings: Mapping of log type name to a settings mapping
            whose ``'regex'`` entry is forwarded to ``add_var_token``.
            ``None`` is treated as an empty mapping.

    Returns:
        A list of unique 2-tuples. A tuple is never stored together with its
        reversed form, and both members of a tuple always differ. The order
        of the list is that of set iteration, and the selection depends on
        the global ``random`` state.
    """
    if benchmark_settings is None:
        benchmark_settings = {}

    neutral_corpus = set()

    neutral_nums = 150000
    # NOTE(review): the divisors 16 / 15 look like the number of benchmark
    # log types with / without the held-out one -- confirm against callers.
    sub_nums = int(neutral_nums / 16) if test_log_type is None else int(neutral_nums / 15)

    for log_type in benchmark_settings:
        if log_type == test_log_type:
            continue

        log_dir = "./logs/" + log_type + "/"
        df_log_structured = pd.read_csv(log_dir + log_type + "_2k.log_structured.csv")
        df_log_template = pd.read_csv(log_dir + log_type + "_2k.log_templates.csv")

        dataset_event = df_log_template.drop_duplicates(subset=['EventId'])['EventId'].tolist()

        # With fewer than two templates there is no event pair to draw from,
        # and math.comb(n, 2) == 0 would otherwise be used as a divisor.
        event_pair_total = math.comb(len(dataset_event), 2)
        if event_pair_total == 0:
            continue

        # Quota of sample pairs contributed by each pair of events.
        subsub_nums = sub_nums // event_pair_total

        log_rex = benchmark_settings[log_type]['regex']

        # Filter and tokenize each event's messages once instead of once per
        # event pair. Assumes add_var_token is deterministic and free of side
        # effects -- TODO confirm.
        logs_by_event = {}
        for event_id in dataset_event:
            contents = df_log_structured.loc[df_log_structured['EventId'] == event_id, 'Content']
            logs_by_event[event_id] = [add_var_token(log_rex, s) for s in contents]

        for first_event, second_event in combinations(dataset_event, 2):
            # Shuffle copies so the cached lists keep their file order for
            # the next event pair.
            first_logs = list(logs_by_event[first_event])
            second_logs = list(logs_by_event[second_event])
            random.shuffle(first_logs)
            random.shuffle(second_logs)

            count = 0
            candidates = ((a, b) for a in first_logs for b in second_logs)
            for pair in candidates:
                if (
                    pair[0] != pair[1]
                    and pair not in neutral_corpus
                    and pair[::-1] not in neutral_corpus
                ):
                    neutral_corpus.add(pair)
                    count += 1

                # The quota is checked after each candidate, so a quota of
                # zero still lets the first candidate of the pair through.
                if count >= subsub_nums:
                    break

    return list(neutral_corpus)