def generate_negetive_samples()

in aiops/ContrastiveLearningLogClustering/utils/datasets.py [0:0]


def _load_negative_pair_corpus(dataset, benchmark_settings):
    """Load one dataset's log messages and run each through ``add_var_token``.

    Reads the ``Content`` column of
    ``./logs/<dataset>/<dataset>_2k.log_structured.csv`` and returns a list with
    ``add_var_token(regex, message)`` applied to every row, where ``regex`` is
    ``benchmark_settings[dataset]['regex']``.
    """
    df_log_structured = pd.read_csv(f"./logs/{dataset}/{dataset}_2k.log_structured.csv")
    log_rex = benchmark_settings[dataset]['regex']
    return [add_var_token(log_rex, s) for s in df_log_structured['Content'].tolist()]


def _reachable_pair_count(corpus_a, corpus_b, taken, wanted):
    """Cap ``wanted`` at the number of new pairs the two corpora can still yield.

    A pair is usable when its two members differ and neither orientation of it
    is already a key of ``taken``.  Without this cap the sampling loop would
    spin forever whenever fewer than ``wanted`` such pairs exist.
    """
    if wanted <= 0:
        return 0

    unique_a = set(corpus_a)
    unique_b = set(corpus_b)
    shared = len(unique_a & unique_b)
    # Ordered (a, b) combinations minus the a == b ones, minus one of each
    # mirrored couple (a, b)/(b, a) that exists when both lines occur in both
    # corpora: those two orientations count as a single pair.
    candidates = len(unique_a) * len(unique_b) - shared - math.comb(shared, 2)

    # Cheap bound first: even if every stored pair blocked a candidate there
    # would still be enough left, so the full scan below can be skipped.
    if candidates - len(taken) >= wanted:
        return wanted

    blocked = sum(
        1
        for left, right in taken
        if (left in unique_a and right in unique_b)
        or (left in unique_b and right in unique_a)
    )
    return min(wanted, candidates - blocked)


def generate_negetive_samples(test_log_type=None, positive_corpus=[], neutral_corpus=[], benchmark_settings={}):
    """Build negative sample pairs from log lines of two different datasets.

    Every key of ``benchmark_settings`` except ``test_log_type`` is treated as
    a dataset name.  For each unordered combination of two datasets, both
    corpora are loaded, shuffled, and walked in lockstep; a pair
    ``(line_from_first, line_from_second)`` is kept when the two lines differ
    and the pair has not been collected before in either orientation.  Each
    combination contributes ``160000 // number_of_combinations`` pairs, or
    fewer when the two corpora cannot supply that many new pairs.

    Args:
        test_log_type: Dataset name to leave out of the sampling.
        positive_corpus: Not used by this function.
        neutral_corpus: Not used by this function.
        benchmark_settings: Mapping of dataset name to a settings dict that
            has a ``'regex'`` entry.  Only read, never modified.

    Returns:
        A list of 2-tuples in the order they were collected.  The list is
        empty when fewer than two datasets are available.

    Note:
        Calls ``random.seed(42)``, which reseeds the module-level generator of
        ``random`` for the whole process.
    """
    all_dataset = [log_type for log_type in benchmark_settings if log_type != test_log_type]

    random.seed(42)

    # math.comb(n, 2) is 0 for n < 2; bail out instead of dividing by zero.
    pair_total = math.comb(len(all_dataset), 2)
    if pair_total == 0:
        return []

    negetive_nums = 160000
    sub_nums = negetive_nums // pair_total

    # A dict used as an insertion-ordered set: O(1) membership tests, and the
    # returned order does not depend on per-process string hashing.
    negetive_corpus = {}
    index = 0

    for first_name, second_name in combinations(all_dataset, 2):
        pair1_corpus = _load_negative_pair_corpus(first_name, benchmark_settings)
        pair2_corpus = _load_negative_pair_corpus(second_name, benchmark_settings)

        random.shuffle(pair1_corpus)
        random.shuffle(pair2_corpus)

        # The cursor carries over from the previous combination; pull it back
        # in range when this combination has a shorter corpus.
        usable = min(len(pair1_corpus), len(pair2_corpus))
        if index >= usable:
            index = 0

        target = _reachable_pair_count(pair1_corpus, pair2_corpus, negetive_corpus, sub_nums)
        count = 0

        while count < target:
            pairs = (pair1_corpus[index], pair2_corpus[index])
            reverse_pairs = pairs[::-1]

            if (
                pairs[0] != pairs[1]
                and pairs not in negetive_corpus
                and reverse_pairs not in negetive_corpus
            ):
                negetive_corpus[pairs] = None
                count += 1

            index += 1

            # End of the shorter corpus: start over with a fresh alignment.
            if index >= usable:
                index = 0
                random.shuffle(pair1_corpus)
                random.shuffle(pair2_corpus)

    return list(negetive_corpus)