def industry_positive_samples()

in aiops/ContrastiveLearningLogClustering/utils/datasets.py [0:0]


def industry_positive_samples(log_path, batch_size, log_select_num=4):
    """Build positive (same-event) log pairs from a labeled industry log CSV.

    The CSV at ``log_path`` must provide a ``label_id`` column and a
    ``Content`` column. Rows whose ``label_id`` is ``-1`` are ignored.
    For every remaining label (visited in ``value_counts()`` order, i.e. most
    frequent first) the rows are shuffled with a fixed seed, the first
    ``log_select_num`` ``Content`` values are kept, and every 2-combination
    of those values whose two members differ becomes a positive pair.
    Labels that yield no pair are left out of the result.

    Args:
        log_path: Path of the CSV file to read.
        batch_size: Minimum number of labels that must yield at least one
            pair.
        log_select_num: Maximum number of log lines sampled per label
            before pairing (defaults to 4).

    Returns:
        A tuple ``(positive_corpus, all_event, positive_samples)``:

        * ``positive_corpus`` -- flat list of all ``(content_a, content_b)``
          pairs.
        * ``all_event`` -- dict mapping ``('industry', label_id)`` to the
          number of pairs generated for that label.
        * ``positive_samples`` -- ``{'industry': {label_id: [pairs...]}}``.

    Raises:
        RuntimeError: If fewer than ``batch_size`` labels produced a pair.
    """
    df_log = pd.read_csv(log_path)
    df_labeled = df_log[df_log['label_id'] != -1]

    samples = {}
    for event_id in df_labeled['label_id'].value_counts().index:
        df_event = df_labeled[df_labeled['label_id'] == event_id]
        # Fixed seed keeps the per-label selection reproducible.
        df_event = df_event.sample(frac=1.0, random_state=42)
        selected = df_event['Content'].iloc[:log_select_num].to_list()

        # dict.fromkeys de-duplicates like a set but preserves insertion
        # order, so the pair order does not depend on string hash
        # randomisation between interpreter runs.
        unique_pairs = dict.fromkeys(
            pair for pair in combinations(selected, 2) if pair[0] != pair[1]
        )
        if unique_pairs:
            samples[event_id] = list(unique_pairs)

    if len(samples) < batch_size:
        # A bare ``raise`` here had no active exception and surfaced as an
        # opaque RuntimeError; raise the same type with a useful message.
        raise RuntimeError(
            "Cannot generate enough positive samples! "
            f"Positive samples len: {len(samples)}, "
            f"required batch_size: {batch_size}"
        )

    positive_samples = {'industry': samples}
    all_event = {
        ('industry', event_id): len(pairs)
        for event_id, pairs in samples.items()
    }
    positive_corpus = [pair for pairs in samples.values() for pair in pairs]

    return positive_corpus, all_event, positive_samples