def generate_positive_samples()

in aiops/ContrastiveLearningLogClustering/utils/datasets.py [0:0]


def _collect_distinct_pairs(logs):
    """Return the unordered pairs of differing entries found in *logs*.

    Candidates are visited in ``itertools.combinations(logs, 2)`` order.
    A candidate is kept only when its two members differ and neither the
    pair itself nor its mirror image ``(b, a)`` has been kept before, so
    every unordered pair appears exactly once, in first-seen orientation.
    A list with fewer than two entries simply yields no pairs.
    """
    kept = OrderedDict()
    for pair in combinations(logs, 2):
        if pair[0] != pair[1]:
            mirrored = (pair[1], pair[0])
            if pair not in kept and mirrored not in kept:
                kept[pair] = 0
    return list(kept)


def generate_positive_samples(test_log_type=None, benchmark_settings=None):
    """Build positive (same-event) log message pairs for training.

    For every dataset named in *benchmark_settings*, except *test_log_type*
    and the two industrial datasets, the structured log and template CSVs
    under ``./logs/<name>/`` are read.  The ``Content`` of all lines sharing
    an ``EventId`` is passed through ``add_var_token`` together with the
    dataset's ``'regex'`` setting, and every unordered pair of differing
    results becomes a positive sample.

    When *test_log_type* is ``'industrial1'`` or ``'industrial2'``, the
    *other* industrial dataset is additionally loaded from
    ``./<name>_test.csv``; rows with ``label_id == -1`` are ignored and the
    rows of each remaining label are shuffled with a fixed seed before
    pairing (``add_var_token`` is not applied on this path).

    Args:
        test_log_type: Name of the dataset held out for testing; it is
            excluded from the generated samples.
        benchmark_settings: Mapping of dataset name to a settings mapping
            that provides a ``'regex'`` entry.  ``None`` is treated as an
            empty mapping.

    Returns:
        A tuple ``(positive_corpus, all_event, positive_samples)``:
        ``positive_corpus`` is the flat list of all pairs,
        ``all_event`` maps ``(dataset, event_id)`` to its number of pairs,
        and ``positive_samples`` maps dataset -> event id -> list of pairs.
        Events that produce no pair are omitted.
    """
    # The declared default used to crash on iteration; treat it as "no datasets".
    if benchmark_settings is None:
        benchmark_settings = {}

    industrial_types = ('industrial1', 'industrial2')
    positive_samples = {}

    for log_type in benchmark_settings:
        # The held-out dataset and the industrial datasets are never read here.
        if log_type == test_log_type or log_type in industrial_types:
            continue

        log_prefix = "./logs/" + log_type + "/" + log_type
        df_log_structured = pd.read_csv(log_prefix + "_2k.log_structured.csv")
        df_log_template = pd.read_csv(log_prefix + "_2k.log_templates.csv")
        df_log_template = df_log_template.drop_duplicates(subset=['EventId'])

        # Same regex setting for every template of this dataset: look it up once.
        log_rex = benchmark_settings[log_type]['regex']

        samples = {}
        for _, template in df_log_template.iterrows():
            temp_id = template['EventId']

            matching = df_log_structured[df_log_structured['EventId'] == temp_id]
            tokenised = [
                add_var_token(log_rex, content)
                for content in matching['Content'].to_list()
            ]

            event_pairs = _collect_distinct_pairs(tokenised)
            if event_pairs:
                samples[temp_id] = event_pairs

        positive_samples[log_type] = samples

    if test_log_type in industrial_types:
        # Train on the industrial dataset that is NOT being tested.
        log_type = 'industrial1' if test_log_type == 'industrial2' else 'industrial2'

        df_log = pd.read_csv('./' + log_type.lower() + '_test.csv')
        df_labeled = df_log[df_log['label_id'] != -1]

        samples = {}
        for temp_id in list(df_labeled['label_id'].value_counts().index):
            df_temp = df_labeled[df_labeled['label_id'] == temp_id]
            # Fixed seed keeps the pair orientation reproducible between runs.
            df_temp = df_temp.sample(frac=1.0, random_state=42)

            event_pairs = _collect_distinct_pairs(df_temp['Content'].to_list())
            if event_pairs:
                samples[temp_id] = event_pairs

        positive_samples[log_type] = samples

    positive_corpus = []
    all_event = OrderedDict()

    for dataset, events in positive_samples.items():
        for event_id, event_pairs in events.items():
            all_event[(dataset, event_id)] = len(event_pairs)
            positive_corpus.extend(event_pairs)

    return positive_corpus, all_event, positive_samples