def load_event_log()

in aiops/ContrastiveLearningLogClustering/utils/datasets.py [0:0]


def _index_token_sequences(model, logs, event_id, log_to_event):
    """Tokenize *logs* and map each unpadded token-id sequence to *event_id*.

    Args:
        model: Object exposing ``tokenize(list_of_str)`` that returns a mapping
            with ``'input_ids'`` and ``'attention_mask'`` entries; each row of
            those entries must support ``.cpu().numpy()``.
        logs: List of log message strings belonging to one event.
        event_id: Identifier stored as the value for every sequence.
        log_to_event: Dict updated in place; keys are tuples of token ids.
    """
    encoded = model.tokenize(logs)
    for ids, mask in zip(encoded['input_ids'], encoded['attention_mask']):
        ids = ids.cpu().numpy()
        mask = mask.cpu().numpy()
        # Keep only positions the attention mask marks as non-zero so the
        # key does not depend on how far the batch was padded.
        # If two logs produce the same sequence, the later one wins.
        log_to_event[tuple(ids[mask != 0].tolist())] = event_id


def load_event_log(test_log_type=None, benchmark_settings=None, model=None):
    """Collect labelled log messages per event and index their token ids.

    For every log type in ``benchmark_settings`` other than ``test_log_type``
    and the two industrial types, the ``*_2k.log_structured.csv`` and
    ``*_2k.log_templates.csv`` files under ``./logs/<log_type>/`` are read and
    the messages are grouped by ``EventId``. Each message is passed through
    ``add_var_token`` with the log type's ``'regex'`` setting before being
    stored and tokenized.

    When ``test_log_type`` is ``'industrial1'`` or ``'industrial2'``, the
    *other* industrial dataset is additionally loaded from
    ``./<name>_test.csv``; rows whose ``label_id`` is ``-1`` are ignored and
    the remaining rows are grouped by ``label_id``.

    Args:
        test_log_type: Log type to leave out of the benchmark loop.
        benchmark_settings: Mapping of log type to a settings mapping that
            contains a ``'regex'`` entry. ``None`` is treated as an empty
            mapping (no benchmark datasets are loaded).
        model: Object exposing ``tokenize``; see ``_index_token_sequences``.

    Returns:
        A tuple ``(all_event_log, log_to_event)`` where ``all_event_log`` maps
        an event id to its list of log messages and ``log_to_event`` maps a
        tuple of unpadded token ids to the event id it came from. Benchmark
        event ids are ``log_type + EventId``; industrial event ids are the raw
        ``label_id`` values.
    """
    industrial_types = ('industrial1', 'industrial2')
    if benchmark_settings is None:
        # The declared default would otherwise fail on iteration below.
        benchmark_settings = {}

    all_event_log = {}
    log_to_event = {}

    for log_type in benchmark_settings:
        if log_type == test_log_type or log_type in industrial_types:
            continue

        prefix = "./logs/" + log_type + "/" + log_type + "_2k"
        df_log_structured = pd.read_csv(prefix + ".log_structured.csv")
        df_log_template = pd.read_csv(prefix + ".log_templates.csv")
        df_log_template = df_log_template.drop_duplicates(subset=['EventId'])

        log_rex = benchmark_settings[log_type]['regex']

        for temp_id in df_log_template['EventId']:
            matching = df_log_structured[df_log_structured['EventId'] == temp_id]
            temp_log = [
                add_var_token(log_rex, s) for s in matching['Content'].to_list()
            ]

            # Prefix with the log type so ids from different datasets
            # cannot collide.
            event_id = log_type + temp_id
            all_event_log[event_id] = temp_log
            _index_token_sequences(model, temp_log, event_id, log_to_event)

    if test_log_type in industrial_types:
        # Load the industrial dataset that is NOT the one under test.
        log_type = 'industrial1' if test_log_type == 'industrial2' else 'industrial2'
        print("Loading", log_type, "event...")

        df_log = pd.read_csv('./' + log_type.lower() + '_test.csv')
        df_labeled = df_log[df_log['label_id'] != -1]

        for temp_id in df_labeled['label_id'].unique():
            temp_log = df_labeled[df_labeled['label_id'] == temp_id]['Content'].to_list()
            all_event_log[temp_id] = temp_log
            _index_token_sequences(model, temp_log, temp_id, log_to_event)

    return all_event_log, log_to_event