aiops/ContrastiveLearningLogClustering/train.py (54 lines of code) (raw):

import torch import numpy as np import random from sentence_transformers import SentenceTransformer from torch.utils.data import DataLoader from utils.datasets import * from utils.losses import * from utils.evaluation import * seed = 22 torch.manual_seed(seed) torch.cuda.manual_seed(seed) torch.cuda.manual_seed_all(seed) np.random.seed(seed) random.seed(seed) torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True model_name = 'multi-qa-MiniLM-L6-cos-v1' batch_size = 20 evaluate_score = [] for test_name in benchmark_settings: print("Test dataset: ", test_name) test_log_type = test_name train_len = 20000 train_examples = generate_samples(train_len,test_log_type,batch_size) print("Train sentence pairs: ",len(train_examples)) model = SentenceTransformer(model_name) special_tokens = ['[var]'] word_embedding_model = model._first_module() word_embedding_model.tokenizer.add_special_tokens({'additional_special_tokens':special_tokens}) word_embedding_model.auto_model.resize_token_embeddings(len(word_embedding_model.tokenizer)) ft_train_dataloader = DataLoader(train_examples, shuffle=False, batch_size=batch_size) all_event_log, log_to_event = load_event_log(test_log_type=test_log_type,benchmark_settings=benchmark_settings,model=model) event_center = calculate_center(model,all_event_log) ft_train_loss = MNR_Hyper_Loss(model,log_to_event=log_to_event,event_center=event_center,hyper_ratio=0) model.fit(train_objectives=[(ft_train_dataloader, ft_train_loss)], epochs=2, warmup_steps=100, scheduler='constantlr', optimizer_params={'lr': 3e-5}) for _ in range(5): train_examples = generate_samples(train_len,test_log_type) ft_train_dataloader = DataLoader(train_examples, shuffle=False, batch_size=batch_size) all_event_log, log_to_event = load_event_log(test_log_type=test_log_type,benchmark_settings=benchmark_settings,model=model) event_center = calculate_center(model,all_event_log) ft_train_loss = MNR_Hyper_Loss(model,log_to_event=log_to_event,event_center=event_center,hyper_ratio=0.2) model.fit(train_objectives=[(ft_train_dataloader, ft_train_loss)], epochs=1, warmup_steps=0, scheduler='constantlr', optimizer_params={'lr': 1e-5}) print("Model sentence pairs trainning done.") model.to('cpu') df_log, test_corpus = load_test_log(test_log_type,benchmark_settings) corpus_embeddings = generate_embeddings(model,test_corpus) distance_threshold = benchmark_settings[test_log_type]['distance_threshold'] clustered_sentences, cluster_assignment = embeddings_clustering(test_corpus, corpus_embeddings, distance_threshold) score, event_amount, cluster_amount = clustering_evaluate(test_log_type, cluster_assignment, clustered_sentences) score['dataset'] = test_log_type score['event amount'] = event_amount score['cluster amount'] = cluster_amount evaluate_score.append(score) df_socre = pd.DataFrame(evaluate_score)