# aiops/ContraLSP/switchstate/switchgenerator.py
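"""Synthetic switch-state time-series generator.

Summary (inferred from the code below): generates multivariate signals with
SIG_NUM features whose hidden state follows a Markov chain over STATE_NUM
states; within each state segment the features are sampled from Gaussian
processes with state-dependent means. Binary labels are drawn from a logistic
function of the state's salient feature, and per-timestep feature-importance
masks are recorded. Train/test splits are pickled under simulated_data_l2x/.
"""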

import argparse
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import timesynth as ts


def shade_state_state_data(state_subj, t, ax, data='simulation'):
    # Shade the state on simulation data plots (blue/green/orange per state).
    for ttt in range(t[0], len(t)):
        if state_subj[ttt] == 0:
            ax.axvspan(ttt, ttt + 1, facecolor='blue', alpha=0.3)
        elif state_subj[ttt] == 1:
            ax.axvspan(ttt, ttt + 1, facecolor='green', alpha=0.3)
        elif state_subj[ttt] == 2:
            ax.axvspan(ttt, ttt + 1, facecolor='orange', alpha=0.3)


np.random.seed(42)

SIG_NUM = 3    # number of features per signal
STATE_NUM = 3  # number of hidden states
P_S0 = [1 / 3]  # P(initial state = 1); the initial state is Bernoulli-sampled below
# Feature(s) that drive the label in each state.
imp_feature = [[0], [1], [2]]
# Cross-feature correlations (currently unused; the covariance terms are disabled).
correlated_feature = {0: {0: [1]}, 1: {1: [2]}, 2: {0: [1, 2]}}
# Per-state feature means.
scale = {0: [0.8, -0.5, -0.2], 1: [0, -1.0, 0], 2: [-0.2, -0.2, 0.8]}
transition_matrix = [[0.95, 0.02, 0.03],
                     [0.02, 0.95, 0.03],
                     [0.03, 0.02, 0.95]]


def init_distribution_params():
    # Covariance matrix is constant across states, but distribution means
    # change based on the state value.
    state_count = STATE_NUM
    cov = np.eye(SIG_NUM) * 0.1
    covariance = []
    for i in range(state_count):
        c = cov.copy()
        # (Cross-feature covariance terms from correlated_feature are disabled.)
        covariance.append(c)
    covariance = np.array(covariance)
    mean = np.array([scale[i] for i in range(state_count)])
    return mean, covariance


def next_state(previous_state, t):
    # Draw the next state from the Markov transition matrix; t is unused and
    # kept for signature compatibility. (A time-decaying self-transition
    # probability was tried here and disabled.)
    p_vec = transition_matrix[previous_state]
    next_st = np.random.choice([0, 1, 2], p=p_vec)
    return next_st


def state_decoder(previous, next_st):
    return int(next_st * (1 - previous) + (1 - next_st) * previous)


def generate_linear_labels(X):
    # P(y=1) = sigmoid(3 * sum(X)); returns [P(y=0), P(y=1)] per row.
    logit = np.exp(-3 * np.sum(X, axis=1))
    prob_1 = np.expand_dims(1 / (1 + logit), 1)
    prob_0 = np.expand_dims(logit / (1 + logit), 1)
    y = np.concatenate((prob_0, prob_1), axis=1)
    return y


def generate_XOR_labels(X):
    y = np.exp(X[:, 0] * X[:, 1])
    prob_1 = np.expand_dims(1 / (1 + y), 1)
    prob_0 = np.expand_dims(y / (1 + y), 1)
    y = np.concatenate((prob_0, prob_1), axis=1)
    return y


def generate_orange_labels(X):
    logit = np.exp(np.sum(X[:, :4] ** 2, axis=1) - 1.5)
    prob_1 = np.expand_dims(1 / (1 + logit), 1)
    prob_0 = np.expand_dims(logit / (1 + logit), 1)
    y = np.concatenate((prob_0, prob_1), axis=1)
    return y


def generate_additive_labels(X):
    logit = np.exp(-10 * np.sin(-0.2 * X[:, 0]) + 0.5 * X[:, 1] + X[:, 2] + np.exp(X[:, 3]) - 0.8)
    prob_1 = np.expand_dims(1 / (1 + logit), 1)
    prob_0 = np.expand_dims(logit / (1 + logit), 1)
    y = np.concatenate((prob_0, prob_1), axis=1)
    return y


def generate_linear_labels_v2(X):
    logit = np.exp(-0.2 * X[:, 0] + 0.5 * X[:, 1] + X[:, 2] + X[:, 3] - 0.8)
    prob_1 = np.expand_dims(1 / (1 + logit), 1)
    prob_0 = np.expand_dims(logit / (1 + logit), 1)
    y = np.concatenate((prob_0, prob_1), axis=1)
    return y


def create_signal(sig_len, gp_params, mean, cov):
    signal = None
    state_local = []
    y = []
    importance = []
    y_logits = []
    # Note: the initial state is drawn from {0, 1} only (Bernoulli with p = 1/3).
    previous = np.random.binomial(1, P_S0)[0]
    delta_state = 1  # length of the current state segment
    # Record importance for the initial ("previous", i.e. current) state.
    imp_sig = np.zeros(SIG_NUM)
    imp_sig[imp_feature[previous]] = 1
    importance.append(imp_sig)
    state_local.append(previous)
    for ii in range(1, sig_len):
        state_n = next_state(previous, delta_state)
        imp_sig = np.zeros(SIG_NUM)
        if previous != state_n:
            # State change at the next time point: sample the finished segment
            # (GP features + labels) up to the current point.
            gp_vec = [ts.signals.GaussianProcess(lengthscale=g, mean=m, variance=0.1)
                      for g, m in zip(gp_params, mean[previous])]
            sample_ii = np.array([gp.sample_vectorized(time_vector=np.array(range(delta_state)))
                                  for gp in gp_vec])
            if signal is not None:
                signal = np.hstack((signal, sample_ii))
            else:
                signal = sample_ii
            y_probs = generate_linear_labels(sample_ii.T[:, imp_feature[previous]])
            y_logit = [yy[1] for yy in y_probs]
            y_label = [np.random.binomial(1, yy) for yy in y_logit]
            y.extend(y_label)
            y_logits.extend(y_logit)
            delta_state = 1
            imp_sig[imp_feature[state_n]] = 1
            imp_sig[-1] = 1
        else:
            delta_state += 1
        importance.append(imp_sig)
        state_local.append(state_n)
        previous = state_n
    # Sample points in the last segment (sometimes only one state is ever sampled).
    gp_vec = [ts.signals.GaussianProcess(lengthscale=g, mean=m, variance=0.1)
              for g, m in zip(gp_params, mean[previous])]
    sample_ii = np.array([gp.sample_vectorized(time_vector=np.array(range(delta_state)))
                          for gp in gp_vec])
    if signal is not None:
        signal = np.hstack((signal, sample_ii))
    else:
        signal = sample_ii
    y_probs = generate_linear_labels(sample_ii.T[:, imp_feature[previous]])
    y_logit = [yy[1] for yy in y_probs]
    y_label = [np.random.binomial(1, yy) for yy in y_logit]
    y.extend(y_label)
    y_logits.extend(y_logit)
    y = np.array(y)
    importance = np.array(importance)
    return signal, y, state_local, importance, y_logits


def decay(x):
    return [0.9 * (1 - 0.1) ** x, 0.9 * (1 - 0.1) ** x]


def logit(x):
    return 1. / (1 + np.exp(-2 * (x)))


def normalize(train_data, test_data, config='mean_normalized'):
    """Normalize both splits with the per-feature statistics of the training set."""
    feature_size = train_data.shape[1]
    len_of_stay = train_data.shape[2]
    d = [x.T for x in train_data]
    d = np.stack(d, axis=0)
    if config == 'mean_normalized':
        feature_means = np.tile(np.mean(d.reshape(-1, feature_size), axis=0), (len_of_stay, 1)).T
        feature_std = np.tile(np.std(d.reshape(-1, feature_size), axis=0), (len_of_stay, 1)).T
        np.seterr(divide='ignore', invalid='ignore')
        train_data_n = np.array(
            [np.where(feature_std == 0, (x - feature_means), (x - feature_means) / feature_std)
             for x in train_data])
        test_data_n = np.array(
            [np.where(feature_std == 0, (x - feature_means), (x - feature_means) / feature_std)
             for x in test_data])
    elif config == 'zero_to_one':
        feature_max = np.tile(np.max(d.reshape(-1, feature_size), axis=0), (len_of_stay, 1)).T
        feature_min = np.tile(np.min(d.reshape(-1, feature_size), axis=0), (len_of_stay, 1)).T
        train_data_n = np.array([(x - feature_min) / (feature_max - feature_min) for x in train_data])
        test_data_n = np.array([(x - feature_min) / (feature_max - feature_min) for x in test_data])
    return train_data_n, test_data_n


def create_dataset(count, signal_len):
    dataset = []
    labels = []
    importance_score = []
    states = []
    label_logits = []
    mean, cov = init_distribution_params()
    # Constant GP lengthscale of 0.2 for every feature.
    gp_lengthscale = np.random.uniform(0.2, 0.2, SIG_NUM)
    for num in range(count):
        sig, y, state, importance, y_logits = create_signal(signal_len, gp_params=gp_lengthscale,
                                                            mean=mean, cov=cov)
        dataset.append(sig)
        labels.append(y)
        importance_score.append(importance.T)
        states.append(state)
        label_logits.append(y_logits)
        if num % 50 == 0:
            print(num, count)
    dataset = np.array(dataset)
    labels = np.array(labels)
    importance_score = np.array(importance_score).transpose(0, 2, 1)
    states = np.array(states)
    label_logits = np.array(label_logits)
    n_train = int(len(dataset) * 0.8)
    train_data = dataset[:n_train]
    test_data = dataset[n_train:]
    # Data are saved unnormalized (normalize() is not called here);
    # reshape to (n_samples, time, features).
    train_data_n = train_data.transpose(0, 2, 1)
    test_data_n = test_data.transpose(0, 2, 1)
    if not os.path.exists('simulated_data_l2x'):
        os.mkdir('simulated_data_l2x')
    with open('simulated_data_l2x/state_dataset_x_train.pkl', 'wb') as f:
        pickle.dump(train_data_n, f)
    with open('simulated_data_l2x/state_dataset_x_test.pkl', 'wb') as f:
        pickle.dump(test_data_n, f)
    with open('simulated_data_l2x/state_dataset_y_train.pkl', 'wb') as f:
        pickle.dump(labels[:n_train], f)
    with open('simulated_data_l2x/state_dataset_y_test.pkl', 'wb') as f:
        pickle.dump(labels[n_train:], f)
    with open('simulated_data_l2x/state_dataset_importance_train.pkl', 'wb') as f:
        pickle.dump(importance_score[:n_train], f)
    with open('simulated_data_l2x/state_dataset_importance_test.pkl', 'wb') as f:
        pickle.dump(importance_score[n_train:], f)
    with open('simulated_data_l2x/state_dataset_logits_train.pkl', 'wb') as f:
        pickle.dump(label_logits[:n_train], f)
    with open('simulated_data_l2x/state_dataset_logits_test.pkl', 'wb') as f:
        pickle.dump(label_logits[n_train:], f)
    with open('simulated_data_l2x/state_dataset_states_train.pkl', 'wb') as f:
        pickle.dump(states[:n_train], f)
    with open('simulated_data_l2x/state_dataset_states_test.pkl', 'wb') as f:
        pickle.dump(states[n_train:], f)
    print(train_data_n.shape)
    print(labels.shape)
    print(importance_score.shape)
    print(label_logits.shape)
    print(states.shape)
    return dataset, labels, states, label_logits


if __name__ == '__main__':
    if not os.path.exists('./data'):
        os.mkdir('./data')
    parser = argparse.ArgumentParser()
    parser.add_argument('--signal_len', type=int, default=100, help='Length of the signal to generate')
    parser.add_argument('--signal_num', type=int, default=1000, help='Number of the signals to generate')
    parser.add_argument('--plot', action='store_true')
    args = parser.parse_args()

    np.random.seed(234)
    dataset, labels, states, label_logits = create_dataset(args.signal_num, args.signal_len)

    if args.plot:
        # Label histograms per state.
        f, (x1, x2, x3) = plt.subplots(3, 1)
        state_1_1 = []
        state_1_0 = []
        state_0_1 = []
        state_0_0 = []
        state_2_0 = []
        state_2_1 = []
        idx0 = np.where(states == 0)
        idx1 = np.where(states == 1)
        idx2 = np.where(states == 2)
        for c in range(len(idx0[0])):
            if labels[idx0[0][c], idx0[1][c]] == 0:
                state_0_0.append(labels[idx0[0][c], idx0[1][c]])
            else:
                state_0_1.append(labels[idx0[0][c], idx0[1][c]])
        for c in range(len(idx1[0])):
            if labels[idx1[0][c], idx1[1][c]] == 0:
                state_1_0.append(labels[idx1[0][c], idx1[1][c]])
            else:
                state_1_1.append(labels[idx1[0][c], idx1[1][c]])
        for c in range(len(idx2[0])):
            if labels[idx2[0][c], idx2[1][c]] == 0:
                state_2_0.append(labels[idx2[0][c], idx2[1][c]])
            else:
                state_2_1.append(labels[idx2[0][c], idx2[1][c]])
        x1.hist(state_0_0, label='label 0')
        x1.hist(state_0_1, label='label 1')
        x1.set_title('state 0')
        x1.legend()
        x2.hist(state_1_0, label='label 0')
        x2.hist(state_1_1, label='label 1')
        x2.set_title('state 1')
        x2.legend()
        x3.hist(state_2_0, label='label 0')
        x3.hist(state_2_1, label='label 1')
        x3.set_title('state 2')
        x3.legend()
        plt.savefig('plot.pdf')

        # Scatter plots colored by label and by state.
        f, (x1, x2) = plt.subplots(2, 1)
        for sample_id in range(len(labels)):
            for i, sample in enumerate(dataset[sample_id]):
                if labels[sample_id, i]:
                    x1.scatter(sample[0], sample[1], c='r')
                else:
                    x1.scatter(sample[0], sample[1], c='b')
                if states[sample_id, i]:
                    x2.scatter(sample[0], sample[1], c='b')
                else:
                    x2.scatter(sample[0], sample[1], c='r')
        x1.set_title('Distribution based on label')
        x2.set_title('Distribution based on state')
        plt.savefig('plot2.pdf')

        # One example signal with state shading and label logits.
        plot_id = 5
        f = plt.figure(figsize=(18, 9))
        x1 = f.subplots()
        shade_state_state_data(states[plot_id], range(dataset.shape[2]), x1)
        for i in range(SIG_NUM):
            x1.plot(range(dataset.shape[2]), dataset[plot_id, i, :], linewidth=1, label='feature %d' % i)
        x1.plot(range(dataset.shape[2]), label_logits[plot_id, :], linewidth=3, label='label')
        plt.legend()
        plt.savefig('plotsample_l2x.pdf')
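
# Usage sketch (illustrative, not part of the original script; file names and
# shapes follow the pickle dumps in create_dataset above):
#
#   python switchgenerator.py --signal_len 100 --signal_num 1000 --plot
#
#   import pickle
#   with open('simulated_data_l2x/state_dataset_x_train.pkl', 'rb') as f:
#       x_train = pickle.load(f)    # shape (n_train, signal_len, SIG_NUM)
#   with open('simulated_data_l2x/state_dataset_importance_train.pkl', 'rb') as f:
#       imp_train = pickle.load(f)  # per-timestep feature-importance masks, same shape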