in torchbenchmark/models/dlrm/tools/visualize.py [0:0]
def _fit_umap_and_visualize(umap_data, train_data, test_data, info,
                            umap_metric, log_prefix, shared_kwargs):
    """Fit UMAP on one feature group, project train/test, and plot the result.

    Args:
        umap_data: samples the reducer is fitted on.
        train_data: train samples projected with the fitted reducer.
        test_data: test samples projected with the fitted reducer.
        info: tag forwarded to ``visualize_umap_data`` as ``info``.
        umap_metric: distance metric forwarded to ``umap.UMAP``.
        log_prefix: tuple of values printed in front of the shape of
            ``umap_data`` (kept identical to the original log lines).
        shared_kwargs: keyword arguments that are the same for every
            ``visualize_umap_data`` call (labels, sizes, output directory).

    Returns:
        Tuple ``(umap_Y, train_Y, test_Y)`` with the three embeddings.
    """
    # Convert once; the shape is needed both for the log line and for
    # orig_space_dim below.
    orig_shape = np.array(umap_data).shape
    print(*log_prefix, orig_shape)

    # Fixed seed so repeated runs produce the same embedding.
    reducer = umap.UMAP(random_state=42, metric=umap_metric)
    umap_Y = reducer.fit_transform(umap_data)
    train_Y = reducer.transform(train_data)
    test_Y = reducer.transform(test_data)

    visualize_umap_data(umap_Y=umap_Y,
                        train_Y=train_Y,
                        test_Y=test_Y,
                        info=info,
                        orig_space_dim=orig_shape[1],
                        **shared_kwargs)
    return umap_Y, train_Y, test_Y


def visualize_all_data_umap(dlrm,
                            train_ld,
                            test_ld=None,
                            max_umap_size=50000,
                            output_dir="",
                            umap_metric="euclidean"):
    """Create UMAP visualizations for every feature group produced by the model.

    Three samples are drawn through ``create_umap_data``: one from ``train_ld``
    to fit the reducers (``offset=0``), a second one from ``train_ld`` starting
    at ``offset=max_umap_size``, and one from ``test_ld``. For the combined
    features, the continuous features, the categorical features and each
    entry of the ``z`` list, a separate UMAP reducer is fitted and the result
    is passed to ``visualize_umap_data``. The combined-feature embedding is
    additionally passed to ``hdbscan_clustering``.

    Args:
        dlrm: model object forwarded to ``create_umap_data``.
        train_ld: sized train data loader.
        test_ld: sized test data loader. Required in practice: the ``None``
            default is kept only for signature compatibility.
        max_umap_size: ``max_size`` passed to ``create_umap_data`` for each
            sample, and the offset of the second train sample.
        output_dir: forwarded to the plotting and clustering helpers.
        umap_metric: distance metric used by every UMAP reducer.

    Raises:
        TypeError: if ``test_ld`` is ``None``.
    """
    # len(test_ld) and a test sample are needed unconditionally below, so a
    # missing loader can never work; fail before any expensive work starts.
    if test_ld is None:
        raise TypeError(
            "visualize_all_data_umap() requires test_ld; got None")

    # Size of the train/test samples relative to the sample UMAP is fitted on.
    data_ratio = 1
    total_train_size = str(len(train_ld))
    total_test_size = str(len(test_ld))

    print("creating umap data")
    (umap_train_feat, umap_train_X, umap_train_cat, umap_train_T,
     umap_train_z, umap_train_c, umap_train_p) = create_umap_data(
        dlrm=dlrm, data_ld=train_ld, max_size=max_umap_size,
        offset=0, info="umap")

    # Samples that are only transformed (never fitted on). The train sample
    # starts at offset=max_umap_size - presumably so it does not overlap the
    # fitting sample; confirm against create_umap_data.
    (train_feat, train_X, train_cat, train_T,
     train_z, train_c, train_p) = create_umap_data(
        dlrm=dlrm, data_ld=train_ld, max_size=max_umap_size * data_ratio,
        offset=max_umap_size, info="train")
    (test_feat, test_X, test_cat, test_T,
     test_z, test_c, test_p) = create_umap_data(
        dlrm=dlrm, data_ld=test_ld, max_size=max_umap_size * data_ratio,
        offset=0, info="test")

    # Arguments that are identical for every visualize_umap_data call.
    shared_kwargs = dict(umap_T=umap_train_T,
                         umap_C=umap_train_c,
                         umap_P=umap_train_p,
                         train_T=train_T,
                         train_C=train_c,
                         train_P=train_p,
                         test_T=test_T,
                         test_C=test_c,
                         test_P=test_p,
                         total_train_size=total_train_size,
                         total_test_size=total_test_size,
                         output_dir=output_dir)

    # All features combined.
    umap_feat_Y, train_feat_Y, test_feat_Y = _fit_umap_and_visualize(
        umap_train_feat, train_feat, test_feat,
        info="all-features",
        umap_metric=umap_metric,
        log_prefix=("umap_train_feat",),
        shared_kwargs=shared_kwargs)

    # NOTE: clustering runs on the UMAP embedding only; a variant that
    # clustered the original feature space was commented out previously.
    hdbscan_clustering(umap_data=umap_feat_Y,
                       train_data=train_feat_Y,
                       test_data=test_feat_Y,
                       info="umap-all-features",
                       output_dir=output_dir)

    # Continuous features.
    _fit_umap_and_visualize(
        umap_train_X, train_X, test_X,
        info="cont-features",
        umap_metric=umap_metric,
        log_prefix=("umap_train_X",),
        shared_kwargs=shared_kwargs)

    # Categorical features.
    _fit_umap_and_visualize(
        umap_train_cat, train_cat, test_cat,
        info="cat-features",
        umap_metric=umap_metric,
        log_prefix=("umap_train_cat",),
        shared_kwargs=shared_kwargs)

    # UMAP for z data: one independent reducer per entry of the z list.
    for i, umap_z in enumerate(umap_train_z):
        _fit_umap_and_visualize(
            umap_z, train_z[i], test_z[i],
            info="z-features-" + str(i),
            umap_metric=umap_metric,
            log_prefix=("z", i),
            shared_kwargs=shared_kwargs)