torchbenchmark/models/dlrm/tools/visualize.py:

# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
#
# This script performs the visualization of the embedding tables created in
# DLRM during the training procedure. We use two popular techniques for
# visualization: umap (https://umap-learn.readthedocs.io/en/latest/) and
# tsne (https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html).
# These links also provide instructions on how to install these packages
# in different environments.
#
# Warning: how much data can be visualized depends on the amount of RAM on your machine.
#
#
# Command line examples:
#
# Full analysis of embeddings and data representations for Criteo Kaggle data:
# $python ./tools/visualize.py --data-set=kaggle --load-model=../dlrm-2020-05-25/criteo.pytorch-e-0-i-110591 \
#    --raw-data-file=../../criteo/input/train.txt --skip-categorical-analysis \
#    --processed-data-file=../../criteo/input/kaggleAdDisplayChallenge_processed.npz
#
# To run just the analysis of categorical data for the Criteo Kaggle data set:
# $python ./tools/visualize.py --data-set=kaggle --load-model=../dlrm-2020-05-25/criteo.pytorch-e-0-i-110591 \
#    --raw-data-file=../../criteo/input/train.txt --data-randomize=none \
#    --processed-data-file=../../criteo/input/kaggleAdDisplayChallenge_processed.npz \
#    --skip-embedding --skip-data-plots
#
#
# The following command line arguments are available to the user:
#
#    --load-model                - DLRM model file
#    --data-set                  - one of ["kaggle", "terabyte"]
#    --max-ind-range             - max index range used during the training
#    --output-dir                - output directory; if not specified, it is derived from the model and dataset names
#    --max-umap-size             - max number of points to visualize using UMAP, default=50000
#    --use-tsne                  - use T-SNE
#    --max-tsne-size             - max number of points to visualize using T-SNE, default=1000
#    --skip-embedding            - skip analysis of embedding tables
#    --umap-metric               - metric for UMAP
#    --skip-data-plots           - skip data plots
#    --skip-categorical-analysis - skip categorical analysis
#
# data file related:
#    --raw-data-file
#    --processed-data-file
#    --data-sub-sample-rate
#    --data-randomize
#    --memory-map
#    --mini-batch-size
#    --num-workers
#    --test-mini-batch-size
#    --test-num-workers
#    --num-batches
#    --mlperf-logging

import os
import sys
import argparse

import numpy as np
import umap
import hdbscan
import json
import torch
import math
import matplotlib
import matplotlib.pyplot as plt
import collections

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn import manifold

import dlrm_data_pytorch as dp
from dlrm_s_pytorch import DLRM_Net


def visualize_embeddings_umap(emb_l,
                              output_dir="",
                              max_size=500000,
                              umap_metric="euclidean",
                              cat_counts=None,
                              use_max_count=True):
    for k in range(0, len(emb_l)):
        E = emb_l[k].weight.detach().cpu().numpy()
        print("umap", E.shape)

        # create histogram of norms
        bins = 50
        norms = [np.linalg.norm(E[i], ord=2) for i in range(0, E.shape[0])]
        # plt.hist(norms, bins=bins)
        # plt.title("Cat norm hist var. " + str(k))

        # plot the per-row L2 norms of this embedding table on log-spaced bins
        hist, bins = np.histogram(norms, bins=bins)
        logbins = np.logspace(np.log10(bins[0]), np.log10(bins[-1]), len(bins))
        plt.figure(figsize=(8, 8))
        plt.title("Categorical norms: " + str(k) + " cardinality " + str(len(cat_counts[k])))
        plt.hist(norms, bins=logbins)
        plt.xscale("log")
        # plt.legend()
        plt.savefig(output_dir + "/cat-norm-histogram-" + str(k) + ".png")
        plt.close()

        if E.shape[0] < 20:
            print("Skipping small embedding")
            continue

        n_vis = min(max_size, E.shape[0])
        min_cnt = 0

        # reducer = umap.UMAP(random_state=42, n_neighbors=25, min_dist=0.1)
        reducer = umap.UMAP(random_state=42, metric=umap_metric)
        if use_max_count is False or n_vis == E.shape[0]:
            Y = reducer.fit_transform(E[:n_vis, :])
        else:
            # keep only values whose counts exceed min_cnt, raising min_cnt
            # until at most max_size rows remain
            done = False
            min_cnt = 1
            while done == False:
                el_cnt = (cat_counts[k] > min_cnt).sum()
                if el_cnt <= max_size:
                    done = True
                else:
                    min_cnt = min_cnt + 1
            E1 = []
            for i in range(0, E.shape[0]):
                if cat_counts[k][i] > min_cnt:
                    E1.append(E[i, :])
            print("max_count_len", len(E1), "mincount", min_cnt)
            Y = reducer.fit_transform(np.array(E1))
            n_vis = len(E1)

        plt.figure(figsize=(8, 8))
        linewidth = 0
        size = 1
        if Y.shape[0] < 2500:
            linewidth = 1
            size = 5
        if cat_counts is None:
            plt.scatter(-Y[:, 0], -Y[:, 1], s=size, marker=".", linewidth=linewidth)
        else:
            # print(cat_counts[k])
            n_disp = min(len(cat_counts[k]), Y.shape[0])
            cur_max = math.log(max(cat_counts[k]))
            norm_cat_count = [math.log(cat_counts[k][i] + 1) / cur_max for i in range(0, len(cat_counts[k]))]
            plt.scatter(-Y[0:n_disp, 0], -Y[0:n_disp, 1], s=size, marker=".", linewidth=linewidth,
                        c=np.array(norm_cat_count)[0:n_disp], cmap="viridis")
            plt.colorbar()
        plt.title("UMAP: categorical var. " + str(k) + " (" + str(n_vis) + " of "
                  + str(E.shape[0]) + ", min count " + str(min_cnt) + ")")
        plt.savefig(output_dir + "/cat-" + str(k) + "-" + str(n_vis) + "-of-" + str(E.shape[0]) + "-umap.png")
        plt.close()


def visualize_embeddings_tsne(emb_l, output_dir="", max_size=10000):
    for k in range(0, len(emb_l)):
        E = emb_l[k].weight.detach().cpu()
        print("tsne", E.shape)

        if E.shape[0] < 20:
            print("Skipping small embedding")
            continue

        n_vis = min(max_size, E.shape[0])
        tsne = manifold.TSNE(init="pca", random_state=0, method="exact")
        Y = tsne.fit_transform(E[:n_vis, :])

        plt.figure(figsize=(8, 8))
        linewidth = 0
        if Y.shape[0] < 5000:
            linewidth = 1
        plt.scatter(-Y[:, 0], -Y[:, 1], s=1, marker=".", linewidth=linewidth)
        plt.title("TSNE: categorical var. " + str(k) + " (" + str(n_vis) + " of " + str(E.shape[0]) + ")")
        plt.savefig(output_dir + "/cat-" + str(k) + "-" + str(n_vis) + "-of-" + str(E.shape[0]) + "-tsne.png")
        plt.close()


# Day-by-day drift of the categorical features: for each categorical variable,
# compare the set of values seen before and after each day boundary and record
# how many values are shared ("intersect") or disappear ("removed"). This only
# makes sense when the data has not been shuffled (--data-randomize=none).
def analyse_categorical_data(X_cat, n_days=10, output_dir=""):
    # analyse categorical variables
    n_vec = len(X_cat)
    n_cat = len(X_cat[0])
    print("n_vec", n_vec, "n_cat", n_cat)
    # for c in train_data.X_cat:
    #     print(n_cat, c)

    all_cat = np.array(X_cat)
    print("all_cat.shape", all_cat.shape)
    day_size = all_cat.shape[0] / n_days

    for i in range(0, n_cat):
        l_d = []
        l_s1 = []
        l_s2 = []
        l_int = []
        l_rem = []

        cat = all_cat[:, i]
        print("cat", i, cat.shape)
        for d in range(1, n_days):
            offset = int(d * day_size)
            # print(offset)
            cat1 = cat[:offset]
            cat2 = cat[offset:]
            s1 = set(cat1)
            s2 = set(cat2)
            intersect = list(s1 & s2)
            # print(intersect)
            l_d.append(d)
            l_s1.append(len(s1))
            l_s2.append(len(s2))
            l_int.append(len(intersect))
            l_rem.append(len(s1) - len(intersect))
            print(d, ",", len(s1), ",", len(s2), ",", len(intersect), ",", (len(s1) - len(intersect)))

        print("split", l_d)
        print("before", l_s1)
        print("after", l_s2)
        print("inters.", l_int)
        print("removed", l_rem)

        plt.figure(figsize=(8, 8))
        plt.plot(l_d, l_s1, "g", label="before")
        plt.plot(l_d, l_s2, "r", label="after")
        plt.plot(l_d, l_int, "b", label="intersect")
        plt.plot(l_d, l_rem, "y", label="removed")
        plt.title("categorical var. " + str(i))
        plt.legend()
        plt.savefig(output_dir + "/cat-" + str(i).zfill(3) + ".png")
        plt.close()


def analyse_categorical_counts(X_cat, emb_l=None, output_dir=""):
    # count how often each categorical value occurs and, when embeddings are
    # given, plot those counts next to the per-row embedding norms
    n_vec = len(X_cat)
    n_cat = len(X_cat[0])
    print("n_vec", n_vec, "n_cat", n_cat)
    # for c in train_data.X_cat:
    #     print(n_cat, c)

    all_cat = np.array(X_cat)
    print("all_cat.shape", all_cat.shape)

    all_counts = []
    for i in range(0, n_cat):
        cat = all_cat[:, i]
        if emb_l is None:
            s = set(cat)
            counts = np.zeros(len(s))
            print("cat", i, cat.shape, len(s))
        else:
            s = emb_l[i].weight.detach().cpu().shape[0]
            counts = np.zeros(s)
            print("cat", i, cat.shape, s)

        for d in range(0, n_vec):
            cv = int(cat[d])
            counts[cv] = counts[cv] + 1
        all_counts.append(counts)

        if emb_l is None:
            plt.figure(figsize=(8, 8))
            plt.plot(counts)
            plt.title("Categorical var " + str(i) + " cardinality " + str(len(counts)))
            # plt.legend()
        else:
            E = emb_l[i].weight.detach().cpu().numpy()
            norms = [np.linalg.norm(E[j], ord=2) for j in range(0, E.shape[0])]
            fig, (ax0, ax1) = plt.subplots(2, 1)
            fig.suptitle("Categorical variable: " + str(i) + " cardinality " + str(len(counts)))
            ax0.plot(counts)
            ax0.set_yscale("log")
            ax0.set_title("Counts", fontsize=10)
            ax1.plot(norms)
            ax1.set_title("Norms", fontsize=10)

        plt.savefig(output_dir + "/cat_counts-" + str(i).zfill(3) + ".png")
        plt.close()

    return all_counts


def dlrm_output_wrap(dlrm, X, lS_o, lS_i, T):
    # run a single sample through the DLRM model and collect the intermediate
    # representations: bottom-MLP output, embedding lookups, feature
    # interactions and the per-layer top-MLP activations
    all_feat_vec = []
    all_cat_vec = []
    x_vec = None
    t_out = None
    c_out = None
    z_out = []
    p_out = None

    z_size = len(dlrm.top_l)

    x = dlrm.apply_mlp(X, dlrm.bot_l)
    # debug prints
    # print("intermediate")
    # print(x[0].detach().cpu().numpy())
    x_vec = x[0].detach().cpu().numpy()
    all_feat_vec.append(x_vec)
    # all_X.append(x[0].detach().cpu().numpy())

    # process sparse features (using embeddings), resulting in a list of row vectors
    ly = dlrm.apply_emb(lS_o, lS_i, dlrm.emb_l)
    for e in ly:
        # print(e.detach().cpu().numpy())
        all_feat_vec.append(e[0].detach().cpu().numpy())
        all_cat_vec.append(e[0].detach().cpu().numpy())

    all_feat_vec = np.concatenate(all_feat_vec, axis=0)
    all_cat_vec = np.concatenate(all_cat_vec, axis=0)
    # all_features.append(all_feat_vec)
    # all_cat.append(all_cat_vec)
    t_out = int(T.detach().cpu().numpy()[0, 0])
    # all_T.append(int(T.detach().cpu().numpy()[0, 0]))

    z = dlrm.interact_features(x, ly)
    # print(z.detach().cpu().numpy())
    # z_out = z.detach().cpu().numpy().flatten()
    z_out.append(z.detach().cpu().numpy().flatten())
    # all_z[0].append(z.detach().cpu().numpy().flatten())

    # obtain probability of a click (using top mlp), recording the activations
    # of every top-MLP layer along the way
    # print(dlrm.top_l)
    # p = dlrm.apply_mlp(z, dlrm.top_l)
    for i in range(0, z_size):
        z = dlrm.top_l[i](z)
        # if i < z_size-1:
        #     curr_z = z.detach().cpu().numpy().flatten()
        z_out.append(z.detach().cpu().numpy().flatten())
        # all_z[i+1].append(curr_z)
        # print("z append", i)
        # print("z", i, z.detach().cpu().numpy().flatten().shape)
    p = z

    # clamp output if needed
    if 0.0 < dlrm.loss_threshold and dlrm.loss_threshold < 1.0:
        z = torch.clamp(p, min=dlrm.loss_threshold, max=(1.0 - dlrm.loss_threshold))
    else:
        z = p

    # threshold the predicted click probability at 0.5 (class_thresh can shift it)
    class_thresh = 0.0  # -0.25
    zp = z.detach().cpu().numpy()[0, 0] + class_thresh
    p_out = int(zp + 0.5)
    if p_out > 1:
        p_out = 1
    if p_out < 0:
        p_out = 0
    # all_pred.append(int(z.detach().cpu().numpy()[0, 0] + 0.5))
    # print(int(z.detach().cpu().numpy()[0, 0] + 0.5))

    # c_out flags whether the prediction matches the label (0 = correct, 1 = error)
    if int(p_out) == t_out:
        c_out = 0
    else:
        c_out = 1

    return all_feat_vec, x_vec, all_cat_vec, t_out, c_out, z_out, p_out


def create_umap_data(dlrm, data_ld, max_size=50000, offset=0, info=""):
    # run max_size samples (starting at offset) through the model and gather
    # the vectors to be embedded with UMAP, together with labels, predictions
    # and correctness flags
    all_features = []
    all_X = []
    all_cat = []
    all_T = []
    all_c = []
    all_z = []
    all_pred = []

    z_size = len(dlrm.top_l)
    print("z_size", z_size)
    for i in range(0, z_size):
        all_z.append([])

    for j, (X, lS_o, lS_i, T) in enumerate(data_ld):
        if j < offset:
            continue
        if j >= max_size + offset:
            break

        af, x, cat, t, c, z, p = dlrm_output_wrap(dlrm, X, lS_o, lS_i, T)
        all_features.append(af)
        all_X.append(x)
        all_cat.append(cat)
        all_T.append(t)
        all_c.append(c)
        all_pred.append(p)
        for i in range(0, z_size):
            all_z[i].append(z[i])

    # calculate classifier metrics
    ac = accuracy_score(all_T, all_pred)
    f1 = f1_score(all_T, all_pred)
    ps = precision_score(all_T, all_pred)
    rc = recall_score(all_T, all_pred)
    print(info, "accuracy", ac, "f1", f1, "precision", ps, "recall", rc)

    return all_features, all_X, all_cat, all_T, all_z, all_c, all_pred


def plot_all_data_3(umap_Y, umap_T,
                    train_Y=None, train_T=None,
                    test_Y=None, test_T=None,
                    total_train_size="", total_test_size="",
                    info="", output_dir="", orig_space_dim=0):
    # side-by-side scatter plots of the UMAP fit data, the transformed train
    # data and the transformed test data, colored by the provided labels
    size = 1
    colors = ["red", "green"]
    fig, (ax0, ax1, ax2) = plt.subplots(1, 3)
    fig.suptitle("UMAP: " + info + " space dim " + str(orig_space_dim))

    ax0.scatter(umap_Y[:, 0], umap_Y[:, 1], s=size, c=umap_T,
                cmap=matplotlib.colors.ListedColormap(colors), marker=".", linewidth=0)
    ax0.set_title("UMAP (" + str(len(umap_T)) + " of " + total_train_size + ")", fontsize=7)

    if train_Y is not None and train_T is not None:
        ax1.scatter(train_Y[:, 0], train_Y[:, 1], s=size, c=train_T,
                    cmap=matplotlib.colors.ListedColormap(colors), marker=".", linewidth=0)
        ax1.set_title("Train (" + str(len(train_T)) + " of " + total_train_size + ")", fontsize=7)

    if test_Y is not None and test_T is not None:
        ax2.scatter(test_Y[:, 0], test_Y[:, 1], s=size, c=test_T,
                    cmap=matplotlib.colors.ListedColormap(colors), marker=".", linewidth=0)
        ax2.set_title("Test (" + str(len(test_T)) + " of " + total_test_size + ")", fontsize=7)

    plt.savefig(output_dir + "/" + info + "-umap.png")
    plt.close()


def plot_one_class_3(umap_Y, umap_T, train_Y, train_T, test_Y, test_T,
                     target=0, col="red",
                     total_train_size="", total_test_size="",
                     info="", output_dir="", orig_space_dim=0):
    size = 1
    fig, (ax0, ax1, ax2) = plt.subplots(1, 3)
    fig.suptitle("UMAP: " + info + " space dim " + str(orig_space_dim))
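    # select, in each of the three panels (UMAP fit, train, test), only the
    # samples whose label matches the requested target class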
    ind_l_umap = [i for i, x in enumerate(umap_T) if x == target]
    Y_umap_l = np.array([umap_Y[i, :] for i in ind_l_umap])
    ax0.scatter(Y_umap_l[:, 0], Y_umap_l[:, 1], s=size, c=col, marker=".", linewidth=0)
    ax0.set_title("UMAP, (" + str(len(umap_T)) + " of " + total_train_size + ")", fontsize=7)

    if train_Y is not None and train_T is not None:
        ind_l_test = [i for i, x in enumerate(train_T) if x == target]
        Y_test_l = np.array([train_Y[i, :] for i in ind_l_test])
        ax1.scatter(Y_test_l[:, 0], Y_test_l[:, 1], s=size, c=col, marker=".", linewidth=0)
        ax1.set_title("Train, (" + str(len(train_T)) + " of " + total_train_size + ")", fontsize=7)

    if test_Y is not None and test_T is not None:
        ind_l_test = [i for i, x in enumerate(test_T) if x == target]
        Y_test_l = np.array([test_Y[i, :] for i in ind_l_test])
        ax2.scatter(Y_test_l[:, 0], Y_test_l[:, 1], s=size, c=col, marker=".", linewidth=0)
        ax2.set_title("Test, (" + str(len(test_T)) + " of " + total_test_size + ")", fontsize=7)

    plt.savefig(output_dir + "/" + info + "-umap.png")
    plt.close()


def visualize_umap_data(umap_Y, umap_T, umap_C, umap_P,
                        train_Y, train_T, train_C, train_P,
                        test_Y=None, test_T=None, test_C=None, test_P=None,
                        total_train_size="", total_test_size="",
                        info="", output_dir="", orig_space_dim=0):
    # all classes
    plot_all_data_3(umap_Y=umap_Y, umap_T=umap_T,
                    train_Y=train_Y, train_T=train_T,
                    test_Y=test_Y, test_T=test_T,
                    total_train_size=total_train_size, total_test_size=total_test_size,
                    info=info, output_dir=output_dir, orig_space_dim=orig_space_dim)

    # all predictions
    plot_all_data_3(umap_Y=umap_Y, umap_T=umap_P,
                    train_Y=train_Y, train_T=train_P,
                    test_Y=test_Y, test_T=test_P,
                    total_train_size=total_train_size, total_test_size=total_test_size,
                    info=info + ", all-predictions", output_dir=output_dir, orig_space_dim=orig_space_dim)

    # class 0
    plot_one_class_3(umap_Y=umap_Y, umap_T=umap_T,
                     train_Y=train_Y, train_T=train_T,
                     test_Y=test_Y, test_T=test_T,
                     target=0, col="red",
                     total_train_size=total_train_size, total_test_size=total_test_size,
                     info=info + " class " + str(0), output_dir=output_dir, orig_space_dim=orig_space_dim)

    # class 1
    plot_one_class_3(umap_Y=umap_Y, umap_T=umap_T,
                     train_Y=train_Y, train_T=train_T,
                     test_Y=test_Y, test_T=test_T,
                     target=1, col="green",
                     total_train_size=total_train_size, total_test_size=total_test_size,
                     info=info + " class " + str(1), output_dir=output_dir, orig_space_dim=orig_space_dim)

    # correct classification
    plot_one_class_3(umap_Y=umap_Y, umap_T=umap_C,
                     train_Y=train_Y, train_T=train_C,
                     test_Y=test_Y, test_T=test_C,
                     target=0, col="green",
                     total_train_size=total_train_size, total_test_size=total_test_size,
                     info=info + " correct ", output_dir=output_dir, orig_space_dim=orig_space_dim)

    # errors
    plot_one_class_3(umap_Y=umap_Y, umap_T=umap_C,
                     train_Y=train_Y, train_T=train_C,
                     test_Y=test_Y, test_T=test_C,
                     target=1, col="red",
                     total_train_size=total_train_size, total_test_size=total_test_size,
                     info=info + " errors ", output_dir=output_dir, orig_space_dim=orig_space_dim)

    # prediction 0
    plot_one_class_3(umap_Y=umap_Y, umap_T=umap_P,
                     train_Y=train_Y, train_T=train_P,
                     test_Y=test_Y, test_T=test_P,
                     target=0, col="red",
                     total_train_size=total_train_size, total_test_size=total_test_size,
                     info=info + " predict-0 ", output_dir=output_dir, orig_space_dim=orig_space_dim)

    # prediction 1
    plot_one_class_3(umap_Y=umap_Y, umap_T=umap_P,
                     train_Y=train_Y, train_T=train_P,
                     test_Y=test_Y, test_T=test_P,
                     target=1, col="green",
                     total_train_size=total_train_size, total_test_size=total_test_size,
                     info=info + " predict-1 ", output_dir=output_dir, orig_space_dim=orig_space_dim)

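# Density-based clustering (HDBSCAN) of the 2-D UMAP projection: the clusterer
# is fit on the UMAP data and then used to assign train/test points to the same
# clusters via approximate_predict; points labeled -1 are treated as outliers.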
def hdbscan_clustering(umap_data, train_data, test_data, info="", output_dir=""):
    clusterer = hdbscan.HDBSCAN(min_samples=10, min_cluster_size=500, prediction_data=True)

    umap_labels = clusterer.fit_predict(umap_data)
    train_labels, _ = hdbscan.approximate_predict(clusterer, train_data)
    test_labels, _ = hdbscan.approximate_predict(clusterer, test_data)

    fig, ((ax00, ax01, ax02), (ax10, ax11, ax12)) = plt.subplots(2, 3)
    fig.suptitle("HDBSCAN clustering: " + info)

    # plot umap data
    umap_clustered = (umap_labels >= 0)
    umap_coll = collections.Counter(umap_clustered)
    print("umap_clustered", umap_coll)
    # print("umap_data", umap_data.shape)
    # print("~umap_clustered", umap_clustered.count(False), ~umap_clustered)
    ax00.scatter(umap_data[~umap_clustered, 0], umap_data[~umap_clustered, 1],
                 c=(0.5, 0.5, 0.5), s=0.1, alpha=0.5)
    ax00.set_title("UMAP Outliers " + str(umap_coll[False]), fontsize=7)
    ax10.scatter(umap_data[umap_clustered, 0], umap_data[umap_clustered, 1],
                 c=umap_labels[umap_clustered], s=0.1, cmap="Spectral")
    ax10.set_title("UMAP Inliers " + str(umap_coll[True]), fontsize=7)

    # plot train data
    train_clustered = (train_labels >= 0)
    train_coll = collections.Counter(train_clustered)
    ax01.scatter(train_data[~train_clustered, 0], train_data[~train_clustered, 1],
                 c=(0.5, 0.5, 0.5), s=0.1, alpha=0.5)
    ax01.set_title("Train Outliers " + str(train_coll[False]), fontsize=7)
    ax11.scatter(train_data[train_clustered, 0], train_data[train_clustered, 1],
                 c=train_labels[train_clustered], s=0.1, cmap="Spectral")
    ax11.set_title("Train Inliers " + str(train_coll[True]), fontsize=7)

    # plot test data
    test_clustered = (test_labels >= 0)
    test_coll = collections.Counter(test_clustered)
    ax02.scatter(test_data[~test_clustered, 0], test_data[~test_clustered, 1],
                 c=(0.5, 0.5, 0.5), s=0.1, alpha=0.5)
    ax02.set_title("Test Outliers " + str(test_coll[False]), fontsize=7)
    ax12.scatter(test_data[test_clustered, 0], test_data[test_clustered, 1],
                 c=test_labels[test_clustered], s=0.1, cmap="Spectral")
    ax12.set_title("Test Inliers " + str(test_coll[True]), fontsize=7)

    plt.savefig(output_dir + "/" + info + "-hdbscan.png")
    plt.close()


def visualize_all_data_umap(dlrm, train_ld, test_ld=None, max_umap_size=50000,
                            output_dir="", umap_metric="euclidean"):
    data_ratio = 1

    # fit UMAP on one slice of the training data, then transform a disjoint
    # training slice and the test data with the fitted reducers
    print("creating umap data")
    umap_train_feat, umap_train_X, umap_train_cat, umap_train_T, umap_train_z, umap_train_c, umap_train_p = \
        create_umap_data(dlrm=dlrm, data_ld=train_ld, max_size=max_umap_size, offset=0, info="umap")

    # transform train and test data
    train_feat, train_X, train_cat, train_T, train_z, train_c, train_p = \
        create_umap_data(dlrm=dlrm, data_ld=train_ld, max_size=max_umap_size * data_ratio,
                         offset=max_umap_size, info="train")
    test_feat, test_X, test_cat, test_T, test_z, test_c, test_p = \
        create_umap_data(dlrm=dlrm, data_ld=test_ld, max_size=max_umap_size * data_ratio,
                         offset=0, info="test")

    # UMAP for the full concatenated feature vectors (dense MLP output + all embeddings)
    print("umap_train_feat", np.array(umap_train_feat).shape)
    reducer_all_feat = umap.UMAP(random_state=42, metric=umap_metric)
    umap_feat_Y = reducer_all_feat.fit_transform(umap_train_feat)
    train_feat_Y = reducer_all_feat.transform(train_feat)
    test_feat_Y = reducer_all_feat.transform(test_feat)
    visualize_umap_data(umap_Y=umap_feat_Y, umap_T=umap_train_T, umap_C=umap_train_c, umap_P=umap_train_p,
                        train_Y=train_feat_Y, train_T=train_T, train_C=train_c, train_P=train_p,
                        test_Y=test_feat_Y, test_T=test_T, test_C=test_c, test_P=test_p,
                        total_train_size=str(len(train_ld)), total_test_size=str(len(test_ld)),
                        info="all-features", output_dir=output_dir,
                        orig_space_dim=np.array(umap_train_feat).shape[1])
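    # cluster the 2-D "all-features" UMAP projection and show how the held-out
    # train and test points fall into (or outside of) those clusters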
    hdbscan_clustering(umap_data=umap_feat_Y,
                       train_data=train_feat_Y,
                       test_data=test_feat_Y,
                       info="umap-all-features",
                       output_dir=output_dir)
    # hdbscan_clustering(umap_data=np.array(umap_train_feat),
    #                    train_data=np.array(train_feat),
    #                    test_data=np.array(test_feat),
    #                    info="all-features",
    #                    output_dir=output_dir)

    # UMAP for the continuous (dense) features only
    print("umap_train_X", np.array(umap_train_X).shape)
    reducer_X = umap.UMAP(random_state=42, metric=umap_metric)
    umap_X_Y = reducer_X.fit_transform(umap_train_X)
    train_X_Y = reducer_X.transform(train_X)
    test_X_Y = reducer_X.transform(test_X)
    visualize_umap_data(umap_Y=umap_X_Y, umap_T=umap_train_T, umap_C=umap_train_c, umap_P=umap_train_p,
                        train_Y=train_X_Y, train_T=train_T, train_C=train_c, train_P=train_p,
                        test_Y=test_X_Y, test_T=test_T, test_C=test_c, test_P=test_p,
                        total_train_size=str(len(train_ld)), total_test_size=str(len(test_ld)),
                        info="cont-features", output_dir=output_dir,
                        orig_space_dim=np.array(umap_train_X).shape[1])

    # UMAP for the categorical (embedding) features only
    print("umap_train_cat", np.array(umap_train_cat).shape)
    reducer_cat = umap.UMAP(random_state=42, metric=umap_metric)
    umap_cat_Y = reducer_cat.fit_transform(umap_train_cat)
    train_cat_Y = reducer_cat.transform(train_cat)
    test_cat_Y = reducer_cat.transform(test_cat)
    visualize_umap_data(umap_Y=umap_cat_Y, umap_T=umap_train_T, umap_C=umap_train_c, umap_P=umap_train_p,
                        train_Y=train_cat_Y, train_T=train_T, train_C=train_c, train_P=train_p,
                        test_Y=test_cat_Y, test_T=test_T, test_C=test_c, test_P=test_p,
                        total_train_size=str(len(train_ld)), total_test_size=str(len(test_ld)),
                        info="cat-features", output_dir=output_dir,
                        orig_space_dim=np.array(umap_train_cat).shape[1])

    # UMAP for z data (feature interactions and every top-MLP layer output)
    for i in range(0, len(umap_train_z)):
        print("z", i, np.array(umap_train_z[i]).shape)
        reducer_z = umap.UMAP(random_state=42, metric=umap_metric)
        umap_z_Y = reducer_z.fit_transform(umap_train_z[i])
        train_z_Y = reducer_z.transform(train_z[i])
        test_z_Y = reducer_z.transform(test_z[i])
        visualize_umap_data(umap_Y=umap_z_Y, umap_T=umap_train_T, umap_C=umap_train_c, umap_P=umap_train_p,
                            train_Y=train_z_Y, train_T=train_T, train_C=train_c, train_P=train_p,
                            test_Y=test_z_Y, test_T=test_T, test_C=test_c, test_P=test_p,
                            total_train_size=str(len(train_ld)), total_test_size=str(len(test_ld)),
                            info="z-features-" + str(i), output_dir=output_dir,
                            orig_space_dim=np.array(umap_train_z[i]).shape[1])


def analyze_model_data(output_dir, dlrm, train_ld, test_ld, train_data,
                       skip_embedding=False, use_tsne=False,
                       max_umap_size=50000, max_tsne_size=10000,
                       skip_categorical_analysis=False, skip_data_plots=False,
                       umap_metric="euclidean"):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # embedding table analysis
    if skip_embedding is False:
        cat_counts = None
        cat_counts = analyse_categorical_counts(X_cat=train_data.X_cat, emb_l=dlrm.emb_l,
                                                output_dir=output_dir)
        visualize_embeddings_umap(emb_l=dlrm.emb_l,
                                  output_dir=output_dir,
                                  max_size=max_umap_size,
                                  umap_metric=umap_metric,
                                  cat_counts=cat_counts)

        if use_tsne is True:
            visualize_embeddings_tsne(emb_l=dlrm.emb_l,
                                      output_dir=output_dir,
                                      max_size=max_tsne_size)

    # data visualization and analysis
    if skip_data_plots is False:
        visualize_all_data_umap(dlrm=dlrm, train_ld=train_ld, test_ld=test_ld,
                                max_umap_size=max_umap_size, output_dir=output_dir,
                                umap_metric=umap_metric)
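    # Day-over-day categorical analysis only gives meaningful results on data
    # that has not been shuffled, hence the --data-randomize=none requirement
    # enforced below (and checked again at startup).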
    # analyse categorical variables
    if skip_categorical_analysis is False and args.data_randomize == "none":
        analyse_categorical_data(X_cat=train_data.X_cat, n_days=10, output_dir=output_dir)


if __name__ == "__main__":
    output_dir = ""

    ### parse arguments ###
    parser = argparse.ArgumentParser(
        description="Exploratory DLRM analysis"
    )

    parser.add_argument("--load-model", type=str, default="")
    parser.add_argument("--data-set", choices=["kaggle", "terabyte"], help="dataset")
    # parser.add_argument("--dataset-path", required=True, help="path to the dataset")
    parser.add_argument("--max-ind-range", type=int, default=-1)
    # parser.add_argument("--mlperf-bin-loader", action="store_true", default=False)
    parser.add_argument("--output-dir", type=str, default="")
    parser.add_argument("--skip-embedding", action="store_true", default=False)
    parser.add_argument("--umap-metric", type=str, default="euclidean")
    parser.add_argument("--skip-data-plots", action="store_true", default=False)
    parser.add_argument("--skip-categorical-analysis", action="store_true", default=False)
    # umap related
    parser.add_argument("--max-umap-size", type=int, default=50000)
    # tsne related
    parser.add_argument("--use-tsne", action="store_true", default=False)
    parser.add_argument("--max-tsne-size", type=int, default=1000)
    # data file related
    parser.add_argument("--raw-data-file", type=str, default="")
    parser.add_argument("--processed-data-file", type=str, default="")
    parser.add_argument("--data-sub-sample-rate", type=float, default=0.0)  # in [0, 1]
    parser.add_argument("--data-randomize", type=str, default="total")  # none, total or day
    parser.add_argument("--memory-map", action="store_true", default=False)
    parser.add_argument("--mini-batch-size", type=int, default=1)
    parser.add_argument("--num-workers", type=int, default=0)
    parser.add_argument("--test-mini-batch-size", type=int, default=1)
    parser.add_argument("--test-num-workers", type=int, default=0)
    parser.add_argument("--num-batches", type=int, default=0)
    # mlperf logging (disables other output and stops early)
    parser.add_argument("--mlperf-logging", action="store_true", default=False)

    args = parser.parse_args()
    print("command line args: ", json.dumps(vars(args)))

    output_dir = args.output_dir
    if output_dir == "":
        output_dir = args.data_set + "-" + os.path.split(args.load_model)[-1] + "-vis_all"
    print("output_dir:", output_dir)
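    # The architecture hyper-parameters below (sparse feature dimension m_spa,
    # embedding table sizes ln_emb, bottom/top MLP layer sizes) are hard-coded
    # for the public Criteo Kaggle and Terabyte configurations; they must match
    # the checkpoint passed via --load-model, otherwise load_state_dict will fail.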
    if args.data_set == "kaggle":
        # 1. Criteo Kaggle Display Advertisement Challenge Dataset (see ./bench/dlrm_s_criteo_kaggle.sh)
        m_spa = 16
        ln_emb = np.array([1460, 583, 10131227, 2202608, 305, 24, 12517, 633, 3, 93145, 5683,
                           8351593, 3194, 27, 14992, 5461306, 10, 5652, 2173, 4, 7046547, 18,
                           15, 286181, 105, 142572])
        ln_bot = np.array([13, 512, 256, 64, 16])
        ln_top = np.array([367, 512, 256, 1])
    elif args.data_set == "terabyte":
        if args.max_ind_range == 10000000:
            # 2. Criteo Terabyte (see ./bench/dlrm_s_criteo_terabyte.sh [--sub-sample=0.875] --max-ind-range=10000000)
            m_spa = 64
            ln_emb = np.array([9980333, 36084, 17217, 7378, 20134, 3, 7112, 1442, 61,
                               9758201, 1333352, 313829, 10, 2208, 11156, 122, 4, 970, 14,
                               9994222, 7267859, 9946608, 415421, 12420, 101, 36])
            ln_bot = np.array([13, 512, 256, 64])
            ln_top = np.array([415, 512, 512, 256, 1])
        elif args.max_ind_range == 40000000:
            # 3. Criteo Terabyte MLPerf training (see ./bench/run_and_time.sh --max-ind-range=40000000)
            m_spa = 128
            ln_emb = np.array([39884406, 39043, 17289, 7420, 20263, 3, 7120, 1543, 63,
                               38532951, 2953546, 403346, 10, 2208, 11938, 155, 4, 976, 14,
                               39979771, 25641295, 39664984, 585935, 12972, 108, 36])
            ln_bot = np.array([13, 512, 256, 128])
            ln_top = np.array([479, 1024, 1024, 512, 256, 1])
        else:
            raise ValueError("only --max-ind-range 10M or 40M is supported")
    else:
        raise ValueError("only kaggle|terabyte dataset options are supported")

    # check input parameters
    if args.data_randomize != "none" and args.skip_categorical_analysis is not True:
        print("Incorrect option for categorical analysis, use: --data-randomize=none")
        sys.exit(-1)

    dlrm = DLRM_Net(
        m_spa,
        ln_emb,
        ln_bot,
        ln_top,
        arch_interaction_op="dot",
        arch_interaction_itself=False,
        sigmoid_bot=-1,
        sigmoid_top=ln_top.size - 2,
        sync_dense_params=True,
        loss_threshold=0.0,
        ndevices=-1,
        qr_flag=False,
        qr_operation=None,
        qr_collisions=None,
        qr_threshold=None,
        md_flag=False,
        md_threshold=None,
    )

    # load model if specified
    if not (args.load_model == ""):
        print("Loading saved model {}".format(args.load_model))
        ld_model = torch.load(args.load_model, map_location=torch.device("cpu"))
        dlrm.load_state_dict(ld_model["state_dict"])
        print("Model loaded", args.load_model)
        # print(dlrm)

    z_size = len(dlrm.top_l)
    for i in range(0, z_size):
        print("z", i, dlrm.top_l[i])

    # load data
    train_data = None
    test_data = None
    if args.raw_data_file != "" or args.processed_data_file != "":
        train_data, train_ld, test_data, test_ld = dp.make_criteo_data_and_loaders(args)

    analyze_model_data(output_dir=output_dir,
                       dlrm=dlrm,
                       train_ld=train_ld,
                       test_ld=test_ld,
                       train_data=train_data,
                       skip_embedding=args.skip_embedding,
                       use_tsne=args.use_tsne,
                       max_umap_size=args.max_umap_size,
                       max_tsne_size=args.max_tsne_size,
                       skip_categorical_analysis=args.skip_categorical_analysis,
                       skip_data_plots=args.skip_data_plots,
                       umap_metric=args.umap_metric)