in scripts/preprocess.py [0:0]
import os

import numpy as np

# `args` (the script-level argparse namespace) and `compress_ids` are defined
# elsewhere in scripts/preprocess.py.


def final_preprocess(datafile):
    X_int = []
    X_cat = []
    y = []
    missing_sparse = []
    # Per-dataset layout: number of dense and sparse features, and the
    # filename of the raw-feature-value matrix.
    if args.dataset_name == "criteo":
        num_dense, num_sparse = 13, 26
        TRAIN_X = "train_x2.npy"
    elif args.dataset_name == "avazu":
        num_dense, num_sparse = 0, 23
        TRAIN_X = "train_x.npy"
    elif args.dataset_name == "kdd":
        num_dense, num_sparse = 3, 10
        TRAIN_X = "train_x2.npy"
    else:
        raise ValueError("unknown dataset: {}".format(args.dataset_name))
    TRAIN_Y = "train_y.npy"
    TRAIN_I = "train_i.npy"
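    # Each shard lives in <datafile>/part<i>/ and holds three row-aligned
    # matrices: TRAIN_I (categorical feature ids), TRAIN_X (raw feature
    # values), and TRAIN_Y (labels).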
    # The shards are iterated in this ad-hoc order; the original TODO and the
    # commented-out range(1, 11) suggest the order is not meant to matter,
    # since every shard is shuffled and then concatenated anyway.
    for i in [3, 4, 5, 6, 7, 8, 9, 10, 2, 1]:  # TODO: range(1, 11)
        part = os.path.join(datafile, "part" + str(i))
        f = np.load(os.path.join(part, TRAIN_I), "r", allow_pickle=True)
        g = np.load(os.path.join(part, TRAIN_X), "r", allow_pickle=True)
        h = np.load(os.path.join(part, TRAIN_Y), "r", allow_pickle=True)
        # Dense features come from the raw-value matrix, categorical features
        # from the id matrix; the first num_dense columns are dense.
        X_int_split = np.array(g[:, 0:num_dense])
        X_cat_split = np.array(f[:, num_dense:])
        y_split = h
        # Raw values of the sparse columns; a raw value of 0 marks a missing
        # entry. (The original sliced g[:, 0:], which misaligns these columns
        # with X_cat whenever num_dense > 0.)
        missing_sparse_split = np.array(g[:, num_dense:])
        # Shuffle the rows of this shard with one shared permutation.
        indices = np.random.permutation(len(y_split))
        X_cat_split = X_cat_split[indices]
        X_int_split = X_int_split[indices]
        y_split = y_split[indices].astype(np.float32)
        missing_sparse_split = missing_sparse_split[indices]
        X_int.append(X_int_split)
        X_cat.append(X_cat_split)
        y.append(y_split)
        missing_sparse.append(missing_sparse_split)
    X_int = np.concatenate(X_int)
    X_cat = np.concatenate(X_cat)
    y = np.concatenate(y)
    missing_sparse = np.concatenate(missing_sparse)
    # Sanity checks: the largest raw id bounds the feature space, and the
    # number of distinct ids is the size the compressed id space should reach.
    print("expected feature size", X_cat.max() + 1)
    flat = X_cat.flatten()
    fset = set(flat)
    print("expected size", len(fset))
    # For each sparse feature, find the raw id that represents "missing"
    # (raw value 0) so compress_ids can pin it to new id 0.
    missing_sparse_maps = []
    for i in range(num_sparse):
        missing_slice = missing_sparse[:, i]
        if 0 in missing_slice:
            locs = np.where(missing_slice == 0)[0]
            missing_sparse_maps.append({X_cat[locs[0], i]: 0})
        else:
            missing_sparse_maps.append(None)
    # Remap each feature's raw ids to a contiguous id space.
    raw_to_new_ids = []
    for i in range(X_cat.shape[1]):
        print("compressing the ids for the {}-th feature.".format(i))
        raw_to_new_ids.append(compress_ids(X_cat[:, i], missing_sparse_maps[i]))
    # Report the vocabulary ("hash") size of each sparse feature.
    total = 0
    hashsizes = []
    for i in range(len(raw_to_new_ids)):
        hashsize = max(raw_to_new_ids[i].values()) + 1  # +1 since new ids start at 0
        hashsizes.append(hashsize)
        print("sparse_" + str(i), "\t", hashsize)
        total += hashsize
    print("total feature size", total)
    # Choose output filenames per dataset and persist the per-feature
    # vocabulary sizes plus the processed arrays.
    if args.dataset_name == "criteo":
        hashsize_filename = "criteo_hashsizes.npy"
        finaldata_filename = "criteo_processed.npz"
    elif args.dataset_name == "avazu":
        hashsize_filename = "avazu_hashsizes.npy"
        finaldata_filename = "avazu_processed.npz"
    elif args.dataset_name == "kdd":
        hashsize_filename = "kdd2012_hashsizes.npy"
        finaldata_filename = "kdd2012_processed.npz"
    np.save(os.path.join(datafile, hashsize_filename), np.array(hashsizes))
    np.savez_compressed(os.path.join(datafile, finaldata_filename), X_int=X_int, X_cat=X_cat, y=y)
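

# ---------------------------------------------------------------------------
# `compress_ids` is not shown in this excerpt. The sketch below is only an
# assumption inferred from how it is used above: it appears to remap one
# categorical column to contiguous ids in place (X_cat[:, i] is a writable
# view into the concatenated array) and to return the raw-id -> new-id
# mapping. When a missing_map ({raw_missing_id: 0}) is given, new id 0 stays
# reserved for the missing value.
# ---------------------------------------------------------------------------
def compress_ids(column, missing_map=None):
    # Hypothetical reconstruction, not the repository's actual implementation.
    raw_to_new = dict(missing_map) if missing_map else {}
    next_id = 1 if missing_map else 0  # id 0 is taken by the missing value, if any
    for j, raw in enumerate(column):
        new_id = raw_to_new.get(raw)
        if new_id is None:
            new_id = next_id
            raw_to_new[raw] = new_id
            next_id += 1
        column[j] = new_id  # rewrite the column in place
    return raw_to_new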