def final_preprocess()

in scripts/preprocess.py [0:0]
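
Concatenates the ten preprocessed parts of the selected dataset, shuffles each part, builds a compressed id space for every sparse feature (with id 0 standing for missing values), and writes the per-feature hash sizes and the final arrays to disk.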


# Module-level context from scripts/preprocess.py: `args` holds the parsed
# command-line arguments and compress_ids is defined elsewhere in the script.
import os

import numpy as np


def final_preprocess(datafile):
    X_int = []           # dense (integer) features, one array per part
    X_cat = []           # sparse (categorical) features, one array per part
    y = []               # labels, one array per part
    missing_sparse = []  # sparse value columns used to detect missing entries


    # Per-dataset column counts (dense, sparse) and input file names.
    if args.dataset_name == "criteo":
        num_dense, num_sparse = 13, 26
        TRAIN_X = "train_x2.npy"
    elif args.dataset_name == "avazu":
        num_dense, num_sparse = 0, 23
        TRAIN_X = "train_x.npy"
    elif args.dataset_name == "kdd":
        num_dense, num_sparse = 3, 10
        TRAIN_X = "train_x2.npy"
    else:
        raise ValueError("unknown dataset: " + args.dataset_name)

    TRAIN_Y = "train_y.npy"
    TRAIN_I = "train_i.npy"

    # Load all ten parts; the order is hard-coded but covers the same
    # parts as range(1, 11).  # todo
    for i in [3, 4, 5, 6, 7, 8, 9, 10, 2, 1]:
        # f: categorical feature indices, g: feature values, h: labels.
        f = np.load(os.path.join(datafile, "part" + str(i), TRAIN_I), "r", allow_pickle=True)
        g = np.load(os.path.join(datafile, "part" + str(i), TRAIN_X), "r", allow_pickle=True)
        h = np.load(os.path.join(datafile, "part" + str(i), TRAIN_Y), "r", allow_pickle=True)

        X_int_split = np.array(g[:, 0:num_dense])  # dense feature values
        X_cat_split = np.array(f[:, num_dense:])   # raw categorical ids
        y_split = h
        # Sparse value columns; a 0 marks a missing entry. Sliced from
        # num_dense onward so that column i lines up with X_cat[:, i].
        missing_sparse_split = np.array(g[:, num_dense:])

        indices = np.arange(len(y_split))
        indices = np.random.permutation(indices)

        # shuffle data
        X_cat_split = X_cat_split[indices]
        X_int_split = X_int_split[indices]
        y_split = y_split[indices].astype(np.float32)
        missing_sparse_split = missing_sparse_split[indices]

        X_int.append(X_int_split)
        X_cat.append(X_cat_split)
        y.append(y_split)
        missing_sparse.append(missing_sparse_split)

    X_int = np.concatenate(X_int)
    X_cat = np.concatenate(X_cat)
    y = np.concatenate(y)
    missing_sparse = np.concatenate(missing_sparse)

    print("expected feature size", X_cat.max() + 1)

    flat = X_cat.flatten()

    fset = set(flat)
    print("expected size", len(fset))


    # For each sparse feature, find the raw id that encodes "missing" (the id
    # appearing wherever the value column is 0) and pin it to new id 0;
    # features with no missing entries get no such map.
    missing_sparse_maps = []
    for i in range(num_sparse):
        missing_slice = missing_sparse[:, i]
        if 0 in missing_slice:
            locs = np.where(missing_slice == 0)[0]
            missing_sparse_maps.append({X_cat[locs[0], i]: 0})
        else:
            missing_sparse_maps.append(None)

    raw_to_new_ids = []
    for i in range(X_cat.shape[1]):
        print("compressing the ids for the {}-th feature.".format(i))
        raw_to_new_ids.append(compress_ids(X_cat[:, i], missing_sparse_maps[i]))


    total = 0
    hashsizes = []
    for i in range(len(raw_to_new_ids)):
        # The +1 accounts for id 0 (reserved for missing values).
        hashsize = max(raw_to_new_ids[i].values()) + 1
        hashsizes.append(hashsize)
        print("sparse_" + str(i), "\t", hashsize)
        total += hashsize
    print("total feature size", total)


    if args.dataset_name == "criteo":
        hashsize_filename = "criteo_hashsizes.npy"
        finaldata_filename = "criteo_processed.npz"
    elif args.dataset_name == "avazu":
        hashsize_filename = "avazu_hashsizes.npy"
        finaldata_filename = "avazu_processed.npz"
    elif args.dataset_name == "kdd":
        hashsize_filename = "kdd2012_hashsizes.npy"
        finaldata_filename = "kdd2012_processed.npz"
    # Persist the per-feature hash sizes and the final dataset.
    np.save(os.path.join(datafile, hashsize_filename), np.array(hashsizes))
    np.savez_compressed(os.path.join(datafile, finaldata_filename), X_int=X_int, X_cat=X_cat, y=y)
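
compress_ids is defined elsewhere in scripts/preprocess.py and is not shown in this section. From its call sites above, it takes one raw id column plus the optional {raw_id: 0} missing-value map, and its returned raw-to-new dict determines the hash sizes; since X_cat is saved directly afterwards, it plausibly also remaps the column in place. A minimal sketch under those assumptions (the name comes from the source; the body is a guess, not the repository's actual implementation):


def compress_ids(column, missing_map):
    """Remap the raw ids in `column` (in place) to contiguous new ids.

    If missing_map is a {raw_id: 0} dict, that raw id keeps new id 0;
    every other raw id gets 1, 2, ... in order of first appearance.
    Returns the complete raw -> new dict.
    """
    raw_to_new = dict(missing_map) if missing_map is not None else {}
    next_id = 1 if missing_map is not None else 0
    for raw in column:
        if raw not in raw_to_new:
            raw_to_new[raw] = next_id
            next_id += 1
    # column is a view into X_cat, so writing here updates the caller's array.
    for j in range(len(column)):
        column[j] = raw_to_new[column[j]]
    return raw_to_new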
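
A quick way to sanity-check the saved artifacts (criteo file names shown; the datafile path is illustrative, and the final assertion assumes the ids were compressed in place as sketched above):


import os

import numpy as np

datafile = "/path/to/criteo"  # illustrative: the directory passed to final_preprocess

hashsizes = np.load(os.path.join(datafile, "criteo_hashsizes.npy"))
data = np.load(os.path.join(datafile, "criteo_processed.npz"))
X_int, X_cat, y = data["X_int"], data["X_cat"], data["y"]

print(X_int.shape, X_cat.shape, y.shape)  # e.g. (N, 13) and (N, 26) for criteo

# Every stored id must fit inside its feature's embedding table.
for i, size in enumerate(hashsizes):
    assert X_cat[:, i].max() < size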