def processCriteoAdData()

in data_utils.py [0:0]


def processCriteoAdData(d_path, d_file, npzfile, i, convertDicts, pre_comp_counts):
    # Process one split of the Kaggle Display Advertising Challenge or Terabyte
    # Dataset by mapping the raw categorical values in X_cat_t to integer ids
    # (via pre-computed dictionaries) and clamping negative values in X_int to 0.
    #
    # Reads  "<npzfile>_<i>.npz"            (keys: "X_cat_t", "X_int", "y")
    # Writes "<npzfile>_<i>_processed.npz"  (keys: "X_cat", "X_int", "y")
    # If the processed file already exists, nothing is read or written.
    #
    # Inputs:
    #   d_path (str): path for {kaggle|terabyte}_day_i.npz files
    #                 (not used here; kept so existing callers keep working)
    #   d_file: not used here; kept so existing callers keep working
    #   npzfile (str): file name prefix, the input/output names are built from it
    #   i (int): splits in the dataset (typically 0 to 7 or 0 to 24)
    #   convertDicts (sequence of dict): convertDicts[j] maps every raw value
    #                 found in row j of X_cat_t to its integer id
    #   pre_comp_counts: not used here (the count sanity check is disabled)
    #
    # Outputs:
    #   None. The result is the processed file on disk. "X_cat" is stored as the
    #   transpose of the converted X_cat_t and has numpy's default float dtype,
    #   because the buffer is allocated with np.zeros(shape).
    #
    # Raises:
    #   KeyError if a raw categorical value is missing from its dictionary.

    filename_i = npzfile + "_{0}_processed.npz".format(i)

    # nothing to do when this split has already been processed
    if path.exists(filename_i):
        print("Using existing " + filename_i, end="\n")
        return

    print("Not existing " + filename_i)
    with np.load(npzfile + "_{0}.npz".format(i)) as data:
        # categorical features (using pre-computed dictionaries)
        # Read the array from the archive once: np.load returns a lazily
        # loading handle, so indexing it inside the loop would re-read the
        # whole array on every iteration.
        raw_cat_t = data["X_cat_t"]
        X_cat_t = np.zeros(raw_cat_t.shape)
        # iterate over the rows that are actually present instead of assuming
        # a fixed number of categorical features
        for j, row in enumerate(raw_cat_t):
            lookup = convertDicts[j]
            X_cat_t[j, :] = [lookup[x] for x in row]
        # continuous features
        X_int = data["X_int"]
        X_int[X_int < 0] = 0
        # targets
        y = data["y"]

    np.savez_compressed(
        filename_i,
        X_cat=np.transpose(X_cat_t),  # transpose of the data
        X_int=X_int,
        y=y,
    )
    print("Processed " + filename_i, end="\n")
    # NOTE: counts are not re-computed here, so no sanity check against
    # pre_comp_counts is performed.

    return