in data_utils.py [0:0]
def processCriteoAdData(d_path, d_file, npzfile, i, convertDicts, pre_comp_counts):
# Process Kaggle Display Advertising Challenge or Terabyte Dataset
# by converting unicode strings in X_cat to integers and
# converting negative integer values in X_int.
#
# Loads data in the form "{kaggle|terabyte}_day_i.npz" where i is the day.
#
# Inputs:
# d_path (str): path for {kaggle|terabyte}_day_i.npz files
# i (int): splits in the dataset (typically 0 to 7 or 0 to 24)
# process data if not all files exist
filename_i = npzfile + "_{0}_processed.npz".format(i)
if path.exists(filename_i):
print("Using existing " + filename_i, end="\n")
else:
print("Not existing " + filename_i)
with np.load(npzfile + "_{0}.npz".format(i)) as data:
# categorical features
'''
# Approach 1a: using empty dictionaries
X_cat, convertDicts, counts = convertUStringToDistinctIntsDict(
data["X_cat"], convertDicts, counts
)
'''
'''
# Approach 1b: using empty np.unique
X_cat, convertDicts, counts = convertUStringToDistinctIntsUnique(
data["X_cat"], convertDicts, counts
)
'''
# Approach 2a: using pre-computed dictionaries
X_cat_t = np.zeros(data["X_cat_t"].shape)
for j in range(26):
for k, x in enumerate(data["X_cat_t"][j, :]):
X_cat_t[j, k] = convertDicts[j][x]
# continuous features
X_int = data["X_int"]
X_int[X_int < 0] = 0
# targets
y = data["y"]
np.savez_compressed(
filename_i,
# X_cat = X_cat,
X_cat=np.transpose(X_cat_t), # transpose of the data
X_int=X_int,
y=y,
)
print("Processed " + filename_i, end="\n")
# sanity check (applicable only if counts have been pre-computed & are re-computed)
# for j in range(26):
# if pre_comp_counts[j] != counts[j]:
# sys.exit("ERROR: Sanity check on counts has failed")
# print("\nSanity check on counts passed")
return