in tbsm_data_pytorch.py [0:0]
def build_taobao_train_or_val(self, raw_path, out_file):
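    """Build train or validation time-series datapoints from a preprocessed
    Taobao user-behavior file.

    Reads tab-separated records from raw_path, samples up to points_per_user
    positive and points_per_user negative fixed-length series per user, and
    writes the result to out_file via truncate_and_save.
    """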
    with open(str(raw_path)) as f:
        for i, _ in enumerate(f):
            if i % 50000 == 0:
                print("pre-processing line: ", i)
    self.total = min(self.total, i + 1)
    print("total lines: ", self.total)
    self.total_out = self.total * self.points_per_user * 2  # pos + neg points
    print("Total number of points in raw datafile: ", self.total)
    print("Total number of points in output will be at most: ", self.total_out)
    np.random.seed(self.numpy_rand_seed)
    r_target = np.arange(0, self.M - 1)
    time = np.arange(self.ts_length + 1, dtype=np.int32) / (self.ts_length + 1)
    # time = np.ones(self.ts_length + 1, dtype=np.int32)
    users = np.zeros((self.total_out, self.ts_length + 1), dtype="i4")  # 4 byte int
    items = np.zeros((self.total_out, self.ts_length + 1), dtype="i4")  # 4 byte int
    cats = np.zeros((self.total_out, self.ts_length + 1), dtype="i4")   # 4 byte int
    times = np.zeros((self.total_out, self.ts_length + 1), dtype=np.float64)
    y = np.zeros(self.total_out, dtype="i4")  # 4 byte int
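    # each output row is one time series: ts_length history positions plus the
    # final (target) position; total_out rows leave room for up to
    # points_per_user positive and points_per_user negative series per input line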
    # determine how many datapoints to take from each user based on the length
    # of the user behavior sequence
    # ind = 0, 1, 2, 3, ...  corresponds to history length t < 10, 20, 30, 40, 50, 60, ...
    k = 20
    regime = np.zeros(k, dtype=np.int64)
    regime[1], regime[2], regime[3] = 1, 3, 6
    for j in range(4, k):
        regime[j] = self.points_per_user
    if self.mode == "val":
        self.points_per_user = 1
    for j in range(k):
        regime[j] = np.min([regime[j], self.points_per_user])
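    # e.g. with points_per_user = 10 this gives
    # regime = [0, 1, 3, 6, 10, 10, ..., 10]: users with very short histories
    # contribute few (or no) series, users with long histories contribute up to
    # points_per_user series of each label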
    last = self.M - 1  # max index of the last item

    # try to generate the desired number of points (time series) for each user.
    # if the history is short, it may not be possible to generate sufficiently
    # different time series for a particular user.
    t, t_pos, t_neg, t_short = 0, 0, 0, 0
    with open(str(raw_path)) as f:
        for i, line in enumerate(f):
            if i % 1000 == 0:
                print("processing line: ", i, t, t_pos, t_neg, t_short)
            if i >= self.total:
                break
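            # expected record layout (tab-separated): units[0] is the user id,
            # units[4]/units[5] are the item/category histories and
            # units[6]/units[7] the sampled negative item/category histories,
            # each a comma-separated list, left-padded with zeros for short
            # histories; the remaining fields are not used here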
            units = line.strip().split("\t")
            item_hist_list = units[4].split(",")
            cate_hist_list = units[5].split(",")
            neg_item_hist_list = units[6].split(",")
            neg_cate_hist_list = units[7].split(",")
            user = np.array(np.maximum(np.int32(units[0]) - self.Inum, 0),
                            dtype=np.int32)
            # y[i] = np.int32(units[3])
            items_ = np.array(
                list(map(lambda x: np.maximum(np.int32(x), 0), item_hist_list)),
                dtype=np.int32
            )
            cats_ = np.array(
                list(map(lambda x: np.maximum(np.int32(x) - self.Inum - self.Unum, 0),
                         cate_hist_list)),
                dtype=np.int32
            )
            neg_items_ = np.array(
                list(map(lambda x: np.maximum(np.int32(x), 0), neg_item_hist_list)),
                dtype=np.int32
            )
            neg_cats_ = np.array(
                list(map(lambda x: np.maximum(np.int32(x) - self.Inum - self.Unum, 0),
                         neg_cate_hist_list)),
                dtype=np.int32
            )
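            # the raw ids appear to share one contiguous index space: items first,
            # then users (offset by Inum), then categories (offset by Inum + Unum);
            # subtracting the offsets and clamping at 0 re-bases each feature so
            # that padding entries map to index 0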
            # select datapoints
            first = np.argmax(items_ > 0)
            ind = int((last - first) // 10)  # index into regime array
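            # (last - first) is the length of the non-padded part of the history;
            # dividing by 10 picks the bucket in the regime table above, e.g. a
            # user with roughly 35 real items lands in bucket ind = 3 and
            # contributes up to regime[3] series of each label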
            # pos
            for _ in range(regime[ind]):
                a1 = min(first + self.ts_length, last - 1)
                end = np.random.randint(a1, last)
                indices = np.arange(end - self.ts_length, end + 1)
                if items_[indices[0]] == 0:
                    t_short += 1
                items[t] = items_[indices]
                cats[t] = cats_[indices]
                users[t] = np.full(self.ts_length + 1, user)
                times[t] = time
                y[t] = 1
                # check
                if np.any(users[t] < 0) or np.any(items[t] < 0) \
                        or np.any(cats[t] < 0):
                    sys.exit("Categorical feature less than zero after "
                             "processing. Aborting...")
                t += 1
                t_pos += 1
            # neg
            for _ in range(regime[ind]):
                a1 = min(first + self.ts_length - 1, last - 1)
                end = np.random.randint(a1, last)
                indices = np.arange(end - self.ts_length + 1, end + 1)
                if items_[indices[0]] == 0:
                    t_short += 1
                items[t, :-1] = items_[indices]
                cats[t, :-1] = cats_[indices]
                # random final item drawn from the user's negative history
                neg_indices = np.random.choice(r_target, 1, replace=False)
                items[t, -1] = neg_items_[neg_indices]
                cats[t, -1] = neg_cats_[neg_indices]
                users[t] = np.full(self.ts_length + 1, user)
                times[t] = time
                y[t] = 0
                # check
                if np.any(users[t] < 0) or np.any(items[t] < 0) \
                        or np.any(cats[t] < 0):
                    sys.exit("Categorical feature less than zero after "
                             "processing. Aborting...")
                t += 1
                t_neg += 1
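            # summary: a positive series takes ts_length + 1 consecutive positions
            # from the user's own history (y = 1); a negative series takes
            # ts_length consecutive positions and replaces the final (target)
            # position with a random entry from the user's negative-sample
            # history (y = 0)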
    print("total points, pos points, neg points: ", t, t_pos, t_neg)
    self.truncate_and_save(out_file, True, t, users, items, cats, times, y)

    return