in tbsm_data_pytorch.py [0:0]
def build_taobao_test(self, raw_path, out_file):
    """Convert a raw tab-separated test file into fixed-length time series.

    Each input line is split on tabs; the fields used are:
      * field 0 - user id (shifted down by ``self.Inum``),
      * field 3 - integer label (1 is counted as positive, else negative),
      * field 4 - comma-separated item history,
      * field 5 - comma-separated category history (shifted down by
        ``self.Inum + self.Unum``).
    All shifted ids are clamped at zero. Only the last ``self.ts_length + 1``
    history entries of every line are kept.

    Side effects: sets ``self.total`` and ``self.total_out`` to the number of
    lines in the file, prints progress, and hands the assembled arrays to
    ``self.truncate_and_save(out_file, False, t, users, items, cats, times, y)``.

    Args:
        raw_path: path of the raw text file (anything ``str()`` can convert).
        out_file: output destination, forwarded to ``truncate_and_save``.

    NOTE(review): a history with fewer than ``self.ts_length + 1`` entries
    cannot fill a row (NumPy raises ValueError, or silently broadcasts a
    single entry) - presumably the raw data guarantees long enough histories;
    confirm against the data producer.
    """
    seq_len = self.ts_length + 1

    # First pass: count the lines so the output arrays can be pre-allocated.
    # A dedicated counter (instead of reusing the loop variable afterwards)
    # keeps an empty file from failing with an unbound name.
    n_lines = 0
    with open(str(raw_path)) as f:
        for i, _ in enumerate(f):
            if i % 50000 == 0:
                print("pre-processing line: ", i)
            n_lines = i + 1
    self.total = n_lines
    self.total_out = self.total  # pos + neg points
    print("ts_length: ", self.ts_length)
    print("Total number of points in raw datafile: ", self.total)
    print("Total number of points in output will be at most: ", self.total_out)

    # Evenly spaced time stamps in [0, 1), shared by every row.
    time_grid = np.arange(seq_len, dtype=np.int32) / seq_len
    users = np.zeros((self.total_out, seq_len), dtype="i4")  # 4 byte int
    items = np.zeros((self.total_out, seq_len), dtype="i4")  # 4 byte int
    cats = np.zeros((self.total_out, seq_len), dtype="i4")  # 4 byte int
    # np.float64 rather than the np.float alias, which NumPy >= 1.24 removed.
    times = np.zeros((self.total_out, seq_len), dtype=np.float64)
    y = np.zeros(self.total_out, dtype="i4")  # 4 byte int

    cat_offset = self.Inum + self.Unum

    # Second pass: one output row per input line.
    t, t_pos, t_neg = 0, 0, 0
    with open(str(raw_path)) as f:
        for i, line in enumerate(f):
            if i % 1000 == 0:
                print("processing line: ", i, t, t_pos, t_neg)
            # Guard against the file having grown since the counting pass;
            # the arrays only have room for self.total rows.
            if i >= self.total:
                break
            units = line.strip().split("\t")
            item_hist_list = units[4].split(",")
            cate_hist_list = units[5].split(",")

            user = np.int32(np.maximum(np.int32(units[0]) - self.Inum, 0))
            y[t] = np.int32(units[3])
            items_ = np.maximum(
                np.array([np.int32(x) for x in item_hist_list], dtype=np.int32),
                0,
            )
            cats_ = np.maximum(
                np.array([np.int32(x) for x in cate_hist_list], dtype=np.int32)
                - cat_offset,
                0,
            )

            # Keep only the most recent seq_len entries of each history.
            items[t] = items_[-seq_len:]
            cats[t] = cats_[-seq_len:]
            users[t] = user  # scalar is broadcast across the whole row
            times[t] = time_grid

            # Defensive sanity check: every id was clamped at zero above.
            if np.any(users[t] < 0) or np.any(items[t] < 0) \
                    or np.any(cats[t] < 0):
                sys.exit("Categorical feature less than zero after "
                         "processing. Aborting...")

            if y[t] == 1:
                t_pos += 1
            else:
                t_neg += 1
            t += 1

    print("total points, pos points, neg points: ", t, t_pos, t_neg)
    self.truncate_and_save(out_file, False, t, users, items, cats, times, y)
    return