def build_taobao_test()

in tbsm_data_pytorch.py [0:0]


    def build_taobao_test(self, raw_path, out_file):
        """Convert the raw Taobao test file into fixed-length time series.

        Every line of ``raw_path`` is a tab-separated record. Only four
        fields are read: field 0 (user id), field 3 (label), field 4
        (comma-separated item history) and field 5 (comma-separated category
        history). The last ``self.ts_length + 1`` history entries of a record
        form one output point. User ids are shifted down by ``self.Inum``,
        category ids by ``self.Inum + self.Unum``, and every id is clamped at
        zero.

        Sets ``self.total`` and ``self.total_out`` to the number of lines in
        the raw file (zero for an empty file) and passes the filled arrays to
        ``self.truncate_and_save``.

        Args:
            raw_path: path of the raw test file.
            out_file: output location, forwarded to ``self.truncate_and_save``.

        Raises:
            SystemExit: if a negative categorical id is found after processing.
        """
        # First pass: count lines so the output arrays can be pre-allocated.
        # Starting at -1 keeps the total well defined (0) for an empty file.
        last_index = -1
        with open(str(raw_path)) as f:
            for last_index, _ in enumerate(f):
                if last_index % 50000 == 0:
                    print("pre-processing line: ", last_index)
        self.total = last_index + 1

        self.total_out = self.total  # pos + neg points
        print("ts_length: ", self.ts_length)
        print("Total number of points in raw datafile: ", self.total)
        print("Total number of points in output will be at most: ", self.total_out)

        seq_len = self.ts_length + 1
        # Evenly spaced values in [0, 1), shared by every point.  Named
        # ``time_grid`` so it cannot shadow a module-level ``time`` import.
        time_grid = np.arange(seq_len, dtype=np.int32) / seq_len

        users = np.zeros((self.total_out, seq_len), dtype=np.int32)
        items = np.zeros((self.total_out, seq_len), dtype=np.int32)
        cats = np.zeros((self.total_out, seq_len), dtype=np.int32)
        # np.float64 is the dtype the removed ``np.float`` alias resolved to.
        times = np.zeros((self.total_out, seq_len), dtype=np.float64)
        y = np.zeros(self.total_out, dtype=np.int32)

        # Second pass: one output point per raw line.
        t, t_pos, t_neg = 0, 0, 0
        with open(str(raw_path)) as f:
            for i, line in enumerate(f):
                if i % 1000 == 0:
                    print("processing line: ", i, t, t_pos, t_neg)
                # Guard against the file having grown since it was counted;
                # the arrays only have room for ``self.total`` rows.
                if i >= self.total:
                    break
                units = line.strip().split("\t")
                item_hist_list = units[4].split(",")
                cate_hist_list = units[5].split(",")

                user = np.array(
                    np.maximum(np.int32(units[0]) - self.Inum, 0), dtype=np.int32
                )
                y[t] = np.int32(units[3])
                items_ = np.array(
                    [np.maximum(np.int32(x), 0) for x in item_hist_list],
                    dtype=np.int32,
                )
                cats_ = np.array(
                    [
                        np.maximum(np.int32(x) - self.Inum - self.Unum, 0)
                        for x in cate_hist_list
                    ],
                    dtype=np.int32,
                )

                # Keep only the most recent ``seq_len`` history entries.
                # NOTE(review): a history shorter than ``seq_len`` makes the
                # row assignment fail (or broadcast when it has exactly one
                # entry) - confirm the raw data always has enough entries.
                items[t] = items_[-seq_len:]
                cats[t] = cats_[-seq_len:]
                users[t] = user  # same user id repeated across the whole row
                times[t] = time_grid

                # Sanity check on the processed ids.
                if (
                    np.any(users[t] < 0)
                    or np.any(items[t] < 0)
                    or np.any(cats[t] < 0)
                ):
                    sys.exit(
                        "Categorical feature less than zero after "
                        "processing. Aborting..."
                    )
                if y[t] == 1:
                    t_pos += 1
                else:
                    t_neg += 1
                t += 1

        print("total points, pos points, neg points: ", t, t_pos, t_neg)

        self.truncate_and_save(out_file, False, t, users, items, cats, times, y)