def build_taobao_train_or_val()

in tbsm_data_pytorch.py [0:0]


    def build_taobao_train_or_val(self, raw_path, out_file):
        """Sample positive/negative time-series points from a raw file and save them.

        Each line of ``raw_path`` is split on tabs. Field 0 is the user id and
        fields 4, 5, 6, 7 are comma-separated histories of items, categories,
        negative items and negative categories respectively. User ids are
        shifted by ``self.Inum``, category ids by ``self.Inum + self.Unum``,
        and every id is clamped at zero.

        For each line a number of windows of ``self.ts_length + 1`` consecutive
        history entries is drawn at random (label 1). For every positive point
        a negative point is produced from ``self.ts_length`` history entries
        plus one randomly chosen entry of the negative history (label 0).
        The number of windows per line depends on the length of the non-zero
        part of the item history (see ``regime`` below).

        Side effects: updates ``self.total`` and ``self.total_out``, reseeds
        the global NumPy RNG with ``self.numpy_rand_seed`` and, when
        ``self.mode == "val"``, sets ``self.points_per_user`` to 1.

        Args:
            raw_path: path of the raw input file (converted with ``str``).
            out_file: forwarded unchanged to ``self.truncate_and_save``.
        """
        # First pass: count the lines. Starting from 0 keeps an empty file
        # from leaving the counter unbound.
        num_lines = 0
        with open(str(raw_path)) as f:
            for num_lines, _ in enumerate(f, start=1):
                if (num_lines - 1) % 50000 == 0:
                    print("pre-processing line: ", num_lines - 1)
        self.total = min(self.total, num_lines)

        print("total lines: ", self.total)

        self.total_out = self.total * self.points_per_user * 2  # pos + neg points
        print("Total number of points in raw datafile: ", self.total)
        print("Total number of points in output will be at most: ", self.total_out)

        np.random.seed(self.numpy_rand_seed)

        seq_len = self.ts_length + 1
        r_target = np.arange(0, self.M - 1)

        # normalized position of every step inside a window: 0, 1/n, 2/n, ...
        time_axis = np.arange(seq_len, dtype=np.int32) / seq_len

        users = np.zeros((self.total_out, seq_len), dtype="i4")  # 4 byte int
        items = np.zeros((self.total_out, seq_len), dtype="i4")  # 4 byte int
        cats = np.zeros((self.total_out, seq_len), dtype="i4")  # 4 byte int
        # builtin float/int replace the np.float/np.int aliases, which were
        # removed from NumPy; the resulting dtypes are the same.
        times = np.zeros((self.total_out, seq_len), dtype=float)
        y = np.zeros(self.total_out, dtype="i4")  # 4 byte int

        # determine how many datapoints to take from each user based on the length of
        # user behavior sequence
        # ind=0, 1, 2, 3,... t < 10, 20, 30, 40, 50, 60, ...
        # NOTE(review): regime[2] and regime[3] are fixed at 3 and 6, so in
        # train mode with points_per_user < 6 a line may yield more points than
        # the total_out allocation above accounts for -- confirm with callers.
        k = 20
        regime = np.zeros(k, dtype=int)
        regime[1], regime[2], regime[3] = 1, 3, 6
        regime[4:] = self.points_per_user
        if self.mode == "val":
            self.points_per_user = 1
            regime = np.minimum(regime, self.points_per_user)
        last = self.M - 1  # max index of last item

        def parse_ids(fields, *offsets):
            """Convert id strings to int32, subtract the offsets, clamp at 0."""
            ids = []
            for field in fields:
                value = np.int32(field)
                for offset in offsets:
                    value = value - offset
                ids.append(np.maximum(value, 0))
            return np.array(ids, dtype=np.int32)

        def check_point(row):
            """Abort the program if any categorical feature of `row` is negative."""
            if np.any(users[row] < 0) or np.any(items[row] < 0) \
                    or np.any(cats[row] < 0):
                sys.exit(
                    "Categorical feature less than zero after processing. "
                    "Aborting..."
                )

        # try to generate the desired number of points (time series) per each user.
        # if history is short it may not succeed to generate sufficiently different
        # time series for a particular user.
        t, t_pos, t_neg, t_short = 0, 0, 0, 0
        with open(str(raw_path)) as f:
            for i, line in enumerate(f):
                if i % 1000 == 0:
                    print("processing line: ", i, t, t_pos, t_neg, t_short)
                if i >= self.total:
                    break
                units = line.strip().split("\t")
                user = np.array(np.maximum(np.int32(units[0]) - self.Inum, 0),
                                dtype=np.int32)
                items_ = parse_ids(units[4].split(","))
                cats_ = parse_ids(units[5].split(","), self.Inum, self.Unum)
                neg_items_ = parse_ids(units[6].split(","))
                neg_cats_ = parse_ids(units[7].split(","), self.Inum, self.Unum)

                # select datapoints
                first = np.argmax(items_ > 0)  # index of first non-zero item
                # index into regime array; histories longer than the table
                # covers fall into the last bucket instead of raising IndexError
                ind = min(int((last - first) // 10), k - 1)
                n_points = regime[ind]

                # pos: window of ts_length + 1 consecutive history entries
                for _ in range(n_points):
                    a1 = min(first + self.ts_length, last - 1)
                    end = np.random.randint(a1, last)
                    indices = np.arange(end - self.ts_length, end + 1)
                    if items_[indices[0]] == 0:
                        t_short += 1
                    items[t] = items_[indices]
                    cats[t] = cats_[indices]
                    users[t] = user
                    times[t] = time_axis
                    y[t] = 1
                    check_point(t)
                    t += 1
                    t_pos += 1

                # neg: ts_length history entries followed by one random
                # entry of the negative history
                for _ in range(n_points):
                    a1 = min(first + self.ts_length - 1, last - 1)
                    end = np.random.randint(a1, last)
                    indices = np.arange(end - self.ts_length + 1, end + 1)
                    if items_[indices[0]] == 0:
                        t_short += 1
                    items[t, :-1] = items_[indices]
                    cats[t, :-1] = cats_[indices]
                    # same RNG call as before; [0] extracts the scalar so no
                    # size-1 array is assigned into a scalar slot
                    neg_index = np.random.choice(r_target, 1, replace=False)[0]
                    items[t, -1] = neg_items_[neg_index]
                    cats[t, -1] = neg_cats_[neg_index]
                    users[t] = user
                    times[t] = time_axis
                    y[t] = 0
                    check_point(t)
                    t += 1
                    t_neg += 1

        print("total points, pos points, neg points: ", t, t_pos, t_neg)

        self.truncate_and_save(out_file, True, t, users, items, cats, times, y)