def fit_transform()

in recommenders/models/lightgbm/lightgbm_utils.py [0:0]


    def fit_transform(self, df):
        """Fit the encoder on a training set and return it as feature/label arrays.

        Fitting stores on the instance: the kept values of every categorical
        column (``save_value_filter``), sum/count of every numerical column
        (``save_num_embs``), the running label statistics of every encoded
        category (``save_cate_avgs``), the bit width used for every categorical
        column (``Max_len``) and the names of the generated target-encoding
        columns (``tgt_nume_cols``).

        Args:
            df (pandas.DataFrame): Input dataframe. It is accessed by the column
                names in ``self.cate_cols``, ``self.nume_cols`` and
                ``self.label_name``.

        Returns:
            numpy.ndarray, numpy.ndarray: New features and labels. Feature columns
            are ordered as: numerical columns, target-encoding columns, then the
            binary-encoded categorical columns. Labels have shape ``(n_samples, 1)``.

        Raises:
            ValueError: If there is no feature column to encode.
        """
        # All later column assignments go to the frame returned by ``astype``.
        df = df.astype(dtype=self.dtype_dict)
        self.samples = df.shape[0]

        logging.info("Filtering and fillna features")
        for item in tqdm(self.cate_cols):
            value_counts = df[item].value_counts()
            # value_counts is sorted by descending frequency: keep the leading
            # `thresrate` share of distinct values, then drop those that do not
            # occur more than `threshold` times.
            head = value_counts.iloc[: int(value_counts.shape[0] * self.thresrate)]
            kept_values = list(head[head > self.threshold].index)
            self.save_value_filter[item] = kept_values
            rm_values = set(value_counts.index) - set(kept_values)
            df[item] = df[item].map(lambda x: "<LESS>" if x in rm_values else x)
            df[item] = df[item].fillna("<UNK>")
            # Free the per-column counts eagerly before moving to the next column.
            del value_counts
            gc.collect()

        for item in tqdm(self.nume_cols):
            filled = df[item].fillna(df[item].mean())
            df[item] = filled
            self.save_num_embs[item] = {"sum": filled.sum(), "cnt": filled.shape[0]}

        logging.info("Ordinal encoding cate features")
        # ordinal_encoding
        df = self.encoder.fit_transform(df)

        logging.info("Target encoding cate features")
        # dynamic_targeting_encoding
        # Rebuild the list of generated columns so that fitting the same instance
        # twice does not duplicate them.
        self.tgt_nume_cols = []
        labels = df[self.label_name].values
        for item in tqdm(self.cate_cols):
            feats = df[item].values
            # Maps an encoded category to [label sum, occurrence count] over the
            # rows seen so far.
            running = collections.defaultdict(lambda: [0, 0])
            self.save_cate_avgs[item] = running
            means = []
            counts = []
            for idx, (cur_feat, label) in enumerate(zip(feats, labels)):
                # `.get` does not insert into the defaultdict, so None means the
                # category has not been seen in any earlier row.
                stats = running.get(cur_feat)
                if stats is None:
                    means.append(0)
                    counts.append(0)
                    stats = running[cur_feat]
                else:
                    means.append(stats[0] / stats[1])
                    # A category can only have been seen before when idx >= 1,
                    # so this never divides by zero.
                    counts.append(stats[1] / idx)
                # The current row is added only after it has been encoded, so a
                # row's own label never contributes to its own encoding.
                stats[0] += label
                stats[1] += 1
            df[item + "_t_mean"] = means
            df[item + "_t_count"] = counts
            self.tgt_nume_cols.append(item + "_t_mean")
            self.tgt_nume_cols.append(item + "_t_count")

        logging.info("Start manual binary encoding")
        # Collect the column blocks and concatenate once at the end instead of
        # re-copying the growing matrix for every column.
        blocks = []
        for item in tqdm(self.nume_cols + self.tgt_nume_cols):
            blocks.append(df[item].values.reshape((-1, 1)))
        for item in tqdm(self.cate_cols):
            feats = df[item].values
            # Number of binary digits of the largest code ("0b" prefix removed).
            bit_len = len(bin(df[item].max())) - 2
            self.Max_len[item] = bit_len
            blocks.append(unpackbits(feats, bit_len).reshape((self.samples, -1)))

        if not blocks:
            raise ValueError(
                "fit_transform needs at least one categorical or numerical column"
            )

        trn_y = np.array(labels).reshape((-1, 1))
        trn_x = np.concatenate(blocks, axis=1)
        del df, blocks
        gc.collect()
        return trn_x, trn_y