in recommenders/models/lightgbm/lightgbm_utils.py [0:0]
def fit_transform(self, df):
    """Fit the converter on a training set and return it as feature/label arrays.

    The method filters rare categorical values, fills missing values,
    ordinal-encodes the categorical columns, adds running target-encoding
    columns, and finally expands every categorical code into binary columns.
    The statistics learned along the way are stored on the instance
    (``save_value_filter``, ``save_num_embs``, ``save_cate_avgs``,
    ``tgt_nume_cols``, ``Max_len``).

    Args:
        df (pandas.DataFrame): Input dataframe

    Returns:
        numpy.ndarray, numpy.ndarray: New features and labels. The labels
        are reshaped to a single column.
    """
    # astype returns a new frame by default, so the column assignments below
    # operate on a private copy rather than on the caller's dataframe.
    df = df.astype(dtype=self.dtype_dict)
    self.samples = df.shape[0]

    logging.info("Filtering and fillna features")
    for item in tqdm(self.cate_cols):
        # value_counts() sorts by descending frequency by default.
        value_counts = df[item].value_counts()
        num = value_counts.shape[0]
        # Keep values that are within the leading `thresrate` fraction AND
        # occur more than `threshold` times. The mask is built from the
        # slice itself so that mask and data share the same index, and
        # .iloc keeps the slice positional whatever the index dtype is.
        top_counts = value_counts.iloc[: int(num * self.thresrate)]
        self.save_value_filter[item] = list(
            top_counts[top_counts > self.threshold].index
        )
        rm_values = set(value_counts.index) - set(self.save_value_filter[item])
        df[item] = df[item].map(lambda x: "<LESS>" if x in rm_values else x)
        df[item] = df[item].fillna("<UNK>")
        del value_counts
        gc.collect()

    for item in tqdm(self.nume_cols):
        df[item] = df[item].fillna(df[item].mean())
        self.save_num_embs[item] = {"sum": df[item].sum(), "cnt": df[item].shape[0]}

    logging.info("Ordinal encoding cate features")
    # ordinal_encoding
    df = self.encoder.fit_transform(df)

    logging.info("Target encoding cate features")
    # dynamic_targeting_encoding: each row is encoded using only the rows
    # that precede it, so a row never sees its own label.
    labels = df[self.label_name].values
    for item in tqdm(self.cate_cols):
        feats = df[item].values
        # Maps category code -> [running label sum, running occurrence count].
        stats = collections.defaultdict(lambda: [0, 0])
        self.save_cate_avgs[item] = stats
        means = []
        counts = []
        for idx, (cur_feat, label) in enumerate(zip(feats, labels)):
            if cur_feat in stats:
                # A code can only be present once an earlier row inserted
                # it, so idx >= 1 here and the division is safe.
                label_sum, seen = stats[cur_feat]
                means.append(label_sum / seen)
                counts.append(seen / idx)
            else:
                means.append(0)
                counts.append(0)
            entry = stats[cur_feat]
            entry[0] += label
            entry[1] += 1

        mean_col = item + "_t_mean"
        count_col = item + "_t_count"
        df[mean_col] = means
        df[count_col] = counts
        # Guard against duplicates so that calling fit_transform again does
        # not list (and later emit) the same target column twice.
        for col in (mean_col, count_col):
            if col not in self.tgt_nume_cols:
                self.tgt_nume_cols.append(col)

    logging.info("Start manual binary encoding")
    # Collect column blocks and concatenate once at the end; growing the
    # matrix one column at a time re-copies it on every step.
    blocks = [
        df[item].values.reshape((-1, 1))
        for item in tqdm(self.nume_cols + self.tgt_nume_cols)
    ]
    for item in tqdm(self.cate_cols):
        feats = df[item].values
        max_code = df[item].max()
        # Number of binary digits needed to write the largest code.
        bit_len = len(bin(max_code)) - 2
        self.Max_len[item] = bit_len
        # NOTE(review): unpackbits is defined elsewhere in this module;
        # presumably it expands each code into bit_len 0/1 columns - confirm.
        blocks.append(unpackbits(feats, bit_len).reshape((self.samples, -1)))

    trn_y = np.array(df[self.label_name].values).reshape((-1, 1))
    if blocks:
        trn_x = np.concatenate(blocks, axis=1)
    else:
        # No feature columns configured: return an empty feature matrix.
        trn_x = np.empty((self.samples, 0))
    del blocks, df
    gc.collect()
    return trn_x, trn_y