in sourcecode/scoring/pflip_plus_model.py [0:0]
def _profile_pipeline(self, pipe: Pipeline, noteInfo: pd.DataFrame) -> str:
    """Generate a numerical profile of each extracted feature.

    For each feature, we examine the dimensionality and sparsity of the feature. For low
    dimensional features (at most 10 columns) we also report the per-column totals, and
    when the final step of the transformer exposes `bin_edges_` (i.e. it discretized a
    continuous value) we additionally report the boundaries of each bin.

    Args:
      pipe: fitted object exposing `transform` and `transformers_`, where
        `transformers_` yields (name, transformer, columns) triples and `transformer[-1]`
        is the final step providing `get_feature_names_out()`.
      noteInfo: input passed unchanged to `pipe.transform` to build the feature matrix.

    Returns:
      One line per transformer (the "remainder" entry is skipped), joined with newlines.
      Each line holds the feature name, its column span within the matrix, the total of
      all entries, and min / mean / max of the column sums and of the row sums.
    """
    # Generate feature matrix
    matrix = pipe.transform(noteInfo)
    numRows = matrix.shape[0]
    # Profile matrix; `start` tracks where the current transformer's columns begin,
    # assuming transformer outputs are laid out left to right in `transformers_` order.
    start = 0
    lines = []
    for name, transformer, _ in pipe.transformers_:
        if name == "remainder":
            continue
        lastStep = transformer[-1]
        end = start + len(lastStep.get_feature_names_out())
        width = end - start
        # Slice and reduce once per transformer rather than once per statistic.
        block = matrix[:, start:end]
        colSums = block.sum(axis=0)
        rowSums = block.sum(axis=1)
        total = int(block.sum())
        colMin = int(colSums.min())
        colMean = total / width
        colMax = int(colSums.max())
        rowMin = int(rowSums.min())
        rowMean = total / numRows
        rowMax = int(rowSums.max())
        columns = [
            f"{name:<60}pos=[{start:8} {end:8} {width:8}]",
            f"total={total:9}",
            f"col=[{colMin:8} {colMean:8.1f} {colMax:8}]",
            f"row=[{rowMin:8} {rowMean:8.1f} {rowMax:8}]",
        ]
        if width <= 10:
            columns.append(f"{str(colSums.astype(np.int64)):<80}")
            # Only discretizing steps carry bin edges; other low dimensional features
            # are profiled without them instead of failing the entire profile.
            binEdges = getattr(lastStep, "bin_edges_", None)
            if binEdges is not None:
                columns.append(str(binEdges[0].round(3).tolist()))
        lines.append(" ".join(columns))
        start = end
    return "\n".join(lines)