def _profile_pipeline()

in sourcecode/scoring/pflip_plus_model.py [0:0]


  def _profile_pipeline(self, pipe: Pipeline, noteInfo: pd.DataFrame) -> str:
    """Generate a numerical profile of each extracted feature.

    For each feature, we examine the dimensionality and sparsity of the feature.  For low
    dimensional features representing discretized continuous values, we also profile the
    size and boundaries of each bin.
    """
    # Generate feature matrix
    matrix = pipe.transform(noteInfo)
    # Profile matrix
    start = 0
    lines = []
    for name, transformer, _ in pipe.transformers_:
      if name == "remainder":
        continue
      end = start + len(transformer[-1].get_feature_names_out())
      total = int(matrix[:, start:end].sum())
      colMin = int(matrix[:, start:end].sum(axis=0).min())
      colMean = total / (end - start)
      colMax = int(matrix[:, start:end].sum(axis=0).max())
      rowMin = int(matrix[:, start:end].sum(axis=1).min())
      rowMean = total / (matrix.shape[0])
      rowMax = int(matrix[:, start:end].sum(axis=1).max())
      columns = [
        f"{name:<60}pos=[{start:8} {end:8} {end-start:8}]",
        f"total={total:9}",
        f"col=[{colMin:8} {colMean:8.1f} {colMax:8}]",
        f"row=[{rowMin:8} {rowMean:8.1f} {rowMax:8}]",
      ]
      if (end - start) <= 10:
        columns.append(f"{str(matrix[:, start:end].sum(axis=0).astype(np.int64)):<80}")
        columns.append(str(transformer[-1].bin_edges_[0].round(3).tolist()))
      lines.append("    ".join(columns))
      start = end
    return "\n".join(lines)