def _get_feature_pipeline()

in sourcecode/scoring/pflip_plus_model.py [0:0]


  def _get_feature_pipeline(self, noteInfo: pd.DataFrame) -> ColumnTransformer:
    """Build the ColumnTransformer that featurizes noteInfo rows for this model.

    Assembles one (name, pipeline, columns) entry per feature family and returns
    a ColumnTransformer over all of them.  (The original annotation said
    Pipeline, but the object constructed and returned is a ColumnTransformer.)

    Side effect: populates self._column_thresholds with high-quantile caps
    (0.99 / 0.999 / 0.9999 depending on the column family).  The thresholds are
    only recorded here, not applied; presumably a caller clips columns against
    them before transform — TODO confirm against the call site.

    Args:
      noteInfo: DataFrame with one row per note, containing the author ID,
        per-rater rating/tag sets, tag-ratio columns, bucketed rating counts,
        rater-factor stats, burst/latency columns, and peer-note counts.

    Returns:
      ColumnTransformer combining every per-column feature pipeline.
    """
    # Begin with author pipeline: one-hot over the note author ID.
    columnPipes: List[Tuple[str, Any, Union[str, List[str]]]] = [
      (
        c.noteAuthorParticipantIdKey,
        Pipeline([("onehot", OneHotEncoder(handle_unknown="ignore"))]),
        [c.noteAuthorParticipantIdKey],
      )
    ]
    # Add pipelines for individual user helpfulness ratings.  Each cell is a set
    # of rater tokens; _set_to_list converts it so CountVectorizer (with identity
    # tokenizer/preprocessor) can one-hot raters seen at least
    # _helpfulnessRaterMin times.
    for prefix in [_LOCAL, _PEER_MISLEADING, _PEER_NON_MISLEADING]:
      column = f"{prefix}_{_USER_HELPFULNESS_RATINGS}"
      pipe = Pipeline(
        [
          ("_set_to_list", FunctionTransformer(_set_to_list)),
          (
            "onehot",
            CountVectorizer(
              tokenizer=_identity, preprocessor=_identity, min_df=self._helpfulnessRaterMin
            ),
          ),
        ]
      )
      columnPipes.append((column, pipe, column))
    # Add pipelines for individual helpful tag ratings, keeping only the top
    # _helpfulTagPercentile percent of features by chi2 against the label.
    for prefix in [_LOCAL, _PEER_MISLEADING, _PEER_NON_MISLEADING]:
      column = f"{prefix}_{_USER_HELPFUL_TAGS}"
      pipe = Pipeline(
        [
          ("_set_to_list", FunctionTransformer(_set_to_list)),
          (
            "onehot",
            CountVectorizer(tokenizer=_identity, preprocessor=_identity, min_df=self._tagRaterMin),
          ),
          ("selection", SelectPercentile(chi2, percentile=self._helpfulTagPercentile)),
        ]
      )
      columnPipes.append((column, pipe, column))
    # Add pipelines for individual not-helpful tag ratings (same shape as above,
    # but with the not-helpful selection percentile).
    for prefix in [_LOCAL, _PEER_MISLEADING, _PEER_NON_MISLEADING]:
      column = f"{prefix}_{_USER_NOT_HELPFUL_TAGS}"
      pipe = Pipeline(
        [
          ("_set_to_list", FunctionTransformer(_set_to_list)),
          (
            "onehot",
            CountVectorizer(tokenizer=_identity, preprocessor=_identity, min_df=self._tagRaterMin),
          ),
          ("selection", SelectPercentile(chi2, percentile=self._notHelpfulTagPercentile)),
        ]
      )
      columnPipes.append((column, pipe, column))
    # Add pipelines for tag ratio columns: reshape to 2D, drop constants, then
    # k-means bin into one-hot buckets.
    for prefix in [_LOCAL, _PEER_MISLEADING, _PEER_NON_MISLEADING]:
      for tagset in [c.notHelpfulTagsTSVOrder, c.helpfulTagsTSVOrder]:
        for tag in tagset:
          column = f"{prefix}_{tag}"
          # NOTE(review): the threshold is recorded even for constant columns
          # that get no pipeline below — looks intentional (thresholds may be
          # applied independently of featurization), but confirm.
          self._column_thresholds[column] = noteInfo[column].quantile(0.99)
          if noteInfo[column].min() == noteInfo[column].max():
            continue  # constant column: KBinsDiscretizer would fail / be useless
          pipe = Pipeline(
            [
              ("reshape", FunctionTransformer(_reshape)),
              ("drop_constants", VarianceThreshold()),
              (
                "binize",
                KBinsDiscretizer(n_bins=self._tag_ratio_bins, encode="onehot", strategy="kmeans"),
              ),
            ]
          )
          columnPipes.append((column, pipe, column))
    # Add pipelines for rating counts across notes: log-scale, bin, and (for the
    # cross pipelines) take degree-2 and degree-3 interactions across all
    # prefixed bucket-count columns.
    columns = []
    for prefix in [_LOCAL, _PEER_MISLEADING, _PEER_NON_MISLEADING]:
      for col in _BUCKET_COUNT_COLS:
        columns.append(f"{prefix}_{col}")
    assert noteInfo[columns].isna().sum().sum() == 0
    for col in columns:
      pipe = Pipeline(
        [
          ("log", FunctionTransformer(_feature_log)),
          (
            "binize",
            KBinsDiscretizer(n_bins=self._rating_count_bins, encode="onehot", strategy="kmeans"),
          ),
        ]
      )
      columnPipes.append((col, pipe, [col]))
    for degree in [2, 3]:
      pipe = Pipeline(
        [
          ("log", FunctionTransformer(_feature_log)),
          (
            "binize",
            KBinsDiscretizer(n_bins=self._rating_count_bins, encode="onehot", strategy="kmeans"),
          ),
          ("drop_rare", VarianceThreshold(threshold=0.001)),
          (
            "cross",
            # degree=(degree, degree) keeps only interactions of exactly this degree.
            PolynomialFeatures(degree=(degree, degree), interaction_only=True, include_bias=False),
          ),
          ("drop_rare_again", VarianceThreshold(threshold=0.001)),
        ]
      )
      columnPipes.append((f"cross_note_counts_degree_{degree}", pipe, columns))
    # Add pipelines for rating counts within notes: two stacked degree-2
    # interaction steps yield effective degree-4 crosses per prefix.
    for prefix in [_LOCAL, _PEER_MISLEADING, _PEER_NON_MISLEADING]:
      columns = []
      for col in _BUCKET_COUNT_COLS:
        columns.append(f"{prefix}_{col}")
      assert noteInfo[columns].isna().sum().sum() == 0
      pipe = Pipeline(
        [
          ("log", FunctionTransformer(_feature_log)),
          (
            "binize",
            KBinsDiscretizer(n_bins=self._rating_count_bins, encode="onehot", strategy="kmeans"),
          ),
          ("drop_rare", VarianceThreshold(threshold=0.001)),
          ("cross_0", PolynomialFeatures(degree=(2, 2), interaction_only=True, include_bias=False)),
          ("cross_1", PolynomialFeatures(degree=(2, 2), interaction_only=True, include_bias=False)),
          ("drop_rare_again", VarianceThreshold(threshold=0.001)),
        ]
      )
      columnPipes.append((f"{prefix}_cross_note_counts_degree_4", pipe, columns))
    # Add pipelines for rater factor stats: fill NaNs, bin, then keep original
    # bins plus their pairwise interactions (degree range (1, 2)).
    for prefix in [_LOCAL, _PEER_MISLEADING, _PEER_NON_MISLEADING]:
      columns = []
      for col in _STATS_COLS:
        columns.append(f"{prefix}_{col}")
      pipe = Pipeline(
        [
          ("fill_nans_df", FunctionTransformer(_fill_na)),
          (
            "binize",
            KBinsDiscretizer(n_bins=self._factor_stats_bins, encode="onehot", strategy="kmeans"),
          ),
          ("cross", PolynomialFeatures(degree=(1, 2), interaction_only=True, include_bias=False)),
          ("drop_rare", VarianceThreshold(threshold=0.001)),
        ]
      )
      columnPipes.append((f"{prefix}_stat_cols", pipe, columns))
    # Add pipelines for rating bursts.  The original code branched on
    # RATIO vs TOTAL columns but built byte-identical pipelines (same 0.999
    # threshold, same discretizer) in both branches, so the branch is collapsed;
    # the merged assert rejects exactly the columns the old else-branch
    # assert did (anything containing neither "RATIO" nor "TOTAL").
    for colset in [_QUICK_RATING_COLS, _BURST_RATING_COLS, _RECENT_RATING_COLS]:
      for col in colset:
        assert noteInfo[col].isna().sum() == 0
        assert "RATIO" in col or "TOTAL" in col
        self._column_thresholds[col] = noteInfo[col].quantile(0.999)
        pipe = Pipeline(
          [
            (
              "binize",
              KBinsDiscretizer(n_bins=self._burst_bins, encode="onehot", strategy="kmeans"),
            ),
          ]
        )
        columnPipes.append((col, pipe, [col]))
    # Add pipeline for note writing latency.
    assert noteInfo[_NOTE_WRITING_LATENCY].isna().sum() == 0
    self._column_thresholds[_NOTE_WRITING_LATENCY] = noteInfo[_NOTE_WRITING_LATENCY].quantile(0.999)
    pipe = Pipeline(
      [
        ("binize", KBinsDiscretizer(n_bins=self._latency_bins, encode="onehot", strategy="kmeans")),
      ]
    )
    columnPipes.append((_NOTE_WRITING_LATENCY, pipe, [_NOTE_WRITING_LATENCY]))
    # Add columns for peer notes: individual binned counts plus one degree-2
    # interaction pipeline over all peer-note count columns.
    peerNoteCols = [
      _TOTAL_PEER_NOTES,
      _TOTAL_PEER_MISLEADING_NOTES,
      _TOTAL_PEER_NON_MISLEADING_NOTES,
      _TOTAL_PEER_CRH_NOTES,
      _TOTAL_PEER_STABILIZATION_NOTES,
    ]
    assert noteInfo[peerNoteCols].isna().sum().sum() == 0
    for col in peerNoteCols:
      self._column_thresholds[col] = noteInfo[col].quantile(0.9999)
      pipe = Pipeline(
        [
          (
            "binize",
            KBinsDiscretizer(n_bins=self._peer_note_count_bins, encode="onehot", strategy="kmeans"),
          ),
        ]
      )
      columnPipes.append((col, pipe, [col]))
    pipe = Pipeline(
      [
        ("log", FunctionTransformer(_feature_log)),
        (
          "binize",
          KBinsDiscretizer(n_bins=self._peer_note_count_bins, encode="onehot", strategy="kmeans"),
        ),
        ("cross", PolynomialFeatures(degree=(2, 2), interaction_only=True, include_bias=False)),
        ("drop_rare", VarianceThreshold(threshold=0.001)),
      ]
    )
    columnPipes.append(("peer_note_cross_degree_2", pipe, peerNoteCols))

    # Build and return column transformer
    return ColumnTransformer(columnPipes, verbose=True)