in sourcecode/scoring/pflip_plus_model.py [0:0]
def _get_feature_pipeline(self, noteInfo: pd.DataFrame) -> ColumnTransformer:
  """Build the (unfitted) ColumnTransformer that featurizes notes for scoring.

  Assembles one (name, pipeline, columns) entry per feature family: note author
  one-hot, per-rater helpfulness and tag indicators, binned tag ratios, binned
  rating counts (plus polynomial interaction crosses across and within notes),
  rater factor statistics, rating-burst features, note writing latency and peer
  note counts.

  Side effect: records clipping thresholds for heavy-tailed columns in
  self._column_thresholds (quantiles computed from noteInfo; applied at
  transform time elsewhere).

  Args:
    noteInfo: DataFrame containing all raw feature columns; used to compute
      quantile thresholds and to validate that required columns are non-null.

  Returns:
    Unfitted ColumnTransformer combining all feature pipelines.
  """
  prefixes = [_LOCAL, _PEER_MISLEADING, _PEER_NON_MISLEADING]
  # Begin with author pipeline
  columnPipes: List[Tuple[str, Any, Union[str, List[str]]]] = [
    (
      c.noteAuthorParticipantIdKey,
      Pipeline([("onehot", OneHotEncoder(handle_unknown="ignore"))]),
      [c.noteAuthorParticipantIdKey],
    )
  ]
  # Add pipelines for individual user helpfulness ratings
  for prefix in prefixes:
    column = f"{prefix}_{_USER_HELPFULNESS_RATINGS}"
    pipe = Pipeline(
      [
        ("_set_to_list", FunctionTransformer(_set_to_list)),
        (
          "onehot",
          CountVectorizer(
            tokenizer=_identity, preprocessor=_identity, min_df=self._helpfulnessRaterMin
          ),
        ),
      ]
    )
    columnPipes.append((column, pipe, column))
  # Add pipelines for individual helpful and not-helpful tag ratings.  The two
  # families are identical except for the feature-selection percentile; iterate
  # helpful tags first to preserve the original transformer ordering.
  for tagCol, tagPercentile in [
    (_USER_HELPFUL_TAGS, self._helpfulTagPercentile),
    (_USER_NOT_HELPFUL_TAGS, self._notHelpfulTagPercentile),
  ]:
    for prefix in prefixes:
      column = f"{prefix}_{tagCol}"
      pipe = Pipeline(
        [
          ("_set_to_list", FunctionTransformer(_set_to_list)),
          (
            "onehot",
            CountVectorizer(
              tokenizer=_identity, preprocessor=_identity, min_df=self._tagRaterMin
            ),
          ),
          ("selection", SelectPercentile(chi2, percentile=tagPercentile)),
        ]
      )
      columnPipes.append((column, pipe, column))
  # Add pipelines for tag ratio columns
  for prefix in prefixes:
    for tagset in [c.notHelpfulTagsTSVOrder, c.helpfulTagsTSVOrder]:
      for tag in tagset:
        column = f"{prefix}_{tag}"
        # Clip extreme ratios at the 99th percentile (threshold applied elsewhere).
        self._column_thresholds[column] = noteInfo[column].quantile(0.99)
        if noteInfo[column].min() == noteInfo[column].max():
          # Constant column: carries no signal, and binning would fail on it.
          continue
        pipe = Pipeline(
          [
            ("reshape", FunctionTransformer(_reshape)),
            ("drop_constants", VarianceThreshold()),
            (
              "binize",
              KBinsDiscretizer(n_bins=self._tag_ratio_bins, encode="onehot", strategy="kmeans"),
            ),
          ]
        )
        columnPipes.append((column, pipe, column))
  # Add pipelines for rating counts across notes
  columns = [f"{prefix}_{col}" for prefix in prefixes for col in _BUCKET_COUNT_COLS]
  assert noteInfo[columns].isna().sum().sum() == 0
  for col in columns:
    pipe = Pipeline(
      [
        ("log", FunctionTransformer(_feature_log)),
        (
          "binize",
          KBinsDiscretizer(n_bins=self._rating_count_bins, encode="onehot", strategy="kmeans"),
        ),
      ]
    )
    columnPipes.append((col, pipe, [col]))
  # Cross the binned counts over all notes at interaction degrees 2 and 3.
  for degree in [2, 3]:
    pipe = Pipeline(
      [
        ("log", FunctionTransformer(_feature_log)),
        (
          "binize",
          KBinsDiscretizer(n_bins=self._rating_count_bins, encode="onehot", strategy="kmeans"),
        ),
        ("drop_rare", VarianceThreshold(threshold=0.001)),
        (
          "cross",
          PolynomialFeatures(degree=(degree, degree), interaction_only=True, include_bias=False),
        ),
        ("drop_rare_again", VarianceThreshold(threshold=0.001)),
      ]
    )
    columnPipes.append((f"cross_note_counts_degree_{degree}", pipe, columns))
  # Add pipelines for rating counts within notes
  for prefix in prefixes:
    columns = [f"{prefix}_{col}" for col in _BUCKET_COUNT_COLS]
    assert noteInfo[columns].isna().sum().sum() == 0
    # Two successive degree-2 interaction expansions yield degree-4 crosses.
    pipe = Pipeline(
      [
        ("log", FunctionTransformer(_feature_log)),
        (
          "binize",
          KBinsDiscretizer(n_bins=self._rating_count_bins, encode="onehot", strategy="kmeans"),
        ),
        ("drop_rare", VarianceThreshold(threshold=0.001)),
        ("cross_0", PolynomialFeatures(degree=(2, 2), interaction_only=True, include_bias=False)),
        ("cross_1", PolynomialFeatures(degree=(2, 2), interaction_only=True, include_bias=False)),
        ("drop_rare_again", VarianceThreshold(threshold=0.001)),
      ]
    )
    columnPipes.append((f"{prefix}_cross_note_counts_degree_4", pipe, columns))
  # Add pipelines for rater factor stats
  for prefix in prefixes:
    columns = [f"{prefix}_{col}" for col in _STATS_COLS]
    pipe = Pipeline(
      [
        ("fill_nans_df", FunctionTransformer(_fill_na)),
        (
          "binize",
          KBinsDiscretizer(n_bins=self._factor_stats_bins, encode="onehot", strategy="kmeans"),
        ),
        ("cross", PolynomialFeatures(degree=(1, 2), interaction_only=True, include_bias=False)),
        ("drop_rare", VarianceThreshold(threshold=0.001)),
      ]
    )
    columnPipes.append((f"{prefix}_stat_cols", pipe, columns))
  # Add pipelines for rating bursts.  RATIO and TOTAL columns share the same
  # clipping threshold and binning; the assert only validates column naming.
  for colset in [_QUICK_RATING_COLS, _BURST_RATING_COLS, _RECENT_RATING_COLS]:
    for col in colset:
      assert noteInfo[col].isna().sum() == 0
      assert "RATIO" in col or "TOTAL" in col
      self._column_thresholds[col] = noteInfo[col].quantile(0.999)
      pipe = Pipeline(
        [
          (
            "binize",
            KBinsDiscretizer(n_bins=self._burst_bins, encode="onehot", strategy="kmeans"),
          ),
        ]
      )
      columnPipes.append((col, pipe, [col]))
  # Add pipeline for note writing latency
  assert noteInfo[_NOTE_WRITING_LATENCY].isna().sum() == 0
  self._column_thresholds[_NOTE_WRITING_LATENCY] = noteInfo[_NOTE_WRITING_LATENCY].quantile(0.999)
  pipe = Pipeline(
    [
      ("binize", KBinsDiscretizer(n_bins=self._latency_bins, encode="onehot", strategy="kmeans")),
    ]
  )
  columnPipes.append((_NOTE_WRITING_LATENCY, pipe, [_NOTE_WRITING_LATENCY]))
  # Add columns for peer notes
  peerNoteCols = [
    _TOTAL_PEER_NOTES,
    _TOTAL_PEER_MISLEADING_NOTES,
    _TOTAL_PEER_NON_MISLEADING_NOTES,
    _TOTAL_PEER_CRH_NOTES,
    _TOTAL_PEER_STABILIZATION_NOTES,
  ]
  assert noteInfo[peerNoteCols].isna().sum().sum() == 0
  for col in peerNoteCols:
    # Peer note counts have a longer tail: clip at the 99.99th percentile.
    self._column_thresholds[col] = noteInfo[col].quantile(0.9999)
    pipe = Pipeline(
      [
        (
          "binize",
          KBinsDiscretizer(n_bins=self._peer_note_count_bins, encode="onehot", strategy="kmeans"),
        ),
      ]
    )
    columnPipes.append((col, pipe, [col]))
  # Degree-2 interaction crosses over all peer note counts.
  pipe = Pipeline(
    [
      ("log", FunctionTransformer(_feature_log)),
      (
        "binize",
        KBinsDiscretizer(n_bins=self._peer_note_count_bins, encode="onehot", strategy="kmeans"),
      ),
      ("cross", PolynomialFeatures(degree=(2, 2), interaction_only=True, include_bias=False)),
      ("drop_rare", VarianceThreshold(threshold=0.001)),
    ]
  )
  columnPipes.append(("peer_note_cross_degree_2", pipe, peerNoteCols))
  # Build and return column transformer
  return ColumnTransformer(columnPipes, verbose=True)