sourcecode/scoring/mf_topic_scorer.py
from typing import Dict, List, Optional, Tuple
import pandas as pd
from . import constants as c
from .mf_base_scorer import MFBaseScorer, coalesce_columns
def coalesce_topic_models(scoredNotes: pd.DataFrame) -> pd.DataFrame:
"""Coalesce all topic modeling columns across note and user scoring.
Since each Scorer must have distinct output columns, we use coalescing to run
multiple instances of MFTopicScorer objects and then condense the results into
a single set of columns. This approach works because each note will be scored
by at most one MFTopicScorer instance.
Args:
scoredNotes: scoring output for notes.
Returns:
DataFrame containing coalesced scoring results for notes.
"""
for col in [
c.topicNoteInterceptKey,
c.topicNoteFactor1Key,
c.topicRatingStatusKey,
c.topicNoteConfidentKey,
c.noteTopicKey,
c.topicInternalActiveRulesKey,
c.topicNumFinalRoundRatingsKey,
]:
scoredNotes = coalesce_columns(scoredNotes, col)
return scoredNotes
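# Illustrative sketch of the coalescing behavior (hypothetical topic names "TopicA" and
# "TopicB" and a simplified layout; assumes coalesce_columns keeps the single non-NaN
# value per row and drops the suffixed columns). Before coalescing, two scorer
# instances emit:
#   noteId  topicNoteIntercept_TopicA  topicNoteIntercept_TopicB
#   101     0.31                       NaN
#   102     NaN                        -0.12
# After coalesce_columns(scoredNotes, c.topicNoteInterceptKey):
#   noteId  topicNoteIntercept
#   101     0.31
#   102     -0.12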
class MFTopicScorer(MFBaseScorer):
def __init__(
self,
topicName: str,
seed: Optional[int] = None,
pseudoraters: Optional[bool] = False,
saveIntermediateState: bool = False,
userFactorLambda=None,
noteFactorLambda=None,
userInterceptLambda=None,
noteInterceptLambda=None,
globalInterceptLambda=None,
diamondLambda=None,
normalizedLossHyperparameters=None,
maxFirstMFTrainError: float = 0.16,
maxFinalMFTrainError: float = 0.09,
minMeanNoteScore: float = 0.05,
crhThreshold: float = 0.40,
crnhThresholdIntercept: float = -0.05,
crnhThresholdNoteFactorMultiplier: float = -0.8,
crnhThresholdNMIntercept: float = -0.15,
crhSuperThreshold: float = 0.5,
lowDiligenceThreshold: float = 0.263,
factorThreshold: float = 0.5,
multiplyPenaltyByHarassmentScore: bool = True,
minimumHarassmentScoreToPenalize: float = 2.0,
tagConsensusHarassmentHelpfulRatingPenalty: int = 10,
) -> None:
"""Configure MFTopicScorer object.
Notice that each MFTopicScorer defines column names by appending the topicName to
column prefixes which are constant. Dynamically defining the column names allows the
topic scorer to be instantiated multiple times while maintaining the property that
the columns attached by each scorer remain unique. Once all scorers have run, we
(will) validate that each note was scored by at most one topic scorer and then coalesce
all of the topic scoring columns and remove the topicName suffix.
Args:
topicName: str indicating which topic this scorer instance should filter for.
seed: if not None, seed value to ensure deterministic execution
pseudoraters: if True, compute optional pseudorater confidence intervals
"""
super().__init__(
includedTopics={topicName},
seed=seed,
pseudoraters=pseudoraters,
useStableInitialization=False,
saveIntermediateState=saveIntermediateState,
threads=4,
userFactorLambda=userFactorLambda,
noteFactorLambda=noteFactorLambda,
userInterceptLambda=userInterceptLambda,
noteInterceptLambda=noteInterceptLambda,
globalInterceptLambda=globalInterceptLambda,
diamondLambda=diamondLambda,
normalizedLossHyperparameters=normalizedLossHyperparameters,
maxFirstMFTrainError=maxFirstMFTrainError,
maxFinalMFTrainError=maxFinalMFTrainError,
minMeanNoteScore=minMeanNoteScore,
crhThreshold=crhThreshold,
crnhThresholdIntercept=crnhThresholdIntercept,
crnhThresholdNoteFactorMultiplier=crnhThresholdNoteFactorMultiplier,
crnhThresholdNMIntercept=crnhThresholdNMIntercept,
crhSuperThreshold=crhSuperThreshold,
lowDiligenceThreshold=lowDiligenceThreshold,
factorThreshold=factorThreshold,
multiplyPenaltyByHarassmentScore=multiplyPenaltyByHarassmentScore,
minimumHarassmentScoreToPenalize=minimumHarassmentScoreToPenalize,
tagConsensusHarassmentHelpfulRatingPenalty=tagConsensusHarassmentHelpfulRatingPenalty,
useReputation=False,
)
self._topicName = topicName
self._topicNoteInterceptKey = f"{c.topicNoteInterceptKey}_{self._topicName}"
self._topicNoteFactor1Key = f"{c.topicNoteFactor1Key}_{self._topicName}"
self._topicRatingStatusKey = f"{c.topicRatingStatusKey}_{self._topicName}"
self._topicInternalActiveRulesKey = f"{c.topicInternalActiveRulesKey}_{self._topicName}"
self._topicNumFinalRoundRatingsKey = f"{c.topicNumFinalRoundRatingsKey}_{self._topicName}"
self._noteTopicKey = f"{c.noteTopicKey}_{self._topicName}"
self._noteTopicConfidentKey = f"{c.topicNoteConfidentKey}_{self._topicName}"
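# Illustrative sketch of the naming scheme (hypothetical topic name; assumes
# c.topicNoteInterceptKey == "topicNoteIntercept"): with topicName="TopicA" the
# attributes above resolve to columns such as
#   topicNoteIntercept_TopicA, topicNoteFactor1_TopicA, topicRatingStatus_TopicA
# so concurrently instantiated topic scorers never collide, and coalesce_topic_models
# can later strip the "_TopicA" suffix.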
def get_name(self) -> str:
"""Returns the unique name of this MFTopicScorer instance."""
return f"MFTopicScorer_{self._topicName}"
def _get_note_col_mapping(self) -> Dict[str, str]:
"""Returns a dict mapping default note column names to custom names for a specific model."""
return {
c.internalNoteInterceptKey: self._topicNoteInterceptKey,
c.internalNoteFactor1Key: self._topicNoteFactor1Key,
c.internalRatingStatusKey: self._topicRatingStatusKey,
c.internalActiveRulesKey: self._topicInternalActiveRulesKey,
c.numFinalRoundRatingsKey: self._topicNumFinalRoundRatingsKey,
c.lowDiligenceNoteInterceptKey: c.lowDiligenceLegacyNoteInterceptKey,
}
def get_scored_notes_cols(self) -> List[str]:
"""Returns a list of columns which should be present in the scoredNotes output."""
return [
c.noteIdKey,
self._topicNoteInterceptKey,
self._topicNoteFactor1Key,
self._topicRatingStatusKey,
self._noteTopicKey,
self._noteTopicConfidentKey,
self._topicInternalActiveRulesKey,
self._topicNumFinalRoundRatingsKey,
]
def get_helpfulness_scores_cols(self) -> List[str]:
"""Returns a list of columns which should be present in the helpfulnessScores output."""
return []
def get_auxiliary_note_info_cols(self) -> List[str]:
"""Returns a list of columns which should be present in the auxiliaryNoteInfo output."""
return []
def _get_dropped_note_cols(self) -> List[str]:
"""Returns a list of columns which should be excluded from scoredNotes and auxiliaryNoteInfo."""
return super()._get_dropped_note_cols() + (
[
c.activeFilterTagsKey,
c.ratingWeightKey,
c.noteInterceptMinKey,
c.noteInterceptMaxKey,
]
+ c.notHelpfulTagsAdjustedColumns
+ c.notHelpfulTagsAdjustedRatioColumns
+ c.incorrectFilterColumns
+ c.noteParameterUncertaintyTSVAuxColumns
)
def _get_dropped_user_cols(self) -> List[str]:
"""Returns a list of columns which should be excluded from helpfulnessScores output."""
return super()._get_dropped_user_cols() + [
c.crhCrnhRatioDifferenceKey,
c.meanNoteScoreKey,
c.raterAgreeRatioKey,
c.aboveHelpfulnessThresholdKey,
c.internalRaterInterceptKey,
c.internalRaterFactor1Key,
c.raterParticipantIdKey,
]
def _postprocess_output(
self,
noteScores: pd.DataFrame,
userScores: pd.DataFrame,
ratings: pd.DataFrame,
noteStatusHistory: pd.DataFrame,
userEnrollment: pd.DataFrame,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
"""Add noteTopicKey to notes output.
Args:
noteScores: note outputs from scoring
userScores: user outputs from scoring
ratings (pd.DataFrame): preprocessed ratings
noteStatusHistory (pd.DataFrame): one row per note; history of when note had each status
userEnrollment (pd.DataFrame): one row per user specifying enrollment properties
Returns:
Tuple[pd.DataFrame, pd.DataFrame]:
noteScores: filtered and updated note scoring output
userScores: filtered and updated user scoring output
"""
# Set the noteTopicKey column in the notes output
noteScores[self._noteTopicKey] = self._topicName
# Calculate per-note totals of ratings from positive and negative factor raters
scoredNotes = noteScores[~noteScores[c.internalNoteInterceptKey].isna()][[c.noteIdKey]]
posFactorRaters = userScores[userScores[c.internalRaterFactor1Key] >= 0][
[c.raterParticipantIdKey]
]
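# The default inner merges below keep only ratings that are on scored notes and
# come from raters on the matching side of the factor split.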
posFactorRatings = (
ratings[[c.noteIdKey, c.raterParticipantIdKey]].merge(scoredNotes).merge(posFactorRaters)
)
posFactorCounts = (
posFactorRatings.groupby(c.noteIdKey)
.count()
.reset_index(drop=False)
.rename(columns={c.raterParticipantIdKey: "posRatingTotal"})
)
negFactorRaters = userScores[userScores[c.internalRaterFactor1Key] < 0][
[c.raterParticipantIdKey]
]
negFactorRatings = (
ratings[[c.noteIdKey, c.raterParticipantIdKey]].merge(scoredNotes).merge(negFactorRaters)
)
negFactorCounts = (
negFactorRatings.groupby(c.noteIdKey)
.count()
.reset_index(drop=False)
.rename(columns={c.raterParticipantIdKey: "negRatingTotal"})
)
# Set scoring confidence bit: require at least 5 ratings from each side of the factor split
posFactorCounts = posFactorCounts[posFactorCounts["posRatingTotal"] > 4][[c.noteIdKey]]
negFactorCounts = negFactorCounts[negFactorCounts["negRatingTotal"] > 4][[c.noteIdKey]]
confidentNotes = posFactorCounts.merge(negFactorCounts)
confidentNotes[self._noteTopicConfidentKey] = True
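# Note: unsafeAllowed is not an argument of stock pandas.merge; it appears to be
# consumed by this project's patched merge wrapper to permit dtype changes in the
# listed columns (e.g. the bool confidence column acquiring NaNs on the left join).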
noteScores = noteScores.merge(
confidentNotes, how="left", unsafeAllowed=[self._noteTopicConfidentKey, c.defaultIndexKey]
)
noteScores = noteScores.fillna({self._noteTopicConfidentKey: False})
return noteScores, userScores
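# Illustrative sketch of the confidence rule (hypothetical counts): a note rated by
# 6 positive-factor raters and 5 negative-factor raters gets its per-topic
# confidence column (self._noteTopicConfidentKey) set to True, while a note with
# 12 positive-factor ratings but only 3 negative-factor ratings stays False,
# because both sides must exceed 4 ratings for the bit to be set.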