in sourcecode/scoring/note_ratings.py [0:0]
def get_note_counts_by_rater_sign(raterModelOutput, ratings):
    """Summarize, per note, how raters on each side of the rater-factor sign rated it.

    Each rating is joined to its rater's first internal factor; ratings whose
    rater has a NaN factor are dropped. Raters with a factor < 0 go in the
    "negative" bucket, all others (including exactly 0) in the "positive" bucket.

    Args:
      raterModelOutput: DataFrame with at least c.raterParticipantIdKey and
        c.internalRaterFactor1Key. Not modified.
      ratings: DataFrame with at least c.noteIdKey and c.raterParticipantIdKey,
        plus either c.helpfulNumKey or c.helpfulnessLevelKey. Side effect: when
        c.helpfulNumKey is missing it is added to this frame in place
        (1.0 for "HELPFUL", 0.0 for "NOT_HELPFUL", 0.5 otherwise).

    Returns:
      DataFrame with one row per note and the columns:
        c.noteIdKey
        c.minSignCountKey (int32): smaller of the two per-bucket rating counts.
        c.negFactorMeanHelpfulNumKey / c.posFactorMeanHelpfulNumKey (float32):
          mean helpfulNum within each bucket (NaN if the bucket is empty).
        c.netMinHelpfulKey (int32): min over buckets of (#1.0 - #0.0 ratings),
          floored at 0.
        c.netMinHelpfulRatioKey (float32): netMinHelpful divided by the note's
          total number of ratings that survived the NaN-factor filter.
    """
    negFactorKey = "negFactor"
    posFactorKey = "posFactor"
    raterFactorBucketKey = "raterFactorBucket"

    # Align the rater id dtype with the ratings frame before joining. The cast
    # is applied to our own column subset so the caller's frame is untouched
    # (previously the cast result was discarded, making it a no-op).
    raterModelOutputToUse = pd.DataFrame(
        raterModelOutput[[c.raterParticipantIdKey, c.internalRaterFactor1Key]]
    ).astype({c.raterParticipantIdKey: ratings[c.raterParticipantIdKey].dtype})

    # Derive the numeric helpfulness score only when the caller has not already
    # supplied one. NOTE: this writes to the caller's `ratings` frame.
    if c.helpfulNumKey not in ratings.columns:
        ratings[c.helpfulNumKey] = 0.5
        ratings.loc[ratings[c.helpfulnessLevelKey] == "HELPFUL", c.helpfulNumKey] = 1.0
        ratings.loc[ratings[c.helpfulnessLevelKey] == "NOT_HELPFUL", c.helpfulNumKey] = 0.0
    ratingsToUse = pd.DataFrame(ratings[[c.noteIdKey, c.raterParticipantIdKey, c.helpfulNumKey]])

    mergedRatings = ratingsToUse.merge(raterModelOutputToUse, on=c.raterParticipantIdKey)
    origLength = len(mergedRatings)
    mergedRatings = mergedRatings[mergedRatings[c.internalRaterFactor1Key].notna()]
    logger.info(
        f"dropped {origLength - len(mergedRatings)} out of {origLength} ratings due to NaN factor."
    )
    mergedRatings[raterFactorBucketKey] = np.where(
        mergedRatings[c.internalRaterFactor1Key] < 0, negFactorKey, posFactorKey
    )

    # Rating counts per (note, bucket), pivoted to one column per bucket.
    noteCountsByRaterSign = (
        mergedRatings.groupby([c.noteIdKey, raterFactorBucketKey])
        .size()
        .unstack(fill_value=0)
        .reset_index()
    ).rename(
        columns={
            negFactorKey: c.negFactorRatingCountKey,
            posFactorKey: c.posFactorRatingCountKey,
        }
    )
    # A bucket that never occurs produces no column after unstack; add it back.
    for countKey in (c.negFactorRatingCountKey, c.posFactorRatingCountKey):
        if countKey not in noteCountsByRaterSign.columns:
            noteCountsByRaterSign[countKey] = 0
    noteCountsByRaterSign[c.minSignCountKey] = noteCountsByRaterSign[
        [c.negFactorRatingCountKey, c.posFactorRatingCountKey]
    ].min(axis=1)

    # Mean helpfulNum per (note, bucket); absent combinations stay NaN.
    meanHelpfulnessByRaterSign = (
        mergedRatings.groupby([c.noteIdKey, raterFactorBucketKey])[c.helpfulNumKey]
        .mean()
        .unstack()
        .reset_index()
    ).rename(
        columns={
            negFactorKey: c.negFactorMeanHelpfulNumKey,
            posFactorKey: c.posFactorMeanHelpfulNumKey,
        }
    )
    noteCountsByRaterSign = noteCountsByRaterSign.merge(
        meanHelpfulnessByRaterSign, on=[c.noteIdKey], how="left", unsafeAllowed=c.minSignCountKey
    )
    for meanKey in (c.negFactorMeanHelpfulNumKey, c.posFactorMeanHelpfulNumKey):
        if meanKey not in noteCountsByRaterSign.columns:
            noteCountsByRaterSign[meanKey] = np.nan

    # Merge in net minority helpfulness
    counts = mergedRatings[[c.noteIdKey, raterFactorBucketKey, c.helpfulNumKey]]
    counts = pd.crosstab(
        index=counts[c.noteIdKey],
        columns=[counts[raterFactorBucketKey], counts[c.helpfulNumKey]],
    )
    # Flatten the (bucket, helpfulNum) column MultiIndex to e.g. "negFactor_1.0".
    counts.columns = [f"{col1}_{col2}" for col1, col2 in counts.columns]
    counts = counts.reset_index(drop=False)
    # Guarantee every bucket/level column exists even if it never occurred.
    for bucket in [negFactorKey, posFactorKey]:
        for level in [0.0, 0.5, 1.0]:
            col = f"{bucket}_{level}"
            if col not in counts.columns:
                counts[col] = 0
    negNetKey = f"{negFactorKey}_{c.netMinHelpfulKey}"
    posNetKey = f"{posFactorKey}_{c.netMinHelpfulKey}"
    counts[negNetKey] = counts[f"{negFactorKey}_1.0"] - counts[f"{negFactorKey}_0.0"]
    counts[posNetKey] = counts[f"{posFactorKey}_1.0"] - counts[f"{posFactorKey}_0.0"]
    counts[c.netMinHelpfulKey] = counts[[negNetKey, posNetKey]].min(axis=1).clip(lower=0)

    # Total ratings per note. groupby().size() with an explicit column name does
    # not depend on how a given pandas version labels value_counts() output.
    totals = mergedRatings.groupby(c.noteIdKey).size().reset_index(name="total")
    counts = counts.merge(totals, on=c.noteIdKey)
    counts[c.netMinHelpfulRatioKey] = counts[c.netMinHelpfulKey] / counts["total"]
    noteCountsByRaterSign = noteCountsByRaterSign.merge(
        counts[[c.noteIdKey, c.netMinHelpfulKey, c.netMinHelpfulRatioKey]], on=c.noteIdKey
    )

    # Downcast types from 64=>32
    downcastTypes = {
        c.minSignCountKey: np.int32,
        c.netMinHelpfulKey: np.int32,
        c.netMinHelpfulRatioKey: np.float32,
        c.negFactorRatingCountKey: np.int32,
        c.posFactorRatingCountKey: np.int32,
        c.negFactorMeanHelpfulNumKey: np.float32,
        c.posFactorMeanHelpfulNumKey: np.float32,
    }
    for col, dtype in downcastTypes.items():
        noteCountsByRaterSign[col] = noteCountsByRaterSign[col].astype(dtype)

    # rename_axis clears the columns-axis name left behind by unstack().
    return noteCountsByRaterSign[
        [
            c.noteIdKey,
            c.minSignCountKey,
            c.negFactorMeanHelpfulNumKey,
            c.posFactorMeanHelpfulNumKey,
            c.netMinHelpfulKey,
            c.netMinHelpfulRatioKey,
        ]
    ].rename_axis(None, axis=1)