def get_note_counts_by_rater_sign()

in sourcecode/scoring/note_ratings.py [0:0]


def get_note_counts_by_rater_sign(raterModelOutput, ratings):
  """Summarize each note's ratings split by the sign of the rater's first factor.

  Ratings are joined to rater factors on the rater id, ratings whose rater factor
  is NaN are dropped (the number dropped is logged), and every remaining rating is
  placed in a "negFactor" bucket (factor < 0) or a "posFactor" bucket (factor >= 0).
  Per note, the function then computes:
    * the smaller of the two bucket rating counts,
    * the mean helpfulNum inside each bucket,
    * the "net minority helpfulness": for each bucket, (#ratings with
      helpfulNum == 1.0) - (#ratings with helpfulNum == 0.0); the smaller of the
      two bucket values, floored at 0,
    * that value divided by the note's total number of retained ratings.

  Side effect: when `ratings` has no c.helpfulNumKey column, one is added to the
  caller's frame in place (1.0 for "HELPFUL", 0.0 for "NOT_HELPFUL", 0.5 otherwise).

  Args:
    raterModelOutput: DataFrame containing c.raterParticipantIdKey and
      c.internalRaterFactor1Key.
    ratings: DataFrame containing c.noteIdKey, c.raterParticipantIdKey and either
      c.helpfulNumKey or c.helpfulnessLevelKey.

  Returns:
    DataFrame with one row per note that has at least one retained rating and the
    columns c.noteIdKey, c.minSignCountKey (int32), c.negFactorMeanHelpfulNumKey and
    c.posFactorMeanHelpfulNumKey (float32, NaN when the note has no ratings in that
    bucket), c.netMinHelpfulKey (int32) and c.netMinHelpfulRatioKey (float32).
  """
  # Series.astype returns a new Series, so the converted ids must be kept and used
  # for the merge below; the caller's raterModelOutput frame is left untouched.
  raterIds = raterModelOutput[c.raterParticipantIdKey].astype(
    ratings[c.raterParticipantIdKey].dtype
  )

  if c.helpfulNumKey not in ratings.columns:
    ratings[c.helpfulNumKey] = 0.5
    ratings.loc[ratings[c.helpfulnessLevelKey] == "HELPFUL", c.helpfulNumKey] = 1.0
    ratings.loc[ratings[c.helpfulnessLevelKey] == "NOT_HELPFUL", c.helpfulNumKey] = 0.0

  ratingsToUse = pd.DataFrame(ratings[[c.noteIdKey, c.raterParticipantIdKey, c.helpfulNumKey]])
  raterModelOutputToUse = pd.DataFrame(
    {
      c.raterParticipantIdKey: raterIds,
      c.internalRaterFactor1Key: raterModelOutput[c.internalRaterFactor1Key],
    }
  )

  mergedRatings = ratingsToUse.merge(raterModelOutputToUse, on=c.raterParticipantIdKey)
  origLength = len(mergedRatings)
  mergedRatings = mergedRatings[mergedRatings[c.internalRaterFactor1Key].notna()]
  logger.info(
    f"dropped {origLength - len(mergedRatings)} out of {origLength} ratings due to NaN factor."
  )

  negFactorKey = "negFactor"
  posFactorKey = "posFactor"
  raterFactorBucketKey = "raterFactorBucket"

  # A factor of exactly zero lands in the positive bucket.
  mergedRatings[raterFactorBucketKey] = np.where(
    mergedRatings[c.internalRaterFactor1Key] < 0, negFactorKey, posFactorKey
  )

  # Rating counts per (note, bucket), pivoted so each bucket becomes a column.
  noteCountsByRaterSign = (
    mergedRatings.groupby([c.noteIdKey, raterFactorBucketKey])
    .size()
    .unstack(fill_value=0)
    .reset_index()
  ).rename(
    columns={negFactorKey: c.negFactorRatingCountKey, posFactorKey: c.posFactorRatingCountKey}
  )

  # A bucket that never occurs produces no column after unstack; add it as all zeros.
  if c.negFactorRatingCountKey not in noteCountsByRaterSign.columns:
    noteCountsByRaterSign[c.negFactorRatingCountKey] = 0
  if c.posFactorRatingCountKey not in noteCountsByRaterSign.columns:
    noteCountsByRaterSign[c.posFactorRatingCountKey] = 0

  noteCountsByRaterSign[c.minSignCountKey] = noteCountsByRaterSign[
    [c.negFactorRatingCountKey, c.posFactorRatingCountKey]
  ].min(axis=1)

  # Mean helpfulNum per (note, bucket); left unfilled so an empty bucket stays NaN.
  meanHelpfulnessByRaterSign = (
    mergedRatings.groupby([c.noteIdKey, raterFactorBucketKey])[c.helpfulNumKey]
    .mean()
    .unstack()
    .reset_index()
  ).rename(
    columns={negFactorKey: c.negFactorMeanHelpfulNumKey, posFactorKey: c.posFactorMeanHelpfulNumKey}
  )

  noteCountsByRaterSign = noteCountsByRaterSign.merge(
    meanHelpfulnessByRaterSign, on=[c.noteIdKey], how="left", unsafeAllowed=c.minSignCountKey
  )
  if c.negFactorMeanHelpfulNumKey not in noteCountsByRaterSign.columns:
    noteCountsByRaterSign[c.negFactorMeanHelpfulNumKey] = np.nan
  if c.posFactorMeanHelpfulNumKey not in noteCountsByRaterSign.columns:
    noteCountsByRaterSign[c.posFactorMeanHelpfulNumKey] = np.nan

  # Merge in net minority helpfulness
  counts = mergedRatings[[c.noteIdKey, raterFactorBucketKey, c.helpfulNumKey]]
  counts = pd.crosstab(
    index=counts[c.noteIdKey], columns=[counts[raterFactorBucketKey], counts[c.helpfulNumKey]]
  )
  # Flatten the (bucket, helpfulNum) column MultiIndex into names such as "negFactor_1.0".
  counts.columns = [f"{col1}_{col2}" for col1, col2 in counts.columns]
  counts = counts.reset_index(drop=False)
  # Guarantee every bucket/level combination exists even if it never occurred.
  for bucket in [negFactorKey, posFactorKey]:
    for level in [0.0, 0.5, 1.0]:
      col = f"{bucket}_{level}"
      if col not in counts:
        counts[col] = 0
  counts[f"{negFactorKey}_{c.netMinHelpfulKey}"] = (
    counts[f"{negFactorKey}_1.0"] - counts[f"{negFactorKey}_0.0"]
  )
  counts[f"{posFactorKey}_{c.netMinHelpfulKey}"] = (
    counts[f"{posFactorKey}_1.0"] - counts[f"{posFactorKey}_0.0"]
  )
  counts[c.netMinHelpfulKey] = (
    counts[[f"{negFactorKey}_{c.netMinHelpfulKey}", f"{posFactorKey}_{c.netMinHelpfulKey}"]]
    .min(axis=1)
    .clip(lower=0)
  )
  # Name the per-note total explicitly: value_counts() only labels its result "count"
  # on pandas >= 2.0, so renaming "count" -> "total" silently fails on older versions.
  totalRatingsPerNote = mergedRatings.groupby(c.noteIdKey).size().reset_index(name="total")
  counts = counts.merge(totalRatingsPerNote)
  counts[c.netMinHelpfulRatioKey] = counts[c.netMinHelpfulKey] / counts["total"]
  noteCountsByRaterSign = noteCountsByRaterSign.merge(
    counts[[c.noteIdKey, c.netMinHelpfulKey, c.netMinHelpfulRatioKey]]
  )

  # Downcast types from 64=>32
  downcastTypes = (
    (c.minSignCountKey, np.int32),
    (c.netMinHelpfulKey, np.int32),
    (c.netMinHelpfulRatioKey, np.float32),
    (c.negFactorRatingCountKey, np.int32),
    (c.posFactorRatingCountKey, np.int32),
    (c.negFactorMeanHelpfulNumKey, np.float32),
    (c.posFactorMeanHelpfulNumKey, np.float32),
  )
  for column, dtype in downcastTypes:
    noteCountsByRaterSign[column] = noteCountsByRaterSign[column].astype(dtype)

  # rename_axis(None, axis=1) clears any name left on the columns axis by unstack.
  return noteCountsByRaterSign[
    [
      c.noteIdKey,
      c.minSignCountKey,
      c.negFactorMeanHelpfulNumKey,
      c.posFactorMeanHelpfulNumKey,
      c.netMinHelpfulKey,
      c.netMinHelpfulRatioKey,
    ]
  ].rename_axis(None, axis=1)