sourcecode/scoring/pflip_model.py [312:371]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  def _label_notes(
    self,
    noteStatusHistory: pd.DataFrame,
  ) -> pd.DataFrame:
    """Generate a DataFrame mapping noteIds to labels.

    Labeling rules, applied in order (later rules overwrite earlier ones):
      * CRH: the note is locked to CRH.
      * FLIP: the note is locked to NMR or CRNH and its first non-NMR label was CRH.
      * FLIP: the note is locked to NMR or CRNH, never received a non-NMR label, but has
        timestampMillisOfFirstNmrDueToMinStableCrhTime set (i.e. it was on track to be CRH).

    Notes matching none of the rules are dropped.  Notes labeled CRH whose currentDecidedBy
    starts with "ScoringDriftGuard" are also dropped, since that indicates the model wanted
    to score the note as NMR (and therefore it is unclear whether the note is best labeled
    CRH or FLIP).  A missing currentDecidedBy value is treated as "not ScoringDriftGuard".

    Args:
      noteStatusHistory: pd.DataFrame used to determine locked status and whether there was
        a prior CRH status.

    Returns:
      pd.DataFrame with noteId and LABEL columns
    """
    # Assemble relevant data for labeling
    labels = noteStatusHistory[
      [
        c.noteIdKey,
        # If set, implies note was on track to be CRH at some point
        c.timestampMillisOfFirstNmrDueToMinStableCrhTimeKey,
        # If set to CRH, implies note was actually CRH at some point
        c.firstNonNMRLabelKey,
        # Use to determine final status and whether note is locked
        c.lockedStatusKey,
        # If set to ScoringDriftGuard, indicates note may be prone to flipping
        c.currentDecidedByKey,
      ]
    ].copy()
    labels[LABEL] = pd.NA
    # Both FLIP rules require the note to be locked to a non-CRH status; compute that once.
    lockedToNonCrh = labels[c.lockedStatusKey].isin(
      {c.needsMoreRatings, c.currentlyRatedNotHelpful}
    )
    labels.loc[labels[c.lockedStatusKey] == c.currentlyRatedHelpful, LABEL] = CRH
    labels.loc[
      (labels[c.firstNonNMRLabelKey] == c.currentlyRatedHelpful) & lockedToNonCrh,
      LABEL,
    ] = FLIP
    labels.loc[
      labels[c.timestampMillisOfFirstNmrDueToMinStableCrhTimeKey].notna()
      & labels[c.firstNonNMRLabelKey].isna()
      & lockedToNonCrh,
      LABEL,
    ] = FLIP
    # Pass the subset as a list so this also works on pandas versions that reject a scalar.
    labels = labels.dropna(subset=[LABEL])
    logger.info(f"labels before ScoringDriftGuard:\n{labels[LABEL].value_counts(dropna=False)}")
    # Note that we don't exclude notes decided by ScoringDriftGuard when a note locks to NMR
    # after being CRH and is now decided by ScoringDriftGuard (implying the note was once again
    # scored as CRH) because in that case the note is labeled as FLIP and the involvement of
    # ScoringDriftGuard only provides further evidence that the note flips status.
    #
    # The prefix check is vectorized and tolerant of missing values: casting to str first
    # keeps the .str accessor valid even if the column holds no strings at all, and na=False
    # guarantees a plain boolean mask.  Calling decider.startswith() on each raw value would
    # raise AttributeError as soon as any labeled row has a NaN/None currentDecidedBy.
    decidedByDriftGuard = (
      labels[c.currentDecidedByKey].astype(str).str.startswith("ScoringDriftGuard", na=False)
    )
    dropRows = (labels[LABEL] == CRH) & decidedByDriftGuard
    labels = labels[~dropRows][[c.noteIdKey, LABEL]]
    logger.info(f"labels after ScoringDriftGuard:\n{labels[LABEL].value_counts(dropna=False)}")
    return labels
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -


sourcecode/scoring/pflip_plus_model.py [356:415]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  def _label_notes(
    self,
    noteStatusHistory: pd.DataFrame,
  ) -> pd.DataFrame:
    """Generate a DataFrame mapping noteIds to labels.

    Labeling rules, applied in order (later rules overwrite earlier ones):
      * CRH: the note is locked to CRH.
      * FLIP: the note is locked to NMR or CRNH and its first non-NMR label was CRH.
      * FLIP: the note is locked to NMR or CRNH, never received a non-NMR label, but has
        timestampMillisOfFirstNmrDueToMinStableCrhTime set (i.e. it was on track to be CRH).

    Notes matching none of the rules are dropped.  Notes labeled CRH whose currentDecidedBy
    starts with "ScoringDriftGuard" are also dropped, since that indicates the model wanted
    to score the note as NMR (and therefore it is unclear whether the note is best labeled
    CRH or FLIP).  A missing currentDecidedBy value is treated as "not ScoringDriftGuard".

    Args:
      noteStatusHistory: pd.DataFrame used to determine locked status and whether there was
        a prior CRH status.

    Returns:
      pd.DataFrame with noteId and LABEL columns
    """
    # Assemble relevant data for labeling
    labels = noteStatusHistory[
      [
        c.noteIdKey,
        # If set, implies note was on track to be CRH at some point
        c.timestampMillisOfFirstNmrDueToMinStableCrhTimeKey,
        # If set to CRH, implies note was actually CRH at some point
        c.firstNonNMRLabelKey,
        # Use to determine final status and whether note is locked
        c.lockedStatusKey,
        # If set to ScoringDriftGuard, indicates note may be prone to flipping
        c.currentDecidedByKey,
      ]
    ].copy()
    labels[LABEL] = pd.NA
    # Both FLIP rules require the note to be locked to a non-CRH status; compute that once.
    lockedToNonCrh = labels[c.lockedStatusKey].isin(
      {c.needsMoreRatings, c.currentlyRatedNotHelpful}
    )
    labels.loc[labels[c.lockedStatusKey] == c.currentlyRatedHelpful, LABEL] = CRH
    labels.loc[
      (labels[c.firstNonNMRLabelKey] == c.currentlyRatedHelpful) & lockedToNonCrh,
      LABEL,
    ] = FLIP
    labels.loc[
      labels[c.timestampMillisOfFirstNmrDueToMinStableCrhTimeKey].notna()
      & labels[c.firstNonNMRLabelKey].isna()
      & lockedToNonCrh,
      LABEL,
    ] = FLIP
    # Pass the subset as a list so this also works on pandas versions that reject a scalar.
    labels = labels.dropna(subset=[LABEL])
    logger.info(f"labels before ScoringDriftGuard:\n{labels[LABEL].value_counts(dropna=False)}")
    # Note that we don't exclude notes decided by ScoringDriftGuard when a note locks to NMR
    # after being CRH and is now decided by ScoringDriftGuard (implying the note was once again
    # scored as CRH) because in that case the note is labeled as FLIP and the involvement of
    # ScoringDriftGuard only provides further evidence that the note flips status.
    #
    # The prefix check is vectorized and tolerant of missing values: casting to str first
    # keeps the .str accessor valid even if the column holds no strings at all, and na=False
    # guarantees a plain boolean mask.  Calling decider.startswith() on each raw value would
    # raise AttributeError as soon as any labeled row has a NaN/None currentDecidedBy.
    decidedByDriftGuard = (
      labels[c.currentDecidedByKey].astype(str).str.startswith("ScoringDriftGuard", na=False)
    )
    dropRows = (labels[LABEL] == CRH) & decidedByDriftGuard
    labels = labels[~dropRows][[c.noteIdKey, LABEL]]
    logger.info(f"labels after ScoringDriftGuard:\n{labels[LABEL].value_counts(dropna=False)}")
    return labels
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -