def _get_burst_rating_stats()

in sourcecode/scoring/pflip_plus_model.py [0:0]


  def _get_burst_rating_stats(self, notes: pd.DataFrame, ratings: pd.DataFrame) -> pd.DataFrame:
    """Measure how concentrated in time each note's ratings are.

    For every window length in _RATING_TIME_BUCKETS (used here as a number of
    minutes), find the largest number of ratings a note received inside any single
    window of that length, and express that peak as a fraction of all ratings on
    the note.  Windows are fixed-size buckets; to avoid missing a burst that
    straddles a bucket boundary, the bucketing is repeated once per whole-minute
    shift of the bucket grid and the maximum over all shifts is kept.

    Args:
      notes: DF containing a c.noteIdKey column.  The result has exactly one row
        per row of this DF (enforced by an assertion).
      ratings: DF containing c.noteIdKey and c.createdAtMillisKey columns, one row
        per rating.

    Returns:
      DF with c.noteIdKey followed by the columns listed in _BURST_RATING_COLS
      (presumably the BURST_<n>_TOTAL / BURST_<n>_RATIO columns built below --
      confirm against the constant's definition).
    """
    millisPerMinute = 1000 * 60
    initialNotes = len(notes)

    # Overall rating count per note.  NOTE: relies on value_counts() labelling its
    # result "count", which is the pandas >= 2.0 behavior.
    perNoteCounts = ratings[[c.noteIdKey]].value_counts().to_frame().reset_index()
    perNoteCounts = perNoteCounts.rename(columns={"count": "total"})

    # Left-join onto notes so that notes without any ratings are kept with total=0.
    ratingTotals = notes[[c.noteIdKey]].merge(perNoteCounts, how="left")
    ratingTotals = ratingTotals.fillna({"total": 0})
    ratingTotals = ratingTotals.astype(pd.Int64Dtype())

    def _maxBucketCountPerNote(windowMinutes: int, shiftMinutes: int) -> pd.DataFrame:
      """Return each note's fullest bucket size for one alignment of the bucket grid."""
      bucketed = ratings[[c.noteIdKey, c.createdAtMillisKey]].copy()
      shiftedMillis = bucketed[c.createdAtMillisKey] + (millisPerMinute * shiftMinutes)
      # Replace the timestamp with the index of the bucket it falls into.
      bucketed[c.createdAtMillisKey] = shiftedMillis // (millisPerMinute * windowMinutes)
      # Count ratings per (note, bucket), then keep the largest bucket for each note.
      perBucket = bucketed.value_counts().to_frame().reset_index()
      perBucket = perBucket[[c.noteIdKey, "count"]]
      return perBucket.groupby(c.noteIdKey).max().reset_index()

    for cutoff in _RATING_TIME_BUCKETS:
      totalCol = f"BURST_{cutoff}_TOTAL"
      ratioCol = f"BURST_{cutoff}_RATIO"

      # One result per whole-minute shift of the grid; the peak is the max over all.
      perShiftPeaks = [_maxBucketCountPerNote(cutoff, shift) for shift in range(cutoff)]
      windowPeaks = pd.concat(perShiftPeaks).groupby(c.noteIdKey).max().reset_index()
      windowPeaks = windowPeaks.rename(columns={"count": totalCol})
      windowPeaks = windowPeaks.astype(pd.Int64Dtype())

      # Notes with no ratings have no peak row; treat their peak as 0.
      ratingTotals = ratingTotals.merge(windowPeaks, how="left").fillna({totalCol: 0})
      # Clip the denominator so unrated notes produce a ratio of 0 instead of 0/0.
      ratingTotals[ratioCol] = ratingTotals[totalCol] / ratingTotals["total"].clip(lower=1)

    # The left merges above must never add or drop note rows.
    assert (
      len(ratingTotals) == initialNotes
    ), f"unexpected length mismatch: {len(ratingTotals)} vs. {initialNotes}"
    return ratingTotals[[c.noteIdKey] + _BURST_RATING_COLS]