in sourcecode/scoring/pflip_plus_model.py [0:0]
def _get_burst_rating_stats(self, notes: pd.DataFrame, ratings: pd.DataFrame) -> pd.DataFrame:
  """Summarize how tightly clustered in time each note's ratings are.

  For every window length in _RATING_TIME_BUCKETS (used as a number of minutes), find
  the largest number of ratings a note received inside a single time bucket of that
  length, and the ratio of that peak to the note's total rating count.

  Args:
    notes: DF listing the notes to report on; only the c.noteIdKey column is read.
    ratings: DF specifying local rating timestamps; the c.noteIdKey and
      c.createdAtMillisKey columns are read.

  Returns:
    DF holding c.noteIdKey plus the columns named in _BURST_RATING_COLS, with exactly
    as many rows as notes.
  """
  # NOTE(review): assumes c.createdAtMillisKey holds millisecond timestamps -- confirm.
  msPerMinute = 1000 * 60

  # Overall number of ratings per note; notes without any ratings end up with total 0.
  perNoteTotals = ratings[[c.noteIdKey]].value_counts().to_frame().reset_index(drop=False)
  perNoteTotals = perNoteTotals.rename(columns={"count": "total"})
  expectedRows = len(notes)
  burstStats = notes[[c.noteIdKey]].merge(perNoteTotals, how="left")
  burstStats = burstStats.fillna({"total": 0}).astype(pd.Int64Dtype())

  def _peak_bucket_counts(windowMinutes: int, shiftMinutes: int) -> pd.DataFrame:
    """Return each note's largest per-bucket rating count for one bucket alignment."""
    bucketed = ratings[[c.noteIdKey, c.createdAtMillisKey]].copy()
    # Shift the timestamps, then integer-divide by the window width so that every
    # rating is labelled with the id of the fixed-width bucket it falls into.
    shiftedMillis = bucketed[c.createdAtMillisKey] + (msPerMinute * shiftMinutes)
    bucketed[c.createdAtMillisKey] = shiftedMillis // (msPerMinute * windowMinutes)
    # Count ratings per (note, bucket) and keep only the fullest bucket of each note.
    perBucket = bucketed.value_counts().to_frame().reset_index(drop=False)
    perBucket = perBucket[[c.noteIdKey, "count"]]
    return perBucket.groupby(c.noteIdKey).max().reset_index(drop=False)

  for windowMinutes in _RATING_TIME_BUCKETS:
    totalCol = f"BURST_{windowMinutes}_TOTAL"
    ratioCol = f"BURST_{windowMinutes}_RATIO"
    # Fixed buckets can split a burst across a boundary, so try every one-minute
    # alignment of the bucket grid and keep the best count seen for each note.
    alignmentPeaks = [
      _peak_bucket_counts(windowMinutes, shiftMinutes) for shiftMinutes in range(windowMinutes)
    ]
    windowPeaks = pd.concat(alignmentPeaks).groupby(c.noteIdKey).max().reset_index(drop=False)
    windowPeaks = windowPeaks.rename(columns={"count": totalCol}).astype(pd.Int64Dtype())
    # Notes with no ratings are missing from windowPeaks; give them a peak of 0.
    burstStats = burstStats.merge(windowPeaks, how="left").fillna({totalCol: 0})
    # Clip the denominator to 1 so that unrated notes get a ratio of 0, not a 0/0.
    burstStats[ratioCol] = burstStats[totalCol] / burstStats["total"].clip(lower=1)

  # The left joins above must never add or drop note rows.
  assert (
    len(burstStats) == expectedRows
  ), f"unexpected length mismatch: {len(burstStats)} vs. {expectedRows}"
  return burstStats[[c.noteIdKey] + _BURST_RATING_COLS]