in sourcecode/scoring/pflip_plus_model.py [0:0]
def _get_quick_rating_stats(self, notes: pd.DataFrame, ratings: pd.DataFrame) -> pd.DataFrame:
    """Return counts and ratios of how many ratings occurred in the first 1/5/15/60 minutes.

    For every cutoff in _RATING_TIME_BUCKETS (interpreted as minutes), a rating is
    counted when its timestamp is strictly earlier than the note's creation time plus
    the cutoff.  Each count is then divided by the note's total number of ratings.

    Args:
      notes: DF specifying note creation timestamps.  Must contain the c.noteIdKey and
        _NOTE_CREATION_MILLIS columns.
      ratings: DF specifying local rating timestamps.  Must contain the c.noteIdKey and
        c.createdAtMillisKey columns.

    Returns:
      DF keyed by c.noteIdKey, with rows following `notes`, restricted to the columns
      named in _QUICK_RATING_COLS (presumably the FIRST_<cutoff>_TOTAL and
      FIRST_<cutoff>_RATIO columns built here -- confirm against that constant).
      Notes with no ratings get totals of 0 and ratios of 0.
    """

    def _count_per_note(frame: pd.DataFrame, countCol: str) -> pd.DataFrame:
        # Name the count Series explicitly instead of renaming a "count" column:
        # value_counts() only emits the "count" label on pandas >= 2.0, so renaming it
        # silently does nothing on older versions.
        return frame[[c.noteIdKey]].value_counts().rename(countCol).reset_index()

    # Total ratings per note; notes without any rating are kept with a total of 0.
    ratingTotals = notes[[c.noteIdKey]].merge(_count_per_note(ratings, "total"), how="left")
    ratingTotals = ratingTotals.fillna({"total": 0}).astype(pd.Int64Dtype())

    # Pairing each rating with its note's creation time does not depend on the cutoff,
    # so do the rename + merge once rather than once per bucket.
    ratingTimes = ratings[[c.noteIdKey, c.createdAtMillisKey]].rename(
        columns={c.createdAtMillisKey: "ratingCreationMts"}
    )
    ratingTimes = ratingTimes.merge(notes[[c.noteIdKey, _NOTE_CREATION_MILLIS]])

    millisPerMinute = 1000 * 60
    for cutoff in _RATING_TIME_BUCKETS:
        beforeCutoff = ratingTimes[
            ratingTimes["ratingCreationMts"]
            < (ratingTimes[_NOTE_CREATION_MILLIS] + (millisPerMinute * cutoff))
        ]
        cutoffCount = _count_per_note(beforeCutoff, f"FIRST_{cutoff}_TOTAL")
        # Notes with no rating inside this window come back as NaN from the left merge.
        ratingTotals = ratingTotals.merge(cutoffCount, how="left").fillna(0)

    # The left merges may have produced float columns; restore nullable integers.
    ratingTotals = ratingTotals.astype(pd.Int64Dtype())
    for cutoff in _RATING_TIME_BUCKETS:
        # clip(lower=1) avoids dividing by zero for notes that have no ratings at all.
        ratingTotals[f"FIRST_{cutoff}_RATIO"] = ratingTotals[f"FIRST_{cutoff}_TOTAL"] / (
            ratingTotals["total"].clip(lower=1)
        )
    return ratingTotals[[c.noteIdKey] + _QUICK_RATING_COLS]