in sourcecode/scoring/topic_model.py [0:0]
def _prepare_post_text(self, notes: pd.DataFrame) -> pd.DataFrame:
    """Concatenate all notes within each post into a single row associated with the post.

    Args:
      notes: dataframe containing raw note text

    Returns:
      DataFrame with one post per row containing note text
    """
    # Fill missing summaries with the empty string so the join never sees NaN,
    # then collapse every note summary on a post into one space-joined string.
    summariesByPost = (
        notes[[c.tweetIdKey, c.summaryKey]]
        .fillna({c.summaryKey: ""})
        .groupby(c.tweetIdKey)[c.summaryKey]
        .agg(" ".join)
        .reset_index(drop=False)
    )
    # Default tokenization for CountVectorizer will not split on underscore, which
    # results in very long tokens containing many words inside of URLs. Removing
    # underscores allows us to keep default splitting while fixing that problem.
    summariesByPost[c.summaryKey] = summariesByPost[c.summaryKey].str.replace(
        "_", " ", regex=False
    )
    return summariesByPost