in data_measurements/dataset_statistics.py [0:0]
def count_vocab_frequencies(tokenized_df):
    """
    Count the total number of occurrences of every vocabulary word.

    :param tokenized_df: pandas DataFrame whose TOKENIZED_FIELD column holds
        the previously tokenized text, one entry per row.
    :return: DataFrame with one row per vocabulary word (index named WORD)
        and a single CNT column holding that word's total count summed over
        all rows of ``tokenized_df``.
    """
    # The text was tokenized upstream, so the vectorizer's own tokenizer and
    # preprocessor are replaced by the module-level `dummy` callable
    # (presumably a pass-through -- confirm against its definition).
    cvec = CountVectorizer(
        tokenizer=dummy,
        preprocessor=dummy,
    )
    logs.info(
        "Fitting dummy tokenization to make matrix using the previous tokenization"
    )
    # Learn the vocabulary and build the document-term matrix in one pass.
    document_matrix = cvec.fit_transform(tokenized_df[TOKENIZED_FIELD])
    # Sum the columns on the sparse matrix itself. This never materializes a
    # dense (rows x vocabulary) array, so no manual row batching is needed,
    # and it always yields one total per vocabulary word.
    # np.asarray(...).ravel() flattens the (1 x vocabulary) result to 1-D.
    word_totals = np.asarray(document_matrix.sum(axis=0)).ravel()
    # Build a single-row frame keyed by word, then flip it so that each word
    # becomes a row.
    word_count_df = pd.DataFrame(
        [word_totals], columns=cvec.get_feature_names_out()
    ).transpose()
    # Now organize everything into the dataframes
    word_count_df.columns = [CNT]
    word_count_df.index.name = WORD
    return word_count_df