in data_measurements/npmi/npmi.py [0:0]
def count_words_per_sentence(self):
    """Binarize vocabulary occurrence per sentence, one batch at a time.

    The rows of ``self.tokenized_sentence_df`` are split into consecutive
    slices using ``NUM_BATCHES`` evenly spaced integer boundaries (so
    ``NUM_BATCHES - 1`` slices), and each slice is passed through a
    ``MultiLabelBinarizer`` whose classes are ``self.vocabulary``.

    Returns:
        A list with one entry per batch. Each entry is the output of
        ``MultiLabelBinarizer.fit_transform`` for that slice: an indicator
        matrix of shape (# sentences in batch) x (# vocabulary items).
        The binarizer marks presence, not the number of repetitions.

    NOTE(review): each row is presumably an iterable of tokens for one
    sentence -- confirm against the caller that builds
    ``tokenized_sentence_df``.
    """
    logs.info("Creating co-occurrence matrix for nPMI calculations.")
    logs.info(self.tokenized_sentence_df)
    # Boundary indices; adjacent pairs delimit one batch of rows each.
    batches = np.linspace(0, self.tokenized_sentence_df.shape[0],
                          NUM_BATCHES).astype(int)
    # There is one fewer batch than there are boundaries.
    num_batches = len(batches) - 1
    # The class list is fixed, so a single binarizer serves every batch
    # instead of being rebuilt on each iteration.
    mlb = MultiLabelBinarizer(classes=self.vocabulary)
    word_cnts_per_sentence = []
    for batch_num, (start, stop) in enumerate(
            zip(batches[:-1], batches[1:])):
        if batch_num % 100 == 0:
            logs.debug(
                "%s of %s sentence binarize batches." % (
                    batch_num, num_batches)
            )
        # Rows of this batch -> (batch size) x (# words) indicator matrix.
        sentence_batch = self.tokenized_sentence_df[start:stop]
        word_cnts_per_sentence.append(mlb.fit_transform(sentence_batch))
    return word_cnts_per_sentence