in npmi/npmi.py [0:0]
def calc_cooccurrences(self, subgroup, subgroup_idx):
    """Sum, over all batches, the co-occurrence counts of every word with a subgroup.

    For each batch in ``self.mlb_list`` the sentences in which the subgroup
    column is positive are selected, and the transposed selection is
    multiplied with the subgroup's per-sentence counts. The per-batch
    results are added together.

    Args:
        subgroup: Name of the subgroup term. Kept for interface
            compatibility; it does not influence the returned counts (the
            previous implementation only assigned it to a ``columns``
            attribute of the selected column, which has no effect on the
            result when ``subgroup_idx`` is a single column label).
        subgroup_idx: Label of the subgroup's column in the per-batch
            frame. Expected to be a single (scalar) label so that selecting
            it yields one column usable as a boolean row filter.

    Returns:
        A ``pd.DataFrame`` with one row per column of the batch frames and a
        single column of summed co-occurrence counts. An empty DataFrame is
        returned when there are no batches.
    """
    # Big computation here! Should only happen once.
    print(
        "Approaching big computation! Here, we binarize all words in the sentences, making a sparse matrix of sentences."
    )
    if not self.mlb_list:
        self._binarize_words_in_sentence()

    num_batches = len(self.mlb_list)
    coo_df = None
    # Each batch is the list of all the sentences (list of vocab) in it.
    for batch_id, batch_sentence_row in enumerate(self.mlb_list):
        print(
            "%s of %s co-occurrence count batches"
            % (str(batch_id), str(num_batches))
        )
        # Dataframe of # sentences in batch x vocabulary size
        sent_batch_df = pd.DataFrame(batch_sentence_row)
        # Subgroup counts per-sentence for the given batch
        subgroup_counts = sent_batch_df[subgroup_idx]
        # Keep only the sentences where the count of the subgroup is > 0.
        # This way we have less computation & resources needs. The mask is
        # built once and reused for both the counts and the sentence rows.
        has_subgroup = subgroup_counts > 0
        subgroup_counts = subgroup_counts[has_subgroup]
        mlb_subgroup_only = sent_batch_df[has_subgroup]
        # Create cooccurrence matrix for the given subgroup and all words.
        batch_coo_df = pd.DataFrame(mlb_subgroup_only.T.dot(subgroup_counts))
        # Accumulate the per-batch counts into a single running total.
        if coo_df is None:
            coo_df = batch_coo_df
        else:
            coo_df = coo_df.add(batch_coo_df, fill_value=0)
    print("Returning co-occurrence matrix")
    return pd.DataFrame(coo_df)