in data_measurements/npmi/npmi.py [0:0]
def calc_cooccurrences(self, subgroup, subgroup_idx):
    """Count co-occurrences of `subgroup` with every vocabulary word.

    Iterates over the per-sentence word-count batches in
    `self.word_cnts_per_sentence`, and for each batch restricts to the
    sentences where the subgroup actually occurs, then accumulates the
    (vocab x 1) co-occurrence counts across batches.

    Args:
        subgroup: Label used as the column name for the subgroup counts.
        subgroup_idx: Column index of the subgroup in each batch dataframe
            (assumed to match the vocabulary ordering — verify at caller).

    Returns:
        pd.DataFrame: Indexed like `self.vocab_counts_df`, with a single
        integer "count" column of co-occurrence counts.
    """
    # Accumulator; stays None until the first batch is processed.
    coo_df = None
    num_batches = len(self.word_cnts_per_sentence)
    # Big computation here! Should only happen once.
    logs.debug(
        "Approaching big computation! Here, we binarize all words in the "
        "sentences, making a sparse matrix of sentences."
    )
    for batch_id, batch_sentence_row in enumerate(self.word_cnts_per_sentence):
        # Every 100 batches, print out the progress.
        if not batch_id % 100:
            logs.debug(
                "%s of %s co-occurrence count batches"
                % (str(batch_id), str(num_batches))
            )
        # Dataframe of (# sentences in batch) x (vocabulary size).
        sent_batch_df = pd.DataFrame(batch_sentence_row)
        # Subgroup counts per-sentence for the given batch.
        subgroup_df = sent_batch_df[subgroup_idx]
        subgroup_df.columns = [subgroup]
        # Drop sentences where the subgroup never occurs: they contribute
        # nothing to the dot product, and skipping them saves computation.
        subgroup_df = subgroup_df[subgroup_df > 0]
        mlb_subgroup_only = sent_batch_df[sent_batch_df[subgroup_idx] > 0]
        # Co-occurrence counts for this batch: vocab-by-subgroup dot product.
        batch_coo_df = pd.DataFrame(mlb_subgroup_only.T.dot(subgroup_df))
        # Accumulate across batches; fill_value=0 keeps rows present in
        # only one operand instead of producing NaN.
        if coo_df is None:
            coo_df = batch_coo_df
        else:
            coo_df = coo_df.add(batch_coo_df, fill_value=0)
    logs.debug("Made co-occurrence matrix")
    logs.debug(coo_df)
    # Re-attach the vocabulary index and coerce accumulated floats
    # (introduced by `add(..., fill_value=0)`) back to integer counts.
    count_df = coo_df.set_index(self.vocab_counts_df.index)
    count_df.columns = ["count"]
    count_df["count"] = count_df["count"].astype(int)
    return count_df