in npmi/npmi.py [0:0]
def _compute(self, references, vocab_counts, subgroup):
if isinstance(vocab_counts, dict):
vocab_counts_df = pd.DataFrame.from_dict(vocab_counts,
orient='index',
columns=[CNT])
elif isinstance(vocab_counts, pd.DataFrame):
vocab_counts_df = vocab_counts
else:
print("Can't support the data structure for the vocab counts. =(")
return
# These are used throughout the rest of the functions
self.references = references
self.vocab_counts_df = vocab_counts_df
self.vocab_counts_df[PROP] = vocab_counts_df[CNT] / sum(
vocab_counts_df[CNT])
# self.mlb_list holds num batches x num_sentences
self.mlb_list = []
# Index of the subgroup word in the sparse vector
subgroup_idx = vocab_counts_df.index.get_loc(subgroup)
print("Calculating co-occurrences...")
df_coo = self.calc_cooccurrences(subgroup, subgroup_idx)
vocab_cooc_df = self.set_idx_cols(df_coo, subgroup)
print("Calculating PMI...")
pmi_df = self.calc_PMI(vocab_cooc_df, subgroup)
print("Calculating nPMI...")
npmi_df = self.calc_nPMI(pmi_df, vocab_cooc_df, subgroup)
npmi_bias = npmi_df.max(axis=0) + abs(npmi_df.min(axis=0))
return {"bias": npmi_bias, "co-occurrences": vocab_cooc_df,
"pmi": pmi_df, "npmi": npmi_df}