in data_measurements/npmi/npmi.py [0:0]
def __init__(self, dstats, identity_terms, load_only=False, use_cache=False,
             save=True):
    """Set up state for nPMI association measurements over a dataset.

    Args:
        dstats: Data measurements tool settings (dataset, config, etc.);
            supplies the tokenized dataframe, vocab counts, minimum vocab
            count, and the dataset cache directory.
        identity_terms: Full list of identity terms to measure.
        load_only: If True, never compute — caching only (when live, no).
        use_cache: If True, try the cache before calculating.
        save: If True, persist computed results.
    """
    self.dstats = dstats
    self.load_only = load_only
    self.use_cache = use_cache
    self.save = save
    # Tokenized sentences, pulled from the dataset's tokenized dataframe.
    self.tokenized_sentence_df = dstats.tokenized_df[TOKENIZED_FIELD]
    # Dataframe of shape #vocab x 1 (count).
    self.vocab_counts_df = dstats.vocab_counts_df
    # Minimum number of occurrences for a term to be included.
    self.min_count = dstats.min_vocab_count
    # Cache locations for this measurement.
    self.cache_path = pjoin(dstats.dataset_cache_dir, SING)
    self.avail_terms_json_fid = pjoin(self.cache_path,
                                      "identity_terms.json")
    # TODO: Users ideally can type in whatever words they want.
    # This is the full list of terms.
    self.identity_terms = identity_terms
    logs.info("Using term list:")
    logs.info(self.identity_terms)
    # Subset of identity_terms occurring more than MIN_VOCAB_COUNT times.
    self.avail_identity_terms = []
    # TODO: Let users specify
    self.open_class_only = True
    # Single-word association results.
    self.assoc_results_dict = defaultdict(dict)
    # Paired-term association bias results.
    self.bias_results_dict = defaultdict(dict)
    # Dataframes used in displays.
    self.bias_dfs_dict = defaultdict(dict)
    # Single-word associations plus their paired bias values, keyed by
    # term pair. Formatted as:
    # {(s1,s2)): {pd.DataFrame({s1-s2:diffs, s1:assoc, s2:assoc})}}
    self.results_dict = defaultdict(lambda: defaultdict(dict))
    # Cache filenames derived from the results.
    self.filenames_dict = defaultdict(dict)