in data_measurements/dataset_statistics.py [0:0]
def load_or_prepare_vocab(self, load_only=False):
    """
    Loads the vocabulary counts from cache, or builds them from the
    tokenized text.
    The resulting dataframes may be used in nPMI calculations, zipf, etc.

    If caching is enabled and the vocab file already exists, the counts are
    read back through ``self.load_vocab()`` (defined elsewhere; it is
    expected to fill ``self.vocab_counts_df``). Otherwise, unless
    ``load_only`` is set, the counts are recomputed from
    ``self.tokenized_df`` -- which is prepared first when it is still
    ``None`` -- and written to ``self.vocab_counts_df_fid`` if ``self.save``
    is truthy. In both cases the filtered view is derived with
    ``filter_vocab``.

    :param load_only: When True, nothing is recomputed; if the cached file
        cannot be used, the vocab attributes are left untouched.
    :return: None. Results are kept on ``self.vocab_counts_df`` and
        ``self.vocab_counts_filtered_df``.
    """
    cache_usable = self.use_cache and exists(self.vocab_counts_df_fid)

    if cache_usable:
        logs.info("Reading vocab from cache")
        self.load_vocab()
        self.vocab_counts_filtered_df = filter_vocab(self.vocab_counts_df)
    elif not load_only:
        # Building the vocabulary starts with tokenizing.
        if self.tokenized_df is None:
            self.load_or_prepare_tokenized_df(load_only=False)

        logs.info("Calculating vocab afresh")
        raw_counts = count_vocab_frequencies(self.tokenized_df)

        logs.info("Making dfs with proportion.")
        self.vocab_counts_df = calc_p_word(raw_counts)
        self.vocab_counts_filtered_df = filter_vocab(self.vocab_counts_df)

        if self.save:
            logs.info("Writing out.")
            ds_utils.write_df(self.vocab_counts_df, self.vocab_counts_df_fid)

    # Report both frames whichever branch ran. Attributes are looked up one
    # at a time, right before they are logged.
    report = (
        ("unfiltered vocab", "vocab_counts_df"),
        ("filtered vocab", "vocab_counts_filtered_df"),
    )
    for heading, attr_name in report:
        logs.info(heading)
        logs.info(getattr(self, attr_name))