in data_measurements/dataset_statistics.py [0:0]
def load_or_prepare_general_stats(self, load_only=False):
    """
    Load or compute the content for the expander_general_stats widget.

    The general statistics cover total words, total open words,
    the sorted top vocab, the NaN count, and the duplicate count.

    Args:
        load_only: If True, the statistics are never computed here; they
            are only loaded when caching is enabled and both cache files
            exist. The flag is also forwarded to
            load_or_prepare_text_duplicates.

    Returns:
        None. self.general_stats_dict is updated with
        self.duplicates_results, and the remaining statistics are either
        loaded via load_general_stats or computed via
        prepare_general_stats. Freshly computed results are written to
        sorted_top_vocab_df_fid and general_stats_json_fid when self.save
        is set.
    """
    # Text duplicates get no files of their own for the general
    # statistics: only the text duplicate fraction is stored, inside the
    # "general" file, hence save=False. The full list of duplicates is
    # not needed here either, hence list_duplicates=False.
    self.load_or_prepare_text_duplicates(
        load_only=load_only, save=False, list_duplicates=False
    )
    logs.info("Duplicates results:")
    logs.info(self.duplicates_results)
    self.general_stats_dict.update(self.duplicates_results)

    # TODO: Tighten the rest of this similar to text_duplicates.
    # The cache is usable only if caching is on AND both files are present.
    cache_usable = (
        self.use_cache
        and exists(self.general_stats_json_fid)
        and exists(self.sorted_top_vocab_df_fid)
    )
    if cache_usable:
        logs.info("Loading cached general stats")
        self.load_general_stats()
        return

    # Without a usable cache, load-only callers get nothing beyond the
    # duplicate results merged above.
    if load_only:
        return

    logs.info("Preparing general stats")
    self.prepare_general_stats()
    if not self.save:
        return

    # Persist the vocab table first, then the stats dictionary.
    ds_utils.write_df(self.sorted_top_vocab_df, self.sorted_top_vocab_df_fid)
    ds_utils.write_json(self.general_stats_dict, self.general_stats_json_fid)