in data_measurements/dataset_statistics.py [0:0]
def prepare_general_stats(self):
    # Lazily prepare the inputs this summary depends on.
    if self.tokenized_df is None:
        logs.warning("Tokenized dataset not yet loaded; doing so.")
        self.load_or_prepare_tokenized_df()
    if self.vocab_counts_df is None:
        logs.warning("Vocab not yet loaded; doing so.")
        self.load_or_prepare_vocab()
    # Top-N most frequent words from the filtered vocabulary.
    self.sorted_top_vocab_df = self.vocab_counts_filtered_df.sort_values(
        "count", ascending=False
    ).head(_TOP_N)
    # Distinct word types in the full and in the filtered vocabulary.
    self.total_words = len(self.vocab_counts_df)
    self.total_open_words = len(self.vocab_counts_filtered_df)
    # Total number of null cells across the tokenized dataframe.
    self.text_nan_count = int(self.tokenized_df.isnull().sum().sum())
    # Computes duplicate statistics, including self.dups_frac used below.
    self.load_or_prepare_text_duplicates()
    self.general_stats_dict = {
        TOT_WORDS: self.total_words,
        TOT_OPEN_WORDS: self.total_open_words,
        TEXT_NAN_CNT: self.text_nan_count,
        td.DUPS_FRAC: self.dups_frac,
    }