in run_data_measurements.py
def load_or_prepare(dataset_args, calculation=False, use_cache=False):
    """Load cached measurement results or compute them for one dataset.

    If `calculation` is falsy, the standard set of measurements is run;
    otherwise only the named measurement is run. Embeddings and
    perplexities are only computed when explicitly requested, since they
    take a while.
    """
    # TODO: Catch exceptions for each measurement, so that an error in one
    # measurement doesn't break the calculation of all of them
    # (see the wrapper sketch after this function).
    do_all = False
    dstats = dataset_statistics.DatasetStatisticsCacheClass(**dataset_args,
                                                            use_cache=use_cache)
    logs.info("Tokenizing dataset.")
    dstats.load_or_prepare_tokenized_df()
    logs.info("Calculating vocab.")
    dstats.load_or_prepare_vocab()

    if not calculation:
        do_all = True

    if do_all or calculation == "general":
        logs.info("\n* Calculating general statistics.")
        dstats.load_or_prepare_general_stats()
        logs.info("Done!")
        logs.info("Basic text statistics now available at %s."
                  % dstats.general_stats_json_fid)

    if do_all or calculation == "duplicates":
        logs.info("\n* Calculating text duplicates.")
        dstats.load_or_prepare_text_duplicates()
        duplicates_fid_dict = dstats.duplicates_files
        logs.info("If all went well, then results are in the following files:")
        for key, value in duplicates_fid_dict.items():
            logs.info("%s: %s" % (key, value))

    if do_all or calculation == "lengths":
        logs.info("\n* Calculating text lengths.")
        dstats.load_or_prepare_text_lengths()
        length_fid_dict = dstats.length_obj.get_filenames()
        logs.info("If all went well, then results are in the following files:")
        for key, value in length_fid_dict.items():
            logs.info("%s: %s" % (key, value))

    if do_all or calculation == "labels":
        logs.info("\n* Calculating label statistics.")
        if dstats.label_field not in dstats.dset.features:
            logs.warning("No label field found.")
            logs.info("No label statistics to calculate.")
        else:
            dstats.load_or_prepare_labels()
            label_fid_dict = dstats.label_files
            logs.info("If all went well, then results are in the following files:")
            for key, value in label_fid_dict.items():
                logs.info("%s: %s" % (key, value))

    if do_all or calculation == "npmi":
        logs.info("\n* Preparing nPMI.")
        dstats.load_or_prepare_npmi()
        npmi_fid_dict = dstats.npmi_files
        logs.info("If all went well, then results are in the following files:")
        for key, value in npmi_fid_dict.items():
            if isinstance(value, dict):
                logs.info("%s:" % key)
                for key2, value2 in value.items():
                    logs.info("\t%s: %s" % (key2, value2))
            else:
                logs.info("%s: %s" % (key, value))

    if do_all or calculation == "zipf":
        logs.info("\n* Preparing Zipf.")
        dstats.load_or_prepare_zipf()
        logs.info("Done!")
        zipf_json_fid, zipf_fig_json_fid, zipf_fig_html_fid = zipf.get_zipf_fids(
            dstats.dataset_cache_dir)
        logs.info("Zipf results now available at %s." % zipf_json_fid)
        logs.info("Figure saved to %s, with corresponding json at %s."
                  % (zipf_fig_html_fid, zipf_fig_json_fid))

    # Only run these when specifically requested -- they take a while.
    if calculation == "embeddings":
        logs.info("\n* Preparing text embeddings.")
        dstats.load_or_prepare_embeddings()

    if calculation == "perplexities":
        logs.info("\n* Preparing text perplexities.")
        dstats.load_or_prepare_text_perplexities()
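
# --- Sketch: per-measurement error handling (not part of the original file) ---
# A minimal illustration of the TODO above: wrap each measurement step so that
# a failure in one measurement is logged and skipped instead of aborting the
# whole run. `_run_measurement` is a hypothetical helper name; it assumes only
# the module-level `logs` logger that load_or_prepare() already uses.
def _run_measurement(name, fn, *args, **kwargs):
    """Run one measurement callable, logging (rather than raising) failures."""
    try:
        return fn(*args, **kwargs)
    except Exception:
        logs.exception("Measurement '%s' failed; continuing with the rest.", name)
        return None

# Example: inside load_or_prepare(), the direct calls could then become
#     _run_measurement("duplicates", dstats.load_or_prepare_text_duplicates)
#     _run_measurement("zipf", dstats.load_or_prepare_zipf)
# so that an error in one measurement no longer breaks the others.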