def load_or_prepare()

in run_data_measurements.py


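This excerpt relies on names defined elsewhere in run_data_measurements.py: the `logs` logger, the `dataset_statistics` module, and the `zipf` module. A minimal sketch of the assumed imports follows; the module paths are illustrative guesses and may not match the repository layout exactly:

import logging

# Illustrative paths only -- the repository may organize these modules differently.
from data_measurements import dataset_statistics
from data_measurements import zipf

logs = logging.getLogger(__name__)
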
def load_or_prepare(dataset_args, calculation=False, use_cache=False):
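    # `dataset_args` is passed straight through to DatasetStatisticsCacheClass.
    # `calculation` selects a single measurement to run ("general", "duplicates",
    # "lengths", "labels", "npmi", "zipf", "embeddings", or "perplexities");
    # a falsy value runs every measurement except embeddings and perplexities,
    # which must be requested explicitly because they are slow.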
    # TODO: Catch error exceptions for each measurement, so that an error
    # for one measurement doesn't break the calculation of all of them.

    do_all = False
    dstats = dataset_statistics.DatasetStatisticsCacheClass(**dataset_args,
                                                            use_cache=use_cache)
    logs.info("Tokenizing dataset.")
    dstats.load_or_prepare_tokenized_df()
    logs.info("Calculating vocab.")
    dstats.load_or_prepare_vocab()

    if not calculation:
        do_all = True

    if do_all or calculation == "general":
        logs.info("\n* Calculating general statistics.")
        dstats.load_or_prepare_general_stats()
        logs.info("Done!")
        logs.info(
            "Basic text statistics now available at %s." % dstats.general_stats_json_fid)

    if do_all or calculation == "duplicates":
        logs.info("\n* Calculating text duplicates.")
        dstats.load_or_prepare_text_duplicates()
        duplicates_fid_dict = dstats.duplicates_files
        logs.info("If all went well, then results are in the following files:")
        for key, value in duplicates_fid_dict.items():
            logs.info("%s: %s" % (key, value))

    if do_all or calculation == "lengths":
        logs.info("\n* Calculating text lengths.")
        dstats.load_or_prepare_text_lengths()
        length_fid_dict = dstats.length_obj.get_filenames()
        print("If all went well, then results are in the following files:")
        for key, value in length_fid_dict.items():
            print("%s: %s" % (key, value))
        print()

    if do_all or calculation == "labels":
        logs.info("\n* Calculating label statistics.")
        if dstats.label_field not in dstats.dset.features:
            logs.warning("No label field found.")
            logs.info("No label statistics to calculate.")
        else:
            dstats.load_or_prepare_labels()
            label_fid_dict = dstats.label_files
            logs.info("If all went well, then results are in the following files:")
            for key, value in label_fid_dict.items():
                logs.info("%s: %s" % (key, value))

    if do_all or calculation == "npmi":
        print("\n* Preparing nPMI.")
        dstats.load_or_prepare_npmi()
        npmi_fid_dict = dstats.npmi_files
        print("If all went well, then results are in the following files:")
        for key, value in npmi_fid_dict.items():
            if isinstance(value, dict):
                print(key + ":")
                for key2, value2 in value.items():
                    print("\t%s: %s" % (key2, value2))
            else:
                print("%s: %s" % (key, value))
        print()

    if do_all or calculation == "zipf":
        logs.info("\n* Preparing Zipf.")
        dstats.load_or_prepare_zipf()
        logs.info("Done!")
        zipf_json_fid, zipf_fig_json_fid, zipf_fig_html_fid = zipf.get_zipf_fids(
            dstats.dataset_cache_dir)
        logs.info("Zipf results now available at %s." % zipf_json_fid)
        logs.info(
            "Figure saved to %s, with corresponding json at %s."
            % (zipf_fig_html_fid, zipf_fig_json_fid)
        )

    # Don't do this one until someone specifically asks for it -- it takes a while.
    if calculation == "embeddings":
        logs.info("\n* Preparing text embeddings.")
        dstats.load_or_prepare_embeddings()

    # Don't do this one until someone specifically asks for it -- it takes a while.
    if calculation == "perplexities":
        logs.info("\n* Preparing text perplexities.")
        dstats.load_or_prepare_text_perplexities()
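
For reference, a hypothetical invocation of load_or_prepare from a command-line entry point. The repository's script defines its own argument parsing; the flag names and the dataset_args keys below are illustrative assumptions, not the actual interface:

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Run dataset measurements.")
    # All flag names below are illustrative; the real script defines its own.
    parser.add_argument("--dataset", default="c4")
    parser.add_argument("--config", default="en")
    parser.add_argument("--split", default="train")
    parser.add_argument("--calculation", default=False,
                        help="Single measurement to run, e.g. 'duplicates'; "
                             "omit to run the standard set.")
    parser.add_argument("--use_cache", action="store_true")
    args = parser.parse_args()

    # The keys of dataset_args are assumed to match what
    # DatasetStatisticsCacheClass expects; the real class may take
    # different or additional constructor arguments.
    dataset_args = {
        "dset_name": args.dataset,
        "dset_config": args.config,
        "split_name": args.split,
    }
    load_or_prepare(dataset_args, calculation=args.calculation,
                    use_cache=args.use_cache)

Since each load_or_prepare_* method either loads cached results or computes and writes them, passing use_cache=True lets repeated runs reuse measurements that have already been saved instead of recomputing them.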