def load_and_cache_examples()

in model/mm_dst/gpt2_dst/scripts/run_language_modeling.py [0:0]


def load_and_cache_examples(args, tokenizer, evaluate=False):
    """Load the train or eval text dataset, truncated to whole batches.

    Args:
        args: argparse namespace; reads ``eval_data_file`` /
            ``train_data_file``, ``line_by_line``, ``block_size``,
            ``per_gpu_train_batch_size`` and (if present)
            ``per_gpu_eval_batch_size``.
        tokenizer: tokenizer forwarded to the dataset constructor.
        evaluate: when True, load ``args.eval_data_file``; otherwise
            ``args.train_data_file``.

    Returns:
        A ``LineByLineTextDataset`` or ``TextDataset``, possibly with its
        trailing examples dropped so ``len(dataset)`` is a multiple of the
        per-GPU batch size used for this pass.
    """
    file_path = args.eval_data_file if evaluate else args.train_data_file
    if args.line_by_line:
        dataset = LineByLineTextDataset(
            tokenizer, args, file_path=file_path, block_size=args.block_size
        )
    else:
        dataset = TextDataset(
            tokenizer, args, file_path=file_path, block_size=args.block_size
        )

    # Unknown issues have been reported around not being able to handle incomplete batches (e.g. w/ older CUDA 9.2)
    # Below is a workaround in case you encounter this issue.
    # Alternatively, --no_cuda could avoid this issue too.
    # Comment out the following if you do not encounter this issue or if you are not using any GPU.
    #
    # Fix: truncate by the batch size actually used for this pass. The
    # original always used the train batch size, which does not guarantee
    # complete batches during evaluation. getattr keeps the old behavior
    # if the namespace has no per_gpu_eval_batch_size.
    if evaluate:
        batch_size = getattr(
            args, "per_gpu_eval_batch_size", args.per_gpu_train_batch_size
        )
    else:
        batch_size = args.per_gpu_train_batch_size
    n = len(dataset) % batch_size
    if n != 0:
        print("Truncating from %d examples" % len(dataset.examples))
        dataset.examples = dataset.examples[:-n]
        print("Truncating to %d examples" % len(dataset.examples))
    return dataset