in model/mm_dst/gpt2_dst/scripts/run_language_modeling.py [0:0]
def load_and_cache_examples(args, tokenizer, evaluate=False):
    """Build the text dataset for training or evaluation.

    Args:
        args: Parsed command-line namespace. Reads ``eval_data_file`` /
            ``train_data_file``, ``line_by_line``, ``block_size`` and the
            per-GPU batch size used by the truncation workaround below.
        tokenizer: Tokenizer forwarded to the dataset constructor.
        evaluate: If True, load ``args.eval_data_file``; otherwise load
            ``args.train_data_file``.

    Returns:
        A ``LineByLineTextDataset`` or ``TextDataset`` whose ``examples``
        may be truncated so its length is a multiple of the batch size.
    """
    file_path = args.eval_data_file if evaluate else args.train_data_file
    if args.line_by_line:
        dataset = LineByLineTextDataset(
            tokenizer, args, file_path=file_path, block_size=args.block_size
        )
    else:
        dataset = TextDataset(
            tokenizer, args, file_path=file_path, block_size=args.block_size
        )

    # Unknown issues have been reported around not being able to handle
    # incomplete batches (e.g. with older CUDA 9.2). Below is a workaround in
    # case you encounter this issue; alternatively, running with --no_cuda
    # avoids it too. Comment out the following if you do not encounter this
    # issue or if you are not using any GPU.
    #
    # Fix: truncate by the batch size that will actually be used for this
    # dataset — the original always used the *train* batch size, which
    # truncated the eval set incorrectly. getattr guards against an args
    # namespace that lacks per_gpu_eval_batch_size.
    if evaluate:
        batch_size = getattr(
            args, "per_gpu_eval_batch_size", args.per_gpu_train_batch_size
        )
    else:
        batch_size = args.per_gpu_train_batch_size

    remainder = len(dataset) % batch_size
    if remainder != 0:
        print("Truncating from %d examples" % len(dataset.examples))
        # Drop the trailing partial batch so every batch is full.
        dataset.examples = dataset.examples[:-remainder]
        print("Truncating to %d examples" % len(dataset.examples))
    return dataset