in trainer.py [0:0]
import math

import torch


def full_eval(model, optimizer, scheduler, data, block_size, hidden_size):
    model.eval()
    train_pos = 0
    nb_batches_per_iter_max = math.ceil(data.size(1) / block_size)
    # One hidden-state cache tensor per layer, sized to that layer's
    # attention cache and placed on the same device as the data.
    h_cache = [
        torch.zeros(
            data.size(0),
            layer.attn.attn.get_cache_size(),
            hidden_size).to(data.device)
        for layer in model.module.layers]
    loss_all = 0
    actual_nb_batches_per_iter = 0
    for _ in range(nb_batches_per_iter_max):
        actual_nb_batches_per_iter += 1
        # Input is the current block; the target is the same block
        # shifted forward by one token.
        X = data[:, train_pos: train_pos + block_size].contiguous()
        Y = data[:, train_pos + 1: train_pos + block_size + 1].contiguous()
        loss, h_cache = _train_batch(
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
            X=X, Y=Y,
            h_cache=h_cache,
            eval_only=True,
            batch_split=1)
        loss_all += loss
        train_pos += block_size
        if train_pos >= data.size(1) - block_size:
            # Skip the remaining tokens as they can't make a whole block.
            # The effect on performance should be negligible for large data.
            break
    loss_all = loss_all / actual_nb_batches_per_iter
    return loss_all
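
A minimal call-site sketch, assuming `val_data` is a (batch, sequence_length) LongTensor of token ids already on the target device and that `model`, `optimizer`, and `scheduler` come from the surrounding training setup; the variable names and hyperparameter values below are illustrative, not taken from trainer.py:

    # Hypothetical usage: evaluate on a held-out split after each epoch.
    # block_size and hidden_size must match the model configuration.
    val_loss = full_eval(model, optimizer, scheduler,
                         data=val_data, block_size=256, hidden_size=512)
    print('validation loss: {:.4f}'.format(val_loss))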