in grade_school_math/train.py [0:0]
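import torch as th
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer, get_scheduler

# The snippet omits its imports; get_examples and GSMDataset are assumed to come
# from the repo's local dataset helpers, and AdamW is taken from torch.optim here.
from dataset import get_examples, GSMDataset
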
def main():
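    # Tokenizer and training data: wrap the grade-school-math train split in a Dataset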
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    train_examples = get_examples("train")
    train_dset = GSMDataset(tokenizer, train_examples)
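
    # Pretrained GPT-2 with an LM head, moved to the GPU and set to training mode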
    device = th.device("cuda")
    config = GPT2Config.from_pretrained("gpt2")
    model = GPT2LMHeadModel.from_pretrained("gpt2", config=config)
    model.to(device)
    model.train()
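
    # Shuffled mini-batches of 16 examples; AdamW with a small fine-tuning learning rate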
    train_loader = DataLoader(train_dset, batch_size=16, shuffle=True)
    optim = AdamW(model.parameters(), lr=1e-5)
    num_epochs = 20
    num_training_steps = num_epochs * len(train_loader)
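
    # Linear learning-rate decay over all training steps, with no warmup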
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optim,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )
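
    # Progress bar ticks once per optimizer step across all epochs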
    pbar = tqdm(range(num_training_steps))
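    # Plain causal-LM fine-tuning: the labels are the input ids themselves,
    # so the loss is next-token prediction over the whole encoded sequence.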
    for epoch in range(num_epochs):
        for batch in train_loader:
            optim.zero_grad()
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch, labels=batch["input_ids"])
            loss = outputs.loss
            loss.backward()
            optim.step()
            lr_scheduler.step()
            pbar.update(1)
            pbar.set_description(f"train_loss: {loss.item():.5f}")
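
    # Write the fine-tuned weights to disk in Hugging Face format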
    model.save_pretrained("model_ckpts/")
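

if __name__ == "__main__":
    main()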