# sagemaker/02_getting_started_tensorflow/scripts/train.py
import argparse
import logging
import os
import sys
import tensorflow as tf
from datasets import load_dataset
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, DataCollatorWithPadding, create_optimizer
if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # Hyperparameters sent by the client are passed as command-line arguments to the script.
    parser.add_argument("--epochs", type=int, default=3)
    parser.add_argument("--train_batch_size", type=int, default=16)
    parser.add_argument("--eval_batch_size", type=int, default=8)
    parser.add_argument("--model_id", type=str)
    parser.add_argument("--learning_rate", type=float, default=3e-5)

    # Data, model, and output directories provided by the SageMaker training environment
    parser.add_argument("--output_data_dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"])
    parser.add_argument("--model_dir", type=str, default=os.environ["SM_MODEL_DIR"])
    parser.add_argument("--n_gpus", type=str, default=os.environ["SM_NUM_GPUS"])

    args, _ = parser.parse_known_args()
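
    # For reference only: when SageMaker starts the training job, the hyperparameters above
    # arrive as CLI flags, roughly like the illustrative invocation below. The model id shown
    # here is an assumption for the example, not a value fixed by this script.
    #
    #   python train.py --epochs 3 --train_batch_size 16 --eval_batch_size 8 \
    #       --model_id distilbert-base-uncased --learning_rate 3e-5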
    # Set up logging
    logger = logging.getLogger(__name__)
    logging.basicConfig(
        level=logging.getLevelName("INFO"),
        handlers=[logging.StreamHandler(sys.stdout)],
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(args.model_id)

    # Load DatasetDict
    dataset = load_dataset("imdb")

    # Preprocess the dataset: tokenize the raw text (padding is deferred to the data collator)
    def preprocess_function(examples):
        return tokenizer(examples["text"], truncation=True)

    encoded_dataset = dataset.map(preprocess_function, batched=True)

    # tokenizer_columns is the list of keys from the dataset that get passed to the TensorFlow model
    tokenizer_columns = ["attention_mask", "input_ids"]

    # Convert to tf.data.Dataset objects; the collator pads each batch dynamically
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

    encoded_dataset["train"] = encoded_dataset["train"].rename_column("label", "labels")
    tf_train_dataset = encoded_dataset["train"].to_tf_dataset(
        columns=tokenizer_columns,
        label_cols=["labels"],
        shuffle=True,
        batch_size=args.train_batch_size,
        collate_fn=data_collator,
    )

    encoded_dataset["test"] = encoded_dataset["test"].rename_column("label", "labels")
    tf_validation_dataset = encoded_dataset["test"].to_tf_dataset(
        columns=tokenizer_columns,
        label_cols=["labels"],
        shuffle=False,
        batch_size=args.eval_batch_size,
        collate_fn=data_collator,
    )
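
    # Optional sanity check, left commented out so it does not affect the training job:
    # peeking at a single batch confirms that batches arrive as (features, labels) tuples
    # with dynamically padded tensors. Shapes shown would depend on the batch contents.
    #
    # for features, batch_labels in tf_train_dataset.take(1):
    #     logger.info("input_ids: %s, labels: %s", features["input_ids"].shape, batch_labels.shape)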
    # Prepare model labels - useful in inference API
    labels = encoded_dataset["train"].features["labels"].names
    num_labels = len(labels)
    label2id, id2label = dict(), dict()
    for i, label in enumerate(labels):
        label2id[label] = str(i)
        id2label[str(i)] = label

    # Download model from model hub
    model = TFAutoModelForSequenceClassification.from_pretrained(
        args.model_id, num_labels=num_labels, label2id=label2id, id2label=id2label
    )

    # Create Adam optimizer with learning rate scheduling
    batches_per_epoch = len(encoded_dataset["train"]) // args.train_batch_size
    total_train_steps = int(batches_per_epoch * args.epochs)
    optimizer, _ = create_optimizer(init_lr=args.learning_rate, num_warmup_steps=0, num_train_steps=total_train_steps)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    # Define metric and compile model
    metrics = [tf.keras.metrics.SparseCategoricalAccuracy()]
    model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

    # Training
    logger.info("*** Train ***")
    train_results = model.fit(
        tf_train_dataset,
        epochs=args.epochs,
        validation_data=tf_validation_dataset,
    )

    # Write training metrics to the output data directory
    output_eval_file = os.path.join(args.output_data_dir, "train_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Train results *****")
        logger.info(train_results)
        for key, value in train_results.history.items():
            logger.info(" %s = %s", key, value)
            writer.write("%s = %s\n" % (key, value))

    # Save the trained model and tokenizer; SageMaker packages SM_MODEL_DIR into model.tar.gz
    model.save_pretrained(args.model_dir)
    tokenizer.save_pretrained(args.model_dir)
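
# ------------------------------------------------------------------------------
# For reference only: a minimal sketch of how this script is typically launched
# from a notebook with the SageMaker Hugging Face estimator. The role, instance
# type, and framework versions below are illustrative assumptions, not values
# defined by this script; the hyperparameters map onto the argparse flags above.
#
#   from sagemaker.huggingface import HuggingFace
#
#   huggingface_estimator = HuggingFace(
#       entry_point="train.py",
#       source_dir="./scripts",
#       instance_type="ml.p3.2xlarge",   # assumed instance type
#       instance_count=1,
#       role=role,                       # an existing SageMaker execution role (assumed)
#       transformers_version="4.26",     # assumed framework/container versions
#       tensorflow_version="2.11",
#       py_version="py39",
#       hyperparameters={
#           "epochs": 1,
#           "train_batch_size": 16,
#           "model_id": "distilbert-base-uncased",
#       },
#   )
#   huggingface_estimator.fit()
# ------------------------------------------------------------------------------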