in tabular/src/autogluon/tabular/models/tabular_nn/mxnet/tabular_nn_mxnet.py [0:0]
def train_net(self, train_dataset, params, val_dataset=None, initialize=True, setup_trainer=True, time_limit=None, reporter=None):
""" Trains neural net on given train dataset, early stops based on test_dataset.
Args:
train_dataset (TabularNNDataset): training data used to learn network weights
val_dataset (TabularNNDataset): validation data used for early stopping and model selection
initialize (bool): set = False to continue training of a previously trained model, otherwise initializes network weights randomly
setup_trainer (bool): set = False to reuse the same trainer from a previous training run, otherwise creates new trainer from scratch
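params (dict): hyperparameters controlling this fit (e.g. 'num_epochs', 'seed_value', 'loss_function', 'epochs_wo_improve')
time_limit (float): approximate number of seconds allowed for training; training stops early once another epoch is unlikely to fit in the remaining time
reporter: optional callback invoked after each epoch with the current validation performance (used during hyperparameter tuning)

Example (illustrative sketch only; `nn_model`, `train_data`, `val_data`, and `hyperparams` are assumed, pre-constructed objects, not part of this method):
    nn_model.train_net(train_data, params=hyperparams, val_dataset=val_data, time_limit=3600)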
"""
start_time = time.time()
import mxnet as mx
logger.log(15, "Training neural network for up to %s epochs..." % params['num_epochs'])
seed_value = params.get('seed_value', 0)
if seed_value is not None: # Set seeds
random.seed(seed_value)
np.random.seed(seed_value)
mx.random.seed(seed_value)
if initialize: # Initialize the weights of network
logging.debug("initializing neural network...")
self.model.collect_params().initialize(ctx=self.ctx)
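# hybridize() lets MXNet cache a symbolic graph of the network, speeding up subsequent forward/backward passes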
self.model.hybridize()
logging.debug("initialized")
if setup_trainer:
# Also setup mxboard to monitor training if visualizer has been specified:
visualizer = self.params_aux.get('visualizer', 'none')
if visualizer == 'tensorboard' or visualizer == 'mxboard':
try_import_mxboard()
from mxboard import SummaryWriter
self.summary_writer = SummaryWriter(logdir=self.path, flush_secs=5, verbose=False)
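# setup_trainer() builds the optimizer from the given hyperparameters (a gluon Trainer bound to
# self.model's parameters; see the self.optimizer.step() calls in the training loop below)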
self.optimizer = self.setup_trainer(params=params, train_dataset=train_dataset)
best_val_metric = -np.inf # higher = better
val_metric = None
best_val_epoch = 0
val_improve_epoch = 0 # most recent epoch where validation-score strictly improved
num_epochs = params['num_epochs']
if val_dataset is not None:
y_val = val_dataset.get_labels()
else:
y_val = None
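# Choose a default loss if none was specified: L1 loss for regression, softmax cross-entropy otherwise
# (with soft probability labels for SOFTCLASS)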
if params['loss_function'] is None:
if self.problem_type == REGRESSION:
params['loss_function'] = mx.gluon.loss.L1Loss()
elif self.problem_type == SOFTCLASS:
params['loss_function'] = mx.gluon.loss.SoftmaxCrossEntropyLoss(sparse_label=False, from_logits=self.model.from_logits)
else:
params['loss_function'] = mx.gluon.loss.SoftmaxCrossEntropyLoss(from_logits=self.model.from_logits)
loss_func = params['loss_function']
epochs_wo_improve = params['epochs_wo_improve']
loss_scaling_factor = 1.0 # we divide loss by this quantity to stabilize gradients
rescale_losses = {mx.gluon.loss.L1Loss: 'std', mx.gluon.loss.HuberLoss: 'std', mx.gluon.loss.L2Loss: 'var'} # loss classes whose loss should be rescaled; value indicates how (by label std-dev or variance)
loss_torescale = [key for key in rescale_losses if isinstance(loss_func, key)]
if loss_torescale:
loss_torescale = loss_torescale[0]
if rescale_losses[loss_torescale] == 'std':
loss_scaling_factor = np.std(train_dataset.get_labels())/5.0 + EPS # based on std-dev of labels
elif rescale_losses[loss_torescale] == 'var':
loss_scaling_factor = np.var(train_dataset.get_labels())/5.0 + EPS # based on variance of labels
else:
raise ValueError("Unknown loss-rescaling type %s specified for loss_func==%s" % (rescale_losses[loss_torescale], loss_func))
if self.verbosity <= 1:
verbose_eval = -1 # Print losses every verbose_eval epochs; never if -1
elif self.verbosity == 2:
verbose_eval = 50
elif self.verbosity == 3:
verbose_eval = 10
else:
verbose_eval = 1
net_filename = self.path + self.temp_file_name
if num_epochs == 0: # use dummy training loop that stops immediately (useful for using NN just for data preprocessing / debugging)
logger.log(20, "Not training Neural Net since num_epochs == 0. Neural network architecture is:")
for batch_idx, data_batch in enumerate(train_dataset.dataloader):
data_batch = train_dataset.format_batch_data(data_batch, self.ctx)
with mx.autograd.record():
output = self.model(data_batch)
labels = data_batch['label']
loss = loss_func(output, labels) / loss_scaling_factor
# print(str(mx.nd.mean(loss).asscalar()), end="\r") # prints per-batch losses
loss.backward()
self.optimizer.step(labels.shape[0])
if batch_idx > 0:
break
self.model.save_parameters(net_filename)
logger.log(15, "untrained Neural Net saved to file")
return
start_fit_time = time.time()
if time_limit is not None:
time_limit = time_limit - (start_fit_time - start_time)
# Training Loop:
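# Each epoch: one full pass over train_dataset.dataloader (forward, scaled loss, backward, optimizer step),
# followed by optional validation scoring/checkpointing, logging, the reporter callback, and early-stopping / time-limit checks.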
for e in range(num_epochs):
if e == 0: # special actions during first epoch:
logger.log(15, "Neural network architecture:")
logger.log(15, str(self.model))
cumulative_loss = 0
for batch_idx, data_batch in enumerate(train_dataset.dataloader):
data_batch = train_dataset.format_batch_data(data_batch, self.ctx)
with mx.autograd.record():
output = self.model(data_batch)
labels = data_batch['label']
loss = loss_func(output, labels) / loss_scaling_factor
# print(str(mx.nd.mean(loss).asscalar()), end="\r") # prints per-batch losses
loss.backward()
self.optimizer.step(labels.shape[0])
cumulative_loss += loss.sum()
train_loss = cumulative_loss/float(train_dataset.num_examples) # average (scaled) training loss per example this epoch
if val_dataset is not None:
# FIXME: Switch to adaptive ES
val_metric = self.score(X=val_dataset, y=y_val, metric=self.stopping_metric)
if np.isnan(val_metric):
if e == 0:
raise RuntimeError("NaNs encountered in TabularNeuralNetModel training. Features/labels may be improperly formatted or NN weights may have diverged.")
else:
logger.warning("Warning: NaNs encountered in TabularNeuralNetModel training. Reverting model to last checkpoint without NaNs.")
break
if (val_metric >= best_val_metric) or (e == 0):
if val_metric > best_val_metric:
val_improve_epoch = e
best_val_metric = val_metric
best_val_epoch = e
# Until functionality is added to restart training from a particular epoch, there is no point in saving params without val_dataset
self.model.save_parameters(net_filename)
else:
best_val_epoch = e
if val_dataset is not None:
if verbose_eval > 0 and e % verbose_eval == 0:
logger.log(15, "Epoch %s. Train loss: %s, Val %s: %s" %
(e, train_loss.asscalar(), self.stopping_metric.name, val_metric))
if self.summary_writer is not None:
self.summary_writer.add_scalar(tag='val_'+self.stopping_metric.name,
value=val_metric, global_step=e)
else:
if verbose_eval > 0 and e % verbose_eval == 0:
logger.log(15, "Epoch %s. Train loss: %s" % (e, train_loss.asscalar()))
if self.summary_writer is not None:
self.summary_writer.add_scalar(tag='train_loss', value=train_loss.asscalar(), global_step=e) # TODO: do we want to keep mxboard support?
if reporter is not None:
# TODO: Ensure reporter/scheduler properly handle None/nan values after refactor
if val_dataset is not None and (not np.isnan(val_metric)): # TODO: This might work without the if statement
# epoch must be number of epochs done (starting at 1)
reporter(epoch=e + 1,
validation_performance=val_metric, # Higher val_metric = better
train_loss=float(train_loss.asscalar()),
eval_metric=self.eval_metric.name,
greater_is_better=self.eval_metric.greater_is_better)
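# Early-stopping check below. Example: with epochs_wo_improve=20 and the last strict improvement at epoch 10,
# training breaks at epoch 31 (since 31 - 10 > 20), even if num_epochs is larger.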
if e - val_improve_epoch > epochs_wo_improve:
break # early stop if the validation score has not strictly improved for more than epochs_wo_improve consecutive epochs
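# Time-budget check below. Example: with time_limit=600s and 10 completed epochs taking 550s (55s/epoch on average),
# only 50s remain (< 55s), so training stops rather than risk exceeding the budget mid-epoch.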
if time_limit is not None:
time_elapsed = time.time() - start_fit_time
time_epoch_average = time_elapsed / (e+1)
time_left = time_limit - time_elapsed
if time_left < time_epoch_average:
logger.log(20, f"\tRan out of time, stopping training early. (Stopping on epoch {e})")
break
if val_dataset is not None:
self.model.load_parameters(net_filename) # Revert back to best model
try:
os.remove(net_filename)
except FileNotFoundError:
pass
if val_dataset is None:
logger.log(15, "Best model found in epoch %d" % best_val_epoch)
else: # evaluate one final time:
final_val_metric = self.score(X=val_dataset, y=y_val, metric=self.stopping_metric)
if np.isnan(final_val_metric):
final_val_metric = -np.inf
logger.log(15, "Best model found in epoch %d. Val %s: %s" %
(best_val_epoch, self.stopping_metric.name, final_val_metric))
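# Epoch indices are 0-based, so best_val_epoch + 1 is the number of epochs that produced the best model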
self.params_trained['num_epochs'] = best_val_epoch + 1
return