in adanet/core/iteration.py
def _create_hooks(self, base_global_step, subnetwork_specs, candidates,
num_subnetworks, rebuilding, train_manager_dir, is_chief):
"""Returns the hooks to monitor and train this iteration.
Args:
base_global_step: Integer global step at the beginning of this iteration.
subnetwork_specs: List of `_SubnetworkSpec` instances.
candidates: List of `_Candidate` instances to compare.
num_subnetworks: Integer number of subnetwork builders generated for the
current iteration.
rebuilding: Boolean whether the iteration is being rebuilt only to restore
the previous best subnetworks and ensembles.
train_manager_dir: Directory for the TrainManager to store spec metadata.
is_chief: Whether the current worker is chief.
Returns:
A 3-tuple of a _TrainManager for monitoring training, a list of
`SessionRunHooks` to run on chief, and a list of `SessionRunHooks` to run
on all workers.
"""
training_chief_hooks, training_hooks = [], []
ensemble_specs = [c.ensemble_spec for c in candidates]
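# The _TrainManager tracks which subnetwork and ensemble specs are still
# training, persisting per-spec state under train_manager_dir; is_chief
# presumably gates the filesystem writes.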
train_manager = _TrainManager(subnetwork_specs, ensemble_specs,
train_manager_dir, is_chief)
if not self._use_tpu:
# On TPU, the global step gets incremented in an op rather than a hook,
# since TPU training doesn't have the per-run hook granularity of CPU and
# GPU training.
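# _GlobalStepSetterHook keeps the TensorFlow global step in sync with the
# per-subnetwork steps by combining them with self._global_step_combiner_fn
# (an average by default).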
training_chief_hooks.append(
_GlobalStepSetterHook(train_manager, subnetwork_specs,
base_global_step,
self._global_step_combiner_fn))
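# The placement strategy decides whether this worker trains subnetworks at
# all; e.g. adanet's RoundRobinStrategy assigns subnetworks to only a subset
# of workers.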
should_train_subnetworks = (
self._placement_strategy.should_train_subnetworks(num_subnetworks))
for spec in subnetwork_specs:
if not self._use_tpu:
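# _NanLossHook likely stops training just this spec when its loss becomes
# NaN, rather than raising the way tf.train.NanTensorHook would.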
training_hooks.append(_NanLossHook(train_manager, spec))
# On TPU, we increment the step along with the global step as part of the
# train op, whereas on CPU and GPU we use hooks for fine-grained control.
if self._use_tpu or not should_train_subnetworks or spec.train_op is None:
increment_step_op = None
else:
with tf.control_dependencies([spec.train_op.train_op]):
increment_step_op = spec.step.assign_add(1)
# TPU also supports training specs for unequal numbers of steps, but only
# at the granularity of num_iterations_per_loop.
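# _TrainingLimitHook stops training the spec once its step reaches
# self._max_steps, running increment_step_op after each step when one is
# provided.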
training_hooks.append(
_TrainingLimitHook(
train_manager,
spec,
self._max_steps,
increment_step_op=increment_step_op))
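# Workers that the placement strategy excludes from subnetwork training
# skip the remaining per-spec hooks, unless the iteration is only being
# rebuilt to restore previous results.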
if not should_train_subnetworks and not rebuilding:
continue
self._add_hooks(spec, train_manager, training_chief_hooks, training_hooks)
for spec in ensemble_specs:
if not self._use_tpu:
training_hooks.append(_NanLossHook(train_manager, spec))
# See above comment about incrementing the step on CPU vs. TPU.
if self._use_tpu or spec.train_op is None:
increment_step_op = None
else:
with tf.control_dependencies([spec.train_op.train_op]):
increment_step_op = spec.step.assign_add(1)
training_hooks.append(
_TrainingLimitHook(
train_manager,
spec,
self._max_steps,
increment_step_op=increment_step_op))
self._add_hooks(spec, train_manager, training_chief_hooks, training_hooks)
return train_manager, training_chief_hooks, training_hooks
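
Below is a minimal, self-contained sketch of the hook pattern the CPU/GPU
path relies on: a `SessionRunHook` that piggybacks a step-increment op onto
every `session.run` call and requests a stop once the counter hits a limit,
loosely analogous to `_TrainingLimitHook` above. `ToyLimitHook` and the toy
graph are hypothetical illustrations, not adanet API; only the
`tf.compat.v1.train` calls are standard TensorFlow.

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()


class ToyLimitHook(tf.train.SessionRunHook):
  """Requests a stop once the step counter reaches max_steps."""

  def __init__(self, increment_step_op, max_steps):
    self._increment_step_op = increment_step_op
    self._max_steps = max_steps

  def before_run(self, run_context):
    # Piggyback the increment op onto every session.run; its fetched result
    # is the new value of the step counter.
    return tf.train.SessionRunArgs(self._increment_step_op)

  def after_run(self, run_context, run_values):
    if run_values.results >= self._max_steps:
      run_context.request_stop()


step = tf.Variable(0, dtype=tf.int64, name="step")
train_op = tf.no_op()  # Stand-in for a real training op.
with tf.control_dependencies([train_op]):
  # Only count steps on which the train op actually ran.
  increment_step_op = step.assign_add(1)

with tf.train.MonitoredTrainingSession(
    hooks=[ToyLimitHook(increment_step_op, max_steps=5)]) as sess:
  while not sess.should_stop():
    sess.run(train_op)  # The hook stops the loop after 5 steps.

Chaining increment_step_op to train_op with tf.control_dependencies mirrors
the pattern in _create_hooks: the per-spec counter only advances when a real
training step ran.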