in vissl/hooks/tensorboard_hook.py [0:0]
def on_update(self, task: "tasks.ClassyTask") -> None:
    """
    Called after every parameters update if tensorboard hook is enabled.

    Logs the parameter gradients if they are configured to be logged, and
    logs scalar metrics: training loss, learning rate, average training
    iteration time, img/sec/gpu, ETA, and GPU memory usage (current and
    peak). Only the primary worker writes to tensorboard.

    Args:
        task: the training task whose state (iteration, loss, optimizer,
            batch timings and config) is read for logging.
    """
    if not is_primary():
        # Only the primary (rank 0) worker logs to tensorboard.
        return

    iteration = task.iteration

    # Parameter gradient histograms, at the configured interval.
    if (
        self.log_params_every_n_iterations > 0
        and self.log_params_gradients
        and task.train
        and iteration % self.log_params_every_n_iterations == 0
    ):
        logging.info(f"Logging Parameter gradients. Iteration {iteration}")
        for name, parameter in task.base_model.named_parameters():
            if parameter.grad is not None:
                try:
                    self.tb_writer.add_histogram(
                        f"Gradients/{name}",
                        parameter.grad,
                        global_step=iteration,
                    )
                except ValueError:
                    # add_histogram raises ValueError for empty/degenerate
                    # gradient data; log and keep going.
                    logging.info(
                        f"Gradient histogram empty for {name}, "
                        f"iteration {iteration}. Unable to "
                        f"log gradient."
                    )

    # Scalar metrics: every LOG_FREQUENCY iterations, plus every 5
    # iterations during the first 100 for early feedback.
    if iteration % task.config["LOG_FREQUENCY"] == 0 or (
        iteration <= 100 and iteration % 5 == 0
    ):
        logging.info(f"Logging metrics. Iteration {iteration}")
        self.tb_writer.add_scalar(
            tag="Training/Loss",
            scalar_value=round(task.last_batch.loss.data.cpu().item(), 5),
            global_step=iteration,
        )
        self.tb_writer.add_scalar(
            tag="Training/Learning_rate",
            scalar_value=round(task.optimizer.options_view.lr, 5),
            global_step=iteration,
        )

        # Average batch processing time; fall back to [0] when no batch
        # has been timed yet so the average is always well-defined.
        batch_times = task.batch_time if len(task.batch_time) > 0 else [0]
        batch_time_avg_s = sum(batch_times) / len(batch_times)
        self.tb_writer.add_scalar(
            tag="Speed/Batch_processing_time_ms",
            scalar_value=int(1000.0 * batch_time_avg_s),
            global_step=iteration,
        )

        # Images per second per replica (0 when timing is unavailable).
        pic_per_batch_per_gpu = task.config["DATA"]["TRAIN"][
            "BATCHSIZE_PER_REPLICA"
        ]
        pic_per_batch_per_gpu_per_sec = (
            int(pic_per_batch_per_gpu / batch_time_avg_s)
            if batch_time_avg_s > 0
            else 0.0
        )
        self.tb_writer.add_scalar(
            tag="Speed/img_per_sec_per_gpu",
            scalar_value=pic_per_batch_per_gpu_per_sec,
            global_step=iteration,
        )

        # ETA from the average batch time and the remaining iterations.
        eta_secs = batch_time_avg_s * (task.max_iteration - iteration)
        self.tb_writer.add_scalar(
            tag="Speed/ETA_hours",
            scalar_value=eta_secs / 3600.0,
            global_step=iteration,
        )

        # GPU memory statistics, converted from bytes to MiB.
        if torch.cuda.is_available():
            # Peak memory actually allocated to tensors
            self.tb_writer.add_scalar(
                tag="Memory/Peak_GPU_Memory_allocated_MiB",
                scalar_value=torch.cuda.max_memory_allocated() / BYTE_TO_MiB,
                global_step=iteration,
            )
            # Peak memory reserved by PyTorch's caching allocator
            self.tb_writer.add_scalar(
                tag="Memory/Peak_GPU_Memory_reserved_MiB",
                scalar_value=torch.cuda.max_memory_reserved() / BYTE_TO_MiB,
                global_step=iteration,
            )
            # Memory currently reserved by the caching allocator
            self.tb_writer.add_scalar(
                tag="Memory/Current_GPU_Memory_reserved_MiB",
                scalar_value=torch.cuda.memory_reserved() / BYTE_TO_MiB,
                global_step=iteration,
            )