# ssiog/training.py
def _log_workload_config(args):
    """Log the workload configuration values that matter for later analysis."""
    logger.info("Logging important workload configurations.")
    logger.info(f"Total epochs: {args.epochs}")
    logger.info(f"Sample size (bytes): {args.sample_size}")
    logger.info(f"Batch size: {args.batch_size}")
    logger.info(f"Steps: {args.steps}")
    logger.info(f"Read order: {args.read_order[0]}")
    logger.info(f"Background queue max size: {args.background_queue_maxsize}")
    logger.info(f"Background threads: {args.background_threads}")
    logger.info(f"Group member id: {args.group_member_id}")
    logger.info(f"Group size: {args.group_size}")
    logger.info(f"Label: {args.label}")
    logger.info(f"Data set path: {args.prefix}.\n")


def training():
    """Run the SSIOG training workload.

    Parses CLI arguments, sets up logging and (optional) metrics export,
    joins the distributed process group, then runs the configured number
    of epochs over the configured object sources.
    """
    # Parse arguments
    args = arguments.parse_args()

    # Initialize the global application logger.
    logger.info("Setting up logger.")
    setup_logger(args)
    logger.debug(f"Running with args: {args}")

    # Initialize the OpenTelemetry MeterProvider (opt-in).
    if args.export_metrics:
        logger.info("Setting up otlp metrics exporter.")
        setup_metrics_exporter(args)

    # Initialize the file-based metrics logger (opt-in).
    if args.log_metrics:
        logger.info(f"Logging metrics to: {args.metrics_file}")
        setup_metrics_logger(args)

    logger.info("Initial setup completed.\n")

    # Join the distributed process group over TCP using the "gloo" backend.
    logger.info(f"Starting process: {args.group_member_id}/{args.group_size}")
    td.init_process_group(
        "gloo",
        init_method=f"tcp://{args.group_coordinator_address}:{args.group_coordinator_port}",
        rank=args.group_member_id,
        world_size=args.group_size,
    )
    try:
        logger.info(f"Process started successfully: {args.group_member_id}/{args.group_size}\n")

        _log_workload_config(args)

        sources = configure_object_sources(args)
        for epoch in range(args.epochs):
            logger.info(f"******** Starting epoch: {epoch} ********.")
            logger.info(f"Configure epoch: {epoch}.")
            (reader, read_order, filesystem_name, filesystem, epoch_objects) = (
                configure_epoch(sources, args)
            )
            logger.info(f"Configured, total objects: {len(epoch_objects)}")

            logger.info("Configuring samples.")
            samples = configure_samples(epoch_objects, filesystem, args)
            logger.info(f"Configured, total selected samples: {len(samples)}")

            logger.info(f"Running epoch: {epoch}")
            for summary in Epoch(reader, epoch_objects, filesystem, samples, args):
                logger.info(f"Epoch: {epoch}, {summary}")
            logger.info(f"Epoch {epoch} completed.\n")

            # Clear the kernel page cache so the next epoch measures real I/O,
            # not cached reads.
            if args.clear_pagecache_after_epoch:
                util.clear_kernel_cache(logger)
    finally:
        # Fix: the original never tore down the process group, which can
        # leave peer processes blocked at exit. Always release it.
        td.destroy_process_group()