`training()` — defined in ssiog/training.py


def training():
    """Entry point for the SSIOG training workload.

    Parses CLI arguments, configures logging and (optionally) metrics
    export, joins the gloo process group, then runs the configured number
    of epochs, logging a summary for each step. The process group is torn
    down on exit, even if an epoch raises.
    """
    # Parse arguments
    args = arguments.parse_args()

    # Initialize the global application logger.
    logger.info("Setting up logger.")
    setup_logger(args)

    logger.debug(f"Running with args: {args}")

    # Initialize the OpenTelemetry MeterProvider
    if args.export_metrics:
        logger.info("Setting up otlp metrics exporter.")
        setup_metrics_exporter(args)

    # Initialize the metrics logger.
    if args.log_metrics:
        logger.info(f"Logging metrics to: {args.metrics_file}")
        setup_metrics_logger(args)

    logger.info("Initial setup completed.\n")

    # Join the distributed process group; every member blocks here until
    # all group_size ranks have connected to the coordinator.
    logger.info(f"Starting process: {args.group_member_id}/{args.group_size}")
    td.init_process_group(
        "gloo",
        init_method=f"tcp://{args.group_coordinator_address}:{args.group_coordinator_port}",
        rank=args.group_member_id,
        world_size=args.group_size,
    )
    logger.info(f"Process started successfully: {args.group_member_id}/{args.group_size}\n")

    _log_workload_config(args)
    sources = configure_object_sources(args)

    try:
        for epoch in range(args.epochs):
            _run_epoch(epoch, sources, args)
    finally:
        # Release the gloo process group cleanly; the original code leaked
        # it on both normal and exceptional exit.
        td.destroy_process_group()


def _log_workload_config(args):
    """Log the key workload configuration values for this run."""
    logger.info("Logging important workload configurations.")
    logger.info(f"Total epochs: {args.epochs}")
    logger.info(f"Sample size (bytes): {args.sample_size}")
    logger.info(f"Batch size: {args.batch_size}")
    logger.info(f"Steps: {args.steps}")
    logger.info(f"Read order: {args.read_order[0]}")
    logger.info(f"Background queue max size: {args.background_queue_maxsize}")
    logger.info(f"Background threads: {args.background_threads}")
    logger.info(f"Group member id: {args.group_member_id}")
    logger.info(f"Group size: {args.group_size}")
    logger.info(f"Label: {args.label}")
    logger.info(f"Data set path: {args.prefix}.\n")


def _run_epoch(epoch, sources, args):
    """Configure and execute a single epoch, logging each step summary."""
    logger.info(f"******** Starting epoch: {epoch} ********.")
    logger.info(f"Configure epoch: {epoch}.")
    # Only reader, filesystem and epoch_objects are used below; the other
    # unpacked values are ignored.
    reader, _read_order, _filesystem_name, filesystem, epoch_objects = (
        configure_epoch(sources, args)
    )
    logger.info(f"Configured, total objects: {len(epoch_objects)}")

    logger.info("Configuring samples.")
    samples = configure_samples(epoch_objects, filesystem, args)
    logger.info(f"Configured, total selected samples: {len(samples)}")

    logger.info(f"Running epoch: {epoch}")
    for summary in Epoch(reader, epoch_objects, filesystem, samples, args):
        logger.info(f"Epoch: {epoch}, {summary}")

    logger.info(f"Epoch {epoch} completed.\n")

    # Clear the kernel page cache so the next epoch reads cold data.
    if args.clear_pagecache_after_epoch:
        util.clear_kernel_cache(logger)