scripts/train_detection.py [435:504]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    }

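    # Resuming: restore the epoch counter, best score, global step and the
    # running meters from the previously loaded snapshot.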
    if args.resume:
        starting_epoch = snapshot["training_meta"]["epoch"] + 1
        best_score = snapshot["training_meta"]["best_score"]
        global_step = snapshot["training_meta"]["global_step"]
        for name, meter in meters.items():
            meter.load_state_dict(snapshot["state_dict"][name + "_meter"])
        del snapshot
    else:
        starting_epoch = 0
        best_score = 0
        global_step = 0

    # Optional: run evaluation only, then exit.
    if args.eval:
        log_info("Validating epoch %d", starting_epoch - 1)
        validate(model, val_dataloader, config["optimizer"].getstruct("loss_weights"),
                 device=device, summary=summary, global_step=global_step,
                 epoch=starting_epoch - 1, num_epochs=total_epochs,
                 log_interval=config["general"].getint("log_interval"),
                 coco_gt=config["dataloader"]["coco_gt"], log_dir=args.log_dir)
        exit(0)

    for epoch in range(starting_epoch, total_epochs):
        log_info("Starting epoch %d", epoch + 1)
        if not batch_update:
            scheduler.step(epoch)

        # Run training epoch
        global_step = train(model, optimizer, scheduler, train_dataloader, meters,
                            batch_update=batch_update, epoch=epoch, summary=summary, device=device,
                            log_interval=config["general"].getint("log_interval"), num_epochs=total_epochs,
                            global_step=global_step, loss_weights=config["optimizer"].getstruct("loss_weights"))

        # Save snapshot (only on rank 0)
        if rank == 0:
            snapshot_file = path.join(args.log_dir, "model_last.pth.tar")
            log_debug("Saving snapshot to %s", snapshot_file)
            meters_out_dict = {k + "_meter": v.state_dict() for k, v in meters.items()}
            save_snapshot(snapshot_file, config, epoch, 0, best_score, global_step,
                          body=model.module.body.state_dict(),
                          rpn_head=model.module.rpn_head.state_dict(),
                          roi_head=model.module.roi_head.state_dict(),
                          optimizer=optimizer.state_dict(),
                          **meters_out_dict)

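        # Validate every val_interval epochs.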
        if (epoch + 1) % config["general"].getint("val_interval") == 0:
            log_info("Validating epoch %d", epoch + 1)
            score = validate(model, val_dataloader, config["optimizer"].getstruct("loss_weights"),
                             device=device, summary=summary, global_step=global_step,
                             epoch=epoch, num_epochs=total_epochs,
                             log_interval=config["general"].getint("log_interval"),
                             coco_gt=config["dataloader"]["coco_gt"], log_dir=args.log_dir)

            # Update the score on the last saved snapshot
            if rank == 0:
                snapshot = torch.load(snapshot_file, map_location="cpu")
                snapshot["training_meta"]["last_score"] = score
                torch.save(snapshot, snapshot_file)
                del snapshot

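            # Keep a copy of the best-scoring snapshot so far.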
            if score > best_score:
                best_score = score
                if rank == 0:
                    shutil.copy(snapshot_file, path.join(args.log_dir, "model_best.pth.tar"))


if __name__ == "__main__":
    main(parser.parse_args())
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
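
A note on the snapshot layout: save_snapshot itself is outside this excerpt, but the keys read back by the resume block ("training_meta" / "state_dict") and the call in the training loop imply roughly the structure below. This is a minimal sketch of the assumed behaviour, not the actual implementation; in particular, how config is stored may differ.

import torch

def save_snapshot(file_path, config, epoch, last_score, best_score, global_step, **state_dicts):
    # Assumed layout: bookkeeping under "training_meta", module/optimizer/meter
    # states under "state_dict" -- the same keys the resume block reads back.
    snapshot = {
        "config": config,
        "training_meta": {
            "epoch": epoch,
            "last_score": last_score,
            "best_score": best_score,
            "global_step": global_step,
        },
        # body, rpn_head, roi_head, optimizer and the "*_meter" entries all
        # arrive here via keyword arguments.
        "state_dict": dict(state_dicts),
    }
    torch.save(snapshot, file_path)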



scripts/train_instance_seg.py [444:513]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    }

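    # Resuming: restore the epoch counter, best score, global step and the
    # running meters from the previously loaded snapshot.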
    if args.resume:
        starting_epoch = snapshot["training_meta"]["epoch"] + 1
        best_score = snapshot["training_meta"]["best_score"]
        global_step = snapshot["training_meta"]["global_step"]
        for name, meter in meters.items():
            meter.load_state_dict(snapshot["state_dict"][name + "_meter"])
        del snapshot
    else:
        starting_epoch = 0
        best_score = 0
        global_step = 0

    # Optional: run evaluation only, then exit.
    if args.eval:
        log_info("Validating epoch %d", starting_epoch - 1)
        validate(model, val_dataloader, config["optimizer"].getstruct("loss_weights"),
                 device=device, summary=summary, global_step=global_step,
                 epoch=starting_epoch - 1, num_epochs=total_epochs,
                 log_interval=config["general"].getint("log_interval"),
                 coco_gt=config["dataloader"]["coco_gt"], log_dir=args.log_dir)
        exit(0)

    for epoch in range(starting_epoch, total_epochs):
        log_info("Starting epoch %d", epoch + 1)
        if not batch_update:
            scheduler.step(epoch)

        # Run training epoch
        global_step = train(model, optimizer, scheduler, train_dataloader, meters,
                            batch_update=batch_update, epoch=epoch, summary=summary, device=device,
                            log_interval=config["general"].getint("log_interval"), num_epochs=total_epochs,
                            global_step=global_step, loss_weights=config["optimizer"].getstruct("loss_weights"))

        # Save snapshot (only on rank 0)
        if rank == 0:
            snapshot_file = path.join(args.log_dir, "model_last.pth.tar")
            log_debug("Saving snapshot to %s", snapshot_file)
            meters_out_dict = {k + "_meter": v.state_dict() for k, v in meters.items()}
            save_snapshot(snapshot_file, config, epoch, 0, best_score, global_step,
                          body=model.module.body.state_dict(),
                          rpn_head=model.module.rpn_head.state_dict(),
                          roi_head=model.module.roi_head.state_dict(),
                          optimizer=optimizer.state_dict(),
                          **meters_out_dict)

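        # Validate every val_interval epochs.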
        if (epoch + 1) % config["general"].getint("val_interval") == 0:
            log_info("Validating epoch %d", epoch + 1)
            score = validate(model, val_dataloader, config["optimizer"].getstruct("loss_weights"),
                             device=device, summary=summary, global_step=global_step,
                             epoch=epoch, num_epochs=total_epochs,
                             log_interval=config["general"].getint("log_interval"),
                             coco_gt=config["dataloader"]["coco_gt"], log_dir=args.log_dir)

            # Update the score on the last saved snapshot
            if rank == 0:
                snapshot = torch.load(snapshot_file, map_location="cpu")
                snapshot["training_meta"]["last_score"] = score
                torch.save(snapshot, snapshot_file)
                del snapshot

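            # Keep a copy of the best-scoring snapshot so far.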
            if score > best_score:
                best_score = score
                if rank == 0:
                    shutil.copy(snapshot_file, path.join(args.log_dir, "model_best.pth.tar"))


if __name__ == "__main__":
    main(parser.parse_args())
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
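
Restoring from one of these snapshots (e.g. model_best.pth.tar) follows the same key layout. The sketch below is an assumption based only on the keys visible above (the keyword arguments to save_snapshot appearing under snapshot["state_dict"], as the meter-restore loop relies on), not code from the repository; load_model_weights is a hypothetical helper name.

import torch

def load_model_weights(model, snapshot_path):
    # Load on CPU first, as the score-update step in the training loop does.
    snapshot = torch.load(snapshot_path, map_location="cpu")
    state = snapshot["state_dict"]

    # The model is wrapped (hence .module in the training loop), so the
    # sub-module state dicts are restored through model.module here as well.
    model.module.body.load_state_dict(state["body"])
    model.module.rpn_head.load_state_dict(state["rpn_head"])
    model.module.roi_head.load_state_dict(state["roi_head"])

    # Return the bookkeeping block so callers can inspect epoch / scores.
    return snapshot["training_meta"]

Typical use would be load_model_weights(model, path.join(args.log_dir, "model_best.pth.tar")) before an evaluation-only run.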
