def train()

in movement-pruning/masked_run_glue.py [0:0]


def train(args, train_dataset, model, tokenizer, teacher=None):
    """Train the model"""
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter(log_dir=args.output_dir)

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if "mask_score" in n and p.requires_grad],
            "lr": args.mask_scores_learning_rate,
        },
        {
            "params": [
                p
                for n, p in model.named_parameters()
                if "mask_score" not in n and p.requires_grad and not any(nd in n for nd in no_decay)
            ],
            "lr": args.learning_rate,
            "weight_decay": args.weight_decay,
        },
        {
            "params": [
                p
                for n, p in model.named_parameters()
                if "mask_score" not in n and p.requires_grad and any(nd in n for nd in no_decay)
            ],
            "lr": args.learning_rate,
            "weight_decay": 0.0,
        },
    ]

    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
        os.path.join(args.model_name_or_path, "scheduler.pt")
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True,
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)
    # Distillation
    if teacher is not None:
        logger.info("  Training with distillation")

    global_step = 0
    # Global TopK
    if args.global_topk:
        threshold_mem = None
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        # set global_step to global_step of last saved checkpoint from model path
        try:
            global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
        except ValueError:
            global_step = 0
        epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
        steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)

        logger.info("  Continuing training from checkpoint, will skip to saved global_step")
        logger.info("  Continuing training from epoch %d", epochs_trained)
        logger.info("  Continuing training from global step %d", global_step)
        logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(
        epochs_trained,
        int(args.num_train_epochs),
        desc="Epoch",
        disable=args.local_rank not in [-1, 0],
    )
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            threshold, regu_lambda = schedule_threshold(
                step=global_step,
                total_step=t_total,
                warmup_steps=args.warmup_steps,
                final_threshold=args.final_threshold,
                initial_threshold=args.initial_threshold,
                final_warmup=args.final_warmup,
                initial_warmup=args.initial_warmup,
                final_lambda=args.final_lambda,
            )
            # Global TopK
            if args.global_topk:
                if threshold == 1.0:
                    threshold = -1e2  # Or an indefinitely low quantity
                else:
                    if (threshold_mem is None) or (global_step % args.global_topk_frequency_compute == 0):
                        # Sort all the values to get the global topK
                        concat = torch.cat(
                            [param.view(-1) for name, param in model.named_parameters() if "mask_scores" in name]
                        )
                        n = concat.numel()
                        kth = max(n - (int(n * threshold) + 1), 1)
                        threshold_mem = concat.kthvalue(kth).values.item()
                        threshold = threshold_mem
                    else:
                        threshold = threshold_mem
            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
            if args.model_type != "distilbert":
                inputs["token_type_ids"] = (
                    batch[2] if args.model_type in ["bert", "masked_bert", "xlnet", "albert"] else None
                )  # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids

            if "masked" in args.model_type:
                inputs["threshold"] = threshold

            outputs = model(**inputs)
            loss, logits_stu = outputs  # model outputs are always tuple in transformers (see doc)

            # Distillation loss
            if teacher is not None:
                if "token_type_ids" not in inputs:
                    inputs["token_type_ids"] = None if args.teacher_type == "xlm" else batch[2]
                with torch.no_grad():
                    (logits_tea,) = teacher(
                        input_ids=inputs["input_ids"],
                        token_type_ids=inputs["token_type_ids"],
                        attention_mask=inputs["attention_mask"],
                    )

                loss_logits = nn.functional.kl_div(
                    input=nn.functional.log_softmax(logits_stu / args.temperature, dim=-1),
                    target=nn.functional.softmax(logits_tea / args.temperature, dim=-1),
                    reduction="batchmean",
                ) * (args.temperature**2)

                loss = args.alpha_distil * loss_logits + args.alpha_ce * loss

            # Regularization
            if args.regularization is not None:
                regu_ = regularization(model=model, mode=args.regularization)
                loss = loss + regu_lambda * regu_

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0 or (
                # last step in epoch but step is always smaller than gradient_accumulation_steps
                len(epoch_iterator) <= args.gradient_accumulation_steps and (step + 1) == len(epoch_iterator)
            ):
                if args.fp16:
                    nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    tb_writer.add_scalar("threshold", threshold, global_step)
                    for name, param in model.named_parameters():
                        if not param.requires_grad:
                            continue
                        tb_writer.add_scalar("parameter_mean/" + name, param.data.mean(), global_step)
                        tb_writer.add_scalar("parameter_std/" + name, param.data.std(), global_step)
                        tb_writer.add_scalar("parameter_min/" + name, param.data.min(), global_step)
                        tb_writer.add_scalar("parameter_max/" + name, param.data.max(), global_step)
                        tb_writer.add_scalar("grad_mean/" + name, param.grad.data.mean(), global_step)
                        tb_writer.add_scalar("grad_std/" + name, param.grad.data.std(), global_step)
                        if args.regularization is not None and "mask_scores" in name:
                            if args.regularization == "l1":
                                perc = (torch.sigmoid(param) > threshold).sum().item() / param.numel()
                            elif args.regularization == "l0":
                                perc = (torch.sigmoid(param - 2 / 3 * np.log(0.1 / 1.1))).sum().item() / param.numel()
                            tb_writer.add_scalar("retained_weights_perc/" + name, perc, global_step)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    logs = {}
                    if (
                        args.local_rank == -1 and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            eval_key = "eval_{}".format(key)
                            logs[eval_key] = value

                    loss_scalar = (tr_loss - logging_loss) / args.logging_steps
                    learning_rate_scalar = scheduler.get_lr()
                    logs["learning_rate"] = learning_rate_scalar[0]
                    if len(learning_rate_scalar) > 1:
                        for idx, lr in enumerate(learning_rate_scalar[1:]):
                            logs[f"learning_rate/{idx+1}"] = lr
                    logs["loss"] = loss_scalar
                    if teacher is not None:
                        logs["loss/distil"] = loss_logits.item()
                    if args.regularization is not None:
                        logs["loss/regularization"] = regu_.item()
                    if (teacher is not None) or (args.regularization is not None):
                        if (teacher is not None) and (args.regularization is not None):
                            logs["loss/instant_ce"] = (
                                loss.item()
                                - regu_lambda * logs["loss/regularization"]
                                - args.alpha_distil * logs["loss/distil"]
                            ) / args.alpha_ce
                        elif teacher is not None:
                            logs["loss/instant_ce"] = (
                                loss.item() - args.alpha_distil * logs["loss/distil"]
                            ) / args.alpha_ce
                        else:
                            logs["loss/instant_ce"] = loss.item() - regu_lambda * logs["loss/regularization"]
                    logging_loss = tr_loss

                    for key, value in logs.items():
                        tb_writer.add_scalar(key, value, global_step)
                    print(json.dumps({**logs, **{"step": global_step}}))

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step