abstractive_summarization/src/dapt_pretraining.py [196:266]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        torch.save(self.model, saved_path)  # saves the whole model object via pickle, not just a state_dict

if __name__ == "__main__":
    # configuration
    parser = argparse.ArgumentParser()
    parser.add_argument('-visible_gpu', default='1', type=str)
    parser.add_argument('-bsz', type=int, default=4, help="batch size")
    parser.add_argument('-path', type=str, default="", help="data path")
    parser.add_argument('-epoch', type=int, default=10, help="number of training epochs")
    parser.add_argument('-mask_prob', type=float, default=0.15, help="mask probability")
    parser.add_argument('-dm', type=str, default="", help="domain name")
    parser.add_argument('-random_seed', type=int, default=0)
    parser.add_argument('-save_interval', default=10000, type=int)
    # optimizer configuration
    parser.add_argument('-lr', default=0.05, type=float)
    parser.add_argument('-optim', default='adam', type=str)
    parser.add_argument('-max_grad_norm', default=0, type=float)
    parser.add_argument('-beta1', default=0.9, type=float)
    parser.add_argument('-beta2', default=0.998, type=float)
    parser.add_argument('-warmup_steps', default=10000, type=int)
    parser.add_argument('-decay_method', default='noam', type=str)
    parser.add_argument('-enc_hidden_size', default=768, type=int)
    parser.add_argument('-clip', type=float, default=1.0, help="gradient clip")
    parser.add_argument('-accum_step', type=int, default=10, help="accumulation steps")
    parser.add_argument('-train_from', default='', type=str)
    # RecAdam configuration
    parser.add_argument("-adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument('-recadam', default=False, action='store_true')
    parser.add_argument("-weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
    parser.add_argument("-anneal_w", type=float, default=1.0, help="Weight for the annealing function in RecAdam. Default 1.0.")
    parser.add_argument("-anneal_fun", type=str, default='sigmoid', choices=["sigmoid", "linear", 'constant'], help="the type of annealing function in RecAdam. Default sigmoid")
    parser.add_argument("-anneal_t0", type=int, default=1000, help="t0 for the annealing function in RecAdam.")
    parser.add_argument("-anneal_k", type=float, default=0.1, help="k for the annealing function in RecAdam.")
    parser.add_argument("-pretrain_cof", type=float, default=5000.0, help="Coefficient of the quadratic penalty in RecAdam. Default 5000.0.")
    parser.add_argument("-logging_Euclid_dist", action="store_true", help="Whether to log the Euclidean distance between the pretrained model and fine-tuning model")
    parser.add_argument("-max_steps", default=-1, type=int, help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
    parser.add_argument("-model_type", type=str, default="layers")

    args = parser.parse_args()

    # set random seed
    random.seed(args.random_seed)
    np.random.seed(args.random_seed)
    torch.manual_seed(args.random_seed)
    torch.cuda.manual_seed(args.random_seed)
    torch.backends.cudnn.deterministic = True

    # set gpu
    os.environ["CUDA_VISIBLE_DEVICES"] = args.visible_gpu

    print("Loading datasets ...")
    dataset = CorpusDataset(args.path)
    dataloader = DataLoader(dataset=dataset, batch_size=args.bsz, shuffle=True)

    if args.train_from:
        model = torch.load(args.train_from, map_location='cpu')
    else:
        model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')
    model.cuda()

    tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')

    if args.recadam:
        pretrained_model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')
        pretrained_model.cuda()
    else:
        pretrained_model = None

    bart_lm_trainer = BartLMTrainer(model, dataloader, tokenizer, args, pretrained_model)

    bart_lm_trainer.train()
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
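
Note on `-mask_prob`: the masking logic lives inside `BartLMTrainer`, outside the excerpted line range. A minimal sketch of what a 15% token-masking step typically looks like for BART-style denoising pretraining; the function name `mask_tokens` and the labeling convention are assumptions, not the trainer's actual code:

    import torch
    from transformers import BartTokenizer

    def mask_tokens(input_ids, tokenizer, mask_prob=0.15):
        """Hypothetical sketch: replace a random subset of non-special tokens
        with <mask>; labels keep the original ids so the model is trained to
        reconstruct the corrupted positions."""
        labels = input_ids.clone()
        # Sample a Bernoulli mask over every position ...
        probability_matrix = torch.full(input_ids.shape, mask_prob)
        # ... but never mask special tokens (<s>, </s>, <pad>).
        special = torch.tensor(
            [tokenizer.get_special_tokens_mask(ids, already_has_special_tokens=True)
             for ids in input_ids.tolist()], dtype=torch.bool)
        probability_matrix.masked_fill_(special, 0.0)
        masked_indices = torch.bernoulli(probability_matrix).bool()
        corrupted = input_ids.clone()
        corrupted[masked_indices] = tokenizer.mask_token_id
        return corrupted, labels

    tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
    batch = tokenizer(["domain-adaptive pretraining corrupts the input text"],
                      return_tensors='pt')
    masked_ids, labels = mask_tokens(batch['input_ids'], tokenizer, mask_prob=0.15)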


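Similarly, `-decay_method noam` together with `-warmup_steps` and the large base `-lr 0.05` point to the inverse-square-root schedule from "Attention Is All You Need"; the optimizer setup is also outside this excerpt, so the following is a sketch under that assumption, with `-enc_hidden_size` as the model dimension:

    def noam_lr(step: int, lr: float = 0.05, warmup_steps: int = 10000,
                hidden_size: int = 768) -> float:
        """Linear warmup for `warmup_steps` steps, then inverse-sqrt decay."""
        step = max(step, 1)
        return lr * hidden_size ** -0.5 * min(step ** -0.5,
                                              step * warmup_steps ** -1.5)

    # The peak rate falls at the end of warmup: with the defaults above,
    # noam_lr(10000) = 0.05 * 768**-0.5 * 10000**-0.5, roughly 1.8e-5.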

abstractive_summarization/src/tapt_pretraining.py [196:266]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        torch.save(self.model, saved_path)  # saves the whole model object via pickle, not just a state_dict

if __name__ == "__main__":
    # configuration
    parser = argparse.ArgumentParser()
    parser.add_argument('-visible_gpu', default='1', type=str)
    parser.add_argument('-bsz', type=int, default=4, help="batch size")
    parser.add_argument('-path', type=str, default="", help="data path")
    parser.add_argument('-epoch', type=int, default=10, help="number of training epochs")
    parser.add_argument('-mask_prob', type=float, default=0.15, help="mask probability")
    parser.add_argument('-dm', type=str, default="", help="domain name")
    parser.add_argument('-random_seed', type=int, default=0)
    parser.add_argument('-save_interval', default=10000, type=int)
    # optimizer configuration
    parser.add_argument('-lr', default=0.05, type=float)
    parser.add_argument('-optim', default='adam', type=str)
    parser.add_argument('-max_grad_norm', default=0, type=float)
    parser.add_argument('-beta1', default=0.9, type=float)
    parser.add_argument('-beta2', default=0.998, type=float)
    parser.add_argument('-warmup_steps', default=10000, type=int)
    parser.add_argument('-decay_method', default='noam', type=str)
    parser.add_argument('-enc_hidden_size', default=768, type=int)
    parser.add_argument('-clip', type=float, default=1.0, help="gradient clip")
    parser.add_argument('-accum_step', type=int, default=10, help="accumulation steps")
    parser.add_argument('-train_from', default='', type=str)
    # RecAdam configuration
    parser.add_argument("-adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument('-recadam', default=False, action='store_true')
    parser.add_argument("-weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
    parser.add_argument("-anneal_w", type=float, default=1.0, help="Weight for the annealing function in RecAdam. Default 1.0.")
    parser.add_argument("-anneal_fun", type=str, default='sigmoid', choices=["sigmoid", "linear", 'constant'], help="the type of annealing function in RecAdam. Default sigmoid")
    parser.add_argument("-anneal_t0", type=int, default=1000, help="t0 for the annealing function in RecAdam.")
    parser.add_argument("-anneal_k", type=float, default=0.1, help="k for the annealing function in RecAdam.")
    parser.add_argument("-pretrain_cof", type=float, default=5000.0, help="Coefficient of the quadratic penalty in RecAdam. Default 5000.0.")
    parser.add_argument("-logging_Euclid_dist", action="store_true", help="Whether to log the Euclidean distance between the pretrained model and fine-tuning model")
    parser.add_argument("-max_steps", default=-1, type=int, help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
    parser.add_argument("-model_type", type=str, default="layers")

    args = parser.parse_args()

    # set random seed
    random.seed(args.random_seed)
    np.random.seed(args.random_seed)
    torch.manual_seed(args.random_seed)
    torch.cuda.manual_seed(args.random_seed)
    torch.backends.cudnn.deterministic = True

    # set gpu
    os.environ["CUDA_VISIBLE_DEVICES"] = args.visible_gpu

    print("Loading datasets ...")
    dataset = CorpusDataset(args.path)
    dataloader = DataLoader(dataset=dataset, batch_size=args.bsz, shuffle=True)

    if args.train_from:
        model = torch.load(args.train_from, map_location='cpu')
    else:
        model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')
    model.cuda()

    tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')

    if args.recadam:
        pretrained_model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')
        pretrained_model.cuda()
    else:
        pretrained_model = None

    bart_lm_trainer = BartLMTrainer(model, dataloader, tokenizer, args, pretrained_model)

    bart_lm_trainer.train()
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
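
For the RecAdam flags shared by both scripts (`-anneal_fun`, `-anneal_k`, `-anneal_t0`, `-anneal_w`, `-pretrain_cof`): RecAdam (Chen et al., 2020, "Recall and Learn") anneals between the task loss and a quadratic penalty that pulls parameters back toward `pretrained_model`, which is why the second copy of bart-base is loaded when `-recadam` is set. The actual RecAdam optimizer applies this inside its per-parameter update and is imported elsewhere in the file; the following is a hedged sketch of the objective those flags parameterize, assuming `model` and `pretrained_model` share one architecture:

    import math
    import torch

    def anneal_lambda(step: int, anneal_fun: str = 'sigmoid', k: float = 0.1,
                      t0: int = 1000, w: float = 1.0) -> float:
        """Annealing coefficient lambda(t): the task term is scaled by lambda
        and the pull-toward-pretrained term by (1 - lambda)."""
        if anneal_fun == 'sigmoid':
            return w / (1.0 + math.exp(-k * (step - t0)))
        if anneal_fun == 'linear':
            return w * min(1.0, step / t0)
        return w  # 'constant'

    def recadam_objective(task_loss, model, pretrained_model, step,
                          pretrain_cof=5000.0):
        # Quadratic penalty keeping parameters near the pretrained weights;
        # zipping parameters assumes the two models are structurally identical.
        penalty = sum(torch.sum((p - p0.detach()) ** 2)
                      for p, p0 in zip(model.parameters(),
                                       pretrained_model.parameters()))
        lam = anneal_lambda(step)
        return lam * task_loss + (1.0 - lam) * pretrain_cof * penalty

With the defaults (`k=0.1`, `t0=1000`), lambda stays near 0 early in training, so the model is held close to the pretrained weights, then rises toward `-anneal_w` and lets the task loss dominate.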



