movement-pruning/masked_run_glue.py [652:735]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    )
    parser.add_argument(
        "--learning_rate",
        type=float,
        default=5e-5,
        help="The initial learning rate for Adam.",
    )

    # Pruning parameters
    # -- separate Adam learning rate for the mask scores
    parser.add_argument(
        "--mask_scores_learning_rate",
        type=float,
        default=1e-2,
        help="The Adam initial learning rate of the mask scores.",
    )
    # -- threshold schedule: start and end values
    parser.add_argument(
        "--initial_threshold",
        type=float,
        default=1.0,
        help="Initial value of the threshold (for scheduling).",
    )
    parser.add_argument(
        "--final_threshold",
        type=float,
        default=0.7,
        help="Final value of the threshold (for scheduling).",
    )
    # -- threshold schedule: plateau lengths, given as integer multiples of `warmup_steps`
    parser.add_argument(
        "--initial_warmup",
        type=int,
        default=1,
        help=(
            "Run `initial_warmup` * `warmup_steps` steps of threshold warmup "
            "during which threshold stays at its `initial_threshold` value "
            "(sparsity schedule)."
        ),
    )
    parser.add_argument(
        "--final_warmup",
        type=int,
        default=2,
        help=(
            "Run `final_warmup` * `warmup_steps` steps of threshold cool-down "
            "during which threshold stays at its final_threshold value "
            "(sparsity schedule)."
        ),
    )

    # Which pruning algorithm to run. NOTE(review): the `sigmoied_threshold` spelling
    # is kept verbatim; presumably it is the literal value matched elsewhere - confirm
    # before "correcting" it.
    parser.add_argument(
        "--pruning_method",
        type=str,
        default="topK",
        help=(
            "Pruning Method ("
            "l0 = L0 regularization, "
            "magnitude = Magnitude pruning, "
            "topK = Movement pruning, "
            "sigmoied_threshold = Soft movement pruning"
            ")."
        ),
    )
    # How the mask scores are initialized, plus the parameter fed to that initializer.
    parser.add_argument(
        "--mask_init",
        type=str,
        default="constant",
        help="Initialization method for the mask scores. Choices: constant, uniform, kaiming.",
    )
    parser.add_argument(
        "--mask_scale",
        type=float,
        default=0.0,
        help="Initialization parameter for the chosen initialization method.",
    )

    # Optional regularization of the mask scores. `--regularization` stays None unless
    # given on the command line; `type=str` is spelled out for consistency with the
    # other string options (argparse already keeps values as strings by default).
    parser.add_argument(
        "--regularization",
        default=None,
        type=str,
        help="Add L0 or L1 regularization to the mask scores.",
    )
    # Strength of the regularization above; the help text previously had an
    # unbalanced parenthesis, which is closed here.
    parser.add_argument(
        "--final_lambda",
        default=0.0,
        type=float,
        help="Regularization intensity (used in conjunction with `regularization`).",
    )

    # Boolean switch: False unless the flag is present on the command line.
    parser.add_argument(
        "--global_topk",
        action="store_true",
        help="Global TopK on the Scores.",
    )
    # Integer frequency for computing the global TopK threshold.
    parser.add_argument(
        "--global_topk_frequency_compute",
        type=int,
        default=25,
        help="Frequency at which we compute the TopK global threshold.",
    )

    # Distillation parameters (optional)
    # `--teacher_type` defaults to None when not supplied.
    parser.add_argument(
        "--teacher_type",
        type=str,
        default=None,
        help=(
            "Teacher type. "
            "Teacher tokenizer and student (model) tokenizer must output the same tokenization. "
            "Only for distillation."
        ),
    )
    parser.add_argument(
        "--teacher_name_or_path",
        default=None,
        type=str,
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



movement-pruning/masked_run_squad.py [770:853]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    )
    parser.add_argument(
        "--learning_rate",
        type=float,
        default=5e-5,
        help="The initial learning rate for Adam.",
    )

    # Pruning parameters
    # -- separate Adam learning rate for the mask scores
    parser.add_argument(
        "--mask_scores_learning_rate",
        type=float,
        default=1e-2,
        help="The Adam initial learning rate of the mask scores.",
    )
    # -- threshold schedule: start and end values
    parser.add_argument(
        "--initial_threshold",
        type=float,
        default=1.0,
        help="Initial value of the threshold (for scheduling).",
    )
    parser.add_argument(
        "--final_threshold",
        type=float,
        default=0.7,
        help="Final value of the threshold (for scheduling).",
    )
    # -- threshold schedule: plateau lengths, given as integer multiples of `warmup_steps`
    parser.add_argument(
        "--initial_warmup",
        type=int,
        default=1,
        help=(
            "Run `initial_warmup` * `warmup_steps` steps of threshold warmup "
            "during which threshold stays at its `initial_threshold` value "
            "(sparsity schedule)."
        ),
    )
    parser.add_argument(
        "--final_warmup",
        type=int,
        default=2,
        help=(
            "Run `final_warmup` * `warmup_steps` steps of threshold cool-down "
            "during which threshold stays at its final_threshold value "
            "(sparsity schedule)."
        ),
    )

    # Which pruning algorithm to run. NOTE(review): the `sigmoied_threshold` spelling
    # is kept verbatim; presumably it is the literal value matched elsewhere - confirm
    # before "correcting" it.
    parser.add_argument(
        "--pruning_method",
        type=str,
        default="topK",
        help=(
            "Pruning Method ("
            "l0 = L0 regularization, "
            "magnitude = Magnitude pruning, "
            "topK = Movement pruning, "
            "sigmoied_threshold = Soft movement pruning"
            ")."
        ),
    )
    # How the mask scores are initialized, plus the parameter fed to that initializer.
    parser.add_argument(
        "--mask_init",
        type=str,
        default="constant",
        help="Initialization method for the mask scores. Choices: constant, uniform, kaiming.",
    )
    parser.add_argument(
        "--mask_scale",
        type=float,
        default=0.0,
        help="Initialization parameter for the chosen initialization method.",
    )

    # Optional regularization of the mask scores. `--regularization` stays None unless
    # given on the command line; `type=str` is spelled out for consistency with the
    # other string options (argparse already keeps values as strings by default).
    parser.add_argument(
        "--regularization",
        default=None,
        type=str,
        help="Add L0 or L1 regularization to the mask scores.",
    )
    # Strength of the regularization above; the help text previously had an
    # unbalanced parenthesis, which is closed here.
    parser.add_argument(
        "--final_lambda",
        default=0.0,
        type=float,
        help="Regularization intensity (used in conjunction with `regularization`).",
    )

    # Boolean switch: False unless the flag is present on the command line.
    parser.add_argument(
        "--global_topk",
        action="store_true",
        help="Global TopK on the Scores.",
    )
    # Integer frequency for computing the global TopK threshold.
    parser.add_argument(
        "--global_topk_frequency_compute",
        type=int,
        default=25,
        help="Frequency at which we compute the TopK global threshold.",
    )

    # Distillation parameters (optional)
    # `--teacher_type` defaults to None when not supplied.
    parser.add_argument(
        "--teacher_type",
        type=str,
        default=None,
        help=(
            "Teacher type. "
            "Teacher tokenizer and student (model) tokenizer must output the same tokenization. "
            "Only for distillation."
        ),
    )
    parser.add_argument(
        "--teacher_name_or_path",
        default=None,
        type=str,
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



