in src/mlm/cmds.py [0:0]
def _add_capitalize_flags(subparser) -> None:
    """Add the tri-state --capitalize / --no-capitalize flag pair.

    The resulting `capitalize` attribute is True, False, or None
    (None = neither flag given, i.e. leave casing unchanged).
    """
    group = subparser.add_mutually_exclusive_group(required=False)
    group.add_argument('--capitalize', dest='capitalize', action='store_true')
    group.add_argument('--no-capitalize', dest='capitalize', action='store_false')
    subparser.set_defaults(capitalize=None)


def _add_text_args(subparser) -> None:
    """Add text-preprocessing arguments shared by 'score', 'bin', and 'finetune'."""
    subparser.add_argument('--eos', action='store_true',
        help="append '.' (this can help mitigate train-test disparity)")
    _add_capitalize_flags(subparser)
    subparser.add_argument('--whole-word-mask', action='store_true',
        help="mask whole words")


def _add_scoring_args(subparser, split_size: int) -> None:
    """Add the arguments shared by the 'score' and 'bin' subcommands.

    Args:
        subparser: the subcommand parser to populate.
        split_size: default value for --split-size (the two subcommands
            historically use different defaults: 500 for score, 1000 for bin).
    """
    subparser.add_argument('--mode', type=str, choices=['ref', 'hyp'],
        help="Scoring references (.txt, .json 'refs') vs. hypotheses (.json 'hyp_*')")
    subparser.add_argument('--temp', type=float, default=1.0,
        help="softmax temperature")
    subparser.add_argument('--split-size', type=int, default=split_size,
        help="split size (per GPU)")
    subparser.add_argument('--no-mask', action='store_true',
        help="Instead of making masked copies, do not mask")
    _add_text_args(subparser)
    # Positional; must be added before any further positionals (e.g. bin's
    # counts_file/sums_file) so the parse order is preserved.
    subparser.add_argument('infile', nargs='?', type=argparse.FileType('rt'),
        help="File to score (.json = ESPNet JSON, otherwise newline-separated text). Loads whole file into memory!")


def main() -> None:
    """Defines arguments for all subcommands and dispatches to the chosen one."""
    parser = argparse.ArgumentParser(description="Masked Language Model Scoring")
    subparsers = parser.add_subparsers(help="Run 'mlm {subcommand} -h' for details")
    logging.basicConfig(
        format='%(asctime)s %(levelname)-8s %(message)s',
        level=logging.INFO,
        datefmt='%Y-%m-%d %H:%M:%S')

    # score
    parser_score = subparsers.add_parser('score', help='Scores JSON or TXT files of sentences')
    _shared_args(parser_score)
    _add_scoring_args(parser_score, split_size=500)
    parser_score.add_argument('--tgt', type=str, default='en',
        help="Code to use for language embeddings, where appropriate")
    parser_score.add_argument('--detok', action='store_true',
        help="perform Moses English detokenization on hypotheses before scoring")
    parser_score.add_argument('--per-token', action='store_true',
        help="output lists of per-token scores (slower)")
    parser_score.set_defaults(func=cmd_score)

    # bin (same arguments as score; when stable, make flag)
    parser_bin = subparsers.add_parser('bin', help='Computes bin statistics when scoring')
    _shared_args(parser_bin)
    _add_scoring_args(parser_bin, split_size=1000)
    parser_bin.add_argument('counts_file', nargs='?', type=str,
        help="where to dump the counts per bin")
    parser_bin.add_argument('sums_file', nargs='?', type=str,
        help="where to dump the sums per bin")
    parser_bin.set_defaults(func=cmd_bin)

    # rescore
    parser_rescore = subparsers.add_parser('rescore', help='Rescores two files together')
    _shared_args(parser_rescore)
    parser_rescore.add_argument('--weight', type=str, default='0.3',
        help="AM score is (1-sum(weight)), LM scores are weights delimited by commas")
    parser_rescore.add_argument('--ref-file', type=argparse.FileType('rt'),
        help="Specify an alternative reference file to FILE_AM")
    parser_rescore.add_argument('--ln', type=float, default=None,
        help="apply GNMT normalization with this scale to each >>LM<< score")
    parser_rescore.add_argument('--ln-type', type=str, choices=['gnmt', 'length'], default='gnmt',
        help="type of normalization to apply")
    parser_rescore.add_argument('file_am', type=argparse.FileType('rt'),
        help="File with AM scores (.json = JSON)")
    parser_rescore.add_argument('file_lm', type=str,
        help="File(s) with LM scores (.json = JSON), delimited by commas")
    parser_rescore.set_defaults(func=cmd_rescore)

    # finetune
    parser_finetune = subparsers.add_parser('finetune', help='Finetune to scoring without masks')
    _shared_args(parser_finetune)
    parser_finetune.add_argument('--corpus-dir', type=str, required=True,
        help="Directory of part.*")
    parser_finetune.add_argument('--score-dir', type=str, required=True,
        help="Directory of part.*.ref.scores")
    parser_finetune.add_argument('--output-dir', type=str, required=True,
        help="Directory to output .param files")
    parser_finetune.add_argument('--freeze', type=int, default=0,
        help="Number of initial layers to freeze")
    _add_text_args(parser_finetune)
    parser_finetune.add_argument('--split-size', type=int, default=1000,
        help="split size (per GPU)")
    parser_finetune.set_defaults(func=cmd_finetune)

    args = parser.parse_args()
    # On Python 3, subparsers are optional by default: with no subcommand,
    # args has no 'func' and the original code crashed with AttributeError.
    # Print usage and exit with the conventional argparse error status instead.
    if not hasattr(args, 'func'):
        parser.print_help()
        parser.exit(2)
    args.func(args)