def cmd_bin()

in src/mlm/cmds.py [0:0]


def cmd_bin(args: argparse.Namespace) -> None:
    """mlm bin command.

    Bins per-token pseudo-log-likelihood scores from a pretrained (masked)
    LM over a reference corpus and saves the per-bin counts and score sums
    as NumPy arrays.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed CLI arguments. Reads: gpus, model, weights, no_mask,
        whole_word_mask, counts_file, sums_file, eos, capitalize, mode,
        infile, max_utts, split_size.

    Raises
    ------
    ValueError
        If the model type, mode, or flag combination is unsupported.
    """

    # Get model
    ctxs = setup_ctxs(args.gpus)
    weights_file = Path(args.weights) if isinstance(args.weights, str) else None
    model, vocab, tokenizer = get_pretrained(ctxs, args.model, weights_file, regression=args.no_mask)

    # Define output files
    counts_file = Path(args.counts_file)
    sums_file = Path(args.sums_file)

    # Set binner
    # NOTE: explicit raises instead of asserts — asserts are stripped under
    # `python -O`, so flag validation must not rely on them.
    if isinstance(model, nlp.model.BERTModel):
        if args.whole_word_mask or args.no_mask:
            raise ValueError("--whole-word-mask and --no-mask are not supported for binning")
        binner = MLMBinner(model, vocab, tokenizer, eos=args.eos, capitalize=args.capitalize, ctxs=ctxs)
    elif isinstance(model, BERTRegression):
        raise ValueError("Not supported")
    else:
        if args.whole_word_mask or args.no_mask:
            raise ValueError("--whole-word-mask and --no-mask are not supported for binning")
        binner = LMBinner(model, vocab, tokenizer, eos=args.eos, capitalize=args.capitalize, ctxs=ctxs)

    # What data do we use?
    if args.mode == 'hyp':

        raise ValueError("Not supported")

    elif args.mode == 'ref':

        corpus = Corpus.from_file(args.infile, max_utts=args.max_utts)
        # logging.warn is deprecated; use logging.warning (matches the calls below)
        logging.warning("# sentences: {}".format(len(corpus)))

    else:
        # Previously fell through silently, raising a confusing NameError on
        # 'corpus' below; fail fast with a clear message instead.
        raise ValueError("Unknown mode: '{}'".format(args.mode))

    # === START SHARED COMPUTATION ===

    # A binner takes a corpus and produces a list of bin counts and scores
    bin_counts, bin_sums = binner.bin(corpus, ratio=1, split_size=args.split_size)
    logging.warning("Saving bin counts to '{}'".format(counts_file))
    np.save(counts_file, bin_counts)
    logging.warning("Saving bin sums to '{}'".format(sums_file))
    np.save(sums_file, bin_sums)