in src/mlm/cmds.py [0:0]
def cmd_bin(args: argparse.Namespace) -> None:
    """mlm bin command.

    Bins token-level scores over a reference corpus using a masked-LM or
    causal-LM binner, then saves per-bin counts and score sums as .npy files.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed CLI arguments. Reads: gpus, model, weights, no_mask,
        whole_word_mask, eos, capitalize, counts_file, sums_file, mode,
        infile, max_utts, split_size.

    Raises
    ------
    ValueError
        If the model is a BERTRegression, or if args.mode is not 'ref'
        ('hyp' and any other mode are unsupported for binning).
    """
    # Get model
    ctxs = setup_ctxs(args.gpus)
    weights_file = Path(args.weights) if isinstance(args.weights, str) else None
    model, vocab, tokenizer = get_pretrained(ctxs, args.model, weights_file, regression=args.no_mask)
    # Define output files
    counts_file = Path(args.counts_file)
    sums_file = Path(args.sums_file)
    # Set binner: masked-LM binner for BERT models, causal-LM binner otherwise;
    # regression models are not binnable.
    if isinstance(model, nlp.model.BERTModel):
        # whole-word masking and mask-free scoring are not supported by MLMBinner
        assert not args.whole_word_mask
        assert not args.no_mask
        binner = MLMBinner(model, vocab, tokenizer, eos=args.eos, capitalize=args.capitalize, ctxs=ctxs)
    elif isinstance(model, BERTRegression):
        raise ValueError("Not supported")
    else:
        assert not args.whole_word_mask
        assert not args.no_mask
        binner = LMBinner(model, vocab, tokenizer, eos=args.eos, capitalize=args.capitalize, ctxs=ctxs)
    # What data do we use?
    if args.mode == 'hyp':
        raise ValueError("Not supported")
    elif args.mode == 'ref':
        corpus = Corpus.from_file(args.infile, max_utts=args.max_utts)
        # logging.warning replaces the deprecated logging.warn alias
        logging.warning("# sentences: {}".format(len(corpus)))
    else:
        # Fail fast with a clear message instead of a NameError on `corpus` below
        raise ValueError("Unknown mode: '{}'".format(args.mode))
    # === START SHARED COMPUTATION ===
    # A binner takes a corpus and produces a list of bin counts and scores
    bin_counts, bin_sums = binner.bin(corpus, ratio=1, split_size=args.split_size)
    logging.warning("Saving bin counts to '{}'".format(counts_file))
    np.save(counts_file, bin_counts)
    logging.warning("Saving bin sums to '{}'".format(sums_file))
    np.save(sums_file, bin_sums)