in scripts/make_wordpieces.py
def json_set_pieces(args, dataset, vocab=None):
    # Train the sentencepiece model only on the training set
    train_text = []
    for subset in dataset.SPLITS["train"]:
        ds = dataset.load_data_split(args.data_dir, subset)
        train_text.extend(l["text"] for l in ds)

    # Optionally train on an external text file instead of the
    # training transcripts
    if args.text_file is not None:
        with open(args.text_file, "r") as fid:
            spm_text = [l.strip() for l in fid]
    else:
        spm_text = train_text

    num_pieces = args.num_pieces
    sp = train_spm_model(
        iter(spm_text),
        num_pieces + 1,  # to account for <unk>
    )

    # Build the word-level vocabulary by splitting transcripts on the
    # "▁" word delimiter
    if vocab is None:
        vocab = sorted(set(w for t in train_text for w in t.split("▁") if w))
    save_pieces(sp, num_pieces, args.output_prefix, vocab)
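
train_spm_model and save_pieces are defined elsewhere in the script. A minimal sketch of what train_spm_model could look like with the sentencepiece Python API follows; the name and signature are taken from the call above, while the training options (in-memory model, bos/eos disabled, full character coverage) are assumptions:

import io

import sentencepiece as spm


def train_spm_model(sentences, vocab_size):
    # Sketch, not the script's actual helper. Train an in-memory
    # sentencepiece model from an iterator of sentences. Disabling
    # bos/eos leaves <unk> as the only reserved token, which is why
    # the caller requests num_pieces + 1 pieces.
    model = io.BytesIO()
    spm.SentencePieceTrainer.train(
        sentence_iterator=sentences,
        model_writer=model,
        vocab_size=vocab_size,
        bos_id=-1,
        eos_id=-1,
        character_coverage=1.0,
    )
    return spm.SentencePieceProcessor(model_proto=model.getvalue())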
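
Similarly, a plausible save_pieces would write the learned pieces as a token list and encode each vocabulary word into its piece decomposition as a lexicon; the output file names here are assumptions:

def save_pieces(sp, num_pieces, output_prefix, vocab):
    # Sketch, not the script's actual helper. Skip id 0, which
    # sentencepiece reserves for <unk>.
    pieces = [sp.id_to_piece(i) for i in range(1, num_pieces + 1)]
    # Token list: one word piece per line (assumed file name)
    with open(f"{output_prefix}_tokens_{num_pieces}.txt", "w") as fid:
        fid.write("\n".join(pieces))
    # Lexicon: map each vocabulary word to its pieces (assumed file name)
    with open(f"{output_prefix}_lex_{num_pieces}.txt", "w") as fid:
        for v in vocab:
            fid.write("{} {}\n".format(v, " ".join(sp.encode_as_pieces(v))))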