in scripts/make_wordpieces.py
def json_set_pieces(args, dataset, vocab=None):
    # Train the sentencepiece model only on the training set
    train_text = []
    for subset in dataset.SPLITS["train"]:
        ds = dataset.load_data_split(args.data_dir, subset)
        train_text.extend(l["text"] for l in ds)

    # Optionally train on an external text file instead of the
    # training transcripts
    if args.text_file is not None:
        with open(args.text_file, "r") as fid:
            spm_text = [l.strip() for l in fid]
    else:
        spm_text = train_text

    num_pieces = args.num_pieces
    sp = train_spm_model(
        iter(spm_text),
        num_pieces + 1,  # to account for <unk>
    )

    # Build the word-level vocabulary by splitting transcripts on the
    # "▁" word delimiter
    if vocab is None:
        vocab = sorted(set(w for t in train_text for w in t.split("▁") if w))
    save_pieces(sp, num_pieces, args.output_prefix, vocab)
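
train_spm_model and save_pieces are defined elsewhere in the script. A minimal sketch of what train_spm_model could look like with the sentencepiece Python API follows; the name and signature are taken from the call above, while the training options (in-memory model, bos/eos disabled, full character coverage) are assumptions:

import io

import sentencepiece as spm


def train_spm_model(sentences, vocab_size):
    # Sketch, not the script's actual helper. Train an in-memory
    # sentencepiece model from an iterator of sentences. Disabling
    # bos/eos leaves <unk> as the only reserved token, which is why
    # the caller requests num_pieces + 1 pieces.
    model = io.BytesIO()
    spm.SentencePieceTrainer.train(
        sentence_iterator=sentences,
        model_writer=model,
        vocab_size=vocab_size,
        bos_id=-1,
        eos_id=-1,
        character_coverage=1.0,
    )
    return spm.SentencePieceProcessor(model_proto=model.getvalue())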
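
Similarly, a plausible save_pieces would write the learned pieces as a token list and encode each vocabulary word into its piece decomposition as a lexicon; the output file names here are assumptions:

def save_pieces(sp, num_pieces, output_prefix, vocab):
    # Sketch, not the script's actual helper. Skip id 0, which
    # sentencepiece reserves for <unk>.
    pieces = [sp.id_to_piece(i) for i in range(1, num_pieces + 1)]
    # Token list: one word piece per line (assumed file name)
    with open(f"{output_prefix}_tokens_{num_pieces}.txt", "w") as fid:
        fid.write("\n".join(pieces))
    # Lexicon: map each vocabulary word to its pieces (assumed file name)
    with open(f"{output_prefix}_lex_{num_pieces}.txt", "w") as fid:
        for v in vocab:
            fid.write("{} {}\n".format(v, " ".join(sp.encode_as_pieces(v))))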