in scripts/make_wordpieces.py [0:0]
def iamdb_pieces(args):
    """Train a sentencepiece wordpiece model on the IAMDB training text.

    Loads the IAMDB metadata, excludes every form key listed in the split
    files (``<split>.txt`` under ``args.data_dir``), trains a sentencepiece
    model with ``args.num_pieces`` pieces on the remaining (training) text,
    and saves the pieces via ``save_pieces``.

    Args:
        args: Parsed CLI namespace with ``data_dir``, ``num_pieces`` and
            ``output_prefix`` attributes.
    """
    iamdb = module_from_file("iamdb", os.path.join(root_dir, "datasets/iamdb.py"))
    forms = iamdb.load_metadata(args.data_dir, "▁")

    # Collect every form key mentioned in any split file; these are the
    # held-out keys to exclude from sentencepiece training below.
    ds_keys = set()
    for split in iamdb.SPLITS.values():
        for ds in split:
            with open(os.path.join(args.data_dir, f"{ds}.txt"), "r") as fid:
                ds_keys.update(line.strip() for line in fid)

    # Train sentencepiece model only on the training set
    text = [entry["text"] for lines in forms.values()
            for entry in lines if entry["key"] not in ds_keys]
    num_pieces = args.num_pieces
    sp = train_spm_model(
        iter(text),
        num_pieces + 1,  # to account for <unk>
        user_symbols=["/"],  # added so token is in the output set
    )
    # Word-level vocabulary: split on the sentencepiece word-boundary
    # marker and drop empty fragments.
    vocab = sorted(set(w for t in text for w in t.split("▁") if w))
    save_pieces(sp, num_pieces, args.output_prefix, vocab)