def tok()

in filtering/deduplication/hf_dataset_to_file.py [0:0]


def tok(x):
    """Convert one raw record into the value that gets emitted for it.

    If ``args.tokenize`` is falsy, ``x`` is handed back untouched.
    Otherwise ``x`` is decoded as UTF-8, run through ``tokenizer.encode``,
    and the resulting ids are stored in a ``np.uint16`` array whose
    underlying buffer is returned as ``bytes``.

    NOTE(review): presumably ``tokenizer.encode`` returns a flat sequence of
    integer ids that all fit in 16 bits; confirm against the tokenizer in use.
    """
    # Pass-through mode: no decoding or tokenization requested.
    if not args.tokenize:
        return x

    token_ids = tokenizer.encode(x.decode("utf8"))
    packed = np.array(token_ids, dtype=np.uint16)
    # Reinterpret the uint16 buffer as individual bytes before serializing.
    return packed.view(np.uint8).tobytes()