in filtering/deduplication/hf_dataset_to_file.py [0:0]
def tok(x):
    """Return ``x`` ready for output, tokenizing it first when requested.

    ``args``, ``tokenizer`` and ``np`` are not parameters; they are looked up
    in the surrounding scope.

    When ``args.tokenize`` is truthy, ``x`` is decoded as UTF-8, passed to
    ``tokenizer.encode``, and the result is stored in a ``np.uint16`` array
    whose buffer is returned as raw bytes (two bytes per id). Otherwise ``x``
    is returned unchanged.
    """
    # Passthrough: nothing to do when tokenization is disabled.
    if not args.tokenize:
        return x

    token_ids = tokenizer.encode(x.decode("utf8"))
    # NOTE(review): assumes every id fits in 16 bits -- confirm against the
    # tokenizer's vocabulary size.
    packed = np.array(token_ids, dtype=np.uint16)
    # Reinterpret the uint16 buffer as individual bytes before serializing.
    return packed.view(np.uint8).tobytes()