in phi3/dataset-preparation/train_tokenizer.py [0:0]
import time
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer, SentencePieceBPETokenizer


def main():
    # Toggle between byte-level BPE (GPT-2 style) and SentencePiece-style BPE.
    IS_BBPE = True
    # Collect the training corpora: Namu Wiki, Wikipedia, and KcBERT text dumps.
    path_namuwiki = [str(x) for x in Path("namuwiki-extracted-txt").glob("*.txt")]
    path_wiki = [str(x) for x in Path("wiki-txt").glob("*.txt")]
    path_kcbert = [str(x) for x in Path("kcbert2-txt").glob("*.txt")]
    path_corpus = path_namuwiki + path_wiki + path_kcbert

    # Shared training hyperparameters.
    vocab_size = 18000
    limit_alphabet = 1000
    min_frequency = 30
    if IS_BBPE:
        # Byte-level BPE: NFKC-normalize the input and trim whitespace from offsets.
        tokenizer = ByteLevelBPETokenizer(unicode_normalizer="nfkc", trim_offsets=True)
        t1 = time.time()
        tokenizer.train(
            files=path_corpus,
            vocab_size=vocab_size,
            special_tokens=["<|endoftext|>"],
            min_frequency=min_frequency,
            show_progress=True,
        )
        tokenizer.save('english_tokenizer_bbpe.json')
        print("Elapsed time:", time.time() - t1)
    else:
        # SentencePiece-style BPE; fuse_unk merges consecutive unknown pieces into one.
        tokenizer = SentencePieceBPETokenizer(fuse_unk=True)
        t1 = time.time()
        tokenizer.train(
            files=path_corpus,
            vocab_size=vocab_size,
            special_tokens=["<unk>", "<s>", "</s>"],
            min_frequency=min_frequency,
            limit_alphabet=limit_alphabet,
            show_progress=True,
        )
        tokenizer.save('english_tokenizer_bpe.json')
        print("Elapsed time:", time.time() - t1)