phi3/dataset-preparation/train_tokenizer.py:

import os
import time
from pathlib import Path

from datasets import load_dataset
from tokenizers import SentencePieceBPETokenizer, ByteLevelBPETokenizer


def save_dataset_to_txt(txt_column, txt_dir, hf_dataset_id):
    """Export every split of a Hugging Face dataset to plain-text files, one document per line."""
    dataset = load_dataset(hf_dataset_id)
    os.makedirs(txt_dir, exist_ok=True)
    for split_key in dataset.keys():
        doc_path = f"{txt_dir}/{split_key}.txt"
        with open(doc_path, 'w', encoding='utf-8') as f:
            for doc in dataset[split_key][txt_column]:
                f.write(doc + '\n')


def main():
    IS_BBPE = True

    # Collect the training corpus: pre-extracted text files from Namuwiki, Wikipedia, and KcBERT.
    path_namuwiki = [str(x) for x in Path("namuwiki-extracted-txt").glob("*.txt")]
    path_wiki = [str(x) for x in Path("wiki-txt").glob("*.txt")]
    path_kcbert = [str(x) for x in Path("kcbert2-txt").glob("*.txt")]
    path_corpus = path_namuwiki + path_wiki + path_kcbert

    vocab_size = 18000
    limit_alphabet = 1000
    min_frequency = 30

    if IS_BBPE:
        # Byte-level BPE (GPT-2 style): no unknown token is needed; input is NFKC-normalized.
        tokenizer = ByteLevelBPETokenizer(unicode_normalizer="nfkc", trim_offsets=True)
        t1 = time.time()
        tokenizer.train(
            files=path_corpus,
            vocab_size=vocab_size,
            special_tokens=["<|endoftext|>"],
            min_frequency=min_frequency,
            show_progress=True
        )
        tokenizer.save('english_tokenizer_bbpe.json')
        print("Elapsed time:", time.time() - t1)
    else:
        # SentencePiece-style BPE with a capped base alphabet and fused unknown tokens.
        tokenizer = SentencePieceBPETokenizer(fuse_unk=True)
        t1 = time.time()
        tokenizer.train(
            files=path_corpus,
            vocab_size=vocab_size,
            special_tokens=["<unk>", "<s>", "</s>"],
            min_frequency=min_frequency,
            limit_alphabet=limit_alphabet,
            show_progress=True
        )
        tokenizer.save('english_tokenizer_bpe.json')
        print("Elapsed time:", time.time() - t1)


if __name__ == '__main__':
    main()
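
After training, the serialized tokenizer can be loaded back with tokenizers.Tokenizer.from_file and used for encoding. A minimal sketch, assuming the script above has already written english_tokenizer_bbpe.json to the working directory; the sample sentence is illustrative only:

from tokenizers import Tokenizer

# Load the JSON file saved by main() (byte-level BPE variant).
tok = Tokenizer.from_file("english_tokenizer_bbpe.json")

# Encode a sample sentence and inspect the resulting subword tokens and ids.
enc = tok.encode("Testing the newly trained tokenizer.")
print(enc.tokens)
print(enc.ids)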