phi3/dataset-preparation/combine_tokenizer.py
"""
Given two tokenizers, combine them and create a new tokenizer
Usage: python combine_tokenizers.py --tokenizer1 ../config/en/roberta_8 --tokenizer2 ../config/hi/roberta_8 --save_dir ../config/en/en_hi/roberta_8
"""
# Libraries for tokenizer
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer
import argparse
import json
import os
from tqdm import tqdm
from transformers import AutoTokenizer
from timeit import default_timer as timer
import sys
def combine_tokenizers(args):
    # Load both vocab files, take the union of their tokens, and store it
    json1 = json.load(open(os.path.join(args.tokenizer1, 'vocab.json'), encoding='utf-8'))
    json2 = json.load(open(os.path.join(args.tokenizer2, 'vocab.json'), encoding='utf-8'))

    # Create a new vocabulary
    new_vocab = {}
    idx = 0
    for word in json1.keys():
        if word not in new_vocab.keys():
            new_vocab[word] = idx
            idx += 1

    # Add words from the second tokenizer
    for word in json2.keys():
        if word not in new_vocab.keys():
            new_vocab[word] = idx
            idx += 1

    # Make the output directory if necessary
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)

    # Save the combined vocab
    with open(os.path.join(args.save_dir, 'vocab.json'), 'w', encoding='utf-8') as fp:
        json.dump(new_vocab, fp, ensure_ascii=False)

    # Merge the two merges files. Duplicates are not handled here:
    # concatenate them, skipping the first line (the '#version' header) of the second file
    os.system('cat {} > {}'.format(os.path.join(args.tokenizer1, 'merges.txt'), os.path.join(args.save_dir, 'merges.txt')))
    os.system('tail -n +2 -q {} >> {}'.format(os.path.join(args.tokenizer2, 'merges.txt'), os.path.join(args.save_dir, 'merges.txt')))

    # Copy the remaining config files from the first tokenizer
    os.system('cp {} {}'.format(os.path.join(args.tokenizer1, 'special_tokens_map.json'), args.save_dir))
    os.system('cp {} {}'.format(os.path.join(args.tokenizer1, 'tokenizer_config.json'), args.save_dir))

    # Instantiate the new tokenizer
    # tokenizer = AutoTokenizer.from_pretrained(args.save_dir, use_fast=True)
def main():
    parser = argparse.ArgumentParser()

    # Dataset Arguments
    parser.add_argument("--tokenizer1", type=str, default="tokenizer-minstral-orig-txt",
                        help="Directory of the first tokenizer (vocab.json / merges.txt)")
    parser.add_argument("--tokenizer2", type=str, default="tokenizer-ko-nsmc-txt",
                        help="Directory of the second tokenizer (vocab.json / merges.txt)")
    parser.add_argument("--save_dir", type=str, default="tokenizer-new-1000",
                        help="Directory where the combined tokenizer is written")
    args = parser.parse_args()

    combine_tokenizers(args)


if __name__ == '__main__':
    main()
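
A minimal follow-up sketch (not part of the file above), assuming the script has already been run with its default --save_dir of tokenizer-new-1000: it reloads the merged vocab.json and merges.txt with ByteLevelBPETokenizer and prints the resulting vocabulary size plus a sample encoding as a quick sanity check.

import os
from tokenizers import ByteLevelBPETokenizer

# Assumed output directory; matches the script's --save_dir default.
save_dir = 'tokenizer-new-1000'

# Load the combined vocab/merges produced by combine_tokenizers().
combined = ByteLevelBPETokenizer(
    os.path.join(save_dir, 'vocab.json'),
    os.path.join(save_dir, 'merges.txt'),
)

print('combined vocab size:', combined.get_vocab_size())
print(combined.encode('Hello world').tokens)  # smoke test on a sample string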