In src/tokenizer.py:
def get_tokenizer(tokenizer_type, data_path, md_transformer,
                  vocab_limit=0, force_new_creation=False):
    """Either loads a pretrained tokenizer or creates a new one and saves it."""
    assert tokenizer_type in TOKENIZER_MAP, \
        f"Invalid tokenizer_type: {tokenizer_type}"
    tokenizer_class = TOKENIZER_MAP[tokenizer_type]
    # The cache path encodes the vocab limit so tokenizers built with
    # different vocabulary sizes don't overwrite each other.
    if vocab_limit:
        saved_tokenizer_path = f"cache/{tokenizer_type}_{vocab_limit}.pkl"
    else:
        saved_tokenizer_path = f"cache/{tokenizer_type}_full.pkl"
    if os.path.exists(saved_tokenizer_path) and not force_new_creation:
        logger.info(f"Loading saved tokenizer from: {saved_tokenizer_path}")
        tokenizer = tokenizer_class.load_tokenizer(saved_tokenizer_path)
    else:
        os.makedirs("cache", exist_ok=True)
        logger.info("Creating new tokenizer")
        tokenizer = tokenizer_class(data_path, md_transformer, vocab_limit)
        logger.info(f"Saving tokenizer to: {saved_tokenizer_path}")
        # Use a context manager so the file handle is closed after the dump.
        with open(saved_tokenizer_path, "wb") as f:
            pickle.dump(tokenizer, f)
    logger.info(f"Size of vocab: {tokenizer.get_vocab_size()}")
    return tokenizer
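
A minimal usage sketch follows. It assumes the module defines logger and TOKENIZER_MAP at the top level and imports os and pickle; the "word" tokenizer type, data path, and vocab limit are illustrative assumptions rather than values from this repository, and md_transformer stands for whatever transformer object the real call sites construct.

# Hypothetical call site; names and values below are assumptions for illustration.
tokenizer = get_tokenizer(
    tokenizer_type="word",          # assumed key in TOKENIZER_MAP
    data_path="data/train.txt",     # assumed corpus location
    md_transformer=md_transformer,  # constructed elsewhere by the caller
    vocab_limit=30000,
)
print(tokenizer.get_vocab_size())   # get_vocab_size() is the only accessor shown above

# Ignore any cached pickle and rebuild from scratch:
tokenizer = get_tokenizer("word", "data/train.txt", md_transformer,
                          vocab_limit=30000, force_new_creation=True)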