def get_tokenizer()

in src/tokenizer.py


import logging
import os
import pickle

logger = logging.getLogger(__name__)

# TOKENIZER_MAP maps each supported tokenizer_type string to its
# tokenizer class; it is defined elsewhere in src/tokenizer.py.


def get_tokenizer(tokenizer_type, data_path, md_transformer,
                  vocab_limit=0, force_new_creation=False):
    """Load a saved tokenizer from the cache if one exists; otherwise
    create a new one and pickle it for reuse."""
    assert tokenizer_type in TOKENIZER_MAP, \
        f"Invalid tokenizer_type: {tokenizer_type}"
    tokenizer_class = TOKENIZER_MAP[tokenizer_type]
    # The cache filename encodes the vocab limit ("full" when unlimited).
    if vocab_limit:
        saved_tokenizer_path = f"cache/{tokenizer_type}_{vocab_limit}.pkl"
    else:
        saved_tokenizer_path = f"cache/{tokenizer_type}_full.pkl"
    if os.path.exists(saved_tokenizer_path) and not force_new_creation:
        logger.info(f"Loading in saved tokenizer from: {saved_tokenizer_path}")
        tokenizer = tokenizer_class.load_tokenizer(saved_tokenizer_path)
    else:
        os.makedirs("cache", exist_ok=True)
        logger.info("Creating new tokenizer")
        tokenizer = tokenizer_class(data_path, md_transformer, vocab_limit)
        logger.info(f"Saving out tokenizer to: {saved_tokenizer_path}")
        # Write through a context manager so the file handle is closed.
        with open(saved_tokenizer_path, "wb") as f:
            pickle.dump(tokenizer, f)
    logger.info(f"Size of vocab: {tokenizer.get_vocab_size()}")
    return tokenizer
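
For context, a minimal usage sketch. The WordTokenizer class, the "word"
map entry, the data path, and the import line are illustrative
assumptions, not the module's actual contents:

import pickle

# Assumes src/ is importable as a package.
from src.tokenizer import TOKENIZER_MAP, get_tokenizer

# Hypothetical tokenizer class; the real classes live in src/tokenizer.py.
class WordTokenizer:
    def __init__(self, data_path, md_transformer, vocab_limit):
        # A real implementation would scan data_path to build its vocab.
        self.vocab = {"<unk>": 0}

    @classmethod
    def load_tokenizer(cls, path):
        with open(path, "rb") as f:
            return pickle.load(f)

    def get_vocab_size(self):
        return len(self.vocab)

TOKENIZER_MAP["word"] = WordTokenizer  # register the illustrative class

# First call builds and pickles cache/word_10000.pkl; repeat calls load
# the cached copy unless force_new_creation=True.
tokenizer = get_tokenizer("word", "data/train.txt", md_transformer=None,
                          vocab_limit=10000)
print(tokenizer.get_vocab_size())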