def build_bert_dictionary()

in empchat/datasets/loader.py


def build_bert_dictionary(opt):
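    # Relies on the module-level BERT_ID constant, get_bert_token_mapping(),
    # and the logging module, which are defined/imported elsewhere in loader.py.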
    try:
        from pytorch_pretrained_bert import BertTokenizer
    except ImportError:
        raise Exception(
            "BERT rankers need pytorch-pretrained-BERT installed. "
            "\npip install pytorch-pretrained-bert"
        )
    if BERT_ID != "bert-base-cased" and opt.dataset_name == "reddit":
        raise NotImplementedError(
            "Currently, only bert-base-cased can be used with reddit!"
        )
    if BERT_ID != "bert-base-cased" and opt.fasttext_type is not None:
        raise NotImplementedError(
            'Currently, "bert-base-cased" is the only BERT model for which we '
            "have defined lists of fastText labels without BERT tokens!"
        )
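    # Infer casing from the model ID, e.g. "bert-base-cased" -> cased.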
    is_cased = BERT_ID.split("-")[2] == "cased"
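    # Tokens listed in never_split are kept intact by the tokenizer (no
    # lower-casing or punctuation splitting).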
    tokenizer = BertTokenizer.from_pretrained(
        BERT_ID,
        do_lower_case=not is_cased,
        never_split=(
            ["[CLS]", "[MASK]"]
            + list(get_bert_token_mapping(opt.fasttext_type).values())
        ),
    )
    dict_ = dict()

    # Create dictionary from HuggingFace version. Note that the special tokens
    # have been replicated from build_dictionary() above, and I have used the
    # BERT token equivalence mapping suggested by ParlAI's
    # parlai/agents/bert_ranker/bert_dictionary.py, except for START_OF_COMMENT,
    # which I am setting to a token that hasn't been used before.
    if opt.dict_max_words is not None:
        logging.warning(
            "--dict-max-words will be ignored because we are using the BERT "
            "tokenizer."
        )
    dict_["iwords"] = list(tokenizer.vocab.keys())
    for orig_token, bert_token in get_bert_token_mapping(opt.fasttext_type).items():
        dict_["iwords"][tokenizer.convert_tokens_to_ids([bert_token])[0]] = orig_token
    dict_["words"] = {w: i for i, w in enumerate(dict_["iwords"])}
    dict_["wordcounts"] = None  # Not used here
    dict_["bert_tokenizer"] = tokenizer

    return dict_
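
A minimal, hypothetical usage sketch follows. The option attributes (dataset_name, fasttext_type, dict_max_words) are the ones the function reads above; the "empchat" dataset name and the argparse.Namespace stand-in are illustrative only, and the module-level BERT_ID and get_bert_token_mapping are assumed to be in scope as they are in loader.py.

from argparse import Namespace

# Stand-in options object; only the attributes read by the function are set.
opt = Namespace(dataset_name="empchat", fasttext_type=None, dict_max_words=None)
dict_ = build_bert_dictionary(opt)

# Tokenize a sentence and look up BERT vocabulary indices.
tokens = dict_["bert_tokenizer"].tokenize("How are you feeling today?")
token_ids = dict_["bert_tokenizer"].convert_tokens_to_ids(tokens)

# The word <-> index maps cover the same vocabulary, with the reserved
# BERT tokens renamed to the dataset's own special tokens.
assert dict_["iwords"][token_ids[0]] == tokens[0]
assert dict_["words"][tokens[0]] == token_ids[0]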