in empchat/datasets/loader.py
def build_bert_dictionary(opt):
    try:
        from pytorch_pretrained_bert import BertTokenizer
    except ImportError:
        raise Exception(
            "BERT rankers need pytorch-pretrained-BERT installed. "
            "\npip install pytorch-pretrained-bert"
        )
    if BERT_ID != "bert-base-cased" and opt.dataset_name == "reddit":
        raise NotImplementedError(
            "Currently, only bert-base-cased can be used with reddit!"
        )
    if BERT_ID != "bert-base-cased" and opt.fasttext_type is not None:
        raise NotImplementedError(
            'Currently, "bert-base-cased" is the only BERT model for which we '
            "have defined lists of fastText labels without BERT tokens!"
        )
    # e.g. "bert-base-cased" -> "cased": only lowercase input for uncased models
    is_cased = BERT_ID.split("-")[2] == "cased"
    tokenizer = BertTokenizer.from_pretrained(
        BERT_ID,
        do_lower_case=not is_cased,
        never_split=(
            ["[CLS]", "[MASK]"]
            + list(get_bert_token_mapping(opt.fasttext_type).values())
        ),
    )
    dict_ = dict()
    # Create dictionary from the HuggingFace version. Note that the special
    # tokens have been replicated from build_dictionary() above, and I have
    # used the BERT token equivalence mapping suggested by ParlAI's
    # parlai/agents/bert_ranker/bert_dictionary.py, except for
    # START_OF_COMMENT, which I am setting to a token that hasn't been used
    # before.
    if opt.dict_max_words is not None:
        logging.warning(
            "--dict-max-words will be ignored because we are using the BERT "
            "tokenizer."
        )
    # Build the index-to-word list from the BERT vocab, then overwrite the
    # mapped BERT tokens with the original special tokens so downstream code
    # sees the familiar names.
    dict_["iwords"] = list(tokenizer.vocab.keys())
    for orig_token, bert_token in get_bert_token_mapping(opt.fasttext_type).items():
        dict_["iwords"][
            tokenizer.convert_tokens_to_ids([bert_token])[0]
        ] = orig_token
    dict_["words"] = {w: i for i, w in enumerate(dict_["iwords"])}
    dict_["wordcounts"] = None  # Not used here
    dict_["bert_tokenizer"] = tokenizer
    return dict_
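
For reference, a minimal usage sketch of the returned dictionary, assuming the function is imported from empchat.datasets.loader and that the module-level BERT_ID and get_bert_token_mapping it relies on are defined as elsewhere in loader.py. The Namespace wrapper and the option values shown (dataset_name, fasttext_type, dict_max_words) are illustrative assumptions, not the repo's actual argument parsing.

from argparse import Namespace

from empchat.datasets.loader import build_bert_dictionary

# Hypothetical options: only the three attributes read by the function above
opt = Namespace(dataset_name="empchat", fasttext_type=None, dict_max_words=None)
dict_ = build_bert_dictionary(opt)

# The stored tokenizer produces WordPiece tokens whose indices line up with
# dict_["words"], since "iwords" was built directly from the BERT vocab
tokens = dict_["bert_tokenizer"].tokenize("How are you feeling today?")
ids = [dict_["words"][t] for t in tokens]
assert ids == dict_["bert_tokenizer"].convert_tokens_to_ids(tokens)

Note that the equality above holds for ordinary WordPiece tokens; the handful of BERT tokens remapped via get_bert_token_mapping are reachable in dict_["words"] only under their original special-token names.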