in empchat/datasets/empchat.py [0:0]
def txt2vec(dic, text, fasttext_type=None):
    """Convert ``text`` into a list of token ids using ``dic``.

    The conversion strategy depends on the kind of dictionary passed in:

    * If ``dic`` has a ``bert_tokenizer`` attribute, every substring of
      ``text`` that is a key of ``get_bert_token_mapping(fasttext_type)`` is
      replaced by its mapped value; the result is then tokenized and turned
      into ids by ``dic.bert_tokenizer``.
    * If ``dic`` is exactly a ``ParlAIDictionary``, the work is delegated to
      ``dic.txt2vec(text)``.
    * Otherwise ``text`` is split with ``tokenize`` and each token is looked
      up with ``dic.index``.

    Args:
        dic: Dictionary object used for the lookup (see the cases above).
        text: The string to convert.
        fasttext_type: Forwarded unchanged to ``get_bert_token_mapping``;
            only used in the BERT branch.

    Returns:
        Whatever the selected backend returns: the result of
        ``convert_tokens_to_ids`` or ``dic.txt2vec``, or a list of
        ``dic.index`` results in the fallback branch.
    """
    if hasattr(dic, "bert_tokenizer"):
        token_mapping = get_bert_token_mapping(fasttext_type)
        # An empty mapping would compile to the empty pattern, which matches
        # at every position and makes the replacement lookup fail with
        # KeyError(''); skip the substitution entirely in that case.
        if token_mapping:
            # Regex alternation picks the first alternative that matches, not
            # the longest one, so try longer keys first: otherwise a key that
            # is a prefix of another key would shadow it.
            ordered_keys = sorted(token_mapping, key=len, reverse=True)
            pattern = re.compile("|".join(re.escape(key) for key in ordered_keys))
            # The matched text is the raw (unescaped) key, so it can be
            # looked up in the mapping directly.
            text = pattern.sub(lambda match: token_mapping[match.group(0)], text)
        tokenizer = dic.bert_tokenizer
        return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))

    # NOTE(review): this is an exact-type check, so subclasses of
    # ParlAIDictionary take the generic fallback below -- confirm whether
    # isinstance() is what is actually wanted before changing it.
    if type(dic) is ParlAIDictionary:
        return dic.txt2vec(text)

    return [dic.index(token) for token in tokenize(text)]