in lama/modules/gpt_connector.py [0:0]
def __init__(self, args):
    super().__init__()

    if args.gpt_model_dir is not None:
        # load GPT model from file
        gpt_model_name = str(args.gpt_model_dir) + "/"
        dict_file = gpt_model_name
        print("loading Open AI GPT model from {}".format(gpt_model_name))
    else:
        # load GPT model from huggingface cache
        gpt_model_name = args.gpt_model_name
        dict_file = gpt_model_name

    # Load pre-trained model tokenizer (vocabulary)
    self.tokenizer = OpenAIGPTTokenizer.from_pretrained(dict_file)
    # GPT represents BPE differently than BERT: word-final pieces carry a
    # '</w>' suffix, while pieces that must be followed by another piece are
    # written as-is. In BERT the prefixes are written as-is, while the parts
    # that must follow (not be followed!) carry a '##' prefix. There is no
    # one-to-one conversion, but we can at least make pieces that may form a
    # full word look the same.
    # Note that we must be careful from here on:
    # tokenizer.convert_tokens_to_ids won't work with our converted vocabulary.
    def convert_word(word):
        if word == OPENAI_UNK:
            return word
        if word == '\n</w>':
            # Redefine the EOS symbol to improve visualization.
            return OPENAI_EOS
        return word[:-4] if word.endswith('</w>') else f'{word}##'
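
    # Illustrative mapping (example tokens, not taken from the actual vocab):
    #   convert_word('dog</w>')  -> 'dog'    (word-final piece drops '</w>')
    #   convert_word('un')       -> 'un##'   (piece that must be followed gains '##')
    #   convert_word('\n</w>')   -> OPENAI_EOS
    # self.tokenizer.decoder maps integer ids to raw BPE strings, so sorting
    # its items by id below yields the vocabulary in id order.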
    _, gpt_vocab = zip(*sorted(self.tokenizer.decoder.items()))
    self.vocab = [convert_word(word) for word in gpt_vocab]
    self._init_inverse_vocab()

    # Get the UNK symbol as it is written in the original GPT vocab.
    unk_index = self.inverse_vocab[OPENAI_UNK]
    self.unk_symbol = self.tokenizer.decoder[unk_index]
    # Load pre-trained model (weights)
    self.gpt_model = OpenAIGPTLMHeadModel.from_pretrained(gpt_model_name)
    self.gpt_model.eval()
    print(self.gpt_model.config)

    # Sanity check: the converted vocab must line up with the model's output
    # layer, and no extra special-token embeddings may be present.
    assert len(self.vocab) == self.gpt_model.config.vocab_size
    assert 0 == self.gpt_model.config.n_special

    self.eos_id = self.inverse_vocab[OPENAI_EOS]
    self.model_vocab = self.vocab
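
    # Usage sketch (illustrative only; the example pieces below are an
    # assumption, and convert_word is local to this method):
    #
    #   pieces = self.tokenizer.tokenize("the cat sat")  # e.g. ['the</w>', 'cat</w>', 'sat</w>']
    #   ids = [self.inverse_vocab.get(convert_word(p), unk_index) for p in pieces]
    #
    # i.e. downstream lookups must go through the converted vocabulary
    # (self.vocab / self.inverse_vocab) rather than
    # tokenizer.convert_tokens_to_ids.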