in metrics/bert_score.py
import numpy as np
import torch


def get_bert_embedding(path, arr, model, tokenizer, all_layers=False):
    """
    Compute the corpus-level masked-LM pseudo-log-likelihood of tokenized sequences.

    Args:
    - :param: `path` (str) : name of the data split being scored (used for logging only).
    - :param: `arr` (list of list of int) : tokenized sequences as word-piece ids.
    - :param: `model` : a BERT masked-LM model, queried through `bert_encode`.
    - :param: `tokenizer` : the BERT tokenizer corresponding to `model`.
    - :param: `all_layers` (bool) : passed through to `bert_encode`.

    Returns:
        The average over sequences of the mean log-probability assigned to
        each original token when that token is masked.
    """
    print('working in', path)
    max_len = max(len(a) for a in arr)
    # Attention mask: 1 marks real tokens, 0 marks padding.
    attention_mask = np.zeros((len(arr), max_len), dtype=np.float32)
    for i, a in enumerate(arr):
        attention_mask[i, :len(a)] = 1
        if len(a) < max_len:
            arr[i] = a + [tokenizer.pad_token_id] * (max_len - len(a))
    arr = np.array(arr, dtype=np.int64)
    log_softmax = torch.nn.LogSoftmax(dim=2)
    sub_batch_size = 256
    sum_ = 0
    with torch.no_grad():
        for i in range(len(arr)):
            x = arr[i:i + 1]
            # Build max_len copies of the sequence and mask a different
            # position in each copy.
            x_repeat_seqlen = np.repeat(x, max_len, axis=0).reshape(-1, x.shape[-1])
            x_repeat_seqlen[np.arange(max_len), np.arange(max_len)] = tokenizer.mask_token_id
            x_repeat_seqlen_cuda = torch.from_numpy(x_repeat_seqlen).to('cuda')
            # Repeat the attention mask to match the repeated sequences.
            mask_repeat_seqlen = torch.from_numpy(
                np.repeat(attention_mask[i:i + 1], max_len, axis=0).reshape(-1, attention_mask.shape[-1])
            ).to('cuda')
            overall_likelihood = []
            for j in range(0, x_repeat_seqlen.shape[0], sub_batch_size):
                batch_prediction_score = bert_encode(model, x_repeat_seqlen_cuda[j:j + sub_batch_size],
                                                     attention_mask=mask_repeat_seqlen[j:j + sub_batch_size],
                                                     all_layers=all_layers)
                likelihood = log_softmax(batch_prediction_score)
                overall_likelihood.append(likelihood)
            likelihood = torch.cat(overall_likelihood, 0)
            likelihood = likelihood.type(torch.float16)
            likelihood_np = likelihood.cpu().numpy()
            sequence_len, _, vocab_size = likelihood_np.shape
            # In copy j, read the log-probability of the original token at the
            # masked position j, then average over positions.
            tmp = likelihood_np[np.arange(sequence_len), np.arange(sequence_len), x[0, :]].mean()
            sum_ += tmp
    return sum_ / len(arr)
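

# A minimal usage sketch: it assumes a HuggingFace `transformers` masked-LM
# model/tokenizer pair and that `bert_encode`, defined elsewhere in this
# module, returns prediction scores of shape (batch, seq_len, vocab_size).
# The model name and sentences below are placeholders.
if __name__ == '__main__':
    from transformers import BertForMaskedLM, BertTokenizer

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForMaskedLM.from_pretrained('bert-base-uncased').to('cuda').eval()

    sentences = ['a dog chases the ball .', 'the cat sat on the mat .']
    arr = [tokenizer.encode(s, add_special_tokens=True) for s in sentences]

    score = get_bert_embedding('demo', arr, model, tokenizer)
    print('mean masked-LM pseudo-log-likelihood:', score)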