def get_bert_embedding()

in metrics/bert_score.py [0:0]


def get_bert_embedding(path, arr, model, tokenizer, all_layers=False):
    """
    Compute BERT embedding in batches.
    Args:
        - :param: `all_sens` (list of str) : sentences to encode.
        - :param: `model` : a BERT model from `pytorch_pretrained_bert`.
        - :param: `tokenizer` : a BERT tokenizer corresponds to `model`.
        - :param: `idf_dict` (dict) : mapping a word piece index to its
                               inverse document frequency
        - :param: `device` (str): device to use, e.g. 'cpu' or 'cuda'
    """
    print('working in', path)
    max_len = max(len(a) for a in arr)
    # Attention mask: 1 for real tokens, 0 for padding positions.
    attention_mask = np.zeros((len(arr), max_len), dtype=np.float32)
    for i, a in enumerate(arr):
        attention_mask[i, :len(a)] = 1
        if len(a) < max_len:
            arr[i] = a + [tokenizer.pad_token_id] * (max_len - len(a))
    arr = np.array(arr, dtype=np.int64)
    log_softmax = torch.nn.LogSoftmax(dim=2)
    sub_batch_size = 256
    sum_ = 0
    with torch.no_grad():
        for i in range(len(arr)):
            x = arr[i:i + 1]
            seq_len = x.shape[-1]
            # Repeat the sentence seq_len times and mask a different position in
            # each copy, so row k has token k replaced by [MASK].
            x_repeat_seqlen = np.repeat(x, seq_len, axis=0).reshape(-1, seq_len)
            x_repeat_seqlen[np.arange(seq_len), np.arange(seq_len)] = tokenizer.mask_token_id
            x_repeat_seqlen_cuda = torch.from_numpy(x_repeat_seqlen).to('cuda')

            # Repeat the corresponding attention mask to match the expanded batch.
            mask_repeat_seqlen = torch.from_numpy(
                np.repeat(attention_mask[i:i + 1], seq_len, axis=0).reshape(-1, seq_len)
            ).to('cuda')

            overall_likelihood = []
            for j in range(0, x_repeat_seqlen.shape[0], sub_batch_size):
                batch_prediction_score = bert_encode(model, x_repeat_seqlen_cuda[j:j + sub_batch_size],
                                                     attention_mask=mask_repeat_seqlen[j:j + sub_batch_size],
                                                     all_layers=all_layers)
                likelihood = log_softmax(batch_prediction_score)
                overall_likelihood.append(likelihood)

            likelihood = torch.cat(overall_likelihood, 0)
            likelihood = likelihood.type(torch.float16)
            likelihood_np = likelihood.cpu().numpy()
            sequence_len, _, vocab_size = likelihood_np.shape
            # Log-probability of the original token at each masked (diagonal)
            # position, averaged over the sentence.
            tmp = likelihood_np[np.arange(sequence_len), np.arange(sequence_len), x[0, :]].mean()
            sum_ += tmp

    # Average per-sentence score over the whole corpus.
    return sum_ / len(arr)
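

Usage (illustrative): a minimal calling sketch, not part of the repository. It assumes the modern `transformers` package in place of `pytorch_pretrained_bert`, a hypothetical `bert_encode` wrapper that simply returns the masked-LM prediction scores of shape (batch, seq_len, vocab_size), and a CUDA device, since the function moves tensors to 'cuda' unconditionally.

import numpy as np
import torch
from transformers import BertForMaskedLM, BertTokenizer

def bert_encode(model, x, attention_mask, all_layers=False):
    # Hypothetical stand-in for the repository's `bert_encode` helper:
    # run the masked LM and return its prediction scores.
    return model(input_ids=x, attention_mask=attention_mask).logits

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased').to('cuda').eval()

sentences = ['the cat sat on the mat', 'a quick brown fox']
arr = [tokenizer.encode(s) for s in sentences]  # lists of word-piece ids

score = get_bert_embedding('demo-corpus', arr, model, tokenizer)
print('average pseudo log-likelihood:', score)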