in dpr/data/biencoder_data.py [0:0]
def get_positions(self, input_ids: T, tenzorizer: Tensorizer):
    """Return a (batch, 2) tensor of [row, position] indexes of the special token.

    Resolves ``self.token`` into ``self.token_id`` lazily on first call via the
    tensorizer. For every sample in the batch, the position of the special
    token is located; samples that do not contain it fall back to position 0
    (i.e. the CLS token for BERT-style inputs), with a warning logged.

    :param input_ids: 2D batch of token ids, shape (batch, seq_len) — assumed 2D; TODO confirm
    :param tenzorizer: used once to resolve ``self.token`` into its id
    :return: long tensor of shape (batch, 2), one [row, col] pair per sample
    """
    # Use `is None`, not truthiness: a valid token id of 0 must not trigger
    # a re-lookup on every call.
    if self.token_id is None:
        self.token_id = tenzorizer.get_token_id(self.token)
    token_indexes = (input_ids == self.token_id).nonzero()

    # Fast path: exactly one occurrence per sample — nonzero() already yields
    # one [row, col] pair per row, in row order.
    bsz = input_ids.size(0)
    if bsz == token_indexes.size(0):
        return token_indexes

    # Slow path: align occurrences with batch rows, substituting a default
    # position for rows that have no occurrence of the special token.
    token_indexes_result = []
    found_idx_cnt = 0
    for i in range(bsz):
        if found_idx_cnt < token_indexes.size(0) and token_indexes[found_idx_cnt][0] == i:
            # This sample has the special token; keep its first occurrence.
            token_indexes_result.append(token_indexes[found_idx_cnt])
            found_idx_cnt += 1
            # Skip any extra occurrences in the same row so the following
            # rows are not spuriously reported as missing.
            while found_idx_cnt < token_indexes.size(0) and token_indexes[found_idx_cnt][0] == i:
                found_idx_cnt += 1
        else:
            logger.warning("missing special token %s", input_ids[i])
            token_indexes_result.append(
                torch.tensor([i, 0]).to(input_ids.device)
            )  # setting 0-th token, i.e. CLS for BERT as the special one
    token_indexes_result = torch.stack(token_indexes_result, dim=0)
    return token_indexes_result