in vilbert/datasets/vqa_mc_dataset.py [0:0]
def tokenize(self, max_length=16):
    """Tokenizes the questions.

    This will add q_token (along with q_input_mask and q_segment_ids) to each
    entry of the dataset. -1 represents nil and should be treated as
    padding_index in the embedding.
    """
    count = 0
    for entry in self.entries:
        option = entry["option"]
        if self.split != "test":
            # If the ground-truth answer is missing from the options, replace a
            # random option with it so that a correct choice always exists.
            ans_exist = entry["answer"] in option
            if not ans_exist:
                random.shuffle(option)
                option.pop()
                option.append(entry["answer"])
            # Identify the target (index of the correct option).
            for i, ans in enumerate(option):
                if ans == entry["answer"]:
                    target = i
        tokens_all = []
        input_mask_all = []
        segment_ids_all = []
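        # Encode a BERT-style "[CLS] question [SEP] answer [SEP]" pair for every option.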
        for i, ans in enumerate(option):
            tokens_a = self._tokenizer.tokenize(entry["question"])
            tokens_b = self._tokenizer.tokenize(ans)
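            # Truncate the pair so that [CLS] and the two [SEP] tokens still fit in max_length.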
            tokens_a, tokens_b = self._truncate_seq_pair(
                tokens_a, tokens_b, max_length - 3
            )
            tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"]
            tokens = [
                self._tokenizer.vocab.get(w, self._tokenizer.vocab["[UNK]"])
                for w in tokens
            ]
            tokens = tokens[:max_length]
            segment_ids = [0] * len(tokens)
            input_mask = [1] * len(tokens)
            if len(tokens) < max_length:
                # Pad at the end of the sequence up to max_length.
                padding = [self._padding_index] * (max_length - len(tokens))
                tokens = tokens + padding
                input_mask += padding
                segment_ids += padding

            assert_eq(len(tokens), max_length)
            tokens_all.append(tokens)
            input_mask_all.append(input_mask)
            segment_ids_all.append(segment_ids)
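        # Each field stores one row per option, i.e. shape [num_options, max_length].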
entry["q_token"] = tokens_all
entry["q_input_mask"] = input_mask_all
entry["q_segment_ids"] = segment_ids_all
if self.split != "test":
entry["target"] = target
        sys.stdout.write("%d/%d\r" % (count, len(self.entries)))
        sys.stdout.flush()
        count += 1
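
# Illustrative only, not part of the original file: a minimal, self-contained sketch of
# the same pair-encoding scheme. It assumes the pytorch_pretrained_bert BertTokenizer
# (the repo's actual tokenizer setup may differ) and a padding index of 0; the names
# _encode_pair_example and _PAD_INDEX below are hypothetical.
from pytorch_pretrained_bert.tokenization import BertTokenizer

_PAD_INDEX = 0  # assumed padding index


def _encode_pair_example(tokenizer, question, answer, max_length=16):
    tokens_a = tokenizer.tokenize(question)
    tokens_b = tokenizer.tokenize(answer)
    # Reserve three positions for [CLS] and the two [SEP] tokens, trimming the
    # longer sequence first (mirroring what _truncate_seq_pair does above).
    while len(tokens_a) + len(tokens_b) > max_length - 3:
        (tokens_a if len(tokens_a) > len(tokens_b) else tokens_b).pop()
    tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"]
    ids = tokenizer.convert_tokens_to_ids(tokens)[:max_length]
    mask = [1] * len(ids)
    padding = [_PAD_INDEX] * (max_length - len(ids))
    return ids + padding, mask + padding


if __name__ == "__main__":
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    ids, mask = _encode_pair_example(tokenizer, "What color is the cat?", "black")
    print(ids, mask)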