def tokenize()

in vilbert/datasets/vqa_mc_dataset.py


    def tokenize(self, max_length=16):
        """Tokenizes the questions.

        This adds q_token to each entry of the dataset.
        -1 represents nil and should be treated as the padding index in the embedding.
        """

        count = 0
        for entry in self.entries:
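            # "option" holds the multiple-choice answer candidates for this question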
            option = entry["option"]

            if self.split != "test":
                # if the ground-truth answer is not among the options, replace a random option with it
                ans_exist = False
                if entry["answer"] in option:
                    ans_exist = True

                if not ans_exist:
                    random.shuffle(option)
                    option.pop()
                    option.append(entry["answer"])

                # identify the target.
                for i, ans in enumerate(option):
                    if ans == entry["answer"]:
                        target = i

            tokens_all = []
            input_mask_all = []
            segment_ids_all = []
            for i, ans in enumerate(option):

                tokens_a = self._tokenizer.tokenize(entry["question"])
                tokens_b = self._tokenizer.tokenize(ans)
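                # reserve 3 positions for [CLS] and the two [SEP] tokens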
                tokens_a, tokens_b = self._truncate_seq_pair(
                    tokens_a, tokens_b, max_length - 3
                )

                tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"]

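                # map wordpiece tokens to vocabulary ids; unknown pieces fall back to [UNK]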
                tokens = [
                    self._tokenizer.vocab.get(w, self._tokenizer.vocab["[UNK]"])
                    for w in tokens
                ]

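                # truncate to max_length; mask real tokens with 1 and use all-zero segment ids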
                tokens = tokens[:max_length]
                segment_ids = [0] * len(tokens)
                input_mask = [1] * len(tokens)

                if len(tokens) < max_length:
                    # Note here we pad at the end of the sequence
                    padding = [self._padding_index] * (max_length - len(tokens))
                    tokens = tokens + padding
                    input_mask += padding
                    segment_ids += padding

                assert_eq(len(tokens), max_length)
                tokens_all.append(tokens)
                input_mask_all.append(input_mask)
                segment_ids_all.append(segment_ids)

            entry["q_token"] = tokens_all
            entry["q_input_mask"] = input_mask_all
            entry["q_segment_ids"] = segment_ids_all
            if self.split != "test":
                entry["target"] = target

            sys.stdout.write("%d/%d\r" % (count, len(self.entries)))
            sys.stdout.flush()
            count += 1
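
For reference, a minimal sketch of how the fields written by this method could be consumed downstream, assuming PyTorch is available; the `tensorize_entries` helper below is hypothetical and not part of this file:

    import torch

    def tensorize_entries(entries):
        # Hypothetical helper: convert the per-option lists produced by
        # tokenize() into LongTensors of shape (num_options, max_length)
        # so __getitem__ can hand tensors straight to a DataLoader.
        for entry in entries:
            entry["q_token"] = torch.tensor(entry["q_token"], dtype=torch.long)
            entry["q_input_mask"] = torch.tensor(entry["q_input_mask"], dtype=torch.long)
            entry["q_segment_ids"] = torch.tensor(entry["q_segment_ids"], dtype=torch.long)

    # usage (assuming a tokenized dataset instance): tensorize_entries(dataset.entries)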