def build_vocabulary()

in datasets.py [0:0]


    def build_vocabulary(self):
        """
        Build the token <-> index mappings from the question files.

        Steps, in order:

        1. Register the special tokens ``<PAD>``, ``<START>`` and ``<END>``
           at indices 0, 1 and 2.
        2. Read ``self.train_file`` (skipping its first line as a header) and
           store the cleaned questions returned by
           ``self.get_processed_fields`` in
           ``self.train_image_id_questions_dict``, keyed by image id. When
           ``self.build_vocab_dev`` is set, do the same for
           ``self.validation_file`` / ``self.dev_image_id_questions_dict``.
        3. Give every token not yet in ``self.vocabulary`` the next free
           index, starting at 3. Tokens are visited in this order: training
           questions, validation questions (only when
           ``self.build_vocab_dev`` is set), then keywords (only collected
           when ``self.use_keyword`` is set).

        ``self.vocabulary``, ``self.word_to_idx`` and ``self.idx_to_word`` are
        updated in place.

        NOTE(review): the cleaned questions are assumed to be an iterable of
        strings per image id, since each one is split on single spaces --
        confirm against ``get_processed_fields``.

        :return: None
        """

        def register(token, index):
            # Keep the three lookup tables in sync for a single token.
            self.vocabulary[token] = index
            self.word_to_idx[token] = index
            self.idx_to_word[index] = token

        for index, token in enumerate(('<PAD>', '<START>', '<END>')):
            register(token, index)

        # (file to read, dict that receives its questions); the validation
        # split only takes part when explicitly enabled.
        sources = [(self.train_file, self.train_image_id_questions_dict)]
        if self.build_vocab_dev:
            sources.append((self.validation_file, self.dev_image_id_questions_dict))

        keyword_list = []
        for path, questions_by_image in sources:
            with open(path, 'r') as file:
                # Skip the first line of the file: it is the header.
                file.readline()
                for line in file:
                    image_id, _, cleaned_image_questions, _, keyword = \
                        self.get_processed_fields(line, path, build_image_feature=False)
                    if self.use_keyword:
                        keyword_list.append(keyword.split())
                    questions_by_image[image_id] = cleaned_image_questions

        def question_tokens():
            # Lazily yield the token list of every stored question, training
            # split first, so index assignment order stays deterministic.
            for _, questions_by_image in sources:
                for questions in questions_by_image.values():
                    for question in questions:
                        # Split on a single space on purpose: this keeps the
                        # exact tokens (and therefore indices) of the data.
                        yield question.split(" ")

        # Start the word index from 3 after PAD, START, END tokens.
        next_index = 3
        # Keywords come last; keyword_list is empty unless use_keyword is set.
        for token_lists in (question_tokens(), keyword_list):
            for tokens in token_lists:
                for token in tokens:
                    if token not in self.vocabulary:
                        register(token, next_index)
                        next_index += 1