in datasets.py [0:0]
def build_vocabulary(self):
    """
    Build the token/index vocabulary from the question files.

    Indices 0, 1 and 2 are reserved for ``<PAD>``, ``<START>`` and ``<END>``.
    Every data line of ``self.train_file`` (and of ``self.validation_file``
    when ``self.build_vocab_dev`` is truthy) is passed to
    ``self.get_processed_fields``; the questions it returns are stored per
    image id in ``self.train_image_id_questions_dict`` /
    ``self.dev_image_id_questions_dict``. Unseen tokens then receive
    consecutive indices starting at 3, in this order: train questions,
    dev questions (if enabled), keywords (if ``self.use_keyword`` is truthy).

    Updates ``self.vocabulary``, ``self.word_to_idx`` and ``self.idx_to_word``
    in place.

    :return: None
    """
    special_tokens = ('<PAD>', '<START>', '<END>')
    for index, token in enumerate(special_tokens):
        self.vocabulary[token] = index
        self.word_to_idx[token] = index
        self.idx_to_word[index] = token

    # One whitespace-split keyword list per data line, in reading order.
    keyword_list = []

    def load_questions(path, questions_by_image):
        # Store the questions of every data line of `path` under its image id.
        with open(path, 'r') as file:
            # Skip the first line of the file, which is the header.
            file.readline()
            for line in file:
                image_id, _, cleaned_image_questions, _, keyword = \
                    self.get_processed_fields(line, path, build_image_feature=False)
                if self.use_keyword:
                    keyword_list.append(keyword.split())
                # A repeated image id overwrites the questions stored earlier.
                questions_by_image[image_id] = cleaned_image_questions

    def question_tokens(questions_by_image):
        # Yield every token of every stored question, in dict order.
        for questions in questions_by_image.values():
            for question in questions:
                # Split on single spaces only, so consecutive spaces yield
                # empty-string tokens.
                for token in question.split(" "):
                    yield token

    load_questions(self.train_file, self.train_image_id_questions_dict)
    if self.build_vocab_dev:
        load_questions(self.validation_file, self.dev_image_id_questions_dict)

    # The dicts are filled completely before indexing so that overwritten
    # entries never contribute tokens; the order of the sources below fixes
    # the order in which indices are handed out.
    token_sources = [question_tokens(self.train_image_id_questions_dict)]
    if self.build_vocab_dev:
        token_sources.append(question_tokens(self.dev_image_id_questions_dict))
    if self.use_keyword:
        token_sources.append(
            token for keywords in keyword_list for token in keywords)

    # Start the word index from 3, after the PAD, START and END tokens.
    next_index = len(special_tokens)
    for source in token_sources:
        for token in source:
            if token not in self.vocabulary:
                self.vocabulary[token] = next_index
                self.word_to_idx[token] = next_index
                self.idx_to_word[next_index] = token
                next_index += 1