in dvd_codebase/data/data_handler.py [0:0]
def get_vocabulary(dials, args, vocab=None):
    """Build a token vocabulary from dialogues, or list tokens missing from one.

    Tokens are gathered from every turn of every dialogue: the tokenized
    question, the tokenized ``str(answer)``, and each entry of
    ``final_all_program`` (its ``type`` taken verbatim, plus its tokenized
    ``side_inputs`` when that key is present). Program entries whose type is
    ``'identity'`` are skipped entirely, side inputs included.

    Args:
        dials: Sized iterable of dialogues; each dialogue is an iterable of
            turn dicts with ``'question'``, ``'answer'`` and
            ``'final_all_program'`` keys.
        args: Unused by this function; kept so existing callers keep working.
        vocab: Optional existing vocabulary (anything supporting ``in``).
            When given, no new vocabulary is built.

    Returns:
        If ``vocab`` is given: the set of collected tokens not found in it.
        Otherwise: a ``(vocab, answer_options)`` tuple, where ``vocab`` maps
        each token to an integer index (five reserved tokens first, then
        tokens in the order they were first seen) and ``answer_options`` is
        a fixed, hard-coded list of answer strings.
    """
    # Occurrences are tallied, but no frequency threshold is applied: every
    # token seen at least once ends up in the result. The dict's insertion
    # order is what determines the indices assigned below.
    word_freq = {}

    def _count(token):
        word_freq[token] = word_freq.get(token, 0) + 1

    for dialog in tqdm(dials, total=len(dials)):
        for turn in dialog:
            for word in nltk.word_tokenize(turn['question']):
                _count(word)

            for word in nltk.word_tokenize(str(turn['answer'])):
                _count(word)

            for node in turn['final_all_program']:
                node_type = node['type']
                if node_type == 'identity':
                    continue
                # The program type is counted as a single token (not tokenized).
                _count(node_type)
                if 'side_inputs' in node:
                    for side_input in node['side_inputs']:
                        for word in nltk.word_tokenize(side_input):
                            _count(word)

    if vocab is not None:
        # Lookup mode: report which collected tokens the given vocab lacks.
        return {word for word in word_freq if word not in vocab}

    vocab = {'<unk>': 0, '<blank>': 1, '<sos>': 2, '<eos>': 3, '<eoo>': 4}
    for word in word_freq:
        # Never reassign a reserved token: doing so would overwrite its index
        # without growing the dict, so the next token would get the same
        # index and two entries would collide.
        if word not in vocab:
            vocab[word] = len(vocab)

    answer_options = [
        '0', '1', '10', '2', '3', '4', '5', '6', '7', '8', '9',
        'False', 'True', 'blue', 'brown', 'cone', 'cube', 'cyan',
        'cylinder', 'flying', 'flying,rotating', 'flying,rotating,sliding',
        'flying,sliding', 'gold', 'gray', 'green', 'large', 'medium',
        'metal', 'no action', 'purple', 'red', 'rotating',
        'rotating,sliding', 'rubber', 'sliding', 'small', 'sphere', 'spl',
        'yellow',
    ]
    return vocab, answer_options