def get_vocabulary()

in dvd_codebase/data/data_handler.py [0:0]


def _iter_turn_tokens(turn):
    """Yield every vocabulary token of one dialog turn, in source order.

    Tokens are produced from, in this order: the tokenized ``question``,
    the tokenized ``answer``, and then each node of ``final_all_program``
    whose type is not ``'identity'`` (the node type itself, followed by the
    tokens of each of its ``side_inputs`` when that key is present).
    """
    yield from nltk.word_tokenize(turn['question'])
    # str() so that non-string answers can be tokenized as well.
    yield from nltk.word_tokenize(str(turn['answer']))
    for node in turn['final_all_program']:
        node_type = node['type']
        if node_type == 'identity':
            # Identity nodes contribute nothing, not even their side inputs.
            continue
        # The node type is used verbatim as a token (it is not tokenized).
        yield node_type
        if 'side_inputs' in node:
            for side_input in node['side_inputs']:
                yield from nltk.word_tokenize(side_input)


def get_vocabulary(dials, args, vocab=None):
    """Collect tokens from dialogs and build a vocabulary or find unknowns.

    Args:
        dials: Sized iterable of dialogs. Each dialog is an iterable of turn
            mappings providing the keys ``'question'``, ``'answer'`` and
            ``'final_all_program'``.
        args: Unused by this function; kept so existing callers keep working.
        vocab: Optional existing token container (anything supporting
            ``in``). When given, no new vocabulary is built.

    Returns:
        If ``vocab`` is not None: the set of collected tokens that are not
        in ``vocab``.
        Otherwise: a ``(vocab, answer_options)`` tuple, where ``vocab`` maps
        each token to an integer id (five reserved special tokens first,
        then collected tokens in first-seen order) and ``answer_options`` is
        a fixed list of answer strings.
    """
    # Local import: the file's top-level import block is not part of this
    # block, so the dependency is brought into scope here.
    from collections import Counter

    # Counter keeps first-seen insertion order, which determines token ids.
    word_freq = Counter()
    for dialog in tqdm(dials, total=len(dials)):
        for turn in dialog:
            word_freq.update(_iter_turn_tokens(turn))

    if vocab is not None:
        return {word for word in word_freq if word not in vocab}

    vocab = {'<unk>': 0, '<blank>': 1, '<sos>': 2, '<eos>': 3, '<eoo>': 4}
    for word in word_freq:
        # Never reassign a reserved special token: overwriting its id would
        # leave len(vocab) unchanged and hand the same id to the next token.
        if word not in vocab:
            vocab[word] = len(vocab)

    answer_options = [
        '0', '1', '10', '2', '3', '4', '5', '6', '7', '8', '9',
        'False', 'True',
        'blue', 'brown', 'cone', 'cube', 'cyan', 'cylinder',
        'flying', 'flying,rotating', 'flying,rotating,sliding',
        'flying,sliding',
        'gold', 'gray', 'green', 'large', 'medium', 'metal', 'no action',
        'purple', 'red', 'rotating', 'rotating,sliding', 'rubber',
        'sliding', 'small', 'sphere', 'spl', 'yellow',
    ]
    return vocab, answer_options