def build_imdb()

in util/build_imdb.py


# Imports required by the function body. The stdlib/NumPy/NLTK imports
# are standard; the paths for the project-local helpers (text_processing,
# clean, stop_words) and the progressbar alias are assumptions that may
# differ in the actual repository.
import copy
import json

import numpy as np
from nltk.tokenize import word_tokenize
from tqdm import tqdm as progressbar  # assumption: the bar is tqdm

from util import clean  # assumption: project-local text cleaning helpers
from util import text_processing  # assumption: project-local vocab utils
from util.stop_words import stop_words  # assumption: project-local list


def build_imdb(FLAGS):
  """Method to construct and save the image-database for the dataset
  """

  print('Building imdb for visdial split: %s' % FLAGS.visdial_file)
  # these .npy files store pickled python objects; NumPy >= 1.16.3
  # requires allow_pickle=True to load them
  qid2layout_dict = np.load(FLAGS.ques_prog_file, allow_pickle=True)[()]

  ques_att_file = FLAGS.ques_prog_file.replace('.layout', '.attention')
  ques_prog_att = np.load(ques_att_file, allow_pickle=True)[()]

  cap_progs = np.load(FLAGS.cap_prog_file, allow_pickle=True)[()]
  cap_att_file = FLAGS.cap_prog_file.replace('.layout', '.attention')
  cap_prog_att = np.load(cap_att_file, allow_pickle=True)[()]
  vocab = text_processing.VocabDict(FLAGS.vocab_file)

  # load the data
  with open(FLAGS.visdial_file, 'r') as file_id:
    vd_data = json.load(file_id)

  # load the reference data
  with open(FLAGS.coreference_file, 'r') as file_id:
    references = json.load(file_id)
    references = references['data']['dialogs']

  # process and tokenize all captions, questions, and answers
  def tokenizer(text, suffix):
    # clean non-ascii characters, tokenize, and map words to vocab indices
    return [vocab.word2idx(ii) for ii in
            word_tokenize(clean.clean_non_ascii(text + suffix))]
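  # e.g. tokenizer('is it sunny', '?') feeds 'is it sunny?' through
  # word_tokenize and returns vocab indices for ['is', 'it', 'sunny', '?']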

  print('Tokenizing captions')
  caption_list = [ii['caption'] for ii in vd_data['data']['dialogs']]
  clean_cap = [tokenizer(cap, '') for cap in progressbar(caption_list)]
  max_cap_len = max([len(ii) for ii in clean_cap])

  cap_tokens = np.zeros((len(clean_cap), max_cap_len)).astype('int32')
  cap_tokens.fill(vocab.word2idx('<pad>'))
  cap_lens = np.zeros(len(clean_cap)).astype('int32')
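  # rows stay <pad>-filled beyond each caption's true length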

  for c_id, tokens in progressbar(enumerate(clean_cap)):
    cap_lens[c_id] = len(tokens)
    cap_tokens[c_id, :cap_lens[c_id]] = np.array(tokens)

  print('Tokenizing questions')
  question_list = vd_data['data']['questions']
  clean_ques = [tokenizer(ques, '?') for ques in progressbar(question_list)]
  max_ques_len = max([len(ii) for ii in clean_ques])

  ques_tokens = np.zeros((len(clean_ques), max_ques_len)).astype('int32')
  ques_tokens.fill(vocab.word2idx('<pad>'))
  ques_lens = np.zeros(len(clean_ques)).astype('int32')

  for q_id, tokens in progressbar(enumerate(clean_ques)):
    ques_lens[q_id] = len(tokens)
    ques_tokens[q_id, :ques_lens[q_id]] = np.array(tokens)

  print('Tokenizing answers')
  answer_list = vd_data['data']['answers']
  clean_ans = [tokenizer(ans, '') for ans in progressbar(answer_list)]
  max_ans_len = max([len(ii) for ii in clean_ans])

  ans_tokens = np.zeros((len(clean_ans), max_ans_len)).astype('int32')
  ans_tokens.fill(vocab.word2idx('<pad>'))
  ans_lens = np.zeros(len(clean_ans)).astype('int32')

  ans_in = np.zeros((len(clean_ans), max_ans_len + 1)).astype('int32')
  ans_out = np.zeros((len(clean_ans), max_ans_len + 1)).astype('int32')
  ans_in.fill(vocab.word2idx('<pad>'))
  ans_out.fill(vocab.word2idx('<pad>'))
  start_token_id = vocab.word2idx('<start>')
  end_token_id = vocab.word2idx('<end>')
  ans_in[:, 0] = start_token_id
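
  # e.g. for answer tokens [a, b, c] (teacher forcing pairs):
  #   ans_in  = [<start>, a, b, c, <pad>, ...]
  #   ans_out = [a, b, c, <end>, <pad>, ...]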

  for a_id, tokens in progressbar(enumerate(clean_ans)):
    ans_lens[a_id] = len(tokens)
    answer = np.array(tokens)
    ans_tokens[a_id, :ans_lens[a_id]] = answer
    ans_in[a_id, 1:ans_lens[a_id]+1] = answer
    ans_out[a_id, :ans_lens[a_id]] = answer
    ans_out[a_id, ans_lens[a_id]] = end_token_id

  # lengths grow by one: ans_in gains <start> and ans_out gains <end>
  ans_lens += 1

  imdb = {}
  # number of entries in the database
  num_dialogs = len(vd_data['data']['dialogs'])
  imdb['data'] = [None] * num_dialogs
  imdb['ans'], imdb['ans_len'] = ans_tokens, ans_lens
  imdb['ans_in'], imdb['ans_out'] = ans_in, ans_out
  imdb['ques'], imdb['ques_len'] = ques_tokens, ques_lens
  imdb['cap'], imdb['cap_len'] = cap_tokens, cap_lens
  imdb['cap_prog'], imdb['cap_prog_att'] = cap_progs, np.array(cap_prog_att)
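
  # tokens are stored once in the shared tables above; each per-dialog
  # bundle below references them by index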

  for dialog_id, datum in progressbar(enumerate(vd_data['data']['dialogs'])):
    img_id = datum['image_id']
    img_path = FLAGS.image_path_format % img_id
    feat_path = FLAGS.feature_path % img_id

    # compact bundle with all the information
    bundle = {'image_name': img_id, 'image_path': img_path,
              'feature_path': feat_path, 'caption_ind': dialog_id,
              'question_id': [], 'question_ind': [], 'answer_ind': [],
              'option_ind': [], 'gt_ind': [], 'gt_layout_tokens': [],
              'gt_layout_att': []}

    # reference datum
    refer_datum = references[dialog_id]
    assert refer_datum['image_id'] == img_id
    # for each cluster, record its earliest mention: 'c' for the caption,
    # 'q<r>' / 'a<r>' for the question / answer of round r
    clusters = {}
    caption_clusters = (refer_datum['caption_reference_clusters'] +
                        refer_datum['caption_coref_clusters'])
    for ii in caption_clusters:
      c_id = ii['cluster_id']
      clusters[c_id] = clusters.get(c_id, 'c')

    # each round
    for r_id in range(10):  # each VisDial dialog has exactly 10 rounds
      referrer = refer_datum['dialog'][r_id]
      for ii in referrer['question_reference_clusters']:
        c_id = ii['cluster_id']
        clusters[c_id] = clusters.get(c_id, 'q%d' % r_id)

      for ii in referrer['answer_reference_clusters']:
        c_id = ii['cluster_id']
        # the 'a' prefix distinguishes answer mentions from question mentions
        clusters[c_id] = clusters.get(c_id, 'a%d' % r_id)

    # bundle all rounds of this dialog together
    num_refers = 0
    for r_id, round_data in enumerate(datum['dialog']):
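      # 10 rounds per dialog, so (image, round) -> img_id * 10 + r_id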
      q_id = img_id * 10 + r_id

      bundle['question_id'].append(q_id)
      bundle['question_ind'].append(round_data['question'])
      bundle['answer_ind'].append(round_data['answer'])
      bundle['option_ind'].append(round_data['answer_options'])
      bundle['gt_ind'].append(round_data['gt_index'])

      # gt attention for parsed layout
      attention = np.array(ques_prog_att[round_data['question']])

      # if the question has a referring phrase, rewrite the matching
      # _Find module in the layout as _Refer
      layout = copy.deepcopy(list(qid2layout_dict[q_id]))
      referrer = refer_datum['dialog'][r_id]['question_referrer_clusters']
      if referrer:
        refer = referrer[0]
        # pick the _Find module with maximum attention overlap
        max_overlap = (0, 0)
        for pos, token in enumerate(layout):
          if token == '_Find':
            # size of the intersection between the module's attention
            # span and the referring phrase, clamped at zero
            start = max(attention[pos][0], refer['start_word'])
            end = min(attention[pos][1], refer['end_word'])
            overlap = max(0, end - start)
            if max_overlap[1] < overlap:
              max_overlap = (pos, overlap)

        # rewrite it as _Refer and point its attention at the phrase
        pos, _ = max_overlap
        layout[pos] = '_Refer'
        attention[pos] = [refer['start_word'], refer['end_word']]

        # count rounds whose layout was rewritten with _Refer
        num_refers += 1
      bundle['gt_layout_tokens'].append(layout)

      # zero out attention spans that cover a single stop word
      # (e.g. 'the', 'a', 'of', 'you')
      cur_ques_tokens = imdb['ques'][round_data['question']]
      ques_words = [vocab.idx2word(ii) for ii in cur_ques_tokens]
      for index, pos in enumerate(attention):
        if (pos[1] - pos[0]) == 1 and ques_words[pos[0]] in stop_words:
          attention[index] = [0, 0]
      bundle['gt_layout_att'].append(attention)

    # record
    imdb['data'][dialog_id] = bundle
  return imdb
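

# ----------------------------------------------------------------------
# Usage sketch (not part of the original function): one way to drive
# build_imdb and persist its output. The argparse setup and every file
# name below are hypothetical stand-ins for the real script's FLAGS,
# which must expose the same field names.
import argparse

if __name__ == '__main__':
  parser = argparse.ArgumentParser()
  parser.add_argument('--visdial_file', default='data/visdial_train.json')
  parser.add_argument('--coreference_file',
                      default='data/visdial_train_corefs.json')
  parser.add_argument('--ques_prog_file', default='data/train.layout.npy')
  parser.add_argument('--cap_prog_file', default='data/train_cap.layout.npy')
  parser.add_argument('--vocab_file', default='data/vocabulary.txt')
  parser.add_argument('--image_path_format',
                      default='images/COCO_train2014_%012d.jpg')
  parser.add_argument('--feature_path', default='features/%d.npy')
  parser.add_argument('--save_path', default='data/imdb_train.npy')
  FLAGS = parser.parse_args()

  imdb = build_imdb(FLAGS)
  # the imdb mixes numpy arrays with python lists, so save it as a
  # pickled object (np.save pickles non-array objects by default)
  np.save(FLAGS.save_path, np.array(imdb))
  print('Saved imdb to %s' % FLAGS.save_path)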