def load_one_batch()

in loader_vd/data_reader.py


  def load_one_batch(self, sample_ids):
    """Load data given the sample ids.
    """

    actual_batch_size = len(sample_ids)
    batch = {}

    # replace question _Find with _Refer
    find_module_token = self.assembler.name2idx_dict['_Find']
    #refer_module_token = self.assembler.name2idx_dict['_Refer']
    eos_token = self.assembler.name2idx_dict['<eos>']

    # whether to flatten captions (one copy per dialog round) or not;
    # nmn-cap models keep a single caption per dialog
    flatten = 'nmn-cap' not in self.params['model']
    num_rounds = self.num_rounds

    # captions
    if flatten:
      cap_inds = [self.imdb['data'][ii]['caption_ind'] for ii in sample_ids
                  for _ in range(num_rounds)]
    else:
      cap_inds = [self.imdb['data'][ii]['caption_ind'] for ii in sample_ids]
    cap_batch = self.imdb['cap'][cap_inds][:, :self.T_encoder]
    cap_len = self.imdb['cap_len'][cap_inds]
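    # cap_batch keeps only the first T_encoder tokens of each caption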

    # get caption programs
    cap_prog = None
    cap_gt_att = None
    if 'nmn-cap' in self.params['model']:
      cap_prog = np.full((self.T_decoder, len(cap_inds)), eos_token, np.int32)
      for spot, ii in enumerate(cap_inds):
        layout = self.imdb['cap_prog'][ii]
        cap_prog[:, spot] = \
          self.assembler.module_list2tokens(layout, self.T_decoder)

      # also get attention for supervision (one array for the whole batch)
      if self.supervise_attention:
        cap_gt_att = np.zeros((self.T_decoder, self.T_encoder,
                               len(cap_inds), 1), np.float32)
        for spot, ii in enumerate(cap_inds):
          for t_id, att in enumerate(self.imdb['cap_prog_att'][ii]):
            span = att[1] - att[0]
            # NOTE: number of attention timesteps hardwired to be <= 4
            if span > 4 or span == 0: continue
            cap_gt_att[t_id, att[0]:att[1], spot] = 1 / span

    # questions
    ques_inds = [jj for ii in sample_ids
                 for jj in self.imdb['data'][ii]['question_ind']]
    ques_batch = self.imdb['ques'][ques_inds][:, :self.T_encoder].transpose()
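    # after the transpose, ques_batch is time-major: (T_encoder, num questions)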
    ques_len = self.imdb['ques_len'][ques_inds]
    ques_ids = [jj for ii in sample_ids
                for jj in self.imdb['data'][ii]['question_id']]
    gt_index = [jj for ii in sample_ids
                for jj in self.imdb['data'][ii]['gt_ind']]

    # answers
    ans_inds = [jj for ii in sample_ids
                for jj in self.imdb['data'][ii]['answer_ind']]

    ans_batch_in = self.imdb['ans_in'][ans_inds][:, :self.T_encoder]
    ans_batch_out = self.imdb['ans_out'][ans_inds][:, :self.T_encoder]
    ans_batch = self.imdb['ans_in'][ans_inds][:, 1:self.T_encoder]
    ans_len = self.imdb['ans_len'][ans_inds]
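    # ans_batch drops the first column of ans_in (conventionally the start token)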

    # getting history
    if self.use_history:
      history = self.imdb['hist'][sample_ids]
      hist_len = self.imdb['hist_len'][sample_ids]
    else:
      history, hist_len = None, None

    # image features
    if 'prog' in self.params['model']:
      # single copy per conversation
      image_feats = np.zeros((actual_batch_size, self.feat_H,
                              self.feat_W, self.feat_D), np.float32)

    else:
      image_feats = None

    image_path = [None] * actual_batch_size

    # load fact
    if self.params['use_fact']:
      fact = self.imdb['fact'][sample_ids]
      fact_len = self.imdb['fact_len'][sample_ids]
      # flatten
      fact = np.reshape(fact, [-1, fact.shape[-1]])
      fact_len = np.reshape(fact_len, [-1])
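      # collapses the per-round axis: (batch, rounds, ...) -> (batch * rounds, ...)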
    else:
      fact, fact_len = None, None

    # programs
    if self.load_gt_layout:
      gt_layout_batch = np.full((self.T_decoder,
                                 num_rounds * actual_batch_size),
                                eos_token, np.int32)
    else:
      gt_layout_batch = None

    gt_attention = None
    if self.supervise_attention:
      gt_attention = np.zeros((self.T_decoder, self.T_encoder,
                               num_rounds * actual_batch_size, 1), np.float32)
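      # one (T_decoder x T_encoder) attention map per question in the batch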

    # go over each sample: image features, program layouts, attention
    for n in range(len(sample_ids)):
      iminfo = self.imdb['data'][sample_ids[n]]

      # image features
      if 'prog' in self.params['model']:
        # if VGG features are to be used
        if self.use_vgg:
          img_id = '/'.join(iminfo['feature_path'].split('/')[-2:])
          img_id = img_id.replace('npy', 'jpg')
          if img_id in self.img_index:
            f_ind = self.img_index[img_id]
            cur_feat = self.img_feats[f_ind]
          else:
            cur_feat = self.zero_feature

        else:
          # use preloaded image features
          feat_path = self._adjust_image_dir(iminfo['feature_path'])
          if self.preload_features: cur_feat = self.img_feats[feat_path]
          else: cur_feat = np.load(feat_path)

        # single copy per conversation
        image_feats[n] = cur_feat

      image_path[n] = iminfo['image_path']

      # programs
      if self.load_gt_layout:
        # go over all the questions
        for r_id, layout in enumerate(iminfo['gt_layout_tokens']):
          gt_layout_batch[:, num_rounds * n + r_id] = \
            self.assembler.module_list2tokens(layout, self.T_decoder)

      if self.supervise_attention:
        for r_id, att in enumerate(iminfo['gt_layout_att']):
          index = num_rounds * n + r_id
          for t_id in range(att.shape[0]):
            span = att[t_id, 1] - att[t_id, 0]
            # NOTE: number of attention timesteps hardwired to be <= 4
            if span > 4 or span == 0: continue
            gt_attention[t_id, att[t_id, 0]:att[t_id, 1], index] = 1 / span

    #------------------------------------------------------------------
    # get options for all questions in the batch
    if self.fetch_options:
      opt_inds = [jj for ii in sample_ids
                  for jj in self.imdb['data'][ii]['option_ind']]
      num_options = len(opt_inds[0])
      opt_batch_in = [None] * num_options
      opt_batch_out = [None] * num_options
      opt_len = [None] * num_options
      for ii in range(num_options):
        cur_inds = [jj[ii] for jj in opt_inds]
        opt_batch_in[ii] = self.imdb['ans_in'][cur_inds][:, :self.T_encoder]
        opt_batch_out[ii] = self.imdb['ans_out'][cur_inds][:, :self.T_encoder]
        opt_len[ii] = self.imdb['ans_len'][cur_inds]
    #------------------------------------------------------------------

    batch = {'ques': ques_batch, 'ques_len': ques_len,
             'ques_id': ques_ids, 'gt_layout': gt_layout_batch,
             'gt_att' : gt_attention,
             'cap': cap_batch, 'cap_len': cap_len, 'cap_prog': cap_prog,
             'cap_att': cap_gt_att,
             'hist': history, 'hist_len': hist_len, 'ans_in': ans_batch_in,
             'ans_out': ans_batch_out, 'ans_len': ans_len, 'ans': ans_batch,
             'fact': fact, 'fact_len': fact_len,
             'img_feat': image_feats, 'img_path': image_path}

    #------------------------------------------------------------------
    # further add options
    if self.fetch_options:
      options = {'opt_in': opt_batch_in, 'opt_out': opt_batch_out,
                 'opt_len': opt_len, 'gt_ind': gt_index}
      batch.update(options)
    #------------------------------------------------------------------
    if 'nmn-cap' not in self.params['model']:
      return batch

    # getting data for training alignment on caption
    if actual_batch_size > 1:
      info = [batch['cap'], batch['cap_len'],
            batch['cap_prog'].transpose()]
      if batch['cap_att'] is not None:
        info.append(batch['cap_att'].transpose((2, 0, 1, 3)))

      shuffled = support.shuffle(info, actual_batch_size)

      batch['sh_cap'], batch['sh_cap_len'] = shuffled[:2]
      batch['sh_cap_prog'] = shuffled[2].transpose()
      batch['align_gt'] = np.ones(num_rounds*actual_batch_size).astype('int32')

      if batch['cap_att'] is not None:
        batch['sh_cap_att'] = shuffled[3].transpose((1, 2, 0, 3))
      # the latter half of each dialog's rounds gets label 0 (misaligned)
      for ii in range(actual_batch_size):
        start = num_rounds * ii + num_rounds // 2
        end = num_rounds * (ii + 1)
        batch['align_gt'][start:end] = 0
    else:
      batch['sh_cap'] = np.tile(batch['cap'], [num_rounds, 1])
      batch['sh_cap_len'] = np.tile(batch['cap_len'], [num_rounds])
      batch['sh_cap_prog'] = np.tile(batch['cap_prog'], [1, num_rounds])
      if batch['cap_att'] is not None:
        batch['sh_cap_att'] = np.tile(batch['cap_att'], [1, 1, num_rounds, 1])
      batch['align_gt'] = np.ones(num_rounds*actual_batch_size).astype('int32')

    return batch
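
For reference, a minimal usage sketch is below. The `DataReader` class name, its constructor arguments, and the `params` keys are assumptions inferred from the attribute accesses above (`self.params`, `self.imdb`, and so on), not the repository's documented API; `support.shuffle` is likewise assumed to permute the given arrays along the batch axis.

  # Hypothetical usage sketch; the names below are assumptions, not the
  # repository's exact API.
  import numpy as np

  params = {'model': 'nmn-cap-prog',  # enables caption programs + image features
            'use_fact': True}
  reader = DataReader(params)         # assumed constructor signature

  # sample a batch of dialogs by index
  sample_ids = np.random.choice(len(reader.imdb['data']), 8, replace=False)
  batch = reader.load_one_batch(sample_ids)

  print(batch['ques'].shape)  # time-major: (T_encoder, num_rounds * 8)
  print(batch['align_gt'])    # 1 = true caption, 0 = shuffled caption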