# loader_vd/data_reader.py (excerpt)
# module-level imports assumed by this method:
import numpy as np
# `support` provides shuffle(), used for caption alignment below; its
# import path is project-specific and assumed here
import support
def load_one_batch(self, sample_ids):
"""Load data given the sample ids.
"""
actual_batch_size = len(sample_ids)
# module tokens from the assembler (the _Find -> _Refer replacement is
# currently disabled; find_module_token is unused below)
find_module_token = self.assembler.name2idx_dict['_Find']
# refer_module_token = self.assembler.name2idx_dict['_Refer']
eos_token = self.assembler.name2idx_dict['<eos>']
# whether to flatten captions to one copy per round; dialog models and
# caption-program models keep a single copy per conversation
flatten = ('dial' not in self.params['model'] and
           'nmn-cap' not in self.params['model'])
num_rounds = self.num_rounds
# captions
if flatten:
cap_inds = [self.imdb['data'][ii]['caption_ind'] for ii in sample_ids
for _ in range(num_rounds)]
else:
cap_inds = [self.imdb['data'][ii]['caption_ind'] for ii in sample_ids]
cap_batch = self.imdb['cap'][cap_inds][:, :self.T_encoder]
cap_len = self.imdb['cap_len'][cap_inds]
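# cap_batch: (num_captions, T_encoder) token ids, truncated to T_encoder;
# cap_len: original caption lengths before padding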
# get caption programs
cap_prog = None
cap_gt_att = None
if 'nmn-cap' in self.params['model']:
cap_prog = np.zeros((self.T_decoder, len(cap_inds)), np.int32)
cap_prog.fill(eos_token)
for spot, ii in enumerate(cap_inds):
layout = self.imdb['cap_prog'][ii]
cap_prog[:, spot] = \
self.assembler.module_list2tokens(layout, self.T_decoder)
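# layouts are converted to module-token ids; module_list2tokens
# presumably pads/truncates each program to T_decoder steps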
# also get attention for supervision
if self.supervise_attention:
cap_gt_att = np.zeros((self.T_decoder, self.T_encoder, \
actual_batch_size, 1), np.float32)
for spot, ii in enumerate(cap_inds):
for t_id, att in enumerate(self.imdb['cap_prog_att'][ii]):
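# each att holds a [start, end) token span; supervision spreads a
# uniform weight of 1/span over that span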
span = att[1] - att[0]
# NOTE: attention span hardwired to be <= 4
if span > 4 or span == 0: continue
cap_gt_att[t_id, att[0]:att[1], spot] = 1/span
# questions
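# (questions are transposed below to time-major: (T_encoder, num_questions))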
ques_inds = [jj for ii in sample_ids
for jj in self.imdb['data'][ii]['question_ind']]
ques_batch = self.imdb['ques'][ques_inds][:, :self.T_encoder].transpose()
ques_len = self.imdb['ques_len'][ques_inds]
ques_ids = [jj for ii in sample_ids
for jj in self.imdb['data'][ii]['question_id']]
gt_index = [jj for ii in sample_ids
for jj in self.imdb['data'][ii]['gt_ind']]
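# gt_index: position of the ground-truth answer among each question's
# candidate options (consumed with the options below)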
# answers
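# ans_in/ans_out are the shifted decoder input/output streams; 'ans'
# drops the first token of ans_in (presumably the start symbol)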
ans_inds = [jj for ii in sample_ids
for jj in self.imdb['data'][ii]['answer_ind']]
ans_batch_in = self.imdb['ans_in'][ans_inds][:, :self.T_encoder]
ans_batch_out = self.imdb['ans_out'][ans_inds][:, :self.T_encoder]
ans_batch = self.imdb['ans_in'][ans_inds][:, 1:self.T_encoder]
ans_len = self.imdb['ans_len'][ans_inds]
# getting history
if self.use_history:
history = self.imdb['hist'][sample_ids]
hist_len = self.imdb['hist_len'][sample_ids]
else:
history, hist_len = None, None
# image features
if 'prog' in self.params['model']:
# single copy per conversation
image_feats = np.zeros((actual_batch_size, self.feat_H,
self.feat_W, self.feat_D), np.float32)
else:
image_feats = None
image_path = [None] * actual_batch_size
# load fact
if self.params['use_fact']:
fact = self.imdb['fact'][sample_ids]
fact_len = self.imdb['fact_len'][sample_ids]
# flatten the round dimension into the batch: (batch, rounds, L) -> (batch*rounds, L)
fact = np.reshape(fact, [-1, fact.shape[-1]])
fact_len = np.reshape(fact_len, [-1])
else:
fact, fact_len = None, None
# programs
gt_layout_batch, gt_attention = None, None
if self.load_gt_layout:
  gt_layout_batch = np.zeros((self.T_decoder,
                              num_rounds * actual_batch_size), np.int32)
  gt_layout_batch.fill(eos_token)
if self.supervise_attention:
gt_attention = np.zeros((self.T_decoder, self.T_encoder,
num_rounds * actual_batch_size, 1), np.float32)
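# gt_attention: (decoder step, encoder position, question instance, 1)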
# mask for weights, for history attention (currently unused)
weight_mask = []
for n in range(len(sample_ids)):
iminfo = self.imdb['data'][sample_ids[n]]
# image features
if 'prog' in self.params['model']:
# if VGG features are to be used
if self.use_vgg:
img_id = '/'.join(iminfo['feature_path'].split('/')[-2:])
img_id = img_id.replace('npy', 'jpg')
if img_id in self.img_index:
f_ind = self.img_index[img_id]
cur_feat = self.img_feats[f_ind]
else:
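# image absent from the precomputed index; fall back to a zero feature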
cur_feat = self.zero_feature
else:
# use preloaded image features
feat_path = self._adjust_image_dir(iminfo['feature_path'])
if self.preload_features: cur_feat = self.img_feats[feat_path]
else: cur_feat = np.load(feat_path)
# single copy per conversation
image_feats[n] = cur_feat
image_path[n] = iminfo['image_path']
# programs
if self.load_gt_layout:
# go over all the questions
for r_id, layout in enumerate(iminfo['gt_layout_tokens']):
gt_layout_batch[:, num_rounds * n + r_id] = \
self.assembler.module_list2tokens(layout, self.T_decoder)
if self.supervise_attention:
num_refers = 0
for r_id, att in enumerate(iminfo['gt_layout_att']):
for t_id in range(att.shape[0]):
index = num_rounds * n + r_id
span = att[t_id, 1] - att[t_id, 0]
# NOTE: attention span hardwired to be <= 4
if span > 4 or span == 0: continue
gt_attention[t_id, att[t_id,0]:att[t_id,1], index] = 1/span
#------------------------------------------------------------------
# get options, only when needed
if self.fetch_options:
  opt_inds = [jj for ii in sample_ids
              for jj in self.imdb['data'][ii]['option_ind']]
  num_options = len(opt_inds[0])
  opt_batch_in = [None] * num_options
  opt_batch_out = [None] * num_options
  opt_len = [None] * num_options
  for ii in range(num_options):
    cur_inds = [jj[ii] for jj in opt_inds]
    opt_batch_in[ii] = self.imdb['ans_in'][cur_inds][:, :self.T_encoder]
    opt_batch_out[ii] = self.imdb['ans_out'][cur_inds][:, :self.T_encoder]
    opt_len[ii] = self.imdb['ans_len'][cur_inds]
#------------------------------------------------------------------
batch = {'ques': ques_batch, 'ques_len': ques_len,
'ques_id': ques_ids, 'gt_layout': gt_layout_batch,
'gt_att' : gt_attention,
'cap': cap_batch, 'cap_len': cap_len, 'cap_prog': cap_prog,
'cap_att': cap_gt_att,
'hist': history, 'hist_len': hist_len, 'ans_in': ans_batch_in,
'ans_out': ans_batch_out, 'ans_len': ans_len, 'ans': ans_batch,
'fact': fact, 'fact_len': fact_len,
'img_feat': image_feats, 'img_path': image_path}
#------------------------------------------------------------------
# further add options
if self.fetch_options:
options = {'opt_in': opt_batch_in, 'opt_out': opt_batch_out,\
'opt_len': opt_len, 'gt_ind': gt_index}
batch.update(options)
#------------------------------------------------------------------
if 'nmn-cap' not in self.params['model']:
return batch
# data for training the caption-alignment objective
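# support.shuffle presumably permutes caption entries across the batch to
# form mismatched (dialog, caption) pairs; align_gt marks each round as
# aligned (1) or mismatched (0)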
if actual_batch_size > 1:
info = [batch['cap'], batch['cap_len'],
batch['cap_prog'].transpose()]
if batch['cap_att'] is not None:
info.append(batch['cap_att'].transpose((2, 0, 1, 3)))
shuffled = support.shuffle(info, actual_batch_size)
batch['sh_cap'], batch['sh_cap_len'] = shuffled[:2]
batch['sh_cap_prog'] = shuffled[2].transpose()
batch['align_gt'] = np.ones(num_rounds*actual_batch_size).astype('int32')
if batch['cap_att'] is not None:
batch['sh_cap_att'] = shuffled[3].transpose((1, 2, 0, 3))
for ii in range(actual_batch_size):
start = num_rounds * ii + num_rounds // 2
end = num_rounds * (ii+1)
batch['align_gt'][start:end] = 0
else:
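# single sample: no shuffle partner exists, so replicate the true
# caption for every round and label all pairs as aligned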
batch['sh_cap'] = np.tile(batch['cap'], [num_rounds, 1])
batch['sh_cap_len'] = np.tile(batch['cap_len'], [num_rounds])
batch['sh_cap_prog'] = np.tile(batch['cap_prog'], [1, num_rounds])
if batch['cap_att'] is not None:
  batch['sh_cap_att'] = np.tile(batch['cap_att'], [1, 1, num_rounds, 1])
batch['align_gt'] = np.ones(num_rounds*actual_batch_size).astype('int32')
return batch
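
# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only; assumes a loader built elsewhere
# from params/imdb with the fields used above -- the constructor name below
# is hypothetical, and shapes assume one question per round):
#
#   loader = BatchLoaderVD(imdb, params)   # hypothetical constructor
#   batch = loader.load_one_batch([0, 1])
#   # questions come out time-major:
#   #   batch['ques'].shape == (T_encoder, num_rounds * 2)
#   # program layouts are <eos>-padded to the decoder length:
#   #   batch['gt_layout'].shape == (T_decoder, num_rounds * 2)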