in utils/corpus_utils.py [0:0]
def get_batches(problem, data, length, target, batch_size, input_types, pad_ids=None, permutate=False, transform_tensor=True, predict_mode='batch'):
    """Cut a corpus into padded mini-batches.

    CAUTION: ``data``, ``length`` and (when given) ``target`` are modified
    IN PLACE — their leaf lists are converted to numpy arrays, and shuffled
    when ``permutate`` is True.

    Args:
        problem: the problem object; provides ``problem_type``, ``input_dicts``
            (per-branch vocabularies with an ``id()`` lookup), ``output_dict``
            and ``target_with_pad``.
        data: nested dict of input columns, e.g.
            {
                'string1': {'word1': [...], 'postag_feature1': [...]},
                'string2': {'word1': [...], 'postag_feature1': [...]},
            }
        length: nested dict of sequence lengths per input cluster, e.g.
            {'string1': {'sentence_length': [...], 'word_length': [...]}, ...}.
            Each cluster is expected to contain a 'sentence_length' entry
            (see the reduction near the end of the batch loop); when
            ``target`` is given, ``length`` is also expected to hold a
            'target' entry — TODO(review) confirm against callers.
        target: dict of target columns ({name: [...]}) or None for prediction.
        batch_size: number of samples per batch.
        input_types: maps a branch name to its columns and embedding dim, e.g.
            {
                "word": {"cols": ["word1", "word2"], "dim": 300},
                "postag": {"cols": ["postag_feature1", "postag_feature2"], "dim": 20},
            }
        pad_ids: optional precomputed {branch/'target': pad id}; derived from
            ``problem`` when None.
        permutate: if True, shuffle the corpus (requires ``target``, since the
            permutation size is taken from it — NOTE(review): would raise if
            ``permutate`` is True while ``target`` is None).
        transform_tensor: if True the returned data are CPU torch tensors
            (except nested word lengths), otherwise numpy arrays.
        predict_mode: 'batch' enables progress logging.

    Returns:
        data_batches: list, one element per batch:
            [
                {
                    "string1": {'word': ndarray/Tensor [batch_size, seq_len],
                                'postag': ndarray/Tensor [batch_size, seq_len], ...},
                    "string2": {...},
                }, ...
            ]
        length_batches: list, one element per batch; each element maps an input
            cluster to its sentence lengths ([batch size]) and, when ``target``
            is given, holds a 'target' entry with per-target lengths.
        target_batches: list of {target name: ndarray/Tensor [batch_size, ...]},
            or None when ``target`` is None.
    """
    if predict_mode == 'batch':
        logging.info("Start making batches")
    if permutate is True:
        # CAUTION! data and length would be revised (shuffled in place).
        # data = copy.deepcopy(data)
        # length = copy.deepcopy(length)
        # if target is not None:
        #     target = copy.deepcopy(target)
        # shuffle the data; permutation size comes from the first target column
        permutation = np.random.permutation(len(list(target.values())[0]))
        for input_cluster in data:
            # NOTE: 'input' shadows the builtin of the same name
            for input in data[input_cluster]:
                data[input_cluster][input] = np.array(data[input_cluster][input])[permutation]
            for single_type in length[input_cluster]:
                length[input_cluster][single_type] = np.array(length[input_cluster][single_type])[permutation]
        if target is not None:
            for single_target in target:
                # assumes length['target'] exists whenever target is given — TODO(review) confirm
                length['target'][single_target] = np.array(length['target'][single_target])[permutation]
                target[single_target] = np.array(target[single_target])[permutation]
    else:
        # no shuffle: still normalize every leaf list to a numpy array in place
        for input_cluster in data:
            for input in data[input_cluster]:
                data[input_cluster][input] = np.array(data[input_cluster][input])
            for single_type in length[input_cluster]:
                length[input_cluster][single_type] = np.array(length[input_cluster][single_type])
        if target is not None:
            for single_target in target:
                length['target'][single_target] = np.array(length['target'][single_target])
                target[single_target] = np.array(target[single_target])
    # set up padding symbols for inputs and target
    if pad_ids is None:
        pad_ids = dict()
        for branch in input_types:
            pad_ids[branch] = problem.input_dicts[branch].id('<pad>')
        if ProblemTypes[problem.problem_type] == ProblemTypes.sequence_tagging:
            #pad_ids['target'] = problem.output_dict.id('O')
            if problem.target_with_pad:
                pad_ids['target'] = problem.output_dict.id('<pad>')
            else:
                pad_ids['target'] = 0  # CAUTION: 0 may collide with a real label id
        elif ProblemTypes[problem.problem_type] == ProblemTypes.classification:
            if problem.target_with_pad:
                pad_ids['target'] = problem.output_dict.id('<pad>')  # CAUTION
            else:
                pad_ids['target'] = 0  # CAUTION
        elif ProblemTypes[problem.problem_type] == ProblemTypes.regression:
            pad_ids['target'] = None
        elif ProblemTypes[problem.problem_type] == ProblemTypes.mrc:
            # mrc targets are padded/converted per-type below; no single pad id
            pass
    # map each column name back to its branch, e.g. type2cluster['word1'] = 'word'
    type2cluster = dict()
    for input_type in input_types:
        for col_name in input_types[input_type]['cols']:
            type2cluster[col_name] = input_type
    # get the corpus size from the first column of the first input cluster
    for input_cluster in data:
        for input_type in data[input_cluster]:
            corpus_size = len(data[input_cluster][input_type])
            break
        break
    data_batches = []
    if target is not None:
        target_batches = []
    else:
        target_batches = None
    length_batches = []
    for stidx in range(0, corpus_size, batch_size):
        data_batch = dict()
        length_batch = dict()
        for input_cluster in data:
            data_batch[input_cluster] = dict()
            length_batch[input_cluster] = dict()
            max_sen_len_cur_batch = None
            max_word_len_cur_batch = None
            if transform_tensor is True:
                # For nn.DataParallel, the length must be a tensor as well,
                # otherwise the length would not split for multiple GPUs
                #length_batch[input_cluster] = Variable(torch.LongTensor(length[input_cluster][stidx: stidx + batch_size]))
                for single_input_cluster in length[input_cluster]:
                    if not isinstance(length[input_cluster][single_input_cluster][0], list):
                        # flat per-sample lengths (e.g. sentence lengths)
                        length_batch[input_cluster][single_input_cluster] = \
                            torch.LongTensor(np.array(length[input_cluster][single_input_cluster][stidx: stidx + batch_size]))
                    else:
                        # nested per-word lengths: keep a list of tensors (ragged)
                        length_batch[input_cluster][single_input_cluster] = []
                        for single_iterm in length[input_cluster][single_input_cluster][stidx: stidx + batch_size]:
                            length_batch[input_cluster][single_input_cluster].append(torch.LongTensor(np.array(single_iterm)))
            else:
                for single_input_cluster in length[input_cluster]:
                    length_batch[input_cluster][single_input_cluster] = \
                        np.array(length[input_cluster][single_input_cluster][stidx: stidx + batch_size])
            # max_len_cur_batch = np.sort(length[input_cluster][stidx: stidx + batch_size])[-1]
            # per-batch maxima used as the padding targets below
            for single_input_cluster in length[input_cluster]:
                if 'sentence' in single_input_cluster:
                    max_sen_len_cur_batch = np.sort(length[input_cluster][single_input_cluster][stidx: stidx + batch_size])[-1]
                elif 'word' in single_input_cluster:
                    # word lengths are nested per sentence; flatten before taking the max
                    max_word_len_cur_batch = np.sort([y for x in length[input_cluster][single_input_cluster][stidx: stidx + batch_size] for y in x])[-1]
            #logging.info("stidx: %d, max_len: %d" % (stidx, max_len_cur_batch))
            for input_type in data[input_cluster]:
                if input_type in type2cluster:
                    batch_with_pad = []
                    # process char data: pad both the word dimension and the sentence dimension
                    if 'char' in input_type.lower():
                        for seq in data[input_cluster][input_type][stidx: stidx + batch_size]:
                            batch_char_pad = []
                            for seq_index in range(max_sen_len_cur_batch):
                                if seq_index < len(seq):
                                    batch_char_pad.append(cut_and_padding(seq[seq_index], max_word_len_cur_batch, pad_ids[type2cluster[input_type]]))
                                else:
                                    # sentence shorter than batch max: emit an all-pad word
                                    batch_char_pad.append(cut_and_padding([pad_ids[type2cluster[input_type]]], max_word_len_cur_batch, pad_ids[type2cluster[input_type]]))
                            batch_with_pad.append(batch_char_pad)
                    else:
                        for seq in data[input_cluster][input_type][stidx: stidx + batch_size]:
                            #batch_with_pad.append(cut_and_padding(seq, max_len_cur_batch, pad_ids[input_type]))
                            batch_with_pad.append(cut_and_padding(seq, max_sen_len_cur_batch, pad_ids[type2cluster[input_type]]))
                    if transform_tensor is True:
                        data_batch[input_cluster][type2cluster[input_type]] = torch.LongTensor(batch_with_pad)
                    else:
                        data_batch[input_cluster][type2cluster[input_type]] = np.array(batch_with_pad)
                else:
                    # column without a declared branch: pass the raw slice through unchanged
                    data_batch[input_cluster][input_type] = data[input_cluster][input_type][stidx: stidx + batch_size]
            # word_length is used for padding char sequence, now only save sentence_length
            length_batch[input_cluster] = length_batch[input_cluster]['sentence_length']
        data_batches.append(data_batch)
        length_batches.append(length_batch)
        if target is not None:
            target_batch = {}
            length_batch['target'] = {}
            for single_target in target:
                if transform_tensor is True:
                    length_batch['target'][single_target] = torch.LongTensor(np.array(length['target'][single_target][stidx: stidx + batch_size]))
                else:
                    length_batch['target'][single_target] = np.array(length['target'][single_target][stidx: stidx + batch_size])
                if not (isinstance(target[single_target][0], list) or isinstance(target[single_target][0], np.ndarray)):
                    # scalar target per sample (classification/regression): slice directly
                    target_batch[single_target] = target[single_target][stidx: stidx + batch_size]
                else:
                    # target is also a sequence, padding needed
                    temp_target_batch = []
                    for seq in target[single_target][stidx: stidx + batch_size]:
                        temp_target_batch.append(cut_and_padding(seq, max_sen_len_cur_batch, pad_ids['target']))
                    target_batch[single_target] = temp_target_batch
                if transform_tensor is True:
                    if ProblemTypes[problem.problem_type] == ProblemTypes.classification \
                            or ProblemTypes[problem.problem_type] == ProblemTypes.sequence_tagging:
                        target_batch[single_target] = torch.LongTensor(target_batch[single_target])
                    elif ProblemTypes[problem.problem_type] == ProblemTypes.regression:
                        target_batch[single_target] = torch.FloatTensor(target_batch[single_target])
                    elif ProblemTypes[problem.problem_type] == ProblemTypes.mrc:
                        # mrc: string targets (e.g. answer text) stay as-is; numeric ones become tensors
                        if not isinstance(target_batch[single_target][0], str):
                            target_batch[single_target] = torch.LongTensor(target_batch[single_target])
                else:
                    target_batch[single_target] = np.array(target_batch[single_target])
            target_batches.append(target_batch)
    if predict_mode == 'batch':
        logging.info("Batches got!")
    return data_batches, length_batches, target_batches