in utils/corpus_utils.py [0:0]
def get_batches(problem, data, length, target, batch_size, input_types, pad_ids=None, permutate=False, transform_tensor=True, predict_mode='batch'):
    """Cut a corpus into padded mini-batches.

    CAUTION: ``data``, ``length`` and (when given) ``target`` are modified
    IN PLACE — their leaf lists are converted to numpy arrays, and shuffled
    when ``permutate`` is True.

    Args:
        problem: the problem object; provides ``problem_type``, ``input_dicts``
            (per-branch vocabularies with an ``id()`` lookup), ``output_dict``
            and ``target_with_pad``.
        data: nested dict of input columns, e.g.
            {
                'string1': {'word1': [...], 'postag_feature1': [...]},
                'string2': {'word1': [...], 'postag_feature1': [...]},
            }
        length: nested dict of sequence lengths per input cluster, e.g.
            {'string1': {'sentence_length': [...], 'word_length': [...]}, ...}.
            Each cluster is expected to contain a 'sentence_length' entry
            (see the reduction near the end of the batch loop); when
            ``target`` is given, ``length`` is also expected to hold a
            'target' entry — TODO(review) confirm against callers.
        target: dict of target columns ({name: [...]}) or None for prediction.
        batch_size: number of samples per batch.
        input_types: maps a branch name to its columns and embedding dim, e.g.
            {
                "word": {"cols": ["word1", "word2"], "dim": 300},
                "postag": {"cols": ["postag_feature1", "postag_feature2"], "dim": 20},
            }
        pad_ids: optional precomputed {branch/'target': pad id}; derived from
            ``problem`` when None.
        permutate: if True, shuffle the corpus (requires ``target``, since the
            permutation size is taken from it — NOTE(review): would raise if
            ``permutate`` is True while ``target`` is None).
        transform_tensor: if True the returned data are CPU torch tensors
            (except nested word lengths), otherwise numpy arrays.
        predict_mode: 'batch' enables progress logging.

    Returns:
        data_batches: list, one element per batch:
            [
                {
                    "string1": {'word': ndarray/Tensor [batch_size, seq_len],
                                'postag': ndarray/Tensor [batch_size, seq_len], ...},
                    "string2": {...},
                }, ...
            ]
        length_batches: list, one element per batch; each element maps an input
            cluster to its sentence lengths ([batch size]) and, when ``target``
            is given, holds a 'target' entry with per-target lengths.
        target_batches: list of {target name: ndarray/Tensor [batch_size, ...]},
            or None when ``target`` is None.
    """
    if predict_mode == 'batch':
        logging.info("Start making batches")
    if permutate is True:
        # CAUTION! data and length would be revised (shuffled in place).
        # data = copy.deepcopy(data)
        # length = copy.deepcopy(length)
        # if target is not None:
        #     target = copy.deepcopy(target)
        # shuffle the data; permutation size comes from the first target column
        permutation = np.random.permutation(len(list(target.values())[0]))
        for input_cluster in data:
            # NOTE: 'input' shadows the builtin of the same name
            for input in data[input_cluster]:
                data[input_cluster][input] = np.array(data[input_cluster][input])[permutation]
            for single_type in length[input_cluster]:
                length[input_cluster][single_type] = np.array(length[input_cluster][single_type])[permutation]
        if target is not None:
            for single_target in target:
                # assumes length['target'] exists whenever target is given — TODO(review) confirm
                length['target'][single_target] = np.array(length['target'][single_target])[permutation]
                target[single_target] = np.array(target[single_target])[permutation]
    else:
        # no shuffle: still normalize every leaf list to a numpy array in place
        for input_cluster in data:
            for input in data[input_cluster]:
                data[input_cluster][input] = np.array(data[input_cluster][input])
            for single_type in length[input_cluster]:
                length[input_cluster][single_type] = np.array(length[input_cluster][single_type])
        if target is not None:
            for single_target in target:
                length['target'][single_target] = np.array(length['target'][single_target])
                target[single_target] = np.array(target[single_target])
    # set up padding symbols for inputs and target
    if pad_ids is None:
        pad_ids = dict()
        for branch in input_types:
            pad_ids[branch] = problem.input_dicts[branch].id('<pad>')
        if ProblemTypes[problem.problem_type] == ProblemTypes.sequence_tagging:
            #pad_ids['target'] = problem.output_dict.id('O')
            if problem.target_with_pad:
                pad_ids['target'] = problem.output_dict.id('<pad>')
            else:
                pad_ids['target'] = 0  # CAUTION: 0 may collide with a real label id
        elif ProblemTypes[problem.problem_type] == ProblemTypes.classification:
            if problem.target_with_pad:
                pad_ids['target'] = problem.output_dict.id('<pad>')  # CAUTION
            else:
                pad_ids['target'] = 0  # CAUTION
        elif ProblemTypes[problem.problem_type] == ProblemTypes.regression:
            pad_ids['target'] = None
        elif ProblemTypes[problem.problem_type] == ProblemTypes.mrc:
            # mrc targets are padded/converted per-type below; no single pad id
            pass
    # map each column name back to its branch, e.g. type2cluster['word1'] = 'word'
    type2cluster = dict()
    for input_type in input_types:
        for col_name in input_types[input_type]['cols']:
            type2cluster[col_name] = input_type
    # get the corpus size from the first column of the first input cluster
    for input_cluster in data:
        for input_type in data[input_cluster]:
            corpus_size = len(data[input_cluster][input_type])
            break
        break
    data_batches = []
    if target is not None:
        target_batches = []
    else:
        target_batches = None
    length_batches = []
    for stidx in range(0, corpus_size, batch_size):
        data_batch = dict()
        length_batch = dict()
        for input_cluster in data:
            data_batch[input_cluster] = dict()
            length_batch[input_cluster] = dict()
            max_sen_len_cur_batch = None
            max_word_len_cur_batch = None
            if transform_tensor is True:
                # For nn.DataParallel, the length must be a tensor as well,
                # otherwise the length would not split for multiple GPUs
                #length_batch[input_cluster] = Variable(torch.LongTensor(length[input_cluster][stidx: stidx + batch_size]))
                for single_input_cluster in length[input_cluster]:
                    if not isinstance(length[input_cluster][single_input_cluster][0], list):
                        # flat per-sample lengths (e.g. sentence lengths)
                        length_batch[input_cluster][single_input_cluster] = \
                            torch.LongTensor(np.array(length[input_cluster][single_input_cluster][stidx: stidx + batch_size]))
                    else:
                        # nested per-word lengths: keep a list of tensors (ragged)
                        length_batch[input_cluster][single_input_cluster] = []
                        for single_iterm in length[input_cluster][single_input_cluster][stidx: stidx + batch_size]:
                            length_batch[input_cluster][single_input_cluster].append(torch.LongTensor(np.array(single_iterm)))
            else:
                for single_input_cluster in length[input_cluster]:
                    length_batch[input_cluster][single_input_cluster] = \
                        np.array(length[input_cluster][single_input_cluster][stidx: stidx + batch_size])
            # max_len_cur_batch = np.sort(length[input_cluster][stidx: stidx + batch_size])[-1]
            # per-batch maxima used as the padding targets below
            for single_input_cluster in length[input_cluster]:
                if 'sentence' in single_input_cluster:
                    max_sen_len_cur_batch = np.sort(length[input_cluster][single_input_cluster][stidx: stidx + batch_size])[-1]
                elif 'word' in single_input_cluster:
                    # word lengths are nested per sentence; flatten before taking the max
                    max_word_len_cur_batch = np.sort([y for x in length[input_cluster][single_input_cluster][stidx: stidx + batch_size] for y in x])[-1]
            #logging.info("stidx: %d, max_len: %d" % (stidx, max_len_cur_batch))
            for input_type in data[input_cluster]:
                if input_type in type2cluster:
                    batch_with_pad = []
                    # process char data: pad both the word dimension and the sentence dimension
                    if 'char' in input_type.lower():
                        for seq in data[input_cluster][input_type][stidx: stidx + batch_size]:
                            batch_char_pad = []
                            for seq_index in range(max_sen_len_cur_batch):
                                if seq_index < len(seq):
                                    batch_char_pad.append(cut_and_padding(seq[seq_index], max_word_len_cur_batch, pad_ids[type2cluster[input_type]]))
                                else:
                                    # sentence shorter than batch max: emit an all-pad word
                                    batch_char_pad.append(cut_and_padding([pad_ids[type2cluster[input_type]]], max_word_len_cur_batch, pad_ids[type2cluster[input_type]]))
                            batch_with_pad.append(batch_char_pad)
                    else:
                        for seq in data[input_cluster][input_type][stidx: stidx + batch_size]:
                            #batch_with_pad.append(cut_and_padding(seq, max_len_cur_batch, pad_ids[input_type]))
                            batch_with_pad.append(cut_and_padding(seq, max_sen_len_cur_batch, pad_ids[type2cluster[input_type]]))
                    if transform_tensor is True:
                        data_batch[input_cluster][type2cluster[input_type]] = torch.LongTensor(batch_with_pad)
                    else:
                        data_batch[input_cluster][type2cluster[input_type]] = np.array(batch_with_pad)
                else:
                    # column without a declared branch: pass the raw slice through unchanged
                    data_batch[input_cluster][input_type] = data[input_cluster][input_type][stidx: stidx + batch_size]
            # word_length is used for padding char sequence, now only save sentence_length
            length_batch[input_cluster] = length_batch[input_cluster]['sentence_length']
        data_batches.append(data_batch)
        length_batches.append(length_batch)
        if target is not None:
            target_batch = {}
            length_batch['target'] = {}
            for single_target in target:
                if transform_tensor is True:
                    length_batch['target'][single_target] = torch.LongTensor(np.array(length['target'][single_target][stidx: stidx + batch_size]))
                else:
                    length_batch['target'][single_target] = np.array(length['target'][single_target][stidx: stidx + batch_size])
                if not (isinstance(target[single_target][0], list) or isinstance(target[single_target][0], np.ndarray)):
                    # scalar target per sample (classification/regression): slice directly
                    target_batch[single_target] = target[single_target][stidx: stidx + batch_size]
                else:
                    # target is also a sequence, padding needed
                    temp_target_batch = []
                    for seq in target[single_target][stidx: stidx + batch_size]:
                        temp_target_batch.append(cut_and_padding(seq, max_sen_len_cur_batch, pad_ids['target']))
                    target_batch[single_target] = temp_target_batch
                if transform_tensor is True:
                    if ProblemTypes[problem.problem_type] == ProblemTypes.classification \
                            or ProblemTypes[problem.problem_type] == ProblemTypes.sequence_tagging:
                        target_batch[single_target] = torch.LongTensor(target_batch[single_target])
                    elif ProblemTypes[problem.problem_type] == ProblemTypes.regression:
                        target_batch[single_target] = torch.FloatTensor(target_batch[single_target])
                    elif ProblemTypes[problem.problem_type] == ProblemTypes.mrc:
                        # mrc: string targets (e.g. answer text) stay as-is; numeric ones become tensors
                        if not isinstance(target_batch[single_target][0], str):
                            target_batch[single_target] = torch.LongTensor(target_batch[single_target])
                else:
                    target_batch[single_target] = np.array(target_batch[single_target])
            target_batches.append(target_batch)
    if predict_mode == 'batch':
        logging.info("Batches got!")
    return data_batches, length_batches, target_batches