def build_training_data_list()

in problem.py


    def build_training_data_list(self, training_data_list, file_columns, input_types, answer_column_name, bpe_encoder=None):
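        """Build tokenized input docs and per-target answer docs from raw training lines.

        Args:
            training_data_list: iterable of raw tab-separated lines.
            file_columns: dict mapping column name to its index in each line.
            input_types: dict mapping input type (e.g. 'word', 'char', 'bpe') to its config, including 'cols'.
            answer_column_name: list of target column names; multiple names enable multi-label targets.
            bpe_encoder: encoder used for the 'bpe' input type; may be None otherwise.

        Returns:
            (docs, target_docs, cnt_legal, cnt_illegal)
        """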
        docs = dict()           # tokenized docs, keyed by input type
        col_index_types = dict()        # maps column index -> input type, e.g. col_index_types[0] = 'word' or 'postag'

        target_docs = {}  # since v1.0.0, target_docs is a dict keyed by target name, to support multi-label tasks
        columns_to_target = {}
        for single_target in answer_column_name:
            target_docs[single_target] = []
            columns_to_target[file_columns[single_target]] = single_target

        for input_type in input_types:
            docs[input_type] = []
            # 'char' has no column of its own in file_columns; its tokens are derived from word tokens below
            if input_type == 'char':
                continue
            for col in input_types[input_type]['cols']:
                col_index_types[file_columns[col]] = input_type

        cnt_legal = 0
        cnt_illegal = 0
        for line in training_data_list:
            line_split = line.rstrip().split('\t')
            if len(line_split) != len(file_columns):
                logging.warning("Current line is inconsistent with configuration/inputs/file_header. Ignoring it. %s" % line)
                cnt_illegal += 1
                continue
            cnt_legal += 1

            for i in range(len(line_split)):
                if i in col_index_types:
                    if self.lowercase:
                        line_split[i] = line_split[i].lower()
                    line_split[i] = self.text_preprocessor.preprocess(line_split[i])

                    if col_index_types[i] == 'word':
                        if ProblemTypes[self.problem_type] == ProblemTypes.sequence_tagging:
                            token_list = line_split[i].split(" ")
                        else:
                            token_list = self.tokenizer.tokenize(line_split[i])
                        docs[col_index_types[i]].append(token_list)
                        if 'char' in docs:
                            # derive char-level tokens by splitting the concatenated word tokens
                            docs['char'].append([single_char for single_char in ''.join(token_list)])
                    elif col_index_types[i] == 'bpe':
                        bpe_tokens = []
                        for token in self.tokenizer.tokenize(line_split[i]):
                            bpe_tokens.extend(bpe_encoder.bpe(token))
                        docs[col_index_types[i]].append(bpe_tokens)
                    else:
                        docs[col_index_types[i]].append(line_split[i].split(" "))
                # target columns: route the raw value to the matching target doc
                elif i in columns_to_target:
                    curr_target = columns_to_target[i]
                    if ProblemTypes[self.problem_type] == ProblemTypes.classification:
                        target_docs[curr_target].append(line_split[i])
                    elif ProblemTypes[self.problem_type] == ProblemTypes.sequence_tagging:
                        target_docs[curr_target].append(line_split[i].split(" "))
                    elif ProblemTypes[self.problem_type] == ProblemTypes.regression or \
                            ProblemTypes[self.problem_type] == ProblemTypes.mrc:
                        pass    # regression and MRC answers are not collected into target_docs here
        return docs, target_docs, cnt_legal, cnt_illegal
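
For orientation, here is a minimal usage sketch. The column layout, type names, and the `problem` instance are illustrative assumptions, not taken from the source; only the call signature and return values match the method above.

    # Hypothetical inputs -- the column names and layout are assumptions for illustration.
    file_columns = {'query': 0, 'label': 1}        # column name -> index in each tab-separated line
    input_types = {
        'word': {'cols': ['query']},               # tokenize the 'query' column as words
        'char': {'cols': []},                      # char tokens are derived from the word tokens
    }
    answer_column_name = ['label']                 # one target; more names enable multi-label

    training_data_list = [
        'the quick brown fox\tpositive\n',
        'a line with too many\tcolumns\there\n',   # counted as illegal and skipped
    ]

    docs, target_docs, cnt_legal, cnt_illegal = problem.build_training_data_list(
        training_data_list, file_columns, input_types, answer_column_name)
    # For a classification problem (modulo preprocessing and the configured tokenizer):
    # docs['word']         -> [['the', 'quick', 'brown', 'fox']]
    # docs['char']         -> [['t', 'h', 'e', 'q', 'u', ...]]
    # target_docs['label'] -> ['positive']
    # cnt_legal == 1, cnt_illegal == 1

Note that 'char' never reads a column directly: the method builds it from whatever tokens the 'word' branch produced, so the two stay aligned line by line.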