in problem.py [0:0]
def build_training_data_list(self, training_data_list, file_columns, input_types, answer_column_name, bpe_encoder=None):
    """Tokenize raw training lines into per-input-type token docs and target docs.

    Args:
        training_data_list: iterable of raw tab-separated text lines.
        file_columns: dict mapping column name -> column index within a line.
        input_types: dict mapping input type name (e.g. 'word', 'postag',
            'bpe', 'char') -> config dict with a 'cols' list of column names.
        answer_column_name: list of target column names (dict-based
            target_docs supports multi-label since v1.0.0).
        bpe_encoder: object with a ``bpe(token)`` method; required only when
            an input type is 'bpe'.

    Returns:
        Tuple ``(docs, target_docs, cnt_legal, cnt_illegal)``:
        ``docs`` maps each input type to a list of token lists,
        ``target_docs`` maps each target column name to its collected labels,
        and the two counts report well-formed vs. skipped (malformed) lines.
    """
    docs = dict()             # docs of each type of input
    col_index_types = dict()  # input type of each column index, e.g. col_index_types[0] = 'word'/'postag'
    target_docs = {}          # dict keyed by target column name (multi-label support)
    columns_to_target = {}    # column index -> target column name
    for single_target in answer_column_name:
        target_docs[single_target] = []
        columns_to_target[file_columns[single_target]] = single_target
    for input_type in input_types:
        docs[input_type] = []
        # 'char' owns no columns of its own: its tokens are derived from the
        # 'word' columns inside the line loop below
        if input_type == 'char':
            continue
        for col in input_types[input_type]['cols']:
            col_index_types[file_columns[col]] = input_type

    # Loop invariant hoisted: resolve the problem-type enum member once
    # instead of on every column of every line.
    problem_type = ProblemTypes[self.problem_type]
    cnt_legal = 0
    cnt_illegal = 0
    for line in training_data_list:
        line_split = line.rstrip().split('\t')
        if len(line_split) != len(file_columns):
            # Malformed line (column count mismatch): skip it but keep count.
            # Lazy %-args so formatting only happens if the record is emitted.
            logging.warning("Current line is inconsistent with configuration/inputs/file_header. Ignore now. %s", line)
            cnt_illegal += 1
            continue
        cnt_legal += 1
        for i in range(len(line_split)):
            if i in col_index_types:
                if self.lowercase:
                    line_split[i] = line_split[i].lower()
                line_split[i] = self.text_preprocessor.preprocess(line_split[i])
                if col_index_types[i] == 'word':
                    if problem_type == ProblemTypes.sequence_tagging:
                        # sequence-tagging data is pre-tokenized: tokens are space-separated
                        token_list = line_split[i].split(" ")
                    else:
                        token_list = self.tokenizer.tokenize(line_split[i])
                    docs[col_index_types[i]].append(token_list)
                    if 'char' in docs:
                        # derive the char-level doc from the word tokens (whitespace removed)
                        docs['char'].append(list(''.join(token_list)))
                elif col_index_types[i] == 'bpe':
                    # NOTE(review): assumes bpe_encoder is provided whenever a
                    # 'bpe' input type is configured — confirm at call sites
                    bpe_tokens = []
                    for token in self.tokenizer.tokenize(line_split[i]):
                        bpe_tokens.extend(bpe_encoder.bpe(token))
                    docs[col_index_types[i]].append(bpe_tokens)
                else:
                    # remaining input types (e.g. postag) are space-separated already
                    docs[col_index_types[i]].append(line_split[i].split(" "))
            elif i in columns_to_target:
                curr_target = columns_to_target[i]
                if problem_type == ProblemTypes.classification:
                    target_docs[curr_target].append(line_split[i])
                elif problem_type == ProblemTypes.sequence_tagging:
                    target_docs[curr_target].append(line_split[i].split(" "))
                elif problem_type == ProblemTypes.regression or \
                        problem_type == ProblemTypes.mrc:
                    # regression/mrc targets are processed elsewhere; nothing to collect here
                    pass
    return docs, target_docs, cnt_legal, cnt_illegal