in problem.py [0:0]
def __init__(self, phase, problem_type, input_types, answer_column_name=None, lowercase=False, with_bos_eos=True,
             tagging_scheme=None, tokenizer="nltk", remove_stopwords=False, DBC2SBC=True, unicode_fix=True):
    """Set up vocabularies, tokenizer and text preprocessor for one problem.

    Args:
        phase: not used by this constructor; kept so existing callers
            do not have to change.
        problem_type: name of a ``ProblemTypes`` member; it is resolved
            with ``ProblemTypes[problem_type]``.
        input_types: iterable of input type names; one source ``CellDict``
            is created per name. Example from the original documentation:
            {
                "word": ["word1", "word1"],
                "postag": ["postag_feature1", "postag_feature2"]
            }
        answer_column_name: "label"; after v1.0.0 this changed to a list.
            Not used by this constructor.
        lowercase: stored as ``self.lowercase``.
        with_bos_eos: whether to add bos and eos when encoding. Forced to
            False for the mrc problem type.
        tagging_scheme: stored as ``self.tagging_scheme``.
        tokenizer: "nltk" selects ``EnglishTokenizer``, "jieba" selects
            ``ChineseTokenizer``.
        remove_stopwords: forwarded to the tokenizer.
        DBC2SBC: forwarded to ``EnglishTextPreprocessor``.
        unicode_fix: forwarded to ``EnglishTextPreprocessor``.

    Raises:
        ValueError: if ``tokenizer`` is neither "nltk" nor "jieba".
    """
    # NOTE(review): ProblemTypes looks like an Enum, in which case an
    # unknown problem_type name raises KeyError here -- confirm.
    problem = ProblemTypes[problem_type]

    # Source-side vocabularies always carry every special symbol.
    source_with_start = source_with_end = source_with_unk = source_with_pad = True
    # Target-side defaults; narrowed per problem type below.
    target_with_start = target_with_end = target_with_unk = target_with_pad = True

    if problem == ProblemTypes.sequence_tagging:
        # Tag vocabularies keep only the padding symbol.
        target_with_start = target_with_end = target_with_unk = False
    elif problem == ProblemTypes.classification or problem == ProblemTypes.regression:
        target_with_start = target_with_end = target_with_unk = target_with_pad = False
    elif problem == ProblemTypes.mrc:
        target_with_start = target_with_end = target_with_unk = target_with_pad = False
        with_bos_eos = False

    self.lowercase = lowercase
    self.problem_type = problem_type
    self.tagging_scheme = tagging_scheme
    self.with_bos_eos = with_bos_eos

    self.source_with_start = source_with_start
    self.source_with_end = source_with_end
    self.source_with_unk = source_with_unk
    self.source_with_pad = source_with_pad
    self.target_with_start = target_with_start
    self.target_with_end = target_with_end
    self.target_with_unk = target_with_unk
    self.target_with_pad = target_with_pad

    # One vocabulary per input type.
    self.input_dicts = dict()
    for input_type in input_types:
        self.input_dicts[input_type] = CellDict(with_unk=source_with_unk, with_pad=source_with_pad,
                                                with_start=source_with_start, with_end=source_with_end)

    # Only problems with a discrete label set get an output vocabulary.
    if problem == ProblemTypes.sequence_tagging or problem == ProblemTypes.classification:
        self.output_dict = CellDict(with_unk=target_with_unk, with_pad=target_with_pad,
                                    with_start=target_with_start, with_end=target_with_end)
    elif problem == ProblemTypes.regression or problem == ProblemTypes.mrc:
        self.output_dict = None

    self.file_column_num = None

    if tokenizer in ['nltk']:
        self.tokenizer = EnglishTokenizer(tokenizer=tokenizer, remove_stopwords=remove_stopwords)
    elif tokenizer in ['jieba']:
        self.tokenizer = ChineseTokenizer(tokenizer=tokenizer, remove_stopwords=remove_stopwords)
    else:
        # Fail fast: otherwise self.tokenizer would be left undefined and
        # the first use would fail with an unrelated AttributeError.
        raise ValueError(
            "Unsupported tokenizer {!r}; expected 'nltk' or 'jieba'.".format(tokenizer))

    self.text_preprocessor = EnglishTextPreprocessor(DBC2SBC=DBC2SBC, unicode_fix=unicode_fix)