in problem.py [0:0]
def build(self, data_path_list, file_columns, input_types, file_with_col_header, answer_column_name, word2vec_path=None, word_emb_dim=None,
        format=None, file_type=None, involve_all_words=None, file_format="tsv", show_progress=True,
        cpu_num_workers=-1, max_vocabulary=800000, word_frequency=3, max_building_lines=1000*1000):
    """Build the input/output vocabularies and, optionally, a pretrained word embedding matrix.

    Args:
        data_path_list (list): paths of corpus files to scan; falsy entries are skipped.
        file_columns (dict): column name -> column index in the data file, e.g.
            {
                "word1": 0,
                "word2": 1,
                "label": 2,
                "postag_feature1": 3,
                "postag_feature2": 4
            }
        input_types (dict): input type -> configuration, e.g.
            {
                "word": {"cols": ["word1", "word2"], "dim": 300},
                "postag": {"cols": ["postag_feature1", "postag_feature2"], "dim": 20}
            }
            or, for BPE inputs,
            {
                "bpe": {"cols": ["word1", "word2"], "dim": 100, "bpe_path": "xxx.bpe"}
            }
        file_with_col_header (bool): True if the data files carry a header line.
        answer_column_name: name(s) of the target column(s).
        word2vec_path (str): path to a pretrained embedding file; None disables embedding loading.
        word_emb_dim (int): expected dimension of the pretrained embedding vectors.
        format: format of the pretrained embedding file (forwarded to load_embedding).
        file_type: type of the pretrained embedding file (forwarded to load_embedding).
        involve_all_words (bool): if True, involve all words that show up in the pretrained
            embedding, not only the words seen in the corpus.
        file_format (str): "tsv" or "json". Note "json" means each sample is represented
            by a json string.
        show_progress (bool): kept for interface compatibility; unused here.
        cpu_num_workers (int): worker count for preprocessing; -1 means use all available CPUs.
        max_vocabulary (int): upper bound on input dictionary size.
        word_frequency (int): minimum frequency for a token to enter an input dictionary.
        max_building_lines (int): at most this many lines per file are used for building.

    Returns:
        numpy.ndarray or None: embedding matrix whose rows align with the 'word'
        dictionary ids, or None when word2vec_path is not provided.
    """
    # parameter check
    bpe_encoder = self._check_bpe_encoder(input_types)
    self.file_column_num = len(file_columns)

    # ---- scan the corpus files and update the symbol universe ----
    for data_path in data_path_list:
        if not data_path:
            continue
        progress = self.get_data_generator_from_file(data_path, file_with_col_header, chunk_size=max_building_lines)
        preprocessed_data_generator = self.build_training_multi_processor(progress, cpu_num_workers, file_columns, input_types, answer_column_name, bpe_encoder=bpe_encoder)
        # only the first chunk (at most max_building_lines lines) is consumed for building
        docs, target_docs, cnt_legal, cnt_illegal = next(preprocessed_data_generator)
        # input_type
        for input_type in input_types:
            self.input_dicts[input_type].update(docs[input_type])
        # problem_type: classification / sequence tagging need a target dictionary;
        # regression / mrc have no discrete label vocabulary
        if ProblemTypes[self.problem_type] == ProblemTypes.classification or \
                ProblemTypes[self.problem_type] == ProblemTypes.sequence_tagging:
            self.output_dict.update(list(target_docs.values())[0])
        elif ProblemTypes[self.problem_type] == ProblemTypes.regression or \
                ProblemTypes[self.problem_type] == ProblemTypes.mrc:
            pass
        logging.info("[Building Dictionary] in %s at most %d lines imported: %d legal lines, %d illegal lines." % (data_path, max_building_lines, cnt_legal, cnt_illegal))

    # ---- freeze the dictionaries ----
    for input_type in input_types:
        self.input_dicts[input_type].build(threshold=word_frequency, max_vocabulary_num=max_vocabulary)
        logging.info("%d types in %s column" % (self.input_dicts[input_type].cell_num(), input_type))
    if self.output_dict:
        self.output_dict.build(threshold=0)
        if ProblemTypes[self.problem_type] == ProblemTypes.sequence_tagging:
            # sequence tagging needs explicit boundary tags appended after the regular labels
            self.output_dict.cell_id_map["<start>"] = len(self.output_dict.cell_id_map)
            self.output_dict.id_cell_map[len(self.output_dict.id_cell_map)] = "<start>"
            self.output_dict.cell_id_map["<eos>"] = len(self.output_dict.cell_id_map)
            self.output_dict.id_cell_map[len(self.output_dict.id_cell_map)] = "<eos>"
        logging.info("%d types in target column" % (self.output_dict.cell_num()))
    logging.debug("training data dict built")

    # ---- embedding ----
    word_emb_matrix = None
    if word2vec_path:
        logging.info("Getting pre-trained embeddings...")
        if involve_all_words is True:
            # take the whole pretrained vocabulary, not only corpus words
            word_emb_dict = load_embedding(word2vec_path, word_emb_dim, format, file_type, with_head=False, word_set=None)
            self.input_dicts['word'].update([list(word_emb_dict.keys())])
            self.input_dicts['word'].build(threshold=0, max_vocabulary_num=len(word_emb_dict))
        else:
            # also request lowercased variants so cased corpus words can match uncased embeddings
            extend_vocabulary = set()
            for single_word in self.input_dicts['word'].cell_id_map:
                extend_vocabulary.add(single_word)
                if single_word.lower() != single_word:
                    extend_vocabulary.add(single_word.lower())
            word_emb_dict = load_embedding(word2vec_path, word_emb_dim, format, file_type, with_head=False, word_set=extend_vocabulary)

        # Fail fast with a clear message instead of the NameError the old
        # `for word in word_emb_dict: ...; break` probe raised on an empty dict.
        if not word_emb_dict:
            raise ValueError("No embeddings were loaded from %s!" % word2vec_path)
        loaded_emb_dim = len(next(iter(word_emb_dict.values())))
        assert loaded_emb_dim == word_emb_dim, "The dimension of defined word embedding is inconsistent with the pretrained embedding provided!"

        logging.info("constructing embedding table")
        if self.input_dicts['word'].with_unk:
            word_emb_dict['<unk>'] = np.random.random(size=word_emb_dim)
        if self.input_dicts['word'].with_pad:
            word_emb_dict['<pad>'] = np.random.random(size=word_emb_dim)

        word_emb_matrix = []
        unknown_word_count = 0
        # Xavier-style range for words that lack a pretrained vector
        scale = np.sqrt(3.0 / word_emb_dim)
        for i in range(self.input_dicts['word'].cell_num()):
            single_word = self.input_dicts['word'].id_cell_map[i]
            if single_word in word_emb_dict:
                word_emb_matrix.append(word_emb_dict[single_word])
            elif single_word.lower() in word_emb_dict:
                word_emb_matrix.append(word_emb_dict[single_word.lower()])
            else:
                word_emb_matrix.append(np.random.uniform(-scale, scale, word_emb_dim))
                unknown_word_count += 1
        word_emb_matrix = np.array(word_emb_matrix)
        logging.info("word embedding matrix shape:(%d, %d); unknown word count: %d;" %
                     (len(word_emb_matrix), len(word_emb_matrix[0]), unknown_word_count))
        logging.info("Word embedding loaded")

    return word_emb_matrix