in train.py [0:0]
def main(params):
    """Run the training pipeline described by ``params``.

    Steps, in order:

    1. Build the ``ModelConf`` and ``Problem``; when
       ``conf.pretrained_model_path`` is set (finetuning), the previously
       saved problem is loaded instead of being rebuilt.
    2. If ``conf.use_cache`` is set, verify and load the cache.
    3. Build the dictionary (and embedding matrix) unless finetuning or a
       valid cached dictionary is available.
    4. Save the cache if enabled; stop here when ``params.make_cache_only``
       is set.
    5. Back up the configuration and the problem, create the
       ``LearningMachine``, loss and optimizer, and train.
    6. Reload the model from ``conf.model_save_path`` and evaluate it on
       ``conf.test_data_path``, falling back to ``conf.valid_data_path``.
       If neither path is configured the evaluation is skipped with a
       warning.

    Args:
        params: Parsed run parameters. This function reads ``conf_path``,
            ``mode`` and ``make_cache_only`` directly; the object is also
            forwarded to ``ModelConf``, ``Cache`` and ``conf.back_up``.

    Raises:
        Exception: If a target listed in ``conf.metrics_post_check`` does
            not exist in ``problem.output_dict``.
    """
    # init
    conf = ModelConf("train", params.conf_path, version, params, mode=params.mode)
    problem = Problem(
        "train", conf.problem_type, conf.input_types, conf.answer_column_name,
        with_bos_eos=conf.add_start_end_for_seq, tagging_scheme=conf.tagging_scheme,
        tokenizer=conf.tokenizer, remove_stopwords=conf.remove_stopwords,
        DBC2SBC=conf.DBC2SBC, unicode_fix=conf.unicode_fix)
    if conf.pretrained_model_path:
        ### when finetuning, load previous saved problem
        problem.load_problem(conf.saved_problem_path)

    # cache verification
    emb_matrix = None
    cache = Cache()
    if conf.use_cache:
        ## check
        cache.check(conf, params)
        ## load
        problem, emb_matrix = cache.load(conf, problem, emb_matrix)

    # data preprocessing
    ## build dictionary when (not in finetune model) and (not use cache or cache invalid)
    if (not conf.pretrained_model_path) and ((not conf.use_cache) or cache.dictionary_invalid):
        logging.info("=" * 100)
        logging.info("Preprocessing... Depending on your corpus size, this step may take a while.")
        # The dictionary is built over train, valid and test data together;
        # remember the test_data may be None
        data_path_list = [conf.train_data_path, conf.valid_data_path, conf.test_data_path]
        emb_matrix = problem.build(
            data_path_list, conf.file_columns, conf.input_types, conf.file_with_col_header,
            conf.answer_column_name, word2vec_path=conf.pretrained_emb_path,
            word_emb_dim=conf.pretrained_emb_dim, format=conf.pretrained_emb_type,
            file_type=conf.pretrained_emb_binary_or_text,
            involve_all_words=conf.involve_all_words_in_pretrained_emb,
            show_progress=params.mode == 'normal', cpu_num_workers=conf.cpu_num_workers,
            max_vocabulary=conf.max_vocabulary, word_frequency=conf.min_word_frequency,
            max_building_lines=conf.max_building_lines)

    # environment preparing
    ## cache save
    if conf.use_cache:
        cache.save(conf, params, problem, emb_matrix)

    if params.make_cache_only:
        if conf.use_cache:
            logging.info("Finish building cache!")
        else:
            logging.info('Please set parameters "use_cache" is true')
        return

    ## back up the problem.pkl to save_base_dir/.necessary_cache.
    ## During test phase, we would load cache from save_base_dir/.necessary_cache/problem.pkl
    conf.back_up(params)
    cache.back_up(conf, problem)
    if problem.output_dict:
        logging.debug("Problem target cell dict: %s", problem.output_dict.cell_id_map)

    # train phase
    ## init
    ### model
    # Vocabulary info is only needed when the model is initialized from
    # scratch; when finetuning, the weights come from the pretrained model.
    vocab_info, initialize = None, False
    if not conf.pretrained_model_path:
        vocab_info, initialize = get_vocab_info(conf, problem, emb_matrix), True

    lm = LearningMachine('train', conf, problem, vocab_info=vocab_info,
                         initialize=initialize, use_gpu=conf.use_gpu)
    if conf.pretrained_model_path:
        logging.info('Loading the pretrained model: %s...', conf.pretrained_model_path)
        lm.load_model(conf.pretrained_model_path)

    ### loss
    # Every "<metric>@<target>" entry must refer to a target that exists in
    # the output dictionary; fail before training rather than after it.
    for metric_to_chk in conf.metrics_post_check:
        _metric, target = metric_to_chk.split('@')
        if not problem.output_dict.has_cell(target):
            raise Exception("The target %s of %s does not exist in the training data." % (target, metric_to_chk))
    # NOTE(review): conf.loss is extended in place (not copied) — confirm
    # that nothing else depends on it staying unmodified.
    loss_conf = conf.loss
    loss_conf['output_layer_id'] = conf.output_layer_id
    loss_conf['answer_column_name'] = conf.answer_column_name
    loss_fn = Loss(**loss_conf)
    if conf.use_gpu is True:
        loss_fn.cuda()

    ### optimizer
    # nn.DataParallel wraps the real model in `.module`; unwrap it once so
    # the embedding layer is reached the same way in both cases.
    inner_model = lm.model.module if isinstance(lm.model, nn.DataParallel) else lm.model
    embedding_layer = inner_model.layers['embedding']
    trainable_params = list(lm.model.parameters())
    if not isinstance(embedding_layer.embeddings, nn.ModuleDict):
        # presumably embeddings that are not held in an nn.ModuleDict are not
        # returned by model.parameters(), so they are appended explicitly —
        # TODO confirm against the embedding layer implementation
        trainable_params += list(embedding_layer.get_parameters())
    # SECURITY: eval() executes whatever expression the configuration file
    # supplies as the optimizer name; only run trusted configuration files.
    optimizer = eval(conf.optimizer_name)(trainable_params, **conf.optimizer_params)

    ## train
    lm.train(optimizer, loss_fn)

    ## test the best model with the best model saved
    lm.load_model(conf.model_save_path)
    test_path = conf.test_data_path if conf.test_data_path is not None else conf.valid_data_path
    if test_path is None:
        # Previously this case crashed with UnboundLocalError after training
        # had already finished; there is simply nothing to evaluate on.
        logging.warning('Neither test_data_path nor valid_data_path is set; '
                        'skipping the test of the best model saved at %s', conf.model_save_path)
        return
    logging.info('Testing the best model saved at %s, with %s', conf.model_save_path, test_path)
    if not test_path.endswith('pkl'):
        lm.test(loss_fn, test_path, predict_output_path=conf.predict_output_path)
    else:
        lm.test(loss_fn, test_path)