def main()

in train.py

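Entry point of the training pipeline: it builds the model configuration and problem definition, restores or builds the preprocessing cache, backs up the configuration and problem, constructs the model, loss, and optimizer, runs training, and finally evaluates the best saved checkpoint on the test set (or the validation set when no test set is given).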

def main(params):
    # init
    conf = ModelConf("train", params.conf_path, version, params, mode=params.mode)
    problem = Problem("train", conf.problem_type, conf.input_types, conf.answer_column_name,
        with_bos_eos=conf.add_start_end_for_seq, tagging_scheme=conf.tagging_scheme, tokenizer=conf.tokenizer,
        remove_stopwords=conf.remove_stopwords, DBC2SBC=conf.DBC2SBC, unicode_fix=conf.unicode_fix)
    if conf.pretrained_model_path:
        ### when fine-tuning, load the previously saved problem
        problem.load_problem(conf.saved_problem_path)
   
    # cache verification
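    # when caching is enabled, verify the existing cache against the current configuration and,
    # if it is valid, restore the prebuilt problem and embedding matrix from it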
    emb_matrix = None
    cache = Cache()
    if conf.use_cache:
        ## check
        cache.check(conf, params)
        ## load
        problem, emb_matrix = cache.load(conf, problem, emb_matrix)

    # data preprocessing
    ## build dictionary when (not in fine-tune mode) and (cache not used or cache invalid)
    if (not conf.pretrained_model_path) and ((not conf.use_cache) or cache.dictionary_invalid):
        logging.info("="*100)
        logging.info("Preprocessing... Depending on your corpus size, this step may take a while.")
        # collect [train_data_path, valid_data_path, test_data_path];
        # note that test_data_path may be None
        data_path_list = [conf.train_data_path, conf.valid_data_path, conf.test_data_path]
        emb_matrix = problem.build(data_path_list, conf.file_columns, conf.input_types, conf.file_with_col_header,
                                    conf.answer_column_name, word2vec_path=conf.pretrained_emb_path,
                                    word_emb_dim=conf.pretrained_emb_dim, format=conf.pretrained_emb_type,
                                    file_type=conf.pretrained_emb_binary_or_text, involve_all_words=conf.involve_all_words_in_pretrained_emb,
                                    show_progress=(params.mode == 'normal'), cpu_num_workers=conf.cpu_num_workers,
                                    max_vocabulary=conf.max_vocabulary, word_frequency=conf.min_word_frequency, max_building_lines=conf.max_building_lines)

    # environment preparing
    ## cache save
    if conf.use_cache:
        cache.save(conf, params, problem, emb_matrix)

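    # when only cache building was requested, stop after the cache has been written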
    if params.make_cache_only:
        if conf.use_cache:
            logging.info("Finish building cache!")
        else:
            logging.info('Please set the parameter "use_cache" to true')
        return

    ## back up problem.pkl to save_base_dir/.necessary_cache.
    ## During the test phase, the cache is loaded from save_base_dir/.necessary_cache/problem.pkl
    conf.back_up(params) 
    cache.back_up(conf, problem)
    if problem.output_dict:
        logging.debug("Problem target cell dict: %s" % (problem.output_dict.cell_id_map))
    
    # train phase
    ## init 
    ### model
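    # train from scratch: pass the vocabulary/embedding info and set initialize=True;
    # fine-tune: leave them unset and load the pretrained checkpoint below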
    vocab_info, initialize = None, False
    if not conf.pretrained_model_path:
        vocab_info, initialize = get_vocab_info(conf, problem, emb_matrix), True
  
    lm = LearningMachine('train', conf, problem, vocab_info=vocab_info, initialize=initialize, use_gpu=conf.use_gpu)
    if conf.pretrained_model_path:
        logging.info('Loading the pretrained model: %s...' % conf.pretrained_model_path)
        lm.load_model(conf.pretrained_model_path)

    ### loss
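    # every metric flagged for post-check must refer to an output target that exists in the training data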
    if len(conf.metrics_post_check) > 0:
        for metric_to_chk in conf.metrics_post_check:
            metric, target = metric_to_chk.split('@')
            if not problem.output_dict.has_cell(target):
                raise Exception("The target %s of %s does not exist in the training data." % (target, metric_to_chk))
    loss_conf = conf.loss
    loss_conf['output_layer_id'] = conf.output_layer_id
    loss_conf['answer_column_name'] = conf.answer_column_name
    # loss_fn = eval(loss_conf['type'])(**loss_conf['conf'])
    loss_fn = Loss(**loss_conf)
    if conf.use_gpu:
        loss_fn.cuda()

    ### optimizer
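    # the optimizer class is resolved from its name; when the embedding layer does not store its lookup
    # tables in an nn.ModuleDict, their parameters are fetched via get_parameters() and appended to the
    # parameter list explicitly so that the optimizer updates them as well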
    if isinstance(lm.model, nn.DataParallel):
        if isinstance(lm.model.module.layers['embedding'].embeddings, nn.ModuleDict):
            optimizer = eval(conf.optimizer_name)(list(lm.model.parameters()), **conf.optimizer_params)
        else:
            optimizer = eval(conf.optimizer_name)(
                list(lm.model.parameters()) + list(lm.model.module.layers['embedding'].get_parameters()),
                **conf.optimizer_params)
    else:
        if isinstance(lm.model.layers['embedding'].embeddings, nn.ModuleDict):
            optimizer = eval(conf.optimizer_name)(
                list(lm.model.parameters()), **conf.optimizer_params)
        else:
            optimizer = eval(conf.optimizer_name)(
                list(lm.model.parameters()) + list(lm.model.layers['embedding'].get_parameters()),
                **conf.optimizer_params)

    ## train
    lm.train(optimizer, loss_fn)

    ## test with the best model saved during training
    lm.load_model(conf.model_save_path)
    if conf.test_data_path is not None:
        test_path = conf.test_data_path
    elif conf.valid_data_path is not None:
        test_path = conf.valid_data_path
    logging.info('Testing the best model saved at %s, with %s' % (conf.model_save_path, test_path))
    if not test_path.endswith('pkl'):
        lm.test(loss_fn, test_path, predict_output_path=conf.predict_output_path)
    else:
        lm.test(loss_fn, test_path)
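
For reference, below is a minimal sketch of how main(params) might be driven from the command line. The flag names (--conf_path, --mode, --make_cache_only) are assumptions inferred from the params attributes used above, not the project's actual CLI, which likely defines more options.

if __name__ == "__main__":
    import argparse

    # NOTE: these flags are illustrative assumptions; check the real train.py for the full argument list
    parser = argparse.ArgumentParser(description="Train a model from a configuration file")
    parser.add_argument("--conf_path", type=str, required=True, help="path to the model configuration")
    parser.add_argument("--mode", type=str, default="normal", help="run mode; 'normal' shows preprocessing progress")
    parser.add_argument("--make_cache_only", action="store_true", help="only build the preprocessing cache, then exit")
    params = parser.parse_args()

    main(params)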