def evaluation()

in src/nli/evaluation.py


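Note: this excerpt relies on the module-level imports of the surrounding file (argparse, torch, pathlib.Path, config, common, MODEL_CLASSES, registered_path, the Flint field classes, NLITransform, build_eval_dataset_loader_and_sampler, and evaluation_dataset).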
def evaluation():
    parser = argparse.ArgumentParser()
    parser.add_argument("--cpu", action="store_true", help="If set, we only use CPU.")
    parser.add_argument(
        "--model_class_name",
        type=str,
        help="Set the model class of the experiment.",
        required=True
    )

    parser.add_argument(
        "--model_checkpoint_path",
        type=str,
        help='Set the path to the model checkpoint to be evaluated.', required=True)

    parser.add_argument(
        "--output_prediction_path",
        type=str,
        default=None,
        help='Set the path to save the prediction.')

    parser.add_argument(
        "--per_gpu_eval_batch_size", default=16, type=int, help="Batch size per GPU/CPU for evaluation.",
    )

    parser.add_argument("--max_length", default=156, type=int, help="Max length of the sequences.")

    parser.add_argument("--eval_data",
                        type=str,
                        help="The training data used in the experiments.")

    args = parser.parse_args()

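    # A global_rank of -1 signals CPU-only execution to downstream helpers;
    # 0 means single-process, single-GPU evaluation.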
    if args.cpu:
        args.global_rank = -1
    else:
        args.global_rank = 0

    model_checkpoint_path = args.model_checkpoint_path
    num_labels = 3  # we are doing NLI, so num_labels = 3; change this value for other tasks.

    max_length = args.max_length

    model_class_item = MODEL_CLASSES[args.model_class_name]
    model_name = model_class_item['model_name']
    do_lower_case = model_class_item['do_lower_case'] if 'do_lower_case' in model_class_item else False

    tokenizer = model_class_item['tokenizer'].from_pretrained(model_name,
                                                              cache_dir=str(config.PRO_ROOT / "trans_cache"),
                                                              do_lower_case=do_lower_case)

    model = model_class_item['sequence_classification'].from_pretrained(model_name,
                                                                        cache_dir=str(config.PRO_ROOT / "trans_cache"),
                                                                        num_labels=num_labels)

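    # from_pretrained above builds the architecture with base weights; the fine-tuned
    # checkpoint overwrites them here. map_location="cpu" keeps --cpu mode working
    # with GPU-saved checkpoints; the model is moved to the GPU below when requested.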
    model.load_state_dict(torch.load(model_checkpoint_path, map_location="cpu"))

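    # Per-model padding values for input ids, token-type ids, and attention mask;
    # left_pad covers models that expect left-side padding.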
    padding_token_value = tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0]
    padding_segement_value = model_class_item["padding_segement_value"]
    padding_att_value = model_class_item["padding_att_value"]
    left_pad = model_class_item['left_pad'] if 'left_pad' in model_class_item else False

    batch_size_per_gpu_eval = args.per_gpu_eval_batch_size

    eval_data_str = args.eval_data
    eval_data_name = []
    eval_data_path = []
    eval_data_list = []

    eval_data_named_path = eval_data_str.split(',')

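    # --eval_data example (hypothetical names/paths):
    #   "anli_r1_dev:data/anli/r1/dev.jsonl,anli_r2_dev:data/anli/r2/dev.jsonl"
    # A name found in registered_path loads from the registry; otherwise the
    # given path is loaded directly.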
    for named_path in eval_data_named_path:
        ind = named_path.find(':')
        name = named_path[:ind]
        # Bug fix: slice the full "name:path" string, not the name, to get the path.
        path = named_path[ind + 1:]
        if name in registered_path:
            d_list = common.load_jsonl(registered_path[name])
        else:
            d_list = common.load_jsonl(path)
        eval_data_name.append(name)
        eval_data_path.append(path)
        eval_data_list.append(d_list)

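    # Map each batch feature to a Flint field that defines its padding/collation rule.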
    batching_schema = {
        'uid': RawFlintField(),
        'y': LabelFlintField(),
        'input_ids': ArrayIndexFlintField(pad_idx=padding_token_value, left_pad=left_pad),
        'token_type_ids': ArrayIndexFlintField(pad_idx=padding_segement_value, left_pad=left_pad),
        'attention_mask': ArrayIndexFlintField(pad_idx=padding_att_value, left_pad=left_pad),
    }

    data_transformer = NLITransform(model_name, tokenizer, max_length)
    eval_data_loaders = []
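    # Build one DataLoader per evaluation set, sharing the transform and batching schema.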
    for eval_d_list in eval_data_list:
        d_dataset, d_sampler, d_dataloader = build_eval_dataset_loader_and_sampler(eval_d_list, data_transformer,
                                                                                   batching_schema,
                                                                                   batch_size_per_gpu_eval)
        eval_data_loaders.append(d_dataloader)

    if not args.cpu:
        torch.cuda.set_device(0)
        model.cuda(0)

    r_dict = dict()
    # Eval loop: run each evaluation set through the model, accumulating
    # metrics and predictions in r_dict keyed by dataset name.
    for cur_eval_data_name, cur_eval_data_list, cur_eval_dataloader in zip(
            eval_data_name, eval_data_list, eval_data_loaders):
        evaluation_dataset(args, cur_eval_dataloader, cur_eval_data_list, model, r_dict,
                           eval_name=cur_eval_data_name)

    # Save predictions:
    if args.output_prediction_path is not None:
        cur_results_path = Path(args.output_prediction_path)
        cur_results_path.mkdir(parents=True, exist_ok=True)
        for key, item in r_dict.items():
            common.save_jsonl(item['predictions'], cur_results_path / f"{key}.jsonl")

        # Drop the per-example predictions before dumping the summary,
        # so the results file stays small.
        for item in r_dict.values():
            del item['predictions']
        common.save_json(r_dict, cur_results_path / "results_dict.json", indent=2)
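
Example invocation (a sketch: the model class key, checkpoint path, and dataset
name/path below are placeholders; valid registered names depend on registered_path):

    python src/nli/evaluation.py \
        --model_class_name roberta-large \
        --model_checkpoint_path saved_models/model.pt \
        --eval_data anli_r1_dev:data/anli/r1/dev.jsonl \
        --per_gpu_eval_batch_size 16 \
        --output_prediction_path pred_output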