def evaluation()

in anli/src/nli/evaluation.py [0:0]


def evaluation(args):
    """Evaluate a saved sequence-classification checkpoint on NLI datasets.

    The function resolves the dataset / checkpoint / output locations from
    ``args``, rebuilds the tokenizer and model described by
    ``MODEL_CLASSES[args.model_class_name]``, loads the checkpoint weights,
    runs ``evaluation_dataset`` once per evaluation set and finally writes the
    predictions and a summary to ``args.output_prediction_path``.

    Args:
        args: Namespace-like object. Attributes read here: ``cpu``,
            ``eval_data``, ``model_class_name``, ``train_data``,
            ``train_mode``, ``max_length``, ``train_with_lm``,
            ``per_gpu_eval_batch_size`` and ``flip_sent``. ``eval_data`` is
            either a single key of ``data_store`` or a comma-separated list
            of ``name:path`` entries.

            The object is mutated in place: ``global_rank``, ``eval_data``,
            ``model_checkpoint_path`` and ``output_prediction_path`` are
            (re)assigned.

    Returns:
        The ``r_dict`` handed to ``evaluation_dataset`` for every evaluation
        set, with the ``"predictions"`` entry of each value removed after the
        predictions have been written to disk.
    """
    args.global_rank = -1 if args.cpu else 0

    ## Resolve paths
    # Keep the raw user-supplied value: it names the output directory below.
    eval_data = args.eval_data
    if ":" not in args.eval_data:
        args.eval_data = (
            f"{args.eval_data}:{Path(data_base_loc) / data_store[args.eval_data]}"
        )
    args.model_checkpoint_path = (
        Path(model_base_loc)
        / model_store[args.model_class_name][args.train_data][args.train_mode]
        / "model.pt"
    )
    args.output_prediction_path = (
        Path("outputs")
        / args.model_class_name
        / f"{args.train_data}_train_{args.train_mode}_dev_{eval_data}"
    )

    model_checkpoint_path = args.model_checkpoint_path
    # we are doing NLI so we set num_labels = 3, for other task we can change this value.
    num_labels = 3

    max_length = args.max_length

    model_class_item = MODEL_CLASSES[args.model_class_name]
    model_name = model_class_item["model_name"]
    do_lower_case = model_class_item.get("do_lower_case", False)

    tokenizer = model_class_item["tokenizer"].from_pretrained(
        model_name,
        cache_dir=str(config.PRO_ROOT / "trans_cache"),
        do_lower_case=do_lower_case,
    )

    model = model_class_item["sequence_classification"].from_pretrained(
        model_name,
        cache_dir=str(config.PRO_ROOT / "trans_cache"),
        num_labels=num_labels,
    )

    # Always deserialize onto the CPU: a checkpoint saved from a GPU would
    # otherwise fail to load when evaluating with ``args.cpu`` on a machine
    # without CUDA. The model is moved to the GPU further down if requested.
    # NOTE: torch.load unpickles the file; only load trusted checkpoints.
    state_dict = torch.load(model_checkpoint_path, map_location="cpu")
    if args.train_with_lm:
        # Attach the LM head before loading so that any LM-head weights in the
        # checkpoint have a matching module.
        model.lm_head = RobertaLMHead(model.config)

    # strict=False: tolerate key mismatches between checkpoint and model.
    model.load_state_dict(state_dict, strict=False)

    padding_token_value = tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0]
    padding_segement_value = model_class_item["padding_segement_value"]
    padding_att_value = model_class_item["padding_att_value"]
    left_pad = model_class_item.get("left_pad", False)

    batch_size_per_gpu_eval = args.per_gpu_eval_batch_size

    eval_data_name = []
    eval_data_list = []

    for named_path in args.eval_data.split(","):
        # Split on the first ":" only, so the path part may itself contain ":".
        name, sep, path = named_path.partition(":")
        if not sep:
            # No explicit name given: use the entry as both name and path
            # instead of silently chopping off its last character.
            path = named_path
        if name in registered_path:
            d_list = common.load_jsonl(registered_path[name])
        else:
            d_list = common.load_jsonl(path)
        eval_data_name.append(name)
        eval_data_list.append(d_list)

    batching_schema = {
        "uid": RawFlintField(),
        "y": LabelFlintField(),
        "input_ids": ArrayIndexFlintField(
            pad_idx=padding_token_value, left_pad=left_pad
        ),
        "token_type_ids": ArrayIndexFlintField(
            pad_idx=padding_segement_value, left_pad=left_pad
        ),
        "attention_mask": ArrayIndexFlintField(
            pad_idx=padding_att_value, left_pad=left_pad
        ),
    }

    if args.flip_sent:
        print("Flipping hypothesis and premise")
        data_transformer = FlippedNLITransform(model_name, tokenizer, max_length)
    else:
        data_transformer = NLITransform(model_name, tokenizer, max_length)

    eval_data_loaders = []
    for eval_d_list in eval_data_list:
        # Only the dataloader is needed here; dataset and sampler are unused.
        _, _, d_dataloader = build_eval_dataset_loader_and_sampler(
            eval_d_list, data_transformer, batching_schema, batch_size_per_gpu_eval
        )
        eval_data_loaders.append(d_dataloader)

    if not args.cpu:
        torch.cuda.set_device(0)
        model.cuda(0)

    r_dict = dict()
    # Eval loop:
    for cur_eval_data_name, cur_eval_data_list, cur_eval_dataloader in zip(
        eval_data_name, eval_data_list, eval_data_loaders
    ):
        evaluation_dataset(
            args,
            cur_eval_dataloader,
            cur_eval_data_list,
            model,
            r_dict,
            eval_name=cur_eval_data_name,
        )

    # save prediction:
    if args.output_prediction_path is not None:
        cur_results_path = Path(args.output_prediction_path)
        cur_results_path.mkdir(parents=True, exist_ok=True)
        for key, item in r_dict.items():
            common.save_jsonl(item["predictions"], cur_results_path / f"{key}.jsonl")

        # avoid saving too many things
        for item in r_dict.values():
            del item["predictions"]
        common.save_json(r_dict, cur_results_path / "results_dict.json", indent=2)

    return r_dict