def evaluate_data()

in ml/eval/reward_eval.py


from typing import Any, Dict, List

import torch
from tqdm import tqdm


def evaluate_data(args, model, tokenizer, eval_data_list_dict) -> List[Dict[str, Any]]:
    """
    Evaluate the dataset using the reward model.
    """
    reward_output_fn = get_reward_output_fn(args.reward_output_fmt, args.apply_sigmoid_to_reward)
    pbar = tqdm(total=len(eval_data_list_dict), desc="Evaluating Rewards")
    rewards_list = []

    for idx in range(0, len(eval_data_list_dict), args.per_device_batch_size):
        batch_list_dict = eval_data_list_dict[idx:idx+args.per_device_batch_size]

        # Build prompt-response pairs; fall back to instruction-style
        # formatting when no explicit prompt field is present
        if 'prompt' in batch_list_dict[0]:
            batch_full_outputs = [f"{l['prompt']} {l['output']}" for l in batch_list_dict]
        else:
            batch_full_outputs = [
                f"Below is an instruction: {l['instruction']} Response: {l['output']}"
                for l in batch_list_dict
            ]

        # Tokenize responses and move the batch to the model's device
        encoded_full_responses = tokenizer(batch_full_outputs, return_tensors="pt", padding=True, truncation=True)
        encoded_full_responses = encoded_full_responses.to(model.device)

        # Compute rewards with a single forward pass (no gradients needed)
        with torch.inference_mode():
            reward_outputs = model(**encoded_full_responses)
            rewards = reward_output_fn(reward_outputs.logits)
            rewards_list.extend(rewards)

        pbar.update(len(batch_list_dict))

    pbar.close()

    # Attach reward scores to the original data records
    for i, data in enumerate(eval_data_list_dict):
        data['reward'] = rewards_list[i]

    return eval_data_list_dict
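
A hypothetical usage sketch, assuming a HuggingFace sequence-classification reward model and an args namespace carrying the three fields the function reads (reward_output_fmt, apply_sigmoid_to_reward, per_device_batch_size). The checkpoint name, format key, and data below are placeholders, not taken from this repository:

from argparse import Namespace

from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Placeholder reward-model checkpoint; substitute your own
model_name = "OpenAssistant/reward-model-deberta-v3-large-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.eval()

args = Namespace(
    reward_output_fmt="0",        # assumed format key; see get_reward_output_fn
    apply_sigmoid_to_reward=False,
    per_device_batch_size=8,
)

eval_data = [
    {"prompt": "What is the capital of France?", "output": "Paris."},
    {"prompt": "Name a prime number.", "output": "Seven is prime."},
]

scored = evaluate_data(args, model, tokenizer, eval_data)
print(scored[0]["reward"])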
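
The function relies on get_reward_output_fn, which is defined elsewhere in ml/eval/reward_eval.py and not shown here. A minimal sketch of what such a helper might look like, assuming the format key selects how the model's logits are reduced to one scalar reward per example; the keys "0" and "1-0" are illustrative guesses, not the module's actual options:

from typing import Callable

import torch


def get_reward_output_fn(reward_output_fmt: str, apply_sigmoid: bool) -> Callable[[torch.Tensor], list]:
    # Map an output-format key to a reduction over the logits.
    # These keys are illustrative; the real module may differ.
    if reward_output_fmt == "0":
        reduce_fn = lambda logits: logits[:, 0]                   # first logit as the score
    elif reward_output_fmt == "1-0":
        reduce_fn = lambda logits: logits[:, 1] - logits[:, 0]    # preference margin
    else:
        reduce_fn = lambda logits: logits.squeeze(-1)             # single-logit reward head

    def reward_output_fn(logits: torch.Tensor) -> list:
        rewards = reduce_fn(logits)
        if apply_sigmoid:
            rewards = torch.sigmoid(rewards)
        return rewards.tolist()                                   # plain floats, JSON-friendly
    return reward_output_fn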