def correctness_reward()

in train_rick/run_grpo.py [0:0]


def correctness_reward(completions, solutions, **kwargs):
    """Score each completion 1.0 if it contains any accepted answer, else 0.0.

    Matching is a plain, case-sensitive substring test of each accepted
    answer against the completion text.

    Args:
        completions: One entry per sample. Each entry is either a plain
            string, or a sequence of message dicts whose first element holds
            the generated text under the ``"content"`` key.
        solutions: One entry per sample, aligned with ``completions``. Each
            entry is an iterable of accepted answer strings; a bare string
            is treated as a single accepted answer.
        **kwargs: Accepted for call-site compatibility and ignored.

    Returns:
        A list of floats with one reward per (completion, solution) pair.
        Pairs are formed with ``zip``, so the result is as long as the
        shorter of the two inputs.
    """
    rewards = []
    for completion, ground_truths in zip(completions, solutions):
        # Accept both plain-text completions and message-list completions.
        if isinstance(completion, str):
            content = completion
        else:
            content = completion[0]["content"]
        # A bare string would otherwise be iterated character by character,
        # rewarding any completion that contains a single matching character.
        if isinstance(ground_truths, str):
            ground_truths = [ground_truths]
        # Generator form lets any() stop at the first matching answer.
        matched = any(ground_truth in content for ground_truth in ground_truths)
        rewards.append(1.0 if matched else 0.0)
    return rewards