in src/open_r1/rewards.py [0:0]
def get_soft_overlong_punishment(max_completion_len: int, soft_punish_cache: int):
    """
    Build a reward function that penalizes overlong completions.

    The returned function only punishes long completions; it never rewards
    short ones (the best possible score is 0.0). Reference: Eq. (13) from the
    DAPO paper (https://huggingface.co/papers/2503.14476).

    With ``L = len(completion)``, ``L_max = max_completion_len`` and
    ``L_cache = soft_punish_cache`` the reward is:

    * ``0.0``                          if ``L <= L_max - L_cache``
    * ``(L_max - L_cache - L) / L_cache``  if ``L_max - L_cache < L <= L_max``
      (a linear ramp from 0 down to -1)
    * ``-1.0``                         if ``L > L_max``

    Args:
        max_completion_len: Length above which a completion receives the full
            penalty of -1.0.
        soft_punish_cache: Width of the soft-penalty window that ends at
            `max_completion_len`. If set to 0 the window is empty, so the
            reward is a hard cutoff: 0.0 up to `max_completion_len`, -1.0
            beyond it.

    Returns:
        A callable taking `completion_ids` (one list of token ids per
        completion) plus arbitrary ignored keyword arguments, and returning
        one float reward per completion.
    """
    # Loop-invariant: the length at which the penalty ramp begins.
    soft_limit = max_completion_len - soft_punish_cache

    def _punishment(completion_length: int) -> float:
        """Return the penalty for a single completion of the given length."""
        if completion_length <= soft_limit:
            return 0.0
        if completion_length <= max_completion_len:
            # Never reached when soft_punish_cache == 0 (soft_limit equals
            # max_completion_len then), so this cannot divide by zero.
            return (soft_limit - completion_length) / soft_punish_cache
        return -1.0

    def soft_overlong_punishment_reward(completion_ids: list[list[int]], **kwargs) -> list[float]:
        """Reward function that penalizes overlong completions."""
        return [_punishment(len(ids)) for ids in completion_ids]

    return soft_overlong_punishment_reward