in src/alpaca_eval/completion_parsers.py [0:0]
def single_logprob_parser(top_logprobs: list[dict[str, Any]]) -> float:
    """Turn the top-logprob entries of one position into a preference score.

    This function reads several names that are not defined here and must be
    supplied by the surrounding scope: ``numerator_token``,
    ``denominator_tokens``, ``is_binarize`` and ``completion`` (the latter is
    only used in the warning message).

    Args:
        top_logprobs: Entries that each provide a ``"token"`` and a
            ``"logprob"`` key. Entries whose token is neither the numerator
            token nor one of the denominator tokens are ignored.

    Returns:
        * ``np.nan`` (after logging a warning) when none of the tokens of
          interest appear in ``top_logprobs``.
        * If ``is_binarize`` is truthy: ``1`` when the numerator token's
          logprob is strictly greater than that of every other denominator
          token, otherwise ``2``.
        * Otherwise ``2 - p`` with
          ``p = exp(logprob(numerator) - logsumexp(logprobs of denominator tokens))``.

    NOTE(review): ``2 - p`` only stays within ``[1, 2]`` when
    ``numerator_token`` is itself one of ``denominator_tokens`` -- confirm that
    callers guarantee this.
    """
    # Build the candidates once instead of once per entry of `top_logprobs`.
    # Unpacking (rather than `+ [..]`) also accepts tuples and other sequences.
    candidate_tokens = [*denominator_tokens, numerator_token]
    candidate_set = set(candidate_tokens)
    map_tokens_to_logprobs = {t["token"]: t["logprob"] for t in top_logprobs if t["token"] in candidate_set}

    if not map_tokens_to_logprobs:
        logging.warning(f"Cannot find any logprobs from {candidate_tokens} in {completion}.")
        return np.nan

    # If a token is not present we say its probability is 0 (logprob of -inf),
    # which only makes sense because at least one token is present at this point.
    missing = float("-inf")
    baseline_logprob = map_tokens_to_logprobs.get(numerator_token, missing)

    if is_binarize:
        # In the binary case we only want to know whether the baseline token has a
        # strictly higher logprob than all the other tokens (a tie counts as "not best").
        is_baseline_best = all(
            baseline_logprob > map_tokens_to_logprobs.get(t, missing)
            for t in denominator_tokens
            if t != numerator_token
        )
        return 1 if is_baseline_best else 2

    # The normaliser is only needed for the continuous score, so compute it here.
    denominator_logprob = logsumexp([map_tokens_to_logprobs.get(t, missing) for t in denominator_tokens])
    probability = np.exp(baseline_logprob - denominator_logprob)
    # Probability 1 of preferring the first maps to 1 and probability 0 maps to 2;
    # the output lives between 1 and 2 for historical reasons.
    return 2 - probability