in src/lighteval/metrics/utils/extractive_match_utils.py [0:0]
def lazy_latex_regex(latex_config: LatexExtractionConfig, language: Language) -> list[tuple[re.Pattern[str], int]]:
    """Build the compiled regexes used to locate LaTeX answers in free text.

    Each entry pairs a compiled pattern with an integer priority. Going by the
    priorities assigned below (0 for the most specific anchor, 300 for bare
    LaTeX), a smaller number is meant to be a stronger match; how the caller
    consumes the priorities is not visible from this function.

    Args:
        latex_config: Extraction settings. ``try_extract_without_anchor``
            enables the un-anchored LaTeX pattern, and a non-negative
            ``boxed_match_priority`` enables the boxed patterns and is used as
            their priority.
        language: Selects the entry of ``TRANSLATION_LITERALS`` that supplies
            the localized "and", "or", colon and "answer" literals.

    Returns:
        A list of ``(pattern, priority)`` tuples, every pattern compiled with
        ``re.DOTALL``.
    """
    literals = TRANSLATION_LITERALS[language]

    # A chain is one leading LaTeX environment followed by up to five optional
    # ones, each introduced by the localized "and"/"or" word or a comma.
    head_env = make_latex_env_pattern("first_")
    and_word = literals.and_word
    or_word = literals.or_word
    joiner = rf"\s*(?:{and_word}|{or_word}|,)\s*"
    tail_envs = "".join(
        "(?:" + joiner + make_latex_env_pattern(f"next{idx}_") + ")?" for idx in range(1, 6)
    )
    chained_envs = rf"(?:{head_env}{tail_envs})"

    # Accept both the localized colon and the ASCII one.
    colon_class = rf"[{re.escape(literals.colon)}\:]"
    answer_word = rf"(?i:{literals.answer})"

    candidates: list[tuple[str, int]] = []

    # English-only anchors built around the phrase "final answer ... is".
    if language == Language.ENGLISH:
        candidates += [
            (rf"(?i:final answer is)\:?\s*{chained_envs}\.?\s?I hope", 0),
            (rf"(?i:final answer.{{0,100}}?)\s+is\:?\s*{chained_envs}", 50),
        ]

    # Localized answer word, first with an explicit colon, then without one.
    # Both rank above the un-anchored LaTeX pattern.
    candidates.append((rf"{answer_word}\s?{colon_class}.{{0,50}}?{chained_envs}", 100))
    candidates.append((f"{answer_word}.{{0,50}}?{chained_envs}", 200))

    # Bare LaTeX with no surrounding anchor: the weakest match, opt-in only.
    if latex_config.try_extract_without_anchor:
        candidates.append((chained_envs, 300))

    # Boxed expressions are only searched for when the configured priority is
    # non-negative; both boxed patterns share that priority.
    boxed_priority = latex_config.boxed_match_priority
    if boxed_priority >= 0:
        boxed_head = make_latex_env_pattern(prefix="first_", context="boxed")
        boxed_tail = "".join(
            "(?:" + joiner + make_latex_env_pattern(f"next{idx}_", context="boxed") + ")?"
            for idx in range(1, 6)
        )
        candidates.append((rf"{boxed_head}{boxed_tail}", boxed_priority))
        # Plain \boxed{...} fallback. A regex cannot tell where the box really
        # closes, so the greedy `.+` (under DOTALL) runs up to the last `}` it
        # can reach; the actual extraction is done in the normalization step.
        candidates.append((r"(?P<first_latexBoxed>\\boxed{.+})", boxed_priority))

    return [(re.compile(source, re.DOTALL), rank) for source, rank in candidates]