def lazy_latex

def lazy_latex_regex()

in src/lighteval/metrics/utils/extractive_match_utils.py [0:0]
35 lines of code
7 McCabe index (conditional complexity)

def lazy_latex_regex(latex_config: LatexExtractionConfig, language: Language) -> list[tuple[re.Pattern[str], int]]:
    """Build the compiled regexes used to locate LaTeX answers in model output.

    The patterns are registered in a fixed order, each paired with an integer
    priority:

    * English only: ``final answer is <latex> ... I hope`` (0) and
      ``final answer ... is <latex>`` (50).
    * ``<answer word> <colon> ... <latex>`` (100) and
      ``<answer word> ... <latex>`` (200).
    * Bare ``<latex>`` with no anchor (300), only when
      ``latex_config.try_extract_without_anchor`` is truthy.
    * Boxed expressions, only when ``latex_config.boxed_match_priority >= 0``;
      both boxed patterns use that value as their priority.

    ``<latex>`` stands for one LaTeX environment followed by up to five more,
    each joined by the language's "and" word, "or" word, or a comma.

    Args:
        latex_config: Extraction settings. Only ``try_extract_without_anchor``
            and ``boxed_match_priority`` are read here.
        language: Key into ``TRANSLATION_LITERALS`` selecting the localized
            "and" / "or" / colon / "answer" literals.

    Returns:
        ``(pattern, priority)`` pairs in registration order, every pattern
        compiled with ``re.DOTALL``. Judging by the values assigned, a smaller
        number appears to mean a more preferred match; the consumer of this
        list decides how the priority is actually used.
    """
    literals = TRANSLATION_LITERALS[language]
    and_word = literals.and_word
    or_word = literals.or_word

    # Leading environment plus up to five optional follow-ups connected by and/or/comma.
    head_env = make_latex_env_pattern("first_")
    tail_parts = []
    for idx in range(1, 6):
        follow_up_env = make_latex_env_pattern(f"next{idx}_")
        tail_parts.append(rf"(?:\s*(?:{and_word}|{or_word}|,)\s*{follow_up_env})?")
    tail_envs = "".join(tail_parts)
    chained_latex = rf"(?:{head_env}{tail_envs})"

    colon_class = rf"[{re.escape(literals.colon)}\:]"
    answer_word = rf"(?i:{literals.answer})"

    patterns: list[tuple[str, int]] = []

    # English-specific "final answer" phrasings come first.
    if language == Language.ENGLISH:
        patterns.append((rf"(?i:final answer is)\:?\s*{chained_latex}\.?\s?I hope", 0))
        patterns.append((rf"(?i:final answer.{{0,100}}?)\s+is\:?\s*{chained_latex}", 50))

    # Anchored on the localized answer word: with a colon first, then without.
    patterns.append((rf"{answer_word}\s?{colon_class}.{{0,50}}?{chained_latex}", 100))
    patterns.append((f"{answer_word}.{{0,50}}?{chained_latex}", 200))

    # Unanchored LaTeX is opt-in and gets the largest priority number.
    if latex_config.try_extract_without_anchor:
        patterns.append((chained_latex, 300))

    boxed_priority = latex_config.boxed_match_priority
    if boxed_priority >= 0:
        # Same head-plus-follow-ups chain, built from the "boxed" context patterns.
        boxed_head = make_latex_env_pattern(prefix="first_", context="boxed")
        boxed_tail = "".join(
            rf"(?:\s*(?:{and_word}|{or_word}|,)\s*{make_latex_env_pattern(f'next{idx}_', context='boxed')})?"
            for idx in range(1, 6)
        )
        patterns.append((rf"{boxed_head}{boxed_tail}", boxed_priority))

        # Plain \boxed{...}: a regex cannot tell where the braces balance, so this
        # greedily captures up to the last "}". The actual extraction is left to
        # the normalization step.
        patterns.append((r"(?P<first_latexBoxed>\\boxed{.+})", boxed_priority))

    compiled: list[tuple[re.Pattern[str], int]] = []
    for source, priority in patterns:
        compiled.append((re.compile(source, re.DOTALL), priority))
    return compiled