def grade_call_based()

in src/lighteval/tasks/extended/lcb/codegen_metrics.py [0:0]


def grade_call_based(code: str, all_inputs: list, all_outputs: list, fn_name: str, timeout: int) -> list[int | bool]:
    """Grade a call-based solution by invoking ``fn_name`` on every test case.

    The module-level ``import_string`` is prepended to ``code``, the result is
    compiled with ``compile_code`` and the function named ``fn_name`` is
    looked up with ``get_function``. The function is then called once per test
    case, and grading stops at the first test case that does not pass.

    Args:
        code: Source code of the candidate solution.
        all_inputs: One string per test case; each line of the string is a
            JSON document that becomes one positional argument of the call.
        all_outputs: One JSON string per test case holding the expected
            return value.
        fn_name: Name of the function to extract from the compiled code.
        timeout: Value passed to ``signal.alarm`` before each call (seconds),
            and forwarded to ``compile_code``.

    Returns:
        ``[-4]`` if the code cannot be compiled or the function cannot be
        extracted. Otherwise a list with one entry per executed test case:
        ``True`` for a pass, and as the final entry either ``False`` (wrong
        answer), ``-3`` (an exception whose ``repr`` contains
        "timeoutexception", case-insensitive) or ``-4`` (any other exception).
        Because grading stops at the first failure, the list may be shorter
        than the number of test cases.

    Raises:
        json.JSONDecodeError: If an input line or an expected output is not
            valid JSON; parsing happens outside the per-test ``try`` block.
    """
    # need to wrap in try-catch logic after to catch the correct errors, but for now this is fine.
    code = import_string + "\n\n" + code
    compiled_sol = compile_code(code, timeout)
    if compiled_sol is None:
        # The code could not be compiled as the text was maybe malformed, return [-4] error code.
        return [-4]

    method = get_function(compiled_sol, fn_name)
    if method is None:
        # The function to evaluate could not be extracted from the code, return [-4] error code.
        return [-4]

    all_inputs = [[json.loads(line) for line in inputs.split("\n")] for inputs in all_inputs]
    all_outputs = [json.loads(output) for output in all_outputs]

    all_results = []
    for gt_inp, gt_out in zip(all_inputs, all_outputs):
        # NOTE(review): relies on a SIGALRM handler installed elsewhere that
        # presumably raises a TimeoutException - confirm against the module setup.
        signal.alarm(timeout)
        faulthandler.enable()
        try:
            prediction = method(*gt_inp)
            signal.alarm(0)

            # don't penalize model if it produces tuples instead of lists
            # ground truth sequences are not tuples
            if isinstance(prediction, tuple):
                prediction = list(prediction)

            # Coerce before recording anything: a comparison whose truth value
            # is ambiguous (e.g. an element-wise array) must be reported as a
            # single -4 rather than leaving a non-bool entry in the results.
            passed = bool(prediction == gt_out)
        except Exception as e:
            signal.alarm(0)
            all_results.append(-3 if "timeoutexception" in repr(e).lower() else -4)
            return all_results
        finally:
            # Always disarm the alarm and restore faulthandler, including on
            # the early-return path above.
            signal.alarm(0)
            faulthandler.disable()

        all_results.append(passed)
        if not passed:
            # Stop at the first wrong answer.
            return all_results

    return all_results