# src/lighteval/tasks/extended/lcb/codegen_metrics.py
def grade_call_based(code: str, all_inputs: list, all_outputs: list, fn_name: str, timeout: int) -> list[int | bool]:
    """Grade a call-based code sample against ground-truth input/output pairs.

    The candidate ``code`` is compiled, the function named ``fn_name`` is
    extracted, and it is called once per test case with a per-call
    ``signal.alarm``-based timeout. Grading stops at the first failure.

    Args:
        code: Candidate source code containing the function to evaluate.
        all_inputs: One string per test case; each string holds one JSON
            value per line, forming the positional arguments for the call.
        all_outputs: One JSON-encoded expected return value per test case.
        fn_name: Name of the function to extract from the compiled code.
        timeout: Per-test-case wall-clock limit in seconds (also passed to
            the compilation step).

    Returns:
        A list with one entry per executed test case: ``True``/``False`` for
        pass/fail, ``-3`` for a timeout, ``-4`` for a runtime error. A bare
        ``[-4]`` is returned when compilation or function extraction fails.
    """
    # Prepend the shared import preamble so the candidate code has the
    # standard names available when it is compiled.
    code = import_string + "\n\n" + code
    compiled_sol = compile_code(code, timeout)
    if compiled_sol is None:
        # The code could not be compiled (e.g. malformed text): error code -4.
        return [-4]

    method = get_function(compiled_sol, fn_name)
    if method is None:
        # The target function could not be extracted from the compiled code.
        return [-4]

    # Decode the ground truth: inputs are newline-separated JSON values,
    # outputs are single JSON values.
    all_inputs = [[json.loads(line) for line in inputs.split("\n")] for inputs in all_inputs]
    all_outputs = [json.loads(output) for output in all_outputs]

    total_execution = 0  # cumulative runtime of the graded calls (diagnostic only)
    all_results = []
    for gt_inp, gt_out in zip(all_inputs, all_outputs):
        # Arm a SIGALRM-based timeout for this single call and enable the
        # fault handler so hard crashes produce a traceback.
        signal.alarm(timeout)
        faulthandler.enable()
        try:
            start = time.time()
            prediction = method(*gt_inp)
            total_execution += time.time() - start
            # Cancel the alarm as soon as the timed call returns so the
            # comparison below cannot be interrupted.
            signal.alarm(0)

            # Don't penalize the model if it produces tuples instead of
            # lists; ground-truth sequences are never tuples.
            if isinstance(prediction, tuple):
                prediction = list(prediction)

            tmp_result = prediction == gt_out
            all_results.append(tmp_result)
            if not tmp_result:
                # Stop grading at the first failing test case.
                return all_results
        except Exception as e:
            signal.alarm(0)
            if "timeoutexception" in repr(e).lower():
                all_results.append(-3)  # timed out
            else:
                all_results.append(-4)  # runtime error in the candidate code
            return all_results
        finally:
            # Always clear the alarm and fault handler, even on early return.
            signal.alarm(0)
            faulthandler.disable()
    return all_results