in evals/elsuite/function_deduction/eval.py [0:0]
def _get_complexity_tests(self, metrics):
solved = [x["complexity"] for x in metrics if x["success"]]
not_solved = [x["complexity"] for x in metrics if not x["success"]]
result = {
"solved_avg_complexity": sum(solved) / len(solved) if solved else None,
"not_solved_avg_complexity": sum(not_solved) / len(not_solved) if not_solved else None,
}
# This tests if solved have lower complexity than non-solved
if solved and not_solved:
_, p_value = scipy.stats.mannwhitneyu(solved, not_solved, alternative="less")
else:
p_value = None
result["solved_or_not_mann_whitney_u_p_value"] = p_value
# TODO: add more complexity-related metrics, such as correlation or linear regression coefficient.
# Leaving this for the future because we might want to change how the complexity is calculated,
# or generally improve the concept somehow.
return result