def evaluate()

in src/pixparse/framework/eval.py [0:0]


def evaluate(task: TaskEval, loaders):
    """Run ``task`` over every evaluation loader it accepts and collect metrics.

    Args:
        task: Evaluation task. It must provide ``prepare_for_evaluation`` and
            ``step``; an ``average_metrics`` method is optional.
        loaders: Handed as-is to ``task.prepare_for_evaluation``, which is
            expected to return a mapping of name -> loader wrapper (an object
            exposing the iterable of batches as ``.loader``).

    Returns:
        A dict with one entry per authorized loader. Each entry is either
        ``{batch_index: task.step(batch)}`` or, when the task defines
        ``average_metrics``, ``{"average": task.average_metrics(per_batch)}``.
    """
    # Open design question: should `loaders` be a mapping, or a tuple/dataclass
    # bundling an eval loader with attributes that say which eval tasks are
    # valid for it (i.e. a collated "loader + compatible eval tasks" container)?
    #
    # FIXME (Pablo) not sure this is understood correctly: are the tasks paired
    #  with a loader -training- tasks, or other eval tasks? If they are train
    #  tasks, each train task must have an eval_step, which feels less
    #  general-purpose.
    results = {}
    eligible = task.prepare_for_evaluation(loaders)

    for name, eval_loader in eligible.items():
        # One step result per batch, keyed by the batch's position in the loader.
        per_batch = {
            batch_idx: task.step(batch)
            for batch_idx, batch in enumerate(eval_loader.loader)
        }
        results[name] = per_batch

        if not hasattr(task, 'average_metrics'):
            continue

        # End/finalize hook: the aggregated value replaces the per-batch results.
        results[name] = {"average": task.average_metrics(per_batch)}

    return results