def evaluate()

in func/train.py [0:0]


def evaluate(
        train_eval_op,
        data_loaders: dict,
        tb_writer,
        logger,
        epoch: float,  # Can be a partial epoch
        store=True,
        store_endpoint='logits',
        only_run_featext=False):
    """
    Args:
        data_loaders: A dict from key (name) to a data loader. Allows to
            multiple dataloaders for testing on.
        only_run_featext (bool): Set this to true and it will return after the
            features are extracted and won't compute final numbers etc. So
            it will never try to sync processes etc, which leads to crashes.
    """
    all_metric_loggers = {}
    final_accuracies = {}
    for data_key, data_loader in data_loaders.items():
        logger.info('Running evaluation for {0}{1}'.format(
            DATASET_EVAL_CFG_KEY, data_key))
        header = f'[{data_key}] Test:'
        metric_logger = MetricLogger(delimiter='  ',
                                     writer=tb_writer,
                                     stat_set='val' + data_key,
                                     logger=logger)
        all_metric_loggers[data_key] = metric_logger
        this_save_dir = RESULTS_SAVE_DIR + data_key + '/'
        if not only_run_featext:
            # Delete the previously stored output feature files: with H5 they
            # might keep getting appended to and blow up in size. If feature
            # extraction is the goal and appending is desired, the config must
            # be set to not delete the old files, so the new results can be
            # appended to what has already been computed.
            logger.info('Clearing %s/%s/*', os.getcwd(), this_save_dir)
            subprocess.call(f'rm -r {this_save_dir}/*', shell=True)
        for data in metric_logger.log_every(data_loader, 2, header):
            with torch.no_grad():
                data, outputs, losses, accuracies = train_eval_op(
                    data, train_mode=False)
                # Reduce the losses here: they are returned unreduced so that
                # the per-element values can be stored below for analysis.
                losses_reduced = {
                    key: torch.mean(val)
                    for key, val in losses.items()
                }
                loss = torch.sum(torch.stack(list(losses_reduced.values())))
            if store:
                # Store every output endpoint whose key starts with
                # store_endpoint (e.g. logits, logits_regression).
                all_logits = {
                    key: outputs[key].detach().cpu().numpy()
                    for key in outputs if key.startswith(store_endpoint)
                }
                all_logits.update({'idx': data['idx'].detach().cpu().numpy()})
                uid_data = np.array(data['uid'])
                # If strings, convert format to work with HDF5
                if uid_data.dtype.kind == 'U':
                    # Use fixed-length byte strings (up to STR_UID_MAXLEN
                    # chars), which is the format HDF5 can store.
                    assert int(uid_data.dtype.str[2:]) < STR_UID_MAXLEN, (
                        f'Make sure the UID strings are shorter than '
                        f'{STR_UID_MAXLEN} characters, or increase '
                        f'STR_UID_MAXLEN')
                    uid_data = uid_data.astype(f'S{STR_UID_MAXLEN}')
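                    # (On read-back, these fixed-length byte strings can be
                    # decoded again with e.g. uid_arr.astype('U').)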
                all_logits.update({'uid': uid_data})
                # Also store the unreduced per-batch/per-element losses for
                # potential later analysis.
                all_logits.update({
                    'loss/' + key: val.detach().cpu()
                    for key, val in losses.items()
                })
                if not only_run_featext:
                    # store the targets as well
                    all_logits.update({
                        'target/' + key: val.detach().cpu().numpy()
                        for key, val in data['target'].items()
                    })
                # Do the actual storage into HDF5 files, which append to the
                # results from previous batches. Storing here, rather than
                # collecting everything first (as was done previously), keeps
                # this usable for feature extraction, where accumulating into
                # a list would be too expensive.
                all_logits.update({'epoch': np.array([epoch])})
                store_append_h5(all_logits, this_save_dir)
            # FIXME need to take into account that the datasets
            # could have been padded in distributed setup
            batch_size = data_loader.batch_size
            metric_logger.update(loss=loss.item())
            for acc_key, acc_val in accuracies.items():
                metric_logger.meters[acc_key].update(acc_val.item(),
                                                     n=batch_size)
            for loss_name, loss_val in losses_reduced.items():
                metric_logger.meters[loss_name].update(loss_val.item(),
                                                       n=batch_size)
        if not only_run_featext:
            final_accuracies[data_key] = _evaluate_store_logs(
                logger, metric_logger, accuracies.keys(), store, this_save_dir,
                data_key, data_loader, epoch, losses_reduced.keys())

    if only_run_featext:
        # None of the rest is needed
        return 0.0

    # Return the accuracy on the main evaluation dataset, which must be the
    # one registered without any prefix (i.e. the dataset_eval entry), using
    # the accuracy metric most relevant to that dataset (primary_metric).
    main_dataset_key = ''
    main_metric = final_accuracies[main_dataset_key][
        data_loaders[main_dataset_key].dataset.primary_metric]
    return main_metric
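
A minimal usage sketch follows (not from the repo): train_eval_op, val_loader, extra_loader and tb_writer are hypothetical names assumed to be constructed elsewhere. The main evaluation loader must be registered under the empty-string key, since the function reads final_accuracies[''] and that dataset's primary_metric to produce its return value.

# Hypothetical usage sketch; the loaders, the train/eval operator and the
# TensorBoard-style writer are assumed to be constructed elsewhere.
import logging

logger = logging.getLogger(__name__)
data_loaders = {
    '': val_loader,          # main eval set: empty-string key, used for the return value
    '_extra': extra_loader,  # any additional eval sets get their own key
}
main_metric = evaluate(
    train_eval_op,            # callable: (data, train_mode=False) -> (data, outputs, losses, accuracies)
    data_loaders,
    tb_writer,
    logger,
    epoch=5.0,
    store=True,               # append per-batch outputs to HDF5 under RESULTS_SAVE_DIR + data_key
    store_endpoint='logits',  # store all output keys that start with 'logits'
    only_run_featext=False)
logger.info('Primary metric on the main eval set: %s', main_metric)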