def real_eval()

in agents/offline_agents.py


    def real_eval(cls, cache, model, trainer, actions_per_task, task_ids, tier,
                  max_attempts_per_task, cfg):
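        """Rank candidate actions for each task and simulate the top ones.

        Argument roles, as inferred from the body below:
            cache: PHYRE simulation cache, used to draw balanced action sets.
            model: scoring model; moved to GPU before evaluation.
            trainer: provides load_agent_from_folder() and eval_actions().
            actions_per_task: candidate actions to score on every task.
            task_ids: PHYRE task IDs to evaluate on.
            tier: PHYRE tier the tasks belong to.
            max_attempts_per_task: attempts the evaluator may simulate.
            cfg: experiment config (train/eval/data-loader settings).
        """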
        # Parameters
        if cfg.eval.batch_size:
            eval_batch_size = cfg.eval.batch_size
        else:
            eval_batch_size = cfg.train.batch_size * cfg.eval.bs_multiplier
            # Since the eval batch size is scaled up by bs_multiplier, scale
            # the eval data-loader workers down proportionally (keeping at
            # least 16), or memory might blow up
            cfg.eval.data_loader.num_workers = max(
                16,
                cfg.train.data_loader.num_workers // cfg.eval.bs_multiplier)
            logging.warning('Scaling down eval workers to %d',
                            cfg.eval.data_loader.num_workers)
        assert eval_batch_size % cfg.num_gpus == 0, (
            'Eval batch size must be divisible by the number of GPUs')

        model.cuda()
        # Not passing in the drop_objs here, since this simulator is only
        # used for evaluation
        simulator = phyre.initialize_simulator(task_ids, tier)
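        # initialize_simulator should preserve the order of task_ids; the
        # assert below guards this, since task_index is used positionally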
        assert tuple(task_ids) == simulator.task_ids

        # The new evaluation code makes a single prediction, regardless of
        # rollout length
        evaluator = EvaluatorWrapper(simulator, task_ids, 1,
                                     max_attempts_per_task)
        if cfg.eval.store_vis:
            # Subselect a small, diverse set of actions (some that solve the
            # task, some that don't). Use the configured sample count
            # directly, rather than tying it to the batch size, so reported
            # numbers stay consistent across runs.
            store_vis_nsamples = cfg.eval.store_vis_nsamples
            actions_override = None
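            # Optionally score a fixed, user-specified action set instead of
            # sampling a balanced set per task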
            if cfg.eval.store_vis_actions is not None:
                actions_override = np.array(
                    cls.read_actions_override(cfg.eval.store_vis_actions))
                eval_batch_size = len(actions_override)
            task_indices = []
            actions = []
            # Run per task, so the selected action set matches the one used
            # before multi-worker testing was added
            for task_index, task_id in enumerate(
                    tqdm.tqdm(task_ids, 'gen-ing actions for vis')):
                if actions_override is not None:
                    this_actions = actions_override
                else:
                    _, _, this_actions, _, _ = (
                        neural_agent.create_balanced_eval_set(
                            cache, [task_id], store_vis_nsamples, cfg.tier))
                actions.append(this_actions)
                task_indices += [task_index] * len(this_actions)
            task_indices = np.array(task_indices)
            actions = np.concatenate(actions, axis=0)
        else:
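            # Standard path: build the full cross product, scoring every
            # candidate action on every task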
            task_indices = np.repeat(np.arange(len(task_ids)),
                                     len(actions_per_task))
            actions = np.concatenate([actions_per_task] * len(task_ids),
                                     axis=0)
        logging.info('Ranking %d actions and simulating top %d',
                     len(actions) // len(task_ids), max_attempts_per_task)
        assert len(task_indices) == len(actions)
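        # Optionally load an object-space forward model; PhyreDataset takes
        # it below (presumably to roll out predicted states for its inputs)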
        if cfg.train.data_loader.fwd_model.use_obj_fwd_model:
            obj_fwd_model = obj_fwd_agent.ObjTrainer.gen_model(cfg)
            if cfg.train.data_loader.fwd_model.weights is not None:
                obj_fwd_model = trainer.load_agent_from_folder(
                    obj_fwd_model, cfg.train.data_loader.fwd_model.weights)
            obj_fwd_model = obj_fwd_model.module.cpu()
        else:
            obj_fwd_model = None
        dataset = PhyreDataset(
            tier,
            task_ids,
            task_indices,
            # This info not needed for test case
            torch.LongTensor([0] * len(task_indices)),
            actions,
            cfg.simulator,
            mode='test',
            balance_classes=False,
            hard_negatives=False,
            init_clip_ratio_to_sim=cfg.eval.init_clip_ratio_to_sim,
            init_frames_to_sim=cfg.eval.init_frames_to_sim,
            frames_per_clip=cfg.eval.frames_per_clip,
            n_hist_frames=cfg.eval.n_hist_frames,
            drop_objs=cfg.eval.drop_objs,
            obj_fwd_model=obj_fwd_model,
        )
        # res_actions may be different from actions since the last batch
        # might be smaller than the others, and we might end up dropping it
        res_scores, res_actions, res_indices, res_pixel_accs = (
            trainer.eval_actions(model, dataset, len(actions), eval_batch_size,
                                 cfg))
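        # res_indices maps each scored action back to its task index;
        # res_scores is indexed [prediction, action] (hence the [:, mask]
        # slicing below)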
        for task_index, _ in enumerate(task_ids):
            mask = (res_indices == task_index)
            # When store_vis, the actions are selected differently, so this
            # assertion would not hold
            assert (cfg.eval.store_vis
                    or (np.sum(mask) == (len(actions) // len(task_ids))))
            if np.sum(mask) == 0:
                logging.warning('Missing task %s from evaluation!',
                                task_ids[task_index])
                continue
            evaluator.wrapper_add_scores(task_index, res_scores[:, mask],
                                         res_actions[mask])

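        # Report pixel-accuracy stats (per movable channel) collected
        # during eval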
        cls.print_pixel_accs_summary([res_pixel_accs],
                                     cfg.phyre_movable_channels)
        return evaluator
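
For intuition, the non-visualization branch scores a task x action cross
product and then regroups the flat score array per task via res_indices. A
minimal standalone numpy sketch of that bookkeeping (all names, shapes, and
the random "scores" here are illustrative, not part of the module):

    import numpy as np

    task_ids = ['task:0', 'task:1', 'task:2']
    actions_per_task = np.random.rand(5, 3)  # 5 candidate 3-dim actions

    # Cross product: every candidate action is paired with every task.
    task_indices = np.repeat(np.arange(len(task_ids)), len(actions_per_task))
    actions = np.concatenate([actions_per_task] * len(task_ids), axis=0)
    assert len(task_indices) == len(actions) == 15

    # Stand-in for trainer.eval_actions: one prediction row of scores.
    res_scores = np.random.rand(1, len(actions))
    res_indices = task_indices  # assuming no trailing batch was dropped

    # Regroup per task, mirroring the loop at the end of real_eval.
    for task_index, task_id in enumerate(task_ids):
        mask = res_indices == task_index
        top = np.argsort(-res_scores[0, mask])[:2]  # e.g. keep top-2 attempts
        print(task_id, 'best actions:\n', actions[mask][top])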