in notebooks/utils.py [0:0]
def get_epic_marginalize_late_fuse(
run_infos,
weights=1.0,
dataset_key_suffix=DATASET_EVAL_CFG_KEY_SUFFIX,
uid_key='uid',
eventual_fname='seen.json',
normalize_before_combine=None):
"""
Args:
eventual_fname: This is used to read prepackaged outputs from result
files, and using the filename to know which file to look for
when a directory is passed in as run info.
normalize_before_combine: Set to non-None to normalize the features
by that p-norm, and then combine. So the weights would have to be
defined w.r.t normalized features.
"""
all_scores = []
all_datasets = []
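    # Each element appended to all_scores is a (verb, noun, action) sequence of
    # {uid: scores} dicts, one per run_info; all_datasets keeps the dataset
    # object read for each full run.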
for run_info_id, run_info in enumerate(run_infos):
if isinstance(run_info[0], dict):
            # These are likely pre-computed scores (e.g. from a nested
            # get_epic_marginalize.. call), so use them as-is.
scores = run_info
elif os.path.isdir(run_info[0]):
            assert len(all_datasets) > 0, (
                'Need at least 1 dataset to be read before reading from json, '
                'to figure out the verb/noun -> action_id mapping and the '
                'total number of classes to generate the feature vectors')
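            # Read pre-packaged scores from the JSON in this directory, reusing
            # the last dataset's verb/noun -> action mapping and the per-space
            # score dimensions from the previously read scores.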
scores = load_json(
os.path.join(run_info[0], eventual_fname),
all_datasets[-1].verb_noun_to_action,
[list(el.values())[0].shape[-1] for el in all_scores[-1]])
elif run_info[0].endswith('.pkl'):
            # This input is used to read predictions dumped to pkl by the
            # action_banks codebase, which are read here for late fusion.
scores = read_scores_from_pkl(run_info[0])
            assert len(all_datasets) > 0, (
                'Need at least 1 dataset to be read before reading from pkl, '
                'to attach UIDs to the scores')
scores = _concat_with_uids(scores, all_datasets[-1], uid_key)
else:
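            # Full run: compute the verb/noun-marginalized scores from the
            # run's results, and keep its dataset around for the UID/class
            # mappings needed by the directory/pkl branches above.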
accuracies, scores, dataset = get_epic_marginalize_verb_noun(
run_info, dataset_key_suffix=dataset_key_suffix)
scores = _concat_with_uids(scores, dataset, uid_key)
print_accuracies_epic(accuracies, prefix=run_info)
all_datasets.append(dataset)
if normalize_before_combine is not None:
scores = _normalize_scores(scores, p=normalize_before_combine)
logging.warning(
'Adding scores from run_info %d with avg action L1 norm of %f',
run_info_id, _get_avg_norm_scores(scores[-1], p=1))
all_scores.append(scores)
# Late fuse
if isinstance(weights, float):
weights = [weights] * len(run_infos)
else:
assert len(weights) == len(run_infos)
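    # At this point weights holds exactly one scalar per run_info.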
# broadcastable_weights = np.array(weights)[:, np.newaxis, np.newaxis]
    # Combine the scores by summing the weighted score for each UID.
combined = []
for space_id in range(3): # verb/noun/action
scores_for_space = [scores[space_id] for scores in all_scores]
        # Take the union of all the UIDs we have scores for
total_uids = set.union(*[set(el.keys()) for el in scores_for_space])
logging.warning('Combined UIDs: %d. UIDs in the runs %s',
len(total_uids),
[len(el.keys()) for el in scores_for_space])
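        # Sum the weighted scores over the runs that have each UID; runs that
        # are missing a UID simply do not contribute to it.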
combined_for_space = {}
for uid in total_uids:
combined_for_space[uid] = []
for run_id, scores_for_space_per_run in enumerate(
scores_for_space):
if uid in scores_for_space_per_run:
combined_for_space[uid].append(
scores_for_space_per_run[uid] * weights[run_id])
combined_for_space[uid] = np.sum(np.stack(combined_for_space[uid]),
axis=0)
combined.append(combined_for_space)
    # To compute accuracies, convert back from dicts to np arrays.
    # This only works for the UIDs that are present in the dataset.
combined_np = []
for combined_for_space in combined:
combined_np.append(
np.array([
combined_for_space[str(uid)]
for uid in all_datasets[-1].df[uid_key].values
]))
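    # Accuracies are computed w.r.t. the last dataset that was read.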
accuracies = compute_accuracies_epic(combined_np, all_datasets[-1])
return accuracies, combined, all_datasets[-1]
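# A minimal usage sketch (the run identifiers below are hypothetical). Note the
# first run_info should be one that get_epic_marginalize_verb_noun can read,
# since the directory/pkl branches rely on a dataset having been read already:
#
#   accuracies, combined, dataset = get_epic_marginalize_late_fuse(
#       [base_run_info,                          # a run readable by get_epic_marginalize_verb_noun
#        ('/path/to/action_banks_preds.pkl',)],  # pkl dump to late-fuse in
#       weights=[0.5, 0.5])
#   print_accuracies_epic(accuracies, prefix='late fused')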