def convert_dataset()

in benchmarking/blackbox_repository/conversion_scripts/scripts/nasbench201_import.py [0:0]


def convert_dataset(data, dataset):
    """Build a ``BlackboxTabular`` from the raw benchmark dump for one dataset.

    :param data: Mapping with the entries read below: ``data['total_archs']``
        (number of architectures) and ``data['arch2infos'][i]['200']``, which
        holds ``'arch_str'`` and ``'all_results'``. The latter is keyed by
        ``(dataset, seed)`` and provides ``'eval_acc1es'``, ``'train_acc1es'``,
        ``'train_times'``, ``'eval_times'``, ``'latency'``, ``'flop'`` and
        ``'params'``.
    :param dataset: Dataset identifier used as first component of the
        ``(dataset, seed)`` keys of ``'all_results'``.
    :return: ``BlackboxTabular`` whose objective tensor has shape
        ``(n_archs, n_seeds, n_fidelities, n_objectives)``.
    :raises KeyError: If the results of seed 777 are missing for some
        architecture (they are required for latency, flops and params).
    """
    # Local import: only needed to silence numpy's warning in `impute`.
    import warnings

    hp_cols = list(CONFIG_KEYS)
    n_hps = data['total_archs']
    arch_infos = data['arch2infos']

    # One column per architecture choice, one row per architecture.
    hps = {h: [] for h in hp_cols}
    for i in range(n_hps):
        config = str_to_list(arch_infos[i]['200']['arch_str'])
        for j, hp in enumerate(config):
            hps[hp_cols[j]].append(hp)

    hyperparameters = pd.DataFrame(data=hps, columns=hp_cols)

    objective_names = [
        'valid_error',
        'train_error',
        'runtime',
        'latency',
        'flops',
        'params',
    ]
    name_index = {name: i for i, name in enumerate(objective_names)}

    seeds = (777, 888, 999)
    fidelity_values = np.arange(1, 201)
    n_fidelities = len(fidelity_values)
    n_objectives = len(objective_names)
    n_seeds = len(seeds)
    curve_shape = (n_hps, n_seeds, n_fidelities)

    # Allocate directly as float32 instead of allocating float64 and copying.
    objective_evaluations = np.empty(curve_shape + (n_objectives,), dtype='float32')

    def save_objective_values_helper(name, values):
        assert values.shape == curve_shape

        objective_evaluations[..., name_index[name]] = values

    # Pre-fill with NaN so that (architecture, seed) pairs without results
    # are simply skipped below and imputed afterwards.
    ve = np.full(curve_shape, np.nan, dtype='float32')
    te = np.full(curve_shape, np.nan, dtype='float32')
    rt = np.full(curve_shape, np.nan, dtype='float32')

    epochs = range(n_fidelities)
    for ai in range(n_hps):
        for si, seed in enumerate(seeds):
            try:
                entry = arch_infos[ai]['200']['all_results'][(dataset, seed)]
                # NOTE(review): 'valid_error' is read from the 'ori-test@<epoch>'
                # entries -- confirm this is the intended evaluation split.
                validation_error = [
                    1 - entry['eval_acc1es'][f'ori-test@{ei}'] / 100 for ei in epochs
                ]
                train_error = [1 - entry['train_acc1es'][ei] / 100 for ei in epochs]
                # runtime measures the time for a single epoch
                runtime = [
                    entry['train_times'][ei] + entry['eval_times'][f'ori-test@{ei}']
                    for ei in epochs
                ]
            except KeyError:
                # Not every architecture was evaluated with every seed.
                continue
            ve[ai, si, :] = validation_error
            te[ai, si, :] = train_error
            rt[ai, si, :] = runtime

    def impute(values):
        """Replace NaNs by the mean over the seeds that do have a value."""
        missing = np.isnan(values)
        if not missing.any():
            return values
        with warnings.catch_warnings():
            # If no seed has a value the entry stays NaN; numpy would emit a
            # "Mean of empty slice" RuntimeWarning for that case.
            warnings.simplefilter('ignore', category=RuntimeWarning)
            # nanmean (rather than mean) so that an architecture with two
            # missing seeds is still filled from the single available one.
            seed_mean = np.nanmean(values, axis=1, keepdims=True)
        return np.where(missing, seed_mean, values)

    # The original data contains missing values, since not all architectures
    # were evaluated for all three seeds. We impute these missing values by
    # taking the average of the available datapoints for the corresponding
    # architecture and time step.
    save_objective_values_helper('valid_error', impute(ve))
    save_objective_values_helper('train_error', impute(te))
    save_objective_values_helper('runtime', impute(rt))

    # Latency, flops and params are read once per architecture (from the
    # results of the first seed) and repeated over all seeds and fidelities.
    reference_seed = seeds[0]
    constant_objectives = (
        ('latency', lambda result: result['latency'][0]),
        ('flops', lambda result: result['flop']),
        ('params', lambda result: result['params']),
    )
    for name, extract in constant_objectives:
        per_arch = np.array([
            extract(arch_infos[ai]['200']['all_results'][(dataset, reference_seed)])
            for ai in range(n_hps)
        ])
        save_objective_values_helper(
            name, np.broadcast_to(per_arch[:, None, None], curve_shape)
        )

    configuration_space = {
        node: search_space.choice(
            ['avg_pool_3x3', 'nor_conv_3x3', 'skip_connect', 'nor_conv_1x1', 'none']
        )
        for node in hp_cols
    }

    fidelity_space = {
        RESOURCE_ATTR: search_space.randint(lower=1, upper=201)
    }

    objective_names = [f"metric_{m}" for m in objective_names]
    # Sanity checks:
    assert objective_names[0] == METRIC_VALID_ERROR
    assert objective_names[2] == METRIC_TIME_THIS_RESOURCE
    return BlackboxTabular(
        hyperparameters=hyperparameters,
        configuration_space=configuration_space,
        fidelity_space=fidelity_space,
        objectives_evaluations=objective_evaluations,
        fidelity_values=fidelity_values,
        objectives_names=objective_names,
    )