def do_hyperparameter_tuning()

in 10_mlops/train_on_vertexai.py


def do_hyperparameter_tuning(data_set, timestamp, develop_mode, cpu_only_mode, tf_version):
    # Choose the pre-built Vertex AI training container that matches the TensorFlow version.
    if cpu_only_mode:
        train_image = 'us-docker.pkg.dev/vertex-ai/training/tf-cpu.{}:latest'.format(tf_version)
    else:
        train_image = 'us-docker.pkg.dev/vertex-ai/training/tf-gpu.{}:latest'.format(tf_version)

    # a single trial job
    model_display_name = '{}-{}'.format(ENDPOINT_NAME, timestamp)
    if cpu_only_mode:
        trial_job = aiplatform.CustomJob.from_local_script(
            display_name='train-{}'.format(model_display_name),
            script_path="model.py",
            container_uri=train_image,
            args=[
                '--bucket', BUCKET,
                '--skip_full_eval',  # no need to evaluate on test data set
                '--num_epochs', '10',
                '--num_examples', '500000'  # 1/10 actual size to finish faster
            ],
            requirements=['cloudml-hypertune'],  # any extra Python packages
            replica_count=1,
            machine_type='n1-standard-4'
        )
    else:
        trial_job = aiplatform.CustomJob.from_local_script(
            display_name='train-{}'.format(model_display_name),
            script_path="model.py",
            container_uri=train_image,
            args=[
                '--bucket', BUCKET,
                '--skip_full_eval',  # no need to evaluate on test data set
                '--num_epochs', '10',
                '--num_examples', '500000'  # 1/10 actual size to finish faster
            ],
            requirements=['cloudml-hypertune'],  # any extra Python packages
            replica_count=1,
            machine_type='n1-standard-4',
            # See https://cloud.google.com/vertex-ai/docs/general/locations#accelerators
            accelerator_type=aip.AcceleratorType.NVIDIA_TESLA_T4.name,
            accelerator_count=1,
        )

    # the tuning job
    hparam_job = aiplatform.HyperparameterTuningJob(
        # See https://googleapis.dev/python/aiplatform/latest/aiplatform.html#
        display_name='hparam-{}'.format(model_display_name),
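        # each trial runs a copy of trial_job, with the parameter values below injected as command-line flags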
        custom_job=trial_job,
        metric_spec={'val_rmse': 'minimize'},
        parameter_spec={
            "train_batch_size": hpt.IntegerParameterSpec(min=16, max=256, scale='log'),
            "nbuckets": hpt.IntegerParameterSpec(min=5, max=10, scale='linear'),
            "dnn_hidden_units": hpt.CategoricalParameterSpec(values=["64,16", "64,16,4", "64,64,64,8", "256,64,16"])
        },
        max_trial_count=2 if develop_mode else NUM_HPARAM_TRIALS,
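        # more parallel trials finish sooner, but the Bayesian search learns from fewer completed trials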
        parallel_trial_count=2,
        search_algorithm=None,  # Bayesian
    )

    hparam_job.run(sync=True)  # has to finish before we can get trials.

    # get the parameters corresponding to the best trial
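    # val_rmse is minimized, so an ascending sort puts the best trial first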
    best = sorted(hparam_job.trials, key=lambda x: x.final_measurement.metrics[0].value)[0]
    logging.info('Best trial: {}'.format(best))
    best_params = []
    for param in best.parameters:
        best_params.append('--{}'.format(param.parameter_id))

        if param.parameter_id in ["train_batch_size", "nbuckets"]:
            # The tuning service returns 10.0 even though this is an integer parameter, so round it.
            # But CustomTrainingJob turns integer args into floats, so pass the value as a string.
            best_params.append(str(int(round(param.value))))
        else:
            # string or float parameters
            best_params.append(param.value)

    # run the best trial to completion
    logging.info('Launching full training job with {}'.format(best_params))
    return train_custom_model(data_set, timestamp, develop_mode, cpu_only_mode, tf_version, extra_args=best_params)
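

How the metric gets reported: metric_spec above tells Vertex AI to minimize 'val_rmse', which means each trial (model.py) must publish that metric. A minimal sketch of the reporting side, assuming the cloudml-hypertune package listed in requirements (the function and variable names here are illustrative, not taken from model.py):

import hypertune

def report_val_rmse(val_rmse, epoch):
    hpt_reporter = hypertune.HyperTune()
    hpt_reporter.report_hyperparameter_tuning_metric(
        hyperparameter_metric_tag='val_rmse',  # must match the metric_spec key above
        metric_value=val_rmse,
        global_step=epoch,
    )

For orientation, a hypothetical invocation of do_hyperparameter_tuning (argument values are placeholders; in the actual script they come from command-line flags and module-level constants such as BUCKET):

from datetime import datetime

model = do_hyperparameter_tuning(
    data_set='gs://{}/data/train*'.format(BUCKET),          # illustrative dataset location
    timestamp=datetime.now().strftime('%Y%m%d-%H%M%S'),
    develop_mode=True,       # caps the search at 2 trials
    cpu_only_mode=True,      # skips the GPU accelerator request
    tf_version='2-6',        # selects the tf-cpu.2-6 / tf-gpu.2-6 container tags
)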