dags/framework3p/configs/microbenchmarks_config.py (129 lines of code) (raw):

from xlml.apis import gcp_config, metric_config, task, test_config from dags import gcs_bucket from dags.common import test_owner import datetime import dags.common.vm_resource as resource def get_microbenchmark_config( tpu_version: resource.TpuVersion, tpu_cores: int, tpu_zone: resource.Zone, time_out_in_min: int, runtime_version: resource.RuntimeVersion, project: resource.Project, network: str = "default", subnetwork: str = "default", extraFlags: str = "", ): job_gcp_config = gcp_config.GCPConfig( project_name=project.value, zone=tpu_zone.value, dataset_name=metric_config.DatasetOption.XLML_DATASET, ) set_up_cmds = ( "pip install --upgrade pip", ( "pip install jax[tpu] -f" " https://storage.googleapis.com/jax-releases/libtpu_releases.html" ), "JAX_PLATFORMS=tpu,cpu ENABLE_PJRT_COMPATIBILITY=true ", ) benchmark_config = f"xlml_v{tpu_version.value}_{tpu_cores}.yaml" metrics_report = "/tmp/microbenchmarks/outputs/metrics_report.jsonl" # Initial commands run_model_cmds = ( # Create the output directory "mkdir -p /tmp/microbenchmarks/outputs ", # Remove any existing metrics report (f"if [ -f {metrics_report} ]; then " f"rm -rf {metrics_report}; " "fi"), ) # Run the benchmark tests. run_model_cmds += ( " rm -rf accelerator-microbenchmarks ", "git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git ", "cd accelerator-microbenchmarks ", "pip install -r requirements.txt ", # Run the benchmark script f"python3 src/run_benchmark.py " f"--config=configs/{benchmark_config} ", ) # Check if the metrics report exists, and if so, upload it to GCS run_model_cmds += ( f"if [ -f {metrics_report} ]; then " f"gsutil cp {metrics_report} {metric_config.SshEnvVars.GCS_OUTPUT.value}; " "fi", ) job_test_config = test_config.TpuVmTest( test_config.Tpu( version=tpu_version, cores=tpu_cores, runtime_version=runtime_version.value, network=network, subnetwork=subnetwork, reserved=True, ), test_name="framework-microbenchmark", set_up_cmds=set_up_cmds, run_model_cmds=run_model_cmds, timeout=datetime.timedelta(minutes=time_out_in_min), task_owner=test_owner.QINY_Y, ) job_metric_config = metric_config.MetricConfig( json_lines=metric_config.JSONLinesConfig("metrics_report.jsonl"), use_runtime_generated_gcs_folder=True, ) return task.run_queued_resource_test( task_test_config=job_test_config, task_gcp_config=job_gcp_config, task_metric_config=job_metric_config, ) def get_microbenchmark_xpk_config( time_out_in_min: int, test_name: str, docker_image: str, test_owner: str, cluster: resource.XpkClusterConfig, num_slices: int = 1, dataset_name: metric_config.DatasetOption = metric_config.DatasetOption.XLML_DATASET, dataset_project: str = resource.Project.CLOUD_ML_AUTO_SOLUTIONS.value, composer_project: str = resource.Project.CLOUD_ML_AUTO_SOLUTIONS.value, ) -> task.XpkTask: job_gcp_config = gcp_config.GCPConfig( project_name=cluster.project, zone=cluster.zone, dataset_name=dataset_name, dataset_project=dataset_project, composer_project=composer_project, ) benchmark_config = ( f"xlml_v{cluster.device_version.value}_{cluster.core_count}.yaml" ) metrics_report = "/tmp/microbenchmarks/outputs/metrics_report.jsonl" # Initial commands run_model_cmds = ( # Create the output directory "mkdir -p /tmp/microbenchmarks/outputs ", # Remove any existing metrics report (f"if [ -f {metrics_report} ]; then " f"rm -rf {metrics_report}; " "fi"), ) # Run the benchmark tests. run_model_cmds += ( "cd /app/accelerator-microbenchmarks ", # Run the benchmark script f"python3 src/run_benchmark.py " f"--config=configs/{benchmark_config} ", ) # Check if the metrics report exists, and if so, upload it to GCS run_model_cmds += ( f"if [ -f {metrics_report} ]; then " f"gsutil cp {metrics_report} {metric_config.SshEnvVars.GCS_OUTPUT.value} ; " "fi ", ) job_test_config = test_config.TpuGkeTest( test_config.Tpu( version=cluster.device_version, cores=cluster.core_count, ), test_name=test_name, set_up_cmds=None, run_model_cmds=run_model_cmds, timeout=datetime.timedelta(minutes=time_out_in_min), task_owner=test_owner, num_slices=num_slices, cluster_name=cluster.name, docker_image=docker_image, ) job_metric_config = metric_config.MetricConfig( json_lines=metric_config.JSONLinesConfig("metrics_report.jsonl"), use_runtime_generated_gcs_folder=True, ) return task.XpkTask( task_test_config=job_test_config, task_gcp_config=job_gcp_config, task_metric_config=job_metric_config, )