benchmarks/benchmark/tools/dlio/variables.tf (235 lines of code) (raw):

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

// Input variables for the DLIO benchmark module.
// Defaults written as "<...>" are placeholders that must be overridden.

variable "namespace" {
  type        = string
  description = "Kubernetes namespace where resources are deployed"
  default     = "benchmark"
}

variable "k8s_service_account" {
  type        = string
  description = "Kubernetes service account name as in the Configure access to Cloud Storage buckets using GKE Workload Identity step"
  default     = "benchmark-ksa"
}

variable "gcs_bucket" {
  type        = string
  description = "GCS Bucket name"
  default     = "<your gcs bucket>"
}

variable "result_bucket" {
  type        = string
  description = "GCS Bucket name to store dlio results"
  default     = "<result bucket>"
}

// at most one of the below triggers can be set to true
// NOTE(review): these flags are string-typed and their defaults contain
// literal double quotes (e.g. "\"true\""); presumably the quoted form is
// what the consuming templates expect -- confirm before changing the type.
variable "run_with_gcs_fuse_csi" {
  type        = string
  description = "Set to true if running DLIO on GCSFuse"
  default     = "\"true\""
}

variable "run_with_parallelstore_csi" {
  type        = string
  description = "Set to true if running DLIO on Parallelstore and the Parallelstore CSI driver is enabled on your cluster"
  default     = "\"false\""
}
// at most one of the above triggers can be set to true

// DLIO Job configurations
variable "job_backoffLimit" {
  type        = number
  description = "The number of retries before considering a Job as failed"
  default     = 0
}

variable "job_completions" {
  type        = number
  description = "The number of Pods that are successful while the Job is considered to be complete"
  default     = 1
}

variable "job_parallelism" {
  type        = number
  description = "The desired number of Pods to run in parallel for a Job. Kubernetes will ensure that no more than this number of Pods are running at any given time."
  default     = 1
}

// Sidecar resource limits. Like the run_with_* flags, the defaults embed
// literal double quotes inside the string value.
variable "gcs_fuse_sidecar_cpu_limit" {
  type        = string
  description = "The maximum amount of CPU resource that the sidecar container can use"
  default     = "\"20\""
}

variable "gcs_fuse_sidecar_memory_limit" {
  type        = string
  description = "The maximum amount of Memory resource that the sidecar container can use"
  default     = "\"20Gi\""
}

variable "gcs_fuse_sidecar_ephemeral_storage_limit" {
  type        = string
  description = "The maximum amount of Ephemeral Storage resource that the sidecar container can use"
  default     = "\"100Gi\""
}

variable "pscsi_sidecar_cpu_limit" {
  type        = string
  description = "The maximum amount of CPU resource that the sidecar container can use"
  default     = "\"20\""
}

variable "pscsi_sidecar_memory_limit" {
  type        = string
  description = "The maximum amount of Memory resource that the sidecar container can use"
  default     = "\"20Gi\""
}

// DLIO workload container resources and paths
variable "dlio_container_cpu_limit" {
  type        = number
  description = "The maximum amount of CPU resource that the DLIO benchmark workload container can use"
  // Numeric literal to match the declared type (was the string "30").
  default = 30
}

variable "dlio_container_memory_limit" {
  type        = string
  description = "The maximum amount of Memory resource that the DLIO benchmark workload container can use"
  default     = "150Gi"
}

variable "dlio_container_ephemeral_storage" {
  type        = string
  description = "The maximum amount of Ephemeral Storage resource that the DLIO benchmark workload container can use"
  default     = "100Gi"
}

variable "dlio_data_mount_path" {
  type        = string
  description = "The path where your GCS bucket volume or other volume is mounted"
  default     = "/data"
}

variable "dlio_benchmark_result" {
  type        = string
  description = "The path stores benchmark result reports for a specific DLIO run. When doing multi-pod runs, this folder stores results logged from all the pods, needs to be changed every run to guarantee result isolation."
  default     = "<a result folder name unique to your run>"
}

// DLIO configurations, detailed explanation check
// https://github.com/argonne-lcf/dlio_benchmark
// https://argonne-lcf.github.io/dlio_benchmark/config.html
variable "dlio_generate_data" {
  type        = string
  description = "Set to True to generate dataset. Set to False to perform data train"
  default     = "False"
}

variable "dlio_number_of_processors" {
  type        = number
  description = "The number of processors used to run the task"
  default     = 8
}

variable "dlio_model" {
  type        = string
  description = "Specifying the name of the model"
  default     = "unet3d"
}

variable "dlio_record_length" {
  type        = number
  description = "Size of each sample"
  // Numeric literal to match the declared type (was the string "150000000").
  default = 150000000
}

variable "dlio_record_length_stdev" {
  type        = number
  description = "Standard deviation of the size of samples"
  default     = 0
}

variable "dlio_record_length_resize" {
  type        = number
  description = "Resized sample size"
  default     = 0
}

variable "dlio_number_of_files" {
  type        = number
  description = "Number of files for the training set"
  default     = 5000
}

variable "dlio_profiler" {
  type        = string
  description = "Specifying the profiler to use [none|iostat|tensorflow|pytorch]"
  default     = "none"
}

variable "dlio_iostat_devices" {
  type        = string
  description = "Specifying the devices to perform iostat tracing"
  default     = ""
}

variable "dlio_batch_size" {
  type        = number
  description = "Batch size for training"
  default     = 4
}

variable "dlio_train_epochs" {
  type        = number
  description = "Number of epochs to simulate"
  default     = 1
}

variable "dlio_read_threads" {
  type        = number
  description = "Number of threads to load the data (for tensorflow and pytorch data loader)"
  default     = 10
}

// pv, pvc
variable "pv_name" {
  type        = string
  description = "Name of the PersistentVolume used for DLIO dataset"
  default     = "benchmark-pv"
}

variable "pvc_name" {
  type        = string
  description = "Name of the PersistentVolumeClaim used for DLIO dataset"
  default     = "benchmark-pvc"
}

// gcsfuse cache configurations
variable "gcsfuse_stat_cache_capacity" {
  type        = string
  description = "Size of the Cloud Storage Fuse stat cache. Set value to 0 to disable the stat cache"
  default     = "20000"
}

variable "gcsfuse_stat_cache_ttl" {
  type        = string
  description = "Specifies how long Cloud Storage FUSE caches stat entries"
  default     = "120m0s"
}

variable "gcsfuse_type_cache_ttl" {
  type        = string
  description = "Specifies how long Cloud Storage FUSE caches the mapping of objects in Cloud Storage to their corresponding type, such as files or directories"
  default     = "120m0s"
}

// parallelstore variables
variable "run_parallelstore_data_loader" {
  type        = string
  description = "Set to true if running the dataloader for parallelstore"
  default     = "\"true\""
}

variable "parallelstore_instance_name" {
  type        = string
  description = "instance name of parallelstore"
  default     = "<instance name>"
}

// The IPs are listed as "accessPoints" in the result of instance describe command
variable "parallelstore_ip_address_1" {
  type        = string
  description = "ip address of the parallelstore instance's accessPoints"
  default     = "<ip-address>"
}

variable "parallelstore_ip_address_2" {
  type        = string
  description = "ip address of the parallelstore instance's accessPoints"
  default     = "<ip-address>"
}

variable "parallelstore_ip_address_3" {
  type        = string
  description = "ip address of the parallelstore instance's accessPoints"
  default     = "<ip-address>"
}

variable "parallelstore_network_name" {
  type        = string
  description = "network name of the parallelstore instance"
  default     = "<network name>"
}

variable "parallelstore_location" {
  type        = string
  description = "location of the parallelstore instance, e.g. us-central1-a"
  default     = "<location>"
}

variable "parallelstore_storageclass" {
  type        = string
  description = "the storage class used for dynamic provisioning. if using static provisioning, set it to nil"
  default     = "parallelstore-rwx"
}

variable "parallelstore_project" {
  type        = string
  description = "the project name of the parallelstore instance"
  default     = "<project name>"
}