benchmarks/inference-server/text-generation-inference/sample-terraform.tfvars (32 lines of code) (raw):
# Cluster credentials. `fleet_host` is the Connect Gateway URL of the GKE fleet
# membership (here named "ai-benchmark"). "$PROJECT_NUMBER" is a placeholder:
# substitute your real project number, since Terraform does not expand
# shell-style variables inside .tfvars files.
credentials_config = {
  fleet_host = "https://connectgateway.googleapis.com/v1/projects/$PROJECT_NUMBER/locations/global/gkeMemberships/ai-benchmark"
}
# Core deployment settings. "$PROJECT_ID" is a placeholder: substitute your
# real project ID, since Terraform does not expand shell-style variables inside
# .tfvars files.
# NOTE(review): `ksa` presumably names the Kubernetes service account used by
# the workload in `namespace` - confirm against the module's variables.
project_id = "$PROJECT_ID"
namespace  = "benchmark"
ksa        = "benchmark-ksa"
model_id   = "tiiuae/falcon-7b"
gpu_count  = 1
# How to (horizontally) scale the workload. Allowed values are:
# - null (no scaling),
# - Workload resources:
#   - "cpu" (scale on cpu utilization).
# - Workload metrics (i.e. custom metrics):
#   - "tgi_queue_size"
#   - "tgi_batch_current_size"
#   - "tgi_batch_current_max_tokens"
# - Other possibilities coming soon...
#
# See `autoscaling.md` for more details and recommendations.
hpa_type = null
# Sets the averageValue target of the HPA metric.
#
# E.g. for cpu scaling, this is the CPU utilization, expressed as a value
# between 0 and 100. 50 is a reasonable starting point.
#hpa_averagevalue_target = 50
#
# For tgi_batch_current_size, try 10. (TODO: experiment with this to determine
# optimal values).
#hpa_averagevalue_target = 10
# Uncomment and adjust these if you want different min/max replica values.
# hpa_min_replicas = 1
# hpa_max_replicas = 5