modules/management/kubectl-apply/main.tf (166 lines of code) (raw):
/**
* Copyright 2024 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
locals {
cluster_id_parts = split("/", var.cluster_id)
cluster_name = local.cluster_id_parts[5]
cluster_location = local.cluster_id_parts[3]
project_id = var.project_id != null ? var.project_id : local.cluster_id_parts[1]
apply_manifests_map = tomap({
for index, manifest in var.apply_manifests : index => manifest
})
install_kueue = try(var.kueue.install, false)
install_jobset = try(var.jobset.install, false)
install_gpu_operator = try(var.gpu_operator.install, false)
install_nvidia_dra_driver = try(var.nvidia_dra_driver.install, false)
kueue_install_source = format("${path.module}/manifests/kueue-%s.yaml", try(var.kueue.version, ""))
jobset_install_source = format("${path.module}/manifests/jobset-%s.yaml", try(var.jobset.version, ""))
}
data "google_container_cluster" "gke_cluster" {
project = local.project_id
name = local.cluster_name
location = local.cluster_location
}
data "google_client_config" "default" {}
module "kubectl_apply_manifests" {
for_each = local.apply_manifests_map
source = "./kubectl"
content = each.value.content
source_path = each.value.source
template_vars = each.value.template_vars
server_side_apply = each.value.server_side_apply
wait_for_rollout = each.value.wait_for_rollout
providers = {
kubectl = kubectl
http = http.h
}
}
module "install_kueue" {
source = "./kubectl"
source_path = local.install_kueue ? local.kueue_install_source : null
server_side_apply = true
providers = {
kubectl = kubectl
http = http.h
}
}
module "install_jobset" {
source = "./kubectl"
source_path = local.install_jobset ? local.jobset_install_source : null
server_side_apply = true
providers = {
kubectl = kubectl
http = http.h
}
}
module "configure_kueue" {
source = "./kubectl"
source_path = local.install_kueue ? try(var.kueue.config_path, "") : null
template_vars = local.install_kueue ? try(var.kueue.config_template_vars, null) : null
depends_on = [module.install_kueue]
server_side_apply = true
wait_for_rollout = true
providers = {
kubectl = kubectl
http = http.h
}
}
module "install_nvidia_dra_driver" {
count = local.install_nvidia_dra_driver ? 1 : 0
depends_on = [module.kubectl_apply_manifests, var.gke_cluster_exists]
source = "./helm_install"
release_name = "nvidia-dra-driver-gpu" # The release name
chart_repository = "https://helm.ngc.nvidia.com/nvidia" # The Helm repository URL for nvidia charts
chart_name = "nvidia-dra-driver-gpu" # The chart name
chart_version = var.nvidia_dra_driver.version # The chart version
namespace = "nvidia-dra-driver-gpu" # The target namespace
create_namespace = true # Equivalent to --create-namespace
# Use the 'values' argument to pass the YAML content
# This corresponds to the -f <(cat <<EOF ... EOF) part
values_yaml = [<<EOF
nvidiaDriverRoot: /home/kubernetes/bin/nvidia
nvidiaCtkPath: /home/kubernetes/bin/nvidia/toolkit/nvidia-ctk
resources:
gpus:
enabled: false
controller:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: "nvidia.com/gpu"
operator: "DoesNotExist"
kubeletPlugin:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: feature.node.kubernetes.io/pci-10de.present
operator: In
values:
- "true"
- matchExpressions:
- key: feature.node.kubernetes.io/cpu-model.vendor_id
operator: In
values:
- "ARM"
- matchExpressions:
- key: "nvidia.com/gpu.present"
operator: In
values:
- "true"
tolerations:
- key: nvidia.com/gpu
operator: Equal
value: present
effect: NoSchedule
EOF
]
atomic = true
cleanup_on_fail = true
}
module "install_gpu_operator" {
count = local.install_gpu_operator ? 1 : 0
source = "./helm_install"
chart_repository = "https://helm.ngc.nvidia.com/nvidia"
depends_on = [module.kubectl_apply_manifests, var.gke_cluster_exists]
namespace = "gpu-operator"
create_namespace = true
release_name = "gpu-operator"
chart_name = "gpu-operator"
chart_version = var.gpu_operator.version
wait = true
set_values = [
{
name = "hostPaths.driverInstallDir",
value = "/home/kubernetes/bin/nvidia"
},
{
name = "toolkit.installDir"
value = "/home/kubernetes/bin/nvidia"
},
{
name = "cdi.enabled"
value = true
},
{
name = "cdi.default"
value = true
},
{
name = "driver.enabled"
value = false
}]
}