# terraform/modules/finetuning-service/deployment.tf (93 lines of code) (raw)
# Names shared by the resources in this file.
#   service_name - used for the Job name, pod label, container name and image name.
#   sa_name      - used for both the Google service account ID and the
#                  Kubernetes ServiceAccount name.
locals {
  service_name = "finetuning-service"
  sa_name      = "finetuning-service-sa"
}
# Google service account for the finetuning service, created in var.project_id
# with the ID taken from local.sa_name.
resource "google_service_account" "this" {
  account_id   = local.sa_name
  project      = var.project_id
  display_name = "Terraform-managed service account for finetuning service"
}
# Kubernetes ServiceAccount in var.ns_name, annotated with the email of the
# Google service account it should map to (iam.gke.io/gcp-service-account).
# The heredoc is plain `<<YAML`, so its content is emitted verbatim: the YAML
# nesting below is significant and must start at column 0.
resource "kubectl_manifest" "sa" {
  yaml_body = <<YAML
apiVersion: v1
kind: ServiceAccount
metadata:
  name: "${local.sa_name}"
  namespace: "${var.ns_name}"
  annotations:
    iam.gke.io/gcp-service-account: "${local.sa_name}@${var.project_id}.iam.gserviceaccount.com"
YAML
}
# Grants roles/iam.workloadIdentityUser on google_service_account.this to the
# Workload Identity principal for the Kubernetes ServiceAccount
# <var.ns_name>/<local.sa_name>.
resource "google_service_account_iam_member" "this" {
  role               = "roles/iam.workloadIdentityUser"
  service_account_id = google_service_account.this.name
  member             = "serviceAccount:${var.project_id}.svc.id.goog[${var.ns_name}/${local.sa_name}]"

  # Applied only after the Kubernetes ServiceAccount manifest.
  depends_on = [kubectl_manifest.sa]
}
# Grant the Service Account Access to GCS Bucket
#
# The member is derived from google_service_account.this rather than rebuilt
# from strings: the resulting value is the same
# (<account_id>@<project>.iam.gserviceaccount.com), but the reference gives
# Terraform an implicit dependency, so the binding is never attempted before
# the service account exists.
resource "google_storage_bucket_iam_member" "bucket_access" {
  # Keep in sync with the TRAINING_DATASET_BUCKET env var of the finetuning Job.
  bucket = "finetuning-data-bucket" # Replace with your actual bucket name
  role   = "roles/storage.objectUser"
  member = "serviceAccount:${google_service_account.this.email}"
}
# Create a Kubernetes Job for finetuning
#
# Runs a single container (image tagged :latest, pulled on every start) under
# the local.sa_name ServiceAccount on a node selected by the
# cloud.google.com/gke-accelerator label, requesting one nvidia.com/gpu.
# Configuration is passed through environment variables; HF_TOKEN is read from
# key HUGGING_FACE_TOKEN of the Secret "hf-secret", which is not created in
# this file.
#
# The heredoc is plain `<<YAML`, so its content is emitted verbatim: the YAML
# nesting below is significant and must start at column 0.
#
# NOTE(review): a Job's pod template is immutable once the Job exists, so
# later edits to this manifest may need the Job to be recreated (see the
# kubectl_manifest `force_new` option) - confirm the desired behaviour.
resource "kubectl_manifest" "this" {
  yaml_body = <<YAML
apiVersion: batch/v1
kind: Job
metadata:
  name: "${local.service_name}-job"
  namespace: ${var.ns_name}
spec:
  template:
    metadata:
      labels:
        app: "${local.service_name}"
    spec:
      serviceAccountName: ${local.sa_name}
      containers:
      - name: "${local.service_name}"
        image: ${var.region}-docker.pkg.dev/${var.project_id}/${var.artifactory_repo_name}/${local.service_name}:latest
        imagePullPolicy: Always
        resources:
          requests:
            cpu: "1"
            memory: "8Gi" # Adjust if needed
            nvidia.com/gpu: "1" # GPU needed for finetuning
          limits:
            cpu: "2"
            memory: "16Gi" # Adjust if needed
            nvidia.com/gpu: "1" # GPU needed for finetuning
        ports:
        - name: server-port
          containerPort: 8000
        env:
        - name: EXPERIMENT
          value: ""
        - name: MLFLOW_ENABLE
          value: "false"
        - name: MLFLOW_ENABLE_SYSTEM_METRICS_LOGGING
          value: "false"
        - name: MLFLOW_TRACKING_URI
          value: ""
        - name: MODEL_NAME
          value: "google/gemma-2-9b-it"
        - name: TRAINING_DATASET_BUCKET
          value: "finetuning-data-bucket"
        - name: TRAINING_DATA_PATH
          value: "prepared_data.jsonl"
        - name: NEW_MODEL
          value: "gemma-finetuned"
        - name: MODEL_PATH
          value: "/model-data"
        - name: HF_TOKEN
          valueFrom:
            secretKeyRef:
              name: hf-secret
              key: HUGGING_FACE_TOKEN
      nodeSelector:
        cloud.google.com/gke-accelerator: nvidia-l4 # Adjust desired GPU type if needed
      restartPolicy: OnFailure
YAML

  # The Job must not start until both the Workload Identity binding and the
  # bucket IAM grant are in place; otherwise the pod can run before it is
  # allowed to read its training data from the bucket.
  depends_on = [
    google_service_account_iam_member.this,
    google_storage_bucket_iam_member.bucket_access,
  ]
}