# deployment/terraform-module-knfsd/compute.tf
/*
* Copyright 2020 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
locals {
FSID_DATABASE_CONFIG = (
# this module deploys its own fsid database, so generate the config from the bundled template
local.deploy_fsid_database ? templatefile("${path.module}/resources/knfsd-fsidd.conf.tftpl", {
connection_name = module.fsid_database[0].connection_name,
sql_user = module.fsid_database[0].sql_user,
database_name = module.fsid_database[0].database_name,
private_ip_address = module.fsid_database[0].private_ip_address,
enable_metrics = var.ENABLE_STACKDRIVER_METRICS,
}) :
# otherwise fall back to the externally supplied config
var.FSID_DATABASE_CONFIG
)
}
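# As an illustrative sketch (hypothetical root module, not part of this
# module), a deployment that brings its own external fsid database would
# disable the managed Cloud SQL instance and supply the config directly:
#
#   module "nfs_proxy" {
#     source               = "./terraform-module-knfsd"
#     FSID_MODE            = "external"
#     FSID_DATABASE_DEPLOY = false
#     FSID_DATABASE_CONFIG = file("./knfsd-fsidd.conf")
#     # ... remaining required variables ...
#   }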
# Optionally create a Compute Engine reservation for the cluster.
# The Knfsd nodes are often large instances with many Local SSDs, which can
# make them difficult to schedule. This can cause delays when replacing
# unhealthy instances or performing rolling replacements.
#
# A reservation ensures that the capacity for the Knfsd Cluster is always
# available in Google Cloud, regardless of the state of the instances. A
# reservation is not a commitment, and can be deleted at any time.
resource "google_compute_reservation" "knfsd_reservation" {
# Only create the reservation if RESERVE_KNFSD_CAPACITY is true
count = var.RESERVE_KNFSD_CAPACITY ? 1 : 0
name = "${var.PROXY_BASENAME}-group"
zone = var.ZONE
# We want this reservation to only be used by this specific Knfsd Cluster
specific_reservation_required = true
description = "Compute reservation for Knfsd Cluster ${var.PROXY_BASENAME}-group"
specific_reservation {
count = var.KNFSD_NODES
instance_properties {
min_cpu_platform = lower(split("-", var.MACHINE_TYPE)[0]) == "n1" ? "Intel Skylake" : null # Set Skylake only on N1 machine types
machine_type = var.MACHINE_TYPE
dynamic "local_ssds" {
for_each = var.CACHEFILESD_DISK_TYPE == "local-ssd" ? range(1, var.LOCAL_SSDS + 1) : []
content {
interface = "NVME"
disk_size_gb = 375 # Local SSD partitions are a fixed 375 GB
}
}
}
}
lifecycle {
precondition {
# RESERVE_KNFSD_CAPACITY requires ENABLE_KNFSD_AUTOSCALING to be false
condition = var.ENABLE_KNFSD_AUTOSCALING == false
error_message = "ENABLE_KNFSD_AUTOSCALING must be disabled when RESERVE_KNFSD_CAPACITY is enabled."
}
}
}
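# Once applied, the reservation can be inspected outside Terraform; an
# illustrative check (substitute your own basename and zone):
#   gcloud compute reservations describe "<PROXY_BASENAME>-group" --zone "<ZONE>"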
# Instance Template for the KNFSD nodes
resource "google_compute_instance_template" "nfsproxy-template" {
provider = google-beta # Required because network_performance_config is only available in the beta provider
project = var.PROJECT
region = var.REGION
name_prefix = var.PROXY_BASENAME
machine_type = var.MACHINE_TYPE
min_cpu_platform = lower(split("-", var.MACHINE_TYPE)[0]) == "n1" ? "Intel Skylake" : null # Set Skylake only on N1 machine types
can_ip_forward = false
tags = ["knfsd-cache-server"]
labels = var.PROXY_LABELS
disk {
source_image = var.PROXY_IMAGENAME
auto_delete = true
boot = true
disk_size_gb = 100
}
# Configuration for Persistent Disk for FS-Cache directory
dynamic "disk" {
for_each = var.CACHEFILESD_DISK_TYPE != "local-ssd" ? [1] : []
content {
disk_type = var.CACHEFILESD_DISK_TYPE
type = "PERSISTENT"
mode = "READ_WRITE"
device_name = "pd-fscache"
disk_size_gb = var.CACHEFILESD_PERSISTENT_DISK_SIZE_GB
}
}
# Configuration for Local SSDs for FS-Cache directory
dynamic "disk" {
for_each = var.CACHEFILESD_DISK_TYPE == "local-ssd" ? range(1, var.LOCAL_SSDS + 1) : []
content {
interface = "NVME"
disk_type = "local-ssd"
type = "SCRATCH"
mode = "READ_WRITE"
device_name = "local-ssd-${disk.value}"
disk_size_gb = 375 # Local SSD partitions are a fixed 375 GB
}
}
network_performance_config {
total_egress_bandwidth_tier = var.ENABLE_HIGH_BANDWIDTH_CONFIGURATION ? "TIER_1" : "DEFAULT"
}
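# Note: TIER_1 egress bandwidth requires the gVNIC network interface, which
# is why nic_type below is forced to GVNIC whenever
# ENABLE_HIGH_BANDWIDTH_CONFIGURATION is set.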
network_interface {
network = local.network
subnetwork = local.subnetwork
nic_type = (var.ENABLE_HIGH_BANDWIDTH_CONFIGURATION || var.ENABLE_GVNIC) ? "GVNIC" : "VIRTIO_NET"
}
metadata = {
# mounts
EXPORT_MAP = var.EXPORT_MAP
EXPORT_HOST_AUTO_DETECT = var.EXPORT_HOST_AUTO_DETECT
EXCLUDED_EXPORTS = join("\n", var.EXCLUDED_EXPORTS)
INCLUDED_EXPORTS = join("\n", var.INCLUDED_EXPORTS)
EXPORT_CIDR = var.EXPORT_CIDR
# NetApp auto-discovery
ENABLE_NETAPP_AUTO_DETECT = var.ENABLE_NETAPP_AUTO_DETECT
NETAPP_HOST = var.NETAPP_HOST
NETAPP_URL = var.NETAPP_URL
NETAPP_USER = var.NETAPP_USER
NETAPP_SECRET = var.NETAPP_SECRET
NETAPP_SECRET_PROJECT = var.NETAPP_SECRET_PROJECT
NETAPP_SECRET_VERSION = var.NETAPP_SECRET_VERSION
NETAPP_CA = var.NETAPP_CA
NETAPP_ALLOW_COMMON_NAME = var.NETAPP_ALLOW_COMMON_NAME
# mount options
NCONNECT = var.NCONNECT_VALUE
ACDIRMIN = var.ACDIRMIN
ACDIRMAX = var.ACDIRMAX
ACREGMIN = var.ACREGMIN
ACREGMAX = var.ACREGMAX
RSIZE = var.RSIZE
WSIZE = var.WSIZE
NOHIDE = var.NOHIDE
MOUNT_OPTIONS = var.MOUNT_OPTIONS
EXPORT_OPTIONS = var.EXPORT_OPTIONS
NFS_MOUNT_VERSION = var.NFS_MOUNT_VERSION
# auto-reexport nested mounts
AUTO_REEXPORT = var.AUTO_REEXPORT
FSID_MODE = var.FSID_MODE
FSID_DATABASE_CONFIG = local.FSID_DATABASE_CONFIG
# system
NUM_NFS_THREADS = var.NUM_NFS_THREADS
VFS_CACHE_PRESSURE = var.VFS_CACHE_PRESSURE
DISABLED_NFS_VERSIONS = var.DISABLED_NFS_VERSIONS
READ_AHEAD_KB = floor(var.READ_AHEAD / 1024) # READ_AHEAD is given in bytes; the kernel's read_ahead_kb tunable expects KiB
LOADBALANCER_IP = one(google_compute_address.nfsproxy_static[*].address)
serial-port-enable = "TRUE"
# metrics
ENABLE_STACKDRIVER_METRICS = var.ENABLE_STACKDRIVER_METRICS
METRICS_AGENT_CONFIG = var.METRICS_AGENT_CONFIG
ROUTE_METRICS_PRIVATE_GOOGLEAPIS = var.ROUTE_METRICS_PRIVATE_GOOGLEAPIS
# scripts / software
startup-script = file("${path.module}/resources/proxy-startup.sh")
CUSTOM_PRE_STARTUP_SCRIPT = var.CUSTOM_PRE_STARTUP_SCRIPT
CUSTOM_POST_STARTUP_SCRIPT = var.CUSTOM_POST_STARTUP_SCRIPT
ENABLE_KNFSD_AGENT = var.ENABLE_KNFSD_AGENT
}
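# The startup script reads these values from the GCE metadata server. As an
# illustrative sketch (not a verbatim quote from proxy-startup.sh), a single
# attribute can be fetched on the instance with:
#   curl -s -H "Metadata-Flavor: Google" \
#     "http://metadata.google.internal/computeMetadata/v1/instance/attributes/EXPORT_MAP"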
scheduling {
automatic_restart = true
on_host_maintenance = "MIGRATE"
preemptible = false
}
dynamic "reservation_affinity" {
for_each = var.RESERVE_KNFSD_CAPACITY ? [1] : []
content {
type = "SPECIFIC_RESERVATION"
specific_reservation {
key = "compute.googleapis.com/reservation-name"
values = [google_compute_reservation.knfsd_reservation[0].name]
}
}
}
# We use a dynamic block for service_account here because we only want to
# assign a service account when metrics are enabled; without metrics the
# instances have no need for one.
dynamic "service_account" {
for_each = local.enable_service_account ? [1] : []
content {
email = var.SERVICE_ACCOUNT
scopes = local.scopes
}
}
lifecycle {
create_before_destroy = true
# Most of these preconditions are conditional. Terraform (as of 1.2) does
# not support an "enabled" or "when" argument on preconditions, so the
# conditions are written in the form:
# condition = (when ? check : true)
#
# This roughly translates to
# if when then
# check precondition
# else
# skip precondition
precondition {
# AUTO_REEXPORT requires an fsid service to be enabled
condition = (
var.AUTO_REEXPORT
? contains(["local", "external"], var.FSID_MODE)
: true
)
error_message = "FSID_MODE must be either \"local\" or \"external\" when AUTO_REEXPORT is enabled."
}
# If this module has created the database, ensure that the user did not try
# to provide their own FSID_DATABASE_CONFIG, as that configuration would be
# ignored.
# This avoids two possible errors:
# * This module ignores the custom configuration, leading to confusion.
# * This module uses the custom configuration, leaving an unused Cloud SQL
#   instance deployed.
precondition {
condition = (
var.FSID_MODE == "external" && var.FSID_DATABASE_DEPLOY
? var.FSID_DATABASE_CONFIG == ""
: true
)
error_message = "Can only provide a custom FSID_DATABASE_CONFIG when using a custom external fsid database (FSID_MODE = \"external\" and FSID_DATABASE_DEPLOY = false)."
}
# Again, to avoid confusion: when not using an external database, do not
# allow the user to provide their own FSID_DATABASE_CONFIG.
precondition {
condition = (
var.FSID_MODE != "external"
? var.FSID_DATABASE_CONFIG == ""
: true
)
error_message = "Can only provide a custom FSID_DATABASE_CONFIG when using a custom external fsid database (FSID_MODE = \"external\" and FSID_DATABASE_DEPLOY = false)."
}
# When using a custom external database, FSID_DATABASE_CONFIG must be provided
precondition {
condition = (
var.FSID_MODE == "external" && !var.FSID_DATABASE_DEPLOY
? var.FSID_DATABASE_CONFIG != ""
: true
)
error_message = "Must specify a database configuration (FSID_DATABASE_CONFIG) when using a custom external database (FSID_MODE = \"external\" and FSID_DATABASE_DEPLOY = false)."
}
# Bug check: This should not occur and indicates a bug in the Terraform script.
# Fail early during terraform plan, otherwise the proxy will deploy and enter
# a reboot loop.
precondition {
condition = (
local.deploy_fsid_database
? local.FSID_DATABASE_CONFIG != ""
: true
)
error_message = "BUG: database configuration not set for external fsid database."
}
# Bug check: This should not occur and indicates a bug in the Terraform script.
# Check that if the script deployed a Cloud SQL database, that database will
# be used by the proxy; otherwise it is just wasting money.
# This check lives here because this is the main proxy validation and the
# most likely place to be updated when changing how FSID_MODE is handled.
# It acts as a cross-check for the local.deploy_fsid_database logic in case
# only one of the two is updated.
precondition {
condition = (
local.deploy_fsid_database
? var.FSID_MODE == "external"
: true
)
error_message = "BUG: deployed Cloud SQL database, but that database is not in use."
}
}
}
# Health check on TCP port 2049 (the standard NFS port), used to monitor NFS health
resource "google_compute_health_check" "autohealing" {
project = var.PROJECT
name = "${var.PROXY_BASENAME}-autohealing-health-check"
check_interval_sec = var.HEALTHCHECK_INTERVAL_SECONDS
timeout_sec = var.HEALTHCHECK_TIMEOUT_SECONDS
healthy_threshold = var.HEALTHCHECK_HEALTHY_THRESHOLD
unhealthy_threshold = var.HEALTHCHECK_UNHEALTHY_THRESHOLD
tcp_health_check {
port = "2049"
}
depends_on = [
# Ensure that the firewall rule is not deleted while the health check
# still exists. Otherwise, when removing clusters, Terraform may delete the
# firewall rule first, causing the health check to fail and the proxy group
# to start replacing instances. Terraform then gets stuck waiting for the
# instance group to complete those changes before it can remove the group.
google_compute_firewall.allow-tcp-healthcheck
]
}
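# To emulate this health check by hand against a single node, a plain TCP
# probe of the NFS port is sufficient, e.g. (illustrative):
#   nc -zv -w 5 <instance-internal-ip> 2049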
# Instance Group Manager for the Knfsd Nodes
resource "google_compute_instance_group_manager" "proxy-group" {
provider = google-beta # required to support stateful_internal_ip
project = var.PROJECT
name = "${var.PROXY_BASENAME}-group"
base_instance_name = var.PROXY_BASENAME
zone = var.ZONE
# Set the target size to null if autoscaling is enabled
target_size = var.ENABLE_KNFSD_AUTOSCALING ? null : var.KNFSD_NODES
# when using static IPs, wait for all the instances to be updated so that the
# IPs of the Compute Instances can be fetched using the instance_ips module.
wait_for_instances = var.ASSIGN_STATIC_IPS
wait_for_instances_status = "UPDATED"
update_policy {
type = "PROACTIVE"
minimal_action = var.MIG_MINIMAL_ACTION
max_unavailable_percent = var.MIG_MAX_UNAVAILABLE_PERCENT
replacement_method = coalesce(var.MIG_REPLACEMENT_METHOD, local.MIG_REPLACEMENT_METHOD_DEFAULT)
}
version {
name = "v1"
instance_template = google_compute_instance_template.nfsproxy-template.self_link
}
# We use a dynamic block for auto_healing_policies here because we only want
# to attach a health check when ENABLE_AUTOHEALING_HEALTHCHECKS is set
dynamic "auto_healing_policies" {
for_each = var.ENABLE_AUTOHEALING_HEALTHCHECKS ? [1] : []
content {
health_check = google_compute_health_check.autohealing.self_link
initial_delay_sec = var.HEALTHCHECK_INITIAL_DELAY_SECONDS
}
}
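# Preserve each node's internal IP (nic0) across autohealing and rolling
# updates when static IPs are assigned. ON_PERMANENT_INSTANCE_DELETION only
# releases the IP when an instance is permanently removed from the group.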
dynamic "stateful_internal_ip" {
for_each = toset(var.ASSIGN_STATIC_IPS ? ["nic0"] : [])
content {
interface_name = stateful_internal_ip.value
delete_rule = "ON_PERMANENT_INSTANCE_DELETION"
}
}
}
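# The progress of a rolling update can be observed outside Terraform; some
# illustrative checks (substitute your own basename and zone):
#   gcloud compute instance-groups managed describe "<PROXY_BASENAME>-group" --zone "<ZONE>"
#   gcloud compute instance-groups managed list-instances "<PROXY_BASENAME>-group" --zone "<ZONE>"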
# Firewall rule to allow healthchecks from the GCP Healthcheck ranges
resource "google_compute_firewall" "allow-tcp-healthcheck" {
# Count determines whether the firewall rule is created automatically:
# 1 rule when var.AUTO_CREATE_FIREWALL_RULES is true, otherwise 0.
count = var.AUTO_CREATE_FIREWALL_RULES ? 1 : 0
project = var.PROJECT
name = "allow-nfs-tcp-healthcheck"
network = local.network
priority = 1000
allow {
protocol = "tcp"
ports = ["2049"]
}
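# 130.211.0.0/22 and 35.191.0.0/16 are Google Cloud's documented health
# check ranges; 209.85.152.0/22 and 209.85.204.0/22 are the legacy ranges
# used by network load balancer health checks.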
source_ranges = ["130.211.0.0/22", "35.191.0.0/16", "209.85.152.0/22", "209.85.204.0/22"]
target_tags = ["knfsd-cache-server"]
}