terraform/eks/daemon/awsneuron/main.tf (763 lines of code) (raw):

// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: MIT

# Integration-test stack: an EKS cluster with a single node group, a mocked
# neuron monitor DaemonSet served over TLS, the CloudWatch agent DaemonSet, and
# a validator that runs the Go test suite against the resulting cluster.

module "common" {
  source             = "../../../common"
  cwagent_image_repo = var.cwagent_image_repo
  cwagent_image_tag  = var.cwagent_image_tag
}

module "basic_components" {
  source = "../../../basic_components"

  region = var.region
}

data "aws_eks_cluster_auth" "this" {
  name = aws_eks_cluster.this.name
}

resource "aws_eks_cluster" "this" {
  name     = "cwagent-eks-integ-${module.common.testing_id}"
  role_arn = module.basic_components.role_arn
  version  = var.k8s_version
  enabled_cluster_log_types = [
    "api",
    "audit",
    "authenticator",
    "controllerManager",
    "scheduler"
  ]
  vpc_config {
    subnet_ids         = module.basic_components.public_subnet_ids
    security_group_ids = [module.basic_components.security_group]
  }
}

# EKS Node Groups
resource "aws_eks_node_group" "this" {
  cluster_name    = aws_eks_cluster.this.name
  node_group_name = "cwagent-eks-integ-node"
  node_role_arn   = aws_iam_role.node_role.arn
  subnet_ids      = module.basic_components.public_subnet_ids

  scaling_config {
    desired_size = 1
    max_size     = 1
    min_size     = 1
  }

  ami_type       = "AL2_x86_64"
  capacity_type  = "ON_DEMAND"
  disk_size      = 20
  instance_types = ["t3.medium"]

  # The node role must carry all of its policies before nodes try to join.
  depends_on = [
    aws_iam_role_policy_attachment.node_AmazonEC2ContainerRegistryReadOnly,
    aws_iam_role_policy_attachment.node_AmazonEKS_CNI_Policy,
    aws_iam_role_policy_attachment.node_AmazonEKSWorkerNodePolicy,
    aws_iam_role_policy_attachment.node_CloudWatchAgentServerPolicy
  ]
}

# EKS Node IAM Role
resource "aws_iam_role" "node_role" {
  name = "cwagent-eks-Worker-Role-${module.common.testing_id}"

  assume_role_policy = jsonencode({
    Version = "2012-10-17",
    Statement = [
      {
        Effect = "Allow",
        Principal = {
          Service = "ec2.amazonaws.com"
        },
        Action = "sts:AssumeRole"
      }
    ]
  })
}

resource "aws_iam_role_policy_attachment" "node_AmazonEKSWorkerNodePolicy" {
  policy_arn = "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy"
  role       = aws_iam_role.node_role.name
}

resource "aws_iam_role_policy_attachment" "node_AmazonEKS_CNI_Policy" {
  policy_arn = "arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy"
  role       = aws_iam_role.node_role.name
}

resource "aws_iam_role_policy_attachment" "node_AmazonEC2ContainerRegistryReadOnly" {
  policy_arn = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly"
  role       = aws_iam_role.node_role.name
}

resource "aws_iam_role_policy_attachment" "node_CloudWatchAgentServerPolicy" {
  policy_arn = "arn:aws:iam::aws:policy/CloudWatchAgentServerPolicy"
  role       = aws_iam_role.node_role.name
}

# TODO: these security groups should be created once and then reused
# NOTE(review): nothing in this file attaches either group to the cluster or
# the node group (the cluster uses module.basic_components.security_group) --
# confirm whether they are still needed.

# EKS Cluster Security Group
resource "aws_security_group" "eks_cluster_sg" {
  name        = "cwagent-eks-cluster-sg-${module.common.testing_id}"
  description = "Cluster communication with worker nodes"
  vpc_id      = module.basic_components.vpc_id
}

resource "aws_security_group_rule" "cluster_inbound" {
  description              = "Allow worker nodes to communicate with the cluster API Server"
  from_port                = 443
  protocol                 = "tcp"
  security_group_id        = aws_security_group.eks_cluster_sg.id
  source_security_group_id = aws_security_group.eks_nodes_sg.id
  to_port                  = 443
  type                     = "ingress"
}

resource "aws_security_group_rule" "cluster_outbound" {
  description              = "Allow cluster API Server to communicate with the worker nodes"
  from_port                = 1024
  protocol                 = "tcp"
  security_group_id        = aws_security_group.eks_cluster_sg.id
  source_security_group_id = aws_security_group.eks_nodes_sg.id
  to_port                  = 65535
  type                     = "egress"
}

# EKS Node Security Group
resource "aws_security_group" "eks_nodes_sg" {
  name        = "cwagent-eks-node-sg-${module.common.testing_id}"
  description = "Security group for all nodes in the cluster"
  vpc_id      = module.basic_components.vpc_id

  egress {
    from_port   = 0
    to_port     = 0
    protocol    = "-1"
    cidr_blocks = ["0.0.0.0/0"]
  }
}

resource "aws_security_group_rule" "nodes_internal" {
  description              = "Allow nodes to communicate with each other"
  from_port                = 0
  protocol                 = "-1"
  security_group_id        = aws_security_group.eks_nodes_sg.id
  source_security_group_id = aws_security_group.eks_nodes_sg.id
  to_port                  = 65535
  type                     = "ingress"
}

resource "aws_security_group_rule" "nodes_cluster_inbound" {
  description              = "Allow worker Kubelets and pods to receive communication from the cluster control plane"
  from_port                = 1025
  protocol                 = "tcp"
  security_group_id        = aws_security_group.eks_nodes_sg.id
  source_security_group_id = aws_security_group.eks_cluster_sg.id
  to_port                  = 65535
  type                     = "ingress"
}

# create cert for communication between agent and neuron monitor
#
# A self-signed CA signs a server certificate. The server cert/key are mounted
# into the neuron monitor pod and the CA cert into the agent pod (see the
# "neurontls" and "agenttls" volumes below). Copies are also written to
# ${path.module}/certs.
resource "tls_private_key" "private_key" {
  algorithm = "RSA"
}

resource "local_file" "ca_key" {
  content  = tls_private_key.private_key.private_key_pem
  filename = "${path.module}/certs/ca.key"
  # local_file defaults to 0777; a private key should be owner-only.
  file_permission = "0600"
}

resource "tls_self_signed_cert" "ca_cert" {
  private_key_pem   = tls_private_key.private_key.private_key_pem
  is_ca_certificate = true
  subject {
    common_name  = "neuron-monitor-service.amazon-cloudwatch.svc"
    organization = "Amazon CloudWatch Agent"
  }
  validity_period_hours = 24
  allowed_uses = [
    "digital_signature",
    "key_encipherment",
    "cert_signing",
    "crl_signing",
    "server_auth",
    "client_auth",
  ]
}

resource "local_file" "ca_cert_file" {
  content  = tls_self_signed_cert.ca_cert.cert_pem
  filename = "${path.module}/certs/ca.cert"
}

resource "tls_private_key" "server_private_key" {
  algorithm = "RSA"
}

resource "local_file" "server_key" {
  content  = tls_private_key.server_private_key.private_key_pem
  filename = "${path.module}/certs/server.key"
  # local_file defaults to 0777; a private key should be owner-only.
  file_permission = "0600"
}

resource "tls_cert_request" "local_csr" {
  private_key_pem = tls_private_key.server_private_key.private_key_pem
  # NOTE(review): "127.0.0.1" is an IP address listed as a DNS SAN; if clients
  # ever connect by IP it likely belongs in ip_addresses instead -- confirm.
  dns_names = ["localhost", "127.0.0.1", "neuron-monitor-service.amazon-cloudwatch.svc"]
  subject {
    common_name  = "neuron-monitor-service.amazon-cloudwatch.svc"
    organization = "Amazon CloudWatch Agent"
  }
}

resource "tls_locally_signed_cert" "server_cert" {
  cert_request_pem      = tls_cert_request.local_csr.cert_request_pem
  ca_private_key_pem    = tls_private_key.private_key.private_key_pem
  ca_cert_pem           = tls_self_signed_cert.ca_cert.cert_pem
  validity_period_hours = 12
  allowed_uses = [
    "digital_signature",
    "key_encipherment",
    "server_auth",
    "client_auth",
  ]
}

resource "local_file" "server_cert_file" {
  content  = tls_locally_signed_cert.server_cert.cert_pem
  filename = "${path.module}/certs/server.cert"
}

# TLS material shared by both DaemonSets.
resource "kubernetes_secret" "agent_cert" {
  # The secret lives in the amazon-cloudwatch namespace, so that namespace must
  # exist before the secret is created.
  depends_on = [
    kubernetes_namespace.namespace
  ]
  metadata {
    name      = "amazon-cloudwatch-observability-agent-cert"
    namespace = "amazon-cloudwatch"
  }
  data = {
    "ca.crt"  = tls_self_signed_cert.ca_cert.cert_pem               #filebase64(local_file.ca_cert_file.filename)
    "tls.crt" = tls_locally_signed_cert.server_cert.cert_pem        #filebase64(local_file.server_cert_file.filename)
    "tls.key" = tls_private_key.server_private_key.private_key_pem  #filebase64(local_file.server_key.filename)
  }
}

resource "kubernetes_namespace" "namespace" {
  metadata {
    name = "amazon-cloudwatch"
  }
}

# Neuron monitor configuration, rendered as monitor.json.
resource "kubernetes_config_map" "neuron_monitor_config_map" {
  depends_on = [
    kubernetes_namespace.namespace
  ]
  metadata {
    name      = "neuron-monitor-config-map"
    namespace = "amazon-cloudwatch"
  }
  data = {
    "monitor.json" = jsonencode({
      period = "5s"
      neuron_runtimes = [
        {
          tag_filter : ".*"
          metrics = [
            {
              type = "neuroncore_counters"
            },
            {
              type = "memory_used"
            },
            {
              type = "neuron_runtime_vcpu_usage"
            },
            {
              type = "execution_stats"
            }
          ]
        }
      ]
      system_metrics = [
        {
          type = "memory_info"
        },
        {
          period = "5s"
          type   = "neuron_hw_counters"
        }
      ]
    })
  }
}

resource "kubernetes_service_account" "neuron_monitor_service_account" {
  depends_on = [
    kubernetes_namespace.namespace
  ]
  metadata {
    name      = "neuron-monitor-service-acct"
    namespace = "amazon-cloudwatch"
  }
}

# Lets the neuron monitor service account read only its own ConfigMap.
resource "kubernetes_role" "neuron_monitor_role" {
  depends_on = [
    kubernetes_namespace.namespace,
    kubernetes_service_account.neuron_monitor_service_account,
    kubernetes_config_map.neuron_monitor_config_map
  ]
  metadata {
    name      = "neuron-monitor-role"
    namespace = "amazon-cloudwatch"
  }
  rule {
    api_groups     = [""]
    resources      = ["configmaps"]
    resource_names = ["neuron-monitor-config-map"]
    verbs          = ["get"]
  }
}

resource "kubernetes_role_binding" "neuron_monitor_role_binding" {
  depends_on = [
    kubernetes_namespace.namespace,
    kubernetes_service_account.neuron_monitor_service_account,
    kubernetes_role.neuron_monitor_role
  ]
  metadata {
    namespace = "amazon-cloudwatch"
    name      = "neuron-monitor-role-binding"
  }
  role_ref {
    kind      = "Role"
    name      = "neuron-monitor-role"
    api_group = "rbac.authorization.k8s.io"
  }
  subject {
    kind      = "ServiceAccount"
    name      = "neuron-monitor-service-acct"
    namespace = "amazon-cloudwatch"
  }
}

# Mocked neuron monitor: serves on port 8000 using the server cert/key from the
# agent_cert secret.
resource "kubernetes_daemonset" "neuron_monitor" {
  depends_on = [
    kubernetes_namespace.namespace,
    kubernetes_service_account.neuron_monitor_service_account,
    kubernetes_role.neuron_monitor_role,
    kubernetes_role_binding.neuron_monitor_role_binding,
    kubernetes_config_map.neuron_monitor_config_map,
    # The "neurontls" volume is backed by this secret; pods cannot start
    # until it exists.
    kubernetes_secret.agent_cert
  ]
  metadata {
    name      = "neuron-monitor"
    namespace = "amazon-cloudwatch"
    labels = {
      k8s-app = "neuron-monitor"
      version = "v1"
    }
  }
  spec {
    selector {
      match_labels = {
        k8s-app = "neuron-monitor"
      }
    }
    template {
      metadata {
        labels = {
          k8s-app = "neuron-monitor"
          version = "v1"
        }
      }
      spec {
        affinity {
          node_affinity {
            required_during_scheduling_ignored_during_execution {
              node_selector_term {
                match_expressions {
                  key      = "kubernetes.io/os"
                  operator = "In"
                  values   = ["linux"]
                }
              }
            }
          }
        }
        container {
          name  = "neuron-monitor-prometheus"
          image = "506463145083.dkr.ecr.us-west-2.amazonaws.com/mocked-neuron-monitor:v2"
          port {
            container_port = 8000
          }
          command = [
            "/bin/sh",
            "-c",
            "/opt/aws/neuron/bin/dummy_neuron_monitor.py --port 8000 --cert-file /etc/amazon-cloudwatch-observability-neuron-cert/server.crt --key-file /etc/amazon-cloudwatch-observability-neuron-cert/server.key"
          ]
          resources {
            limits = {
              cpu    = "500m"
              memory = "256Mi"
            }
            requests = {
              cpu    = "256m"
              memory = "128Mi"
            }
          }
          security_context {
            privileged = true
          }
          env {
            name = "NODE_NAME"
            value_from {
              field_ref {
                field_path = "spec.nodeName"
              }
            }
          }
          env {
            name  = "PATH"
            value = "/usr/local/bin:/usr/bin:/bin:/opt/aws/neuron/bin"
          }
          volume_mount {
            mount_path = "/etc/amazon-cloudwatch-observability-neuron-cert/"
            name       = "neurontls"
            read_only  = true
          }
          volume_mount {
            mount_path = "/etc/neuron-monitor-config/"
            name       = "neuron-monitor-config"
            read_only  = true
          }
        }
        # Projects tls.crt/tls.key under the file names the command line expects.
        volume {
          name = "neurontls"
          secret {
            secret_name = "amazon-cloudwatch-observability-agent-cert"
            items {
              key  = "tls.crt"
              path = "server.crt"
            }
            items {
              key  = "tls.key"
              path = "server.key"
            }
          }
        }
        volume {
          name = "neuron-monitor-config"
          config_map {
            name = "neuron-monitor-config-map"
          }
        }
        service_account_name = "neuron-monitor-service-acct"
      }
    }
  }
}

# ClusterIP service in front of the neuron monitor pods.
resource "kubernetes_service" "neuron_monitor_service" {
  depends_on = [
    kubernetes_namespace.namespace,
    kubernetes_service_account.cwagentservice,
    aws_eks_node_group.this,
    kubernetes_daemonset.neuron_monitor
  ]
  metadata {
    name      = "neuron-monitor-service"
    namespace = "amazon-cloudwatch"
    labels = {
      "k8s-app" : "neuron-monitor-service"
    }
    annotations = {
      "prometheus.io/scrape" : "true"
    }
  }
  spec {
    type = "ClusterIP"
    selector = {
      k8s-app = "neuron-monitor"
    }
    port {
      name        = "metrics"
      port        = 8000
      target_port = 8000
      protocol    = "TCP"
    }
    internal_traffic_policy = "Local"
  }
}

# CloudWatch agent DaemonSet.
resource "kubernetes_daemonset" "service" {
  depends_on = [
    kubernetes_namespace.namespace,
    kubernetes_service_account.cwagentservice,
    aws_eks_node_group.this,
    kubernetes_daemonset.neuron_monitor,
    # The "cwagentconfig" and "agenttls" volumes are backed by these objects;
    # pods cannot start until both exist.
    kubernetes_config_map.cwagentconfig,
    kubernetes_secret.agent_cert
  ]
  metadata {
    name      = "cloudwatch-agent"
    namespace = "amazon-cloudwatch"
  }
  spec {
    selector {
      match_labels = {
        "name" : "cloudwatch-agent"
      }
    }
    template {
      metadata {
        labels = {
          "name" : "cloudwatch-agent"
        }
      }
      spec {
        node_selector = {
          "kubernetes.io/os" : "linux"
        }
        container {
          name              = "cwagent"
          image             = "${var.cwagent_image_repo}:${var.cwagent_image_tag}"
          image_pull_policy = "Always"
          resources {
            limits = {
              "cpu" : "200m",
              "memory" : "200Mi"
            }
            requests = {
              "cpu" : "200m",
              "memory" : "200Mi"
            }
          }
          port {
            container_port = 25888
            host_port      = 25888
            protocol       = "UDP"
          }
          env {
            name = "HOST_IP"
            value_from {
              field_ref {
                field_path = "status.hostIP"
              }
            }
          }
          env {
            name = "HOST_NAME"
            value_from {
              field_ref {
                field_path = "spec.nodeName"
              }
            }
          }
          env {
            name = "K8S_NAMESPACE"
            value_from {
              field_ref {
                field_path = "metadata.namespace"
              }
            }
          }
          volume_mount {
            mount_path = "/etc/cwagentconfig"
            name       = "cwagentconfig"
          }
          volume_mount {
            mount_path = "/rootfs"
            name       = "rootfs"
            read_only  = true
          }
          volume_mount {
            mount_path = "/var/run/docker.sock"
            name       = "dockersock"
            read_only  = true
          }
          volume_mount {
            mount_path = "/var/lib/docker"
            name       = "varlibdocker"
            read_only  = true
          }
          volume_mount {
            mount_path = "/run/containerd/containerd.sock"
            name       = "containerdsock"
            read_only  = true
          }
          volume_mount {
            mount_path = "/sys"
            name       = "sys"
            read_only  = true
          }
          volume_mount {
            mount_path = "/dev/disk"
            name       = "devdisk"
            read_only  = true
          }
          volume_mount {
            mount_path = "/etc/amazon-cloudwatch-observability-agent-cert"
            name       = "agenttls"
            read_only  = true
          }
          volume_mount {
            mount_path = "/var/lib/kubelet/pod-resources"
            name       = "kubelet-podresources"
            read_only  = true
          }
        }
        volume {
          name = "cwagentconfig"
          config_map {
            name = "cwagentconfig"
          }
        }
        volume {
          name = "rootfs"
          host_path {
            path = "/"
          }
        }
        volume {
          name = "dockersock"
          host_path {
            path = "/var/run/docker.sock"
          }
        }
        volume {
          name = "varlibdocker"
          host_path {
            path = "/var/lib/docker"
          }
        }
        volume {
          name = "containerdsock"
          host_path {
            path = "/run/containerd/containerd.sock"
          }
        }
        volume {
          name = "sys"
          host_path {
            path = "/sys"
          }
        }
        volume {
          name = "devdisk"
          host_path {
            path = "/dev/disk"
          }
        }
        volume {
          name = "kubelet-podresources"
          host_path {
            path = "/var/lib/kubelet/pod-resources"
          }
        }
        # Only the CA certificate is exposed to the agent, as tls-ca.crt.
        volume {
          name = "agenttls"
          secret {
            secret_name = "amazon-cloudwatch-observability-agent-cert"
            items {
              key  = "ca.crt"
              path = "tls-ca.crt"
            }
          }
        }
        service_account_name             = "cloudwatch-agent"
        termination_grace_period_seconds = 60
      }
    }
  }
}

##########################################
# Template Files
##########################################
locals {
  httpd_config     = "../../../../${var.test_dir}/resources/httpd.conf"
  httpd_ssl_config = "../../../../${var.test_dir}/resources/httpd-ssl.conf"
  # Use the test's own agent config when it ships one, otherwise the default.
  cwagent_config = fileexists("../../../../${var.test_dir}/resources/config.json") ? "../../../../${var.test_dir}/resources/config.json" : "../default_resources/default_amazon_cloudwatch_agent.json"
}

# NOTE(review): the template_file data source comes from the archived
# hashicorp/template provider; consider migrating to templatefile().
data "template_file" "cwagent_config" {
  template = file(local.cwagent_config)
  vars = {
  }
}

resource "kubernetes_config_map" "cwagentconfig" {
  depends_on = [
    kubernetes_namespace.namespace,
    kubernetes_service_account.cwagentservice
  ]
  metadata {
    name      = "cwagentconfig"
    namespace = "amazon-cloudwatch"
  }
  data = {
    "cwagentconfig.json" : data.template_file.cwagent_config.rendered
  }
}

data "template_file" "httpd_config" {
  template = file(local.httpd_config)
  vars     = {}
}

data "template_file" "httpd_ssl_config" {
  template = file(local.httpd_ssl_config)
  vars     = {}
}

resource "kubernetes_config_map" "httpdconfig" {
  depends_on = [
    kubernetes_namespace.namespace,
    kubernetes_service_account.cwagentservice
  ]
  metadata {
    name      = "httpdconfig"
    namespace = "amazon-cloudwatch"
  }
  data = {
    "httpd.conf" : data.template_file.httpd_config.rendered
    "httpd-ssl.conf" : data.template_file.httpd_ssl_config.rendered
  }
}

resource "kubernetes_service_account" "cwagentservice" {
  depends_on = [kubernetes_namespace.namespace]
  metadata {
    name      = "cloudwatch-agent"
    namespace = "amazon-cloudwatch"
  }
}

# Cluster-wide permissions for the cloudwatch-agent service account.
resource "kubernetes_cluster_role" "clusterrole" {
  depends_on = [kubernetes_namespace.namespace]
  metadata {
    name = "cloudwatch-agent-role"
  }
  rule {
    verbs = ["get", "list", "watch"]
    # NOTE(review): the Kubernetes log subresource is "pods/log"; confirm
    # whether "pods/logs" is intentional.
    resources  = ["pods", "pods/logs", "nodes", "nodes/proxy", "namespaces", "endpoints"]
    api_groups = [""]
  }
  rule {
    verbs      = ["list", "watch"]
    resources  = ["replicasets"]
    api_groups = ["apps"]
  }
  rule {
    verbs      = ["list", "watch"]
    resources  = ["jobs"]
    api_groups = ["batch"]
  }
  rule {
    verbs      = ["get"]
    resources  = ["nodes/proxy"]
    api_groups = [""]
  }
  rule {
    verbs      = ["create"]
    resources  = ["nodes/stats", "configmaps", "events"]
    api_groups = [""]
  }
  rule {
    verbs          = ["get", "update"]
    resource_names = ["cwagent-clusterleader"]
    resources      = ["configmaps"]
    api_groups     = [""]
  }
  rule {
    verbs          = ["get"]
    resource_names = ["neuron-monitor-config-map"]
    resources      = ["configmaps"]
    api_groups     = [""]
  }
  rule {
    verbs      = ["list", "watch"]
    resources  = ["services"]
    api_groups = [""]
  }
  rule {
    non_resource_urls = ["/metrics"]
    verbs             = ["get", "list", "watch"]
  }
  rule {
    verbs      = ["list", "watch", "get"]
    resources  = ["endpointslices"]
    api_groups = ["discovery.k8s.io"]
  }
}

resource "kubernetes_cluster_role_binding" "rolebinding" {
  depends_on = [kubernetes_namespace.namespace]
  metadata {
    name = "cloudwatch-agent-role-binding"
  }
  role_ref {
    api_group = "rbac.authorization.k8s.io"
    kind      = "ClusterRole"
    name      = "cloudwatch-agent-role"
  }
  subject {
    kind      = "ServiceAccount"
    name      = "cloudwatch-agent"
    namespace = "amazon-cloudwatch"
  }
}

# Runs the Go validation suite once the cluster workloads are in place.
resource "null_resource" "validator" {
  depends_on = [
    aws_eks_node_group.this,
    kubernetes_daemonset.service,
    # The binding refers to the ClusterRole by name only, so depend on the role
    # itself to make sure the agent's permissions exist before the test starts.
    kubernetes_cluster_role.clusterrole,
    kubernetes_cluster_role_binding.rolebinding,
    kubernetes_service_account.cwagentservice,
    # Nothing else orders the neuron monitor Service ahead of the test run.
    kubernetes_service.neuron_monitor_service,
  ]
  provisioner "local-exec" {
    command = <<-EOT
      echo "Validating EKS metrics/logs for AWS Neuron"
      cd ../../../..
      go test -timeout 30m ${var.test_dir} -eksClusterName=${aws_eks_cluster.this.name} -computeType=EKS -v -eksDeploymentStrategy=DAEMON
    EOT
  }
}