community/lustre/lustre.jinja (308 lines of code) (raw):

# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

{#
  Deployment Manager template for a Lustre cluster.

  Renders, in order:
    * a network, a subnet and two firewall rules (only when no existing VPC
      is used),
    * a Cloud Router with a NAT (only when nodes have no external IPs and no
      existing VPC is used),
    * mds_node_count MDS/MGS instances, oss_node_count OSS instances and
      hsm_node_count HSM data mover instances.

  Every instance receives scripts/startup-script.sh as its startup script,
  with the @PLACEHOLDER@ tokens substituted from the deployment properties.

  Notes on writing comments in this file: YAML "#" comments are still run
  through Jinja, so template tags must never appear in them; notes about the
  template logic therefore use Jinja comments like this one.
#}

{# Short aliases for values used throughout the template. #}
{% set cluster = properties["cluster_name"] %}
{% set zone = properties["zone"] %}
{% set compute_api = "https://www.googleapis.com/compute/v1/projects/" %}
{% set zone_url = compute_api ~ env["project"] ~ "/zones/" ~ zone %}

# Determine region
{% set region_ext = zone.split('-')[0] ~ '-' ~ zone.split('-')[1] %}

{#
  Persistent disk limits applied below: total GB attached to one VM and GB
  per single disk. max_gb_per_localssd is the local SSD ceiling; local SSD
  sizing divides the requested size by 375 (GB per local SSD device) and
  steps to 16 and then 24 devices above 3000 GB and 6000 GB.
#}
{% set max_gb_per_vm = 263168 %}
{% set max_gb_per_pd = 64000 %}
{% set max_gb_per_localssd = 9000 %}

{#
  An existing VPC is used only when BOTH vpc_net and vpc_subnet are given.
  This is the rule the instances have always applied when choosing their
  subnetwork, so the network, subnet, firewall and NAT resources are gated on
  the same flag. Gating them on the two properties separately produced
  references to resources that were never created when only one was set.
#}
{% set use_existing_vpc = properties['vpc_net'] and properties['vpc_subnet'] %}

{# Subnetwork every instance attaches to. #}
{% if use_existing_vpc and properties['shared_vpc_host_proj'] %}
{% set subnet_link = compute_api ~ properties["shared_vpc_host_proj"] ~ "/regions/" ~ region_ext ~ "/subnetworks/" ~ properties["vpc_subnet"] %}
{% elif use_existing_vpc %}
{% set subnet_link = compute_api ~ env["project"] ~ "/regions/" ~ region_ext ~ "/subnetworks/" ~ properties["vpc_subnet"] %}
{% else %}
{% set subnet_link = "$(ref." ~ cluster ~ "-lustre-subnet.selfLink)" %}
{% endif %}

{# Instances get an external IP when requested or when on a shared VPC. #}
{% set wants_external_ip = properties['external_ips'] or properties['shared_vpc_host_proj'] %}

{#
  Static node IPs are built from the first three octets of the cidr property
  plus a per-node last octet. The prefix is computed whenever a range start
  is given; it used to be set only while creating a new VPC, which left it
  undefined (and failed expansion) for static IPs on an existing VPC.
#}
{% if properties['mds_ip_range_start'] or properties['oss_ip_range_start'] %}
{% set subnet_prefix = properties['cidr'].split('/')[0].split('.')[:3]|join('.') %}
{% endif %}

resources:

# If we're not using an existing VPC, create a VPC, subnet and firewall rules
{% if not use_existing_vpc %}
# Create a network for the new cluster
- name: {{ cluster }}-lustre-network
  type: compute.v1.network
  properties:
    autoCreateSubnetworks: false

# Create a subnet for the new cluster
- name: {{ cluster }}-lustre-subnet
  type: compute.v1.subnetwork
  properties:
    network: $(ref.{{ cluster }}-lustre-network.selfLink)
    ipCidrRange: {{ properties["cidr"] }}
    region: {{ region_ext }}
    privateIpGoogleAccess: true

# Create a firewall rule to allow SSH to the cluster
- name: {{ cluster }}-ssh-firewall-rule
  type: compute.v1.firewall
  properties:
    network: $(ref.{{ cluster }}-lustre-network.selfLink)
    sourceRanges: ["0.0.0.0/0"]
    allowed:
    - IPProtocol: TCP
      ports: ["22"]
    - IPProtocol: ICMP

# Create firewall rule to allow all internal traffic between nodes
- name: {{ cluster }}-all-internal-firewall-rule
  type: compute.v1.firewall
  properties:
    network: $(ref.{{ cluster }}-lustre-network.selfLink)
    sourceRanges: [{{ properties["cidr"] }}]
    allowed:
    - IPProtocol: TCP
      ports: ["0-65535"]
    - IPProtocol: UDP
      ports: ["0-65535"]
    - IPProtocol: ICMP
{% endif %}

{#
Disabled resource, kept for reference only and never rendered: a default
(0.0.0.0/0) route through the "-mgs" instance for nodes tagged "es-noip".
It was previously hidden behind YAML "#" markers, which still let Jinja
evaluate the tags inside it.

{% if not properties['external_compute_ips'] %}
- name: {{properties["cluster_name"]}}-no-ip-internet-route
  type: compute.v1.route
  properties:
    network: $(ref.{{properties["cluster_name"]}}-exascaler-network.selfLink)
    tags: ["es-noip"]
    destRange: 0.0.0.0/0
    nextHopInstance: https://www.googleapis.com/compute/v1/projects/{{ env["project"] }}/zones/{{ properties["zone"] }}/instances/{{properties["cluster_name"]}}-mgs
    priority: 800
  metadata:
    dependsOn:
    - {{properties["cluster_name"]}}-mgs
{% endif %}
#}

# If we're not using external IPs or an existing VPC, create a Cloud NAT
{% if not properties['external_ips'] and not use_existing_vpc %}
- name: {{ cluster }}-router
  type: compute.beta.router
  properties:
    network: $(ref.{{ cluster }}-lustre-network.selfLink)
    region: {{ region_ext }}
    nats:
    - name: {{ cluster }}-nat
      natIpAllocateOption: "AUTO_ONLY"
      sourceSubnetworkIpRangesToNat: "LIST_OF_SUBNETWORKS"
      subnetworks:
      - name: $(ref.{{ cluster }}-lustre-subnet.selfLink)
        sourceIpRangesToNat: ["PRIMARY_IP_RANGE"]
{% endif %}

# Create N MDS/MGS nodes
{% for n in range(properties['mds_node_count']) %}
- name: {{ cluster }}-mds{{ n + 1 }}
  type: compute.v1.instance
{#
  Deployment Manager reads dependsOn from the resource-level metadata block,
  not from the instance metadata under properties where it used to sit next
  to the startup script. The first MDS is skipped so that it does not depend
  on itself.
#}
{% if n > 0 %}
  metadata:
    dependsOn:
    - {{ cluster }}-mds1
{% endif %}
  properties:
    zone: {{ zone }}
    machineType: {{ zone_url }}/machineTypes/{{ properties["mds_machine_type"] }}
    disks:
    - deviceName: boot
      type: PERSISTENT
      boot: true
      autoDelete: true
      initializeParams:
        sourceImage: https://www.googleapis.com/compute/v1/projects/centos-cloud/global/images/family/centos-7
        diskType: {{ zone_url }}/diskTypes/{{ properties["mds_boot_disk_type"] }}
        diskSizeGb: {{ properties["mds_boot_disk_size_gb"] }}
{% if properties["mdt_disk_type"] == "local-ssd" %}
{#
  Local SSD MDTs: the device count is derived from the requested size, which
  is coerced to an integer once so that the comparisons and the division see
  the same value. The mdt_per_mds property is not consulted in this branch.
#}
{% set mdt_size_gb = properties["mdt_disk_size_gb"]|int %}
{% if mdt_size_gb <= 3000 %}
{% set mdt_per_mds = (mdt_size_gb / 375)|round(0, 'ceil')|int %}
{% elif mdt_size_gb <= 6000 %}
{% set mdt_per_mds = 16 %}
{% else %}
{% set mdt_per_mds = 24 %}
{% endif %}
{% for i in range(mdt_per_mds) %}
    - type: SCRATCH
      autoDelete: true
      interface: NVME
      initializeParams:
        diskType: {{ zone_url }}/diskTypes/{{ properties["mdt_disk_type"] }}
{% endfor %}
{% else %}
{#
  Persistent disk MDTs: cap each disk at max_gb_per_pd, then lower the disk
  count if the total would exceed max_gb_per_vm.
#}
{% set mdt_per_mds = properties["mdt_per_mds"] %}
{% set mdt_disk_size_gb = max_gb_per_pd if properties["mdt_disk_size_gb"] > max_gb_per_pd else properties["mdt_disk_size_gb"] %}
{% if mdt_per_mds * mdt_disk_size_gb > max_gb_per_vm %}
{% set mdt_per_mds = (max_gb_per_vm / mdt_disk_size_gb)|round(0, 'floor')|int %}
{% endif %}
{% for i in range(mdt_per_mds) %}
    - deviceName: mdt-{{ i + 1 }}
      type: PERSISTENT
      autoDelete: true
      initializeParams:
        diskName: {{ cluster }}-mdt{{ n + 1 }}-{{ i + 1 }}
        diskType: {{ zone_url }}/diskTypes/{{ properties["mdt_disk_type"] }}
        diskSizeGb: {{ mdt_disk_size_gb }}
{% endfor %}
{% endif %}
{% if not properties['external_ips'] %}
    canIpForward: true
{% endif %}
    networkInterfaces:
    - subnetwork: {{ subnet_link }}
{% if wants_external_ip %}
      accessConfigs:
      - name: External NAT
        type: ONE_TO_ONE_NAT
{% endif %}
# If the MDS IP is specified, use that
{% if properties['mds_ip_range_start'] %}
{% set mds_last_octet = (properties['mds_ip_range_start'].split('.')[3]|int) + n %}
      networkIP: {{ subnet_prefix ~ '.' ~ mds_last_octet }}
{% endif %}
    serviceAccounts:
    - email: "default"
      scopes:
      - "https://www.googleapis.com/auth/cloud-platform"
{#
  The script is indented by 12 to match the column of the expression below,
  which keeps every line inside the YAML block scalar.
  NOTE(review): the MDS script receives the raw ost_per_oss property, not the
  effective OST count computed for the OSS nodes; confirm the startup script
  does not rely on that value on MDS nodes.
#}
    metadata:
      items:
      - key: startup-script
        value: |
            {{ imports["scripts/startup-script.sh"]
               |indent(12)
               |replace("@CLUSTER_NAME@", properties["cluster_name"])
               |replace("@FS_NAME@", properties["fs_name"])
               |replace("@LUSTRE_VERSION@", properties["lustre_version"])
               |replace("@E2FS_VERSION@", properties["e2fs_version"])
               |replace("@NODE_ROLE@", "MDS")
               |replace("@MDT_PER_MDS@", mdt_per_mds)
               |replace("@OST_PER_OSS@", properties["ost_per_oss"])
               |replace("@OST_DISK_TYPE@", properties["ost_disk_type"])
               |replace("@MDT_DISK_TYPE@", properties["mdt_disk_type"]) }}
      - key: enable-oslogin
        value: "TRUE"
{% endfor %}

# Create N OSS nodes
{% for n in range(properties['oss_node_count']) %}
- name: {{ cluster }}-oss{{ n + 1 }}
  type: compute.v1.instance
  properties:
    zone: {{ zone }}
    machineType: {{ zone_url }}/machineTypes/{{ properties["oss_machine_type"] }}
    disks:
    - deviceName: boot
      type: PERSISTENT
      boot: true
      autoDelete: true
      initializeParams:
        sourceImage: https://www.googleapis.com/compute/v1/projects/centos-cloud/global/images/family/centos-7
        diskType: {{ zone_url }}/diskTypes/{{ properties["oss_boot_disk_type"] }}
        diskSizeGb: {{ properties["oss_boot_disk_size_gb"] }}
{% if properties["ost_disk_type"] == "local-ssd" %}
{#
  Local SSD OSTs: same sizing rule as the local SSD MDTs. The size is coerced
  to an integer once; the ost_per_oss property is not consulted here.
#}
{% set ost_size_gb = properties["ost_disk_size_gb"]|int %}
{% if ost_size_gb <= 3000 %}
{% set ost_per_oss = (ost_size_gb / 375)|round(0, 'ceil')|int %}
{% elif ost_size_gb <= 6000 %}
{% set ost_per_oss = 16 %}
{% else %}
{% set ost_per_oss = 24 %}
{% endif %}
{% for i in range(ost_per_oss) %}
    - type: SCRATCH
      autoDelete: true
      interface: NVME
      initializeParams:
        diskType: {{ zone_url }}/diskTypes/{{ properties["ost_disk_type"] }}
{% endfor %}
{% else %}
{#
  Persistent disk OSTs: cap each disk at max_gb_per_pd, then lower the disk
  count if the total would exceed max_gb_per_vm.
#}
{% set ost_per_oss = properties["ost_per_oss"] %}
{% set ost_disk_size_gb = max_gb_per_pd if properties["ost_disk_size_gb"] > max_gb_per_pd else properties["ost_disk_size_gb"] %}
{% if ost_per_oss * ost_disk_size_gb > max_gb_per_vm %}
{% set ost_per_oss = (max_gb_per_vm / ost_disk_size_gb)|round(0, 'floor')|int %}
{% endif %}
{% for i in range(ost_per_oss) %}
    - deviceName: ost-{{ i + 1 }}
      type: PERSISTENT
      autoDelete: true
      initializeParams:
        diskName: {{ cluster }}-ost{{ n + 1 }}-{{ i + 1 }}
        diskType: {{ zone_url }}/diskTypes/{{ properties["ost_disk_type"] }}
        diskSizeGb: {{ ost_disk_size_gb }}
{% endfor %}
{% endif %}
    networkInterfaces:
    - subnetwork: {{ subnet_link }}
{% if wants_external_ip %}
      accessConfigs:
      - name: External NAT
        type: ONE_TO_ONE_NAT
{% endif %}
# If the range start is set, create OSS nodes with consistently sequential IPs. Otherwise use DHCP
{% if properties['oss_ip_range_start'] %}
{% set oss_last_octet = (properties['oss_ip_range_start'].split('.')[3]|int) + n %}
      networkIP: {{ subnet_prefix ~ '.' ~ oss_last_octet }}
{% endif %}
    serviceAccounts:
    - email: "default"
      scopes:
      - "https://www.googleapis.com/auth/cloud-platform"
{#
  NOTE(review): the OSS script receives the raw mdt_per_mds property, not the
  effective MDT count computed for the MDS nodes; confirm the startup script
  does not rely on that value on OSS nodes.
#}
    metadata:
      items:
      - key: startup-script
        value: |
            {{ imports["scripts/startup-script.sh"]
               |indent(12)
               |replace("@CLUSTER_NAME@", properties["cluster_name"])
               |replace("@FS_NAME@", properties["fs_name"])
               |replace("@LUSTRE_VERSION@", properties["lustre_version"])
               |replace("@E2FS_VERSION@", properties["e2fs_version"])
               |replace("@NODE_ROLE@", "OSS")
               |replace("@OST_PER_OSS@", ost_per_oss)
               |replace("@MDT_PER_MDS@", properties["mdt_per_mds"])
               |replace("@OST_DISK_TYPE@", properties["ost_disk_type"])
               |replace("@MDT_DISK_TYPE@", properties["mdt_disk_type"]) }}
{% endfor %}

# Create N Lustre HSM Data Movers nodes
{% for n in range(properties['hsm_node_count']) %}
- name: {{ cluster }}-hsm{{ n + 1 }}
  type: compute.v1.instance
  properties:
    zone: {{ zone }}
    machineType: {{ zone_url }}/machineTypes/{{ properties["hsm_machine_type"] }}
    disks:
    - deviceName: boot
      type: PERSISTENT
      boot: true
      autoDelete: true
      initializeParams:
        sourceImage: https://www.googleapis.com/compute/v1/projects/centos-cloud/global/images/family/centos-7
        diskType: {{ zone_url }}/diskTypes/{{ properties["hsm_boot_disk_type"] }}
        diskSizeGb: {{ properties["hsm_boot_disk_size_gb"] }}
    networkInterfaces:
    - subnetwork: {{ subnet_link }}
{% if wants_external_ip %}
      accessConfigs:
      - name: External NAT
        type: ONE_TO_ONE_NAT
{% endif %}
    serviceAccounts:
    - email: "default"
      scopes:
      - "https://www.googleapis.com/auth/cloud-platform"
    metadata:
      items:
      - key: startup-script
        value: |
            {{ imports["scripts/startup-script.sh"]
               |indent(12)
               |replace("@CLUSTER_NAME@", properties["cluster_name"])
               |replace("@FS_NAME@", properties["fs_name"])
               |replace("@LUSTRE_VERSION@", properties["lustre_version"])
               |replace("@E2FS_VERSION@", properties["e2fs_version"])
               |replace("@NODE_ROLE@", "HSM")
               |replace("@HSM_GCS_BUCKET@", properties["hsm_gcs_bucket"])
               |replace("@HSM_GCS_BUCKET_IMPORT@", properties["hsm_gcs_bucket_import"]) }}
{% endfor %}