def run_rule()

in gcpdiag/lint/gke/bp_2023_002_stateful_workloads_not_on_preemptible_node.py [0:0]


def _is_writeable_data_pd(disk) -> bool:
  """Return True if an attached-disk entry is a writeable non-boot PD.

  Missing keys are treated leniently, as in the checks below: an entry
  without 'type' or 'mode' is not excluded, but 'boot' must be present and
  falsy for the disk to count.
  """
  # Anything that is not a persistent disk (e.g. localSSD) is ignored.
  if 'type' in disk and disk['type'] != 'PERSISTENT':
    return False
  # Disks that are not attached writeable are ignored.
  if 'mode' in disk and 'WRITE' not in disk['mode']:
    return False
  return 'boot' in disk and not disk['boot']


def _zone_in_location(zone: str, location: str) -> bool:
  """Return True if `zone` is `location` itself or a zone of that region.

  A plain substring test is not enough here: 'europe-west1' is a substring
  of 'europe-west10-a', so region names that are prefixes of other region
  names would produce false matches.
  """
  # Accept either a short zone name or a '/'-separated path ending in it.
  zone_name = zone.rsplit('/', 1)[-1]
  return zone_name == location or zone_name.startswith(location + '-')


def run_rule(context: models.Context, report: lint.LintReportRuleInterface):
  """Flag GKE clusters that run stateful workloads on preemptible nodes.

  A cluster fails when at least one of its nodes is a preemptible VM with a
  writeable non-boot persistent disk attached. Nodes are matched to clusters
  by the 'cluster-name' instance metadata value and by the node's zone
  belonging to the cluster's location.

  Args:
    context: passed through to the GCE instance and GKE cluster queries.
    report: receives one ok/failed result per cluster, or a single skipped
      result when no clusters are found.
  """
  instances = gce.get_instances(context=context)
  clusters = gke.get_clusters(context=context)

  # Nothing to report on: bail out before scanning instances and disks.
  if not clusters:
    report.add_skipped(None, 'no clusters found')
    return

  failed_clusters = set()
  for i in instances.values():
    if not (i.is_gke_node() and i.is_preemptible_vm()):
      continue
    # A writeable non-boot PD indicates stateful workloads on this node; one
    # such disk is enough, so the cluster lookup runs once per node.
    if not any(_is_writeable_data_pd(d) for d in i.disks):
      continue
    instance_cluster_name = i.get_metadata('cluster-name')
    instance_zone = i.zone
    # NOTE(review): if a multi-zonal cluster reports only its primary zone in
    # `location`, nodes in its other zones will not match here - confirm
    # whether that case needs handling.
    for c in clusters.values():
      if (instance_cluster_name == c.name and
          _zone_in_location(instance_zone, c.location)):
        failed_clusters.add(c)

  for c in sorted(clusters.values(), key=lambda cluster: cluster.short_path):
    if c not in failed_clusters:
      report.add_ok(c)
    else:
      report.add_failed(c, (
          f'Stateful workload is running on preemptible/spot node(s) "{c.name}"'
      ))