def execute()

in gcpdiag/runbook/gke/node_bootstrapping.py [0:0]


  def execute(self):
    """Verify whether Node Registration Checker (NRC) completed running for the node.

    Reads PROJECT_ID, LOCATION, NODE and START_TIME/END_TIME from the runbook flags and
    reports exactly one outcome through `op`:

    - skipped: no node name was provided.
    - failed: the instance booted less than NODE_BOOT_NCR_READY_MIN_TIME minutes ago, the
      node's service account lacks roles/logging.logWriter, START_TIME is later than the
      instance boot time, or NRC reported a failed / missing registration.
    - ok: a "Node ready and registered." serial console log entry was found; the entry is
      included in the reason as proof.

    When registration failed, the NRC summary extracted from the serial console logs is
    attached to the failure reason to help understand why.
    """
    project = op.get(flags.PROJECT_ID)
    project_path = crm.get_project(project)
    location = op.get(flags.LOCATION)
    node = op.get(flags.NODE)

    if not node:
      op.add_skipped(
          project_path,
          reason=
          ('No node name provided, skipping this step .\n'
           'Please provide node name (-p node=<nodename>) if the node appears in the nodepool, '
           'but fails registration.\n'))
      return

    # default filter that is used in all log searches
    default_filter = '\n'.join([
        'resource.type="gce_instance"', f'resource.labels.zone="{location}"',
        'log_id("serialconsole.googleapis.com/serial_port_1_output")',
        'textPayload:"node-registration-checker.sh"'
    ])
    registered_clause = 'textPayload:"Node ready and registered."'
    completed_clause = 'textPayload:"Completed running Node Registration Checker"'
    # selects log entries by node name, i.e. across every instance that carried this name
    by_name = f'labels."compute.googleapis.com/resource_name"="{node}"'

    def query_logs(selector, *extra_clauses):
      # every search is "<selector> + default filter + optional textPayload clause"
      return local_realtime_query('\n'.join(
          [selector, default_filter, *extra_clauses]))

    # get node instance
    node_vm = get_node_instance(project, location, node)
    node_is_running = bool(node_vm and node_vm.is_running)

    node_start_time = None
    if node_is_running:
      # creation_timestamp is offset-naive and is interpreted as UTC below when compared with
      # START_TIME, so build the offset-aware value once and do all time arithmetic in UTC.
      # Comparing against the naive local datetime.now() would skew the result by the local
      # UTC offset.
      node_start_time = datetime.fromisoformat(str(
          node_vm.creation_timestamp)).replace(tzinfo=timezone.utc)

      # Check if NODE_BOOT_NCR_READY_MIN_TIME minutes (should be at least 7 minutes) passed
      # since the instance was booted, to make sure there was enough time for Node Registration
      # Checker to finish running
      node_age = datetime.now(timezone.utc) - node_start_time
      if node_age < timedelta(minutes=self.NODE_BOOT_NCR_READY_MIN_TIME):
        op.add_failed(
            project_path,
            reason=
            (f'Instance {node} with instance-id {node_vm.id} in location {location} just booted '
             f'at {node_vm.creation_timestamp}.'),
            remediation=
            (f'Please allow for at least {self.NODE_BOOT_NCR_READY_MIN_TIME} minutes since '
             'starting the instance to allow for Node Registration Checker to finish running.'
            ))
        return

    if node_vm:
      # check node service account has logging write permissions; without them the NRC output
      # never reaches Cloud Logging and none of the searches below can succeed
      iam_policy = iam.get_project_policy(project)
      if not iam_policy.has_role_permissions(
          f'serviceAccount:{node_vm.service_account}',
          'roles/logging.logWriter'):
        op.add_failed(
            project_path,
            reason=
            (f'The service account {node_vm.service_account} for node {node} in location '
             f'{location} does not have "Logs Writer (roles/logging.logWriter)" role '
             'permissions. "Logs Writer" permissions are needed for the Node Registration '
             'Checker\'s output to be written in Cloud Logging, where we can analyse it.'
            ),
            remediation=
            ('Add the minimum permissions required to operate GKE to the Node\'s Service '
             f'Account {node_vm.service_account} following the documentation: '
             'https://cloud.google.com/kubernetes-engine/docs/how-to/hardening-your-cluster'
             '#permissions'))
        return

    # if serial log is enabled, we can check Cloud Logging for node-registration-checker.sh
    # output of the current instance:
    if node_is_running and node_vm.is_serial_port_logging_enabled():
      # check if START_TIME is after node's boot time; if yes we might not find Node
      # Registration Checker logs, so the user needs to set earlier START_TIME
      start_time = op.get(flags.START_TIME)
      if node_start_time < start_time:
        op.add_failed(
            project_path,
            reason=
            (f'The node {node} in the location {location} booted at {node_start_time} before '
             f'the provided START_TIME {start_time} '
             '(default is 8 hours from now)'),
            remediation=
            ('Please provide the START_TIME parameter (-p start_time) with a date '
             f'before {node_start_time}, so that the runbook can find the Node Registration '
             'Checker logs for the node'))
        return

      by_instance_id = f'resource.labels.instance_id="{node_vm.id}"'

      log_entries_success = query_logs(by_instance_id, registered_clause)
      if log_entries_success:
        # node registered successfully, we're all good
        sample_log = str(log_entries_success.pop()).replace(', ', '\n')
        op.add_ok(project_path,
                  reason=op.prep_msg(op.SUCCESS_REASON,
                                     log_entry=sample_log,
                                     node=node))
        return

      # node failed to register, need to find Node Registration Checker summary; first verify
      # if node-registration-checker.sh finished running on this instance
      if query_logs(by_instance_id, completed_clause):
        # node registration finished running but node didn't register. Get all logs for info
        log_entries_all = query_logs(by_instance_id)

        # log_entries_all have now all the logs until the final "Completed running Node
        # Registration Checker" message, thus we pop messages one by one and go back until
        # the start of the NRC report message (TOKEN_NRC_START). The loop is bounded by the
        # available entries: if the start marker is not among them, everything that was
        # found is reported instead of failing on an empty pop().
        nrc_summary = []
        while log_entries_all:
          payload = log_entries_all.pop()['textPayload']
          nrc_summary.append(payload)
          if TOKEN_NRC_START in payload:
            break
        # entries were collected newest-first; restore chronological order
        nrc_summary.reverse()

        op.add_failed(project_path,
                      reason=op.prep_msg(op.FAILURE_REASON,
                                         log_entries=nrc_summary,
                                         node=node,
                                         location=location),
                      remediation=op.prep_msg(op.FAILURE_REMEDIATION))
        return

      # Could not find message that Node Registration Checker finished running for instance
      # id, checking by node name and look for potential repair loop
      if query_logs(by_name, completed_clause):
        # as there is "Completed running Node Registration Checker" log entry but not for
        # current instance_id, this means that the node is in a repair loop. Need to find out
        # the summary taking into account that there could be multiple summaries
        nrc_summary = get_nrc_summary(node, location)

        op.add_failed(project_path,
                      reason=op.prep_msg(op.FAILURE_REASON_ALT1,
                                         log_entries=nrc_summary,
                                         node=node,
                                         location=location),
                      remediation=op.prep_msg(op.FAILURE_REMEDIATION))
        return

      # node is running, but there's no "Completed running Node Registration Checker" log
      # entry in the provided time range
      op.add_failed(project_path,
                    reason=op.prep_msg(op.UNCERTAIN_REASON,
                                       node=node,
                                       location=location,
                                       start_time=start_time,
                                       end_time=op.get(flags.END_TIME)),
                    remediation=op.prep_msg(op.UNCERTAIN_REMEDIATION,
                                            node=node))
      return

    # No running instance with serial port logging to inspect: check older logs by node name
    # and try to find if Node Registration Checker completed running at least once.
    # NOTE(review): besides "node doesn't exist", this path is also taken when the instance
    # exists but is not running, or is running with serial port logging disabled; the
    # messages below describe the node as not running - confirm this is intended.
    log_entries_success = query_logs(by_name, registered_clause)
    if log_entries_success:
      # node isn't running now, but it registered successfully in the past
      sample_log = str(log_entries_success.pop()).replace(', ', '\n')
      op.add_ok(project_path,
                reason=op.prep_msg(op.SUCCESS_REASON_ALT1,
                                   log_entry=sample_log,
                                   node=node))
      return

    if query_logs(by_name, completed_clause):
      # Node Registration Checker completed running.  Need to find out the summary, taking
      # into account that there could be multiple summaries
      nrc_summary = get_nrc_summary(node, location)

      op.add_failed(project_path,
                    reason=op.prep_msg(op.FAILURE_REASON_ALT2,
                                       log_entries=nrc_summary,
                                       node=node,
                                       location=location),
                    remediation=op.prep_msg(op.FAILURE_REMEDIATION))
      return

    # node is not running and Node Registration Checker did not complete running. Most
    # probably the node was deleted before Node Registration Checker could finish running.
    op.add_failed(project_path,
                  reason=op.prep_msg(op.FAILURE_REASON_ALT3,
                                     node=node,
                                     location=location),
                  remediation=op.prep_msg(op.FAILURE_REMEDIATION_ALT3))