in gcpdiag/runbook/gke/node_bootstrapping.py [0:0]
def execute(self):
    """
    Verify whether Node Registration Checker (NRC) completed running for the node.

    The outcome is reported through `op.add_ok`, `op.add_failed` or `op.add_skipped`;
    nothing is returned.

    - No node name provided: the step is skipped.
    - The instance is running but booted less than NODE_BOOT_NCR_READY_MIN_TIME minutes
      ago: failed, NRC may not have finished yet.
    - The instance exists but its service account lacks roles/logging.logWriter: failed,
      NRC output cannot be found in Cloud Logging.
    - The instance is running with serial port logging enabled: logs are searched by
      instance id (falling back to node name to detect a repair loop).
    - Otherwise: older logs are searched by node name only.

    If the node registered successfully, a log entry proving the registration is reported.
    If it did not, the NRC summary is reported to help understand why.
    """
    project = op.get(flags.PROJECT_ID)
    project_path = crm.get_project(project)
    location = op.get(flags.LOCATION)
    node = op.get(flags.NODE)

    if not node:
        op.add_skipped(
            project_path,
            reason=
            ('No node name provided, skipping this step .\n'
             'Please provide node name (-p node=<nodename>) if the node appears in the nodepool, '
             'but fails registration.\n'))
        return

    # Log messages emitted by node-registration-checker.sh that this step searches for.
    msg_registered = 'Node ready and registered.'
    msg_nrc_completed = 'Completed running Node Registration Checker'

    # Default filter that is used in all log searches.
    base_filter = '\n'.join([
        'resource.type="gce_instance"',
        f'resource.labels.zone="{location}"',
        'log_id("serialconsole.googleapis.com/serial_port_1_output")',
        'textPayload:"node-registration-checker.sh"',
    ])
    by_node_name = f'labels."compute.googleapis.com/resource_name"="{node}"'

    def query_logs(selector, message=None):
        """Run a log query for `selector`, optionally narrowed to entries containing `message`."""
        clauses = [selector, base_filter]
        if message is not None:
            clauses.append(f'textPayload:"{message}"')
        return local_realtime_query('\n'.join(clauses))

    node_vm = get_node_instance(project, location, node)
    vm_running = node_vm.is_running if node_vm else False

    if vm_running:
        # creation_timestamp is offset-naive and is interpreted as UTC here; make it
        # offset-aware so it can be compared with the current UTC time and START_TIME
        # without depending on the local timezone of the machine running the runbook.
        node_boot_time = node_vm.creation_timestamp.replace(tzinfo=timezone.utc)

        # Make sure at least NODE_BOOT_NCR_READY_MIN_TIME minutes passed since the instance
        # was booted, so that Node Registration Checker had enough time to finish running.
        min_uptime = timedelta(minutes=self.NODE_BOOT_NCR_READY_MIN_TIME)
        if datetime.now(timezone.utc) - node_boot_time < min_uptime:
            op.add_failed(
                project_path,
                reason=
                (f'Instance {node} with instance-id {node_vm.id} in location {location} just booted '
                 f'at {node_vm.creation_timestamp}.'),
                remediation=
                (f'Please allow for at least {self.NODE_BOOT_NCR_READY_MIN_TIME} minutes since '
                 'starting the instance to allow for Node Registration Checker to finish running.'
                ))
            return

    if node_vm:
        # Check that the node service account has logging write permissions.
        iam_policy = iam.get_project_policy(project)
        if not iam_policy.has_role_permissions(
                f'serviceAccount:{node_vm.service_account}',
                'roles/logging.logWriter'):
            op.add_failed(
                project_path,
                reason=
                (f'The service account {node_vm.service_account} for node {node} in location '
                 f'{location} does not have "Logs Writer (roles/logging.logWriter)" role '
                 'permissions. "Logs Writer" permissions are needed for the Node Registration '
                 'Checker\'s output to be written in Cloud Logging, where we can analyse it.'
                ),
                remediation=
                ('Add the minimum permissions required to operate GKE to the Node\'s Service '
                 f'Account {node_vm.service_account} following the documentation: '
                 'https://cloud.google.com/kubernetes-engine/docs/how-to/hardening-your-cluster'
                 '#permissions'))
            return

    # If serial port logging is enabled on a running instance, Cloud Logging can be checked
    # for node-registration-checker.sh output of this specific instance.
    if vm_running and node_vm.is_serial_port_logging_enabled():
        # If the node booted before START_TIME, the Node Registration Checker logs are
        # likely outside the search window, so the user needs to set an earlier START_TIME.
        start_time = op.get(flags.START_TIME)
        if node_boot_time < start_time:
            op.add_failed(
                project_path,
                reason=
                (f'The node {node} in the location {location} booted at {node_boot_time} before '
                 f'the provided START_TIME {start_time} '
                 '(default is 8 hours from now)'),
                remediation=
                ('Please provide the START_TIME parameter (-p start_time) with a date '
                 f'before {node_boot_time}, so that the runbook can find the Node Registration '
                 'Checker logs for the node'))
            return

        by_instance_id = f'resource.labels.instance_id="{node_vm.id}"'

        registered_entries = query_logs(by_instance_id, msg_registered)
        if registered_entries:
            # Node registered successfully, we're all good.
            sample_log = str(registered_entries.pop()).replace(', ', '\n')
            op.add_ok(project_path,
                      reason=op.prep_msg(op.SUCCESS_REASON,
                                         log_entry=sample_log,
                                         node=node))
            return

        # Node failed to register: verify whether node-registration-checker.sh finished
        # running for this instance.
        if query_logs(by_instance_id, msg_nrc_completed):
            # NRC finished running but the node didn't register. Fetch all NRC logs and walk
            # back from the newest entry until the start of the NRC report is reached.
            all_entries = query_logs(by_instance_id)
            nrc_summary = []
            # Bounded by the available entries: if the start-of-summary entry is not in
            # the queried window, report everything collected instead of failing on an
            # empty pop().
            while all_entries:
                payload = all_entries.pop()['textPayload']
                nrc_summary.append(payload)
                if TOKEN_NRC_START in payload:
                    break
            # Entries were collected newest-first; restore chronological order.
            nrc_summary.reverse()
            op.add_failed(project_path,
                          reason=op.prep_msg(op.FAILURE_REASON,
                                             log_entries=nrc_summary,
                                             node=node,
                                             location=location),
                          remediation=op.prep_msg(op.FAILURE_REMEDIATION))
            return

        # No "Completed running" message for this instance id; check by node name to
        # detect a potential repair loop.
        if query_logs(by_node_name, msg_nrc_completed):
            # NRC completed, but not for the current instance id, which means the node is
            # in a repair loop. There could be multiple summaries.
            nrc_summary = get_nrc_summary(node, location)
            op.add_failed(project_path,
                          reason=op.prep_msg(op.FAILURE_REASON_ALT1,
                                             log_entries=nrc_summary,
                                             node=node,
                                             location=location),
                          remediation=op.prep_msg(op.FAILURE_REMEDIATION))
            return

        # Node is running, but there's no "Completed running Node Registration Checker"
        # log entry in the provided time range.
        op.add_failed(project_path,
                      reason=op.prep_msg(op.UNCERTAIN_REASON,
                                         node=node,
                                         location=location,
                                         start_time=start_time,
                                         end_time=op.get(flags.END_TIME)),
                      remediation=op.prep_msg(op.UNCERTAIN_REMEDIATION,
                                              node=node))
        return

    # The instance was not found, is not running, or has serial port logging disabled:
    # check older logs by node name to find out whether Node Registration Checker
    # completed running at least once.
    registered_entries = query_logs(by_node_name, msg_registered)
    if registered_entries:
        # Node isn't running now, but it registered successfully in the past.
        sample_log = str(registered_entries.pop()).replace(', ', '\n')
        op.add_ok(project_path,
                  reason=op.prep_msg(op.SUCCESS_REASON_ALT1,
                                     log_entry=sample_log,
                                     node=node))
        return

    if query_logs(by_node_name, msg_nrc_completed):
        # Node Registration Checker completed running. Get the summary, taking into
        # account that there could be multiple summaries.
        nrc_summary = get_nrc_summary(node, location)
        op.add_failed(project_path,
                      reason=op.prep_msg(op.FAILURE_REASON_ALT2,
                                         log_entries=nrc_summary,
                                         node=node,
                                         location=location),
                      remediation=op.prep_msg(op.FAILURE_REMEDIATION))
        return

    # Node is not running and Node Registration Checker did not complete running. Most
    # probably the node was deleted before Node Registration Checker could finish running.
    op.add_failed(project_path,
                  reason=op.prep_msg(op.FAILURE_REASON_ALT3,
                                     node=node,
                                     location=location),
                  remediation=op.prep_msg(op.FAILURE_REMEDIATION_ALT3))