gcpdiag/runbook/gke/image_pull.py (323 lines of code) (raw):
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""GKE Image pull failures runbook"""
from datetime import datetime
from boltons.iterutils import get_path
from gcpdiag import runbook
from gcpdiag.queries import apis, crm, gke, logs
from gcpdiag.runbook import op
from gcpdiag.runbook.gke import flags
from gcpdiag.utils import GcpApiError
def local_realtime_query(filter_list):
  """Run a realtime log query scoped to the runbook's project and time window.

  Args:
    filter_list: Iterable of filter clause strings; the clauses are joined
      with newlines into a single filter string.

  Returns:
    The value returned by `logs.realtime_query` for the assembled query.
  """
  combined_filter = '\n'.join(filter_list)
  # Project and time window come from the runbook flags.
  query_args = {
      'project_id': op.get(flags.PROJECT_ID),
      'start_time': op.get(flags.START_TIME),
      'end_time': op.get(flags.END_TIME),
      'filter_str': combined_filter,
  }
  return logs.realtime_query(**query_args)
class ImagePull(runbook.DiagnosticTree):
  """Analysis and Resolution of Image Pull Failures on GKE clusters.

  This runbook investigates the gke cluster for Image pull failures and recommends remediation
  steps.

  Areas Examined:
  - GKE cluster
  - Stackdriver logs
  """

  # Parameters shared by every step of this diagnostic tree.
  parameters = {
      flags.PROJECT_ID: {
          'type': str,
          'help': 'The Project ID of the resource under investigation',
          'required': True,
      },
      flags.NAME: {
          'type': str,
          'help':
              'The name of the GKE cluster, to limit search only for this cluster',
          'required': True,
      },
      flags.LOCATION: {
          'type': str,
          'help': 'The zone or region of the GKE cluster',
          'required': True,
      },
      flags.START_TIME: {
          'type': datetime,
          'help':
              '(Optional) The start window to query the logs. Format: YYYY-MM-DDTHH:MM:SSZ',
          'required': False,
      },
      flags.END_TIME: {
          'type': datetime,
          'help':
              '(Optional) The end window for the logs. Format: YYYY-MM-DDTHH:MM:SSZ',
          'required': False,
      },
  }

  def build_tree(self):
    """Construct the diagnostic tree with appropriate steps."""
    # The start step is created and registered before any other step.
    entry_step = ImagePullStart()
    self.add_start(entry_step)

    # Checks listed in the order they are chained: each one becomes the child
    # of the step before it, starting from the entry step.
    checks = [
        ImageNotFound(),
        ImageForbidden(),
        ImageDnsIssue(),
        ImageConnectionTimeoutRestrictedPrivate(),
        ImageConnectionTimeout(),
        ImageNotFoundInsufficientScope(),
    ]
    chain = [entry_step, *checks]
    for parent_step, child_step in zip(chain, chain[1:]):
      self.add_step(parent=parent_step, child=child_step)

    # Ending runbook
    self.add_end(ImagePullEnd())
class ImagePullStart(runbook.StartStep):
  """Initiates diagnostics for Image pull runbook.

  Check
  - if logging API is enabled
  - verify that the cluster exists at that location
  """

  def execute(self):
    """Starting the image pull error diagnostics"""
    project_id = op.get(flags.PROJECT_ID)
    cluster_name = op.get(flags.NAME)
    location = op.get(flags.LOCATION)
    # Resolve the project once; it is the resource every outcome of this step
    # is reported against.
    project = crm.get_project(project_id)

    # The checks in this runbook are log queries, so skip when the logging API
    # is disabled in the project.
    if not apis.is_enabled(project_id, 'logging'):
      op.add_skipped(project,
                     reason=f'Logging disabled in project {project_id}')
      return

    # Verify that the provided cluster exists at the given location.
    try:
      cluster = gke.get_cluster(project_id,
                                cluster_id=cluster_name,
                                location=location)
    except GcpApiError:
      # An API error on lookup is reported as "cluster does not exist".
      op.add_skipped(
          project,
          reason=(f'Cluster {cluster_name} does not exist in {location} '
                  f'for project {project_id}'))
    else:
      op.add_ok(project,
                reason=(f'Cluster {cluster.name} found in {location} '
                        f'for project {project_id}'))
class ImageNotFound(runbook.Step):
  """Check for Image not found log entries"""

  template = 'imagepull::image_not_found'

  def execute(self):
    """Check for "Failed to pull image.*not found" log entries."""
    project_id = op.get(flags.PROJECT_ID)
    project_resource = crm.get_project(project_id)
    location = op.get(flags.LOCATION)
    name = op.get(flags.NAME)
    window_start = op.get(flags.START_TIME)
    window_end = op.get(flags.END_TIME)

    # Pod events of this cluster whose message contains both
    # "Failed to pull image" and "not found".
    clauses = [
        'log_id("events")',
        'resource.type="k8s_pod"',
        'jsonPayload.message:"Failed to pull image"',
        'jsonPayload.message:"not found"',
        f'resource.labels.location="{location}"',
        f'resource.labels.cluster_name="{name}"',
    ]
    matches = local_realtime_query(clauses)

    if not matches:
      # Nothing matched in the queried window: report success.
      op.add_ok(project_resource,
                reason=op.prep_msg(op.SUCCESS_REASON,
                                   start_time=window_start,
                                   end_time=window_end))
      return

    # Matching events exist: fail and attach one formatted sample entry.
    failure_reason = op.prep_msg(
        op.FAILURE_REASON,
        log_entry=format_log_entries(matches),
        start_time=window_start,
        end_time=window_end,
    )
    op.add_failed(project_resource,
                  reason=failure_reason,
                  remediation=op.prep_msg(op.FAILURE_REMEDIATION))
class ImageForbidden(runbook.Step):
  """Image cannot be pulled, insufficient permissions"""

  template = 'imagepull::image_forbidden'

  def execute(self):
    """Check for "Failed to pull image.*403 Forbidden" log entries."""
    project_id = op.get(flags.PROJECT_ID)
    project_resource = crm.get_project(project_id)
    location = op.get(flags.LOCATION)
    name = op.get(flags.NAME)
    window_start = op.get(flags.START_TIME)
    window_end = op.get(flags.END_TIME)

    # Pod events of this cluster whose message contains both
    # "Failed to pull image" and "403 Forbidden".
    clauses = [
        'log_id("events")',
        'resource.type="k8s_pod"',
        'jsonPayload.message:"Failed to pull image"',
        'jsonPayload.message:"403 Forbidden"',
        f'resource.labels.location="{location}"',
        f'resource.labels.cluster_name="{name}"',
    ]
    matches = local_realtime_query(clauses)

    if not matches:
      # Nothing matched in the queried window: report success.
      op.add_ok(project_resource,
                reason=op.prep_msg(op.SUCCESS_REASON,
                                   start_time=window_start,
                                   end_time=window_end))
      return

    # Matching events exist: fail and attach one formatted sample entry.
    failure_reason = op.prep_msg(
        op.FAILURE_REASON,
        log_entry=format_log_entries(matches),
        start_time=window_start,
        end_time=window_end,
    )
    op.add_failed(project_resource,
                  reason=failure_reason,
                  remediation=op.prep_msg(op.FAILURE_REMEDIATION))
class ImageDnsIssue(runbook.Step):
  """Node DNS server cannot resolve the IP of the repository"""

  template = 'imagepull::image_dns_issue'

  def execute(self):
    """Check for "Failed to pull image.*lookup.*server misbehaving" log entries."""
    project_id = op.get(flags.PROJECT_ID)
    project_resource = crm.get_project(project_id)
    location = op.get(flags.LOCATION)
    name = op.get(flags.NAME)
    window_start = op.get(flags.START_TIME)
    window_end = op.get(flags.END_TIME)

    # Pod events of this cluster whose message contains "Failed to pull image",
    # "lookup" and "server misbehaving".
    clauses = [
        'log_id("events")',
        'resource.type="k8s_pod"',
        'jsonPayload.message:"Failed to pull image"',
        'jsonPayload.message:"lookup"',
        'jsonPayload.message:"server misbehaving"',
        f'resource.labels.location="{location}"',
        f'resource.labels.cluster_name="{name}"',
    ]
    matches = local_realtime_query(clauses)

    if not matches:
      # Nothing matched in the queried window: report success.
      op.add_ok(project_resource,
                reason=op.prep_msg(op.SUCCESS_REASON,
                                   start_time=window_start,
                                   end_time=window_end))
      return

    # Matching events exist: fail and attach one formatted sample entry.
    failure_reason = op.prep_msg(
        op.FAILURE_REASON,
        log_entry=format_log_entries(matches),
        start_time=window_start,
        end_time=window_end,
    )
    op.add_failed(project_resource,
                  reason=failure_reason,
                  remediation=op.prep_msg(op.FAILURE_REMEDIATION))
class ImageConnectionTimeoutRestrictedPrivate(runbook.Step):
  """The connection to restricted.googleapis.com or private.googleapis.com is timing out"""

  template = 'imagepull::image_connection_timeout_restricted_private'

  def execute(self):
    """
    Check for "Failed to pull image.*dial tcp.*199.36.153.\\d+:443: i/o timeout" log entries
    """
    project = op.get(flags.PROJECT_ID)
    project_path = crm.get_project(project)
    cluster_location = op.get(flags.LOCATION)
    cluster_name = op.get(flags.NAME)
    start_time = op.get(flags.START_TIME)
    end_time = op.get(flags.END_TIME)

    filter_list = [
        'log_id("events")',
        'resource.type="k8s_pod"',
        'jsonPayload.message:"Failed to pull image"',
        'jsonPayload.message:"dial tcp"',
        # The substring operator (:) would look for a literal "*", so the
        # variable last octet is matched with a regular expression (=~).
        # Bracket classes are used instead of backslash escapes to avoid any
        # escaping ambiguity inside the quoted filter value; [0-9]+ also
        # covers two-digit last octets.
        'jsonPayload.message =~ "199[.]36[.]153[.][0-9]+:443: i/o timeout"',
        f'resource.labels.location="{cluster_location}"',
        f'resource.labels.cluster_name="{cluster_name}"',
    ]
    log_entries = local_realtime_query(filter_list)

    if log_entries:
      # Matching events exist: fail and attach one formatted sample entry.
      sample_log = format_log_entries(log_entries)
      op.add_failed(project_path,
                    reason=op.prep_msg(
                        op.FAILURE_REASON,
                        log_entry=sample_log,
                        start_time=start_time,
                        end_time=end_time,
                    ),
                    remediation=op.prep_msg(op.FAILURE_REMEDIATION))
    else:
      # Nothing matched in the queried window: report success.
      op.add_ok(project_path,
                reason=op.prep_msg(op.SUCCESS_REASON,
                                   start_time=start_time,
                                   end_time=end_time))
class ImageConnectionTimeout(runbook.Step):
  """The connection to Google APIs is timing out"""

  template = 'imagepull::image_connection_timeout'

  def execute(self):
    """
    Check for "Failed to pull image.*dial tcp.*i/o timeout" log entries
    """
    project_id = op.get(flags.PROJECT_ID)
    project_resource = crm.get_project(project_id)
    location = op.get(flags.LOCATION)
    name = op.get(flags.NAME)
    window_start = op.get(flags.START_TIME)
    window_end = op.get(flags.END_TIME)

    # Pod events of this cluster whose message contains "Failed to pull image",
    # "dial tcp" and "i/o timeout".
    clauses = [
        'log_id("events")',
        'resource.type="k8s_pod"',
        'jsonPayload.message:"Failed to pull image"',
        'jsonPayload.message:"dial tcp"',
        'jsonPayload.message:"i/o timeout"',
        f'resource.labels.location="{location}"',
        f'resource.labels.cluster_name="{name}"',
    ]
    matches = local_realtime_query(clauses)

    if not matches:
      # Nothing matched in the queried window: report success.
      op.add_ok(project_resource,
                reason=op.prep_msg(op.SUCCESS_REASON,
                                   start_time=window_start,
                                   end_time=window_end))
      return

    # Matching events exist: fail and attach one formatted sample entry.
    failure_reason = op.prep_msg(
        op.FAILURE_REASON,
        log_entry=format_log_entries(matches),
        start_time=window_start,
        end_time=window_end,
    )
    op.add_failed(project_resource,
                  reason=failure_reason,
                  remediation=op.prep_msg(op.FAILURE_REMEDIATION))
class ImageNotFoundInsufficientScope(runbook.Step):
  """Check for Image not found log entries with insufficient_scope server message"""

  template = 'imagepull::image_not_found_insufficient_scope'

  def execute(self):
    """
    Check for "Failed to pull image.*insufficient_scope" log entries
    """
    project_id = op.get(flags.PROJECT_ID)
    project_resource = crm.get_project(project_id)
    location = op.get(flags.LOCATION)
    name = op.get(flags.NAME)
    window_start = op.get(flags.START_TIME)
    window_end = op.get(flags.END_TIME)

    # Pod events of this cluster whose message contains both
    # "Failed to pull image" and "insufficient_scope".
    clauses = [
        'log_id("events")',
        'resource.type="k8s_pod"',
        'jsonPayload.message:"Failed to pull image"',
        'jsonPayload.message:"insufficient_scope"',
        f'resource.labels.location="{location}"',
        f'resource.labels.cluster_name="{name}"',
    ]
    matches = local_realtime_query(clauses)

    if not matches:
      # Nothing matched in the queried window: report success.
      op.add_ok(project_resource,
                reason=op.prep_msg(op.SUCCESS_REASON,
                                   start_time=window_start,
                                   end_time=window_end))
      return

    # Matching events exist: fail and attach one formatted sample entry.
    failure_reason = op.prep_msg(
        op.FAILURE_REASON,
        log_entry=format_log_entries(matches),
        start_time=window_start,
        end_time=window_end,
    )
    op.add_failed(project_resource,
                  reason=failure_reason,
                  remediation=op.prep_msg(op.FAILURE_REMEDIATION))
class ImagePullEnd(runbook.EndStep):
  """Finalizes the diagnostics process for `GKE Image Pull runbook`.

  This step prompts the user to confirm satisfaction with the analysis performed for
  `GKE Image Pull runbook`.

  Depending on the user's response, it may conclude the runbook execution or trigger additional
  steps, such as generating a report of the findings.
  """

  def execute(self):
    """Finalize `GKE Image Pull runbook` diagnostics."""
    # NOTE(review): the prompt text below contains the typo "runbbok"; it is
    # user-facing output and is intentionally left unchanged here.
    answer = op.prompt(
        kind=op.CONFIRMATION,
        message='Are you satisfied with the `GKE Image Pull runbbok` analysis?')
    if answer != op.NO:
      return
    # Only a "no" answer triggers the closing message.
    op.info(message=op.END_MESSAGE)
def format_log_entries(log_entries):
  """Formats the last entry of a list of log entries into a readable string.

  Only the final element of `log_entries` is rendered; it serves as a sample
  of the matching logs.

  Args:
    log_entries: A sequence of log entry dictionaries.

  Returns:
    A newline-separated string with the resource labels and selected
    jsonPayload fields of the last entry, or an empty string when
    `log_entries` is empty.
  """
  # Guard against empty input instead of failing on the [-1] lookup.
  if not log_entries:
    return ''
  log_entry = log_entries[-1]

  # (display title, key in resource.labels), in output order.
  label_fields = (
      ('Cluster name', 'cluster_name'),
      ('Location', 'location'),
      ('Namespace Name', 'namespace_name'),
      ('Pod Name', 'pod_name'),
      ('Project ID', 'project_id'),
  )
  labels = get_path(log_entry, ('resource', 'labels'), default={})
  if labels:
    # Labels present: a missing individual label is shown as N/A.
    formatted_log = [
        f"{title}: {labels.get(key, 'N/A')}" for title, key in label_fields
    ]
  else:
    # No labels at all: every label line reads "Not found".
    formatted_log = [f'{title}: Not found' for title, _ in label_fields]

  json_payload = get_path(log_entry, ('jsonPayload',), default={})
  formatted_log.extend([
      f"Log Message: {json_payload.get('message', 'N/A')}",
      f"Reporting Instance: {json_payload.get('reportingInstance', 'N/A')}",
      f"Last Timestamp: {json_payload.get('lastTimestamp', 'N/A')}",
  ])
  return '\n'.join(formatted_log)