gcpdiag/runbook/gce/ops_agent.py (255 lines of code) (raw):
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains diagnostic tree for ops agent onboarding and investigation as well as custom steps."""
from datetime import datetime
import googleapiclient.errors
from boltons.iterutils import get_path
from gcpdiag import runbook
# Interact with GCP APIs using gcpdiag queries
from gcpdiag.queries import crm, gce, iam, logs, monitoring
from gcpdiag.runbook import op
# Reuse generalized steps from other products within this runbook
from gcpdiag.runbook.crm import generalized_steps as crm_gs
from gcpdiag.runbook.gce import constants, flags
from gcpdiag.runbook.gce import generalized_steps as gce_gs
from gcpdiag.runbook.gcp import generalized_steps as gcp_gs
from gcpdiag.runbook.iam import generalized_steps as iam_gs
GAC_SERVICE_ACCOUNT = 'gac_service_account'
CHECK_LOGGING = 'check_logging'
CHECK_MONITORING = 'check_monitoring'
CHECK_SERIAL_PORT_LOGGING = 'check_serial_port_logging'
class OpsAgent(runbook.DiagnosticTree):
"""Investigates the necessary GCP components for the proper functioning of the Ops Agent in a VM
This runbook will examine the following key areas:
1. API Service Checks:
- Ensures that Cloud APIs for Logging and/or Monitoring are accessible.
2. Permission Checks:
- Verifies that the necessary permissions are in place for exporting logs and/or metrics.
3. Workload Authentication:
- Confirms that the Ops Agent has a service account for authentication.
- If using Google Application Credentials, provide the service account
with the `gac_service_account` parameter.
4. Scope of Investigation:
- Note that this runbook does not include internal VM checks, such as guest OS investigations.
"""
parameters = {
flags.PROJECT_ID: {
'type': str,
'help': 'The Project ID containing the VM',
'required': True
},
flags.INSTANCE_NAME: {
'type': str,
'help': 'Name of the GCE instance running the Ops Agent',
'required': True
},
flags.INSTANCE_ID: {
'type': str,
'help': 'ID of the GCE instance running the Ops Agent',
},
flags.ZONE: {
'type': str,
'help': 'Zone of the GCE instance running the Ops Agent',
'required': True
},
flags.START_TIME: {
'type': datetime,
'help': 'Start time of the issue',
},
flags.END_TIME: {
'type': datetime,
'help': 'End time of the issue',
},
GAC_SERVICE_ACCOUNT: {
'type':
str,
'help':
'GOOGLE_APPLICATION_CREDENTIALS used by ops agent, if applicable'
},
CHECK_LOGGING: {
'type': bool,
'help': 'Investigate logging issues',
'default': True
},
CHECK_MONITORING: {
'type': bool,
'help': 'Investigate monitoring issues',
'default': True
},
CHECK_SERIAL_PORT_LOGGING: {
'type': bool,
'help': 'Check if VM Serial logging is enabled',
'default': True
}
}
def build_tree(self):
"""Describes step relationships"""
# Instantiate your start class
start = OpsAgentStart()
# add it to your tree
self.add_start(start)
sa_check = VmHasAServiceAccount()
self.add_step(parent=start, child=sa_check)
self.add_step(parent=sa_check, child=iam_gs.VmHasAnActiveServiceAccount())
self.add_step(parent=sa_check, child=InvestigateLoggingMonitoring())
self.add_end(OpsAgentEnd())
class OpsAgentStart(runbook.StartStep):
"""Prepares the parameters required for the gce/ops-agent runbook.
Looks up the GCE resource running the ops agent binary
Ensures both instance_id and instance_name parameters are available.
"""
def execute(self):
"""Verify context and parameters required for Ops Agent runbook checks"""
project = crm.get_project(op.get(flags.PROJECT_ID))
try:
instance = gce.get_instance(project_id=op.get(flags.PROJECT_ID),
zone=op.get(flags.ZONE),
instance_name=op.get(flags.INSTANCE_NAME))
except googleapiclient.errors.HttpError:
op.add_skipped(
project,
reason=('Instance {} does not exist in zone {} or project {}').format(
op.get(flags.INSTANCE_NAME), op.get(flags.ZONE),
op.get(flags.PROJECT_ID)))
else:
# Prepare extra parameters.
if instance and op.get(flags.INSTANCE_NAME):
op.put(flags.INSTANCE_ID, instance.id)
if instance and op.get(flags.INSTANCE_ID):
op.put(flags.INSTANCE_NAME, instance.name)
class VmHasAServiceAccount(runbook.Step):
"""Verifies the existence of a service account for the Ops Agent to use.
This investigation only happens from the perspective googleapis and
user provided input. We don't look inside the VM for cases like
GOOGLE_APPLICATION_CREDENTIALS. User will have to know and specify that if
They are using the application
"""
template = 'vm_attributes::service_account_exists'
def execute(self):
"""Verify Ops Agent has a service account."""
instance = gce.get_instance(project_id=op.get(flags.PROJECT_ID),
zone=op.get(flags.ZONE),
instance_name=op.get(flags.INSTANCE_NAME))
if not op.get(GAC_SERVICE_ACCOUNT):
if instance.service_account:
op.put(flags.SERVICE_ACCOUNT, instance.service_account)
op.add_ok(instance,
reason=op.prep_msg(op.SUCCESS_REASON,
full_resource_path=instance.full_path,
sa=instance.service_account))
else:
op.add_failed(instance,
reason=op.prep_msg(op.FAILURE_REASON,
full_resource_path=instance.full_path),
remediation=op.prep_msg(op.FAILURE_REMEDIATION))
return
if op.get(GAC_SERVICE_ACCOUNT):
sa_list = iam.get_service_account_list(op.get(flags.PROJECT_ID))
for sa in sa_list:
if sa.email == op.get(GAC_SERVICE_ACCOUNT):
op.put(flags.SERVICE_ACCOUNT, sa.email)
op.add_ok(instance,
reason=op.prep_msg(op.SUCCESS_REASON,
full_resource_path=instance.full_path,
sa=sa.email))
break
elif not op.get(GAC_SERVICE_ACCOUNT) and not instance.service_account:
op.add_failed(instance,
reason=op.prep_msg(op.FAILURE_REASON,
full_resource_path=instance.full_path),
remediation=op.prep_msg(op.FAILURE_REMEDIATION))
class InvestigateLoggingMonitoring(runbook.Gateway):
"""A Decision Point for to check Logging and/or Monitoring related issues
Decides whether to check for ops agent
- logging related issues if CHECK_LOGGING is set to true
- monitoring related issues if CHECK_MONITORING is set to true
"""
def execute(self):
"""Decision point to investigate Logging and/or Monitoring related issues."""
if op.get(CHECK_LOGGING):
logging_api = gcp_gs.ServiceApiStatusCheck()
logging_api.api_name = 'logging'
logging_api.project_id = op.get(flags.PROJECT_ID)
logging_api.expected_state = constants.APIState.ENABLED
self.add_child(logging_api)
log_permission_check = iam_gs.IamPolicyCheck()
log_permission_check.project = op.get(flags.PROJECT_ID)
log_permission_check.principal = (
f'serviceAccount:{op.get(flags.SERVICE_ACCOUNT)}')
log_permission_check.roles = [
'roles/owner',
'roles/editor',
'roles/logging.logWriter',
'roles/logging.admin',
]
logging_api.add_child(log_permission_check)
logging_access_scope = gce_gs.VmScope()
logging_access_scope.project_id = op.get(flags.PROJECT_ID)
logging_access_scope.zone = op.get(flags.ZONE)
logging_access_scope.instance_name = op.get(flags.INSTANCE_NAME)
logging_access_scope.access_scopes = {
'https://www.googleapis.com/auth/logging.write',
'https://www.googleapis.com/auth/cloud-platform',
'https://www.googleapis.com/auth/logging.admin',
}
logging_api.add_child(logging_access_scope)
logging_subagent_check = gce_gs.VmHasOpsAgent()
logging_subagent_check.project_id = op.get(flags.PROJECT_ID)
logging_subagent_check.zone = op.get(flags.ZONE)
logging_subagent_check.instance_name = op.get(flags.INSTANCE_NAME)
logging_subagent_check.instance_id = op.get(flags.INSTANCE_ID)
logging_subagent_check.start_time = op.get(flags.START_TIME)
logging_subagent_check.end_time = op.get(flags.END_TIME)
logging_subagent_check.check_logging = True
logging_subagent_check.check_metrics = False
logging_access_scope.add_child(logging_subagent_check)
if op.get(CHECK_SERIAL_PORT_LOGGING):
logging_api.add_child(child=CheckSerialPortLogging())
if op.get(CHECK_MONITORING):
monitoring_api = gcp_gs.ServiceApiStatusCheck()
monitoring_api.project_id = op.get(flags.PROJECT_ID)
monitoring_api.api_name = 'monitoring'
monitoring_api.expected_state = constants.APIState.ENABLED
self.add_child(monitoring_api)
monitoring_permission_check = iam_gs.IamPolicyCheck()
monitoring_permission_check.project = op.get(flags.PROJECT_ID)
monitoring_permission_check.principal = f'serviceAccount:{op.get(flags.SERVICE_ACCOUNT)}'
monitoring_permission_check.roles = [
'roles/monitoring.metricWriter', 'roles/monitoring.admin',
'roles/monitoring.editor', 'roles/owner', 'roles/editor'
]
monitoring_api.add_child(child=monitoring_permission_check)
monitoring_access_scope = gce_gs.VmScope()
monitoring_access_scope.project_id = op.get(flags.PROJECT_ID)
monitoring_access_scope.zone = op.get(flags.ZONE)
monitoring_access_scope.instance_name = op.get(flags.INSTANCE_NAME)
monitoring_access_scope.access_scopes = {
'https://www.googleapis.com/auth/monitoring.write',
'https://www.googleapis.com/auth/cloud-platform',
'https://www.googleapis.com/auth/monitoring'
}
monitoring_api.add_child(monitoring_access_scope)
# Check if ops agent metric subagent is installed.
metric_subagent_check = gce_gs.VmHasOpsAgent()
metric_subagent_check.project_id = op.get(flags.PROJECT_ID)
metric_subagent_check.zone = op.get(flags.ZONE)
metric_subagent_check.instance_name = op.get(flags.INSTANCE_NAME)
metric_subagent_check.instance_id = op.get(flags.INSTANCE_ID)
metric_subagent_check.start_time = op.get(flags.START_TIME)
metric_subagent_check.end_time = op.get(flags.END_TIME)
metric_subagent_check.check_logging = False
metric_subagent_check.check_metrics = True
monitoring_access_scope.add_child(metric_subagent_check)
class CheckSerialPortLogging(runbook.CompositeStep):
"""Checks if ops agent serial port logging
Verifies Organization policy and VM configuration to issue serial port logging
to Stackdriver from Compute Engine VMs is feasible.
"""
def execute(self):
"""Verify GCP config required for serial port logging with ops agent"""
serial_logging_orgpolicy_check = crm_gs.OrgPolicyCheck()
serial_logging_orgpolicy_check.constraint = 'constraints/compute.disableSerialPortLogging'
serial_logging_orgpolicy_check.is_enforced = False
self.add_child(serial_logging_orgpolicy_check)
serial_logging_md_check = gce_gs.VmMetadataCheck()
serial_logging_md_check.project_id = op.get(flags.PROJECT_ID)
serial_logging_md_check.zone = op.get(flags.ZONE)
serial_logging_md_check.instance_name = op.get(flags.INSTANCE_NAME)
serial_logging_md_check.metadata_key = 'serial-port-logging-enable'
serial_logging_md_check.expected_value = True
self.add_child(serial_logging_md_check)
class OpsAgentEnd(runbook.EndStep):
"""Finalizes the OpsAgent checks.
Checks if logs or metrics are currently present after diagnosing the issue.
"""
def _has_ops_agent_metric_logging_agent(self, metric_data):
"""Checks if ops agent logging agent and metric agent is installed"""
pass
def execute(self):
"""Finalize Ops agent checks"""
serial_log_entries = None
has_expected_opsagent_logs = False
ops_agent_uptime = None
has_opsagent = False
if op.get(CHECK_SERIAL_PORT_LOGGING):
serial_log_entries = logs.realtime_query(
project_id=op.get(flags.PROJECT_ID),
filter_str='''resource.type="gce_instance"
log_name="projects/{}/logs/ops-agent-health"
resource.labels.instance_id="{}" AND
"LogPingOpsAgent"'''.format(op.get(flags.PROJECT_ID),
op.get(flags.INSTANCE_ID)),
start_time=op.get(flags.END_TIME),
end_time=datetime.now())
if serial_log_entries:
has_expected_opsagent_logs = True
op.info(
'There are new logs indicating ops agent is exporting serial logs')
if op.get(CHECK_MONITORING):
ops_agent_uptime = monitoring.query(
op.get(flags.PROJECT_ID), """
fetch gce_instance
| metric 'agent.googleapis.com/agent/uptime'
| filter (resource.instance_id == '{}')
| align rate(1m)
| every 1m
| group_by [resource.instance_id, metric.version],
[value_uptime_aggregate: aggregate(value.uptime)]
""".format(op.get(flags.INSTANCE_ID)))
for entry in ops_agent_uptime.values():
version = get_path(entry, ('labels', 'metric.version'), '')
if 'google-cloud-ops-agent-metrics' in version:
has_opsagent = True
op.info(
'There is metrics data indicating ops agent is exporting metrics correctly!'
)
if not has_expected_opsagent_logs and not has_opsagent:
response = op.prompt(
kind=op.CONFIRMATION,
message=
f'Is your ops agent issues resolved for "{op.get(flags.INSTANCE_NAME)}?"'
)
if response == op.NO:
op.info(message=op.END_MESSAGE)