gcpdiag/lint/dataproc/err_2023_005_dataproc_quota.py (56 lines of code) (raw):
#
# Copyright 2021 Google LLC
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Dataproc cluster has sufficient quota.
When creating a Dataproc cluster, the project must have available quotas for
the resources you request, such as CPU, disk, and IP addresses.
"""
import re
from boltons.iterutils import get_path
from gcpdiag import lint, models
from gcpdiag.queries import apis, crm, dataproc, logs
# Regular expression for the quota error text. It is embedded in LOG_FILTER and
# also compiled into log_search_pattern for re-checking entries in run_rule.
MATCH_STR = 'Insufficient .* quota'
# Value passed as resource_type to logs.query in prepare_rule.
RESOURCE_TYPE = 'cloud_dataproc_cluster'
# Criteria to filter for logs: ERROR-severity entries whose
# protoPayload.status.message matches MATCH_STR. Passed as filter_str to
# logs.query in prepare_rule.
LOG_FILTER = f"""
protoPayload.status.message=~("{MATCH_STR}")
severity=ERROR
"""
# Maps a project id to the object returned by logs.query for that project.
# Written by prepare_rule, read by run_rule.
logs_by_project = {}
# Compiled once at import time; run_rule searches each entry's status message
# with it.
log_search_pattern = re.compile(MATCH_STR)
def prepare_rule(context: models.Context):
  """Set up the audit-log query used by run_rule.

  Calls logs.query for the context's project with RESOURCE_TYPE and LOG_FILTER
  against the Cloud Audit activity log, and stores the returned object in
  logs_by_project under the project id.

  Args:
    context: lint context whose project_id identifies the project to query.
  """
  quota_error_query = logs.query(
      project_id=context.project_id,
      resource_type=RESOURCE_TYPE,
      log_name='log_id("cloudaudit.googleapis.com/activity")',
      filter_str=LOG_FILTER,
  )
  logs_by_project[context.project_id] = quota_error_query
def format_cluster(cluster_name, uuid):
  """Return the display label for a cluster.

  Args:
    cluster_name: name of the cluster.
    uuid: cluster UUID; may be empty or None.

  Returns:
    The cluster name, followed directly by '(UUID: <uuid>)' when uuid is
    truthy.
  """
  suffix = ''
  if uuid:
    suffix = f'(UUID: {uuid})'
  return cluster_name + suffix
def run_rule(context: models.Context, report: lint.LintReportRuleInterface):
  """Report clusters that have insufficient-quota errors in the audit log.

  The project is skipped when the logging API is disabled or when no Dataproc
  clusters are found. Otherwise the log entries stored by prepare_rule are
  scanned: every ERROR entry whose status message matches MATCH_STR
  contributes its (cluster_name, cluster_uuid) labels. The project fails if
  at least one such cluster is found and is reported OK otherwise.

  Args:
    context: lint context; its project_id selects the project to check.
    report: rule report that receives the skipped / failed / ok result.
  """
  project = crm.get_project(context.project_id)

  # Skip the entire rule if logging is disabled.
  if not apis.is_enabled(context.project_id, 'logging'):
    report.add_skipped(project, 'logging api is disabled')
    return

  if not dataproc.get_clusters(context):
    report.add_skipped(project, 'no clusters found')
    return

  project_logs = logs_by_project.get(context.project_id)
  entries = project_logs.entries if project_logs else ()

  failed_clusters = set()
  for log_entry in entries:
    # Filter out non-relevant log entries. `.get` keeps an entry without a
    # severity field from raising KeyError.
    if log_entry.get('severity') != 'ERROR':
      continue
    msg = get_path(log_entry, ('protoPayload', 'status', 'message'),
                   default='')
    # get_path only falls back to the default when the path is missing, so a
    # null message must be rejected before it reaches the regex.
    if not (isinstance(msg, str) and log_search_pattern.search(msg)):
      continue
    cluster_name = get_path(
        log_entry,
        ('resource', 'labels', 'cluster_name'),
        default='Unknown Cluster',
    )
    uuid = get_path(log_entry, ('resource', 'labels', 'cluster_uuid'),
                    default='')
    failed_clusters.add((cluster_name, uuid))

  if not failed_clusters:
    report.add_ok(project)
    return

  # Set iteration order is arbitrary; sort the labels so the reported message
  # is stable from run to run.
  cluster_labels = sorted(
      format_cluster(cluster_name, uuid)
      for cluster_name, uuid in failed_clusters)
  report.add_failed(
      project,
      'The following clusters failed because of quota errors : {}'.format(
          ', '.join(cluster_labels)),
  )