in gcpdiag/runbook/nat/generalized_steps.py [0:0]
def execute(self):
    """Checking NATGW received_packets_dropped metric for elevated drops"""
    # NOTE(review): the docstring above is kept verbatim in case the runbook
    # framework surfaces it to users - confirm before rewording.
    #
    # Flow:
    #   1. Resolve the VM and its region from the PROJECT_ID / ZONE / NAME flags.
    #   2. Query the gateway-level receive-drop rate for the gateway stored
    #      under the 'nat_gateway_name' operator key.
    #   3. For each returned series: a rate >= 1 is reported as uncertain,
    #      'natgw_rcv_pkt_drops' is set, and the per-VM metric is inspected;
    #      otherwise the step reports ok.
    project_id = op.get(flags.PROJECT_ID)
    zone = op.get(flags.ZONE)
    vm = gce.get_instance(project_id=project_id,
                          zone=zone,
                          instance_name=op.get(flags.NAME))
    region = utils.region_from_zone(zone)

    gw_name = op.get('nat_gateway_name')
    if not gw_name:
        # Both queries below filter on the gateway name. Previously a missing
        # name left gw_name unassigned and building the query raised
        # UnboundLocalError.
        op.add_uncertain(
            vm, 'Could not check dropped_received_packets_count: '
            'no NAT gateway name is available')
        return

    gateway_series = monitoring.query(
        project_id,
        'fetch nat_gateway::router.googleapis.com/nat/dropped_received_packets_count '
        f'| filter (resource.gateway_name == \'{gw_name}\' && resource.region == \'{region}\')'
        '| align rate(5m) | within 5m | group_by [],'
        '[value_dropped_received_packets_count_aggregate:'
        'aggregate(value.dropped_received_packets_count)]')
    if not gateway_series:
        op.add_uncertain(
            vm, 'Could not get dropped_received_packets_count '
            f'metric for NATGW {gw_name}')
        return

    for series in gateway_series.values():
        # First point of the series: the aligned drop rate for the gateway.
        gateway_drop_rate = series.get('values')[0][0]
        if gateway_drop_rate >= 1:
            op.put('natgw_rcv_pkt_drops', True)
            op.add_uncertain(vm,
                             reason=op.prep_msg(
                                 op.UNCERTAIN_REASON,
                                 nat_gateway_name=gw_name,
                                 metric_value=gateway_drop_rate),
                             remediation=op.prep_msg(op.UNCERTAIN_REMEDIATION))

            # Also check for received packet drops at the VM level.
            vm_series = monitoring.query(
                project_id,
                'fetch gce_instance::compute.googleapis.com/nat/dropped_received_packets_count '
                f'| filter (resource.gateway_name == \'{gw_name}\' '
                f'&& resource.region == \'{region}\')'
                '| align rate(5m)'
                '| every 5m'
                '| group_by [resource.instance_id], '
                '[value_dropped_received_packets_count_aggregate: '
                'aggregate(value.dropped_received_packets_count)]')
            if not vm_series:
                # No per-VM data: nothing further to report for this series.
                continue

            vm_drop_list = []
            for vm_entry in vm_series.values():
                vm_drop_rate = vm_entry.get('values')[0][0]
                # NOTE(review): `<= 5` lets the list grow to six entries; kept
                # as-is - confirm whether a cap of five was intended.
                if vm_drop_rate >= 1 and len(vm_drop_list) <= 5:
                    vm_drop_list.append({
                        'instance_id':
                            vm_entry.get('labels',
                                         {}).get('resource.instance_id'),
                        'rcv_pkt_drp_count':
                            vm_drop_rate,
                    })

            if vm_drop_list:
                op.add_uncertain(
                    vm,
                    reason='Elevated received_packet_drop_count metric noticed '
                    f'for following VMs {vm_drop_list}',
                    # Built from explicit literals so the second URL stays on
                    # a single line and remains usable.
                    remediation=(
                        'VMs could be dropping packets for various reasons; however,\n'
                        'the drops are not always indicative of an issue.\n'
                        'See more on troubleshooting cloud NAT and reducing the drops here [1] and [2]:\n'
                        'Open a case to GCP Support for justification for the packet drops.\n'
                        '[1] https://cloud.google.com/nat/docs/troubleshooting\n'
                        '[2] https://cloud.google.com/knowledge/kb'
                        '/reduce-received-packets-dropped-count-on-cloud-nat-000006744'
                    ))
            else:
                op.add_ok(vm, reason=op.prep_msg(op.SUCCESS_REASON))
        else:
            op.add_ok(vm,
                      reason=op.prep_msg(op.SUCCESS_REASON,
                                         nat_gateway_name=gw_name))