in gcpdiag/runbook/interconnect/bgp_down_flap.py [0:0]
def execute(self):
    """Check if there are BGP flap events.

    Queries Cloud Router (``gce_router``) log entries mentioning BGP sessions
    that "came up" or "went down" between ``flags.START_TIME`` and
    ``flags.END_TIME`` (optionally restricted to ``flags.REGION``), pairs each
    "went down" event with the next "came up" event of the same router and
    BGP peer, and reports one outcome for the project:

    * failed    - at least one down/up pair lasted longer than 60 seconds;
    * uncertain - BGP events were found, but no pair exceeded 60 seconds;
    * ok        - no BGP up/down events were found.

    Down/up pairs of 60 seconds or less are stored as a JSON object in
    ``flags.BGP_FLAP_LIST`` (router id -> list of ``"<down>,<up>"`` timestamp
    strings) for the following Cloud Router maintenance check.
    """
    project = crm.get_project(op.get(flags.PROJECT_ID))

    filter_parts = [
        'resource.type="gce_router"', '"BGP "', '("came up" OR "went down")'
    ]
    region = op.get(flags.REGION)
    if region:
        filter_parts.append(f'resource.labels.region="{region}"')
    filter_str = '\n'.join(filter_parts)

    # Keep in sync with the "over 60s" wording of the message emitted below.
    flap_error_threshold_secs = 60

    # (router_id, peer ip) -> (timestamp, log entry) of the earliest
    # "went down" event that has not been matched by a "came up" event yet.
    pending_down = {}
    router_logs = {}  # router_id -> every parsed BGP event, oldest first
    err_router_logs = {}  # router_id -> down/up entries of long flaps
    bgp_flaps = {}  # router_id -> ["<down ts>,<up ts>", ...] of short flaps

    serial_log_entries = local_realtime_query(op.get(flags.START_TIME),
                                              op.get(flags.END_TIME),
                                              filter_str)

    # Ensure the entries are ordered oldest first: a negative delta between
    # the first and the last entry means the query returned newest first.
    if len(serial_log_entries) > 1:
        first_ts = get_path(serial_log_entries[0], ('timestamp',),
                            default=None)
        last_ts = get_path(serial_log_entries[-1], ('timestamp',),
                           default=None)
        if calculate_time(first_ts, last_ts).startswith('-'):
            serial_log_entries = list(reversed(serial_log_entries))

    for item in serial_log_entries:
        payload = get_path(item, ('textPayload',), default=None)
        timestamp = get_path(item, ('timestamp',), default=None)
        router_id = get_path(item, ('resource', 'labels', 'router_id'),
                             default=None)

        # The log filter is textual, so it can match entries that carry no
        # text payload or that are not "peering with <ip> ..." messages.
        # Skip those instead of failing the whole step.
        if not payload:
            continue
        _, marker, peer_text = payload.partition('peering with ')
        peer_fields = peer_text.split()
        if not marker or not peer_fields:
            continue

        ip = peer_fields[0]
        came_up = 'came up' in peer_text
        logentry = [
            router_id, ip, 'came up' if came_up else 'went down', timestamp
        ]
        router_logs.setdefault(router_id, []).append(logentry)

        # Track sessions per (router, peer): a router can hold several BGP
        # peers whose events interleave in the log.
        peer_key = (router_id, ip)
        if not came_up:
            # Keep the earliest unmatched down event so the measured
            # duration covers the whole outage.
            pending_down.setdefault(peer_key, (timestamp, logentry))
            continue

        down = pending_down.pop(peer_key, None)
        if down is None:
            # "came up" without a preceding "went down" in the query window.
            continue

        downtime, down_entry = down
        delta = calculate_time(downtime, timestamp)
        logentry.append(delta)
        # NOTE(review): assumes calculate_time() returns the duration as a
        # string of seconds such as "12.345" - confirm against the helper.
        if int(delta.split('.', maxsplit=1)[0]) > flap_error_threshold_secs:
            # BGP flaps over 60s is an error: keep the down/up pair.
            err_router_logs.setdefault(router_id, []).extend(
                [down_entry, logentry])
        else:
            # BGP flaps less than 60s need further check: save router_id,
            # down and up timestamps for the next step (Cloud Router
            # maintenance check).
            bgp_flaps.setdefault(router_id,
                                 []).append(downtime + ',' + timestamp)

    op.put(flags.BGP_FLAP_LIST, json.dumps(bgp_flaps))

    if not router_logs:
        op.add_ok(project, reason=op.prep_msg(op.SUCCESS_REASON))
        return

    if not err_router_logs:
        op.add_uncertain(project,
                         reason=op.prep_msg(op.UNCERTAIN_REASON,
                                            project_id=project.id),
                         remediation=op.prep_msg(op.UNCERTAIN_REMEDIATION))
        return

    # Display BGP flaps with time duration over 60s.
    op.info('')
    op.info('There are Cloud Router BGP flaps over 60s: ')
    for entries in err_router_logs.values():
        for entry in entries:
            op.info(str(entry))
    op.add_failed(project,
                  reason=op.prep_msg(op.FAILURE_REASON,
                                     project_id=project.id),
                  remediation=op.prep_msg(op.FAILURE_REMEDIATION))