def execute()

in gcpdiag/runbook/interconnect/bgp_down_flap.py [0:0]


  def execute(self):
    """Check if there are BGP flap events.

    Queries gce_router log entries containing "BGP " and either "came up" or
    "went down" between the START_TIME and END_TIME flags (restricted to the
    REGION flag when it is set). For each router, a "came up" entry is paired
    with the earliest pending "went down" entry to compute the flap duration:

    - flaps longer than 60s are collected as errors;
    - shorter flaps are stored as 'down_timestamp,up_timestamp' strings per
      router and published as JSON under the BGP_FLAP_LIST flag for the next
      step (Cloud Router maintenance check).

    Outcome reported on the project:

    - ok: no BGP up/down event could be parsed from the logs;
    - uncertain: events were found but no flap exceeded 60s;
    - failed: at least one flap exceeded 60s (the entries are printed).
    """
    project = crm.get_project(op.get(flags.PROJECT_ID))

    filter_str = [
        'resource.type="gce_router"', '"BGP "', '("came up" OR "went down")'
    ]

    region = op.get(flags.REGION)
    if region:
      filter_str.append(f'resource.labels.region="{region}"')
    filter_str = '\n'.join(filter_str)

    # router_id -> timestamps of "went down" events not yet matched by a
    # "came up" event.
    # NOTE(review): pending downs are tracked per router, not per peer IP -
    # confirm whether routers with several peers need per-peer pairing.
    last_down_time = {}
    # router_id -> every parsed log entry, in processing order.
    router_logs = {}
    # router_id -> for each flap over 60s: the router's previous entry
    # followed by the "came up" entry.
    err_router_logs = {}
    # router_id -> 'down_timestamp,up_timestamp' for flaps of 60s or less.
    bgp_flaps = {}

    serial_log_entries = local_realtime_query(op.get(flags.START_TIME),
                                              op.get(flags.END_TIME),
                                              filter_str)

    # ensure the serial_log_entries have oldest timestamp first
    if len(serial_log_entries) > 1:
      t1 = get_path(serial_log_entries[0], ('timestamp'), default=None)
      t2 = get_path(serial_log_entries[-1], ('timestamp'), default=None)
      # A negative delta means the first entry is newer than the last one.
      if calculate_time(t1, t2).startswith('-'):
        serial_log_entries = list(reversed(serial_log_entries))

    for item in serial_log_entries:
      payload = get_path(item, ('textPayload'), default=None)
      timestamp = get_path(item, ('timestamp'), default=None)
      router_id = get_path(item, ('resource', 'labels', 'router_id'),
                           default=None)

      # The log filter only matches loose keywords, so an entry may have no
      # text payload or not follow the "... peering with <ip> ..." format.
      # Such entries cannot be attributed to a peer: skip them instead of
      # crashing.
      if not isinstance(payload, str) or 'peering with ' not in payload:
        continue
      tmp = payload.split('peering with ')[1]
      tokens = tmp.split()
      if not tokens:
        continue
      ip = tokens[0]

      event = 'came up' if 'came up' in tmp else 'went down'
      logentry = [router_id, ip, event, timestamp]
      errflag = False

      if 'came up' in payload:
        # Only pair with a down event that is still pending. The pending list
        # is emptied (not removed) after each pairing, so a second "came up"
        # in a row must not index into the empty list.
        pending_downs = last_down_time.get(router_id)
        if pending_downs:
          downtime = pending_downs[0]
          last_down_time[router_id] = []

          delta = calculate_time(downtime, timestamp)

          logentry.append(delta)
          if int(delta.split('.', maxsplit=1)[0]) > 60:
            # BGP flaps over 60s is an error
            errflag = True
          else:
            # BGP flaps less than 60s need further check
            # save router_id, down and up timestamps for next step Cloud
            # Router maintenance check
            down_up_times = downtime + ',' + timestamp
            bgp_flaps.setdefault(router_id, []).append(down_up_times)
      else:
        last_down_time.setdefault(router_id, []).append(timestamp)

      router_logs.setdefault(router_id, []).append(logentry)
      if errflag:
        # errflag implies an earlier "went down" entry was logged for this
        # router, so the list holds at least two entries here.
        lastlogentry = router_logs[router_id][-2]
        err_router_logs.setdefault(router_id, []).extend(
            [lastlogentry, logentry])

    op.put(flags.BGP_FLAP_LIST, json.dumps(bgp_flaps))

    if not router_logs:
      op.add_ok(project, reason=op.prep_msg(op.SUCCESS_REASON))
    elif not err_router_logs:
      op.add_uncertain(project,
                       reason=op.prep_msg(op.UNCERTAIN_REASON,
                                          project_id=project.id),
                       remediation=op.prep_msg(op.UNCERTAIN_REMEDIATION))
    else:
      # display BGP flaps with time duration over 60s.
      op.info('')
      op.info('There are Cloud Router BGP flaps over 60s: ')
      for entries in err_router_logs.values():
        for entry in entries:
          op.info(str(entry))

      op.add_failed(project,
                    reason=op.prep_msg(op.FAILURE_REASON,
                                       project_id=project.id),
                    remediation=op.prep_msg(op.FAILURE_REMEDIATION))