void Reboot_cluster_from_complete_outage::reboot_seed()

in modules/adminapi/dba/reboot_cluster_from_complete_outage.cc [670:1023]


void Reboot_cluster_from_complete_outage::reboot_seed(
    const Cluster_set_info &cs_info) {
  // If GR auto-started stop it otherwise changing GR member actions will fail
  if (cluster_topology_executor_ops::is_member_auto_rejoining(
          m_target_instance)) {
    cluster_topology_executor_ops::ensure_not_auto_rejoining(m_target_instance);
  }

  const auto is_gr_sro_if_primary_disabled =
      [](const mysqlshdk::mysql::IInstance &instance) {
        bool enabled = false;
        bool action_exists = mysqlshdk::gr::get_member_action_status(
            instance, mysqlshdk::gr::k_gr_disable_super_read_only_if_primary,
            &enabled);

        return !action_exists || !enabled;
      };

  bool remove_cs_replication_channel = false;

  if (cs_info.removed_from_set) {
    // enable mysql_disable_super_read_only_if_primary if needed
    if (is_gr_sro_if_primary_disabled(*m_target_instance)) {
      log_info("Enabling automatic super_read_only management on '%s'",
               m_target_instance->descr().c_str());

      mysqlshdk::gr::enable_member_action(
          *m_target_instance,
          mysqlshdk::gr::k_gr_disable_super_read_only_if_primary,
          mysqlshdk::gr::k_gr_member_action_after_primary_election);
    }

    // If this is a Replica Cluster we must remove the replication channel too
    if (!cs_info.is_primary) {
      remove_cs_replication_channel = true;
    }
  } else if (cs_info.is_invalidated) {  // The Cluster does not know if it was
                                        // removed from the ClusterSet
    // disable mysql_disable_super_read_only_if_primary if needed
    if (!is_gr_sro_if_primary_disabled(*m_target_instance)) {
      log_info("Disabling automatic super_read_only management on '%s'",
               m_target_instance->descr().c_str());

      mysqlshdk::gr::disable_member_action(
          *m_target_instance,
          mysqlshdk::gr::k_gr_disable_super_read_only_if_primary,
          mysqlshdk::gr::k_gr_member_action_after_primary_election);
    }

    // If this is a Replica Cluster we must remove the replication channel too
    if (!cs_info.is_primary) {
      remove_cs_replication_channel = true;
    }
  }

  // remove replication channel
  if (remove_cs_replication_channel) {
    if (mysqlshdk::mysql::Replication_channel channel;
        mysqlshdk::mysql::get_channel_status(
            *m_target_instance, k_clusterset_async_channel_name, &channel)) {
      auto status = channel.status();
      log_info("State of clusterset replication channel: %d", status);

      if (status == mysqlshdk::mysql::Replication_channel::OFF) {
        try {
          mysqlshdk::mysql::reset_slave(*m_target_instance,
                                        k_clusterset_async_channel_name, true);
        } catch (const shcore::Error &e) {
          throw shcore::Exception::mysql_error_with_code(e.what(), e.code());
        }
      }
    }
  }

  // Validations and variables initialization
  {
    // Set the communicationStack if option used
    if (m_options.switch_communication_stack.has_value()) {
      m_options.gr_options.communication_stack =
          *(m_options.switch_communication_stack);
    }

    // Validate the GR options.
    // Note: If the user provides no group_seeds value, it is automatically
    // assigned a value with the local_address values of the existing cluster
    // members and those local_address values are already validated on the
    // validate_local_address_ip_compatibility method, so we only need to
    // validate the group_seeds value provided by the user.
    m_options.gr_options.check_option_values(
        m_target_instance->get_version(),
        m_target_instance->get_canonical_port());

    m_options.gr_options.manual_start_on_boot =
        m_cluster->impl()->get_manual_start_on_boot_option();

    // Make sure the target instance does not already belong to a different
    // cluster.
    try {
      mysqlsh::dba::checks::ensure_instance_not_belong_to_cluster(
          m_target_instance, m_cluster->impl()->get_cluster_server(),
          m_cluster->impl()->get_id());
    } catch (const shcore::Exception &exp) {
      m_already_member =
          (exp.code() == SHERR_DBA_ASYNC_MEMBER_INCONSISTENT) ||
          (exp.code() == SHERR_DBA_BADARG_INSTANCE_MANAGED_IN_CLUSTER);
      if (!m_already_member) throw;
    }

    check_instance_configuration();

    if (get_executed_gtid_set(*m_target_instance).empty()) {
      current_console()->print_note(
          "The target instance '" + m_target_instance->descr() +
          "' has not been pre-provisioned (GTID set is empty). The "
          "Shell is unable to determine whether the instance has "
          "pre-existing data that would be overwritten.");

      throw shcore::Exception("The instance '" + m_target_instance->descr() +
                                  "' has an empty GTID set.",
                              SHERR_DBA_GTID_SYNC_ERROR);
    }
  }

  // Re-bootstrap
  {
    // Set the internal configuration object: read/write configs from the
    // server.
    auto cfg = mysqlsh::dba::create_server_config(
        m_target_instance.get(), mysqlshdk::config::k_dft_cfg_server_handler);

    // Common informative logging
    cluster_topology_executor_ops::log_used_gr_options(m_options.gr_options);

    // If the Cluster is using the 'MySQL' communication stack, we cannot
    // guarantee that:
    //
    //   - The recovery account exists and is configured at every Cluster
    //   member
    //   - The recovery credentials didn't change (for example after a
    //   .resetRecoveryAccountsPassword())
    //   - The recovery credentials have the required Grants
    //
    // For those reasons, we must simply re-create the recovery account
    if (m_options.gr_options.communication_stack.value_or("") ==
        kCommunicationStackMySQL) {
      // If it's a Replica cluster, we must disable the binary logging and
      // ensure the are created later
      if (m_cluster->impl()->is_cluster_set_member() &&
          !m_cluster->impl()->is_primary_cluster()) {
        m_target_instance->execute("SET session sql_log_bin = 0");
      }

      // Disable SRO if enabled
      if (m_target_instance->get_sysvar_bool("super_read_only", false)) {
        m_target_instance->set_sysvar("super_read_only", false);
      }

      // Get the recovery account stored in the Metadata
      std::string recovery_user;
      std::vector<std::string> recovery_user_hosts;
      try {
        std::tie(recovery_user, recovery_user_hosts, std::ignore) =
            m_cluster->impl()->get_replication_user(*m_target_instance);
      } catch (const shcore::Exception &re) {
        if (re.is_runtime()) {
          mysqlsh::current_console()->print_error(
              "Unsupported recovery account has been found for "
              "instance " +
              m_target_instance->descr() +
              ". Operations such as "
              "<Cluster>.<<<resetRecoveryAccountsPassword>>>() and "
              "<Cluster>.<<<addInstance>>>() may fail. Please remove and "
              "add the instance back to the Cluster to ensure a "
              "supported recovery account is used.");
        }
        throw;
      }

      mysqlshdk::mysql::Auth_options repl_account;
      repl_account.user = recovery_user;

      // Check if the replication user already exists to delete it
      // before creating it again
      for (const auto &hostname : recovery_user_hosts) {
        if (!m_target_instance->user_exists(repl_account.user, hostname))
          continue;

        current_console()->print_note(shcore::str_format(
            "User '%s'@'%s' already existed at instance '%s'. It will be "
            "deleted and created again with a new password.",
            repl_account.user.c_str(), hostname.c_str(),
            m_target_instance->descr().c_str()));

        m_target_instance->drop_user(repl_account.user, hostname);
      }

      // Get the replicationAllowedHost value set for the Cluster
      std::string repl_account_host = "%";

      if (shcore::Value allowed_host;
          m_cluster->impl()
              ->get_metadata_storage()
              ->query_cluster_set_attribute(
                  m_cluster->impl()->get_id(),
                  k_cluster_attribute_replication_allowed_host,
                  &allowed_host) &&
          allowed_host.type == shcore::String &&
          !allowed_host.as_string().empty()) {
        repl_account_host = allowed_host.as_string();
      }

      // Create a new recovery account
      {
        std::vector<std::string> hosts;
        hosts.push_back(repl_account_host);

        mysqlshdk::gr::Create_recovery_user_options options;
        options.clone_supported = true;
        options.auto_failover = false;
        options.mysql_comm_stack_supported = true;

        repl_account = mysqlshdk::gr::create_recovery_user(
            repl_account.user, *m_target_instance, hosts, options);
      }

      // Change GR's recovery replication credentials in all possible
      // donors so whenever GR picks a suitable donor it will be able to
      // connect and authenticate at the target
      // NOTE: Instances in RECOVERING must be skipped since won't be used
      // as donor and the change source command would fail anyway
      mysqlshdk::mysql::Replication_credentials_options options;
      options.password = repl_account.password.value_or("");

      mysqlshdk::mysql::change_replication_credentials(
          *m_target_instance, mysqlshdk::gr::k_gr_recovery_channel,
          repl_account.user, options);

      if (m_cluster->impl()->is_cluster_set_member() &&
          !m_cluster->impl()->is_primary_cluster()) {
        m_target_instance->execute("SET session sql_log_bin = 1");
      }

      // Insert the recovery account on the Metadata Schema.
      m_cluster->impl()->get_metadata_storage()->update_instance_repl_account(
          m_target_instance->get_uuid(), Cluster_type::GROUP_REPLICATION,
          repl_account.user, repl_account_host);

      // Set the allowlist to 'AUTOMATIC' to ensure no older values are used
      // since reboot will re-use the values persisted in the instance.
      // NOTE: AUTOMATIC because there's no other allowed value when using the
      // 'MySQL' communication stack
      m_options.gr_options.ip_allowlist = "AUTOMATIC";
    }

    // Make sure the GR plugin is installed (only installed if needed).
    // NOTE: An error is issued if it fails to be installed (e.g., DISABLED).
    //       Disable read-only temporarily to install the plugin if needed.
    mysqlshdk::gr::install_group_replication_plugin(*m_target_instance,
                                                    nullptr);

    if (m_options.gr_options.group_name.has_value() &&
        !m_options.gr_options.group_name->empty()) {
      log_info("Using Group Replication group name: %s",
               m_options.gr_options.group_name->c_str());
    }

    // Get the value for transaction size limit stored in the Metadata if it
    // wasn't set by the caller
    if (!m_options.gr_options.transaction_size_limit.has_value()) {
      int64_t transaction_size_limit;

      if (shcore::Value value;
          m_cluster->impl()->get_metadata_storage()->query_cluster_attribute(
              m_cluster->impl()->get_id(),
              k_cluster_attribute_transaction_size_limit, &value)) {
        transaction_size_limit = value.as_int();
      } else {
        // Use what's set in the instance
        transaction_size_limit = get_transaction_size_limit(
            *m_cluster->impl()->get_cluster_server());
      }

      log_info("Using Group Replication transaction size limit: %" PRId64,
               transaction_size_limit);

      m_options.gr_options.transaction_size_limit = transaction_size_limit;
    }

    // Get the persisted value of paxosSingleLeader to use it
    if (!m_options.gr_options.paxos_single_leader.has_value() &&
        m_target_instance->is_set_persist_supported()) {
      std::string paxos_single_leader =
          m_target_instance
              ->get_persisted_value("group_replication_paxos_single_leader")
              .value_or("");

      if (!paxos_single_leader.empty()) {
        m_options.gr_options.paxos_single_leader =
            shcore::str_caseeq(paxos_single_leader, "on") ? true : false;
      }
    }

    log_info("Starting cluster with '%s' using account %s",
             m_target_instance->descr().c_str(),
             m_target_instance->get_connection_options().get_user().c_str());

    // Determine the topology mode to use.
    auto multi_primary = m_cluster->impl()->get_cluster_topology_type() ==
                         mysqlshdk::gr::Topology_mode::MULTI_PRIMARY;

    bool requires_certificates{false};
    switch (m_cluster->impl()->query_cluster_auth_type()) {
      case mysqlsh::dba::Replication_auth_type::CERT_ISSUER:
      case mysqlsh::dba::Replication_auth_type::CERT_SUBJECT:
      case mysqlsh::dba::Replication_auth_type::CERT_ISSUER_PASSWORD:
      case mysqlsh::dba::Replication_auth_type::CERT_SUBJECT_PASSWORD:
        requires_certificates = true;
      default:
        break;
    }

    // Start the cluster to bootstrap Group Replication.
    mysqlsh::dba::start_cluster(*m_target_instance, m_options.gr_options,
                                requires_certificates, multi_primary,
                                cfg.get());

    // Wait for the seed instance to become ONLINE in the Group.
    // Especially relevant on Replica Clusters to ensure the seed instance is
    // already ONLINE when other members try to rejoin the Cluster, otherwise,
    // the rejoining members will fail to rejoin because the managed
    // replication channel may be started with auto-failover while the members
    // haven't obtained those channel configurations yet.
    uint32_t timeout = 5 * 60 * 1000;  // 5 minutes

    mysqlsh::current_console()->print_info(
        "* Waiting for seed instance to become ONLINE...");

    mysqlshdk::gr::wait_member_online(*m_target_instance, timeout);

    // Update the instances Metadata to ensure 'grLocal' reflects the new
    // value for local_address Do it only when communicationStack is used and
    // it's not a replica cluster to not introduce errant transactions
    if (m_options.gr_options.communication_stack.has_value() &&
        (!m_cluster->impl()->is_cluster_set_member() ||
         m_cluster->impl()->is_primary_cluster())) {
      m_cluster->impl()->update_metadata_for_instance(*m_target_instance);
    }

    log_debug("Instance add finished");

    current_console()->print_info(m_target_instance->descr() +
                                  " was restored.");
  }
}