in modules/adminapi/dba/reboot_cluster_from_complete_outage.cc [670:1023]
// Re-bootstraps Group Replication on the seed (target) instance of the
// Cluster being rebooted.
//
// The steps, in order, are:
//  1. Stop GR on the target if it is auto-rejoining.
//  2. Enable/disable the 'mysql_disable_super_read_only_if_primary' member
//     action according to the ClusterSet state in cs_info and, when the
//     Cluster is not the primary one, reset the ClusterSet replication
//     channel if it is OFF.
//  3. Validate the GR options and the target instance (membership,
//     configuration, non-empty GTID set).
//  4. When the 'MySQL' communication stack is used, re-create the recovery
//     account and point the GR recovery channel at it.
//  5. Install the GR plugin if needed, resolve the remaining GR options,
//     start the group and wait for the seed to become ONLINE.
//
// @param cs_info ClusterSet-related state of the Cluster. The fields read
//        here are removed_from_set, is_invalidated and is_primary.
//
// @throws shcore::Exception with code SHERR_DBA_GTID_SYNC_ERROR if the
//         executed GTID set of the target instance is empty.
// @throws shcore::Exception (MySQL error) if resetting the ClusterSet
//         replication channel fails with a shcore::Error.
// @throws Any exception raised by the membership check that does not map to
//         "already a member", and any exception raised by the called helpers.
void Reboot_cluster_from_complete_outage::reboot_seed(
    const Cluster_set_info &cs_info) {
  // If GR auto-started stop it otherwise changing GR member actions will fail
  if (cluster_topology_executor_ops::is_member_auto_rejoining(
          m_target_instance)) {
    cluster_topology_executor_ops::ensure_not_auto_rejoining(m_target_instance);
  }

  // Returns true when the 'mysql_disable_super_read_only_if_primary' member
  // action either does not exist or exists but is not enabled.
  const auto is_gr_sro_if_primary_disabled =
      [](const mysqlshdk::mysql::IInstance &instance) {
        bool enabled = false;
        bool action_exists = mysqlshdk::gr::get_member_action_status(
            instance, mysqlshdk::gr::k_gr_disable_super_read_only_if_primary,
            &enabled);

        return !action_exists || !enabled;
      };

  if (cs_info.removed_from_set) {
    // enable mysql_disable_super_read_only_if_primary if needed
    if (is_gr_sro_if_primary_disabled(*m_target_instance)) {
      log_info("Enabling automatic super_read_only management on '%s'",
               m_target_instance->descr().c_str());
      mysqlshdk::gr::enable_member_action(
          *m_target_instance,
          mysqlshdk::gr::k_gr_disable_super_read_only_if_primary,
          mysqlshdk::gr::k_gr_member_action_after_primary_election);
    }
  } else if (cs_info.is_invalidated) {  // The Cluster does not know if it was
                                        // removed from the ClusterSet
    // disable mysql_disable_super_read_only_if_primary if needed
    if (!is_gr_sro_if_primary_disabled(*m_target_instance)) {
      log_info("Disabling automatic super_read_only management on '%s'",
               m_target_instance->descr().c_str());
      mysqlshdk::gr::disable_member_action(
          *m_target_instance,
          mysqlshdk::gr::k_gr_disable_super_read_only_if_primary,
          mysqlshdk::gr::k_gr_member_action_after_primary_election);
    }
  }

  // In both of the cases above, if this is a Replica Cluster (i.e. not the
  // primary one) the ClusterSet replication channel must be removed too.
  const bool remove_cs_replication_channel =
      (cs_info.removed_from_set || cs_info.is_invalidated) &&
      !cs_info.is_primary;

  // remove replication channel
  if (remove_cs_replication_channel) {
    if (mysqlshdk::mysql::Replication_channel channel;
        mysqlshdk::mysql::get_channel_status(
            *m_target_instance, k_clusterset_async_channel_name, &channel)) {
      auto status = channel.status();

      // Explicit cast: an enumerator must not be handed as-is to a
      // printf-style variadic function expecting an int.
      log_info("State of clusterset replication channel: %d",
               static_cast<int>(status));

      // The channel is only reset when it is stopped.
      if (status == mysqlshdk::mysql::Replication_channel::OFF) {
        try {
          mysqlshdk::mysql::reset_slave(*m_target_instance,
                                        k_clusterset_async_channel_name, true);
        } catch (const shcore::Error &e) {
          throw shcore::Exception::mysql_error_with_code(e.what(), e.code());
        }
      }
    }
  }

  // Validations and variables initialization
  {
    // Set the communicationStack if option used
    if (m_options.switch_communication_stack.has_value()) {
      m_options.gr_options.communication_stack =
          *(m_options.switch_communication_stack);
    }

    // Validate the GR options.
    // Note: If the user provides no group_seeds value, it is automatically
    // assigned a value with the local_address values of the existing cluster
    // members and those local_address values are already validated on the
    // validate_local_address_ip_compatibility method, so we only need to
    // validate the group_seeds value provided by the user.
    m_options.gr_options.check_option_values(
        m_target_instance->get_version(),
        m_target_instance->get_canonical_port());

    m_options.gr_options.manual_start_on_boot =
        m_cluster->impl()->get_manual_start_on_boot_option();

    // Make sure the target instance does not already belong to a different
    // cluster. The two error codes below are recorded in m_already_member
    // instead of being treated as failures; anything else is rethrown.
    try {
      mysqlsh::dba::checks::ensure_instance_not_belong_to_cluster(
          m_target_instance, m_cluster->impl()->get_cluster_server(),
          m_cluster->impl()->get_id());
    } catch (const shcore::Exception &exp) {
      m_already_member =
          (exp.code() == SHERR_DBA_ASYNC_MEMBER_INCONSISTENT) ||
          (exp.code() == SHERR_DBA_BADARG_INSTANCE_MANAGED_IN_CLUSTER);
      if (!m_already_member) throw;
    }

    check_instance_configuration();

    // An empty GTID set is rejected: it is not possible to tell whether the
    // instance holds data that would be overwritten.
    if (get_executed_gtid_set(*m_target_instance).empty()) {
      current_console()->print_note(
          "The target instance '" + m_target_instance->descr() +
          "' has not been pre-provisioned (GTID set is empty). The "
          "Shell is unable to determine whether the instance has "
          "pre-existing data that would be overwritten.");

      throw shcore::Exception("The instance '" + m_target_instance->descr() +
                                  "' has an empty GTID set.",
                              SHERR_DBA_GTID_SYNC_ERROR);
    }
  }

  // Re-bootstrap
  {
    // Set the internal configuration object: read/write configs from the
    // server.
    auto cfg = mysqlsh::dba::create_server_config(
        m_target_instance.get(), mysqlshdk::config::k_dft_cfg_server_handler);

    // Common informative logging
    cluster_topology_executor_ops::log_used_gr_options(m_options.gr_options);

    // If the Cluster is using the 'MySQL' communication stack, we cannot
    // guarantee that:
    //
    //  - The recovery account exists and is configured at every Cluster
    //  member
    //  - The recovery credentials didn't change (for example after a
    //  .resetRecoveryAccountsPassword())
    //  - The recovery credentials have the required Grants
    //
    // For those reasons, we must simply re-create the recovery account
    if (m_options.gr_options.communication_stack.value_or("") ==
        kCommunicationStackMySQL) {
      // Evaluated once so that disabling and re-enabling the binary log are
      // always decided by the same condition.
      const bool is_replica_cluster =
          m_cluster->impl()->is_cluster_set_member() &&
          !m_cluster->impl()->is_primary_cluster();

      // If it's a Replica cluster, binary logging is disabled in this session
      // while the recovery account is re-created.
      // NOTE(review): presumably this avoids introducing errant transactions
      // in the Replica Cluster - confirm.
      if (is_replica_cluster) {
        m_target_instance->execute("SET session sql_log_bin = 0");
      }

      mysqlshdk::mysql::Auth_options repl_account;
      std::string repl_account_host = "%";

      // Everything that runs with sql_log_bin possibly disabled is guarded,
      // so the session setting is restored on failure too and the original
      // exception still propagates unchanged.
      try {
        // Disable SRO if enabled
        if (m_target_instance->get_sysvar_bool("super_read_only", false)) {
          m_target_instance->set_sysvar("super_read_only", false);
        }

        // Get the recovery account stored in the Metadata
        std::string recovery_user;
        std::vector<std::string> recovery_user_hosts;
        try {
          std::tie(recovery_user, recovery_user_hosts, std::ignore) =
              m_cluster->impl()->get_replication_user(*m_target_instance);
        } catch (const shcore::Exception &re) {
          if (re.is_runtime()) {
            mysqlsh::current_console()->print_error(
                "Unsupported recovery account has been found for "
                "instance " +
                m_target_instance->descr() +
                ". Operations such as "
                "<Cluster>.<<<resetRecoveryAccountsPassword>>>() and "
                "<Cluster>.<<<addInstance>>>() may fail. Please remove and "
                "add the instance back to the Cluster to ensure a "
                "supported recovery account is used.");
          }
          throw;
        }

        repl_account.user = recovery_user;

        // Check if the replication user already exists to delete it
        // before creating it again
        for (const auto &hostname : recovery_user_hosts) {
          if (!m_target_instance->user_exists(repl_account.user, hostname))
            continue;

          current_console()->print_note(shcore::str_format(
              "User '%s'@'%s' already existed at instance '%s'. It will be "
              "deleted and created again with a new password.",
              repl_account.user.c_str(), hostname.c_str(),
              m_target_instance->descr().c_str()));
          m_target_instance->drop_user(repl_account.user, hostname);
        }

        // Get the replicationAllowedHost value set for the Cluster; "%" is
        // kept when the attribute is missing, not a string, or empty.
        if (shcore::Value allowed_host;
            m_cluster->impl()
                ->get_metadata_storage()
                ->query_cluster_set_attribute(
                    m_cluster->impl()->get_id(),
                    k_cluster_attribute_replication_allowed_host,
                    &allowed_host) &&
            allowed_host.type == shcore::String &&
            !allowed_host.as_string().empty()) {
          repl_account_host = allowed_host.as_string();
        }

        // Create a new recovery account
        {
          std::vector<std::string> hosts;
          hosts.push_back(repl_account_host);

          mysqlshdk::gr::Create_recovery_user_options create_options;
          create_options.clone_supported = true;
          create_options.auto_failover = false;
          create_options.mysql_comm_stack_supported = true;

          repl_account = mysqlshdk::gr::create_recovery_user(
              repl_account.user, *m_target_instance, hosts, create_options);
        }

        // Change the credentials of the GR recovery channel of the target
        // instance so that it uses the account that was just created.
        mysqlshdk::mysql::Replication_credentials_options credentials_options;
        credentials_options.password = repl_account.password.value_or("");

        mysqlshdk::mysql::change_replication_credentials(
            *m_target_instance, mysqlshdk::gr::k_gr_recovery_channel,
            repl_account.user, credentials_options);
      } catch (...) {
        if (is_replica_cluster) {
          // Best-effort restore: a failure here must not replace the
          // exception that is being propagated.
          try {
            m_target_instance->execute("SET session sql_log_bin = 1");
          } catch (const shcore::Error &e) {
            log_info("Unable to re-enable sql_log_bin on '%s': %s",
                     m_target_instance->descr().c_str(), e.what());
          } catch (...) {
            log_info("Unable to re-enable sql_log_bin on '%s'",
                     m_target_instance->descr().c_str());
          }
        }
        throw;
      }

      if (is_replica_cluster) {
        m_target_instance->execute("SET session sql_log_bin = 1");
      }

      // Insert the recovery account on the Metadata Schema.
      m_cluster->impl()->get_metadata_storage()->update_instance_repl_account(
          m_target_instance->get_uuid(), Cluster_type::GROUP_REPLICATION,
          repl_account.user, repl_account_host);

      // Set the allowlist to 'AUTOMATIC' to ensure no older values are used
      // since reboot will re-use the values persisted in the instance.
      // NOTE: AUTOMATIC because there's no other allowed value when using the
      // 'MySQL' communication stack
      m_options.gr_options.ip_allowlist = "AUTOMATIC";
    }

    // Make sure the GR plugin is installed (only installed if needed).
    // NOTE: An error is issued if it fails to be installed (e.g., DISABLED).
    //       Disable read-only temporarily to install the plugin if needed.
    mysqlshdk::gr::install_group_replication_plugin(*m_target_instance,
                                                    nullptr);

    if (m_options.gr_options.group_name.has_value() &&
        !m_options.gr_options.group_name->empty()) {
      log_info("Using Group Replication group name: %s",
               m_options.gr_options.group_name->c_str());
    }

    // Get the value for transaction size limit stored in the Metadata if it
    // wasn't set by the caller
    if (!m_options.gr_options.transaction_size_limit.has_value()) {
      int64_t transaction_size_limit{0};

      if (shcore::Value value;
          m_cluster->impl()->get_metadata_storage()->query_cluster_attribute(
              m_cluster->impl()->get_id(),
              k_cluster_attribute_transaction_size_limit, &value)) {
        transaction_size_limit = value.as_int();
      } else {
        // Use what's set in the instance
        transaction_size_limit = get_transaction_size_limit(
            *m_cluster->impl()->get_cluster_server());
      }

      log_info("Using Group Replication transaction size limit: %" PRId64,
               transaction_size_limit);

      m_options.gr_options.transaction_size_limit = transaction_size_limit;
    }

    // Get the persisted value of paxosSingleLeader to use it. Any non-empty
    // persisted value other than "on" (case-insensitive) is taken as false.
    if (!m_options.gr_options.paxos_single_leader.has_value() &&
        m_target_instance->is_set_persist_supported()) {
      std::string paxos_single_leader =
          m_target_instance
              ->get_persisted_value("group_replication_paxos_single_leader")
              .value_or("");

      if (!paxos_single_leader.empty()) {
        m_options.gr_options.paxos_single_leader =
            shcore::str_caseeq(paxos_single_leader, "on");
      }
    }

    log_info("Starting cluster with '%s' using account %s",
             m_target_instance->descr().c_str(),
             m_target_instance->get_connection_options().get_user().c_str());

    // Determine the topology mode to use.
    auto multi_primary = m_cluster->impl()->get_cluster_topology_type() ==
                         mysqlshdk::gr::Topology_mode::MULTI_PRIMARY;

    // Every certificate-based authentication type requires certificates; all
    // other types do not.
    bool requires_certificates{false};
    switch (m_cluster->impl()->query_cluster_auth_type()) {
      case mysqlsh::dba::Replication_auth_type::CERT_ISSUER:
      case mysqlsh::dba::Replication_auth_type::CERT_SUBJECT:
      case mysqlsh::dba::Replication_auth_type::CERT_ISSUER_PASSWORD:
      case mysqlsh::dba::Replication_auth_type::CERT_SUBJECT_PASSWORD:
        requires_certificates = true;
        break;
      default:
        break;
    }

    // Start the cluster to bootstrap Group Replication.
    mysqlsh::dba::start_cluster(*m_target_instance, m_options.gr_options,
                                requires_certificates, multi_primary,
                                cfg.get());

    // Wait for the seed instance to become ONLINE in the Group.
    // Especially relevant on Replica Clusters to ensure the seed instance is
    // already ONLINE when other members try to rejoin the Cluster, otherwise,
    // the rejoining members will fail to rejoin because the managed
    // replication channel may be started with auto-failover while the members
    // haven't obtained those channel configurations yet.
    uint32_t timeout = 5 * 60 * 1000;  // 5 minutes
    mysqlsh::current_console()->print_info(
        "* Waiting for seed instance to become ONLINE...");
    mysqlshdk::gr::wait_member_online(*m_target_instance, timeout);

    // Update the instances Metadata to ensure 'grLocal' reflects the new
    // value for local_address. Do it only when communicationStack is used and
    // it's not a replica cluster to not introduce errant transactions
    if (m_options.gr_options.communication_stack.has_value() &&
        (!m_cluster->impl()->is_cluster_set_member() ||
         m_cluster->impl()->is_primary_cluster())) {
      m_cluster->impl()->update_metadata_for_instance(*m_target_instance);
    }

    log_debug("Instance add finished");

    current_console()->print_info(m_target_instance->descr() +
                                  " was restored.");
  }
}