in modules/adminapi/cluster_set/cluster_set_impl.cc [923:1502]
/**
 * Removes a Cluster from the ClusterSet.
 *
 * Sequence of the operation, as implemented below:
 *  1. Check preconditions and take exclusive locks on the ClusterSet and on
 *     the PRIMARY Cluster.
 *  2. Validate the name, resolve the target Cluster, refuse to remove the
 *     PRIMARY Cluster and make sure no Router targets the Cluster.
 *  3. Check the ClusterSet replication channel and sync transactions.
 *  4. Restore transaction_size_limit and disable skip_replica_start.
 *  5. Update the Metadata (ClusterSet membership, replication and recovery
 *     accounts, Cluster metadata) and sync again.
 *  6. Stop and delete the ClusterSet replication channel on the reachable
 *     members.
 *  7. If the Cluster is ONLINE, dissolve it (secondaries first, then the
 *     primary).
 *
 * Changes are registered in an Undo_tracker as they are made. If anything
 * throws inside the execution phase, the tracker is executed and the
 * original exception is rethrown.
 *
 * @param cluster_name Name of the Cluster to be removed.
 * @param options Options of the operation; `force`, `timeout` and `dry_run`
 *                are the ones read here.
 *
 * @throws shcore::Exception if the Cluster cannot be resolved, is the
 *         PRIMARY Cluster, has an unavailable primary / missing or broken
 *         replication channel and `force` is not set, or if a transaction
 *         sync fails and `force` is not set. Exceptions thrown by the helpers
 *         called here are propagated after the revert is attempted.
 */
void Cluster_set_impl::remove_cluster(
    const std::string &cluster_name,
    const clusterset::Remove_cluster_options &options) {
  check_preconditions("removeCluster");

  std::shared_ptr<Cluster_impl> target_cluster;
  bool skip_channel_check = false;

  auto console = mysqlsh::current_console();

  // put an exclusive lock on the clusterset and one exclusive cluster lock on
  // the primary cluster
  auto cs_lock = get_lock_exclusive();
  auto pc_lock = m_primary_cluster->get_lock_exclusive();

  // Validations and initializations of variables
  {
    // Validate the Cluster name
    mysqlsh::dba::validate_cluster_name(cluster_name,
                                        Cluster_type::GROUP_REPLICATION);

    // Get the Cluster object
    // NOTE: This will throw an exception if the Cluster does not exist in the
    // metadata
    try {
      target_cluster = get_cluster(cluster_name, false, true);
    } catch (const shcore::Exception &e) {
      if (e.code() == SHERR_DBA_METADATA_MISSING ||
          e.code() == SHERR_DBA_CLUSTER_DOES_NOT_BELONG_TO_CLUSTERSET) {
        console->print_error(shcore::str_format(
            "The Cluster '%s' does not exist or does not belong to the "
            "ClusterSet.",
            cluster_name.c_str()));
        throw;
      } else {
        if (options.force.get_safe()) {
          // Retry the lookup with the second argument set to true; from this
          // point on the channel checks and syncs are skipped.
          target_cluster = get_cluster(cluster_name, true, true);

          console->print_warning(
              "The target Cluster's Primary instance is unavailable so the "
              "Cluster cannot be cleanly removed from the ClusterSet. Removing "
              "anyway because 'force' is enabled.");
          skip_channel_check = true;
        } else {
          console->print_error(
              "The target Cluster's Primary instance is unavailable so the "
              "Cluster cannot be cleanly removed from the ClusterSet. Use the "
              "'force' option to remove anyway from the replication topology.");
          throw shcore::Exception(
              shcore::str_format(
                  "PRIMARY instance of Cluster '%s' is unavailable: '%s'",
                  cluster_name.c_str(), e.format().c_str()),
              SHERR_DBA_ASYNC_PRIMARY_UNAVAILABLE);
        }
      }
    }

    // Check if the target cluster is a PRIMARY Cluster
    if (target_cluster->is_primary_cluster()) {
      console->print_error(
          "Cannot remove the PRIMARY Cluster of the ClusterSet.");
      throw shcore::Exception("The Cluster '" + cluster_name +
                                  "' is the PRIMARY Cluster of the ClusterSet.",
                              SHERR_DBA_CLUSTER_CANNOT_REMOVE_PRIMARY_CLUSTER);
    }

    // Check if there is any Router instance registered in the ClusterSet using
    // the Cluster as Target Cluster
    auto routers = get_metadata_storage()->get_routers_using_cluster_as_target(
        target_cluster->get_group_name());

    ensure_no_router_uses_cluster(routers, target_cluster->get_name());
  }

  // put an exclusive cluster lock on the target cluster (if reachable)
  mysqlshdk::mysql::Lock_scoped tc_lock;
  if (!skip_channel_check) tc_lock = target_cluster->get_lock_exclusive();

  // Execute the operation
  Undo_tracker undo_tracker;

  // Handles to two of the undo entries. They are only assigned when the
  // Metadata is really updated (i.e. not in dryRun), so they may stay null.
  Undo_tracker::Undo_entry *drop_cluster_undo = nullptr;
  Undo_tracker::Undo_entry *replication_user_undo = nullptr;

  std::vector<Scoped_instance> cluster_reachable_members;
  std::vector<std::pair<std::string, std::string>> cluster_unreachable_members;

  try {
    console->print_info("The Cluster '" + cluster_name +
                        "' will be removed from the InnoDB ClusterSet.");
    console->print_info();

    // Check if the channel exists and its OK (not ERROR) first
    mysqlshdk::mysql::Replication_channel channel;
    if (!skip_channel_check) {
      if (!get_channel_status(*target_cluster->get_cluster_server(),
                              {k_clusterset_async_channel_name}, &channel)) {
        if (!options.force.get_safe()) {
          console->print_error(
              "The ClusterSet Replication channel could not be found at the "
              "Cluster '" +
              cluster_name + "'. Use the 'force' option to ignore this check.");

          throw shcore::Exception("Replication channel does not exist",
                                  SHERR_DBA_REPLICATION_OFF);
        } else {
          skip_channel_check = true;
          console->print_warning(
              "Ignoring non-existing ClusterSet Replication channel because "
              "of 'force' option");
        }
      } else {
        if (channel.status() !=
            mysqlshdk::mysql::Replication_channel::Status::ON) {
          if (!options.force.get_safe()) {
            console->print_error(
                "The ClusterSet Replication channel has an invalid state '" +
                to_string(channel.status()) +
                "'. Use the 'force' option to ignore this check.");

            throw shcore::Exception(
                "ClusterSet Replication Channel not in expected state",
                SHERR_DBA_REPLICATION_INVALID);
          } else {
            skip_channel_check = true;
            console->print_warning(
                "ClusterSet Replication channel has invalid state '" +
                to_string(channel.status()) +
                "'. Ignoring because of 'force' option");
          }
        }
      }
    }

    // Get the list of the reachable and unreachable members of the cluster
    target_cluster->execute_in_members(
        [&cluster_reachable_members](const std::shared_ptr<Instance> &instance,
                                     const Instance_md_and_gr_member &) {
          Scoped_instance reachable_member(instance);
          cluster_reachable_members.emplace_back(reachable_member);
          return true;
        },
        [&cluster_unreachable_members](const shcore::Error &connection_error,
                                       const Instance_md_and_gr_member &info) {
          cluster_unreachable_members.emplace_back(info.first.endpoint,
                                                   connection_error.format());
          return true;
        });

    // Sync transactions before making any changes
    console->print_info(
        "* Waiting for the Cluster to synchronize with the PRIMARY "
        "Cluster...");

    if (!target_cluster->get_primary_master()) {
      console->print_note(
          "Transaction sync was skipped because cluster is unavailable");
    } else if (!skip_channel_check) {
      try {
        sync_transactions(*target_cluster->get_cluster_server(),
                          {k_clusterset_async_channel_name}, options.timeout);
      } catch (const shcore::Exception &e) {
        // 'force' is checked first: the timeout error message below tells the
        // user that 'force' ignores the timeout, so a timeout must not abort
        // the operation when 'force' is enabled.
        if (options.force.get_safe()) {
          console->print_warning(
              "Transaction sync failed but ignored because of 'force' "
              "option: " +
              e.format());
        } else if (e.code() == SHERR_DBA_GTID_SYNC_TIMEOUT) {
          console->print_error(
              "The Cluster failed to synchronize its transaction set "
              "with the PRIMARY Cluster. You may increase the "
              "transaction sync timeout with the option 'timeout' or use "
              "the 'force' option to ignore the timeout.");
          throw;
        } else {
          console->print_error(
              "Transaction sync failed. Use the 'force' option to remove "
              "anyway.");
          throw;
        }
      }
    }

    // Restore the transaction_size_limit value to the original one
    if (target_cluster->get_cluster_server()) {
      restore_transaction_size_limit(target_cluster.get(), options.dry_run);
    }

    // Disable skip_replica_start
    if (!options.dry_run && target_cluster->get_cluster_server() &&
        target_cluster->cluster_availability() ==
            Cluster_availability::ONLINE) {
      log_info("Persisting skip_replica_start=0 across the cluster...");

      auto config = target_cluster->create_config_object({}, true, true, true);

      config->set("skip_replica_start", std::optional<bool>(false));
      config->apply();

      undo_tracker.add("", [=]() {
        log_info("Revert: Enabling skip_replica_start");
        auto config_undo =
            target_cluster->create_config_object({}, true, true, true);

        config_undo->set("skip_replica_start", std::optional<bool>(true));
        config_undo->apply();
      });

      // Set debug trap to test reversion of the ClusterSet setting set-up
      DBUG_EXECUTE_IF("dba_remove_cluster_fail_disable_skip_replica_start",
                      { throw std::logic_error("debug"); });
    }

    // Update Metadata
    console->print_info("* Updating topology");

    log_debug("Removing Cluster from the Metadata.");

    auto metadata = get_metadata_storage();

    struct {
      mysqlsh::dba::Replication_auth_type cluster_auth_type;
      std::string cluster_auth_cert_issuer;
      std::optional<std::string> primary_cert_subject;
    } auth_data_backup;

    // store auth data of the cluster to be removed (in case the revert is
    // executed)
    auth_data_backup.cluster_auth_type =
        target_cluster->query_cluster_auth_type();
    auth_data_backup.cluster_auth_cert_issuer =
        target_cluster->query_cluster_auth_cert_issuer();
    if (auto primary = metadata->get_md_server(); primary) {
      auth_data_backup.primary_cert_subject =
          query_cluster_instance_auth_cert_subject(*primary);
    }

    if (!options.dry_run) {
      {
        MetadataStorage::Transaction trx(metadata);

        metadata->record_cluster_set_member_removed(get_id(),
                                                    target_cluster->get_id());

        // Push the whole transaction and changes to the Undo list
        undo_tracker.add("Recording back ClusterSet member removed", [=]() {
          Cluster_set_member_metadata cluster_md;
          cluster_md.cluster.cluster_id = target_cluster->get_id();
          cluster_md.cluster_set_id = get_id();
          cluster_md.master_cluster_id = get_primary_cluster()->get_id();
          cluster_md.primary_cluster = false;

          MetadataStorage::Transaction trx2(metadata);
          metadata->record_cluster_set_member_added(cluster_md);
          trx2.commit();
        });

        // Only commit transactions once everything is done
        trx.commit();
      }

      // Set debug trap to test reversion of Metadata topology updates to
      // remove the member
      DBUG_EXECUTE_IF("dba_remove_cluster_fail_post_cs_member_removed",
                      { throw std::logic_error("debug"); });

      // Drop cluster replication user
      {
        Sql_undo_list sql_undo;
        drop_cluster_replication_user(target_cluster.get(), &sql_undo);

        replication_user_undo = &undo_tracker.add(
            "Restore cluster replication account", std::move(sql_undo),
            [this]() { return get_metadata_storage()->get_md_server(); });
      }

      // Set debug trap to test reversion of replication user creation
      DBUG_EXECUTE_IF("dba_remove_cluster_fail_post_replication_user_removal",
                      { throw std::logic_error("debug"); });

      // Remove Cluster's recovery accounts
      // (dry_run is already known to be false inside this branch)
      if (target_cluster->get_cluster_server()) {
        Sql_undo_list undo_drop_users;

        // The accounts must be dropped from the ClusterSet
        // NOTE: If the replication channel is down and 'force' was used, the
        // accounts won't be dropped in the target cluster. This is expected,
        // otherwise, it wouldn't be possible to reboot the cluster from
        // complete outage later on
        target_cluster->drop_replication_users(&undo_drop_users);

        undo_tracker.add("Re-creating Cluster recovery accounts",
                         std::move(undo_drop_users),
                         [this]() { return get_primary_master(); });
      }

      // Remove the Cluster's Metadata and Cluster members' info from the
      // Metadata and drop recovery accounts
      {
        auto drop_cluster_trx_undo = Transaction_undo::create();

        MetadataStorage::Transaction trx(metadata);
        metadata->drop_cluster(target_cluster->get_name(),
                               drop_cluster_trx_undo.get());
        trx.commit();

        log_debug("removeCluster() metadata updates done");

        drop_cluster_undo = &undo_tracker.add(
            "Re-creating Cluster's metadata",
            Sql_undo_list(std::move(drop_cluster_trx_undo)),
            [this]() { return get_metadata_storage()->get_md_server(); });
      }

      // Sync again to catch-up the drop user and metadata update
      if (target_cluster->cluster_availability() ==
              Cluster_availability::ONLINE &&
          !skip_channel_check && options.timeout >= 0) {
        try {
          console->print_info(
              "* Waiting for the Cluster to synchronize the Metadata updates "
              "with the PRIMARY Cluster...");
          sync_transactions(*target_cluster->get_cluster_server(),
                            {k_clusterset_async_channel_name}, options.timeout);
        } catch (const shcore::Exception &e) {
          if (options.force.get_safe()) {
            console->print_warning(
                "Transaction sync failed but ignored because of 'force' "
                "option: " +
                e.format());
          } else {
            console->print_error(
                "Transaction sync failed. Use the 'force' option to remove "
                "anyway.");
            throw;
          }
        }
      }
    }

    // Stop replication
    // NOTE: This is done last so all other changes are propagated first to
    // the Cluster being removed
    console->print_info(
        "* Stopping and deleting ClusterSet managed replication channel...");

    // Store the replication options before stopping and deleting the
    // channel to use in the revert process in case of a failure
    auto ar_options = get_clusterset_replication_options();

    if (target_cluster->get_cluster_server() &&
        target_cluster->cluster_availability() ==
            Cluster_availability::ONLINE) {
      // Call the primitive to remove the replica, ensuring:
      //  - super_read_only management is enabled
      //  - the ClusterSet replication channel is stopped and reset
      //  - The managed connection failover configurations are deleted
      //  ... on all members
      const auto &primary_uuid =
          target_cluster->get_cluster_server()->get_uuid();

      for (const auto &instance : cluster_reachable_members) {
        remove_replica(instance.get(), options.dry_run, primary_uuid);
      }

      // Revert in case of failure
      undo_tracker.add("Re-adding Cluster as Replica", [=]() {
        // Both entries are only created when the Metadata was really changed;
        // in dryRun they are null and there is nothing to restore.
        if (drop_cluster_undo) drop_cluster_undo->call();
        if (replication_user_undo) replication_user_undo->call();

        assert(auth_data_backup.primary_cert_subject.has_value());
        auto repl_credentials = create_cluster_replication_user(
            target_cluster->get_cluster_server().get(), "",
            auth_data_backup.cluster_auth_type,
            auth_data_backup.cluster_auth_cert_issuer,
            auth_data_backup.primary_cert_subject.value_or(""),
            options.dry_run);

        auto ar_channel_options = ar_options;
        ar_channel_options.repl_credentials = repl_credentials.first;

        // create async channel on all secondaries, update member actions
        target_cluster->execute_in_members(
            {mysqlshdk::gr::Member_state::ONLINE,
             mysqlshdk::gr::Member_state::RECOVERING},
            get_primary_master()->get_connection_options(), {},
            [=](const std::shared_ptr<Instance> &instance,
                const mysqlshdk::gr::Member &) {
              if (target_cluster->get_cluster_server()->get_uuid() !=
                  instance->get_uuid()) {
                async_create_channel(instance.get(), get_primary_master().get(),
                                     k_clusterset_async_channel_name,
                                     ar_options, options.dry_run);
                // NOTE(review): this passes the target cluster's server and
                // not `instance`, and the channel above is created with
                // `ar_options` rather than `ar_channel_options` - confirm
                // both are intended.
                update_replica_settings(
                    target_cluster->get_cluster_server().get(),
                    get_primary_master().get(), false, options.dry_run);
              }
              return true;
            });

        update_replica(target_cluster->get_cluster_server().get(),
                       get_primary_master().get(), ar_channel_options, true,
                       options.dry_run);
      });
    } else {
      // If the target cluster is OFFLINE, check if there are reachable
      // members in order to reset the settings in those instances
      for (const auto &reachable_member : cluster_reachable_members) {
        // Check if super_read_only is enabled. If so it must be
        // disabled to reset the ClusterSet settings. Skipped in dryRun, which
        // must not change any server setting.
        if (!options.dry_run &&
            reachable_member->get_sysvar_bool("super_read_only", false)) {
          reachable_member->set_sysvar("super_read_only", false);
        }

        // Reset the ClusterSet settings and replication channel
        try {
          // NOTE(review): the auto-rejoin handling is not guarded by dryRun;
          // confirm whether ensure_not_auto_rejoining() changes the instance.
          if (cluster_topology_executor_ops::is_member_auto_rejoining(
                  reachable_member))
            cluster_topology_executor_ops::ensure_not_auto_rejoining(
                reachable_member);

          remove_replica(reachable_member.get(), options.dry_run);
        } catch (...) {
          if (options.force.get_safe()) {
            current_console()->print_warning(
                "Could not reset replication settings for " +
                reachable_member->descr() + ": " + format_active_exception());
          } else {
            throw;
          }
        }

        // Disable skip_replica_start
        if (!options.dry_run) {
          reachable_member->set_sysvar(
              "skip_replica_start", false,
              mysqlshdk::mysql::Var_qualifier::PERSIST_ONLY);
        }

        // Revert in case of failure
        undo_tracker.add("", [=]() {
          log_info("Revert: Re-adding Cluster as Replica");
          update_replica(reachable_member.get(), get_primary_master().get(),
                         ar_options, false, options.dry_run);

          // Nothing was persisted in dryRun, so nothing to re-enable
          if (!options.dry_run) {
            log_info("Revert: Enabling skip_replica_start");
            reachable_member->set_sysvar(
                "skip_replica_start", true,
                mysqlshdk::mysql::Var_qualifier::PERSIST_ONLY);
          }
        });
      }

      // Report each unreachable member exactly once, regardless of how many
      // members were reachable
      for (const auto &unreachable_member : cluster_unreachable_members) {
        console->print_warning(
            shcore::str_format("Configuration update of %s skipped: %s",
                               unreachable_member.first.c_str(),
                               unreachable_member.second.c_str()));
      }
    }

    // Set debug trap to test reversion of replica removal
    DBUG_EXECUTE_IF("dba_remove_cluster_fail_post_replica_removal",
                    { throw std::logic_error("debug"); });

    // Dissolve the Cluster
    if (target_cluster->get_cluster_server() &&
        target_cluster->cluster_availability() ==
            Cluster_availability::ONLINE) {
      auto target_cluster_primary = target_cluster->get_cluster_server();

      console->print_info("* Dissolving the Cluster...");

      auto comm_stack = get_communication_stack(*target_cluster_primary);

      bool requires_certificates{false};
      switch (target_cluster->query_cluster_auth_type()) {
        case mysqlsh::dba::Replication_auth_type::CERT_ISSUER:
        case mysqlsh::dba::Replication_auth_type::CERT_SUBJECT:
        case mysqlsh::dba::Replication_auth_type::CERT_ISSUER_PASSWORD:
        case mysqlsh::dba::Replication_auth_type::CERT_SUBJECT_PASSWORD:
          requires_certificates = true;
          break;
        default:
          break;
      }

      // Value handed to join_cluster() when a secondary has to be re-joined
      // during a revert. Computed once so the undo lambdas do not each copy
      // the whole list of reachable members.
      const auto reachable_members_minus_one =
          cluster_reachable_members.size() - 1;

      // First the secondaries
      for (const auto &member : cluster_reachable_members) {
        if (member->get_uuid() == target_cluster_primary->get_uuid()) continue;

        try {
          // Stop Group Replication and reset GR variables
          log_debug("Stopping GR at %s", member->descr().c_str());

          if (!options.dry_run) {
            // Do not reset Group Replication's recovery channel credentials,
            // otherwise, when MySQL communication stack is used it won't be
            // possible to reboot the cluster from complete outage. That would
            // require re-creating the account, but that's a problem if the
            // cluster is a replica cluster since it would create an errant
            // transaction since the cluster is not yet rejoined back to the
            // clusterset and, on the other side, suppressing the binary log is
            // not an option since then the recovery account wouldn't be
            // replicated to the other cluster members
            if (comm_stack == kCommunicationStackMySQL) {
              mysqlsh::dba::leave_cluster(*member, false, false);
            } else {
              mysqlsh::dba::leave_cluster(*member);
            }

            undo_tracker.add("", [=]() {
              Group_replication_options gr_opts;

              std::unique_ptr<mysqlshdk::config::Config> cfg =
                  create_server_config(
                      member.get(),
                      mysqlshdk::config::k_dft_cfg_server_handler);

              mysqlsh::dba::join_cluster(*member, *target_cluster_primary,
                                         gr_opts, requires_certificates,
                                         reachable_members_minus_one,
                                         cfg.get());
            });
          }
        } catch (const std::exception &err) {
          // A secondary failing to leave is reported but does not abort the
          // operation
          console->print_error(shcore::str_format(
              "Instance '%s' failed to leave the cluster: %s",
              member->get_canonical_address().c_str(), err.what()));
        }
      }

      // Reconcile the view change GTIDs generated when the Replica Cluster
      // was created and when members were added to it. Required to ensure
      // those transactions are not detected as errant in further operations
      // on those instances.
      // Do it before completing the dissolve of the Cluster (removal of the
      // Primary) to ensure no events are missing.
      console->print_info("* Reconciling internally generated GTIDs...");
      if (!options.dry_run) {
        reconcile_view_change_gtids(target_cluster_primary.get());
      }

      // Finally the primary
      try {
        log_debug("Stopping GR at %s", target_cluster_primary->descr().c_str());

        if (!options.dry_run) {
          // Do not reset Group Replication's recovery channel credentials
          if (comm_stack == kCommunicationStackMySQL) {
            mysqlsh::dba::leave_cluster(*target_cluster_primary, false, false);
          } else {
            mysqlsh::dba::leave_cluster(*target_cluster_primary);
          }
        }
      } catch (const std::exception &err) {
        // Unlike the secondaries, a failure on the primary aborts (and
        // reverts) the operation
        console->print_error(shcore::str_format(
            "Instance '%s' failed to leave the cluster: %s",
            target_cluster_primary->get_canonical_address().c_str(),
            err.what()));
        throw;
      }
    }

    console->print_info();
    console->print_info("The Cluster '" + cluster_name +
                        "' was removed from the ClusterSet.");
    console->print_info();

    if (options.dry_run) {
      console->print_info("dryRun finished.");
      console->print_info();
    }
  } catch (...) {
    console->print_error("Error removing Replica Cluster: " +
                         format_active_exception());

    console->print_note("Reverting changes...");

    undo_tracker.execute();

    console->print_info();
    console->print_info("Changes successfully reverted.");

    throw;
  }
}