in src/ccf/ccf-provider/CcfNetworkProvider.cs [172:381]
/// <summary>
/// Reconciles an existing CCF network with the requested node count: cleans up nodes already
/// in Retired state, adds or removes nodes to reach <paramref name="nodeCount"/>, replaces
/// nodes that the infra provider reports as needing replacement and, if the node set changed,
/// updates the load balancer with the currently available nodes.
/// </summary>
/// <param name="networkName">Name of the network to update.</param>
/// <param name="nodeCount">Desired number of nodes; must be at least 1.</param>
/// <param name="nodeLogLevel">Log level forwarded to AddNodes for newly created nodes.</param>
/// <param name="policyOption">Security policy configuration forwarded to AddNodes.</param>
/// <param name="providerConfig">Provider configuration forwarded to the node and load balancer
/// providers.</param>
/// <returns>
/// A <see cref="CcfNetwork"/> carrying the network name, infra type, node count, load balancer
/// endpoint and the client RPC addresses of the nodes reported by the node provider.
/// </returns>
/// <exception cref="ArgumentException">
/// Thrown when <paramref name="nodeCount"/> is less than 1.
/// </exception>
public async Task<CcfNetwork> UpdateNetwork(
    string networkName,
    int nodeCount,
    string? nodeLogLevel,
    SecurityPolicyConfiguration policyOption,
    JsonObject? providerConfig)
{
    if (nodeCount < 1)
    {
        throw new ArgumentException("New node count value cannot be less than 1.");
    }

    // Before proceeding further check that signing cert/key that would be required to submit
    // any proposal are configured.
    await this.ccfClientManager.CheckSigningConfig();

    var lbEndpoint =
        await this.lbProvider.GetLoadBalancerEndpoint(networkName, providerConfig);
    var lbFqdn = this.lbProvider.GenerateLoadBalancerFqdn(
        lbEndpoint.Name,
        networkName,
        providerConfig);

    // Pick primary per CCF network as the target node to use for joining/removing nodes from
    // the network.
    (NodeEndpoint primaryNodeEndpoint, string primaryNodeId, string serviceCertPem) =
        await this.GetPrimaryNodeEndpoint(networkName, providerConfig, lbEndpoint);
    this.logger.LogInformation(
        $"Current primary node: " +
        $"{primaryNodeEndpoint.NodeName}, endpoint: {primaryNodeEndpoint.ClientRpcAddress}");
    var primaryClient = this.GetOrAddServiceClient(primaryNodeEndpoint, serviceCertPem);

    // First clean up any nodes already in Retired state so that we don't count them towards
    // node addition/removal as these are stale and would have been left behind from a
    // previous failed/aborted attempt.
    bool retiredAny =
        await this.CleanupRetiredNodes(networkName, providerConfig, primaryClient);

    // Populate node endpoints after cleanup as the above cleanup could have reduced the nodes
    // reported by the infra provider.
    var nodeEndpoints = await this.nodeProvider.GetNodes(networkName, providerConfig);
    bool addedAny = false;
    bool removedAny = false;

    // Next ordinal to use for a new node: one past the numeric suffix (text after the last
    // "-") of the node name that sorts last in natural-number order. Reads nodeEndpoints at
    // call time, so it reflects any refresh done before the call.
    // NOTE(review): throws if nodeEndpoints is empty (Last()) or if the suffix is not an
    // integer (int.Parse).
    int NextNodeOrdinal() =>
        int.Parse(
            nodeEndpoints
                .OrderBy(n => n.NodeName.PadForNaturalNumberOrdering())
                .Last()
                .NodeName.Split("-").Last()) + 1;

    if (nodeCount > nodeEndpoints.Count)
    {
        this.logger.LogInformation("Going to add nodes.");

        // Add nodes.
        // TODO (gsinha): Above check does not handle the situation where nodes got orphaned
        // during the creation process and were never reported on the network but are
        // enumerated by the infra provider via GetNodes. We should not be counting them as
        // added nodes but the ordinal calculation below needs to account for their name to
        // calculate the next highest ordinal.
        // Similarly nodes that need replacement (per GetNodeHealth) should not be counted
        // as added nodes.
        int ordinal = NextNodeOrdinal();
        int numNodesToCreate = nodeCount - nodeEndpoints.Count;
        addedAny = await this.AddNodes(
            networkName,
            providerConfig,
            numNodesToCreate,
            primaryNodeEndpoint,
            lbFqdn.NodeSanFormat(),
            serviceCertPem,
            nodeLogLevel,
            policyOption,
            ordinal);
    }
    else if (nodeCount < nodeEndpoints.Count)
    {
        this.logger.LogInformation("Going to remove nodes.");

        // Remove nodes.
        int numNodesToRemove = nodeEndpoints.Count - nodeCount;
        List<NetworkNode> nodesToRemove = await this.PickNodesToRemove(
            networkName,
            providerConfig,
            primaryClient,
            primaryNodeId,
            numNodesToRemove);
        removedAny =
            await this.RemoveNodes(networkName, providerConfig, nodesToRemove, primaryClient);
    }

    if (addedAny || removedAny)
    {
        // Membership changed above, so re-read the node list before the replacement check.
        nodeEndpoints = await this.nodeProvider.GetNodes(networkName, providerConfig);
    }

    if (nodeCount == nodeEndpoints.Count)
    {
        this.logger.LogInformation("Checking if any nodes need to be replaced.");
        var nodesHealth = await this.nodeProvider.GetNodesHealth(networkName, providerConfig);
        var unhealthyNodes = nodeEndpoints.Where(
            n => nodesHealth.Any(nh => nh.Name == n.NodeName &&
                nh.Status == nameof(NodeStatus.NeedsReplacement)))
            .ToList();
        if (unhealthyNodes.Any())
        {
            var nodes =
                (await primaryClient.GetFromJsonAsync<NetworkNodeList>("/node/network/nodes"))!
                .Nodes;
            this.logger.LogInformation(
                $"Current nodes: {JsonSerializer.Serialize(nodes, Utils.Options)}.");

            // Map the unhealthy infra nodes onto the nodes reported by the CCF network.
            List<NetworkNode> nodesToRemove =
                nodes
                .Where(n => unhealthyNodes.Any(nh => ToId(nh.NodeName) == ToId(n.NodeData.Name)))
                .ToList();
            this.logger.LogInformation(
                $"Need to replace {unhealthyNodes.Count} nodes out of {nodeCount} that are " +
                $"reporting status as needing replacement. Nodes health: " +
                $"{JsonSerializer.Serialize(nodesHealth, Utils.Options)}");

            var unexpectedPrimary = nodesToRemove.Find(n => n.Primary);
            var currentPrimary = nodesToRemove.Find(n => n.NodeId == primaryNodeId);
            if (unexpectedPrimary != null)
            {
                // Primary has shifted and we picked a node that reported itself as primary but
                // the infra provider has marked it as needing replacement. Let the primary
                // stabilize as most likely a new primary will get elected. So do nothing.
                this.logger.LogWarning(
                    $"'{unexpectedPrimary.NodeId}' is reporting itself as Primary but was " +
                    $"the infra provider is indicating that the node be replaced. Let the " +
                    $"primary stabilize as most likely a new primary will get elected. " +
                    $"Try again later.");
            }
            else if (currentPrimary != null)
            {
                // The node that was selected as primary at the start of this call
                // (primaryNodeId) is among the nodes marked as needing replacement, although
                // no node in that set reports itself as primary right now. Let the primary
                // stabilize as most likely a new primary will get elected. So do nothing.
                this.logger.LogWarning(
                    $"'{currentPrimary.NodeId}' was considered primary but " +
                    $"the infra provider is indicating that the node be replaced. Let the " +
                    $"primary stabilize as most likely a new primary will get elected. " +
                    $"Try again later.");
            }
            else
            {
                // We first add a new set of nodes before removing. In failure situations
                // we might have more nodes lying around if removal of the unhealthy nodes
                // fail. This would get cleaned up in the next attempt to update the node
                // count (or once we have a health watcher that periodically reconciles
                // with the desired node count).
                int ordinal = NextNodeOrdinal();
                int numNodesToCreate = unhealthyNodes.Count;
                bool replacementsAdded = await this.AddNodes(
                    networkName,
                    providerConfig,
                    numNodesToCreate,
                    primaryNodeEndpoint,
                    lbFqdn.NodeSanFormat(),
                    serviceCertPem,
                    nodeLogLevel,
                    policyOption,
                    ordinal);

                // Accumulate rather than overwrite: a scale up/down earlier in this call may
                // already have set the flag and that change must still trigger the load
                // balancer update below.
                addedAny = addedAny || replacementsAdded;

                bool unhealthyRemoved = await this.RemoveNodes(
                    networkName,
                    providerConfig,
                    nodesToRemove,
                    primaryClient);
                removedAny = removedAny || unhealthyRemoved;
            }
        }
        else
        {
            this.logger.LogInformation(
                $"Not replacing any nodes as input nodeCount {nodeCount} matches number " +
                $"of healthy nodes reported by the infra provider.");
        }
    }

    if (retiredAny || addedAny || removedAny)
    {
        // The node set changed, so point the load balancer at the nodes that are not marked
        // as needing replacement.
        var nodesHealth = await this.nodeProvider.GetNodesHealth(networkName, providerConfig);
        nodeEndpoints = await this.nodeProvider.GetNodes(networkName, providerConfig);
        var availableNodeEndpoints = nodeEndpoints.Where(
            n => !nodesHealth.Any(nh => nh.Name == n.NodeName &&
                nh.Status == nameof(NodeStatus.NeedsReplacement)));
        List<string> servers = new(availableNodeEndpoints.Select(n => n.ClientRpcAddress));
        this.logger.LogInformation(
            $"Updating LB with servers: {JsonSerializer.Serialize(servers)}.");
        lbEndpoint = await this.lbProvider.UpdateLoadBalancer(
            lbEndpoint.Name,
            networkName,
            servers,
            providerConfig);
        await this.WaitForLoadBalancerReady(lbEndpoint, serviceCertPem);
    }

    this.logger.LogInformation($"CCF endpoint is up at: {lbEndpoint.Endpoint}.");
    return new CcfNetwork
    {
        Name = networkName,
        InfraType = this.nodeProvider.InfraType.ToString(),
        NodeCount = nodeEndpoints.Count,
        Endpoint = lbEndpoint.Endpoint,
        Nodes = nodeEndpoints.ConvertAll(n => n.ClientRpcAddress)
    };
}