public async Task UpdateNetwork()

in src/ccf/ccf-provider/CcfNetworkProvider.cs [172:381]


    /// <summary>
    /// Reconciles the nodes of an existing CCF network with the requested node count: cleans up
    /// nodes already in Retired state, adds or removes nodes to reach <paramref name="nodeCount"/>,
    /// replaces nodes that the infra provider reports as needing replacement and, if anything
    /// changed, updates the load balancer to point at the nodes not needing replacement.
    /// </summary>
    /// <param name="networkName">Name of the network to update.</param>
    /// <param name="nodeCount">Desired number of nodes. Must be at least 1.</param>
    /// <param name="nodeLogLevel">Log level value that is passed along when adding nodes.</param>
    /// <param name="policyOption">Security policy configuration that is passed along when adding
    /// nodes.</param>
    /// <param name="providerConfig">Provider specific configuration that is passed to the node
    /// and load balancer providers.</param>
    /// <returns>The network name, infra type, load balancer endpoint and the node endpoints as
    /// reported by the infra provider at the end of the update.</returns>
    /// <exception cref="ArgumentException"><paramref name="nodeCount"/> is less than 1.
    /// </exception>
    public async Task<CcfNetwork> UpdateNetwork(
        string networkName,
        int nodeCount,
        string? nodeLogLevel,
        SecurityPolicyConfiguration policyOption,
        JsonObject? providerConfig)
    {
        if (nodeCount < 1)
        {
            throw new ArgumentException("New node count value cannot be less than 1.");
        }

        // Before proceeding further check that signing cert/key that would be required to submit
        // any proposal are configured.
        await this.ccfClientManager.CheckSigningConfig();

        var lbEndpoint =
            await this.lbProvider.GetLoadBalancerEndpoint(networkName, providerConfig);
        var lbFqdn = this.lbProvider.GenerateLoadBalancerFqdn(
            lbEndpoint.Name,
            networkName,
            providerConfig);

        // Pick primary per CCF network as the target node to use for joining/removing nodes from
        // the network.
        (NodeEndpoint primaryNodeEndpoint, string primaryNodeId, string serviceCertPem) =
            await this.GetPrimaryNodeEndpoint(networkName, providerConfig, lbEndpoint);

        this.logger.LogInformation(
            $"Current primary node: " +
            $"{primaryNodeEndpoint.NodeName}, endpoint: {primaryNodeEndpoint.ClientRpcAddress}");

        var primaryClient = this.GetOrAddServiceClient(primaryNodeEndpoint, serviceCertPem);

        // First clean up any nodes already in Retired state so that we don't count them towards
        // node addition/removal as these are stale and would have been left behind from a
        // previous failed/aborted attempt.
        bool retiredAny =
            await this.CleanupRetiredNodes(networkName, providerConfig, primaryClient);

        // Populate node endpoints after cleanup as the above cleanup could have reduced the nodes
        // reported by the infra provider.
        var nodeEndpoints = await this.nodeProvider.GetNodes(networkName, providerConfig);
        bool addedAny = false;
        bool removedAny = false;

        // Ordinal to use for the next node to create: one more than the numeric suffix (the text
        // after the last '-') of the node name that sorts last under natural number ordering.
        // Reads the current value of nodeEndpoints at the time of the call.
        // NOTE(review): Last() throws InvalidOperationException if nodeEndpoints is empty;
        // presumably the infra provider always reports at least the primary node here - confirm.
        int NextNodeOrdinal() => int.Parse(
            nodeEndpoints
                .OrderBy(n => n.NodeName.PadForNaturalNumberOrdering())
                .Last()
                .NodeName.Split("-").Last()) + 1;

        if (nodeCount > nodeEndpoints.Count)
        {
            this.logger.LogInformation("Going to add nodes.");

            // Add nodes.
            // TODO (gsinha): Above check does not handle the situation where nodes got orphaned
            // during the creation process and were never reported on the network but are
            // enumerated by the infra provider via GetNodes. We should not be counting them as
            // added nodes but the ordinal calculation below needs to account for their name to
            // calculate the next highest ordinal.
            // Similarly nodes that need replacement (per GetNodeHealth) should not be counted
            // as added nodes.
            int ordinal = NextNodeOrdinal();
            int numNodesToCreate = nodeCount - nodeEndpoints.Count;
            addedAny = await this.AddNodes(
                networkName,
                providerConfig,
                numNodesToCreate,
                primaryNodeEndpoint,
                lbFqdn.NodeSanFormat(),
                serviceCertPem,
                nodeLogLevel,
                policyOption,
                ordinal);
        }
        else if (nodeCount < nodeEndpoints.Count)
        {
            this.logger.LogInformation("Going to remove nodes.");

            // Remove nodes.
            int numNodesToRemove = nodeEndpoints.Count - nodeCount;
            List<NetworkNode> nodesToRemove = await this.PickNodesToRemove(
                networkName,
                providerConfig,
                primaryClient,
                primaryNodeId,
                numNodesToRemove);
            removedAny =
                await this.RemoveNodes(networkName, providerConfig, nodesToRemove, primaryClient);
        }

        if (addedAny || removedAny)
        {
            // Node set changed above so re-read it before deciding on replacements.
            nodeEndpoints = await this.nodeProvider.GetNodes(networkName, providerConfig);
        }

        if (nodeCount == nodeEndpoints.Count)
        {
            this.logger.LogInformation("Checking if any nodes need to be replaced.");

            var nodesHealth = await this.nodeProvider.GetNodesHealth(networkName, providerConfig);
            var unhealthyNodes = nodeEndpoints.Where(
                n => nodesHealth.Any(nh => nh.Name == n.NodeName &&
                    nh.Status == nameof(NodeStatus.NeedsReplacement)))
                .ToList();
            if (unhealthyNodes.Any())
            {
                var nodes =
                    (await primaryClient.GetFromJsonAsync<NetworkNodeList>("/node/network/nodes"))!
                    .Nodes;
                this.logger.LogInformation(
                    $"Current nodes: {JsonSerializer.Serialize(nodes, Utils.Options)}.");

                // Map the unhealthy infra nodes to the nodes reported by the network.
                List<NetworkNode> nodesToRemove =
                    nodes
                    .Where(n => unhealthyNodes.Any(
                        nh => ToId(nh.NodeName) == ToId(n.NodeData.Name)))
                    .ToList();

                this.logger.LogInformation(
                    $"Need to replace {unhealthyNodes.Count} nodes out of {nodeCount} that are " +
                    $"reporting status as needing replacement. Nodes health: " +
                    $"{JsonSerializer.Serialize(nodesHealth, Utils.Options)}");

                var unexpectedPrimary = nodesToRemove.Find(n => n.Primary);
                var currentPrimary = nodesToRemove.Find(n => n.NodeId == primaryNodeId);
                if (unexpectedPrimary != null)
                {
                    // Primary has shifted and we picked a node that reported itself as primary but
                    // the infra provider has marked it as needing replacement. Let the primary
                    // stabilize as most likely a new primary will get elected. So do nothing.
                    this.logger.LogWarning(
                        $"'{unexpectedPrimary.NodeId}' is reporting itself as Primary but was " +
                        $"the infra provider is indicating that the node be replaced. Let the " +
                        $"primary stabilize as most likely a new primary will get elected. " +
                        $"Try again later.");
                }
                else if (currentPrimary != null)
                {
                    // The node that was picked as primary at the start of this update is marked
                    // by the infra provider as needing replacement, and no node in the removal
                    // set reports itself as primary in the listing above. Let the primary
                    // stabilize as most likely a new primary will get elected. So do nothing.
                    this.logger.LogWarning(
                        $"'{currentPrimary.NodeId}' was considered primary but " +
                        $"the infra provider is indicating that the node be replaced. Let the " +
                        $"primary stabilize as most likely a new primary will get elected. " +
                        $"Try again later.");
                }
                else
                {
                    // We first add a new set of nodes before removing. In failure situations
                    // we might have more nodes lying around if removal of the unhealthy nodes
                    // fail. This would get cleaned up in the next attempt to update the node
                    // count (or once we have a health watcher that periodically reconciles
                    // with the desired node count).
                    int ordinal = NextNodeOrdinal();
                    int numNodesToCreate = unhealthyNodes.Count;
                    bool replacementsAdded = await this.AddNodes(
                        networkName,
                        providerConfig,
                        numNodesToCreate,
                        primaryNodeEndpoint,
                        lbFqdn.NodeSanFormat(),
                        serviceCertPem,
                        nodeLogLevel,
                        policyOption,
                        ordinal);

                    // Accumulate rather than overwrite: nodes may already have been added or
                    // removed by the scale up/down step above and that must still trigger the
                    // load balancer update below.
                    addedAny = addedAny || replacementsAdded;

                    bool unhealthyRemoved = await this.RemoveNodes(
                        networkName,
                        providerConfig,
                        nodesToRemove,
                        primaryClient);
                    removedAny = removedAny || unhealthyRemoved;
                }
            }
            else
            {
                this.logger.LogInformation(
                    $"Not replacing any nodes as input nodeCount {nodeCount} matches number " +
                    $"of healthy nodes reported by the infra provider.");
            }
        }

        if (retiredAny || addedAny || removedAny)
        {
            // Membership changed: point the load balancer only at nodes that the infra provider
            // is not reporting as needing replacement.
            var nodesHealth = await this.nodeProvider.GetNodesHealth(networkName, providerConfig);
            nodeEndpoints = await this.nodeProvider.GetNodes(networkName, providerConfig);
            var availableNodeEndpoints = nodeEndpoints.Where(
                n => !nodesHealth.Any(nh => nh.Name == n.NodeName &&
                nh.Status == nameof(NodeStatus.NeedsReplacement)));
            List<string> servers = new(availableNodeEndpoints.Select(n => n.ClientRpcAddress));
            this.logger.LogInformation(
                $"Updating LB with servers: {JsonSerializer.Serialize(servers)}.");
            lbEndpoint = await this.lbProvider.UpdateLoadBalancer(
                lbEndpoint.Name,
                networkName,
                servers,
                providerConfig);

            await this.WaitForLoadBalancerReady(lbEndpoint, serviceCertPem);
        }

        this.logger.LogInformation($"CCF endpoint is up at: {lbEndpoint.Endpoint}.");
        return new CcfNetwork
        {
            Name = networkName,
            InfraType = this.nodeProvider.InfraType.ToString(),
            NodeCount = nodeEndpoints.Count,
            Endpoint = lbEndpoint.Endpoint,
            Nodes = nodeEndpoints.ConvertAll(n => n.ClientRpcAddress)
        };
    }