private async Task WaitForJoinNodeReady()

in src/ccf/ccf-provider/CcfNetworkProvider.cs [1127:1269]


    /// <summary>
    /// Waits until the joining node's <c>/node/state</c> endpoint reports the desired state.
    /// </summary>
    /// <remarks>
    /// When <c>/node/network</c> on the target node reports a <c>service_status</c> of
    /// <c>Open</c> and the joining node appears on the network with status <c>Pending</c>, a
    /// <c>transition_node_to_trusted</c> proposal is submitted before polling starts. The
    /// joining node is then polled once a second; the 60 second budget is evaluated after
    /// each poll, so at least one poll is always made.
    /// </remarks>
    /// <param name="networkName">Network name handed to the node health check.</param>
    /// <param name="providerConfig">Provider configuration handed to the node health check.</param>
    /// <param name="targetNodeEndpoint">Endpoint used to obtain the service client.</param>
    /// <param name="joinNodeEndpoint">Endpoint of the node that is joining.</param>
    /// <param name="serviceCertPem">Service certificate (PEM) used to build the clients.</param>
    /// <param name="desiredState">State the joining node is expected to report.</param>
    /// <exception cref="TimeoutException">
    /// The joining node did not report the desired state within the polling budget.
    /// </exception>
    private async Task WaitForJoinNodeReady(
        string networkName,
        JsonObject? providerConfig,
        NodeEndpoint targetNodeEndpoint,
        NodeEndpoint joinNodeEndpoint,
        string serviceCertPem,
        DesiredJoinNodeState desiredState)
    {
        var ccfClient = this.GetOrAddServiceClient(targetNodeEndpoint, serviceCertPem);

        // TODO (gsinha): Add retries around GetFromJsonAsync transient failure.
        var networkInfo = (await ccfClient.GetFromJsonAsync<JsonObject>("/node/network"))!;
        var serviceStatus = networkInfo["service_status"]!.ToString();

        // A node that joins while the network is already open cannot finish joining until it
        // has been transitioned to trusted, so take care of that first.
        if (string.Equals(serviceStatus, "Open", StringComparison.Ordinal))
        {
            JsonObject joinNodeInfo = await this.WaitForNodeToAppearOnNetwork(
                ccfClient,
                joinNodeEndpoint.NodeName,
                onRetry: () => this.CheckNodeHealthy(
                    networkName,
                    joinNodeEndpoint.NodeName,
                    providerConfig));

            var joinNodeStatus = joinNodeInfo["status"]!.ToString();
            if (string.Equals(joinNodeStatus, "Pending", StringComparison.Ordinal))
            {
#pragma warning disable MEN002 // Line is too long
                // Node to node communication between the new node and the primary can take a
                // while to get established because of DNS resolve/caching issues. It surfaces
                // as the create proposal transaction taking a long time to commit, hence the
                // generous timeout below. Sample primary log from such an occurrence:
                //# Node 1 added to Raft config
                // 2024-11-07T08:17:28.355534Z -0.017 0   [info ] ../src/node/channels.h:828           | Initiating node channel with n[d267d70732038c31038eabfc093b63745520560ec0969160595673ad95b05701].
                // 2024-11-07T08:17:28.355852Z        100 [debug] ../src/host/node_connections.h:458   | Added node connection with n[d267d70732038c31038eabfc093b63745520560ec0969160595673ad95b05701] (foo-1.westeurope.azurecontainer.io:8081)
                // 2024-11-07T08:17:28.355863Z        100 [debug] ../src/host/node_connections.h:434   | node send to n[d267d70732038c31038eabfc093b63745520560ec0969160595673ad95b05701] [1208]
                // 2024-11-07T08:17:28.355868Z -0.018 0   [info ] ../src/consensus/aft/raft.h:2567     | Added raft node n[d267d70732038c31038eabfc093b63745520560ec0969160595673ad95b05701] (foo-1.westeurope.azurecontainer.io:8081)
                //# Still unable to connect to Node 1
                // 2024-11-07T08:17:30.358161Z -0.004 0   [info ] ../src/node/channels.h:828           | Initiating node channel with n[d267d70732038c31038eabfc093b63745520560ec0969160595673ad95b05701].
                // 2024-11-07T08:17:30.358490Z        100 [debug] ../src/host/node_connections.h:434   | node send to n[d267d70732038c31038eabfc093b63745520560ec0969160595673ad95b05701] [1208]
                // 2024-11-07T08:17:30.382290Z        100 [debug] ../src/host/tcp.h:699                | uv_tcp_connect async retry: connection timed out
                // 2024-11-07T08:17:30.382401Z        100 [info ] ../src/host/tcp.h:536                | Unable to connect: all resolved addresses failed: foo-1.westeurope.azurecontainer.io:8081
                // 2024-11-07T08:17:30.382412Z        100 [debug] ../src/host/node_connections.h:227   | Disconnecting outgoing connection with n[d267d70732038c31038eabfc093b63745520560ec0969160595673ad95b05701]: connect failed
                // 2024-11-07T08:17:30.382454Z        100 [debug] ../src/host/node_connections.h:472   | Removed node connection with n[d267d70732038c31038eabfc093b63745520560ec0969160595673ad95b05701]
                //...
                //# Eventually succeed in connecting to Node 1
                // 2024-11-07T08:18:32.142617Z -0.004 0   [info ] ../src/node/channels.h:828           | Initiating node channel with n[d267d70732038c31038eabfc093b63745520560ec0969160595673ad95b05701].
                // 2024-11-07T08:18:32.146380Z        100 [debug] ../src/host/node_connections.h:458   | Added node connection with n[d267d70732038c31038eabfc093b63745520560ec0969160595673ad95b05701] (foo-1.westeurope.azurecontainer.io:8081)
                // 2024-11-07T08:18:32.146430Z        100 [debug] ../src/host/node_connections.h:434   | node send to n[d267d70732038c31038eabfc093b63745520560ec0969160595673ad95b05701] [1208]
                // 2024-11-07T08:18:32.150568Z        100 [info ] ../src/host/socket.h:53              | TCP Node Outgoing connected
                //# Commit advances
                // 2024-11-07T08:18:32.926155Z        100 [debug] ../src/host/ledger.h:1435            | Ledger commit: 133/133
#pragma warning restore MEN002 // Line is too long
                var trustTimeout = TimeSpan.FromSeconds(180);
                var pendingNodeId = joinNodeInfo["node_id"]!.ToString();
                await TransitionNodeToTrusted(ccfClient, pendingNodeId, trustTimeout);
            }
        }

        // The health check runs on every retry: if the join node failed to start, its https
        // endpoint will never answer and retrying further would be pointless.
        var nodeSelfSignedCertPem = await this.GetNodeSelfSignedCert(
            joinNodeEndpoint,
            onRetry: () => this.CheckNodeHealthy(
                networkName,
                joinNodeEndpoint.NodeName,
                providerConfig));
        var joinNodeClient = this.GetOrAddNodeClient(
            joinNodeEndpoint,
            serviceCertPem,
            nodeSelfSignedCertPem);

        var pollBudget = TimeSpan.FromSeconds(60);
        var pollTimer = Stopwatch.StartNew();
        var joinNodeName = joinNodeEndpoint.NodeName;
        var expectedState = desiredState.ToString();
        while (true)
        {
            // The response stays alive for the whole iteration and is disposed when the
            // iteration scope is left (next iteration, return or throw).
            using var stateResponse = await joinNodeClient.GetAsync("/node/state");
            if (!stateResponse.IsSuccessStatusCode)
            {
                this.logger.LogInformation(
                    $"{WaitingPrefix()} Current statusCode: {stateResponse.StatusCode}.");
            }
            else
            {
                var statePayload =
                    (await stateResponse.Content.ReadFromJsonAsync<JsonObject>())!;
                var reportedState = statePayload["state"]!.ToString();
                if (string.Equals(reportedState, expectedState, StringComparison.Ordinal))
                {
                    this.logger.LogInformation(
                        $"{joinNodeName}: {StateUrl()} is reporting {expectedState}.");
                    return;
                }

                this.logger.LogInformation(
                    $"{WaitingPrefix()} Current state: {reportedState}");
            }

            // The budget is checked only after a poll, so at least one attempt always happens.
            if (pollTimer.Elapsed > pollBudget)
            {
                throw new TimeoutException(
                    $"{joinNodeName}: Hit timeout waiting for join node " +
                    $"{joinNodeEndpoint.ClientRpcAddress} to become {expectedState}");
            }

            await Task.Delay(TimeSpan.FromSeconds(1));
        }

        // Address of the state endpoint on the joining node, as shown in log messages.
        string StateUrl() => $"{joinNodeEndpoint.ClientRpcAddress}/node/state";

        // Common leading part of the "still waiting" log messages.
        string WaitingPrefix() =>
            $"{joinNodeName}: Waiting for {StateUrl()} to report {expectedState}.";

        // Submits a transition_node_to_trusted proposal for the given node and logs the result.
        async Task TransitionNodeToTrusted(
            HttpClient proposalClient,
            string trustedNodeId,
            TimeSpan? proposalTimeout = null)
        {
            this.logger.LogInformation(
                $"Submitting transition_node_to_trusted proposal for {trustedNodeId}.");

            // Assemble the proposal from the innermost object outwards.
            var actionArgs = new JsonObject
            {
                ["node_id"] = trustedNodeId,
                ["valid_from"] = DateTime.UtcNow.ToString("O")
            };
            var trustAction = new JsonObject
            {
                ["name"] = "transition_node_to_trusted",
                ["args"] = actionArgs
            };
            var proposal = new JsonObject
            {
                ["actions"] = new JsonArray { trustAction }
            };

            var proposalResult =
                await this.CreateProposal(proposalClient, proposal, proposalTimeout);
            this.logger.LogInformation(
                JsonSerializer.Serialize(proposalResult, Utils.Options));
        }
    }