in src/devhostagent.restorationjob/RestorationJobApp.cs [73:195]
/// <summary>
/// Synchronous entry point: blocks on <c>ExecuteInnerAsync</c> via <c>AsyncHelpers.RunSync</c>
/// and returns the resulting exit code converted to <see cref="int"/>.
/// </summary>
/// <param name="args">Command line arguments; not read by this method.</param>
/// <param name="cancellationToken">Token forwarded to the asynchronous implementation.</param>
/// <returns>The exit code produced by the asynchronous implementation, cast to <see cref="int"/>.</returns>
public override int Execute(string[] args, CancellationToken cancellationToken)
{
    var exitCode = AsyncHelpers.RunSync(() => this.ExecuteInnerAsync(cancellationToken));
    return (int)exitCode;
}
/// <summary>
/// Restoration loop. Repeatedly pings the agent, waiting <c>PingInterval</c> between attempts,
/// and restores the patched workload once the agent has reported no connected sessions
/// for longer than <c>RestoreTimeout</c>.
/// </summary>
/// <param name="cancellationToken">
/// Token passed to the delays, the endpoint lookup, the ping and the restore call.
/// It is deliberately not passed to the final job cleanup.
/// </param>
/// <returns>
/// <c>ExitCode.Success</c> after the workload was restored and the restoration job cleanup completed;
/// <c>ExitCode.Fail</c> when the number of consecutive failed pings reaches <c>NumFailedPingsBeforeExit</c>
/// or when any other exception is caught;
/// <c>ExitCode.Cancel</c> when cancellation was requested.
/// </returns>
private async Task<ExitCode> ExecuteInnerAsync(CancellationToken cancellationToken)
{
    try
    {
        // Both values are needed for the cleanup step at the end, so they are validated up front.
        AssertHelper.NotNullOrEmpty(_restorationJobEnvironmentVariables.Namespace, nameof(_restorationJobEnvironmentVariables.Namespace));
        AssertHelper.NotNullOrEmpty(_restorationJobEnvironmentVariables.InstanceLabelValue, nameof(_restorationJobEnvironmentVariables.InstanceLabelValue));

        // Describes the workload that has to be put back into its previous state.
        var patchState = _ParsePatchState();
        _log.Info("Waiting to restore previous state on {0} {1}/{2}...", patchState.KubernetesType.GetStringValue(), new PII(patchState.Namespace), new PII(patchState.Name));

        // One additional interval before the first iteration so things can initialize.
        await Task.Delay(_restorationJobEnvironmentVariables.PingInterval, cancellationToken);

        int consecutiveFailures = 0;
        // Time of the most recent ping that reported at least one connected session.
        DateTimeOffset? lastSeenWithSessions = null;
        // Time of the first session-less ping; only used while no ping has ever reported a session.
        DateTimeOffset? firstSeenWithoutSessions = null;
        bool restored = false;

        while (!(cancellationToken.IsCancellationRequested || restored))
        {
            // Give up once too many pings in a row have failed.
            if (consecutiveFailures >= _restorationJobEnvironmentVariables.NumFailedPingsBeforeExit)
            {
                _log.Error($"Failed to ping agent {consecutiveFailures} times. Exiting...");
                return ExitCode.Fail;
            }

            await Task.Delay(_restorationJobEnvironmentVariables.PingInterval, cancellationToken);
            _log.Verbose("Pinging...");

            var pingProperties = new Dictionary<string, object>()
            {
                { RestorePerformed, false },
                { NumFailedPings, consecutiveFailures },
                { HasConnectedClients, "" }
            };

            using (var pingPerf = _log.StartPerformanceLogger(
                Events.RestorationJob.AreaName,
                Events.RestorationJob.Operations.AgentPing,
                pingProperties))
            {
                // Resolve where the agent can be reached; a missing endpoint counts as a failed ping.
                Uri endpoint = await this._GetAgentEndpointAsync((dynamic)patchState, cancellationToken);
                if (endpoint == null)
                {
                    _log.Verbose("Couldn't get agent endpoint");
                    consecutiveFailures++;
                    continue;
                }

                // A null ping result counts as a failed ping as well.
                var pingResult = await this._PingAgentAsync(endpoint, cancellationToken);
                if (pingResult == null)
                {
                    _log.Verbose("Failed to ping agent");
                    consecutiveFailures++;
                    continue;
                }

                if (pingResult.NumConnectedSessions > 0)
                {
                    _log.Verbose($"Agent has {pingResult.NumConnectedSessions} connected sessions");
                    lastSeenWithSessions = DateTimeOffset.Now;
                    pingPerf.SetProperty(HasConnectedClients, true);
                }
                else
                {
                    pingPerf.SetProperty(HasConnectedClients, false);

                    // Idle time is measured from the last ping that had sessions. If no ping ever had
                    // sessions, it is measured from the first session-less ping, which is remembered
                    // across iterations.
                    DateTimeOffset idleSince;
                    if (lastSeenWithSessions.HasValue)
                    {
                        idleSince = lastSeenWithSessions.Value;
                    }
                    else
                    {
                        firstSeenWithoutSessions = firstSeenWithoutSessions ?? DateTimeOffset.Now;
                        idleSince = firstSeenWithoutSessions.Value;
                    }

                    TimeSpan idleFor = DateTimeOffset.Now - idleSince;
                    if (idleFor > _restorationJobEnvironmentVariables.RestoreTimeout)
                    {
                        // Nobody has been connected for longer than the timeout: put the workload back.
                        _log.Info($"Agent has no connected sessions for {idleFor:g}. Restoring...");
                        await this._RestoreAsync((dynamic)patchState, cancellationToken);
                        _log.Info("Restored {0} {1}/{2}.", patchState.KubernetesType.GetStringValue(), new PII(patchState.Namespace), new PII(patchState.Name));
                        pingPerf.SetProperty(RestorePerformed, true);
                        restored = true;
                    }
                }

                // The ping itself worked, so the failure streak is over.
                consecutiveFailures = 0;
                pingPerf.SetSucceeded();
            }
        }

        if (!restored)
        {
            // The loop can only end without a restore when cancellation was requested.
            return ExitCode.Cancel;
        }

        // Remove the restoration job itself.
        // The caller's token is intentionally not used here, to avoid a race with Kubernetes killing the pod.
        await _remoteRestoreJobCleaner.CleanupRemoteRestoreJobByInstanceLabelAsync(
            _restorationJobEnvironmentVariables.InstanceLabelValue,
            _restorationJobEnvironmentVariables.Namespace,
            CancellationToken.None);
        return ExitCode.Success;
    }
    catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
    {
        _log.Info("Restoration job cancelled");
        return ExitCode.Cancel;
    }
    catch (Exception ex)
    {
        // Short message for the console, full exception for telemetry.
        _log.WithoutTelemetry.Error($"Encountered exception: {ex.Message}");
        _log.WithoutConsole.Exception(ex);
        return ExitCode.Fail;
    }
}