in subprojects/frameworklauncher/yarn/src/main/java/com/microsoft/frameworklauncher/applicationmaster/ApplicationMaster.java [741:825]
/**
 * Decides whether a Task whose Container has exited is retried with a new
 * Container or completed, and then triggers that action.
 *
 * <p>The decision works on a deep copy of the Task's RetryPolicyState. The
 * copy's counter matching the Container's ExitType is always incremented
 * first. The decision is then made in two stages:
 * <ol>
 *   <li>FancyRetryPolicy (only when enabled in the TaskRole's retry policy):
 *       TRANSIENT_NORMAL and TRANSIENT_CONFLICT exits are retried,
 *       NON_TRANSIENT exits are completed, and every other ExitType is
 *       handed over to the NormalRetryPolicy.</li>
 *   <li>NormalRetryPolicy: the Task is retried when MaxRetryCount equals
 *       {@code USING_EXTENDED_UNLIMITED_VALUE} (regardless of ExitType), or
 *       when the exit is not SUCCEEDED and either MaxRetryCount equals
 *       {@code USING_UNLIMITED_VALUE} or the RetriedCount read from the Task
 *       is still below MaxRetryCount. Otherwise the Task is completed.</li>
 * </ol>
 *
 * <p>The general RetriedCount is only incremented on the NormalRetryPolicy
 * retry path. The updated state copy is only passed on when retrying;
 * completing the Task uses {@code taskStatus} alone.
 *
 * @param taskStatus status of the Task whose Container exited
 * @throws Exception declared by this method; presumably propagated from the
 *                   helpers it calls (not visible here)
 */
private void attemptToRetry(TaskStatus taskStatus) throws Exception {
  String roleName = taskStatus.getTaskRoleName();
  TaskStatusLocator locator = new TaskStatusLocator(roleName, taskStatus.getTaskIndex());
  Integer containerExitCode = taskStatus.getContainerExitCode();
  ExitType exitType = taskStatus.getContainerExitType();

  // Snapshot of the general RetriedCount taken BEFORE any increment; this is
  // the value both compared against MaxRetryCount and reported in the logs.
  Integer priorRetriedCount = taskStatus.getTaskRetryPolicyState().getRetriedCount();
  RetryPolicyState updatedState =
      YamlUtils.deepCopy(taskStatus.getTaskRetryPolicyState(), RetryPolicyState.class);

  RetryPolicyDescriptor policy = requestManager.getTaskRetryPolicy(roleName);
  Boolean fancyEnabled = policy.getFancyRetryPolicy();
  Integer maxRetries = policy.getMaxRetryCount();

  String prefix = String.format("%s: attemptToRetry: ", locator);
  LOGGER.logSplittedLines(Level.INFO,
      prefix + "ContainerExitCode: [%s], ContainerExitType: [%s], RetryPolicyState:\n[%s]",
      containerExitCode, exitType, WebCommon.toJson(updatedState));

  String retryPrefix = prefix + "Will retryTask with new Container. Reason: ";
  String completePrefix = prefix + "Will completeTask. Reason: ";
  String fancySuffix = String.format("FancyRetryPolicy: Task exited due to %s.", exitType);
  boolean exitedSuccessfully = exitType == ExitType.SUCCEEDED;

  // Bookkeeping: bump the counter that matches the ExitType, whatever the
  // retry decision turns out to be.
  if (exitType == ExitType.TRANSIENT_NORMAL) {
    updatedState.setTransientNormalRetriedCount(updatedState.getTransientNormalRetriedCount() + 1);
  } else if (exitType == ExitType.TRANSIENT_CONFLICT) {
    updatedState.setTransientConflictRetriedCount(updatedState.getTransientConflictRetriedCount() + 1);
  } else if (exitType == ExitType.NON_TRANSIENT) {
    updatedState.setNonTransientRetriedCount(updatedState.getNonTransientRetriedCount() + 1);
  } else if (exitedSuccessfully) {
    updatedState.setSucceededRetriedCount(updatedState.getSucceededRetriedCount() + 1);
  } else {
    updatedState.setUnKnownRetriedCount(updatedState.getUnKnownRetriedCount() + 1);
  }

  // 1. FancyRetryPolicy
  if (fancyEnabled) {
    boolean transientExit =
        exitType == ExitType.TRANSIENT_NORMAL || exitType == ExitType.TRANSIENT_CONFLICT;
    if (transientExit) {
      LOGGER.logWarning(retryPrefix + fancySuffix);
      retryTask(taskStatus, updatedState);
      return;
    }
    if (exitType == ExitType.NON_TRANSIENT) {
      LOGGER.logWarning(completePrefix + fancySuffix);
      completeTask(taskStatus);
      return;
    }
    // FancyRetryPolicy only treats transient and non-transient failures
    // specially; any other exit is decided by the NormalRetryPolicy below.
    LOGGER.logInfo(prefix +
        "Transfer the RetryDecision to NormalRetryPolicy. Reason: " +
        fancySuffix);
  }

  // 2. NormalRetryPolicy
  boolean retryWanted =
      maxRetries == GlobalConstants.USING_EXTENDED_UNLIMITED_VALUE ||
          (!exitedSuccessfully &&
              (maxRetries == GlobalConstants.USING_UNLIMITED_VALUE ||
                  priorRetriedCount < maxRetries));

  if (retryWanted) {
    updatedState.setRetriedCount(updatedState.getRetriedCount() + 1);
    LOGGER.logWarning(retryPrefix +
            "RetriedCount %s has not reached MaxRetryCount %s.",
        priorRetriedCount, maxRetries);
    retryTask(taskStatus, updatedState);
    return;
  }

  // No retry: only the log line differs between a successful exit and an
  // exhausted retry budget.
  if (exitedSuccessfully) {
    LOGGER.logInfo(completePrefix +
        "Task exited due to %s.", exitType);
  } else {
    LOGGER.logWarning(completePrefix +
            "RetriedCount %s has reached MaxRetryCount %s.",
        priorRetriedCount, maxRetries);
  }
  completeTask(taskStatus);
}