in server/src/main/java/org/elasticsearch/cluster/routing/UnassignedInfo.java [67:526]
public record UnassignedInfo(
Reason reason,
@Nullable String message,
@Nullable Exception failure,
int failedAllocations,
long unassignedTimeNanos,
long unassignedTimeMillis,
boolean delayed,
AllocationStatus lastAllocationStatus,
Set<String> failedNodeIds,
@Nullable String lastAllocatedNodeId
) implements ToXContentFragment, Writeable {
private static final TransportVersion VERSION_UNPROMOTABLE_REPLICA_ADDED = TransportVersions.V_8_7_0;
public static final DateFormatter DATE_TIME_FORMATTER = DateFormatter.forPattern("date_optional_time").withZone(ZoneOffset.UTC);
public static final Setting<TimeValue> INDEX_DELAYED_NODE_LEFT_TIMEOUT_SETTING = Setting.timeSetting(
"index.unassigned.node_left.delayed_timeout",
settings -> EXISTING_SHARDS_ALLOCATOR_SETTING.get(settings).equals("stateless")
? TimeValue.timeValueSeconds(10)
: TimeValue.timeValueMinutes(1),
TimeValue.timeValueMillis(0),
Property.Dynamic,
Property.IndexScope
);
/**
* Reason why the shard is in unassigned state.
* <p>
* Note, ordering of the enum is important, make sure to add new values
* at the end and handle version serialization properly.
*/
public enum Reason {
/**
* Unassigned as a result of an API creation of an index.
*/
INDEX_CREATED,
/**
* Unassigned as a result of a full cluster recovery.
*/
CLUSTER_RECOVERED,
/**
* Unassigned as a result of opening a closed index.
*/
INDEX_REOPENED,
/**
* Unassigned as a result of importing a dangling index.
*/
DANGLING_INDEX_IMPORTED,
/**
* Unassigned as a result of restoring into a new index.
*/
NEW_INDEX_RESTORED,
/**
* Unassigned as a result of restoring into a closed index.
*/
EXISTING_INDEX_RESTORED,
/**
* Unassigned as a result of explicit addition of a replica.
*/
REPLICA_ADDED,
/**
* Unassigned as a result of a failed allocation of the shard.
*/
ALLOCATION_FAILED,
/**
* Unassigned as a result of the node hosting it leaving the cluster.
*/
NODE_LEFT,
/**
* Unassigned as a result of explicit cancel reroute command.
*/
REROUTE_CANCELLED,
/**
* When a shard moves from started back to initializing.
*/
REINITIALIZED,
/**
* A better replica location is identified and causes the existing replica allocation to be cancelled.
*/
REALLOCATED_REPLICA,
/**
* Unassigned as a result of a failed primary while the replica was initializing.
*/
PRIMARY_FAILED,
/**
* Unassigned after forcing an empty primary
*/
FORCED_EMPTY_PRIMARY,
/**
* Forced manually to allocate
*/
MANUAL_ALLOCATION,
/**
* Unassigned as a result of closing an index.
*/
INDEX_CLOSED,
/**
* Similar to NODE_LEFT, but at the time the node left, it had been registered for a restart via the Node Shutdown API. Note that
* there is no verification that it was ready to be restarted, so this may be an intentional restart or a node crash.
*/
NODE_RESTARTING,
/**
* Replica is unpromotable and the primary failed.
*/
UNPROMOTABLE_REPLICA,
/**
* New shard added as part of index re-sharding operation
*/
RESHARD_ADDED
}
/**
* Captures the status of an unsuccessful allocation attempt for the shard,
* causing it to remain in the unassigned state.
*
* Note, ordering of the enum is important, make sure to add new values
* at the end and handle version serialization properly.
*/
public enum AllocationStatus implements Writeable {
/**
* The shard was denied allocation to a node because the allocation deciders all returned a NO decision
*/
DECIDERS_NO((byte) 0),
/**
* The shard was denied allocation to a node because there were no valid shard copies found for it;
* this can happen on node restart with gateway allocation
*/
NO_VALID_SHARD_COPY((byte) 1),
/**
* The allocation attempt was throttled on the shard by the allocation deciders
*/
DECIDERS_THROTTLED((byte) 2),
/**
* Waiting on getting shard data from all nodes before making a decision about where to allocate the shard
*/
FETCHING_SHARD_DATA((byte) 3),
/**
* Allocation decision has been delayed
*/
DELAYED_ALLOCATION((byte) 4),
/**
* No allocation attempt has been made yet
*/
NO_ATTEMPT((byte) 5);
private final byte id;
AllocationStatus(byte id) {
this.id = id;
}
@Override
public void writeTo(StreamOutput out) throws IOException {
out.writeByte(id);
}
public static AllocationStatus readFrom(StreamInput in) throws IOException {
byte id = in.readByte();
return switch (id) {
case 0 -> DECIDERS_NO;
case 1 -> NO_VALID_SHARD_COPY;
case 2 -> DECIDERS_THROTTLED;
case 3 -> FETCHING_SHARD_DATA;
case 4 -> DELAYED_ALLOCATION;
case 5 -> NO_ATTEMPT;
default -> throw new IllegalArgumentException("Unknown AllocationStatus value [" + id + "]");
};
}
public static AllocationStatus fromDecision(Decision.Type decision) {
Objects.requireNonNull(decision);
return switch (decision) {
case NO -> DECIDERS_NO;
case THROTTLE -> DECIDERS_THROTTLED;
default -> throw new IllegalArgumentException("no allocation attempt from decision[" + decision + "]");
};
}
public String value() {
return toString().toLowerCase(Locale.ROOT);
}
}
/**
* creates an UnassignedInfo object based on **current** time
*
* @param reason the cause for making this shard unassigned. See {@link Reason} for more information.
* @param message more information about cause.
**/
public UnassignedInfo(Reason reason, String message) {
this(
reason,
message,
null,
reason == Reason.ALLOCATION_FAILED ? 1 : 0,
System.nanoTime(),
System.currentTimeMillis(),
false,
AllocationStatus.NO_ATTEMPT,
Collections.emptySet(),
null
);
}
/**
* @param reason the cause for making this shard unassigned. See {@link Reason} for more information.
* @param message more information about cause.
* @param failure the shard level failure that caused this shard to be unassigned, if exists.
* @param unassignedTimeNanos the time to use as the base for any delayed re-assignment calculation
* @param unassignedTimeMillis the time of unassignment used to display to in our reporting.
* @param delayed if allocation of this shard is delayed due to INDEX_DELAYED_NODE_LEFT_TIMEOUT_SETTING.
* @param lastAllocationStatus the result of the last allocation attempt for this shard
* @param failedNodeIds a set of nodeIds that failed to complete allocations for this shard
* @param lastAllocatedNodeId the ID of the node this shard was last allocated to
*/
public UnassignedInfo {
Objects.requireNonNull(reason);
Objects.requireNonNull(lastAllocationStatus);
failedNodeIds = Set.copyOf(failedNodeIds);
assert (failedAllocations > 0) == (reason == Reason.ALLOCATION_FAILED)
: "failedAllocations: " + failedAllocations + " for reason " + reason;
assert (message == null && failure != null) == false : "provide a message if a failure exception is provided";
assert (delayed && reason != Reason.NODE_LEFT && reason != Reason.NODE_RESTARTING) == false
: "shard can only be delayed if it is unassigned due to a node leaving";
// The below check should be expanded to require `lastAllocatedNodeId` for `NODE_LEFT` as well, once we no longer have to consider
// BWC with versions prior to `VERSION_LAST_ALLOCATED_NODE_ADDED`.
assert (reason == Reason.NODE_RESTARTING && lastAllocatedNodeId == null) == false
: "last allocated node ID must be set if the shard is unassigned due to a node restarting";
}
public static UnassignedInfo fromStreamInput(StreamInput in) throws IOException {
// Because Reason.NODE_RESTARTING is new and can't be sent by older versions, there's no need to vary the deserialization behavior
var reason = Reason.values()[(int) in.readByte()];
var unassignedTimeMillis = in.readLong();
// As System.nanoTime() cannot be compared across different JVMs, reset it to now.
// This means that in master fail-over situations, elapsed delay time is forgotten.
var unassignedTimeNanos = System.nanoTime();
var delayed = in.readBoolean();
var message = in.readOptionalString();
var failure = in.readException();
var failedAllocations = in.readVInt();
var lastAllocationStatus = AllocationStatus.readFrom(in);
var failedNodeIds = in.readCollectionAsImmutableSet(StreamInput::readString);
String lastAllocatedNodeId;
lastAllocatedNodeId = in.readOptionalString();
return new UnassignedInfo(
reason,
message,
failure,
failedAllocations,
unassignedTimeNanos,
unassignedTimeMillis,
delayed,
lastAllocationStatus,
failedNodeIds,
lastAllocatedNodeId
);
}
public void writeTo(StreamOutput out) throws IOException {
if (reason.equals(Reason.UNPROMOTABLE_REPLICA) && out.getTransportVersion().before(VERSION_UNPROMOTABLE_REPLICA_ADDED)) {
out.writeByte((byte) Reason.PRIMARY_FAILED.ordinal());
} else if (reason.equals(Reason.RESHARD_ADDED)
&& out.getTransportVersion().before(TransportVersions.UNASSIGENEDINFO_RESHARD_ADDED)) {
// We should have protection to ensure we do not reshard in mixed clusters
assert false;
out.writeByte((byte) Reason.FORCED_EMPTY_PRIMARY.ordinal());
} else {
out.writeByte((byte) reason.ordinal());
}
out.writeLong(unassignedTimeMillis);
// Do not serialize unassignedTimeNanos as System.nanoTime() cannot be compared across different JVMs
out.writeBoolean(delayed);
out.writeOptionalString(message);
out.writeException(failure);
out.writeVInt(failedAllocations);
lastAllocationStatus.writeTo(out);
out.writeStringCollection(failedNodeIds);
out.writeOptionalString(lastAllocatedNodeId);
}
/**
* Builds a string representation of the message and the failure if exists.
*/
@Nullable
public String details() {
if (message == null) {
return null;
}
return message + (failure == null ? "" : ", failure " + ExceptionsHelper.stackTrace(failure));
}
/**
* Calculates the delay left based on current time (in nanoseconds) and the delay defined by the index settings.
* Only relevant if shard is effectively delayed (see {@link #delayed()})
* Returns 0 if delay is negative
*
* @return calculated delay in nanoseconds
*/
public long remainingDelay(final long nanoTimeNow, final Settings indexSettings, final NodesShutdownMetadata nodesShutdownMetadata) {
final long indexLevelDelay = INDEX_DELAYED_NODE_LEFT_TIMEOUT_SETTING.get(indexSettings).nanos();
long delayTimeoutNanos = Optional.ofNullable(lastAllocatedNodeId)
// If the node wasn't restarting when this became unassigned, use default delay
.filter(nodeId -> reason.equals(Reason.NODE_RESTARTING))
.map(nodeId -> nodesShutdownMetadata.get(nodeId, SingleNodeShutdownMetadata.Type.RESTART))
.map(SingleNodeShutdownMetadata::getAllocationDelay)
.map(TimeValue::nanos)
.map(knownRestartDelay -> Math.max(indexLevelDelay, knownRestartDelay))
.orElse(indexLevelDelay);
assert nanoTimeNow - unassignedTimeNanos >= 0;
return Math.max(0L, delayTimeoutNanos - (nanoTimeNow - unassignedTimeNanos));
}
/**
* Returns the number of shards that are unassigned and currently being delayed.
*/
public static int getNumberOfDelayedUnassigned(ClusterState state) {
int count = 0;
for (ShardRouting shard : state.getRoutingNodes().unassigned()) {
if (shard.unassignedInfo().delayed()) {
count++;
}
}
return count;
}
/**
* Finds the next (closest) delay expiration of an delayed shard in nanoseconds based on current time.
* Returns 0 if delay is negative.
* Returns -1 if no delayed shard is found.
*/
public static long findNextDelayedAllocation(long currentNanoTime, ClusterState state) {
Metadata metadata = state.metadata();
long nextDelayNanos = Long.MAX_VALUE;
for (ShardRouting shard : state.getRoutingNodes().unassigned()) {
UnassignedInfo unassignedInfo = shard.unassignedInfo();
if (unassignedInfo.delayed()) {
Settings indexSettings = metadata.indexMetadata(shard.index()).getSettings();
// calculate next time to schedule
final long newComputedLeftDelayNanos = unassignedInfo.remainingDelay(
currentNanoTime,
indexSettings,
metadata.nodeShutdowns()
);
if (newComputedLeftDelayNanos < nextDelayNanos) {
nextDelayNanos = newComputedLeftDelayNanos;
}
}
}
return nextDelayNanos == Long.MAX_VALUE ? -1L : nextDelayNanos;
}
public String shortSummary() {
StringBuilder sb = new StringBuilder();
sb.append("[reason=").append(reason).append("]");
sb.append(", at[").append(DATE_TIME_FORMATTER.format(Instant.ofEpochMilli(unassignedTimeMillis))).append("]");
if (failedAllocations > 0) {
sb.append(", failed_attempts[").append(failedAllocations).append("]");
}
if (failedNodeIds.isEmpty() == false) {
sb.append(", failed_nodes[").append(failedNodeIds).append("]");
}
sb.append(", delayed=").append(delayed);
if (lastAllocatedNodeId != null) {
sb.append(", last_node[").append(lastAllocatedNodeId).append("]");
}
String details = details();
if (details != null) {
sb.append(", details[").append(details).append("]");
}
sb.append(", allocation_status[").append(lastAllocationStatus.value()).append("]");
return sb.toString();
}
@Override
public String toString() {
return "unassigned_info[" + shortSummary() + "]";
}
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
builder.startObject("unassigned_info");
builder.field("reason", reason);
builder.field("at", DATE_TIME_FORMATTER.format(Instant.ofEpochMilli(unassignedTimeMillis)));
if (failedAllocations > 0) {
builder.field("failed_attempts", failedAllocations);
}
if (failedNodeIds.isEmpty() == false) {
builder.stringListField("failed_nodes", failedNodeIds);
}
builder.field("delayed", delayed);
if (lastAllocatedNodeId != null) {
builder.field("last_node", lastAllocatedNodeId);
}
String details = details();
if (details != null) {
builder.field("details", details);
}
builder.field("allocation_status", lastAllocationStatus.value());
builder.endObject();
return builder;
}
@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
UnassignedInfo that = (UnassignedInfo) o;
if (unassignedTimeMillis != that.unassignedTimeMillis) {
return false;
}
if (delayed != that.delayed) {
return false;
}
if (failedAllocations != that.failedAllocations) {
return false;
}
if (reason != that.reason) {
return false;
}
if (Objects.equals(message, that.message) == false) {
return false;
}
if (lastAllocationStatus != that.lastAllocationStatus) {
return false;
}
if (Objects.equals(failure, that.failure) == false) {
return false;
}
if (Objects.equals(lastAllocatedNodeId, that.lastAllocatedNodeId) == false) {
return false;
}
return failedNodeIds.equals(that.failedNodeIds);
}
@Override
public int hashCode() {
int result = reason.hashCode();
result = 31 * result + Boolean.hashCode(delayed);
result = 31 * result + Integer.hashCode(failedAllocations);
result = 31 * result + Long.hashCode(unassignedTimeMillis);
result = 31 * result + (message != null ? message.hashCode() : 0);
result = 31 * result + (failure != null ? failure.hashCode() : 0);
result = 31 * result + lastAllocationStatus.hashCode();
result = 31 * result + failedNodeIds.hashCode();
result = 31 * result + (lastAllocatedNodeId != null ? lastAllocatedNodeId.hashCode() : 0);
return result;
}
}