in genie-web/src/main/java/com/netflix/genie/web/services/impl/JobResolverServiceImpl.java [452:570]
/**
 * Select the cluster a job will run on and store it in the resolution context.
 * <p>
 * The command must already be resolved in {@code context}, along with the mapping from commands to their
 * candidate clusters. Each configured {@link ClusterSelector} is consulted in iteration order: the first one
 * that returns a cluster wins, while a selector that reports no selection or throws an exception is skipped in
 * favor of the next one. A counter is incremented once per selector consulted and the overall attempt is timed,
 * whatever the outcome.
 *
 * @param context The resolution context of the job; on success the selected cluster is set on it
 * @throws GenieJobResolutionException        When all selectors were consulted and none of them picked a cluster
 * @throws GenieJobResolutionRuntimeException When anything else is thrown, e.g. the command or its candidate
 *                                            clusters are missing from the context
 */
private void resolveCluster(final JobResolutionContext context) throws GenieJobResolutionException {
    final long startNanos = System.nanoTime();
    final Set<Tag> timerTags = new HashSet<>();
    final String jobId = context.getJobId();

    try {
        // Preconditions: earlier resolution stages must have produced the command and its candidate clusters
        final Command command = context.getCommand().orElseThrow(
            () -> new IllegalStateException(
                "Command not resolved before attempting to resolve a cluster for job " + jobId
            )
        );
        final Set<Cluster> candidates = context
            .getCommandClusters()
            .orElseThrow(
                () -> new IllegalStateException("Command to candidate cluster map not available for job " + jobId)
            )
            .get(command);
        final boolean hasCandidates = candidates != null && !candidates.isEmpty();
        if (!hasCandidates) {
            throw new IllegalStateException(
                "Command " + command.getId() + " had no candidate clusters for job " + jobId
            );
        }

        Cluster resolved = null;
        for (final ClusterSelector selector : this.clusterSelectors) {
            final String selectorName = this.getProxyObjectClassName(selector);
            // Each selector gets its own tag set, seeded with whatever the timer tags hold so far.
            final Set<Tag> counterTags = new HashSet<>(timerTags);
            // The class name tag goes in up front: if the selector blows up below, the counter published in
            // the finally block must still say which selector it was.
            counterTags.add(Tag.of(MetricsConstants.TagKeys.CLASS_NAME, selectorName));

            try {
                final ClusterSelectionContext selectionContext = new ClusterSelectionContext(
                    jobId,
                    context.getJobRequest(),
                    context.isApiJob(),
                    command,
                    candidates
                );
                final ResourceSelectionResult<Cluster> outcome = selector.select(selectionContext);
                final Optional<Cluster> choice = outcome.getSelectedResource();

                if (!choice.isPresent()) {
                    // This selector has no opinion: tag it as such and let the next one have a go
                    counterTags.add(
                        Tag.of(MetricsConstants.TagKeys.STATUS, CLUSTER_SELECTOR_STATUS_NO_PREFERENCE)
                    );
                    counterTags.add(NO_CLUSTER_RESOLVED_ID);
                    counterTags.add(NO_CLUSTER_RESOLVED_NAME);
                    LOG.debug(
                        "Selector {} returned no preference with rationale: {}",
                        selectorName,
                        outcome.getSelectionRationale().orElse(NO_RATIONALE)
                    );
                    continue;
                }

                // First selector to return a cluster wins; no further selectors are consulted
                resolved = choice.get();
                LOG.debug(
                    "Successfully selected cluster {} using selector {} for job {} with rationale: {}",
                    resolved.getId(),
                    selectorName,
                    jobId,
                    outcome.getSelectionRationale().orElse(NO_RATIONALE)
                );
                counterTags.add(Tag.of(MetricsConstants.TagKeys.STATUS, CLUSTER_SELECTOR_STATUS_SUCCESS));
                counterTags.add(Tag.of(MetricsConstants.TagKeys.CLUSTER_ID, resolved.getId()));
                counterTags.add(
                    Tag.of(MetricsConstants.TagKeys.CLUSTER_NAME, resolved.getMetadata().getName())
                );
                break;
            } catch (final Exception e) {
                // Deliberately not rethrown. A failing selector should not prevent the job from getting a
                // cluster: record the failure and fall through to the next selector ("best-service").
                MetricsUtils.addFailureTagsWithException(counterTags, e);
                LOG.warn(
                    "Cluster selector {} evaluation threw exception for job {}",
                    selectorName,
                    jobId,
                    e
                );
            } finally {
                // Published once per consulted selector, regardless of how its evaluation ended
                this.registry.counter(CLUSTER_SELECTOR_COUNTER, counterTags).increment();
            }
        }

        if (resolved == null) {
            throw new GenieJobResolutionException("No cluster resolved for job " + jobId);
        }

        LOG.debug("Resolved cluster {} for job {}", resolved.getId(), jobId);
        context.setCluster(resolved);

        // Describe the winner in both the timer tags and the tracing span
        MetricsUtils.addSuccessTags(timerTags);
        final String resolvedId = resolved.getId();
        final String resolvedName = resolved.getMetadata().getName();
        timerTags.add(Tag.of(MetricsConstants.TagKeys.CLUSTER_ID, resolvedId));
        timerTags.add(Tag.of(MetricsConstants.TagKeys.CLUSTER_NAME, resolvedName));
        final SpanCustomizer span = context.getSpanCustomizer();
        this.tagAdapter.tag(span, TracingConstants.JOB_CLUSTER_ID_TAG, resolvedId);
        this.tagAdapter.tag(span, TracingConstants.JOB_CLUSTER_NAME_TAG, resolvedName);
    } catch (final GenieJobResolutionException e) {
        // Resolution failures are rethrown as-is, with the timer tagged as having resolved no cluster
        timerTags.add(NO_CLUSTER_RESOLVED_ID);
        timerTags.add(NO_CLUSTER_RESOLVED_NAME);
        MetricsUtils.addFailureTagsWithException(timerTags, e);
        throw e;
    } catch (final Throwable t) {
        // Everything else is wrapped in the unchecked resolution exception
        MetricsUtils.addFailureTagsWithException(timerTags, t);
        throw new GenieJobResolutionRuntimeException(t);
    } finally {
        this.registry
            .timer(RESOLVE_CLUSTER_TIMER, timerTags)
            .record(System.nanoTime() - startNanos, TimeUnit.NANOSECONDS);
    }
}