in genie-web/src/main/java/com/netflix/genie/web/services/impl/JobLaunchServiceImpl.java [114:223]
/**
 * Save, resolve, accept and launch a job, recording a span annotation for each completed step and a timer
 * (tagged with success or failure information) for the whole call.
 *
 * <p>When resolution or the agent launch fails, the job's archive status is set to
 * {@link ArchiveStatus#NO_FILES} and the job is moved to {@link JobStatus#FAILED}. That cleanup is best-effort:
 * an error raised while cleaning up is logged and attached to the original exception as a suppressed
 * exception, so the caller always receives the error that actually caused the failure.
 *
 * @param jobSubmission The job submission to save and launch
 * @return The id returned by the persistence service when the submission was saved
 * @throws AgentLaunchException        If the job could not be moved to {@link JobStatus#ACCEPTED} or the
 *                                     selected launcher failed to launch the agent
 * @throws GenieJobResolutionException Propagated from job resolution
 * @throws IdAlreadyExistsException    Propagated unchanged from the underlying services
 * @throws NotFoundException           Propagated unchanged from the underlying services
 */
public String launchJob(
    @Valid final JobSubmission jobSubmission
) throws
    AgentLaunchException,
    GenieJobResolutionException,
    IdAlreadyExistsException,
    NotFoundException {
    final long start = System.nanoTime();
    final SpanCustomizer span = this.tracer.currentSpanCustomizer();
    span.annotate(BEGIN_LAUNCH_JOB_ANNOTATION);
    final Set<Tag> tags = Sets.newHashSet();
    try {
        /*
         * Steps:
         *
         * 1. Save the job information
         * 2. Attempt to resolve the job information (includes saving)
         * 3. Mark the job as accepted
         * 4. Launch the agent process given the implementation configured for this Genie instance
         * 5. If the agent launch fails mark the job failed else return
         */
        final String jobId = this.persistenceService.saveJobSubmission(jobSubmission);
        span.annotate(SAVED_JOB_SUBMISSION_ANNOTATION);

        final ResolvedJob resolvedJob;
        try {
            resolvedJob = this.jobResolverService.resolveJob(jobId);
        } catch (final Throwable t) {
            final String message = t instanceof GenieJobResolutionException
                ? JobStatusMessages.FAILED_TO_RESOLVE_JOB
                : JobStatusMessages.RESOLUTION_RUNTIME_ERROR;
            MetricsUtils.addFailureTagsWithException(tags, t);
            // Cleanup must never replace the resolution error that is about to be rethrown
            this.markJobFailedBestEffort(jobId, JobStatus.RESERVED, message, t);
            throw t; // Caught below for metrics gathering
        }
        span.annotate(RESOLVED_JOB_ANNOTATION);

        // Job state should be RESOLVED now. Mark it ACCEPTED to avoid race condition with agent starting up
        // before we get return from launchAgent and trying to set it to CLAIMED
        try {
            final JobStatus updatedStatus = this.updateJobStatus(
                jobId,
                JobStatus.RESOLVED,
                JobStatus.ACCEPTED,
                ACCEPTED_MESSAGE,
                INITIAL_ATTEMPT
            );
            if (updatedStatus != JobStatus.ACCEPTED) {
                throw new AgentLaunchException("Unable to mark job accepted. Job state " + updatedStatus);
            }
        } catch (final Exception e) {
            this.markNoArchiveBestEffort(jobId, e);
            // TODO: Failed to update the status to accepted. Try to set it to failed or rely on other cleanup
            //       mechanism? For now rely on janitor mechanisms
            throw e;
        }
        span.annotate(MARKED_JOB_ACCEPTED_ANNOTATION);

        // TODO: at the moment this is not populated, it's going to be a null node (not null)
        final JsonNode requestedLauncherExt = this.persistenceService.getRequestedLauncherExt(jobId);
        final Optional<JsonNode> launcherExt;
        try {
            final AgentLauncher launcher = this.selectLauncher(jobId, jobSubmission, resolvedJob);
            tags.add(Tag.of(LAUNCHER_CLASS_TAG, launcher.getClass().getCanonicalName()));
            launcherExt = launcher.launchAgent(resolvedJob, requestedLauncherExt);
        } catch (final AgentLaunchException e) {
            // Same best-effort cleanup (and same FAILED-transition check) as the resolution failure path
            this.markJobFailedBestEffort(jobId, JobStatus.ACCEPTED, e.getMessage(), e);
            // TODO: How will we get the ID back to the user? Should we add it to an exception? We don't get
            //       We don't get the ID until after saveJobSubmission so if that fails we'd still return nothing
            //       Probably need multiple exceptions to be thrown from this API (if we go with checked)
            throw e;
        }
        span.annotate(LAUNCHED_AGENT_ANNOTATION);

        if (launcherExt.isPresent()) {
            try {
                this.persistenceService.updateLauncherExt(jobId, launcherExt.get());
            } catch (final Exception e) {
                // Being unable to update the launcher ext is not optimal however
                // it's not worth returning an error to the user at this point as
                // the agent has launched and we have all the other pieces in place
                log.error("Unable to update the launcher ext for job {}", jobId, e);
            }
        }
        span.annotate(SAVED_LAUNCHER_EXT_ANNOTATION);

        MetricsUtils.addSuccessTags(tags);
        return jobId;
    } catch (final Throwable t) {
        MetricsUtils.addFailureTagsWithException(tags, t);
        throw t;
    } finally {
        span.annotate(END_LAUNCH_JOB_ANNOTATION);
        this.registry
            .timer(LAUNCH_JOB_TIMER, tags)
            .record(System.nanoTime() - start, TimeUnit.NANOSECONDS);
    }
}

/**
 * Set the archive status of a job that failed during launch to {@link ArchiveStatus#NO_FILES} without
 * letting a failure of that update replace the error that is already propagating.
 *
 * @param jobId The id of the job to update
 * @param cause The exception being propagated to the caller; any cleanup error is added to it as suppressed
 */
private void markNoArchiveBestEffort(final String jobId, final Throwable cause) {
    try {
        this.persistenceService.updateJobArchiveStatus(jobId, ArchiveStatus.NO_FILES);
    } catch (final Exception e) {
        log.error("Unable to update the archive status for job {}", jobId, e);
        // Throwable#addSuppressed rejects self-suppression, so guard against the same instance
        if (e != cause) {
            cause.addSuppressed(e);
        }
    }
}

/**
 * Best-effort cleanup for a job whose launch failed: set its archive status to
 * {@link ArchiveStatus#NO_FILES} and attempt to move it from the expected status to {@link JobStatus#FAILED}.
 * Both steps are always attempted; neither can replace the error that is already propagating.
 *
 * @param jobId          The id of the job to update
 * @param expectedStatus The status the job is expected to currently be in
 * @param message        The status message to store with the {@link JobStatus#FAILED} status
 * @param cause          The exception being propagated to the caller; any cleanup error is added to it as
 *                       suppressed
 */
private void markJobFailedBestEffort(
    final String jobId,
    final JobStatus expectedStatus,
    final String message,
    final Throwable cause
) {
    this.markNoArchiveBestEffort(jobId, cause);
    try {
        final JobStatus finalStatus = this.updateJobStatus(
            jobId,
            expectedStatus,
            JobStatus.FAILED,
            message,
            INITIAL_ATTEMPT
        );
        if (finalStatus != JobStatus.FAILED) {
            log.error("Updating status to failed didn't succeed");
        }
    } catch (final Exception e) {
        log.error("Unable to mark job {} failed", jobId, e);
        // Throwable#addSuppressed rejects self-suppression, so guard against the same instance
        if (e != cause) {
            cause.addSuppressed(e);
        }
    }
}