Skip to content

Commit

Permalink
Force updating the job status to KILLED when killing a job that has a…
Browse files Browse the repository at this point in the history
… connected agent but no response observer (#1192)

Co-authored-by: bhou <bhou@netflix.com>
  • Loading branch information
bhou2 and bhou authored Oct 24, 2023
1 parent c6c81dc commit 73aa7cc
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -154,15 +154,42 @@ public void killJob(
= this.parkedJobKillResponseObservers.remove(jobId);

if (responseObserver == null) {
log.error("Job {} not killed. Expected local agent connection not found", jobId);
throw new GenieServerException(
"Job " + jobId + " not killed. Expected local agent connection not found."
// This might happen when the agent has gone but its status is not updated
// In this case, we force updating the job status to KILLED.
log.warn("Tried to kill Job {}, but expected local agent connection not found. "
+ "Trying to force updating the job status to {}",
jobId,
JobStatus.KILLED
);
}
responseObserver.onNext(JobKillRegistrationResponse.newBuilder().build());
responseObserver.onCompleted();
try {
this.persistenceService.updateJobStatus(jobId, currentJobStatus, JobStatus.KILLED, reason);
log.info("Succeeded to force updating the status of Job {} to {}",
jobId,
JobStatus.KILLED
);
} catch (final GenieInvalidStatusException e) {
log.error(
"Failed to force updating the status of Job {} to {} "
+ "due to current status not being expected {}",
jobId,
JobStatus.KILLED,
currentJobStatus
);
throw e;
} catch (final NotFoundException e) {
log.error(
"Failed to force updating the status of Job {} to {} due to job not found",
jobId,
JobStatus.KILLED
);
throw new GenieJobNotFoundException(e);
}
} else {
responseObserver.onNext(JobKillRegistrationResponse.newBuilder().build());
responseObserver.onCompleted();

log.info("Agent notified for killing job {}", jobId);
log.info("Agent notified for killing job {}", jobId);
}
} else {
// Agent is running somewhere else try to forward the request
final String hostname = this.agentRoutingService
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -145,16 +145,42 @@ class GRpcJobKillServiceImplSpec extends Specification {
0 * this.agentRoutingService.isAgentConnectionLocal(this.jobId)
noExceptionThrown()

when: "The job is active, the agent is connected, the job is local but no observer"
when: "The job is active, the agent is connected, the job is local but no observer, and force updating job status succeeded"
this.serviceSpy.killJob(this.jobId, this.reason, this.servletRequest)

then: "Correct exception is thrown"
then: "The database is updated and no exception is thrown"
1 * this.persistenceService.getJobStatus(this.jobId) >> JobStatus.CLAIMED
0 * this.persistenceService.updateJobStatus(_ as String, _ as JobStatus, _ as JobStatus, _ as String)
1 * this.persistenceService.updateJobStatus(_ as String, _ as JobStatus, _ as JobStatus, _ as String)
1 * this.agentRoutingService.isAgentConnectionLocal(this.jobId) >> true
0 * this.responseObserver.onNext(_ as JobKillRegistrationResponse)
0 * this.responseObserver.onCompleted()
thrown(GenieServerException)
noExceptionThrown()

when: "The job is active, the agent is connected, the job is local but no observer, and current job status is invalid for updating"
this.serviceSpy.killJob(this.jobId, this.reason, this.servletRequest)

then: "The database is not updated and GenieInvalidStatusException is thrown"
1 * this.persistenceService.getJobStatus(this.jobId) >> JobStatus.CLAIMED
1 * this.persistenceService.updateJobStatus(this.jobId, JobStatus.CLAIMED, JobStatus.KILLED, this.reason) >> {
throw new GenieInvalidStatusException()
}
1 * this.agentRoutingService.isAgentConnectionLocal(this.jobId) >> true
0 * this.responseObserver.onNext(_ as JobKillRegistrationResponse)
0 * this.responseObserver.onCompleted()
thrown(GenieInvalidStatusException)

when: "The job is active, the agent is connected, the job is local but no observer, and the job is not found"
this.serviceSpy.killJob(this.jobId, this.reason, this.servletRequest)

then: "The database is not updated and GenieJobNotFoundException is thrown"
1 * this.persistenceService.getJobStatus(this.jobId) >> JobStatus.CLAIMED
1 * this.persistenceService.updateJobStatus(this.jobId, JobStatus.CLAIMED, JobStatus.KILLED, this.reason) >> {
throw new NotFoundException()
}
1 * this.agentRoutingService.isAgentConnectionLocal(this.jobId) >> true
0 * this.responseObserver.onNext(_ as JobKillRegistrationResponse)
0 * this.responseObserver.onCompleted()
thrown(GenieJobNotFoundException)

when: "The job is active, the agent is connected, and there is an observer"
this.serviceSpy.registerForKillNotification(this.request, this.responseObserver)
Expand Down

0 comments on commit 73aa7cc

Please sign in to comment.