Skip to content

Commit

Permalink
Include job type and index in the diagnostic message
Browse files (browse the repository at this point in the history)
  • Loading branch information
zuston committed Nov 17, 2021
1 parent 65d1b0a commit 2f5f0ae
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 8 deletions.
14 changes: 8 additions & 6 deletions tony-core/src/main/java/com/linkedin/tony/ApplicationMaster.java
Original file line number Diff line number Diff line change
Expand Up @@ -1038,21 +1038,23 @@ public void onStopContainerError(ContainerId containerId, Throwable t) {
private class RMCallbackHandler implements AMRMClientAsync.CallbackHandler {
@Override
public void onContainersCompleted(List<ContainerStatus> completedContainers) {
LOG.info("Completed containers: " + completedContainers.size());
LOG.info("onContainersCompleted called in RMCallbackHandler, completed containers size: " + completedContainers.size());
sleepForTesting();

for (ContainerStatus containerStatus : completedContainers) {
int exitStatus = containerStatus.getExitStatus();
LOG.info("ContainerID = " + containerStatus.getContainerId()
+ ", state = " + containerStatus.getState()
+ ", exitStatus = " + exitStatus);
String diagnostics = containerStatus.getDiagnostics();
String outputLog = "ContainerID = " + containerStatus.getContainerId()
+ ", state = " + containerStatus.getState()
+ ", exitStatus = " + exitStatus
+ ", diagnostics = " + diagnostics;

String errorInformation = null;
if (ContainerExitStatus.SUCCESS != exitStatus) {
errorInformation = diagnostics;
LOG.error(diagnostics);
LOG.error(outputLog);
} else {
LOG.info(diagnostics);
LOG.info(outputLog);
}
processFinishedContainer(containerStatus.getContainerId(), exitStatus, errorInformation);
}
Expand Down
6 changes: 4 additions & 2 deletions tony-core/src/main/java/com/linkedin/tony/TonySession.java
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,9 @@ public Map<String, List<String>> getClusterSpec() {
* Refresh task status when a TaskExecutor registers its exit code with AM.
*/
public void onTaskCompleted(String jobName, String jobIndex, int exitCode, String taskDiagnosticMsg) {
LOG.info(String.format("Job %s:%s exited with %d", jobName, jobIndex, exitCode));
String outputLog = String.format("Job %s:%s exited with %d", jobName, jobIndex, exitCode);
LOG.info(outputLog);

TonyTask task = getTask(jobName, jobIndex);
Preconditions.checkNotNull(task);
task.setExitStatus(exitCode);
Expand All @@ -271,7 +273,7 @@ public void onTaskCompleted(String jobName, String jobIndex, int exitCode, Strin
// short circuit and immediately stop training if a worker failed.
if (exitCode != ContainerExitStatus.SUCCESS && exitCode != ContainerExitStatus.KILLED_BY_APPMASTER) {
if (isChief(jobName, jobIndex) || shouldStopOnFailure(jobName) || isFailOnWorkerFailure()) {
String diagnostic = "Exit status: " + exitCode;
String diagnostic = outputLog + ". Exit status: " + exitCode;
if (taskDiagnosticMsg != null) {
diagnostic += ". Error msg: " + taskDiagnosticMsg;
}
Expand Down

0 comments on commit 2f5f0ae

Please sign in to comment.