diff --git a/tony-core/src/main/java/com/linkedin/tony/ApplicationMaster.java b/tony-core/src/main/java/com/linkedin/tony/ApplicationMaster.java index b9e7dac3..498fac18 100644 --- a/tony-core/src/main/java/com/linkedin/tony/ApplicationMaster.java +++ b/tony-core/src/main/java/com/linkedin/tony/ApplicationMaster.java @@ -1038,21 +1038,23 @@ public void onStopContainerError(ContainerId containerId, Throwable t) { private class RMCallbackHandler implements AMRMClientAsync.CallbackHandler { @Override public void onContainersCompleted(List completedContainers) { - LOG.info("Completed containers: " + completedContainers.size()); + LOG.info("onContainersCompleted called in RMCallbackHandler, completed containers size: " + completedContainers.size()); sleepForTesting(); for (ContainerStatus containerStatus : completedContainers) { int exitStatus = containerStatus.getExitStatus(); - LOG.info("ContainerID = " + containerStatus.getContainerId() - + ", state = " + containerStatus.getState() - + ", exitStatus = " + exitStatus); String diagnostics = containerStatus.getDiagnostics(); + String outputLog = "ContainerID = " + containerStatus.getContainerId() + + ", state = " + containerStatus.getState() + + ", exitStatus = " + exitStatus + + ", diagnostics = " + diagnostics; + String errorInformation = null; if (ContainerExitStatus.SUCCESS != exitStatus) { errorInformation = diagnostics; - LOG.error(diagnostics); + LOG.error(outputLog); } else { - LOG.info(diagnostics); + LOG.info(outputLog); } processFinishedContainer(containerStatus.getContainerId(), exitStatus, errorInformation); } diff --git a/tony-core/src/main/java/com/linkedin/tony/TonySession.java b/tony-core/src/main/java/com/linkedin/tony/TonySession.java index ba63f04f..a7d2dd2f 100644 --- a/tony-core/src/main/java/com/linkedin/tony/TonySession.java +++ b/tony-core/src/main/java/com/linkedin/tony/TonySession.java @@ -256,7 +256,9 @@ public Map> getClusterSpec() { * Refresh task status when a TaskExecutor registers its exit code with AM. */ public void onTaskCompleted(String jobName, String jobIndex, int exitCode, String taskDiagnosticMsg) { - LOG.info(String.format("Job %s:%s exited with %d", jobName, jobIndex, exitCode)); + String outputLog = String.format("Job %s:%s exited with %d", jobName, jobIndex, exitCode); + LOG.info(outputLog); + TonyTask task = getTask(jobName, jobIndex); Preconditions.checkNotNull(task); task.setExitStatus(exitCode); @@ -271,7 +273,7 @@ public void onTaskCompleted(String jobName, String jobIndex, int exitCode, Strin // short circuit and immediately stop training if a worker failed. if (exitCode != ContainerExitStatus.SUCCESS && exitCode != ContainerExitStatus.KILLED_BY_APPMASTER) { if (isChief(jobName, jobIndex) || shouldStopOnFailure(jobName) || isFailOnWorkerFailure()) { - String diagnostic = "Exit status: " + exitCode; + String diagnostic = outputLog + ". Exit status: " + exitCode; if (taskDiagnosticMsg != null) { diagnostic += ". Error msg: " + taskDiagnosticMsg; }