Skip to content

Commit

Permalink
Make the conf of tensorboard-log-dir valid in tony-conf xml
Browse files (browse the repository at this point in the history)
  • Loading branch information
zuston committed Nov 16, 2021
1 parent 15c0552 commit 686ce01
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 3 deletions.
5 changes: 4 additions & 1 deletion tony-core/src/main/java/com/linkedin/tony/TonyClient.java
Original file line number Diff line number Diff line change
Expand Up @@ -561,7 +561,10 @@ public boolean init(String[] args) throws ParseException, IOException {
}

if (cliParser.hasOption("sidecar_tensorboard_log_dir") || tonyConf.get(TENSORBOARD_LOG_DIR) != null) {
String tbLogDir = cliParser.getOptionValue("sidecar_tensorboard_log_dir");
// When the TensorBoard log dir is configured both on the tony-cli command line and in tony-conf,
// the command-line value takes precedence.
String tbLogDir = cliParser.hasOption("sidecar_tensorboard_log_dir") ?
cliParser.getOptionValue("sidecar_tensorboard_log_dir") : tonyConf.get(TENSORBOARD_LOG_DIR);
setSidecarTBResources(tbLogDir, executionEnvPair);
}

Expand Down
4 changes: 2 additions & 2 deletions tony-core/src/main/java/com/linkedin/tony/TonySession.java
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,7 @@ public Map<String, List<String>> getClusterSpec() {
* Refresh task status when a TaskExecutor registers its exit code with AM.
*/
public void onTaskCompleted(String jobName, String jobIndex, int exitCode, String taskDiagnosticMsg) {
String outputLog = String.format("Job %s:%s exited with %d. ", jobName, jobIndex, exitCode);
String outputLog = String.format("Job %s:%s exited with %d", jobName, jobIndex, exitCode);
LOG.info(outputLog);

TonyTask task = getTask(jobName, jobIndex);
Expand All @@ -273,7 +273,7 @@ public void onTaskCompleted(String jobName, String jobIndex, int exitCode, Strin
// short circuit and immediately stop training if a worker failed.
if (exitCode != ContainerExitStatus.SUCCESS && exitCode != ContainerExitStatus.KILLED_BY_APPMASTER) {
if (isChief(jobName, jobIndex) || shouldStopOnFailure(jobName) || isFailOnWorkerFailure()) {
String diagnostic = outputLog + "Exit status: " + exitCode;
String diagnostic = outputLog + ". Exit status: " + exitCode;
if (taskDiagnosticMsg != null) {
diagnostic += ". Error msg: " + taskDiagnosticMsg;
}
Expand Down

0 comments on commit 686ce01

Please sign in to comment.