diff --git a/tony-core/src/main/java/com/linkedin/tony/TonyConfigurationKeys.java b/tony-core/src/main/java/com/linkedin/tony/TonyConfigurationKeys.java index fbe7cd8e..3a0ba311 100644 --- a/tony-core/src/main/java/com/linkedin/tony/TonyConfigurationKeys.java +++ b/tony-core/src/main/java/com/linkedin/tony/TonyConfigurationKeys.java @@ -345,6 +345,8 @@ public static String getContainerDockerMountKey() { public static final String GROUP_REGEX = TONY_APPLICATION_PREFIX + "group\\.([A-Za-z]+)$"; public static final String GROUP_DEPEND_TIMEOUT_REGEX = TONY_APPLICATION_PREFIX + "dependency\\.([A-Za-z]+)\\.timeout\\.after\\.([A-Za-z]+)$"; + public static final String GROUP_DEPEND_TIMEOUT_IGNORE_REGEX = + TONY_APPLICATION_PREFIX + "dependency\\.([A-Za-z]+)\\.timeout\\.after\\.([A-Za-z]+)$.ignored"; public static String getGroupKey(String groupName) { return String.format(TONY_APPLICATION_PREFIX + "group.%s", groupName); @@ -353,4 +355,8 @@ public static String getGroupKey(String groupName) { public static String getGroupDependentKey(String grp, String dependentGrp) { return String.format(TONY_APPLICATION_PREFIX + "dependency.%s.timeout.after.%s", grp, dependentGrp); } + + public static String getGroupDependentIgnoredKey(String roleType, String dependentGrp) { + return String.format(TONY_APPLICATION_PREFIX + "dependency.%s.timeout.after.%s.ignored", roleType, dependentGrp); + } } diff --git a/tony-core/src/main/java/com/linkedin/tony/runtime/MLGenericRuntime.java b/tony-core/src/main/java/com/linkedin/tony/runtime/MLGenericRuntime.java index ee5e1765..5c1928bd 100644 --- a/tony-core/src/main/java/com/linkedin/tony/runtime/MLGenericRuntime.java +++ b/tony-core/src/main/java/com/linkedin/tony/runtime/MLGenericRuntime.java @@ -41,6 +41,7 @@ import com.linkedin.tony.util.Utils; import static com.linkedin.tony.Constants.SIDECAR_TB_ROLE_NAME; +import static com.linkedin.tony.TonyConfigurationKeys.getGroupDependentIgnoredKey; public abstract class MLGenericRuntime implements FrameworkRuntime { private static final long REGISTRATION_STATUS_INTERVAL_MS = 15 * 1000; @@ -137,9 +138,16 @@ public boolean isHealthy(Configuration tonyConf) { * chief/workers are finished, the mechanism of dependency group timeout will make job failed. * * Dependency group timeout configuration as follows: - * + * ``` * tony.application.group.A = worker,chief * tony.application.dependency.evaluator.timeout.after.A = 3600 + * ``` + * + * And in some of the cases, we don't want to fail the whole job even though a dependency times out. + * For example, if chief succeeded and there is a worker hanging for 1 hour, + * users can configure the job to still pass. So it introduces the new config of + * `tony.application.dependency.[X].timeout.after.[GROUP].ignored = true`, and more details could be + * found in https://github.com/tony-framework/TonY/issues/641. * */ String errorMsg = null; @@ -231,10 +239,19 @@ protected String groupDependencyTimeout(Configuration tonyConf) { log.info("Running job type: " + runningTaskType + ", all dependent task finished: " + allDependentTaskFinished); if (System.currentTimeMillis() - latestEndTimeInAllDependentTasks > timeout) { - return String.format("Jobtype: %s runs exceeded timeout(%s sec) because it's " - + "dependent group: %s (task set: [%s]) has been finished", - runningTaskType, dependentGroupPair.getValue(), dependentGroupName, - StringUtils.join(grpWithMembersIndex.get(dependentGroupName), ",")); + String ignoredTaskTypeKey = getGroupDependentIgnoredKey(runningTaskType, dependentGroupName); + boolean ignoreTimeout = tonyConf.getBoolean(ignoredTaskTypeKey, false); + if (!ignoreTimeout) { + return String.format("Task type: %s runs exceeded timeout because it's " + + "dependent group: %s (task set: [%s]) has been finished.", + runningTaskType, dependentGroupName, + StringUtils.join(grpWithMembersIndex.get(dependentGroupName), ",")); + } + + log.info( + String.format("Task type: %s is marked as untracked.", runningJobTypes) + ); + session.makeTaskTypeUntracked(runningTaskType); } } diff --git a/tony-core/src/main/java/com/linkedin/tony/tensorflow/TonySession.java b/tony-core/src/main/java/com/linkedin/tony/tensorflow/TonySession.java index 2d2eb1d3..78f56074 100644 --- a/tony-core/src/main/java/com/linkedin/tony/tensorflow/TonySession.java +++ b/tony-core/src/main/java/com/linkedin/tony/tensorflow/TonySession.java @@ -22,6 +22,7 @@ import java.util.concurrent.ConcurrentHashMap; import java.util.stream.Collectors; +import org.apache.commons.lang3.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; @@ -39,6 +40,8 @@ import static com.linkedin.tony.Constants.CHIEF_JOB_NAME; import static com.linkedin.tony.Constants.WORKER_JOB_NAME; +import static com.linkedin.tony.TonyConfigurationKeys.UNTRACKED_JOBTYPES; +import static com.linkedin.tony.util.Utils.getUntrackedJobTypes; /** @@ -639,4 +642,11 @@ public int getNumRegisteredTasks() { public Set getRegisteredTasks() { return registeredTasks; } + + public void makeTaskTypeUntracked(String taskType) { + String[] defaultUntrackedTypes = getUntrackedJobTypes(tonyConf); + List untrackedList = Arrays.stream(defaultUntrackedTypes).collect(Collectors.toList()); + untrackedList.add(taskType); + tonyConf.set(UNTRACKED_JOBTYPES, StringUtils.join(untrackedList, ",")); + } }